This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate:
[perl5.git] / utf8.c
... / ...
CommitLineData
1/* utf8.c
2 *
3 * Copyright (c) 1998-2003, Larry Wall
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
24#define PERL_IN_UTF8_C
25#include "perl.h"
26
27static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
28
29/*
30=head1 Unicode Support
31
32=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
33
34Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
35of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
36bytes available. The return value is the pointer to the byte after the
37end of the new character. In other words,
38
39 d = uvuni_to_utf8_flags(d, uv, flags);
40
41or, in most cases,
42
43 d = uvuni_to_utf8(d, uv);
44
45(which is equivalent to)
46
47 d = uvuni_to_utf8_flags(d, uv, 0);
48
49is the recommended Unicode-aware way of saying
50
51 *(d++) = uv;
52
53=cut
54*/
55
56U8 *
57Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
58{
59 if (ckWARN(WARN_UTF8)) {
60 if (UNICODE_IS_SURROGATE(uv) &&
61 !(flags & UNICODE_ALLOW_SURROGATE))
62 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
63 else if (
64 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
65 !(flags & UNICODE_ALLOW_FDD0))
66 ||
67 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
68 !(flags & UNICODE_ALLOW_FFFF))) &&
69 /* UNICODE_ALLOW_SUPER includes
70 * FFFEs and FFFFs beyond 0x10FFFF. */
71 ((uv <= PERL_UNICODE_MAX) ||
72 !(flags & UNICODE_ALLOW_SUPER))
73 )
74 Perl_warner(aTHX_ packWARN(WARN_UTF8),
75 "Unicode character 0x%04"UVxf" is illegal", uv);
76 }
77 if (UNI_IS_INVARIANT(uv)) {
78 *d++ = (U8)UTF_TO_NATIVE(uv);
79 return d;
80 }
81#if defined(EBCDIC)
82 else {
83 STRLEN len = UNISKIP(uv);
84 U8 *p = d+len-1;
85 while (p > d) {
86 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
87 uv >>= UTF_ACCUMULATION_SHIFT;
88 }
89 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
90 return d+len;
91 }
92#else /* Non loop style */
93 if (uv < 0x800) {
94 *d++ = (U8)(( uv >> 6) | 0xc0);
95 *d++ = (U8)(( uv & 0x3f) | 0x80);
96 return d;
97 }
98 if (uv < 0x10000) {
99 *d++ = (U8)(( uv >> 12) | 0xe0);
100 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
101 *d++ = (U8)(( uv & 0x3f) | 0x80);
102 return d;
103 }
104 if (uv < 0x200000) {
105 *d++ = (U8)(( uv >> 18) | 0xf0);
106 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
107 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
108 *d++ = (U8)(( uv & 0x3f) | 0x80);
109 return d;
110 }
111 if (uv < 0x4000000) {
112 *d++ = (U8)(( uv >> 24) | 0xf8);
113 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
117 return d;
118 }
119 if (uv < 0x80000000) {
120 *d++ = (U8)(( uv >> 30) | 0xfc);
121 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
124 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
125 *d++ = (U8)(( uv & 0x3f) | 0x80);
126 return d;
127 }
128#ifdef HAS_QUAD
129 if (uv < UTF8_QUAD_MAX)
130#endif
131 {
132 *d++ = 0xfe; /* Can't match U+FEFF! */
133 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
134 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
135 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
136 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
137 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
138 *d++ = (U8)(( uv & 0x3f) | 0x80);
139 return d;
140 }
141#ifdef HAS_QUAD
142 {
143 *d++ = 0xff; /* Can't match U+FFFE! */
144 *d++ = 0x80; /* 6 Reserved bits */
145 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
146 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
147 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
148 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
149 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
150 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
151 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
152 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
153 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
154 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
155 *d++ = (U8)(( uv & 0x3f) | 0x80);
156 return d;
157 }
158#endif
159#endif /* Loop style */
160}
161
162U8 *
163Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
164{
165 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
166}
167
168
169/*
170=for apidoc A|STRLEN|is_utf8_char|U8 *s
171
172Tests if some arbitrary number of bytes begins in a valid UTF-8
173character. Note that an INVARIANT (i.e. ASCII) character is a valid
174UTF-8 character. The actual number of bytes in the UTF-8 character
175will be returned if it is valid, otherwise 0.
176
177=cut */
178STRLEN
179Perl_is_utf8_char(pTHX_ U8 *s)
180{
181 U8 u = *s;
182 STRLEN slen, len;
183 UV uv, ouv;
184
185 if (UTF8_IS_INVARIANT(u))
186 return 1;
187
188 if (!UTF8_IS_START(u))
189 return 0;
190
191 len = UTF8SKIP(s);
192
193 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
194 return 0;
195
196 slen = len - 1;
197 s++;
198 u &= UTF_START_MASK(len);
199 uv = u;
200 ouv = uv;
201 while (slen--) {
202 if (!UTF8_IS_CONTINUATION(*s))
203 return 0;
204 uv = UTF8_ACCUMULATE(uv, *s);
205 if (uv < ouv)
206 return 0;
207 ouv = uv;
208 s++;
209 }
210
211 if ((STRLEN)UNISKIP(uv) < len)
212 return 0;
213
214 return len;
215}
216
217/*
218=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
219
220Returns true if first C<len> bytes of the given string form a valid
221UTF8 string, false otherwise. Note that 'a valid UTF8 string' does
222not mean 'a string that contains code points above 0x7F encoded in
223UTF8' because a valid ASCII string is a valid UTF8 string.
224
225=cut
226*/
227
228bool
229Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
230{
231 U8* x = s;
232 U8* send;
233 STRLEN c;
234
235 if (!len)
236 len = strlen((char *)s);
237 send = s + len;
238
239 while (x < send) {
240 /* Inline the easy bits of is_utf8_char() here for speed... */
241 if (UTF8_IS_INVARIANT(*x))
242 c = 1;
243 else if (!UTF8_IS_START(*x))
244 return FALSE;
245 else {
246 /* ... and call is_utf8_char() only if really needed. */
247 c = is_utf8_char(x);
248 if (!c)
249 return FALSE;
250 }
251 x += c;
252 }
253 if (x != send)
254 return FALSE;
255
256 return TRUE;
257}
258
259/*
260=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
261
262Bottom level UTF-8 decode routine.
263Returns the unicode code point value of the first character in the string C<s>
264which is assumed to be in UTF8 encoding and no longer than C<curlen>;
265C<retlen> will be set to the length, in bytes, of that character.
266
267If C<s> does not point to a well-formed UTF8 character, the behaviour
268is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
269it is assumed that the caller will raise a warning, and this function
270will silently just set C<retlen> to C<-1> and return zero. If the
271C<flags> does not contain UTF8_CHECK_ONLY, warnings about
272malformations will be given, C<retlen> will be set to the expected
273length of the UTF-8 character in bytes, and zero will be returned.
274
275The C<flags> can also contain various flags to allow deviations from
276the strict UTF-8 encoding (see F<utf8.h>).
277
278Most code should use utf8_to_uvchr() rather than call this directly.
279
280=cut
281*/
282
283UV
284Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
285{
286 U8 *s0 = s;
287 UV uv = *s, ouv = 0;
288 STRLEN len = 1;
289 bool dowarn = ckWARN_d(WARN_UTF8);
290 UV startbyte = *s;
291 STRLEN expectlen = 0;
292 U32 warning = 0;
293
294/* This list is a superset of the UTF8_ALLOW_XXX. */
295
296#define UTF8_WARN_EMPTY 1
297#define UTF8_WARN_CONTINUATION 2
298#define UTF8_WARN_NON_CONTINUATION 3
299#define UTF8_WARN_FE_FF 4
300#define UTF8_WARN_SHORT 5
301#define UTF8_WARN_OVERFLOW 6
302#define UTF8_WARN_SURROGATE 7
303#define UTF8_WARN_LONG 8
304#define UTF8_WARN_FFFF 9 /* Also FFFE. */
305
306 if (curlen == 0 &&
307 !(flags & UTF8_ALLOW_EMPTY)) {
308 warning = UTF8_WARN_EMPTY;
309 goto malformed;
310 }
311
312 if (UTF8_IS_INVARIANT(uv)) {
313 if (retlen)
314 *retlen = 1;
315 return (UV) (NATIVE_TO_UTF(*s));
316 }
317
318 if (UTF8_IS_CONTINUATION(uv) &&
319 !(flags & UTF8_ALLOW_CONTINUATION)) {
320 warning = UTF8_WARN_CONTINUATION;
321 goto malformed;
322 }
323
324 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
325 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
326 warning = UTF8_WARN_NON_CONTINUATION;
327 goto malformed;
328 }
329
330#ifdef EBCDIC
331 uv = NATIVE_TO_UTF(uv);
332#else
333 if ((uv == 0xfe || uv == 0xff) &&
334 !(flags & UTF8_ALLOW_FE_FF)) {
335 warning = UTF8_WARN_FE_FF;
336 goto malformed;
337 }
338#endif
339
340 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
341 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
342 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
343 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
344#ifdef EBCDIC
345 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
346 else { len = 7; uv &= 0x01; }
347#else
348 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
349 else if (!(uv & 0x01)) { len = 7; uv = 0; }
350 else { len = 13; uv = 0; } /* whoa! */
351#endif
352
353 if (retlen)
354 *retlen = len;
355
356 expectlen = len;
357
358 if ((curlen < expectlen) &&
359 !(flags & UTF8_ALLOW_SHORT)) {
360 warning = UTF8_WARN_SHORT;
361 goto malformed;
362 }
363
364 len--;
365 s++;
366 ouv = uv;
367
368 while (len--) {
369 if (!UTF8_IS_CONTINUATION(*s) &&
370 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
371 s--;
372 warning = UTF8_WARN_NON_CONTINUATION;
373 goto malformed;
374 }
375 else
376 uv = UTF8_ACCUMULATE(uv, *s);
377 if (!(uv > ouv)) {
378 /* These cannot be allowed. */
379 if (uv == ouv) {
380 if (!(flags & UTF8_ALLOW_LONG)) {
381 warning = UTF8_WARN_LONG;
382 goto malformed;
383 }
384 }
385 else { /* uv < ouv */
386 /* This cannot be allowed. */
387 warning = UTF8_WARN_OVERFLOW;
388 goto malformed;
389 }
390 }
391 s++;
392 ouv = uv;
393 }
394
395 if (UNICODE_IS_SURROGATE(uv) &&
396 !(flags & UTF8_ALLOW_SURROGATE)) {
397 warning = UTF8_WARN_SURROGATE;
398 goto malformed;
399 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
400 !(flags & UTF8_ALLOW_LONG)) {
401 warning = UTF8_WARN_LONG;
402 goto malformed;
403 } else if (UNICODE_IS_ILLEGAL(uv) &&
404 !(flags & UTF8_ALLOW_FFFF)) {
405 warning = UTF8_WARN_FFFF;
406 goto malformed;
407 }
408
409 return uv;
410
411malformed:
412
413 if (flags & UTF8_CHECK_ONLY) {
414 if (retlen)
415 *retlen = -1;
416 return 0;
417 }
418
419 if (dowarn) {
420 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
421
422 switch (warning) {
423 case 0: /* Intentionally empty. */ break;
424 case UTF8_WARN_EMPTY:
425 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
426 break;
427 case UTF8_WARN_CONTINUATION:
428 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
429 break;
430 case UTF8_WARN_NON_CONTINUATION:
431 if (s == s0)
432 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
433 (UV)s[1], startbyte);
434 else
435 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
436 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
437
438 break;
439 case UTF8_WARN_FE_FF:
440 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
441 break;
442 case UTF8_WARN_SHORT:
443 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
444 curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
445 expectlen = curlen; /* distance for caller to skip */
446 break;
447 case UTF8_WARN_OVERFLOW:
448 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
449 ouv, *s, startbyte);
450 break;
451 case UTF8_WARN_SURROGATE:
452 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
453 break;
454 case UTF8_WARN_LONG:
455 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
456 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
457 break;
458 case UTF8_WARN_FFFF:
459 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
460 break;
461 default:
462 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
463 break;
464 }
465
466 if (warning) {
467 char *s = SvPVX(sv);
468
469 if (PL_op)
470 Perl_warner(aTHX_ packWARN(WARN_UTF8),
471 "%s in %s", s, OP_DESC(PL_op));
472 else
473 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
474 }
475 }
476
477 if (retlen)
478 *retlen = expectlen ? expectlen : len;
479
480 return 0;
481}
482
483/*
484=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
485
486Returns the native character value of the first character in the string C<s>
487which is assumed to be in UTF8 encoding; C<retlen> will be set to the
488length, in bytes, of that character.
489
490If C<s> does not point to a well-formed UTF8 character, zero is
491returned and retlen is set, if possible, to -1.
492
493=cut
494*/
495
496UV
497Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
498{
499 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
500 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
501}
502
503/*
504=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
505
506Returns the Unicode code point of the first character in the string C<s>
507which is assumed to be in UTF8 encoding; C<retlen> will be set to the
508length, in bytes, of that character.
509
510This function should only be used when returned UV is considered
511an index into the Unicode semantic tables (e.g. swashes).
512
513If C<s> does not point to a well-formed UTF8 character, zero is
514returned and retlen is set, if possible, to -1.
515
516=cut
517*/
518
519UV
520Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
521{
522 /* Call the low level routine asking for checks */
523 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
524 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
525}
526
527/*
528=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
529
530Return the length of the UTF-8 char encoded string C<s> in characters.
531Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
532up past C<e>, croaks.
533
534=cut
535*/
536
537STRLEN
538Perl_utf8_length(pTHX_ U8 *s, U8 *e)
539{
540 STRLEN len = 0;
541
542 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
543 * the bitops (especially ~) can create illegal UTF-8.
544 * In other words: in Perl UTF-8 is not just for Unicode. */
545
546 if (e < s) {
547 if (ckWARN_d(WARN_UTF8)) {
548 if (PL_op)
549 Perl_warner(aTHX_ packWARN(WARN_UTF8),
550 "%s in %s", unees, OP_DESC(PL_op));
551 else
552 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
553 }
554 return 0;
555 }
556 while (s < e) {
557 U8 t = UTF8SKIP(s);
558
559 if (e - s < t) {
560 if (ckWARN_d(WARN_UTF8)) {
561 if (PL_op)
562 Perl_warner(aTHX_ packWARN(WARN_UTF8),
563 unees, OP_DESC(PL_op));
564 else
565 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
566 }
567 return len;
568 }
569 s += t;
570 len++;
571 }
572
573 return len;
574}
575
576/*
577=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
578
579Returns the number of UTF8 characters between the UTF-8 pointers C<a>
580and C<b>.
581
582WARNING: use only if you *know* that the pointers point inside the
583same UTF-8 buffer.
584
585=cut
586*/
587
588IV
589Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
590{
591 IV off = 0;
592
593 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
594 * the bitops (especially ~) can create illegal UTF-8.
595 * In other words: in Perl UTF-8 is not just for Unicode. */
596
597 if (a < b) {
598 while (a < b) {
599 U8 c = UTF8SKIP(a);
600
601 if (b - a < c) {
602 if (ckWARN_d(WARN_UTF8)) {
603 if (PL_op)
604 Perl_warner(aTHX_ packWARN(WARN_UTF8),
605 "%s in %s", unees, OP_DESC(PL_op));
606 else
607 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
608 }
609 return off;
610 }
611 a += c;
612 off--;
613 }
614 }
615 else {
616 while (b < a) {
617 U8 c = UTF8SKIP(b);
618
619 if (a - b < c) {
620 if (ckWARN_d(WARN_UTF8)) {
621 if (PL_op)
622 Perl_warner(aTHX_ packWARN(WARN_UTF8),
623 "%s in %s", unees, OP_DESC(PL_op));
624 else
625 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
626 }
627 return off;
628 }
629 b += c;
630 off++;
631 }
632 }
633
634 return off;
635}
636
637/*
638=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
639
640Return the UTF-8 pointer C<s> displaced by C<off> characters, either
641forward or backward.
642
643WARNING: do not use the following unless you *know* C<off> is within
644the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
645on the first byte of character or just after the last byte of a character.
646
647=cut
648*/
649
650U8 *
651Perl_utf8_hop(pTHX_ U8 *s, I32 off)
652{
653 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
654 * the bitops (especially ~) can create illegal UTF-8.
655 * In other words: in Perl UTF-8 is not just for Unicode. */
656
657 if (off >= 0) {
658 while (off--)
659 s += UTF8SKIP(s);
660 }
661 else {
662 while (off++) {
663 s--;
664 while (UTF8_IS_CONTINUATION(*s))
665 s--;
666 }
667 }
668 return s;
669}
670
671/*
672=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
673
674Converts a string C<s> of length C<len> from UTF8 into byte encoding.
675Unlike C<bytes_to_utf8>, this over-writes the original string, and
676updates len to contain the new length.
677Returns zero on failure, setting C<len> to -1.
678
679=cut
680*/
681
682U8 *
683Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
684{
685 U8 *send;
686 U8 *d;
687 U8 *save = s;
688
689 /* ensure valid UTF8 and chars < 256 before updating string */
690 for (send = s + *len; s < send; ) {
691 U8 c = *s++;
692
693 if (!UTF8_IS_INVARIANT(c) &&
694 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
695 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
696 *len = -1;
697 return 0;
698 }
699 }
700
701 d = s = save;
702 while (s < send) {
703 STRLEN ulen;
704 *d++ = (U8)utf8_to_uvchr(s, &ulen);
705 s += ulen;
706 }
707 *d = '\0';
708 *len = d - save;
709 return save;
710}
711
712/*
713=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
714
715Converts a string C<s> of length C<len> from UTF8 into byte encoding.
716Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
717the newly-created string, and updates C<len> to contain the new
718length. Returns the original string if no conversion occurs, C<len>
719is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
7200 if C<s> is converted or contains all 7bit characters.
721
722=cut
723*/
724
725U8 *
726Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
727{
728 U8 *d;
729 U8 *start = s;
730 U8 *send;
731 I32 count = 0;
732
733 if (!*is_utf8)
734 return start;
735
736 /* ensure valid UTF8 and chars < 256 before converting string */
737 for (send = s + *len; s < send;) {
738 U8 c = *s++;
739 if (!UTF8_IS_INVARIANT(c)) {
740 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
741 (c = *s++) && UTF8_IS_CONTINUATION(c))
742 count++;
743 else
744 return start;
745 }
746 }
747
748 *is_utf8 = 0;
749
750 Newz(801, d, (*len) - count + 1, U8);
751 s = start; start = d;
752 while (s < send) {
753 U8 c = *s++;
754 if (!UTF8_IS_INVARIANT(c)) {
755 /* Then it is two-byte encoded */
756 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
757 c = ASCII_TO_NATIVE(c);
758 }
759 *d++ = c;
760 }
761 *d = '\0';
762 *len = d - start;
763 return start;
764}
765
766/*
767=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
768
769Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
770Returns a pointer to the newly-created string, and sets C<len> to
771reflect the new length.
772
773If you want to convert to UTF8 from other encodings than ASCII,
774see sv_recode_to_utf8().
775
776=cut
777*/
778
779U8*
780Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
781{
782 U8 *send;
783 U8 *d;
784 U8 *dst;
785 send = s + (*len);
786
787 Newz(801, d, (*len) * 2 + 1, U8);
788 dst = d;
789
790 while (s < send) {
791 UV uv = NATIVE_TO_ASCII(*s++);
792 if (UNI_IS_INVARIANT(uv))
793 *d++ = (U8)UTF_TO_NATIVE(uv);
794 else {
795 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
796 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
797 }
798 }
799 *d = '\0';
800 *len = d-dst;
801 return dst;
802}
803
804/*
805 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
806 *
807 * Destination must be pre-extended to 3/2 source. Do not use in-place.
808 * We optimize for native, for obvious reasons. */
809
810U8*
811Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
812{
813 U8* pend;
814 U8* dstart = d;
815
816 if (bytelen & 1)
817 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
818
819 pend = p + bytelen;
820
821 while (p < pend) {
822 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
823 p += 2;
824 if (uv < 0x80) {
825 *d++ = (U8)uv;
826 continue;
827 }
828 if (uv < 0x800) {
829 *d++ = (U8)(( uv >> 6) | 0xc0);
830 *d++ = (U8)(( uv & 0x3f) | 0x80);
831 continue;
832 }
833 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
834 UV low = *p++;
835 if (low < 0xdc00 || low >= 0xdfff)
836 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
837 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
838 }
839 if (uv < 0x10000) {
840 *d++ = (U8)(( uv >> 12) | 0xe0);
841 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
842 *d++ = (U8)(( uv & 0x3f) | 0x80);
843 continue;
844 }
845 else {
846 *d++ = (U8)(( uv >> 18) | 0xf0);
847 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
848 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
849 *d++ = (U8)(( uv & 0x3f) | 0x80);
850 continue;
851 }
852 }
853 *newlen = d - dstart;
854 return d;
855}
856
857/* Note: this one is slightly destructive of the source. */
858
859U8*
860Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
861{
862 U8* s = (U8*)p;
863 U8* send = s + bytelen;
864 while (s < send) {
865 U8 tmp = s[0];
866 s[0] = s[1];
867 s[1] = tmp;
868 s += 2;
869 }
870 return utf16_to_utf8(p, d, bytelen, newlen);
871}
872
873/* for now these are all defined (inefficiently) in terms of the utf8 versions */
874
875bool
876Perl_is_uni_alnum(pTHX_ UV c)
877{
878 U8 tmpbuf[UTF8_MAXLEN+1];
879 uvchr_to_utf8(tmpbuf, c);
880 return is_utf8_alnum(tmpbuf);
881}
882
883bool
884Perl_is_uni_alnumc(pTHX_ UV c)
885{
886 U8 tmpbuf[UTF8_MAXLEN+1];
887 uvchr_to_utf8(tmpbuf, c);
888 return is_utf8_alnumc(tmpbuf);
889}
890
891bool
892Perl_is_uni_idfirst(pTHX_ UV c)
893{
894 U8 tmpbuf[UTF8_MAXLEN+1];
895 uvchr_to_utf8(tmpbuf, c);
896 return is_utf8_idfirst(tmpbuf);
897}
898
899bool
900Perl_is_uni_alpha(pTHX_ UV c)
901{
902 U8 tmpbuf[UTF8_MAXLEN+1];
903 uvchr_to_utf8(tmpbuf, c);
904 return is_utf8_alpha(tmpbuf);
905}
906
907bool
908Perl_is_uni_ascii(pTHX_ UV c)
909{
910 U8 tmpbuf[UTF8_MAXLEN+1];
911 uvchr_to_utf8(tmpbuf, c);
912 return is_utf8_ascii(tmpbuf);
913}
914
915bool
916Perl_is_uni_space(pTHX_ UV c)
917{
918 U8 tmpbuf[UTF8_MAXLEN+1];
919 uvchr_to_utf8(tmpbuf, c);
920 return is_utf8_space(tmpbuf);
921}
922
923bool
924Perl_is_uni_digit(pTHX_ UV c)
925{
926 U8 tmpbuf[UTF8_MAXLEN+1];
927 uvchr_to_utf8(tmpbuf, c);
928 return is_utf8_digit(tmpbuf);
929}
930
931bool
932Perl_is_uni_upper(pTHX_ UV c)
933{
934 U8 tmpbuf[UTF8_MAXLEN+1];
935 uvchr_to_utf8(tmpbuf, c);
936 return is_utf8_upper(tmpbuf);
937}
938
939bool
940Perl_is_uni_lower(pTHX_ UV c)
941{
942 U8 tmpbuf[UTF8_MAXLEN+1];
943 uvchr_to_utf8(tmpbuf, c);
944 return is_utf8_lower(tmpbuf);
945}
946
947bool
948Perl_is_uni_cntrl(pTHX_ UV c)
949{
950 U8 tmpbuf[UTF8_MAXLEN+1];
951 uvchr_to_utf8(tmpbuf, c);
952 return is_utf8_cntrl(tmpbuf);
953}
954
955bool
956Perl_is_uni_graph(pTHX_ UV c)
957{
958 U8 tmpbuf[UTF8_MAXLEN+1];
959 uvchr_to_utf8(tmpbuf, c);
960 return is_utf8_graph(tmpbuf);
961}
962
963bool
964Perl_is_uni_print(pTHX_ UV c)
965{
966 U8 tmpbuf[UTF8_MAXLEN+1];
967 uvchr_to_utf8(tmpbuf, c);
968 return is_utf8_print(tmpbuf);
969}
970
971bool
972Perl_is_uni_punct(pTHX_ UV c)
973{
974 U8 tmpbuf[UTF8_MAXLEN+1];
975 uvchr_to_utf8(tmpbuf, c);
976 return is_utf8_punct(tmpbuf);
977}
978
979bool
980Perl_is_uni_xdigit(pTHX_ UV c)
981{
982 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
983 uvchr_to_utf8(tmpbuf, c);
984 return is_utf8_xdigit(tmpbuf);
985}
986
987UV
988Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
989{
990 uvchr_to_utf8(p, c);
991 return to_utf8_upper(p, p, lenp);
992}
993
994UV
995Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
996{
997 uvchr_to_utf8(p, c);
998 return to_utf8_title(p, p, lenp);
999}
1000
1001UV
1002Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1003{
1004 uvchr_to_utf8(p, c);
1005 return to_utf8_lower(p, p, lenp);
1006}
1007
1008UV
1009Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1010{
1011 uvchr_to_utf8(p, c);
1012 return to_utf8_fold(p, p, lenp);
1013}
1014
1015/* for now these all assume no locale info available for Unicode > 255 */
1016
1017bool
1018Perl_is_uni_alnum_lc(pTHX_ UV c)
1019{
1020 return is_uni_alnum(c); /* XXX no locale support yet */
1021}
1022
1023bool
1024Perl_is_uni_alnumc_lc(pTHX_ UV c)
1025{
1026 return is_uni_alnumc(c); /* XXX no locale support yet */
1027}
1028
1029bool
1030Perl_is_uni_idfirst_lc(pTHX_ UV c)
1031{
1032 return is_uni_idfirst(c); /* XXX no locale support yet */
1033}
1034
1035bool
1036Perl_is_uni_alpha_lc(pTHX_ UV c)
1037{
1038 return is_uni_alpha(c); /* XXX no locale support yet */
1039}
1040
1041bool
1042Perl_is_uni_ascii_lc(pTHX_ UV c)
1043{
1044 return is_uni_ascii(c); /* XXX no locale support yet */
1045}
1046
1047bool
1048Perl_is_uni_space_lc(pTHX_ UV c)
1049{
1050 return is_uni_space(c); /* XXX no locale support yet */
1051}
1052
1053bool
1054Perl_is_uni_digit_lc(pTHX_ UV c)
1055{
1056 return is_uni_digit(c); /* XXX no locale support yet */
1057}
1058
1059bool
1060Perl_is_uni_upper_lc(pTHX_ UV c)
1061{
1062 return is_uni_upper(c); /* XXX no locale support yet */
1063}
1064
1065bool
1066Perl_is_uni_lower_lc(pTHX_ UV c)
1067{
1068 return is_uni_lower(c); /* XXX no locale support yet */
1069}
1070
1071bool
1072Perl_is_uni_cntrl_lc(pTHX_ UV c)
1073{
1074 return is_uni_cntrl(c); /* XXX no locale support yet */
1075}
1076
1077bool
1078Perl_is_uni_graph_lc(pTHX_ UV c)
1079{
1080 return is_uni_graph(c); /* XXX no locale support yet */
1081}
1082
1083bool
1084Perl_is_uni_print_lc(pTHX_ UV c)
1085{
1086 return is_uni_print(c); /* XXX no locale support yet */
1087}
1088
1089bool
1090Perl_is_uni_punct_lc(pTHX_ UV c)
1091{
1092 return is_uni_punct(c); /* XXX no locale support yet */
1093}
1094
1095bool
1096Perl_is_uni_xdigit_lc(pTHX_ UV c)
1097{
1098 return is_uni_xdigit(c); /* XXX no locale support yet */
1099}
1100
1101U32
1102Perl_to_uni_upper_lc(pTHX_ U32 c)
1103{
1104 /* XXX returns only the first character -- do not use XXX */
1105 /* XXX no locale support yet */
1106 STRLEN len;
1107 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1108 return (U32)to_uni_upper(c, tmpbuf, &len);
1109}
1110
1111U32
1112Perl_to_uni_title_lc(pTHX_ U32 c)
1113{
1114 /* XXX returns only the first character XXX -- do not use XXX */
1115 /* XXX no locale support yet */
1116 STRLEN len;
1117 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1118 return (U32)to_uni_title(c, tmpbuf, &len);
1119}
1120
1121U32
1122Perl_to_uni_lower_lc(pTHX_ U32 c)
1123{
1124 /* XXX returns only the first character -- do not use XXX */
1125 /* XXX no locale support yet */
1126 STRLEN len;
1127 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1128 return (U32)to_uni_lower(c, tmpbuf, &len);
1129}
1130
1131bool
1132Perl_is_utf8_alnum(pTHX_ U8 *p)
1133{
1134 if (!is_utf8_char(p))
1135 return FALSE;
1136 if (!PL_utf8_alnum)
1137 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1138 * descendant of isalnum(3), in other words, it doesn't
1139 * contain the '_'. --jhi */
1140 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1141 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1142/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1143#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1144 if (!PL_utf8_alnum)
1145 PL_utf8_alnum = swash_init("utf8", "",
1146 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1147 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1148#endif
1149}
1150
1151bool
1152Perl_is_utf8_alnumc(pTHX_ U8 *p)
1153{
1154 if (!is_utf8_char(p))
1155 return FALSE;
1156 if (!PL_utf8_alnum)
1157 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1158 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1159/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1160#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1161 if (!PL_utf8_alnum)
1162 PL_utf8_alnum = swash_init("utf8", "",
1163 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1164 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1165#endif
1166}
1167
1168bool
1169Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
1170{
1171 if (*p == '_')
1172 return TRUE;
1173 if (!is_utf8_char(p))
1174 return FALSE;
1175 if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1176 PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1177 return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1178}
1179
1180bool
1181Perl_is_utf8_idcont(pTHX_ U8 *p)
1182{
1183 if (*p == '_')
1184 return TRUE;
1185 if (!is_utf8_char(p))
1186 return FALSE;
1187 if (!PL_utf8_idcont)
1188 PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1189 return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1190}
1191
1192bool
1193Perl_is_utf8_alpha(pTHX_ U8 *p)
1194{
1195 if (!is_utf8_char(p))
1196 return FALSE;
1197 if (!PL_utf8_alpha)
1198 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1199 return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1200}
1201
1202bool
1203Perl_is_utf8_ascii(pTHX_ U8 *p)
1204{
1205 if (!is_utf8_char(p))
1206 return FALSE;
1207 if (!PL_utf8_ascii)
1208 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1209 return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1210}
1211
1212bool
1213Perl_is_utf8_space(pTHX_ U8 *p)
1214{
1215 if (!is_utf8_char(p))
1216 return FALSE;
1217 if (!PL_utf8_space)
1218 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1219 return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1220}
1221
1222bool
1223Perl_is_utf8_digit(pTHX_ U8 *p)
1224{
1225 if (!is_utf8_char(p))
1226 return FALSE;
1227 if (!PL_utf8_digit)
1228 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1229 return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1230}
1231
1232bool
1233Perl_is_utf8_upper(pTHX_ U8 *p)
1234{
1235 if (!is_utf8_char(p))
1236 return FALSE;
1237 if (!PL_utf8_upper)
1238 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1239 return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1240}
1241
1242bool
1243Perl_is_utf8_lower(pTHX_ U8 *p)
1244{
1245 if (!is_utf8_char(p))
1246 return FALSE;
1247 if (!PL_utf8_lower)
1248 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1249 return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1250}
1251
1252bool
1253Perl_is_utf8_cntrl(pTHX_ U8 *p)
1254{
1255 if (!is_utf8_char(p))
1256 return FALSE;
1257 if (!PL_utf8_cntrl)
1258 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1259 return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1260}
1261
1262bool
1263Perl_is_utf8_graph(pTHX_ U8 *p)
1264{
1265 if (!is_utf8_char(p))
1266 return FALSE;
1267 if (!PL_utf8_graph)
1268 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1269 return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1270}
1271
1272bool
1273Perl_is_utf8_print(pTHX_ U8 *p)
1274{
1275 if (!is_utf8_char(p))
1276 return FALSE;
1277 if (!PL_utf8_print)
1278 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1279 return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1280}
1281
1282bool
1283Perl_is_utf8_punct(pTHX_ U8 *p)
1284{
1285 if (!is_utf8_char(p))
1286 return FALSE;
1287 if (!PL_utf8_punct)
1288 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1289 return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1290}
1291
1292bool
1293Perl_is_utf8_xdigit(pTHX_ U8 *p)
1294{
1295 if (!is_utf8_char(p))
1296 return FALSE;
1297 if (!PL_utf8_xdigit)
1298 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1299 return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1300}
1301
1302bool
1303Perl_is_utf8_mark(pTHX_ U8 *p)
1304{
1305 if (!is_utf8_char(p))
1306 return FALSE;
1307 if (!PL_utf8_mark)
1308 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1309 return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1310}
1311
1312/*
1313=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1314
1315The "p" contains the pointer to the UTF-8 string encoding
1316the character that is being converted.
1317
1318The "ustrp" is a pointer to the character buffer to put the
1319conversion result to. The "lenp" is a pointer to the length
1320of the result.
1321
1322The "swashp" is a pointer to the swash to use.
1323
1324Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1325and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1326but not always, a multicharacter mapping), is tried first.
1327
1328The "special" is a string like "utf8::ToSpecLower", which means the
1329hash %utf8::ToSpecLower. The access to the hash is through
1330Perl_to_utf8_case().
1331
1332The "normal" is a string like "ToLower" which means the swash
1333%utf8::ToLower.
1334
1335=cut */
1336
1337UV
1338Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1339{
1340 UV uv0, uv1;
1341 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
1342 STRLEN len = 0;
1343
1344 uv0 = utf8_to_uvchr(p, 0);
1345 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1346 * are necessary in EBCDIC, they are redundant no-ops
1347 * in ASCII-ish platforms, and hopefully optimized away. */
1348 uv1 = NATIVE_TO_UNI(uv0);
1349 uvuni_to_utf8(tmpbuf, uv1);
1350
1351 if (!*swashp) /* load on-demand */
1352 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1353
1354 if (special) {
1355 /* It might be "special" (sometimes, but not always,
1356 * a multicharacter mapping) */
1357 HV *hv;
1358 SV *keysv;
1359 HE *he;
1360 SV *val;
1361
1362 if ((hv = get_hv(special, FALSE)) &&
1363 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
1364 (he = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
1365 (val = HeVAL(he))) {
1366 char *s;
1367
1368 s = SvPV(val, len);
1369 if (len == 1)
1370 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1371 else {
1372#ifdef EBCDIC
1373 /* If we have EBCDIC we need to remap the characters
1374 * since any characters in the low 256 are Unicode
1375 * code points, not EBCDIC. */
1376 U8 *t = (U8*)s, *tend = t + len, *d;
1377
1378 d = tmpbuf;
1379 if (SvUTF8(val)) {
1380 STRLEN tlen = 0;
1381
1382 while (t < tend) {
1383 UV c = utf8_to_uvchr(t, &tlen);
1384 if (tlen > 0) {
1385 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1386 t += tlen;
1387 }
1388 else
1389 break;
1390 }
1391 }
1392 else {
1393 while (t < tend) {
1394 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1395 t++;
1396 }
1397 }
1398 len = d - tmpbuf;
1399 Copy(tmpbuf, ustrp, len, U8);
1400#else
1401 Copy(s, ustrp, len, U8);
1402#endif
1403 }
1404 }
1405 }
1406
1407 if (!len && *swashp) {
1408 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1409
1410 if (uv2) {
1411 /* It was "normal" (a single character mapping). */
1412 UV uv3 = UNI_TO_NATIVE(uv2);
1413
1414 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1415 }
1416 }
1417
1418 if (!len) /* Neither: just copy. */
1419 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1420
1421 if (lenp)
1422 *lenp = len;
1423
1424 return len ? utf8_to_uvchr(ustrp, 0) : 0;
1425}
1426
1427/*
1428=for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1429
1430Convert the UTF-8 encoded character at p to its uppercase version and
1431store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1432that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1433uppercase version may be longer than the original character (up to two
1434characters).
1435
1436The first character of the uppercased version is returned
1437(but note, as explained above, that there may be more.)
1438
1439=cut */
1440
1441UV
1442Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1443{
1444 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1445 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1446}
1447
1448/*
1449=for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1450
1451Convert the UTF-8 encoded character at p to its titlecase version and
1452store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1453that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1454titlecase version may be longer than the original character (up to two
1455characters).
1456
1457The first character of the titlecased version is returned
1458(but note, as explained above, that there may be more.)
1459
1460=cut */
1461
1462UV
1463Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1464{
1465 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1466 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1467}
1468
1469/*
1470=for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1471
1472Convert the UTF-8 encoded character at p to its lowercase version and
1473store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1474that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1475lowercase version may be longer than the original character (up to two
1476characters).
1477
1478The first character of the lowercased version is returned
1479(but note, as explained above, that there may be more.)
1480
1481=cut */
1482
1483UV
1484Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1485{
1486 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1487 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1488}
1489
1490/*
1491=for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1492
1493Convert the UTF-8 encoded character at p to its foldcase version and
1494store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1495that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1496foldcase version may be longer than the original character (up to
1497three characters).
1498
1499The first character of the foldcased version is returned
1500(but note, as explained above, that there may be more.)
1501
1502=cut */
1503
1504UV
1505Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1506{
1507 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1508 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1509}
1510
1511/* a "swash" is a swatch hash */
1512
1513SV*
1514Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1515{
1516 SV* retval;
1517 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1518 dSP;
1519 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1520 SV* errsv_save;
1521
1522 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1523 ENTER;
1524 errsv_save = newSVsv(ERRSV);
1525 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1526 if (!SvTRUE(ERRSV))
1527 sv_setsv(ERRSV, errsv_save);
1528 SvREFCNT_dec(errsv_save);
1529 LEAVE;
1530 }
1531 SPAGAIN;
1532 PUSHSTACKi(PERLSI_MAGIC);
1533 PUSHMARK(SP);
1534 EXTEND(SP,5);
1535 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1536 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1537 PUSHs(listsv);
1538 PUSHs(sv_2mortal(newSViv(minbits)));
1539 PUSHs(sv_2mortal(newSViv(none)));
1540 PUTBACK;
1541 ENTER;
1542 SAVEI32(PL_hints);
1543 PL_hints = 0;
1544 save_re_context();
1545 if (PL_curcop == &PL_compiling) {
1546 /* XXX ought to be handled by lex_start */
1547 SAVEI32(PL_in_my);
1548 PL_in_my = 0;
1549 sv_setpv(tokenbufsv, PL_tokenbuf);
1550 }
1551 errsv_save = newSVsv(ERRSV);
1552 if (call_method("SWASHNEW", G_SCALAR))
1553 retval = newSVsv(*PL_stack_sp--);
1554 else
1555 retval = &PL_sv_undef;
1556 if (!SvTRUE(ERRSV))
1557 sv_setsv(ERRSV, errsv_save);
1558 SvREFCNT_dec(errsv_save);
1559 LEAVE;
1560 POPSTACK;
1561 if (PL_curcop == &PL_compiling) {
1562 STRLEN len;
1563 char* pv = SvPV(tokenbufsv, len);
1564
1565 Copy(pv, PL_tokenbuf, len+1, char);
1566 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1567 }
1568 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1569 if (SvPOK(retval))
1570 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1571 retval);
1572 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1573 }
1574 return retval;
1575}
1576
1577
1578/* This API is wrong for special case conversions since we may need to
1579 * return several Unicode characters for a single Unicode character
1580 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1581 * the lower-level routine, and it is similarly broken for returning
1582 * multiple values. --jhi */
1583UV
1584Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1585{
1586 HV* hv = (HV*)SvRV(sv);
1587 U32 klen;
1588 U32 off;
1589 STRLEN slen;
1590 STRLEN needents;
1591 U8 *tmps = NULL;
1592 U32 bit;
1593 SV *retval;
1594 U8 tmputf8[2];
1595 UV c = NATIVE_TO_ASCII(*ptr);
1596
1597 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1598 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1599 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1600 ptr = tmputf8;
1601 }
1602 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1603 * then the "swatch" is a vec() for al the chars which start
1604 * with 0xAA..0xYY
1605 * So the key in the hash (klen) is length of encoded char -1
1606 */
1607 klen = UTF8SKIP(ptr) - 1;
1608 off = ptr[klen];
1609
1610 if (klen == 0)
1611 {
1612 /* If char in invariant then swatch is for all the invariant chars
1613 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1614 */
1615 needents = UTF_CONTINUATION_MARK;
1616 off = NATIVE_TO_UTF(ptr[klen]);
1617 }
1618 else
1619 {
1620 /* If char is encoded then swatch is for the prefix */
1621 needents = (1 << UTF_ACCUMULATION_SHIFT);
1622 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1623 }
1624
1625 /*
1626 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1627 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1628 * it's nothing to sniff at.) Pity we usually come through at least
1629 * two function calls to get here...
1630 *
1631 * NB: this code assumes that swatches are never modified, once generated!
1632 */
1633
1634 if (hv == PL_last_swash_hv &&
1635 klen == PL_last_swash_klen &&
1636 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1637 {
1638 tmps = PL_last_swash_tmps;
1639 slen = PL_last_swash_slen;
1640 }
1641 else {
1642 /* Try our second-level swatch cache, kept in a hash. */
1643 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1644
1645 /* If not cached, generate it via utf8::SWASHGET */
1646 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1647 dSP;
1648 /* We use utf8n_to_uvuni() as we want an index into
1649 Unicode tables, not a native character number.
1650 */
1651 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
1652 ckWARN(WARN_UTF8) ?
1653 0 : UTF8_ALLOW_ANY);
1654 SV *errsv_save;
1655 ENTER;
1656 SAVETMPS;
1657 save_re_context();
1658 PUSHSTACKi(PERLSI_MAGIC);
1659 PUSHMARK(SP);
1660 EXTEND(SP,3);
1661 PUSHs((SV*)sv);
1662 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1663 PUSHs(sv_2mortal(newSViv((klen) ?
1664 (code_point & ~(needents - 1)) : 0)));
1665 PUSHs(sv_2mortal(newSViv(needents)));
1666 PUTBACK;
1667 errsv_save = newSVsv(ERRSV);
1668 if (call_method("SWASHGET", G_SCALAR))
1669 retval = newSVsv(*PL_stack_sp--);
1670 else
1671 retval = &PL_sv_undef;
1672 if (!SvTRUE(ERRSV))
1673 sv_setsv(ERRSV, errsv_save);
1674 SvREFCNT_dec(errsv_save);
1675 POPSTACK;
1676 FREETMPS;
1677 LEAVE;
1678 if (PL_curcop == &PL_compiling)
1679 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1680
1681 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1682
1683 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1684 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1685 }
1686
1687 PL_last_swash_hv = hv;
1688 PL_last_swash_klen = klen;
1689 PL_last_swash_tmps = tmps;
1690 PL_last_swash_slen = slen;
1691 if (klen)
1692 Copy(ptr, PL_last_swash_key, klen, U8);
1693 }
1694
1695 switch ((int)((slen << 3) / needents)) {
1696 case 1:
1697 bit = 1 << (off & 7);
1698 off >>= 3;
1699 return (tmps[off] & bit) != 0;
1700 case 8:
1701 return tmps[off];
1702 case 16:
1703 off <<= 1;
1704 return (tmps[off] << 8) + tmps[off + 1] ;
1705 case 32:
1706 off <<= 2;
1707 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1708 }
1709 Perl_croak(aTHX_ "panic: swash_fetch");
1710 return 0;
1711}
1712
1713
1714/*
1715=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1716
1717Adds the UTF8 representation of the Native codepoint C<uv> to the end
1718of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1719bytes available. The return value is the pointer to the byte after the
1720end of the new character. In other words,
1721
1722 d = uvchr_to_utf8(d, uv);
1723
1724is the recommended wide native character-aware way of saying
1725
1726 *(d++) = uv;
1727
1728=cut
1729*/
1730
1731/* On ASCII machines this is normally a macro but we want a
1732 real function in case XS code wants it
1733*/
1734#undef Perl_uvchr_to_utf8
1735U8 *
1736Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1737{
1738 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1739}
1740
1741U8 *
1742Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1743{
1744 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1745}
1746
1747/*
1748=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1749
1750Returns the native character value of the first character in the string C<s>
1751which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1752length, in bytes, of that character.
1753
1754Allows length and flags to be passed to low level routine.
1755
1756=cut
1757*/
1758/* On ASCII machines this is normally a macro but we want
1759 a real function in case XS code wants it
1760*/
1761#undef Perl_utf8n_to_uvchr
1762UV
1763Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1764{
1765 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1766 return UNI_TO_NATIVE(uv);
1767}
1768
1769/*
1770=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1771
1772Build to the scalar dsv a displayable version of the string spv,
1773length len, the displayable version being at most pvlim bytes long
1774(if longer, the rest is truncated and "..." will be appended).
1775
1776The flags argument can have UNI_DISPLAY_ISPRINT set to display
1777isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1778to display the \\[nrfta\\] as the backslashed versions (like '\n')
1779(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1780UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1781UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1782
1783The pointer to the PV of the dsv is returned.
1784
1785=cut */
1786char *
1787Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1788{
1789 int truncated = 0;
1790 char *s, *e;
1791
1792 sv_setpvn(dsv, "", 0);
1793 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1794 UV u;
1795 bool ok = FALSE;
1796
1797 if (pvlim && SvCUR(dsv) >= pvlim) {
1798 truncated++;
1799 break;
1800 }
1801 u = utf8_to_uvchr((U8*)s, 0);
1802 if (u < 256) {
1803 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1804 switch (u & 0xFF) {
1805 case '\n':
1806 Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1807 case '\r':
1808 Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1809 case '\t':
1810 Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1811 case '\f':
1812 Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1813 case '\a':
1814 Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1815 case '\\':
1816 Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
1817 default: break;
1818 }
1819 }
1820 /* isPRINT() is the locale-blind version. */
1821 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
1822 Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
1823 ok = TRUE;
1824 }
1825 }
1826 if (!ok)
1827 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1828 }
1829 if (truncated)
1830 sv_catpvn(dsv, "...", 3);
1831
1832 return SvPVX(dsv);
1833}
1834
1835/*
1836=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1837
1838Build to the scalar dsv a displayable version of the scalar sv,
1839the displayable version being at most pvlim bytes long
1840(if longer, the rest is truncated and "..." will be appended).
1841
1842The flags argument is as in pv_uni_display().
1843
1844The pointer to the PV of the dsv is returned.
1845
1846=cut */
1847char *
1848Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1849{
1850 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1851 pvlim, flags);
1852}
1853
1854/*
1855=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1856
1857Return true if the strings s1 and s2 differ case-insensitively, false
1858if not (if they are equal case-insensitively). If u1 is true, the
1859string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
1860the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1861are false, the respective string is assumed to be in native 8-bit
1862encoding.
1863
1864If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1865in there (they will point at the beginning of the I<next> character).
1866If the pointers behind pe1 or pe2 are non-NULL, they are the end
1867pointers beyond which scanning will not continue under any
1868circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1869s2+l2 will be used as goal end pointers that will also stop the scan,
1870and which qualify towards defining a successful match: all the scans
1871that define an explicit length must reach their goal pointers for
1872a match to succeed).
1873
1874For case-insensitiveness, the "casefolding" of Unicode is used
1875instead of upper/lowercasing both the characters, see
1876http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1877
1878=cut */
1879I32
1880Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1881{
1882 register U8 *p1 = (U8*)s1;
1883 register U8 *p2 = (U8*)s2;
1884 register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1885 register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1886 STRLEN n1 = 0, n2 = 0;
1887 U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1888 U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
1889 U8 natbuf[1+1];
1890 STRLEN foldlen1, foldlen2;
1891 bool match;
1892
1893 if (pe1)
1894 e1 = *(U8**)pe1;
1895 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (U8*)s1)))
1896 f1 = (U8*)s1 + l1;
1897 if (pe2)
1898 e2 = *(U8**)pe2;
1899 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (U8*)s2)))
1900 f2 = (U8*)s2 + l2;
1901
1902 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1903 return 1; /* mismatch; possible infinite loop or false positive */
1904
1905 if (!u1 || !u2)
1906 natbuf[1] = 0; /* Need to terminate the buffer. */
1907
1908 while ((e1 == 0 || p1 < e1) &&
1909 (f1 == 0 || p1 < f1) &&
1910 (e2 == 0 || p2 < e2) &&
1911 (f2 == 0 || p2 < f2)) {
1912 if (n1 == 0) {
1913 if (u1)
1914 to_utf8_fold(p1, foldbuf1, &foldlen1);
1915 else {
1916 natbuf[0] = *p1;
1917 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1918 }
1919 q1 = foldbuf1;
1920 n1 = foldlen1;
1921 }
1922 if (n2 == 0) {
1923 if (u2)
1924 to_utf8_fold(p2, foldbuf2, &foldlen2);
1925 else {
1926 natbuf[0] = *p2;
1927 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1928 }
1929 q2 = foldbuf2;
1930 n2 = foldlen2;
1931 }
1932 while (n1 && n2) {
1933 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1934 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1935 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
1936 return 1; /* mismatch */
1937 n1 -= UTF8SKIP(q1);
1938 q1 += UTF8SKIP(q1);
1939 n2 -= UTF8SKIP(q2);
1940 q2 += UTF8SKIP(q2);
1941 }
1942 if (n1 == 0)
1943 p1 += u1 ? UTF8SKIP(p1) : 1;
1944 if (n2 == 0)
1945 p2 += u2 ? UTF8SKIP(p2) : 1;
1946
1947 }
1948
1949 /* A match is defined by all the scans that specified
1950 * an explicit length reaching their final goals. */
1951 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
1952
1953 if (match) {
1954 if (pe1)
1955 *pe1 = (char*)p1;
1956 if (pe2)
1957 *pe2 = (char*)p2;
1958 }
1959
1960 return match ? 0 : 1; /* 0 match, 1 mismatch */
1961}
1962