3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
4 * by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
20 * as is the custom in the West, if you wish to be answered?'
21 * --Gandalf, addressing Théoden's door wardens
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
32 #define PERL_IN_UTF8_C
34 #include "inline_invlist.c"
36 static const char unees[] =
37 "Malformed UTF-8 character (unexpected end of string)";
40 =head1 Unicode Support
42 This file contains various utility functions for manipulating UTF8-encoded
43 strings. For the uninitiated, this is a method of representing arbitrary
44 Unicode characters as a variable number of bytes, in such a way that
45 characters in the ASCII range are unmodified, and a zero byte never appears
46 within non-zero characters.
52 =for apidoc is_ascii_string
54 Returns true if the first C<len> bytes of the string C<s> are the same whether
55 or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That
56 is, if they are invariant. On ASCII-ish machines, only ASCII characters
57 fit this definition, hence the function's name.
59 If C<len> is 0, it will be calculated using C<strlen(s)>.
61 See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
67 Perl_is_ascii_string(const U8 *s, STRLEN len)
69 const U8* const send = s + (len ? len : strlen((const char *)s));
72 PERL_ARGS_ASSERT_IS_ASCII_STRING;
74 for (; x < send; ++x) {
75 if (!UTF8_IS_INVARIANT(*x))
83 =for apidoc uvoffuni_to_utf8_flags
85 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
86 Instead, B<Almost all code should use L</uvchr_to_utf8> or
87 L</uvchr_to_utf8_flags>>.
89 This function is like them, but the input is a strict Unicode
90 (as opposed to native) code point. Only in very rare circumstances should code
91 not be using the native code point.
93 For details, see the description for L</uvchr_to_utf8_flags>.
99 Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
101 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
103 if (UNI_IS_INVARIANT(uv)) {
104 *d++ = (U8) LATIN1_TO_NATIVE(uv);
108 /* The first problematic code point is the first surrogate */
109 if (uv >= UNICODE_SURROGATE_FIRST
110 && ckWARN4_d(WARN_UTF8, WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR))
112 if (UNICODE_IS_SURROGATE(uv)) {
113 if (flags & UNICODE_WARN_SURROGATE) {
114 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
115 "UTF-16 surrogate U+%04"UVXf, uv);
117 if (flags & UNICODE_DISALLOW_SURROGATE) {
121 else if (UNICODE_IS_SUPER(uv)) {
122 if (flags & UNICODE_WARN_SUPER
123 || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
125 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
126 "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
128 if (flags & UNICODE_DISALLOW_SUPER
129 || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
134 else if (UNICODE_IS_NONCHAR(uv)) {
135 if (flags & UNICODE_WARN_NONCHAR) {
136 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
137 "Unicode non-character U+%04"UVXf" is illegal for open interchange",
140 if (flags & UNICODE_DISALLOW_NONCHAR) {
148 STRLEN len = OFFUNISKIP(uv);
151 *p-- = (U8) I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
152 uv >>= UTF_ACCUMULATION_SHIFT;
154 *p = (U8) I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
157 #else /* Non loop style */
159 *d++ = (U8)(( uv >> 6) | 0xc0);
160 *d++ = (U8)(( uv & 0x3f) | 0x80);
164 *d++ = (U8)(( uv >> 12) | 0xe0);
165 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
166 *d++ = (U8)(( uv & 0x3f) | 0x80);
170 *d++ = (U8)(( uv >> 18) | 0xf0);
171 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
172 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
173 *d++ = (U8)(( uv & 0x3f) | 0x80);
176 if (uv < 0x4000000) {
177 *d++ = (U8)(( uv >> 24) | 0xf8);
178 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
179 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
180 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
181 *d++ = (U8)(( uv & 0x3f) | 0x80);
184 if (uv < 0x80000000) {
185 *d++ = (U8)(( uv >> 30) | 0xfc);
186 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
187 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
188 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
189 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
190 *d++ = (U8)(( uv & 0x3f) | 0x80);
194 if (uv < UTF8_QUAD_MAX)
197 *d++ = 0xfe; /* Can't match U+FEFF! */
198 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
199 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
200 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
201 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
202 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
203 *d++ = (U8)(( uv & 0x3f) | 0x80);
208 *d++ = 0xff; /* Can't match U+FFFE! */
209 *d++ = 0x80; /* 6 Reserved bits */
210 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
211 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
212 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
213 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
214 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
215 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
216 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
217 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
218 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
219 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
220 *d++ = (U8)(( uv & 0x3f) | 0x80);
224 #endif /* Non loop style */
227 =for apidoc uvchr_to_utf8
229 Adds the UTF-8 representation of the native code point C<uv> to the end
230 of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
231 bytes available. The return value is the pointer to the byte after the
232 end of the new character. In other words,
234 d = uvchr_to_utf8(d, uv);
236 is the recommended wide native character-aware way of saying
240 This function accepts any UV as input. To forbid or warn on non-Unicode code
241 points, or those that may be problematic, see L</uvchr_to_utf8_flags>.
246 /* This is also a macro */
247 PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
250 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
252 return uvchr_to_utf8(d, uv);
256 =for apidoc uvchr_to_utf8_flags
258 Adds the UTF-8 representation of the native code point C<uv> to the end
259 of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
260 bytes available. The return value is the pointer to the byte after the
261 end of the new character. In other words,
263 d = uvchr_to_utf8_flags(d, uv, flags);
267 d = uvchr_to_utf8_flags(d, uv, 0);
269 This is the Unicode-aware way of saying
273 This function will convert to UTF-8 (and not warn) even code points that aren't
274 legal Unicode or are problematic, unless C<flags> contains one or more of the
277 If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
278 the function will raise a warning, provided UTF8 warnings are enabled. If instead
279 UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
280 If both flags are set, the function will both warn and return NULL.
282 The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
283 affect how the function handles a Unicode non-character. And likewise, the
284 UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags affect the handling of
286 above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are
287 even less portable) can be warned and/or disallowed even if other above-Unicode
288 code points are accepted, by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
291 And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
292 above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
298 /* This is also a macro */
299 PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
302 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
304 return uvchr_to_utf8_flags(d, uv, flags);
309 Tests if the first C<len> bytes of string C<s> form a valid UTF-8
310 character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a
311 valid UTF-8 character. The number of bytes in the UTF-8 character
312 will be returned if it is valid, otherwise 0.
314 This is the "slow" version as opposed to the "fast" version which is
315 the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
316 difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
317 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
318 you should use the _slow(). In practice this means that the _slow()
319 will be used very rarely, since the maximum Unicode code point (as of
320 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
321 the "Perl extended UTF-8" (e.g., the infamous 'v-strings') will encode into
325 PERL_STATIC_INLINE STRLEN
326 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
328 dTHX; /* The function called below requires thread context */
332 PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
334 utf8n_to_uvchr(s, len, &actual_len, UTF8_CHECK_ONLY);
336 return (actual_len == (STRLEN) -1) ? 0 : actual_len;
340 =for apidoc is_utf8_char_buf
342 Returns the number of bytes that comprise the first UTF-8 encoded character in
343 buffer C<buf>. C<buf_end> should point to one position beyond the end of the
344 buffer. 0 is returned if C<buf> does not point to a complete, valid UTF-8
347 Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
348 machines) is a valid UTF-8 character.
353 Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
358 PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
360 if (buf_end <= buf) {
365 if (len > UTF8SKIP(buf)) {
370 if (IS_UTF8_CHAR_FAST(len))
371 return IS_UTF8_CHAR(buf, len) ? len : 0;
372 #endif /* #ifdef IS_UTF8_CHAR */
373 return is_utf8_char_slow(buf, len);
377 =for apidoc is_utf8_char
379 Tests if some arbitrary number of bytes begins in a valid UTF-8
380 character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
381 character is a valid UTF-8 character. The actual number of bytes in the UTF-8
382 character will be returned if it is valid, otherwise 0.
384 This function is deprecated due to the possibility that malformed input could
385 cause reading beyond the end of the input buffer. Use L</is_utf8_char_buf>
391 Perl_is_utf8_char(const U8 *s)
393 PERL_ARGS_ASSERT_IS_UTF8_CHAR;
395 /* Assumes we have enough space, which is why this is deprecated */
396 return is_utf8_char_buf(s, s + UTF8SKIP(s));
401 =for apidoc is_utf8_string
403 Returns true if the first C<len> bytes of string C<s> form a valid
404 UTF-8 string, false otherwise. If C<len> is 0, it will be calculated
405 using C<strlen(s)> (which means if you use this option, that C<s> has to have a
406 terminating NUL byte). Note that all characters being ASCII constitute 'a
409 See also L</is_ascii_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
415 Perl_is_utf8_string(const U8 *s, STRLEN len)
417 const U8* const send = s + (len ? len : strlen((const char *)s));
420 PERL_ARGS_ASSERT_IS_UTF8_STRING;
423 /* Inline the easy bits of is_utf8_char() here for speed... */
424 if (UTF8_IS_INVARIANT(*x)) {
428 /* ... and call is_utf8_char() only if really needed. */
429 const STRLEN c = UTF8SKIP(x);
430 const U8* const next_char_ptr = x + c;
432 if (next_char_ptr > send) {
436 if (IS_UTF8_CHAR_FAST(c)) {
437 if (!IS_UTF8_CHAR(x, c))
440 else if (! is_utf8_char_slow(x, c)) {
451 Implemented as a macro in utf8.h
453 =for apidoc is_utf8_string_loc
455 Like L</is_utf8_string> but stores the location of the failure (in the
456 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
457 "utf8ness success") in the C<ep>.
459 See also L</is_utf8_string_loclen>() and L</is_utf8_string>().
461 =for apidoc is_utf8_string_loclen
463 Like L</is_utf8_string>() but stores the location of the failure (in the
464 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
465 "utf8ness success") in the C<ep>, and the number of UTF-8
466 encoded characters in the C<el>.
468 See also L</is_utf8_string_loc>() and L</is_utf8_string>().
474 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
476 const U8* const send = s + (len ? len : strlen((const char *)s));
481 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
484 const U8* next_char_ptr;
486 /* Inline the easy bits of is_utf8_char() here for speed... */
487 if (UTF8_IS_INVARIANT(*x))
488 next_char_ptr = x + 1;
490 /* ... and call is_utf8_char() only if really needed. */
492 next_char_ptr = c + x;
493 if (next_char_ptr > send) {
496 if (IS_UTF8_CHAR_FAST(c)) {
497 if (!IS_UTF8_CHAR(x, c))
500 c = is_utf8_char_slow(x, c);
519 =for apidoc utf8n_to_uvchr
521 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
522 Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
524 Bottom level UTF-8 decode routine.
525 Returns the native code point value of the first character in the string C<s>,
526 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
527 C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
528 the length, in bytes, of that character.
530 The value of C<flags> determines the behavior when C<s> does not point to a
531 well-formed UTF-8 character. If C<flags> is 0, when a malformation is found,
532 zero is returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) is the
533 next possible position in C<s> that could begin a non-malformed character.
534 Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
536 Various ALLOW flags can be set in C<flags> to allow (and not warn on)
537 individual types of malformations, such as the sequence being overlong (that
538 is, when there is a shorter sequence that can express the same code point;
539 overlong sequences are expressly forbidden in the UTF-8 standard due to
540 potential security issues). Another malformation example is the first byte of
541 a character not being a legal first byte. See F<utf8.h> for the list of such
542 flags. For allowed 0 length strings, this function returns 0; for allowed
543 overlong sequences, the computed code point is returned; for all other allowed
544 malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
545 determinable reasonable value.
547 The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
548 flags) malformation is found. If this flag is set, the routine assumes that
549 the caller will raise a warning, and this function will silently just set
550 C<*retlen> to C<-1> (cast to C<STRLEN>) and return zero.
552 Note that this API requires disambiguation between successfully decoding a NUL
553 character, and an error return (unless the UTF8_CHECK_ONLY flag is set), as
554 in both cases, 0 is returned. To disambiguate, upon a zero return, see if the
555 first byte of C<s> is 0 as well. If so, the input was a NUL; if not, the input
558 Certain code points are considered problematic. These are Unicode surrogates,
559 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
560 By default these are considered regular code points, but certain situations
561 warrant special handling for them. If C<flags> contains
562 UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
563 malformations and handled as such. The flags UTF8_DISALLOW_SURROGATE,
564 UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
565 maximum) can be set to disallow these categories individually.
567 The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
568 UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
569 for their respective categories, but otherwise the code points are considered
570 valid (not malformations). To get a category to both be treated as a
571 malformation and raise a warning, specify both the WARN and DISALLOW flags.
572 (But note that warnings are not raised if lexically disabled nor if
573 UTF8_CHECK_ONLY is also specified.)
575 Very large code points (above 0x7FFF_FFFF) are considered more problematic than
576 the others that are above the Unicode legal maximum. There are several
577 reasons: they require at least 32 bits to represent them on ASCII platforms, are
578 not representable at all on EBCDIC platforms, and the original UTF-8
579 specification never went above this number (the current 0x10FFFF limit was
580 imposed later). (The smaller ones, those that fit into 32 bits, are
581 representable by a UV on ASCII platforms, but not by an IV, which means that
582 the number of operations that can be performed on them is quite restricted.)
583 The UTF-8 encoding on ASCII platforms for these large code points begins with a
584 byte containing 0xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to
585 be treated as malformations, while allowing smaller above-Unicode code points.
586 (Of course UTF8_DISALLOW_SUPER will treat all above-Unicode code points,
587 including these, as malformations.) Similarly, UTF8_WARN_FE_FF acts just like
588 the other WARN flags, but applies just to these code points.
590 All other code points corresponding to Unicode characters, including private
591 use and those yet to be assigned, are never considered malformed and never
598 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
601 const U8 * const s0 = s;
602 U8 overflow_byte = '\0'; /* Save byte in case of overflow */
607 UV outlier_ret = 0; /* return value when input is in error or problematic
609 UV pack_warn = 0; /* Save result of packWARN() for later */
610 bool unexpected_non_continuation = FALSE;
611 bool overflowed = FALSE;
612 bool do_overlong_test = TRUE; /* May have to skip this test */
614 const char* const malformed_text = "Malformed UTF-8 character";
616 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
618 /* The order of malformation tests here is important. We should consume as
619 * few bytes as possible in order to not skip any valid character. This is
620 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
621 * http://unicode.org/reports/tr36 for more discussion as to why. For
622 * example, once we've done a UTF8SKIP, we can tell the expected number of
623 * bytes, and could fail right off the bat if the input parameters indicate
624 * that there are too few available. But it could be that just that first
625 * byte is garbled, and the intended character occupies fewer bytes. If we
626 * blindly assumed that the first byte is correct, and skipped based on
627 * that number, we could skip over a valid input character. So instead, we
628 * always examine the sequence byte-by-byte.
630 * We also should not consume too few bytes, otherwise someone could inject
631 * things. For example, an input could be deliberately designed to
632 * overflow, and if this code bailed out immediately upon discovering that,
633 * returning to the caller *retlen pointing to the very next byte (one
634 * which is actually part of the overflowing sequence), that could look
635 * legitimate to the caller, which could discard the initial partial
636 * sequence and process the rest, inappropriately */
638 /* Zero length strings, if allowed, of necessity are zero */
639 if (UNLIKELY(curlen == 0)) {
644 if (flags & UTF8_ALLOW_EMPTY) {
647 if (! (flags & UTF8_CHECK_ONLY)) {
648 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
653 expectlen = UTF8SKIP(s);
655 /* A well-formed UTF-8 character, as the vast majority of calls to this
656 * function will be for, has this expected length. For efficiency, set
657 * things up here to return it. It will be overridden only in those rare
658 * cases where a malformation is found */
663 /* An invariant is trivially well-formed */
664 if (UTF8_IS_INVARIANT(uv)) {
668 /* A continuation character can't start a valid sequence */
669 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
670 if (flags & UTF8_ALLOW_CONTINUATION) {
674 return UNICODE_REPLACEMENT;
677 if (! (flags & UTF8_CHECK_ONLY)) {
678 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
684 /* Here is not a continuation byte, nor an invariant. The only thing left
685 * is a start byte (possibly for an overlong) */
688 uv = NATIVE_UTF8_TO_I8(uv);
691 /* Remove the leading bits that indicate the number of bytes in the
692 * character's whole UTF-8 sequence, leaving just the bits that are part of
694 uv &= UTF_START_MASK(expectlen);
696 /* Now, loop through the remaining bytes in the character's sequence,
697 * accumulating each into the working value as we go. Be sure to not look
698 * past the end of the input string */
699 send = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
701 for (s = s0 + 1; s < send; s++) {
702 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
703 #ifndef EBCDIC /* Can't overflow in EBCDIC */
704 if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
706 /* The original implementors viewed this malformation as more
707 * serious than the others (though I, khw, don't understand
708 * why, since other malformations also give very very wrong
709 * results), so there is no way to turn off checking for it.
710 * Set a flag, but keep going in the loop, so that we absorb
711 * the rest of the bytes that comprise the character. */
713 overflow_byte = *s; /* Save for warning message's use */
716 uv = UTF8_ACCUMULATE(uv, *s);
719 /* Here, found a non-continuation before processing all expected
720 * bytes. This byte begins a new character, so quit, even if
721 * allowing this malformation. */
722 unexpected_non_continuation = TRUE;
725 } /* End of loop through the character's bytes */
727 /* Save how many bytes were actually in the character */
730 /* The loop above finds two types of malformations: non-continuation and/or
731 * overflow. The non-continuation malformation is really a too-short
732 * malformation, as it means that the current character ended before it was
733 * expected to (being terminated prematurely by the beginning of the next
734 * character), whereas in the too-short malformation there just are too few
735 * bytes available to hold the character. (In both cases, the check below
736 * that we have found the expected number of bytes would fail if executed.)
737 * Thus the non-continuation malformation is really unnecessary, being a
738 * subset of the too-short malformation. But there may be existing
739 * applications that are expecting the non-continuation type, so we retain
740 * it, and return it in preference to the too-short malformation. (If this
741 * code were being written from scratch, the two types might be collapsed
742 * into one.) I, khw, am also giving priority to returning the
743 * non-continuation and too-short malformations over overflow when multiple
744 * ones are present. I don't know of any real reason to prefer one over
745 * the other, except that it seems to me that multiple-byte errors trump
746 * errors from a single byte */
747 if (UNLIKELY(unexpected_non_continuation)) {
748 if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
749 if (! (flags & UTF8_CHECK_ONLY)) {
751 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, *s, *s0));
754 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, *s, (int) curlen, *s0, (int)expectlen));
759 uv = UNICODE_REPLACEMENT;
761 /* Skip testing for overlongs, as the REPLACEMENT may not be the same
762 * as what the original expectations were. */
763 do_overlong_test = FALSE;
768 else if (UNLIKELY(curlen < expectlen)) {
769 if (! (flags & UTF8_ALLOW_SHORT)) {
770 if (! (flags & UTF8_CHECK_ONLY)) {
771 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
775 uv = UNICODE_REPLACEMENT;
776 do_overlong_test = FALSE;
782 #ifndef EBCDIC /* EBCDIC allows FE, FF, can't overflow */
783 if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */
784 && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
786 /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
787 * generation of the sv, since no warnings are raised under CHECK */
788 if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) == UTF8_WARN_FE_FF
789 && ckWARN_d(WARN_UTF8))
791 /* This message is deliberately not of the same syntax as the other
792 * messages for malformations, for backwards compatibility in the
793 * unlikely event that code is relying on its precise earlier text
795 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s Code point beginning with byte 0x%02X is not Unicode, and not portable", malformed_text, *s0));
796 pack_warn = packWARN(WARN_UTF8);
798 if (flags & UTF8_DISALLOW_FE_FF) {
802 if (UNLIKELY(overflowed)) {
804 /* If the first byte is FF, it will overflow a 32-bit word. If the
805 * first byte is FE, it will overflow a signed 32-bit word. The
806 * above preserves backward compatibility, since its message was used
807 * in earlier versions of this code in preference to overflow */
808 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
814 && expectlen > (STRLEN) OFFUNISKIP(uv)
815 && ! (flags & UTF8_ALLOW_LONG))
817 /* The overlong malformation has lower precedence than the others.
818 * Note that if this malformation is allowed, we return the actual
819 * value, instead of the replacement character. This is because this
820 * value is actually well-defined. */
821 if (! (flags & UTF8_CHECK_ONLY)) {
822 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
827 /* Here, the input is considered to be well-formed, but could be a
828 * problematic code point that is not allowed by the input parameters. */
829 if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
830 && (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
831 |UTF8_WARN_ILLEGAL_INTERCHANGE)))
833 if (UNICODE_IS_SURROGATE(uv)) {
834 if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
835 && ckWARN2_d(WARN_UTF8, WARN_SURROGATE))
837 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
838 pack_warn = packWARN2(WARN_UTF8, WARN_SURROGATE);
840 if (flags & UTF8_DISALLOW_SURROGATE) {
844 else if ((uv > PERL_UNICODE_MAX)) {
845 if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
846 && ckWARN2_d(WARN_UTF8, WARN_NON_UNICODE))
848 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
849 pack_warn = packWARN2(WARN_UTF8, WARN_NON_UNICODE);
851 if (flags & UTF8_DISALLOW_SUPER) {
855 else if (UNICODE_IS_NONCHAR(uv)) {
856 if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
857 && ckWARN2_d(WARN_UTF8, WARN_NONCHAR))
859 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
860 pack_warn = packWARN2(WARN_UTF8, WARN_NONCHAR);
862 if (flags & UTF8_DISALLOW_NONCHAR) {
868 outlier_ret = uv; /* Note we don't bother to convert to native,
869 as all the outlier code points are the same
870 in both ASCII and EBCDIC */
874 /* Here, this is not considered a malformed character, so drop through
878 return UNI_TO_NATIVE(uv);
880 /* There are three cases which get to beyond this point. In all 3 cases:
881 * <sv> if not null points to a string to print as a warning.
882 * <curlen> is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
884 * <outlier_ret> is what return value to use if UTF8_CHECK_ONLY isn't set.
885 * This is done by initializing it to 0, and changing it only
888 * 1) The input is valid but problematic, and to be warned about. The
889 * return value is the resultant code point; <*retlen> is set to
890 * <curlen>, the number of bytes that comprise the code point.
891 * <pack_warn> contains the result of packWARN() for the warning
892 * types. The entry point for this case is the label <do_warn>;
893 * 2) The input is a valid code point but disallowed by the parameters to
894 * this function. The return value is 0. If UTF8_CHECK_ONLY is set,
895 * <*retlen> is -1; otherwise it is <curlen>, the number of bytes that
896 * comprise the code point. <pack_warn> contains the result of
897 * packWARN() for the warning types. The entry point for this case is
898 * the label <disallowed>.
899 * 3) The input is malformed. The return value is 0. If UTF8_CHECK_ONLY
900 * is set, <*retlen> is -1; otherwise it is <curlen>, the number of
901 * bytes that comprise the malformation. All such malformations are
902 * assumed to be warning type <utf8>. The entry point for this case
903 * is the label <malformed>.
908 if (sv && ckWARN_d(WARN_UTF8)) {
909 pack_warn = packWARN(WARN_UTF8);
914 if (flags & UTF8_CHECK_ONLY) {
916 *retlen = ((STRLEN) -1);
922 if (pack_warn) { /* <pack_warn> was initialized to 0, and changed only
923 if warnings are to be raised. */
924 const char * const string = SvPVX_const(sv);
927 Perl_warner(aTHX_ pack_warn, "%s in %s", string, OP_DESC(PL_op));
929 Perl_warner(aTHX_ pack_warn, "%s", string);
940 =for apidoc utf8_to_uvchr_buf
942 Returns the native code point of the first character in the string C<s> which
943 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
944 C<*retlen> will be set to the length, in bytes, of that character.
946 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
947 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
948 NULL) to -1. If those warnings are off, the computed value, if well-defined
949 (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
950 C<*retlen> is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is
951 the next possible position in C<s> that could begin a non-malformed character.
952 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
960 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
964 return utf8n_to_uvchr(s, send - s, retlen,
965 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
968 /* Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
969 * there are no malformations in the input UTF-8 string C<s>. Surrogates,
970 * non-character code points, and non-Unicode code points are allowed. A macro
971 * in utf8.h is normally used to avoid this function wrapper */
974 Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
976 UV expectlen = UTF8SKIP(s);
977 const U8* send = s + expectlen;
980 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
986 /* An invariant is trivially returned */
987 if (expectlen == 1) {
992 uv = NATIVE_UTF8_TO_I8(uv);
995 /* Remove the leading bits that indicate the number of bytes, leaving just
996 * the bits that are part of the value */
997 uv &= UTF_START_MASK(expectlen);
999 /* Now, loop through the remaining bytes, accumulating each into the
1000 * working total as we go. (I khw tried unrolling the loop for up to 4
1001 * bytes, but there was no performance improvement) */
1002 for (++s; s < send; s++) {
1003 uv = UTF8_ACCUMULATE(uv, *s);
1006 return UNI_TO_NATIVE(uv);
1011 =for apidoc utf8_to_uvchr
1013 Returns the native code point of the first character in the string C<s>
1014 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
1015 length, in bytes, of that character.
1017 Some, but not all, UTF-8 malformations are detected, and in fact, some
1018 malformed input could cause reading beyond the end of the input buffer, which
1019 is why this function is deprecated. Use L</utf8_to_uvchr_buf> instead.
1021 If C<s> points to one of the detected malformations, and UTF8 warnings are
1022 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1023 NULL) to -1. If those warnings are off, the computed value if well-defined (or
1024 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1025 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1026 next possible position in C<s> that could begin a non-malformed character.
1027 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
1033 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
1035 PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
1037 return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen);
1041 =for apidoc utf8_to_uvuni_buf
1043 Only in very rare circumstances should code need to be dealing in Unicode
1044 (as opposed to native) code points. In those few cases, use
1045 C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
1047 Returns the Unicode (not-native) code point of the first character in the string C<s> which
1049 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
1050 C<retlen> will be set to the length, in bytes, of that character.
1052 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1053 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1054 NULL) to -1. If those warnings are off, the computed value if well-defined (or
1055 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1056 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1057 next possible position in C<s> that could begin a non-malformed character.
1058 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
1064 Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1066 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
1070 /* Call the low level routine asking for checks */
1071 return NATIVE_TO_UNI(Perl_utf8n_to_uvchr(aTHX_ s, send -s, retlen,
1072 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY));
1076 * Like L</utf8_to_uvuni_buf>(), but should only be called when it is known that
1077 * there are no malformations in the input UTF-8 string C<s>. Surrogates,
1078 * non-character code points, and non-Unicode code points are allowed */
1081 Perl_valid_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
1083 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVUNI;
1085 return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));
1089 =for apidoc utf8_to_uvuni
1091 Returns the Unicode code point of the first character in the string C<s>
1092 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
1093 length, in bytes, of that character.
1095 Some, but not all, UTF-8 malformations are detected, and in fact, some
1096 malformed input could cause reading beyond the end of the input buffer, which
1097 is one reason why this function is deprecated. The other is that only in
1098 extremely limited circumstances should the Unicode versus native code point be
1099 of any interest to you. See L</utf8_to_uvuni_buf> for alternatives.
1101 If C<s> points to one of the detected malformations, and UTF8 warnings are
1102 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1103 NULL) to -1. If those warnings are off, the computed value if well-defined (or
1104 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1105 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1106 next possible position in C<s> that could begin a non-malformed character.
1107 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
1113 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
1115 PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
/* Decodes the character at <s> with valid_utf8_to_uvchr() and converts the
 * native result to a Unicode code point with NATIVE_TO_UNI().
 * NOTE(review): the pod for this function says detected malformations yield
 * zero when UTF8 warnings are enabled, but valid_utf8_to_uvchr() is described
 * as only to be called on input known to have no malformations -- confirm
 * which behavior is intended. */
1117 return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));
1121 =for apidoc utf8_length
1123 Return the length of the UTF-8 char encoded string C<s> in characters.
1124 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
1125 up past C<e>, croaks.
1131 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
1136 PERL_ARGS_ASSERT_UTF8_LENGTH;
1138 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1139 * the bitops (especially ~) can create illegal UTF-8.
1140 * In other words: in Perl UTF-8 is not just for Unicode. */
1143 goto warn_and_return;
1153 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1154 "%s in %s", unees, OP_DESC(PL_op));
1156 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
1163 =for apidoc utf8_distance
1165 Returns the number of UTF-8 characters between the UTF-8 pointers C<a> and C<b>.
1168 WARNING: use only if you *know* that the pointers point inside the same UTF-8 buffer.
1175 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1177 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1179 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1183 =for apidoc utf8_hop
1185 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1186 forward or backward.
1188 WARNING: do not use the following unless you *know* C<off> is within
1189 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1190 on the first byte of character or just after the last byte of a character.
1196 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
1198 PERL_ARGS_ASSERT_UTF8_HOP;
1200 PERL_UNUSED_CONTEXT;
1201 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1202 * the bitops (especially ~) can create illegal UTF-8.
1203 * In other words: in Perl UTF-8 is not just for Unicode. */
1212 while (UTF8_IS_CONTINUATION(*s))
1220 =for apidoc bytes_cmp_utf8
1222 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
1223 sequence of characters (stored as UTF-8) in C<u>, C<ulen>. Returns 0 if they are
1224 equal, -1 or -2 if the first string is less than the second string, +1 or +2
1225 if the first string is greater than the second string.
1227 -1 or +1 is returned if the shorter string was identical to the start of the
1228 longer string. -2 or +2 is returned if there was a difference between characters within the strings.
1235 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1237 const U8 *const bend = b + blen;
1238 const U8 *const uend = u + ulen;
1240 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
1242 PERL_UNUSED_CONTEXT;
1244 while (b < bend && u < uend) {
1246 if (!UTF8_IS_INVARIANT(c)) {
1247 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1250 if (UTF8_IS_CONTINUATION(c1)) {
1251 c = TWO_BYTE_UTF8_TO_NATIVE(c, c1);
1253 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1254 "Malformed UTF-8 character "
1255 "(unexpected non-continuation byte 0x%02x"
1256 ", immediately after start byte 0x%02x)"
1257 /* Dear diag.t, it's in the pod. */
1259 PL_op ? " in " : "",
1260 PL_op ? OP_DESC(PL_op) : "");
1265 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1266 "%s in %s", unees, OP_DESC(PL_op));
1268 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
1269 return -2; /* Really want to return undef :-) */
1276 return *b < c ? -2 : +2;
1281 if (b == bend && u == uend)
1284 return b < bend ? +1 : -1;
1288 =for apidoc utf8_to_bytes
1290 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
1291 Unlike L</bytes_to_utf8>, this over-writes the original string, and
1292 updates C<len> to contain the new length.
1293 Returns zero on failure, setting C<len> to -1.
1295 If you need a copy of the string, see L</bytes_from_utf8>.
1301 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
1303 U8 * const save = s;
1304 U8 * const send = s + *len;
1307 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
1309 /* ensure valid UTF-8 and chars < 256 before updating string */
1311 if (! UTF8_IS_INVARIANT(*s)) {
1312 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
1313 *len = ((STRLEN) -1);
1324 if (! UTF8_IS_INVARIANT(c)) {
1325 /* Then it is two-byte encoded */
1326 c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
1337 =for apidoc bytes_from_utf8
1339 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
1340 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
1341 the newly-created string, and updates C<len> to contain the new
1342 length. Returns the original string if no conversion occurs, C<len>
1343 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
1344 0 if C<s> is converted or consisted entirely of characters that are invariant
1345 in utf8 (i.e., US-ASCII on non-EBCDIC machines).
1351 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
1354 const U8 *start = s;
1358 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
1360 PERL_UNUSED_CONTEXT;
1364 /* ensure valid UTF-8 and chars < 256 before converting string */
1365 for (send = s + *len; s < send;) {
1366 if (! UTF8_IS_INVARIANT(*s)) {
1367 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
1378 Newx(d, (*len) - count + 1, U8);
1379 s = start; start = d;
1382 if (! UTF8_IS_INVARIANT(c)) {
1383 /* Then it is two-byte encoded */
1384 c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
1395 =for apidoc bytes_to_utf8
1397 Converts a string C<s> of length C<len> bytes from the native encoding into UTF-8.
1399 Returns a pointer to the newly-created string, and sets C<len> to
1400 reflect the new length in bytes.
1402 A NUL character will be written after the end of the string.
1404 If you want to convert to UTF-8 from encodings other than
1405 the native (Latin1 or EBCDIC),
1406 see L</sv_recode_to_utf8>().
1411 /* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1412 likewise need duplication. */
1415 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
1417 const U8 * const send = s + (*len);
1421 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
1422 PERL_UNUSED_CONTEXT;
1424 Newx(d, (*len) * 2 + 1, U8);
1428 append_utf8_from_native_byte(*s, &d);
1437 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
1439 * Destination must be pre-extended to 3/2 source. Do not use in-place.
1440 * We optimize for native, for obvious reasons. */
1443 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1448 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1451 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
1456 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1458 if (UNI_IS_INVARIANT(uv)) {
1459 *d++ = LATIN1_TO_NATIVE((U8) uv);
1462 if (uv <= MAX_UTF8_TWO_BYTE) {
1463 *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
1464 *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
1467 #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
1468 #define LAST_HIGH_SURROGATE 0xDBFF
1469 #define FIRST_LOW_SURROGATE 0xDC00
1470 #define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
1471 if (uv >= FIRST_HIGH_SURROGATE && uv <= LAST_HIGH_SURROGATE) {
1473 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1475 UV low = (p[0] << 8) + p[1];
1477 if (low < FIRST_LOW_SURROGATE || low > LAST_LOW_SURROGATE)
1478 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1479 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
1480 + (low - FIRST_LOW_SURROGATE) + 0x10000;
1482 } else if (uv >= FIRST_LOW_SURROGATE && uv <= LAST_LOW_SURROGATE) {
1483 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1486 d = uvoffuni_to_utf8_flags(d, uv, 0);
1489 *d++ = (U8)(( uv >> 12) | 0xe0);
1490 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1491 *d++ = (U8)(( uv & 0x3f) | 0x80);
1495 *d++ = (U8)(( uv >> 18) | 0xf0);
1496 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1497 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1498 *d++ = (U8)(( uv & 0x3f) | 0x80);
1503 *newlen = d - dstart;
1507 /* Note: this one is slightly destructive of the source. */
1510 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1513 U8* const send = s + bytelen;
1515 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1518 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1522 const U8 tmp = s[0];
1527 return utf16_to_utf8(p, d, bytelen, newlen);
1531 Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
1533 U8 tmpbuf[UTF8_MAXBYTES+1];
1534 uvchr_to_utf8(tmpbuf, c);
1535 return _is_utf8_FOO(classnum, tmpbuf);
1538 /* for now these are all defined (inefficiently) in terms of the utf8 versions.
1539 * Note that the macros in handy.h that call these short-circuit calling them
1540 * for Latin-1 range inputs */
1543 Perl_is_uni_alnum(pTHX_ UV c)
1545 U8 tmpbuf[UTF8_MAXBYTES+1];
1546 uvchr_to_utf8(tmpbuf, c);
1547 return _is_utf8_FOO(_CC_WORDCHAR, tmpbuf);
1551 Perl_is_uni_alnumc(pTHX_ UV c)
1553 U8 tmpbuf[UTF8_MAXBYTES+1];
1554 uvchr_to_utf8(tmpbuf, c);
1555 return _is_utf8_FOO(_CC_ALPHANUMERIC, tmpbuf);
1558 /* Internal function so we can deprecate the external one, and call
1559 this one from other deprecated functions in this file */
1561 PERL_STATIC_INLINE bool
1562 S_is_utf8_idfirst(pTHX_ const U8 *p)
1568 /* is_utf8_idstart would be more logical. */
1569 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1573 Perl_is_uni_idfirst(pTHX_ UV c)
1575 U8 tmpbuf[UTF8_MAXBYTES+1];
1576 uvchr_to_utf8(tmpbuf, c);
1577 return S_is_utf8_idfirst(aTHX_ tmpbuf);
1581 Perl__is_uni_perl_idcont(pTHX_ UV c)
1583 U8 tmpbuf[UTF8_MAXBYTES+1];
1584 uvchr_to_utf8(tmpbuf, c);
1585 return _is_utf8_perl_idcont(tmpbuf);
1589 Perl__is_uni_perl_idstart(pTHX_ UV c)
1591 U8 tmpbuf[UTF8_MAXBYTES+1];
1592 uvchr_to_utf8(tmpbuf, c);
1593 return _is_utf8_perl_idstart(tmpbuf);
1597 Perl_is_uni_alpha(pTHX_ UV c)
1599 U8 tmpbuf[UTF8_MAXBYTES+1];
1600 uvchr_to_utf8(tmpbuf, c);
1601 return _is_utf8_FOO(_CC_ALPHA, tmpbuf);
1605 Perl_is_uni_ascii(pTHX_ UV c)
1611 Perl_is_uni_blank(pTHX_ UV c)
1613 return isBLANK_uni(c);
1617 Perl_is_uni_space(pTHX_ UV c)
1619 return isSPACE_uni(c);
1623 Perl_is_uni_digit(pTHX_ UV c)
1625 U8 tmpbuf[UTF8_MAXBYTES+1];
1626 uvchr_to_utf8(tmpbuf, c);
1627 return _is_utf8_FOO(_CC_DIGIT, tmpbuf);
1631 Perl_is_uni_upper(pTHX_ UV c)
1633 U8 tmpbuf[UTF8_MAXBYTES+1];
1634 uvchr_to_utf8(tmpbuf, c);
1635 return _is_utf8_FOO(_CC_UPPER, tmpbuf);
1639 Perl_is_uni_lower(pTHX_ UV c)
1641 U8 tmpbuf[UTF8_MAXBYTES+1];
1642 uvchr_to_utf8(tmpbuf, c);
1643 return _is_utf8_FOO(_CC_LOWER, tmpbuf);
1647 Perl_is_uni_cntrl(pTHX_ UV c)
1649 return isCNTRL_L1(c);
1653 Perl_is_uni_graph(pTHX_ UV c)
1655 U8 tmpbuf[UTF8_MAXBYTES+1];
1656 uvchr_to_utf8(tmpbuf, c);
1657 return _is_utf8_FOO(_CC_GRAPH, tmpbuf);
1661 Perl_is_uni_print(pTHX_ UV c)
1663 U8 tmpbuf[UTF8_MAXBYTES+1];
1664 uvchr_to_utf8(tmpbuf, c);
1665 return _is_utf8_FOO(_CC_PRINT, tmpbuf);
1669 Perl_is_uni_punct(pTHX_ UV c)
1671 U8 tmpbuf[UTF8_MAXBYTES+1];
1672 uvchr_to_utf8(tmpbuf, c);
1673 return _is_utf8_FOO(_CC_PUNCT, tmpbuf);
1677 Perl_is_uni_xdigit(pTHX_ UV c)
1679 return isXDIGIT_uni(c);
1683 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
1685 /* We have the latin1-range values compiled into the core, so just use
1686 * those, converting the result to utf8. The only difference between upper
1687 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
1688 * either "SS" or "Ss". Which one to use is passed into the routine in
1689 * 'S_or_s' to avoid a test */
1691 UV converted = toUPPER_LATIN1_MOD(c);
1693 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
1695 assert(S_or_s == 'S' || S_or_s == 's');
1697 if (NATIVE_IS_INVARIANT(converted)) { /* No difference between the two for
1698 characters in this range */
1699 *p = (U8) converted;
1704 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
1705 * which it maps to one of them, so as to only have to have one check for
1706 * it in the main case */
1707 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
1709 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
1710 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
1713 converted = GREEK_CAPITAL_LETTER_MU;
1715 case LATIN_SMALL_LETTER_SHARP_S:
1721 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
1722 assert(0); /* NOTREACHED */
1726 *(p)++ = UTF8_TWO_BYTE_HI(converted);
1727 *p = UTF8_TWO_BYTE_LO(converted);
1733 /* Call the function to convert a UTF-8 encoded character to the specified case.
1734 * Note that there may be more than one character in the result.
1735 * INP is a pointer to the first byte of the input character
1736 * OUTP will be set to the first byte of the string of changed characters. It
1737 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
1738 * LENP will be set to the length in bytes of the string of changed characters
1740 * The functions return the ordinal of the first character in the string of OUTP */
1741 #define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "utf8::ToSpecUc")
1742 #define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "utf8::ToSpecTc")
1743 #define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "utf8::ToSpecLc")
1745 /* This additionally has the input parameter SPECIALS, which if non-zero will
1746 * cause this to use the SPECIALS hash for folding (meaning get full case
1747 * folding); otherwise, when zero, this implies a simple case fold */
1748 #define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "utf8::ToSpecCf" : NULL)
1751 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1755 /* Convert the Unicode character whose ordinal is <c> to its uppercase
1756 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
1757 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1758 * the changed version may be longer than the original character.
1760 * The ordinal of the first character of the changed version is returned
1761 * (but note, as explained above, that there may be more.) */
1763 PERL_ARGS_ASSERT_TO_UNI_UPPER;
1766 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
1769 uvchr_to_utf8(p, c);
1770 return CALL_UPPER_CASE(p, p, lenp);
1774 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1778 PERL_ARGS_ASSERT_TO_UNI_TITLE;
1781 return _to_upper_title_latin1((U8) c, p, lenp, 's');
1784 uvchr_to_utf8(p, c);
1785 return CALL_TITLE_CASE(p, p, lenp);
1789 S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
1791 /* We have the latin1-range values compiled into the core, so just use
1792 * those, converting the result to utf8. Since the result is always just
1793 * one character, we allow <p> to be NULL */
1795 U8 converted = toLOWER_LATIN1(c);
1798 if (NATIVE_IS_INVARIANT(converted)) {
1803 *p = UTF8_TWO_BYTE_HI(converted);
1804 *(p+1) = UTF8_TWO_BYTE_LO(converted);
1812 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1816 PERL_ARGS_ASSERT_TO_UNI_LOWER;
1819 return to_lower_latin1((U8) c, p, lenp);
1822 uvchr_to_utf8(p, c);
1823 return CALL_LOWER_CASE(p, p, lenp);
1827 Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
1829 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1830 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
1831 * FOLD_FLAGS_FULL iff full folding is to be used;
1833 * Not to be used for locale folds
1838 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
1840 assert (! (flags & FOLD_FLAGS_LOCALE));
1842 if (c == MICRO_SIGN) {
1843 converted = GREEK_SMALL_LETTER_MU;
1845 else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) {
1847 /* If can't cross 127/128 boundary, can't return "ss"; instead return
1848 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
1849 * under those circumstances. */
1850 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
1851 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
1852 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
1854 return LATIN_SMALL_LETTER_LONG_S;
1863 else { /* In this range the fold of all other characters is their lower
1865 converted = toLOWER_LATIN1(c);
1868 if (NATIVE_IS_INVARIANT(converted)) {
1869 *p = (U8) converted;
1873 *(p)++ = UTF8_TWO_BYTE_HI(converted);
1874 *p = UTF8_TWO_BYTE_LO(converted);
1882 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags)
1885 /* Not currently externally documented, and subject to change
1886 * <flags> bits meanings:
1887 * FOLD_FLAGS_FULL iff full folding is to be used;
1888 * FOLD_FLAGS_LOCALE iff in locale
1889 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
1892 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
1895 UV result = _to_fold_latin1((U8) c, p, lenp,
1896 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
1897 /* It is illegal for the fold to cross the 255/256 boundary under
1898 * locale; in this case return the original. Any result above 255
1899 * (including 256 itself) has crossed that boundary. */
1899 return (result > 255 && (flags & FOLD_FLAGS_LOCALE))
1904 /* If no special needs, just use the macro */
1905 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
1906 uvchr_to_utf8(p, c);
1907 return CALL_FOLD_CASE(p, p, lenp, flags & FOLD_FLAGS_FULL);
1909 else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
1910 the special flags. */
1911 U8 utf8_c[UTF8_MAXBYTES + 1];
1912 uvchr_to_utf8(utf8_c, c);
1913 return _to_utf8_fold_flags(utf8_c, p, lenp, flags, NULL);
1918 Perl_is_uni_alnum_lc(pTHX_ UV c)
1921 return isALNUM_LC(c);
1923 return _is_uni_FOO(_CC_WORDCHAR, c);
1927 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1930 return isALPHANUMERIC_LC(c);
1932 return _is_uni_FOO(_CC_ALPHANUMERIC, c);
1936 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1939 return isIDFIRST_LC(c);
1941 return _is_uni_perl_idstart(c);
1945 Perl_is_uni_alpha_lc(pTHX_ UV c)
1948 return isALPHA_LC(c);
1950 return _is_uni_FOO(_CC_ALPHA, c);
1954 Perl_is_uni_ascii_lc(pTHX_ UV c)
1957 return isASCII_LC(c);
1963 Perl_is_uni_blank_lc(pTHX_ UV c)
1966 return isBLANK_LC(c);
1968 return isBLANK_uni(c);
1972 Perl_is_uni_space_lc(pTHX_ UV c)
1975 return isSPACE_LC(c);
1977 return isSPACE_uni(c);
1981 Perl_is_uni_digit_lc(pTHX_ UV c)
1984 return isDIGIT_LC(c);
1986 return _is_uni_FOO(_CC_DIGIT, c);
1990 Perl_is_uni_upper_lc(pTHX_ UV c)
1993 return isUPPER_LC(c);
1995 return _is_uni_FOO(_CC_UPPER, c);
1999 Perl_is_uni_lower_lc(pTHX_ UV c)
2002 return isLOWER_LC(c);
2004 return _is_uni_FOO(_CC_LOWER, c);
2008 Perl_is_uni_cntrl_lc(pTHX_ UV c)
2011 return isCNTRL_LC(c);
2017 Perl_is_uni_graph_lc(pTHX_ UV c)
2020 return isGRAPH_LC(c);
2022 return _is_uni_FOO(_CC_GRAPH, c);
2026 Perl_is_uni_print_lc(pTHX_ UV c)
2029 return isPRINT_LC(c);
2031 return _is_uni_FOO(_CC_PRINT, c);
2035 Perl_is_uni_punct_lc(pTHX_ UV c)
2038 return isPUNCT_LC(c);
2040 return _is_uni_FOO(_CC_PUNCT, c);
2044 Perl_is_uni_xdigit_lc(pTHX_ UV c)
2047 return isXDIGIT_LC(c);
2049 return isXDIGIT_uni(c);
2053 Perl_to_uni_upper_lc(pTHX_ U32 c)
2055 /* XXX returns only the first character -- do not use XXX */
2056 /* XXX no locale support yet */
2058 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
2059 return (U32)to_uni_upper(c, tmpbuf, &len);
2063 Perl_to_uni_title_lc(pTHX_ U32 c)
2065 /* XXX returns only the first character XXX -- do not use XXX */
2066 /* XXX no locale support yet */
2068 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
2069 return (U32)to_uni_title(c, tmpbuf, &len);
2073 Perl_to_uni_lower_lc(pTHX_ U32 c)
2075 /* XXX returns only the first character -- do not use XXX */
2076 /* XXX no locale support yet */
2078 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
2079 return (U32)to_uni_lower(c, tmpbuf, &len);
2082 PERL_STATIC_INLINE bool
2083 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
2084 const char *const swashname)
2086 /* returns a boolean giving whether or not the UTF8-encoded character that
2087 * starts at <p> is in the swash indicated by <swashname>. <swash>
2088 * contains a pointer to where the swash indicated by <swashname>
2089 * is to be stored; which this routine will do, so that future calls will
2090 * look at <*swash> and only generate a swash if it is not null
2092 * Note that it is assumed that the buffer length of <p> is enough to
2093 * contain all the bytes that comprise the character. Thus, <*p> should
2094 * have been checked before this call for mal-formedness enough to assure
2099 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
2101 /* The API should have included a length for the UTF-8 character in <p>,
2102 * but it doesn't. We therefore assume that p has been validated at least
2103 * as far as there being enough bytes available in it to accommodate the
2104 * character without reading beyond the end, and pass that number on to the
2105 * validating routine */
2106 if (! is_utf8_char_buf(p, p + UTF8SKIP(p))) {
2107 if (ckWARN_d(WARN_UTF8)) {
2108 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
2109 "Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
2110 if (ckWARN(WARN_UTF8)) { /* This will output details as to
2111 what the malformation is */
2112 utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
/* Build the swash for <swashname> and store it through <swash> */
2118 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2119 *swash = _core_swash_init("utf8", swashname, &PL_sv_undef, 1, 0, NULL, &flags);
/* A non-zero fetch result means the character is in the swash */
2122 return swash_fetch(*swash, p, TRUE) != 0;
2126 Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
2130 PERL_ARGS_ASSERT__IS_UTF8_FOO;
2132 assert(classnum < _FIRST_NON_SWASH_CC);
2134 return is_utf8_common(p, &PL_utf8_swash_ptrs[classnum], swash_property_names[classnum]);
2138 Perl_is_utf8_alnum(pTHX_ const U8 *p)
2142 PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
2144 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
2145 * descendant of isalnum(3), in other words, it doesn't
2146 * contain the '_'. --jhi */
2147 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_WORDCHAR], "IsWord");
2151 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
2155 PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
2157 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_ALPHANUMERIC], "IsAlnum");
2161 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
2165 PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
2167 return S_is_utf8_idfirst(aTHX_ p);
2171 Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */
2175 PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST;
2179 /* is_utf8_idstart would be more logical. */
2180 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart");
2184 Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
2188 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
2190 return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart");
2194 Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
2198 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
2200 return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont");
2205 Perl_is_utf8_idcont(pTHX_ const U8 *p)
2209 PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
2211 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
2215 Perl_is_utf8_xidcont(pTHX_ const U8 *p)
2219 PERL_ARGS_ASSERT_IS_UTF8_XIDCONT;
/* Returns whether the UTF-8 encoded character at <p> has the XIdContinue
 * property. The swash is cached in its own slot: is_utf8_idcont() caches
 * "IdContinue" in PL_utf8_idcont, and is_utf8_common() reuses whatever swash
 * is already stored in the slot it is handed, so sharing that slot would let
 * whichever property was loaded first answer for both. */
2221 return is_utf8_common(p, &PL_utf8_xidcont, "XIdContinue");
2225 Perl_is_utf8_alpha(pTHX_ const U8 *p)
2229 PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
2231 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_ALPHA], "IsAlpha");
2235 Perl_is_utf8_ascii(pTHX_ const U8 *p)
2239 PERL_ARGS_ASSERT_IS_UTF8_ASCII;
2241 /* ASCII characters are the same whether in utf8 or not. So the macro
2242 * works on both utf8 and non-utf8 representations. */
2247 Perl_is_utf8_blank(pTHX_ const U8 *p)
2251 PERL_ARGS_ASSERT_IS_UTF8_BLANK;
2253 return isBLANK_utf8(p);
2257 Perl_is_utf8_space(pTHX_ const U8 *p)
2261 PERL_ARGS_ASSERT_IS_UTF8_SPACE;
2263 return isSPACE_utf8(p);
2267 Perl_is_utf8_perl_space(pTHX_ const U8 *p)
2271 PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
2273 /* Only true if is an ASCII space-like character, and ASCII is invariant
2274 * under utf8, so can just use the macro */
2275 return isSPACE_A(*p);
2279 Perl_is_utf8_perl_word(pTHX_ const U8 *p)
2283 PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
2285 /* Only true if is an ASCII word character, and ASCII is invariant
2286 * under utf8, so can just use the macro */
2287 return isWORDCHAR_A(*p);
2291 Perl_is_utf8_digit(pTHX_ const U8 *p)
2295 PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
2297 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_DIGIT], "IsDigit");
2301 Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
2305 PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
2307 /* Only true if is an ASCII digit character, and ASCII is invariant
2308 * under utf8, so can just use the macro */
2309 return isDIGIT_A(*p);
2313 Perl_is_utf8_upper(pTHX_ const U8 *p)
2317 PERL_ARGS_ASSERT_IS_UTF8_UPPER;
2319 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_UPPER], "IsUppercase");
2323 Perl_is_utf8_lower(pTHX_ const U8 *p)
2327 PERL_ARGS_ASSERT_IS_UTF8_LOWER;
2329 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_LOWER], "IsLowercase");
2333 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
2337 PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
2339 return isCNTRL_utf8(p);
2343 Perl_is_utf8_graph(pTHX_ const U8 *p)
2347 PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
2349 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_GRAPH], "IsGraph");
2353 Perl_is_utf8_print(pTHX_ const U8 *p)
2357 PERL_ARGS_ASSERT_IS_UTF8_PRINT;
2359 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_PRINT], "IsPrint");
2363 Perl_is_utf8_punct(pTHX_ const U8 *p)
2367 PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
2369 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_PUNCT], "IsPunct");
2373 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
2377 PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
2379 return is_XDIGIT_utf8(p);
2383 Perl__is_utf8_mark(pTHX_ const U8 *p)
2387 PERL_ARGS_ASSERT__IS_UTF8_MARK;
2389 return is_utf8_common(p, &PL_utf8_mark, "IsM");
2394 Perl_is_utf8_mark(pTHX_ const U8 *p)
2398 PERL_ARGS_ASSERT_IS_UTF8_MARK;
2400 return _is_utf8_mark(p);
2404 =for apidoc to_utf8_case
2406 The C<p> contains the pointer to the UTF-8 string encoding
2407 the character that is being converted. This routine assumes that the character
2408 at C<p> is well-formed.
2410 The C<ustrp> is a pointer to the character buffer to put the
2411 conversion result to. The C<lenp> is a pointer to the length of the result.
2414 The C<swashp> is a pointer to the swash to use.
2416 Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
2417 and loaded by SWASHNEW, using F<lib/utf8_heavy.pl>. The C<special> (usually,
2418 but not always, a multicharacter mapping), is tried first.
2420 The C<special> is a string like "utf8::ToSpecLower", which means the
2421 hash %utf8::ToSpecLower. The access to the hash is through
2422 Perl_to_utf8_case().
2424 The C<normal> is a string like "ToLower" which means the swash %utf8::ToLower.
2430 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
2431 SV **swashp, const char *normal, const char *special)
/* Decode the input character; it is assumed to be well-formed, and its
 * length is not needed here */
2435 const UV uv1 = valid_utf8_to_uvchr(p, NULL);
2437 PERL_ARGS_ASSERT_TO_UTF8_CASE;
2439 /* Note that swash_fetch() doesn't output warnings for these because it
2440 * assumes we will */
2441 if (uv1 >= UNICODE_SURROGATE_FIRST) {
2442 if (uv1 <= UNICODE_SURROGATE_LAST) {
2443 if (ckWARN_d(WARN_SURROGATE)) {
2444 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2445 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
2446 "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
2449 else if (UNICODE_IS_SUPER(uv1)) {
2450 if (ckWARN_d(WARN_NON_UNICODE)) {
2451 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2452 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2453 "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
2457 /* Note that non-characters are perfectly legal, so no warning should
2461 if (!*swashp) /* load on-demand */
2462 *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
2465 /* It might be "special" (sometimes, but not always,
2466 * a multicharacter mapping) */
2467 HV * const hv = get_hv(special, 0);
2471 (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE)) &&
2475 s = SvPV_const(*svp, len);
2478 len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
2480 Copy(s, ustrp, len, U8);
2485 if (!len && *swashp) {
2486 const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is utf8 */);
2489 /* It was "normal" (a single character mapping). */
2490 len = uvchr_to_utf8(ustrp, uv2) - ustrp;
/* Return the first code point of the converted string; the length is not
 * wanted, so pass a NULL pointer (not a literal 0), as is done when
 * decoding <p> above */
2498 return valid_utf8_to_uvchr(ustrp, NULL);
2501 /* Here, there was no mapping defined, which means that the code point maps
2502 * to itself. Return the inputs */
2504 if (p != ustrp) { /* Don't copy onto itself */
2505 Copy(p, ustrp, len, U8);
2516 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
2518 /* This is called when changing the case of a utf8-encoded character above
2519 * the Latin1 range, and the operation is in locale. If the result
2520 * contains a character that crosses the 255/256 boundary, disallow the
2521 * change, and return the original code point. See L<perlfunc/lc> for why;
2523 * p points to the original string whose case was changed; assumed
2524 * by this routine to be well-formed
2525 * result the code point of the first character in the changed-case string
2526 * ustrp points to the changed-case string (<result> represents its first char)
2527 * lenp points to the length of <ustrp> */
/* When the change is disallowed, the bytes at <p> are copied back over
 * <ustrp>, using the length that valid_utf8_to_uvchr(p, lenp) stored in
 * *lenp (presumably the byte length of the original character -- confirm
 * against that helper). */
2529 UV original; /* To store the first code point of <p> */
2531 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
2533 assert(UTF8_IS_ABOVE_LATIN1(*p));
2535 /* We know immediately if the first character in the string crosses the
2536 * boundary, so can skip */
2539 /* Look at every character in the result; if any cross the
2540 * boundary, the whole thing is disallowed */
2541 U8* s = ustrp + UTF8SKIP(ustrp);
2542 U8* e = ustrp + *lenp;
2544 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
2550 /* Here, no characters crossed, result is ok as-is */
2556 /* Failed, have to return the original */
2557 original = valid_utf8_to_uvchr(p, lenp);
2558 Copy(p, ustrp, *lenp, char);
2563 =for apidoc to_utf8_upper
2565 Instead use L</toUPPER_utf8>.
2569 /* Not currently externally documented, and subject to change:
2570 * <flags> is set iff locale semantics are to be used for code points < 256
2571 * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2572 * were used in the calculation; otherwise unchanged. */
/* The first byte of <p> selects one of three paths: an invariant byte, a
 * two-byte sequence that downgrades to a single native byte, or a character
 * above 255.  The first two use toUPPER_LC() or _to_upper_title_latin1();
 * the last uses CALL_UPPER_CASE(), whose result may then be vetted by
 * check_locale_boundary_crossing().  A result obtained from locale rules is
 * re-encoded into <ustrp> as one or two UTF-8 bytes. */
2575 Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
2581 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
2583 if (UTF8_IS_INVARIANT(*p)) {
2585 result = toUPPER_LC(*p);
2588 return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
2591 else if (UTF8_IS_DOWNGRADEABLE_START(*p)) {
2593 result = toUPPER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
2596 return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
2600 else { /* utf8, ord above 255 */
2601 result = CALL_UPPER_CASE(p, ustrp, lenp);
2604 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2609 /* Here, used locale rules. Convert back to utf8 */
2610 if (UTF8_IS_INVARIANT(result)) {
2611 *ustrp = (U8) result;
2615 *ustrp = UTF8_EIGHT_BIT_HI(result);
2616 *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2621 *tainted_ptr = TRUE;
2627 =for apidoc to_utf8_title
2629 Instead use L</toTITLE_utf8>.
2633 /* Not currently externally documented, and subject to change:
2634 * <flags> is set iff locale semantics are to be used for code points < 256
2635 * Since titlecase is not defined in POSIX, uppercase is used instead
2637 * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2638 * were used in the calculation; otherwise unchanged. */
/* The first byte of <p> selects one of three paths: an invariant byte, a
 * two-byte sequence that downgrades to a single native byte, or a character
 * above 255.  The first two use toUPPER_LC() or _to_upper_title_latin1()
 * (called here with 's'); the last uses CALL_TITLE_CASE(), whose result may
 * then be vetted by check_locale_boundary_crossing().  A result obtained
 * from locale rules is re-encoded into <ustrp> as one or two UTF-8 bytes. */
2641 Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
2647 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
2649 if (UTF8_IS_INVARIANT(*p)) {
2651 result = toUPPER_LC(*p);
2654 return _to_upper_title_latin1(*p, ustrp, lenp, 's');
2657 else if (UTF8_IS_DOWNGRADEABLE_START(*p)) {
2659 result = toUPPER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
2662 return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
2666 else { /* utf8, ord above 255 */
2667 result = CALL_TITLE_CASE(p, ustrp, lenp);
2670 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2675 /* Here, used locale rules. Convert back to utf8 */
2676 if (UTF8_IS_INVARIANT(result)) {
2677 *ustrp = (U8) result;
2681 *ustrp = UTF8_EIGHT_BIT_HI(result);
2682 *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2687 *tainted_ptr = TRUE;
2693 =for apidoc to_utf8_lower
2695 Instead use L</toLOWER_utf8>.
2699 /* Not currently externally documented, and subject to change:
2700 * <flags> is set iff locale semantics are to be used for code points < 256
2701 * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2702 * were used in the calculation; otherwise unchanged. */
/* The first byte of <p> selects one of three paths: an invariant byte, a
 * two-byte sequence that downgrades to a single native byte, or a character
 * above 255.  The first two use toLOWER_LC() or to_lower_latin1(); the last
 * uses CALL_LOWER_CASE(), whose result may then be vetted by
 * check_locale_boundary_crossing().  A result obtained from locale rules is
 * re-encoded into <ustrp> as one or two UTF-8 bytes. */
2705 Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
2711 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
2713 if (UTF8_IS_INVARIANT(*p)) {
2715 result = toLOWER_LC(*p);
2718 return to_lower_latin1(*p, ustrp, lenp);
2721 else if (UTF8_IS_DOWNGRADEABLE_START(*p)) {
2723 result = toLOWER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
2726 return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
2730 else { /* utf8, ord above 255 */
2731 result = CALL_LOWER_CASE(p, ustrp, lenp);
2734 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2740 /* Here, used locale rules. Convert back to utf8 */
2741 if (UTF8_IS_INVARIANT(result)) {
2742 *ustrp = (U8) result;
2746 *ustrp = UTF8_EIGHT_BIT_HI(result);
2747 *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2752 *tainted_ptr = TRUE;
2758 =for apidoc to_utf8_fold
2760 Instead use L</toFOLD_utf8>.
2764 /* Not currently externally documented, and subject to change,
2766 * bit FOLD_FLAGS_LOCALE is set iff locale semantics are to be used for code
2767 * points < 256. Since foldcase is not defined in
2768 * POSIX, lowercase is used instead
2769 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
2770 * otherwise simple folds
2771 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
2773 * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2774 * were used in the calculation; otherwise unchanged. */
/* The first byte of <p> selects one of three paths: an invariant byte, a
 * two-byte sequence that downgrades to a single native byte, or a character
 * above 255.  The first two use toFOLD_LC() or _to_fold_latin1(); the last
 * uses CALL_FOLD_CASE() and then post-processes the result according to
 * <flags>.  <p> and <ustrp> must be different buffers (asserted below). */
2777 Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, bool* tainted_ptr)
2783 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
2785 /* These are mutually exclusive */
2786 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
2788 assert(p != ustrp); /* Otherwise overwrites */
2790 if (UTF8_IS_INVARIANT(*p)) {
2791 if (flags & FOLD_FLAGS_LOCALE) {
2792 result = toFOLD_LC(*p);
2795 return _to_fold_latin1(*p, ustrp, lenp,
2796 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
2799 else if (UTF8_IS_DOWNGRADEABLE_START(*p)) {
2800 if (flags & FOLD_FLAGS_LOCALE) {
2801 result = toFOLD_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
2804 return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
2806 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
2809 else { /* utf8, ord above 255 */
2810 result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
2812 if (flags & FOLD_FLAGS_LOCALE) {
2814 /* Special case these characters, as what normally gets returned
2815 * under locale doesn't work */
2816 if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1
2817 && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8,
2818 sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1))
/* The length must come from the _UTF8 string constant: the name without
 * that suffix is the code point itself, so its sizeof is unrelated to the
 * encoded length. */
2822 else if (UTF8SKIP(p) == sizeof(LATIN_SMALL_LIGATURE_LONG_S_T_UTF8) - 1
2823 && memEQ((char *) p, LATIN_SMALL_LIGATURE_LONG_S_T_UTF8,
2824 sizeof(LATIN_SMALL_LIGATURE_LONG_S_T_UTF8) - 1))
2826 goto return_ligature_st;
2828 return check_locale_boundary_crossing(p, result, ustrp, lenp);
2830 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
2834 /* This is called when changing the case of a utf8-encoded
2835 * character above the ASCII range, and the result should not
2836 * contain an ASCII character. */
2838 UV original; /* To store the first code point of <p> */
2840 /* Look at every character in the result; if any cross the
2841 * boundary, the whole thing is disallowed */
2843 U8* e = ustrp + *lenp;
2846 /* Crossed, have to return the original */
2847 original = valid_utf8_to_uvchr(p, lenp);
2849 /* But in these instances, there is an alternative we can
2850 * return that is valid */
2851 if (original == LATIN_CAPITAL_LETTER_SHARP_S
2852 || original == LATIN_SMALL_LETTER_SHARP_S)
2856 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
2857 goto return_ligature_st;
2859 Copy(p, ustrp, *lenp, char);
2865 /* Here, no characters crossed, result is ok as-is */
2870 /* Here, used locale rules. Convert back to utf8 */
2871 if (UTF8_IS_INVARIANT(result)) {
2872 *ustrp = (U8) result;
2876 *ustrp = UTF8_EIGHT_BIT_HI(result);
2877 *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2882 *tainted_ptr = TRUE;
2887 /* Certain folds to 'ss' are prohibited by the options, but they do allow
2888 * folds to a string of two of these characters. By returning this
2889 * instead, then, e.g.,
2890 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
2893 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2894 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2896 return LATIN_SMALL_LETTER_LONG_S;
2899 /* Two folds to 'st' are prohibited by the options; instead we pick one and
2900 * have the other one fold to it */
2902 *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
2903 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
2904 return LATIN_SMALL_LIGATURE_ST;
2908 * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
2909 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
2910 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
/* Public entry point for obtaining a swash.  pkg, name, listsv, minbits and
 * none are handed unchanged to _core_swash_init(), with NULL for its invlist
 * and flags_p arguments; the result is duplicated with newSVsv(). */
2914 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
2916 PERL_ARGS_ASSERT_SWASH_INIT;
2918 /* Returns a copy of a swash initiated by the called function. This is the
2919 * public interface, and returning a copy prevents others from doing
2920 * mischief on the original */
2922 return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
2926 Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
2928 /* Initialize and return a swash, creating it if necessary. It does this
2929 * by calling utf8_heavy.pl in the general case. The returned value may be
2930 * the swash's inversion list instead if the input parameters allow it.
2931 * Which is returned should be immaterial to callers, as the only
2932 * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
2933 * and swash_to_invlist() handle both these transparently.
2935 * This interface should only be used by functions that won't destroy or
2936 * adversely change the swash, as doing so affects all other uses of the
2937 * swash in the program; the general public should use 'Perl_swash_init'
2940 * pkg is the name of the package that <name> should be in.
2941 * name is the name of the swash to find. Typically it is a Unicode
2942 * property name, including user-defined ones
2943 * listsv is a string to initialize the swash with. It must be of the form
2944 * documented as the subroutine return value in
2945 * L<perlunicode/User-Defined Character Properties>
2946 * minbits is the number of bits required to represent each data element.
2947 * It is '1' for binary properties.
2948 * none I (khw) do not understand this one, but it is used only in tr///.
2949 * invlist is an inversion list to initialize the swash with (or NULL)
2950 * flags_p if non-NULL is the address of various input and output flag bits
2951 * to the routine, as follows: ('I' means is input to the routine;
2952 * 'O' means output from the routine. Only flags marked O are
2953 * meaningful on return.)
2954 * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
2955 * came from a user-defined property. (I O)
2956 * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
2957 * when the swash cannot be located, to simply return NULL. (I)
2958 * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
2959 * return of an inversion list instead of a swash hash if this routine
2960 * thinks that would result in faster execution of swash_fetch() later
2963 * Thus there are three possible inputs to find the swash: <name>,
2964 * <listsv>, and <invlist>. At least one must be specified. The result
2965 * will be the union of the specified ones, although <listsv>'s various
2966 * actions can intersect, etc. what <name> gives.
2968 * <invlist> is only valid for binary properties */
2971 SV* retval = &PL_sv_undef;
2972 HV* swash_hv = NULL;
2973 const int invlist_swash_boundary =
2974 (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
2975 ? 512 /* Based on some benchmarking, but not extensive, see commit
2977 : -1; /* Never return just an inversion list */
2979 assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
2980 assert(! invlist || minbits == 1);
2982 /* If data was passed in to go out to utf8_heavy to find the swash of, do
2984 if (listsv != &PL_sv_undef || strNE(name, "")) {
2986 const size_t pkg_len = strlen(pkg);
2987 const size_t name_len = strlen(name);
2988 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
2992 PERL_ARGS_ASSERT__CORE_SWASH_INIT;
2994 PUSHSTACKi(PERLSI_MAGIC);
2998 /* We might get here via a subroutine signature which uses a utf8
2999 * parameter name, at which point PL_subname will have been set
3000 * but not yet used. */
3001 save_item(PL_subname);
3002 if (PL_parser && PL_parser->error_count)
3003 SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
3004 method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
3005 if (!method) { /* demand load utf8 */
3007 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3008 GvSV(PL_errgv) = NULL;
3009 /* It is assumed that callers of this routine are not passing in
3010 * any user derived data. */
3011 /* Need to do this after save_re_context() as it will set
3012 * PL_tainted to 1 while saving $1 etc (see the code after getrx:
3013 * in Perl_magic_get). Even the line that creates errsv_save can turn on
3015 #ifndef NO_TAINT_SUPPORT
3016 SAVEBOOL(TAINT_get);
3019 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
3022 /* Not ERRSV, as there is no need to vivify a scalar we are
3023 about to discard. */
3024 SV * const errsv = GvSV(PL_errgv);
3025 if (!SvTRUE(errsv)) {
3026 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3027 SvREFCNT_dec(errsv);
3035 mPUSHp(pkg, pkg_len);
3036 mPUSHp(name, name_len);
3041 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3042 GvSV(PL_errgv) = NULL;
3043 /* If we already have a pointer to the method, no need to use
3044 * call_method() to repeat the lookup. */
3046 ? call_sv(MUTABLE_SV(method), G_SCALAR)
3047 : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
3049 retval = *PL_stack_sp--;
3050 SvREFCNT_inc(retval);
3053 /* Not ERRSV. See above. */
3054 SV * const errsv = GvSV(PL_errgv);
3055 if (!SvTRUE(errsv)) {
3056 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3057 SvREFCNT_dec(errsv);
3062 if (IN_PERL_COMPILETIME) {
3063 CopHINTS_set(PL_curcop, PL_hints);
3065 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
3068 /* If caller wants to handle missing properties, let them */
3069 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
3073 "Can't find Unicode property definition \"%"SVf"\"",
3075 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
3077 } /* End of calling the module to find the swash */
3079 /* If this operation fetched a swash, and we will need it later, get it */
3080 if (retval != &PL_sv_undef
3081 && (minbits == 1 || (flags_p
3083 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
3085 swash_hv = MUTABLE_HV(SvRV(retval));
3087 /* If we don't already know that there is a user-defined component to
3088 * this swash, and the user has indicated they wish to know if there is
3089 * one (by passing <flags_p>), find out */
3090 if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
3091 SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
3092 if (user_defined && SvUV(*user_defined)) {
3093 *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
3098 /* Make sure there is an inversion list for binary properties */
3100 SV** swash_invlistsvp = NULL;
3101 SV* swash_invlist = NULL;
3102 bool invlist_in_swash_is_valid = FALSE;
3103 bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
3104 an unclaimed reference count */
3106 /* If this operation fetched a swash, get its already existing
3107 * inversion list, or create one for it */
3110 swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
3111 if (swash_invlistsvp) {
3112 swash_invlist = *swash_invlistsvp;
3113 invlist_in_swash_is_valid = TRUE;
3116 swash_invlist = _swash_to_invlist(retval);
3117 swash_invlist_unclaimed = TRUE;
3121 /* If an inversion list was passed in, have to include it */
3124 /* Any fetched swash will by now have an inversion list in it;
3125 * otherwise <swash_invlist> will be NULL, indicating that we
3126 * didn't fetch a swash */
3127 if (swash_invlist) {
3129 /* Add the passed-in inversion list, which invalidates the one
3130 * already stored in the swash */
3131 invlist_in_swash_is_valid = FALSE;
3132 _invlist_union(invlist, swash_invlist, &swash_invlist);
3136 /* Here, there is no swash already. Set up a minimal one, if
3137 * we are going to return a swash */
3138 if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
3140 retval = newRV_noinc(MUTABLE_SV(swash_hv));
3142 swash_invlist = invlist;
3146 /* Here, we have computed the union of all the passed-in data. It may
3147 * be that there was an inversion list in the swash which didn't get
3148 * touched; otherwise save the computed one */
3149 if (! invlist_in_swash_is_valid
3150 && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
3152 if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
3154 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3156 /* We just stole a reference count. */
3157 if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
3158 else SvREFCNT_inc_simple_void_NN(swash_invlist);
3161 /* Use the inversion list stand-alone if small enough */
3162 if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
3163 SvREFCNT_dec(retval);
3164 if (!swash_invlist_unclaimed)
3165 SvREFCNT_inc_simple_void_NN(swash_invlist);
3166 retval = newRV_noinc(swash_invlist);
3174 /* This API is wrong for special case conversions since we may need to
3175 * return several Unicode characters for a single Unicode character
3176 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
3177 * the lower-level routine, and it is similarly broken for returning
3178 * multiple values. --jhi
3179 * For those, you should use to_utf8_case() instead */
3180 /* Now SWASHGET is recast as S_swatch_get in this file. */
3183 * Returns the value of property/mapping C<swash> for the first character
3184 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
3185 * assumed to be in well-formed utf8. If C<do_utf8> is false, the string C<ptr>
3186 * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
3188 * A "swash" is a hash which contains initially the keys/values set up by
3189 * SWASHNEW. The purpose is to be able to completely represent a Unicode
3190 * property for all possible code points. Things are stored in a compact form
3191 * (see utf8_heavy.pl) so that calculation is required to find the actual
3192 * property value for a given code point. As code points are looked up, new
3193 * key/value pairs are added to the hash, so that the calculation doesn't have
3194 * to ever be re-done. Further, each calculation is done, not just for the
3195 * desired one, but for a whole block of code points adjacent to that one.
3196 * For binary properties on ASCII machines, the block is usually for 64 code
3197 * points, starting with a code point evenly divisible by 64. Thus if the
3198 * property value for code point 257 is requested, the code goes out and
3199 * calculates the property values for all 64 code points between 256 and 319,
3200 * and stores these as a single 64-bit long bit vector, called a "swatch",
3201 * under the key for code point 256. The key is the UTF-8 encoding for code
3202 * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
3203 * for a code point is 13 bytes, the key will be 12 bytes long. If the value
3204 * for code point 258 is then requested, this code realizes that it would be
3205 * stored under the key for 256, and would find that value and extract the
3206 * relevant bit, offset from 256.
3208 * Non-binary properties are stored in as many bits as necessary to represent
3209 * their values (32 currently, though the code is more general than that), not
3210 * as single bits, but the principle is the same: the value for each key is a
3211 * vector that encompasses the property values for all code points whose UTF-8
3212 * representations are represented by the key. That is, for all code points
3213 * whose UTF-8 representations are length N bytes, and the key is the first N-1
/* Look up the swash value for the first character of ptr.  If the swash is
 * really an inversion list rather than a hash, membership is tested
 * directly.  Otherwise the swatch covering the character is taken from the
 * single-entry PL_last_swash_* cache, else from the swash hash, else it is
 * built by swatch_get() and stored in the hash; the value is then extracted
 * according to the swatch's per-entry bit width. */
3217 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
3220 HV *const hv = MUTABLE_HV(SvRV(swash));
3225 const U8 *tmps = NULL;
3230 PERL_ARGS_ASSERT_SWASH_FETCH;
3232 /* If it really isn't a hash, it isn't really swash; must be an inversion
3234 if (SvTYPE(hv) != SVt_PVHV) {
3235 return _invlist_contains_cp((SV*)hv,
3237 ? valid_utf8_to_uvchr(ptr, NULL)
3241 /* We store the values in a "swatch" which is a vec() value in a swash
3242 * hash. Code points 0-255 are a single vec() stored with key length
3243 * (klen) 0. All other code points have a UTF-8 representation
3244 * 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
3245 * share 0xAA..0xYY, which is the key in the hash to that vec. So the key
3246 * length for them is the length of the encoded char - 1. ptr[klen] is the
3247 * final byte in the sequence representing the character */
3248 if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
3253 else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
3256 off = TWO_BYTE_UTF8_TO_NATIVE(c, *(ptr + 1));
3259 klen = UTF8SKIP(ptr) - 1;
3261 /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
3262 * the vec is the final byte in the sequence. (In EBCDIC this is
3263 * converted to I8 to get consecutive values.) To help you visualize
3265 * Straight 1047 After final byte
3266 * UTF-8 UTF-EBCDIC I8 transform
3267 * U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
3268 * U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
3270 * U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
3271 * U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
3273 * U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
3274 * U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
3276 * U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
3277 * U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
3279 * U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
3280 * U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
3282 * (There are no discontinuities in the elided (...) entries.)
3283 * The UTF-8 key for these 33 code points is '\xD0' (which also is the
3284 * key for the next 31, up through U+043F, whose UTF-8 final byte is
3285 * \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
3286 * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
3287 * index into the vec() swatch (after subtracting 0x80, which we
3288 * actually do with an '&').
3289 * In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
3290 * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
3291 * discontinuities which go away by transforming it into I8, and we
3292 * effectively subtract 0xA0 to get the index. */
3293 needents = (1 << UTF_ACCUMULATION_SHIFT);
3294 off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
3298 * This single-entry cache saves about 1/3 of the utf8 overhead in test
3299 * suite. (That is, only 7-8% overall over just a hash cache. Still,
3300 * it's nothing to sniff at.) Pity we usually come through at least
3301 * two function calls to get here...
3303 * NB: this code assumes that swatches are never modified, once generated!
3306 if (hv == PL_last_swash_hv &&
3307 klen == PL_last_swash_klen &&
3308 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
3310 tmps = PL_last_swash_tmps;
3311 slen = PL_last_swash_slen;
3314 /* Try our second-level swatch cache, kept in a hash. */
3315 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
3317 /* If not cached, generate it via swatch_get */
3318 if (!svp || !SvPOK(*svp)
3319 || !(tmps = (const U8*)SvPV_const(*svp, slen)))
3322 const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
3323 swatch = swatch_get(swash,
3324 code_point & ~((UV)needents - 1),
3327 else { /* For the first 256 code points, the swatch has a key of
3329 swatch = swatch_get(swash, 0, needents);
3332 if (IN_PERL_COMPILETIME)
3333 CopHINTS_set(PL_curcop, PL_hints);
3335 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
3337 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
3338 || (slen << 3) < needents)
3339 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
3340 "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
3341 svp, tmps, (UV)slen, (UV)needents);
3344 PL_last_swash_hv = hv;
3345 assert(klen <= sizeof(PL_last_swash_key));
3346 PL_last_swash_klen = (U8)klen;
3347 /* FIXME change interpvar.h? */
3348 PL_last_swash_tmps = (U8 *) tmps;
3349 PL_last_swash_slen = slen;
3351 Copy(ptr, PL_last_swash_key, klen, U8);
3354 switch ((int)((slen << 3) / needents)) {
3356 bit = 1 << (off & 7);
3358 return (tmps[off] & bit) != 0;
3363 return (tmps[off] << 8) + tmps[off + 1] ;
/* Widen each byte to UV before shifting: a U8 promotes to int, and shifting
 * a byte >= 0x80 left by 24 would overflow that int. */
3366 return ((UV)tmps[off] << 24) + ((UV)tmps[off+1] << 16) + ((UV)tmps[off+2] << 8) + tmps[off + 3] ;
3368 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
3369 "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
3370 NORETURN_FUNCTION_END;
3373 /* Read a single line of the main body of the swash input text. These are of
3376 * where each number is hex. The first two numbers form the minimum and
3377 * maximum of a range, and the third is the value associated with the range.
3378 * Not all swashes should have a third number
3380 * On input: l points to the beginning of the line to be examined; it points
3381 * to somewhere in the string of the whole input text, and is
3382 * terminated by a \n or the null string terminator.
3383 * lend points to the null terminator of that string
3384 * wants_value is non-zero if the swash expects a third number
3385 * typestr is the name of the swash's mapping, like 'ToLower'
3386 * On output: *min, *max, and *val are set to the values read from the line.
3387 * returns a pointer just beyond the line examined. If there was no
3388 * valid min number on the line, returns lend+1
/* Parse one line of a swash's list text beginning at l: a hex range minimum,
 * an optional hex range maximum and, when wants_value is set, a mapped
 * value (read in base 10 via Strtol for the 'To...' mapping tables, in hex
 * via grok_hex otherwise).  Results are stored through min, max and val. */
3392 S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
3393 const bool wants_value, const U8* const typestr)
3395 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
3396 STRLEN numlen; /* Length of the number */
3397 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3398 | PERL_SCAN_DISALLOW_PREFIX
3399 | PERL_SCAN_SILENT_NON_PORTABLE;
3401 /* nl points to the next \n in the scan */
3402 U8* const nl = (U8*)memchr(l, '\n', lend - l);
3404 /* Get the first number on the line: the range minimum */
3406 *min = grok_hex((char *)l, &numlen, &flags, NULL);
3407 if (numlen) /* If found a hex number, position past it */
3409 else if (nl) { /* Else, go handle next line, if any */
3410 return nl + 1; /* 1 is length of "\n" */
3412 else { /* Else, no next line */
3413 return lend + 1; /* to LIST's end at which \n is not found */
3416 /* The max range value follows, separated by a BLANK */
3419 flags = PERL_SCAN_SILENT_ILLDIGIT
3420 | PERL_SCAN_DISALLOW_PREFIX
3421 | PERL_SCAN_SILENT_NON_PORTABLE;
3423 *max = grok_hex((char *)l, &numlen, &flags, NULL);
3426 else /* If no value here, it is a single element range */
3429 /* Non-binary tables have a third entry: what the first element of the
3435 /* The ToLc, etc table mappings are not in hex, and must be
3436 * corrected by adding the code point to them */
3438 char *after_strtol = (char *) lend;
3439 *val = Strtol((char *)l, &after_strtol, 10);
3440 l = (U8 *) after_strtol;
3442 else { /* Other tables are in hex, and are the correct result
3444 flags = PERL_SCAN_SILENT_ILLDIGIT
3445 | PERL_SCAN_DISALLOW_PREFIX
3446 | PERL_SCAN_SILENT_NON_PORTABLE;
3448 *val = grok_hex((char *)l, &numlen, &flags, NULL);
3458 /* diag_listed_as: To%s: illegal mapping '%s' */
3459 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
3465 *val = 0; /* bits == 1, then any val should be ignored */
3467 else { /* Nothing following range min, should be single element with no
3473 /* diag_listed_as: To%s: illegal mapping '%s' */
3474 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
3478 *val = 0; /* bits == 1, then val should be ignored */
3481 /* Position to next line if any, or EOF */
3491 * Returns a swatch (a bit vector string) for a code point sequence
3492 * that starts from the value C<start> and comprises the number C<span>.
3493 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
3494 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
3497 S_swatch_get(pTHX_ SV* swash, UV start, UV span)
3500 U8 *l, *lend, *x, *xend, *s, *send;
3501 STRLEN lcur, xcur, scur;
3502 HV *const hv = MUTABLE_HV(SvRV(swash));
3503 SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
3505 SV** listsvp = NULL; /* The string containing the main body of the table */
3506 SV** extssvp = NULL;
3507 SV** invert_it_svp = NULL;
3510 STRLEN octets; /* if bits == 1, then octets == 0 */
3512 UV end = start + span;
3514 if (invlistsvp == NULL) {
3515 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3516 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3517 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3518 extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3519 listsvp = hv_fetchs(hv, "LIST", FALSE);
3520 invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3522 bits = SvUV(*bitssvp);
3523 none = SvUV(*nonesvp);
3524 typestr = (U8*)SvPV_nolen(*typesvp);
3530 octets = bits >> 3; /* if bits == 1, then octets == 0 */
3532 PERL_ARGS_ASSERT_SWATCH_GET;
3534 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
3535 Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf,
3539 /* If overflowed, use the max possible */
3545 /* create and initialize $swatch */
3546 scur = octets ? (span * octets) : (span + 7) / 8;
3547 swatch = newSV(scur);
3549 s = (U8*)SvPVX(swatch);
3550 if (octets && none) {
3551 const U8* const e = s + scur;
3554 *s++ = (U8)(none & 0xff);
3555 else if (bits == 16) {
3556 *s++ = (U8)((none >> 8) & 0xff);
3557 *s++ = (U8)( none & 0xff);
3559 else if (bits == 32) {
3560 *s++ = (U8)((none >> 24) & 0xff);
3561 *s++ = (U8)((none >> 16) & 0xff);
3562 *s++ = (U8)((none >> 8) & 0xff);
3563 *s++ = (U8)( none & 0xff);
3569 (void)memzero((U8*)s, scur + 1);
3571 SvCUR_set(swatch, scur);
3572 s = (U8*)SvPVX(swatch);
3574 if (invlistsvp) { /* If has an inversion list set up use that */
3575 _invlist_populate_swatch(*invlistsvp, start, end, s);
3579 /* read $swash->{LIST} */
3580 l = (U8*)SvPV(*listsvp, lcur);
3583 UV min, max, val, upper;
3584 l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
3585 cBOOL(octets), typestr);
3590 /* If looking for something beyond this range, go try the next one */
3594 /* <end> is generally 1 beyond where we want to set things, but at the
3595 * platform's infinity, where we can't go any higher, we want to
3596 * include the code point at <end> */
3599 : (max != UV_MAX || end != UV_MAX)
3606 if (!none || val < none) {
3611 for (key = min; key <= upper; key++) {
3613 /* offset must be non-negative (start <= min <= key < end) */
3614 offset = octets * (key - start);
3616 s[offset] = (U8)(val & 0xff);
3617 else if (bits == 16) {
3618 s[offset ] = (U8)((val >> 8) & 0xff);
3619 s[offset + 1] = (U8)( val & 0xff);
3621 else if (bits == 32) {
3622 s[offset ] = (U8)((val >> 24) & 0xff);
3623 s[offset + 1] = (U8)((val >> 16) & 0xff);
3624 s[offset + 2] = (U8)((val >> 8) & 0xff);
3625 s[offset + 3] = (U8)( val & 0xff);
3628 if (!none || val < none)
3632 else { /* bits == 1, then val should be ignored */
3637 for (key = min; key <= upper; key++) {
3638 const STRLEN offset = (STRLEN)(key - start);
3639 s[offset >> 3] |= 1 << (offset & 7);
3644 /* Invert if the data says it should be. Assumes that bits == 1 */
3645 if (invert_it_svp && SvUV(*invert_it_svp)) {
3647 /* Unicode properties should come with all bits above PERL_UNICODE_MAX
3648 * being 0, and their inversion should also be 0, as we don't succeed any
3649 * Unicode property matches for non-Unicode code points */
3650 if (start <= PERL_UNICODE_MAX) {
3652 /* The code below assumes that we never cross the
3653 * Unicode/above-Unicode boundary in a range, as otherwise we would
3654 * have to figure out where to stop flipping the bits. Since this
3655 * boundary is divisible by a large power of 2, and swatches come
3656 * in small powers of 2, this should be a valid assumption */
3657 assert(start + span - 1 <= PERL_UNICODE_MAX);
3667 /* read $swash->{EXTRAS}
3668 * This code also copied to swash_to_invlist() below */
3669 x = (U8*)SvPV(*extssvp, xcur);
3677 SV **otherbitssvp, *other;
3681 const U8 opc = *x++;
3685 nl = (U8*)memchr(x, '\n', xend - x);
3687 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
3689 x = nl + 1; /* 1 is length of "\n" */
3693 x = xend; /* to EXTRAS' end at which \n is not found */
3700 namelen = nl - namestr;
3704 namelen = xend - namestr;
3708 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
3709 otherhv = MUTABLE_HV(SvRV(*othersvp));
3710 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
3711 otherbits = (STRLEN)SvUV(*otherbitssvp);
3712 if (bits < otherbits)
3713 Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
3714 "bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits);
3716 /* The "other" swatch must be destroyed after. */
3717 other = swatch_get(*othersvp, start, span);
3718 o = (U8*)SvPV(other, olen);
3721 Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
3723 s = (U8*)SvPV(swatch, slen);
3724 if (bits == 1 && otherbits == 1) {
3726 Perl_croak(aTHX_ "panic: swatch_get found swatch length "
3727 "mismatch, slen=%"UVuf", olen=%"UVuf,
3728 (UV)slen, (UV)olen);
3752 STRLEN otheroctets = otherbits >> 3;
3754 U8* const send = s + slen;
3759 if (otherbits == 1) {
3760 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
3764 STRLEN vlen = otheroctets;
3772 if (opc == '+' && otherval)
3773 NOOP; /* replace with otherval */
3774 else if (opc == '!' && !otherval)
3776 else if (opc == '-' && otherval)
3778 else if (opc == '&' && !otherval)
3781 s += octets; /* no replacement */
3786 *s++ = (U8)( otherval & 0xff);
3787 else if (bits == 16) {
3788 *s++ = (U8)((otherval >> 8) & 0xff);
3789 *s++ = (U8)( otherval & 0xff);
3791 else if (bits == 32) {
3792 *s++ = (U8)((otherval >> 24) & 0xff);
3793 *s++ = (U8)((otherval >> 16) & 0xff);
3794 *s++ = (U8)((otherval >> 8) & 0xff);
3795 *s++ = (U8)( otherval & 0xff);
3799 sv_free(other); /* through with it! */
/* Purpose: build, from a swash (a reference to a hash describing a mapping
 * table), a hash that gives, for each mapped-to code point, the list of code
 * points that map to it.
 *
 * Input: 'swash' is dereferenced with SvRV() and its LIST, TYPE, BITS, NONE
 * and (optionally) SPECIALS entries are read.
 * Errors: croaks with a "panic:" message when BITS is not 8, 16 or 32, when a
 * SPECIALS value is not a string, when an hv_store()/av_fetch() call fails,
 * or when a SPECIALS-derived key is already present in the result.
 * Result: keys are the UTF-8 encoding of a code point.  The elements pushed
 * onto each list by the code visible here are created with newSVuv(), i.e.
 * they are code points held as UVs (the older description below speaks of
 * "the utf8 for" each element).
 *
 * NOTE(review): several short lines of this function (some declarations,
 * closing braces and the final return) are not present in this listing;
 * confirm control flow against the complete source before changing it. */
3805 Perl__swash_inversion_hash(pTHX_ SV* const swash)
3808 /* Subject to change or removal. For use only in regcomp.c and regexec.c
3809 * Can't be used on a property that is subject to user override, as it
3810 * relies on the value of SPECIALS in the swash which would be set by
3811 * utf8_heavy.pl to the hash in the non-overridden file, and hence is not set
3812 * for overridden properties
3814 * Returns a hash which is the inversion and closure of a swash mapping.
3815 * For example, consider the input lines:
3820 * The returned hash would have two keys, the utf8 for 006B and the utf8 for
3821 * 006C. The value for each key is an array. For 006C, the array would
3822 * have two elements, the utf8 for itself, and for 004C. For 006B, there
3823 * would be three elements in its array, the utf8 for 006B, 004B and 212A.
3825 * Essentially, for any code point, it gives all the code points that map to
3826 * it, or the list of 'froms' for that point.
3828 * Currently it ignores any additions or deletions from other swashes,
3829 * looking at just the main body of the swash, and if there are SPECIALS
3830 * in the swash, at that hash
3832 * The specials hash can be extra code points, and most likely consists of
3833 * maps from single code points to multiple ones (each expressed as a string
3834 * of utf8 characters). This function currently returns only 1-1 mappings.
3835 * However consider this possible input in the specials hash:
3836 * "\xEF\xAC\x85" => "\x{0073}\x{0074}", # U+FB05 => 0073 0074
3837 * "\xEF\xAC\x86" => "\x{0073}\x{0074}", # U+FB06 => 0073 0074
3839 * Both FB05 and FB06 map to the same multi-char sequence, which we don't
3840 * currently handle. But it also means that FB05 and FB06 are equivalent in
3841 * a 1-1 mapping which we should handle, and this relationship may not be in
3842 * the main table. Therefore this function examines all the multi-char
3843 * sequences and adds the 1-1 mappings that come out of that. */
3847 HV *const hv = MUTABLE_HV(SvRV(swash));
3849 /* The string containing the main body of the table. This will have its
3850 * assertion fail if the swash has been converted to its inversion list */
3851 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
3853 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3854 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3855 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3856 /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
3857 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
3858 const STRLEN bits = SvUV(*bitssvp);
3859 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
3860 const UV none = SvUV(*nonesvp);
3861 SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
3865 PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
3867 /* Must have at least 8 bits to get the mappings */
3868 if (bits != 8 && bits != 16 && bits != 32) {
3869 Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
3873 if (specials_p) { /* It might be "special" (sometimes, but not always, a
3874 mapping to more than one character) */
3876 /* Construct an inverse mapping hash for the specials */
3877 HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
3878 HV * specials_inverse = newHV();
3879 char *char_from; /* the lhs of the map */
3880 I32 from_len; /* its byte length */
3881 char *char_to; /* the rhs of the map */
3882 I32 to_len; /* its byte length */
3883 SV *sv_to; /* and in a sv */
3884 AV* from_list; /* list of things that map to each 'to' */
3886 hv_iterinit(specials_hv);
3888 /* The keys are the characters (in utf8) that map to the corresponding
3889 * utf8 string value. Iterate through the list creating the inverse
3891 while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
3893 if (! SvPOK(sv_to)) {
3894 Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
3895 "unexpectedly is not a string, flags=%lu",
3896 (unsigned long)SvFLAGS(sv_to));
3898 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
3900 /* Each key in the inverse list is a mapped-to value, and the key's
3901 * hash value is a list of the strings (each in utf8) that map to
3902 * it. Those strings are all one character long */
3903 if ((listp = hv_fetch(specials_inverse,
3907 from_list = (AV*) *listp;
3909 else { /* No entry yet for it: create one */
3910 from_list = newAV();
3911 if (! hv_store(specials_inverse,
3914 (SV*) from_list, 0))
3916 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3920 /* Here have the list associated with this 'to' (perhaps newly
3921 * created and empty). Just add to it. Note that we ASSUME that
3922 * the input is guaranteed to not have duplications, so we don't
3923 * check for that. Duplications just slow down execution time. */
3924 av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
3927 /* Here, 'specials_inverse' contains the inverse mapping. Go through
3928 * it looking for cases like the FB05/FB06 examples above. There would
3929 * be an entry in the hash like
3930 * 'st' => [ FB05, FB06 ]
3931 * In this example we will create two lists that get stored in the
3932 * returned hash, 'ret':
3933 * FB05 => [ FB05, FB06 ]
3934 * FB06 => [ FB05, FB06 ]
3936 * Note that there is nothing to do if the array only has one element.
3937 * (In the normal 1-1 case handled below, we don't have to worry about
3938 * two lists, as everything gets tied to the single list that is
3939 * generated for the single character 'to'. But here, we are omitting
3940 * that list, ('st' in the example), so must have multiple lists.) */
3941 while ((from_list = (AV *) hv_iternextsv(specials_inverse,
3942 &char_to, &to_len)))
3944 if (av_len(from_list) > 0) {
3947 /* We iterate over all combinations of i,j to place each code
3948 * point on each list */
3949 for (i = 0; i <= av_len(from_list); i++) {
3951 AV* i_list = newAV();
3952 SV** entryp = av_fetch(from_list, i, FALSE);
3953 if (entryp == NULL) {
3954 Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3956 if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) {
3957 Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp));
3959 if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp),
3960 (SV*) i_list, FALSE))
3962 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3965 /* For debugging: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
3966 for (j = 0; j <= av_len(from_list); j++) {
3967 entryp = av_fetch(from_list, j, FALSE);
3968 if (entryp == NULL) {
3969 Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3972 /* When i==j this adds itself to the list */
3973 av_push(i_list, newSVuv(utf8_to_uvchr_buf(
3974 (U8*) SvPVX(*entryp),
3975 (U8*) SvPVX(*entryp) + SvCUR(*entryp),
3977 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
3982 SvREFCNT_dec(specials_inverse); /* done with it */
3983 } /* End of specials */
3985 /* read $swash->{LIST} */
3986 l = (U8*)SvPV(*listsvp, lcur);
3989 /* Go through each input line */
3993 l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
3994 cBOOL(octets), typestr);
3999 /* Each element in the range is to be inverted */
4000 for (inverse = min; inverse <= max; inverse++) {
4004 bool found_key = FALSE;
4005 bool found_inverse = FALSE;
4007 /* The key is the inverse mapping */
4008 char key[UTF8_MAXBYTES+1];
4009 char* key_end = (char *) uvchr_to_utf8((U8*) key, val);
4010 STRLEN key_len = key_end - key;
4012 /* Get the list for the map */
4013 if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
4014 list = (AV*) *listp;
4016 else { /* No entry yet for it: create one */
4018 if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
4019 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
4023 /* Look through list to see if this inverse mapping already is
4024 * listed, or if there is a mapping to itself already */
4025 for (i = 0; i <= av_len(list); i++) {
4026 SV** entryp = av_fetch(list, i, FALSE);
4028 if (entryp == NULL) {
4029 Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
4032 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, SvUV(entry)));*/
4033 if (SvUV(entry) == val) {
4036 if (SvUV(entry) == inverse) {
4037 found_inverse = TRUE;
4040 /* No need to continue searching if found everything we are
4042 if (found_key && found_inverse) {
4047 /* Make sure there is a mapping to itself on the list */
4049 av_push(list, newSVuv(val));
4050 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, val, val));*/
4054 /* Simply add the value to the list */
4055 if (! found_inverse) {
4056 av_push(list, newSVuv(inverse));
4057 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, inverse, val));*/
4060 /* swatch_get() increments the value of val for each element in the
4061 * range. That makes more compact tables possible. You can
4062 * express the capitalization, for example, of all consecutive
4063 * letters with a single line: 0061\t007A\t0041 This maps 0061 to
4064 * 0041, 0062 to 0042, etc. I (khw) have never understood 'none',
4065 * and it's not documented; it appears to be used only in
4066 * implementing tr//; I copied the semantics from swatch_get(), just
4068 if (!none || val < none) {
/* Purpose: convert a swash into an inversion list SV.
 *
 * If the thing 'swash' references is not a hash, it is taken to already be
 * the inversion list and is returned with its reference count incremented.
 * Otherwise the LIST, TYPE, BITS, EXTRAS and INVERT_IT entries are read: the
 * ranges parsed from LIST are added to a new inversion list, the list is
 * inverted when INVERT_IT is true, and each EXTRAS line names another swash
 * that is converted by a recursive call and combined with the result.
 * While processing EXTRAS, croaks with a "panic:" message unless both
 * swashes have BITS == 1.
 *
 * NOTE(review): the code that selects between the union / complemented
 * union / subtract / intersection calls for the '+', '!', '-' and '&'
 * opcodes is not present in this listing; that pairing is presumed from the
 * order of the calls and should be confirmed against the complete source. */
4078 Perl__swash_to_invlist(pTHX_ SV* const swash)
4081 /* Subject to change or removal. For use only in one place in regcomp.c.
4082 * Ownership is given to one reference count in the returned SV* */
4087 HV *const hv = MUTABLE_HV(SvRV(swash));
4088 UV elements = 0; /* Number of elements in the inversion list */
4098 STRLEN octets; /* if bits == 1, then octets == 0 */
4104 PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
4106 /* If not a hash, it must be the swash's inversion list instead */
4107 if (SvTYPE(hv) != SVt_PVHV) {
4108 return SvREFCNT_inc_simple_NN((SV*) hv);
4111 /* The string containing the main body of the table */
4112 listsvp = hv_fetchs(hv, "LIST", FALSE);
4113 typesvp = hv_fetchs(hv, "TYPE", FALSE);
4114 bitssvp = hv_fetchs(hv, "BITS", FALSE);
4115 extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
4116 invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
4118 typestr = (U8*)SvPV_nolen(*typesvp);
4119 bits = SvUV(*bitssvp);
4120 octets = bits >> 3; /* if bits == 1, then octets == 0 */
4122 /* read $swash->{LIST} */
4123 if (SvPOK(*listsvp)) {
4124 l = (U8*)SvPV(*listsvp, lcur);
4127 /* LIST legitimately doesn't contain a string during compilation phases
4128 * of Perl itself, before the Unicode tables are generated. In this
4129 * case, just fake things up by creating an empty list */
4136 /* Scan the input to count the number of lines to preallocate array size
4137 * based on worst possible case, which is each line in the input creates 2
4138 * elements in the inversion list: 1) the beginning of a range in the list;
4139 * 2) the beginning of a range not in the list. */
4140 while ((loc = (strchr(loc, '\n'))) != NULL) {
4145 /* If the ending is somehow corrupt and isn't a new line, add another
4146 * element for the final range that isn't in the inversion list */
4147 if (! (*lend == '\n'
4148 || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
4153 invlist = _new_invlist(elements);
4155 /* Now go through the input again, adding each range to the list */
4158 UV val; /* Not used by this function */
4160 l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
4161 cBOOL(octets), typestr);
4167 invlist = _add_range_to_invlist(invlist, start, end);
4170 /* Invert if the data says it should be */
4171 if (invert_it_svp && SvUV(*invert_it_svp)) {
4172 _invlist_invert_prop(invlist);
4175 /* This code is copied from swatch_get()
4176 * read $swash->{EXTRAS} */
4177 x = (U8*)SvPV(*extssvp, xcur);
4185 SV **otherbitssvp, *other;
4188 const U8 opc = *x++;
4192 nl = (U8*)memchr(x, '\n', xend - x);
4194 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
4196 x = nl + 1; /* 1 is length of "\n" */
4200 x = xend; /* to EXTRAS' end at which \n is not found */
4207 namelen = nl - namestr;
4211 namelen = xend - namestr;
4215 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
4216 otherhv = MUTABLE_HV(SvRV(*othersvp));
4217 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
4218 otherbits = (STRLEN)SvUV(*otherbitssvp);
4220 if (bits != otherbits || bits != 1) {
4221 Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
4222 "properties, bits=%"UVuf", otherbits=%"UVuf,
4223 (UV)bits, (UV)otherbits);
4226 /* The "other" swatch must be destroyed after. */
4227 other = _swash_to_invlist((SV *)*othersvp);
4229 /* End of code copied from swatch_get() */
4232 _invlist_union(invlist, other, &invlist);
4235 _invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist);
4238 _invlist_subtract(invlist, other, &invlist);
4241 _invlist_intersection(invlist, other, &invlist);
4246 sv_free(other); /* through with it! */
/* Purpose (presumed from the name -- confirm): obtain the inversion list
 * that belongs to a swash.
 * The code visible here first tests SvROK(swash), then whether the referent
 * is a hash, and otherwise fetches the "V" entry of that hash.
 * NOTE(review): the declarations, the value returned on each path and the
 * closing brace are not present in this listing; confirm them against the
 * complete source before relying on this description. */
4253 Perl__get_swash_invlist(pTHX_ SV* const swash)
4257 PERL_ARGS_ASSERT__GET_SWASH_INVLIST;
4259 if (! SvROK(swash)) {
4263 /* If it really isn't a hash, it isn't really swash; must be an inversion
4265 if (SvTYPE(SvRV(swash)) != SVt_PVHV) {
4269 ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE);
/* Purpose: examine the 'len' bytes of UTF-8 starting at 's' and warn about
 * code points that are problematic to output.
 *
 * Warnings raised by the visible code (each subject to its warning category):
 *   - WARN_UTF8: a character's UTF8SKIP() length is greater than 'len'
 *     (message built from 'unees' and the current op description, or
 *     "print" when there is no current op)
 *   - WARN_NON_UNICODE: the character satisfies UTF8_IS_SUPER()
 *   - WARN_SURROGATE: the character satisfies UTF8_IS_SURROGATE()
 *   - WARN_NONCHAR: the character is a non-character code point
 * The three code-point checks are only attempted when the first byte is at
 * least UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE.
 *
 * NOTE(review): the truncation test compares UTF8SKIP(s) with 'len', and
 * nothing visible in this listing decrements 'len' as 's' advances; confirm
 * against the complete source whether the remaining length (e - s) was
 * intended.  The loop header and return statements are not present here. */
4278 Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
4280 /* May change: warns if surrogates, non-character code points, or
4281 * non-Unicode code points are in s which has length len bytes. Returns
4282 * TRUE if none found; FALSE otherwise. The only other validity check is
4283 * to make sure that this won't exceed the string's length */
4285 const U8* const e = s + len;
4288 PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
4291 if (UTF8SKIP(s) > len) {
4292 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
4293 "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
4296 if (UNLIKELY(*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE)) {
4298 if (UTF8_IS_SUPER(s)) {
4299 if (ckWARN_d(WARN_NON_UNICODE)) {
4300 UV uv = utf8_to_uvchr_buf(s, e, &char_len);
4301 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
4302 "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
4306 else if (UTF8_IS_SURROGATE(s)) {
4307 if (ckWARN_d(WARN_SURROGATE)) {
4308 UV uv = utf8_to_uvchr_buf(s, e, &char_len);
4309 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
4310 "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
4315 ((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
4316 && (ckWARN_d(WARN_NONCHAR)))
4318 UV uv = utf8_to_uvchr_buf(s, e, &char_len);
4319 Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
4320 "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
4331 =for apidoc pv_uni_display
4333 Build into the scalar C<dsv> a displayable version of the string C<spv>,
4334 length C<len>, the displayable version being at most C<pvlim> bytes long
4335 (if longer, the rest is truncated and "..." will be appended).
4337 The C<flags> argument can have UNI_DISPLAY_ISPRINT set to display
4338 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
4339 to display the \\[nrfta\\] as the backslashed versions (like '\n')
4340 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
4341 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
4342 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
4344 The pointer to the PV of the C<dsv> is returned.
/* Purpose: write into 'dsv' a displayable rendering of the 'len' bytes of
 * UTF-8 text at 'spv'.
 *
 * The loop walks the input one character at a time, advancing by
 * UTF8SKIP(), and decodes each character with utf8_to_uvchr_buf().  Output
 * is added to 'dsv' with the sv_cat* functions:
 *   - with UNI_DISPLAY_BACKSLASH, a recognised character is written as a
 *     backslash followed by the single character held in 'ok'
 *   - with UNI_DISPLAY_ISPRINT, a character not handled above that is
 *     isPRINT() is written as itself
 *   - the "\x{...}" form writes the code point in hexadecimal
 * A non-zero 'pvlim' is tested against SvCUR(dsv) at the top of each
 * iteration.
 *
 * NOTE(review): the body of the pvlim test, the table of backslash escapes,
 * the condition guarding the trailing "..." and the return statement are not
 * present in this listing; confirm them against the complete source. */
4348 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
4353 PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
4357 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
4359 /* This serves double duty as a flag and a character to print after
4360 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
4364 if (pvlim && SvCUR(dsv) >= pvlim) {
4368 u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
4370 const unsigned char c = (unsigned char)u & 0xFF;
4371 if (flags & UNI_DISPLAY_BACKSLASH) {
4388 const char string = ok;
4389 sv_catpvs(dsv, "\\");
4390 sv_catpvn(dsv, &string, 1);
4393 /* isPRINT() is the locale-blind version. */
4394 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
4395 const char string = c;
4396 sv_catpvn(dsv, &string, 1);
4401 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
4404 sv_catpvs(dsv, "...");
4410 =for apidoc sv_uni_display
4412 Build into the scalar C<dsv> a displayable version of the scalar C<sv>,
4413 the displayable version being at most C<pvlim> bytes long
4414 (if longer, the rest is truncated and "..." will be appended).
4416 The C<flags> argument is as in L</pv_uni_display>().
4418 The pointer to the PV of the C<dsv> is returned.
/* Purpose: SV-based front end to Perl_pv_uni_display().
 * Picks the string buffer of 'ssv' -- RX_WRAPPED() when 'ssv' is a REGEXP,
 * SvPVX_const() otherwise -- and passes it, together with SvCUR(ssv) as the
 * byte length and the caller's 'dsv', 'pvlim' and 'flags', to
 * Perl_pv_uni_display(), returning whatever that call returns. */
4423 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
4425 const char * const ptr =
4426 isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
4428 PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
4430 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
4431 SvCUR(ssv), pvlim, flags);
4435 =for apidoc foldEQ_utf8
4437 Returns true if the leading portions of the strings C<s1> and C<s2> (either or both
4438 of which may be in UTF-8) are the same case-insensitively; false otherwise.
4439 How far into the strings to compare is determined by other input parameters.
4441 If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
4442 otherwise it is assumed to be in native 8-bit encoding. Correspondingly for C<u2>
4443 with respect to C<s2>.
4445 If the byte length C<l1> is non-zero, it says how far into C<s1> to check for fold
4446 equality. In other words, C<s1>+C<l1> will be used as a goal to reach. The
4447 scan will not be considered to be a match unless the goal is reached, and
4448 scanning won't continue past that goal. Correspondingly for C<l2> with respect to C<s2>.
4451 If C<pe1> is non-NULL and the pointer it points to is not NULL, that pointer is
4452 considered an end pointer to the position 1 byte past the maximum point
4453 in C<s1> beyond which scanning will not continue under any circumstances.
4454 (This routine assumes that UTF-8 encoded input strings are not malformed;
4455 malformed input can cause it to read past C<pe1>).
4456 This means that if both C<l1> and C<pe1> are specified, and C<pe1>
4457 is less than C<s1>+C<l1>, the match will never be successful because it can never
4459 get as far as its goal (and in fact is asserted against). Correspondingly for
4460 C<pe2> with respect to C<s2>.
4462 At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
4463 C<l2> must be non-zero), and if both do, both have to be
4464 reached for a successful match. Also, if the fold of a character is multiple
4465 characters, all of them must be matched (see tr21 reference below for 'folding').
4468 Upon a successful match, if C<pe1> is non-NULL,
4469 it will be set to point to the beginning of the I<next> character of C<s1>
4470 beyond what was matched. Correspondingly for C<pe2> and C<s2>.
4472 For case-insensitiveness, the "casefolding" of Unicode is used
4473 instead of upper/lowercasing both the characters, see
4474 L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
4478 /* A flags parameter has been added which may change, and hence isn't
4479 * externally documented. Currently it is:
4480 * 0 for as-documented above
4481 * FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
4482 ASCII one, to not match
4483 * FOLDEQ_UTF8_LOCALE meaning that locale rules are to be used for code
4484 * points below 256; unicode rules for above 255; and
4485 * folds that cross those boundaries are disallowed,
4486 * like the NOMIX_ASCII option
4487 * FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
4488 * routine. This allows that step to be skipped.
4489 * FOLDEQ_S2_ALREADY_FOLDED Similarly.
/* Purpose: compare the leading parts of 's1' and 's2' case-insensitively,
 * one character at a time, by comparing the case folds of the characters.
 *
 * Parameters:
 *   s1, s2   - the strings to compare
 *   pe1, pe2 - optional pointers to end-of-scan limits for s1 and s2
 *   l1, l2   - byte lengths; when used they set the goals g1 = s1 + l1 and
 *              g2 = s2 + l2 that must be reached for a match
 *   u1, u2   - true when the corresponding string is UTF-8 encoded
 *   flags    - FOLDEQ_UTF8_NOMIX_ASCII, FOLDEQ_UTF8_LOCALE,
 *              FOLDEQ_S1_ALREADY_FOLDED, FOLDEQ_S2_ALREADY_FOLDED; the
 *              assert below forbids combining either of the first two with
 *              either of the last two
 * Working state: p1/p2 point at the current input character, f1/f2 at the
 * current position inside its fold, n1/n2 hold the bytes of the current
 * fold (0 meaning a new character must be folded), and foldbuf1/foldbuf2
 * receive the folds.
 * Returns 0 as soon as two fold characters differ.  After the loop, the
 * test on g1/g2/n1/n2 decides whether both goals were reached without a
 * partially matched multi-character fold.
 *
 * NOTE(review): many short lines (assignments to f1/f2/n1/n2, several return
 * statements, closing braces and the setting of *pe1 / *pe2) are not present
 * in this listing; confirm control flow against the complete source. */
4492 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags)
4495 const U8 *p1 = (const U8*)s1; /* Point to current char */
4496 const U8 *p2 = (const U8*)s2;
4497 const U8 *g1 = NULL; /* goal for s1 */
4498 const U8 *g2 = NULL;
4499 const U8 *e1 = NULL; /* Don't scan s1 past this */
4500 U8 *f1 = NULL; /* Point to current folded */
4501 const U8 *e2 = NULL;
4503 STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
4504 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
4505 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
4507 PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
4509 assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_UTF8_LOCALE))
4510 && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED))));
4511 /* The algorithm is to trial the folds without regard to the flags on
4512 * the first line of the above assert(), and then see if the result
4513 * violates them. This means that the inputs can't be pre-folded to a
4514 * violating result, hence the assert. This could be changed, with the
4515 * addition of extra tests here for the already-folded case, which would
4516 * slow it down. That cost is more than any possible gain for when these
4517 * flags are specified, as the flags indicate /il or /iaa matching which
4518 * is less common than /iu, and I (khw) also believe that real-world /il
4519 * and /iaa matches are most likely to involve code points 0-255, and this
4520 * function only under rare conditions gets called for 0-255. */
4527 g1 = (const U8*)s1 + l1;
4535 g2 = (const U8*)s2 + l2;
4538 /* Must have at least one goal */
4543 /* Will never match if goal is out-of-bounds */
4544 assert(! e1 || e1 >= g1);
4546 /* Here, there isn't an end pointer, or it is beyond the goal. We
4547 * only go as far as the goal */
4551 assert(e1); /* Must have an end for looking at s1 */
4554 /* Same for goal for s2 */
4556 assert(! e2 || e2 >= g2);
4563 /* If both operands are already folded, we could just do a memEQ on the
4564 * whole strings at once, but it would be better if the caller realized
4565 * this and didn't even call us */
4567 /* Look through both strings, a character at a time */
4568 while (p1 < e1 && p2 < e2) {
4570 /* If at the beginning of a new character in s1, get its fold to use
4571 * and the length of the fold. (exception: locale rules just get the
4572 * character to a single byte) */
4574 if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
4579 /* If in locale matching, we use two sets of rules, depending
4580 * on if the code point is above or below 255. Here, we test
4581 * for and handle locale rules */
4582 if ((flags & FOLDEQ_UTF8_LOCALE)
4583 && (! u1 || ! UTF8_IS_ABOVE_LATIN1(*p1)))
4585 /* There is no mixing of code points above and below 255. */
4586 if (u2 && UTF8_IS_ABOVE_LATIN1(*p2)) {
4590 /* We handle locale rules by converting, if necessary, the
4591 * code point to a single byte. */
4592 if (! u1 || UTF8_IS_INVARIANT(*p1)) {
4596 *foldbuf1 = TWO_BYTE_UTF8_TO_NATIVE(*p1, *(p1 + 1));
4600 else if (isASCII(*p1)) { /* Note, that here won't be both
4601 ASCII and using locale rules */
4603 /* If trying to mix non- with ASCII, and not supposed to,
4605 if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
4609 *foldbuf1 = toFOLD(*p1);
4612 to_utf8_fold(p1, foldbuf1, &n1);
4614 else { /* Not utf8, get utf8 fold */
4615 to_uni_fold(*p1, foldbuf1, &n1);
4621 if (n2 == 0) { /* Same for s2 */
4622 if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
4627 if ((flags & FOLDEQ_UTF8_LOCALE)
4628 && (! u2 || ! UTF8_IS_ABOVE_LATIN1(*p2)))
4630 /* Here, the next char in s2 is < 256. We've already
4631 * worked on s1, and if it isn't also < 256, can't match */
4632 if (u1 && UTF8_IS_ABOVE_LATIN1(*p1)) {
4635 if (! u2 || UTF8_IS_INVARIANT(*p2)) {
4639 *foldbuf2 = TWO_BYTE_UTF8_TO_NATIVE(*p2, *(p2 + 1));
4642 /* Use another function to handle locale rules. We've made
4643 * sure that both characters to compare are single bytes */
4644 if (! foldEQ_locale((char *) f1, (char *) foldbuf2, 1)) {
4649 else if (isASCII(*p2)) {
4650 if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
4654 *foldbuf2 = toFOLD(*p2);
4657 to_utf8_fold(p2, foldbuf2, &n2);
4660 to_uni_fold(*p2, foldbuf2, &n2);
4666 /* Here f1 and f2 point to the beginning of the strings to compare.
4667 * These strings are the folds of the next character from each input
4668 * string, stored in utf8. */
4670 /* While there is more to look for in both folds, see if they
4671 * continue to match */
4673 U8 fold_length = UTF8SKIP(f1);
4674 if (fold_length != UTF8SKIP(f2)
4675 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
4676 function call for single
4678 || memNE((char*)f1, (char*)f2, fold_length))
4680 return 0; /* mismatch */
4683 /* Here, they matched, advance past them */
4690 /* When reach the end of any fold, advance the input past it */
4692 p1 += u1 ? UTF8SKIP(p1) : 1;
4695 p2 += u2 ? UTF8SKIP(p2) : 1;
4697 } /* End of loop through both strings */
4699 /* A match is defined by each scan that specified an explicit length
4700 * reaching its final goal, and the other not having matched a partial
4701 * character (which can happen when the fold of a character is more than one
4703 if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
4707 /* Successful match. Set output pointers */
4719 * c-indentation-style: bsd
4721 * indent-tabs-mode: nil
4724 * ex: set ts=8 sts=4 sw=4 et: