X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/55d09dc854b450c4051bea8318009a46a7c083f7..7ff1117359e03ce00638e9ee1daad537321e75d6:/utf8.c diff --git a/utf8.c b/utf8.c index b445a2e..de86314 100644 --- a/utf8.c +++ b/utf8.c @@ -33,14 +33,6 @@ #include "perl.h" #include "inline_invlist.c" -#ifndef EBCDIC -/* Separate prototypes needed because in ASCII systems these are - * usually macros but they still are compiled as code, too. */ -PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags); -PERL_CALLCONV UV Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen); -PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv); -#endif - static const char unees[] = "Malformed UTF-8 character (unexpected end of string)"; @@ -88,61 +80,30 @@ Perl_is_ascii_string(const U8 *s, STRLEN len) } /* -=for apidoc uvuni_to_utf8_flags - -Adds the UTF-8 representation of the Unicode code point C to the end -of the string C; C should have at least C free -bytes available. The return value is the pointer to the byte after the -end of the new character. In other words, - - d = uvuni_to_utf8_flags(d, uv, flags); - -or, in most cases, +=for apidoc uvoffuni_to_utf8_flags - d = uvuni_to_utf8(d, uv); +THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. +Instead, B or +L>. -(which is equivalent to) - - d = uvuni_to_utf8_flags(d, uv, 0); - -This is the recommended Unicode-aware way of saying - - *(d++) = uv; - -where uv is a code point expressed in Latin-1 or above, not the platform's -native character set. B -or L>. - -This function will convert to UTF-8 (and not warn) even code points that aren't -legal Unicode or are problematic, unless C contains one or more of the -following flags: - -If C is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set, -the function will raise a warning, provided UTF8 warnings are enabled. If instead -UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL. -If both flags are set, the function will both warn and return NULL. - -The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly -affect how the function handles a Unicode non-character. And likewise, the -UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, affect the handling of -code points that are -above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are -even less portable) can be warned and/or disallowed even if other above-Unicode -code points are accepted by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF -flags. - -And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the -above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four -DISALLOW flags. +This function is like them, but the input is a strict Unicode +(as opposed to native) code point. Only in very rare circumstances should code +not be using the native code point. +For details, see the description for L>. =cut */ U8 * -Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) +Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) { - PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS; + PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS; + + if (UNI_IS_INVARIANT(uv)) { + *d++ = (U8) LATIN1_TO_NATIVE(uv); + return d; + } /* The first problematic code point is the first surrogate */ if (uv >= UNICODE_SURROGATE_FIRST @@ -181,13 +142,10 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) } } } - if (UNI_IS_INVARIANT(uv)) { - *d++ = (U8) I8_TO_NATIVE_UTF8(uv); - return d; - } + #if defined(EBCDIC) - else { - STRLEN len = UNISKIP(uv); + { + STRLEN len = OFFUNISKIP(uv); U8 *p = d+len-1; while (p > d) { *p-- = (U8) I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK); @@ -232,7 +190,7 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) *d++ = (U8)(( uv & 0x3f) | 0x80); return d; } -#ifdef HAS_QUAD +#ifdef UTF8_QUAD_MAX if (uv < UTF8_QUAD_MAX) #endif { @@ -245,7 +203,7 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) *d++ = (U8)(( uv & 0x3f) | 0x80); return d; } -#ifdef HAS_QUAD +#ifdef UTF8_QUAD_MAX { *d++ = 0xff; /* Can't match U+FFFE! */ *d++ = 0x80; /* 6 Reserved bits */ @@ -265,12 +223,92 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) #endif #endif /* Non loop style */ } +/* +=for apidoc uvchr_to_utf8 + +Adds the UTF-8 representation of the native code point C to the end +of the string C; C should have at least C free +bytes available. The return value is the pointer to the byte after the +end of the new character. In other words, + + d = uvchr_to_utf8(d, uv); + +is the recommended wide native character-aware way of saying + + *(d++) = uv; + +This function accepts any UV as input. To forbid or warn on non-Unicode code +points, or those that may be problematic, see L. + +=cut +*/ + +/* This is also a macro */ +PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv); + +U8 * +Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) +{ + return uvchr_to_utf8(d, uv); +} + +/* +=for apidoc uvchr_to_utf8_flags + +Adds the UTF-8 representation of the native code point C to the end +of the string C; C should have at least C free +bytes available. The return value is the pointer to the byte after the +end of the new character. In other words, + + d = uvchr_to_utf8_flags(d, uv, flags); + +or, in most cases, + + d = uvchr_to_utf8_flags(d, uv, 0); + +This is the Unicode-aware way of saying + + *(d++) = uv; + +This function will convert to UTF-8 (and not warn) even code points that aren't +legal Unicode or are problematic, unless C contains one or more of the +following flags: + +If C is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set, +the function will raise a warning, provided UTF8 warnings are enabled. If instead +UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL. +If both flags are set, the function will both warn and return NULL. + +The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags +affect how the function handles a Unicode non-character. And likewise, the +UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags affect the handling of +code points that are +above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are +even less portable) can be warned and/or disallowed even if other above-Unicode +code points are accepted, by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF +flags. + +And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the +above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four +DISALLOW flags. + +=cut +*/ + +/* This is also a macro */ +PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags); + +U8 * +Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) +{ + return uvchr_to_utf8_flags(d, uv, flags); +} /* Tests if the first C bytes of string C form a valid UTF-8 -character. Note that an INVARIANT (i.e. ASCII) character is a valid -UTF-8 character. The number of bytes in the UTF-8 character +character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a +valid UTF-8 character. The number of bytes in the UTF-8 character will be returned if it is valid, otherwise 0. This is the "slow" version as opposed to the "fast" version which is @@ -328,10 +366,8 @@ Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end) len = UTF8SKIP(buf); } -#ifdef IS_UTF8_CHAR if (IS_UTF8_CHAR_FAST(len)) return IS_UTF8_CHAR(buf, len) ? len : 0; -#endif /* #ifdef IS_UTF8_CHAR */ return is_utf8_char_slow(buf, len); } @@ -478,10 +514,13 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) /* -=for apidoc utf8n_to_uvuni +=for apidoc utf8n_to_uvchr + +THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. +Most code should use L() rather than call this directly. Bottom level UTF-8 decode routine. -Returns the code point value of the first character in the string C, +Returns the native code point value of the first character in the string C, which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than C bytes; C<*retlen> (if C isn't NULL) will be set to the length, in bytes, of that character. @@ -550,13 +589,11 @@ All other code points corresponding to Unicode characters, including private use and those yet to be assigned, are never considered malformed and never warn. -Most code should use L() rather than call this directly. - =cut */ UV -Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) +Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) { dVAR; const U8 * const s0 = s; @@ -574,7 +611,7 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) const char* const malformed_text = "Malformed UTF-8 character"; - PERL_ARGS_ASSERT_UTF8N_TO_UVUNI; + PERL_ARGS_ASSERT_UTF8N_TO_UVCHR; /* The order of malformation tests here is important. We should consume as * few bytes as possible in order to not skip any valid character. This is @@ -591,7 +628,7 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) * We also should not consume too few bytes, otherwise someone could inject * things. For example, an input could be deliberately designed to * overflow, and if this code bailed out immediately upon discovering that, - * returning to the caller *retlen pointing to the very next byte (one + * returning to the caller C<*retlen> pointing to the very next byte (one * which is actually part of of the overflowing sequence), that could look * legitimate to the caller, which could discard the initial partial * sequence and process the rest, inappropriately */ @@ -623,7 +660,7 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) /* An invariant is trivially well-formed */ if (UTF8_IS_INVARIANT(uv)) { - return (UV) (NATIVE_UTF8_TO_I8(*s)); + return uv; } /* A continuation character can't start a valid sequence */ @@ -642,13 +679,13 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) goto malformed; } + /* Here is not a continuation byte, nor an invariant. The only thing left + * is a start byte (possibly for an overlong) */ + #ifdef EBCDIC uv = NATIVE_UTF8_TO_I8(uv); #endif - /* Here is not a continuation byte, nor an invariant. The only thing left - * is a start byte (possibly for an overlong) */ - /* Remove the leading bits that indicate the number of bytes in the * character's whole UTF-8 sequence, leaving just the bits that are part of * the value */ @@ -772,7 +809,7 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) #endif if (do_overlong_test - && expectlen > (STRLEN)UNISKIP(uv) + && expectlen > (STRLEN) OFFUNISKIP(uv) && ! (flags & UTF8_ALLOW_LONG)) { /* The overlong malformation has lower precedence than the others. @@ -780,7 +817,7 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) * value, instead of the replacement character. This is because this * value is actually well-defined. */ if (! (flags & UTF8_CHECK_ONLY)) { - sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), *s0)); + sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0)); } goto malformed; } @@ -826,7 +863,9 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) } if (sv) { - outlier_ret = uv; + outlier_ret = uv; /* Note we don't bother to convert to native, + as all the outlier code points are the same + in both ASCII and EBCDIC */ goto do_warn; } @@ -834,7 +873,7 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) * to return it */ } - return uv; + return UNI_TO_NATIVE(uv); /* There are three cases which get to beyond this point. In all 3 cases: * if not null points to a string to print as a warning. @@ -908,7 +947,7 @@ NULL) to -1. If those warnings are off, the computed value, if well-defined (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and C<*retlen> is set (if C isn't NULL) so that (S + C<*retlen>>) is the next possible position in C that could begin a non-malformed character. -See L for details on when the REPLACEMENT CHARACTER is +See L for details on when the REPLACEMENT CHARACTER is returned. =cut @@ -918,8 +957,6 @@ returned. UV Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) { - PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF; - assert(s < send); return utf8n_to_uvchr(s, send - s, retlen, @@ -928,17 +965,43 @@ Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) /* Like L(), but should only be called when it is known that * there are no malformations in the input UTF-8 string C. surrogates, - * non-character code points, and non-Unicode code points are allowed. A macro - * in utf8.h is used to normally avoid this function wrapper */ + * non-character code points, and non-Unicode code points are allowed. */ UV Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) { - const UV uv = valid_utf8_to_uvuni(s, retlen); + UV expectlen = UTF8SKIP(s); + const U8* send = s + expectlen; + UV uv = *s; PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR; + if (retlen) { + *retlen = expectlen; + } + + /* An invariant is trivially returned */ + if (expectlen == 1) { + return uv; + } + +#ifdef EBCDIC + uv = NATIVE_UTF8_TO_I8(uv); +#endif + + /* Remove the leading bits that indicate the number of bytes, leaving just + * the bits that are part of the value */ + uv &= UTF_START_MASK(expectlen); + + /* Now, loop through the remaining bytes, accumulating each into the + * working total as we go. (I khw tried unrolling the loop for up to 4 + * bytes, but there was no performance improvement) */ + for (++s; s < send; s++) { + uv = UTF8_ACCUMULATE(uv, *s); + } + return UNI_TO_NATIVE(uv); + } /* @@ -958,7 +1021,7 @@ NULL) to -1. If those warnings are off, the computed value if well-defined (or the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> is set (if C isn't NULL) so that (S + C<*retlen>>) is the next possible position in C that could begin a non-malformed character. -See L for details on when the REPLACEMENT CHARACTER is returned. +See L for details on when the REPLACEMENT CHARACTER is returned. =cut */ @@ -974,20 +1037,22 @@ Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) /* =for apidoc utf8_to_uvuni_buf -Returns the Unicode code point of the first character in the string C which +Only in very rare circumstances should code need to be dealing in Unicode +(as opposed to native) code points. In those few cases, use +C> instead. + +Returns the Unicode (not-native) code point of the first character in the +string C which is assumed to be in UTF-8 encoding; C points to 1 beyond the end of C. C will be set to the length, in bytes, of that character. -This function should only be used when the returned UV is considered -an index into the Unicode semantic tables (e.g. swashes). - If C does not point to a well-formed UTF-8 character and UTF8 warnings are enabled, zero is returned and C<*retlen> is set (if C isn't NULL) to -1. If those warnings are off, the computed value if well-defined (or the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> is set (if C isn't NULL) so that (S + C<*retlen>>) is the next possible position in C that could begin a non-malformed character. -See L for details on when the REPLACEMENT CHARACTER is returned. +See L for details on when the REPLACEMENT CHARACTER is returned. =cut */ @@ -1000,44 +1065,21 @@ Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) assert(send > s); /* Call the low level routine asking for checks */ - return Perl_utf8n_to_uvuni(aTHX_ s, send -s, retlen, - ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); + return NATIVE_TO_UNI(Perl_utf8n_to_uvchr(aTHX_ s, send -s, retlen, + ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)); } -/* Like L(), but should only be called when it is known that +/* DEPRECATED! + * Like L(), but should only be called when it is known that * there are no malformations in the input UTF-8 string C. Surrogates, * non-character code points, and non-Unicode code points are allowed */ UV Perl_valid_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen) { - UV expectlen = UTF8SKIP(s); - const U8* send = s + expectlen; - UV uv = NATIVE_UTF8_TO_I8(*s); - PERL_ARGS_ASSERT_VALID_UTF8_TO_UVUNI; - if (retlen) { - *retlen = expectlen; - } - - /* An invariant is trivially returned */ - if (expectlen == 1) { - return uv; - } - - /* Remove the leading bits that indicate the number of bytes, leaving just - * the bits that are part of the value */ - uv &= UTF_START_MASK(expectlen); - - /* Now, loop through the remaining bytes, accumulating each into the - * working total as we go. (I khw tried unrolling the loop for up to 4 - * bytes, but there was no performance improvement) */ - for (++s; s < send; s++) { - uv = UTF8_ACCUMULATE(uv, *s); - } - - return uv; + return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen)); } /* @@ -1047,12 +1089,11 @@ Returns the Unicode code point of the first character in the string C which is assumed to be in UTF-8 encoding; C will be set to the length, in bytes, of that character. -This function should only be used when the returned UV is considered -an index into the Unicode semantic tables (e.g. swashes). - Some, but not all, UTF-8 malformations are detected, and in fact, some malformed input could cause reading beyond the end of the input buffer, which -is why this function is deprecated. Use L instead. +is one reason why this function is deprecated. The other is that only in +extremely limited circumstances should the Unicode versus native code point be +of any interest to you. See L for alternatives. If C points to one of the detected malformations, and UTF8 warnings are enabled, zero is returned and C<*retlen> is set (if C doesn't point to @@ -1060,7 +1101,7 @@ NULL) to -1. If those warnings are off, the computed value if well-defined (or the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> is set (if C isn't NULL) so that (S + C<*retlen>>) is the next possible position in C that could begin a non-malformed character. -See L for details on when the REPLACEMENT CHARACTER is returned. +See L for details on when the REPLACEMENT CHARACTER is returned. =cut */ @@ -1070,7 +1111,7 @@ Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen) { PERL_ARGS_ASSERT_UTF8_TO_UVUNI; - return valid_utf8_to_uvuni(s, retlen); + return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen)); } /* @@ -1204,7 +1245,7 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen) if (u < uend) { U8 c1 = *u++; if (UTF8_IS_CONTINUATION(c1)) { - c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, c1)); + c = TWO_BYTE_UTF8_TO_NATIVE(c, c1); } else { Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "Malformed UTF-8 character " @@ -1264,21 +1305,25 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len) /* ensure valid UTF-8 and chars < 256 before updating string */ while (s < send) { - U8 c = *s++; - - if (!UTF8_IS_INVARIANT(c) && - (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send) - || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) { - *len = ((STRLEN) -1); - return 0; + if (! UTF8_IS_INVARIANT(*s)) { + if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { + *len = ((STRLEN) -1); + return 0; + } + s++; } + s++; } d = s = save; while (s < send) { - STRLEN ulen; - *d++ = (U8)utf8_to_uvchr_buf(s, send, &ulen); - s += ulen; + U8 c = *s++; + if (! UTF8_IS_INVARIANT(c)) { + /* Then it is two-byte encoded */ + c = TWO_BYTE_UTF8_TO_NATIVE(c, *s); + s++; + } + *d++ = c; } *d = '\0'; *len = d - save; @@ -1315,14 +1360,14 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) /* ensure valid UTF-8 and chars < 256 before converting string */ for (send = s + *len; s < send;) { - U8 c = *s++; - if (!UTF8_IS_INVARIANT(c)) { - if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send && - (c = *s++) && UTF8_IS_CONTINUATION(c)) - count++; - else + if (! UTF8_IS_INVARIANT(*s)) { + if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { return (U8 *)start; + } + count++; + s++; } + s++; } *is_utf8 = FALSE; @@ -1331,9 +1376,10 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) s = start; start = d; while (s < send) { U8 c = *s++; - if (!UTF8_IS_INVARIANT(c)) { + if (! UTF8_IS_INVARIANT(c)) { /* Then it is two-byte encoded */ - c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, *s++)); + c = TWO_BYTE_UTF8_TO_NATIVE(c, *s); + s++; } *d++ = c; } @@ -1406,17 +1452,13 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) while (p < pend) { UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */ p += 2; - if (uv < 0x80) { -#ifdef EBCDIC - *d++ = LATIN1_TO_NATIVE(uv); -#else - *d++ = (U8)uv; -#endif + if (UNI_IS_INVARIANT(uv)) { + *d++ = LATIN1_TO_NATIVE((U8) uv); continue; } - if (uv < 0x800) { - *d++ = (U8)(( uv >> 6) | 0xc0); - *d++ = (U8)(( uv & 0x3f) | 0x80); + if (uv <= MAX_UTF8_TWO_BYTE) { + *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv)); + *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv)); continue; } #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST @@ -1437,6 +1479,9 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) } else if (uv >= FIRST_LOW_SURROGATE && uv <= LAST_LOW_SURROGATE) { Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); } +#ifdef EBCDIC + d = uvoffuni_to_utf8_flags(d, uv, 0); +#else if (uv < 0x10000) { *d++ = (U8)(( uv >> 12) | 0xe0); *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); @@ -1450,6 +1495,7 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) *d++ = (U8)(( uv & 0x3f) | 0x80); continue; } +#endif } *newlen = d - dstart; return d; @@ -1645,8 +1691,8 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_ assert(S_or_s == 'S' || S_or_s == 's'); - if (UNI_IS_INVARIANT(converted)) { /* No difference between the two for - characters in this range */ + if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for + characters in this range */ *p = (U8) converted; *lenp = 1; return converted; @@ -1689,14 +1735,14 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_ * LENP will be set to the length in bytes of the string of changed characters * * The functions return the ordinal of the first character in the string of OUTP */ -#define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "utf8::ToSpecUc") -#define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "utf8::ToSpecTc") -#define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "utf8::ToSpecLc") +#define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "") +#define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "") +#define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "") /* This additionally has the input parameter SPECIALS, which if non-zero will * cause this to use the SPECIALS hash for folding (meaning get full case * folding); otherwise, when zero, this implies a simple case fold */ -#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "utf8::ToSpecCf" : NULL) +#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "" : NULL) UV Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) @@ -1746,7 +1792,7 @@ S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp) U8 converted = toLOWER_LATIN1(c); if (p != NULL) { - if (UNI_IS_INVARIANT(converted)) { + if (NATIVE_BYTE_IS_INVARIANT(converted)) { *p = converted; *lenp = 1; } @@ -1816,7 +1862,7 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f converted = toLOWER_LATIN1(c); } - if (UNI_IS_INVARIANT(converted)) { + if (UVCHR_IS_INVARIANT(converted)) { *p = (U8) converted; *lenp = 1; } @@ -1869,7 +1915,7 @@ bool Perl_is_uni_alnum_lc(pTHX_ UV c) { if (c < 256) { - return isALNUM_LC(UNI_TO_NATIVE(c)); + return isALNUM_LC(c); } return _is_uni_FOO(_CC_WORDCHAR, c); } @@ -1878,7 +1924,7 @@ bool Perl_is_uni_alnumc_lc(pTHX_ UV c) { if (c < 256) { - return isALPHANUMERIC_LC(UNI_TO_NATIVE(c)); + return isALPHANUMERIC_LC(c); } return _is_uni_FOO(_CC_ALPHANUMERIC, c); } @@ -1887,7 +1933,7 @@ bool Perl_is_uni_idfirst_lc(pTHX_ UV c) { if (c < 256) { - return isIDFIRST_LC(UNI_TO_NATIVE(c)); + return isIDFIRST_LC(c); } return _is_uni_perl_idstart(c); } @@ -1896,7 +1942,7 @@ bool Perl_is_uni_alpha_lc(pTHX_ UV c) { if (c < 256) { - return isALPHA_LC(UNI_TO_NATIVE(c)); + return isALPHA_LC(c); } return _is_uni_FOO(_CC_ALPHA, c); } @@ -1905,7 +1951,7 @@ bool Perl_is_uni_ascii_lc(pTHX_ UV c) { if (c < 256) { - return isASCII_LC(UNI_TO_NATIVE(c)); + return isASCII_LC(c); } return 0; } @@ -1914,7 +1960,7 @@ bool Perl_is_uni_blank_lc(pTHX_ UV c) { if (c < 256) { - return isBLANK_LC(UNI_TO_NATIVE(c)); + return isBLANK_LC(c); } return isBLANK_uni(c); } @@ -1923,7 +1969,7 @@ bool Perl_is_uni_space_lc(pTHX_ UV c) { if (c < 256) { - return isSPACE_LC(UNI_TO_NATIVE(c)); + return isSPACE_LC(c); } return isSPACE_uni(c); } @@ -1932,7 +1978,7 @@ bool Perl_is_uni_digit_lc(pTHX_ UV c) { if (c < 256) { - return isDIGIT_LC(UNI_TO_NATIVE(c)); + return isDIGIT_LC(c); } return _is_uni_FOO(_CC_DIGIT, c); } @@ -1941,7 +1987,7 @@ bool Perl_is_uni_upper_lc(pTHX_ UV c) { if (c < 256) { - return isUPPER_LC(UNI_TO_NATIVE(c)); + return isUPPER_LC(c); } return _is_uni_FOO(_CC_UPPER, c); } @@ -1950,7 +1996,7 @@ bool Perl_is_uni_lower_lc(pTHX_ UV c) { if (c < 256) { - return isLOWER_LC(UNI_TO_NATIVE(c)); + return isLOWER_LC(c); } return _is_uni_FOO(_CC_LOWER, c); } @@ -1959,7 +2005,7 @@ bool Perl_is_uni_cntrl_lc(pTHX_ UV c) { if (c < 256) { - return isCNTRL_LC(UNI_TO_NATIVE(c)); + return isCNTRL_LC(c); } return 0; } @@ -1968,7 +2014,7 @@ bool Perl_is_uni_graph_lc(pTHX_ UV c) { if (c < 256) { - return isGRAPH_LC(UNI_TO_NATIVE(c)); + return isGRAPH_LC(c); } return _is_uni_FOO(_CC_GRAPH, c); } @@ -1977,7 +2023,7 @@ bool Perl_is_uni_print_lc(pTHX_ UV c) { if (c < 256) { - return isPRINT_LC(UNI_TO_NATIVE(c)); + return isPRINT_LC(c); } return _is_uni_FOO(_CC_PRINT, c); } @@ -1986,7 +2032,7 @@ bool Perl_is_uni_punct_lc(pTHX_ UV c) { if (c < 256) { - return isPUNCT_LC(UNI_TO_NATIVE(c)); + return isPUNCT_LC(c); } return _is_uni_FOO(_CC_PUNCT, c); } @@ -1995,7 +2041,7 @@ bool Perl_is_uni_xdigit_lc(pTHX_ UV c) { if (c < 256) { - return isXDIGIT_LC(UNI_TO_NATIVE(c)); + return isXDIGIT_LC(c); } return isXDIGIT_uni(c); } @@ -2354,25 +2400,26 @@ Perl_is_utf8_mark(pTHX_ const U8 *p) /* =for apidoc to_utf8_case -The C

contains the pointer to the UTF-8 string encoding +C

contains the pointer to the UTF-8 string encoding the character that is being converted. This routine assumes that the character at C

is well-formed. -The C is a pointer to the character buffer to put the -conversion result to. The C is a pointer to the length +C is a pointer to the character buffer to put the +conversion result to. C is a pointer to the length of the result. -The C is a pointer to the swash to use. +C is a pointer to the swash to use. Both the special and normal mappings are stored in F, -and loaded by SWASHNEW, using F. The C (usually, +and loaded by SWASHNEW, using F. C (usually, but not always, a multicharacter mapping), is tried first. -The C is a string like "utf8::ToSpecLower", which means the -hash %utf8::ToSpecLower. The access to the hash is through -Perl_to_utf8_case(). +C is a string, normally C or C<"">. C means to not use +any special mappings; C<""> means to use the special mappings. Values other +than these two are treated as the name of the hash containing the special +mappings, like C<"utf8::ToSpecLower">. -The C is a string like "ToLower" which means the swash +C is a string like "ToLower" which means the swash %utf8::ToLower. =cut */ @@ -2382,13 +2429,8 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, const char *normal, const char *special) { dVAR; - U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; STRLEN len = 0; - const UV uv0 = valid_utf8_to_uvchr(p, NULL); - /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings - * are necessary in EBCDIC, they are redundant no-ops - * in ASCII-ish platforms, and hopefully optimized away. */ - const UV uv1 = NATIVE_TO_UNI(uv0); + const UV uv1 = valid_utf8_to_uvchr(p, NULL); PERL_ARGS_ASSERT_TO_UTF8_CASE; @@ -2414,68 +2456,49 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, * be given */ } - uvuni_to_utf8(tmpbuf, uv1); - if (!*swashp) /* load on-demand */ *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL); if (special) { /* It might be "special" (sometimes, but not always, * a multicharacter mapping) */ - HV * const hv = get_hv(special, 0); + HV *hv = NULL; SV **svp; - if (hv && - (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) && - (*svp)) { + /* If passed in the specials name, use that; otherwise use any + * given in the swash */ + if (*special != '\0') { + hv = get_hv(special, 0); + } + else { + svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0); + if (svp) { + hv = MUTABLE_HV(SvRV(*svp)); + } + } + + if (hv + && (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE)) + && (*svp)) + { const char *s; s = SvPV_const(*svp, len); if (len == 1) - len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp; + /* EIGHTBIT */ + len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp; else { -#ifdef EBCDIC - /* If we have EBCDIC we need to remap the characters - * since any characters in the low 256 are Unicode - * code points, not EBCDIC. */ - U8 *t = (U8*)s, *tend = t + len, *d; - - d = tmpbuf; - if (SvUTF8(*svp)) { - STRLEN tlen = 0; - - while (t < tend) { - const UV c = utf8_to_uvchr_buf(t, tend, &tlen); - if (tlen > 0) { - d = uvchr_to_utf8(d, UNI_TO_NATIVE(c)); - t += tlen; - } - else - break; - } - } - else { - while (t < tend) { - d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t)); - t++; - } - } - len = d - tmpbuf; - Copy(tmpbuf, ustrp, len, U8); -#else Copy(s, ustrp, len, U8); -#endif } } } if (!len && *swashp) { - const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE /* => is utf8 */); + const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is utf8 */); if (uv2) { /* It was "normal" (a single character mapping). */ - const UV uv3 = UNI_TO_NATIVE(uv2); - len = uvchr_to_utf8(ustrp, uv3) - ustrp; + len = uvchr_to_utf8(ustrp, uv2) - ustrp; } } @@ -2496,7 +2519,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, if (lenp) *lenp = len; - return uv0; + return uv1; } @@ -2578,10 +2601,11 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool } else if UTF8_IS_DOWNGRADEABLE_START(*p) { if (flags) { - result = toUPPER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1))); + U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)); + result = toUPPER_LC(c); } else { - return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)), + return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), ustrp, lenp, 'S'); } } @@ -2600,8 +2624,8 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool *lenp = 1; } else { - *ustrp = UTF8_EIGHT_BIT_HI(result); - *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result); + *ustrp = UTF8_EIGHT_BIT_HI((U8) result); + *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); *lenp = 2; } @@ -2644,10 +2668,11 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool } else if UTF8_IS_DOWNGRADEABLE_START(*p) { if (flags) { - result = toUPPER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1))); + U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)); + result = toUPPER_LC(c); } else { - return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)), + return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), ustrp, lenp, 's'); } } @@ -2666,8 +2691,8 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool *lenp = 1; } else { - *ustrp = UTF8_EIGHT_BIT_HI(result); - *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result); + *ustrp = UTF8_EIGHT_BIT_HI((U8) result); + *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); *lenp = 2; } @@ -2708,10 +2733,11 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool } else if UTF8_IS_DOWNGRADEABLE_START(*p) { if (flags) { - result = toLOWER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1))); + U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)); + result = toLOWER_LC(c); } else { - return to_lower_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)), + return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), ustrp, lenp); } } @@ -2731,8 +2757,8 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool *lenp = 1; } else { - *ustrp = UTF8_EIGHT_BIT_HI(result); - *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result); + *ustrp = UTF8_EIGHT_BIT_HI((U8) result); + *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); *lenp = 2; } @@ -2786,10 +2812,11 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b } else if UTF8_IS_DOWNGRADEABLE_START(*p) { if (flags & FOLD_FLAGS_LOCALE) { - result = toFOLD_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1))); + U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)); + result = toFOLD_LC(c); } else { - return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)), + return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), ustrp, lenp, flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); } @@ -2799,7 +2826,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b if (flags & FOLD_FLAGS_LOCALE) { - /* Special case this character, as what normally gets returned + /* Special case these characters, as what normally gets returned * under locale doesn't work */ if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1 && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8, @@ -2807,6 +2834,12 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b { goto return_long_s; } + else if (UTF8SKIP(p) == sizeof(LATIN_SMALL_LIGATURE_LONG_S_T) - 1 + && memEQ((char *) p, LATIN_SMALL_LIGATURE_LONG_S_T_UTF8, + sizeof(LATIN_SMALL_LIGATURE_LONG_S_T_UTF8) - 1)) + { + goto return_ligature_st; + } return check_locale_boundary_crossing(p, result, ustrp, lenp); } else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) { @@ -2814,8 +2847,8 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b } else { /* This is called when changing the case of a utf8-encoded - * character above the Latin1 range, and the result should not - * contain an ASCII character. */ + * character above the ASCII range, and the result should not + * contain an ASCII character. */ UV original; /* To store the first code point of

*/ @@ -2828,11 +2861,16 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b /* Crossed, have to return the original */ original = valid_utf8_to_uvchr(p, lenp); - /* But in this one instance, there is an alternative we can + /* But in these instances, there is an alternative we can * return that is valid */ - if (original == LATIN_CAPITAL_LETTER_SHARP_S) { + if (original == LATIN_CAPITAL_LETTER_SHARP_S + || original == LATIN_SMALL_LETTER_SHARP_S) + { goto return_long_s; } + else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) { + goto return_ligature_st; + } Copy(p, ustrp, *lenp, char); return original; } @@ -2850,8 +2888,8 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b *lenp = 1; } else { - *ustrp = UTF8_EIGHT_BIT_HI(result); - *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result); + *ustrp = UTF8_EIGHT_BIT_HI((U8) result); + *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); *lenp = 2; } @@ -2871,6 +2909,14 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, ustrp, *lenp, U8); return LATIN_SMALL_LETTER_LONG_S; + + return_ligature_st: + /* Two folds to 'st' are prohibited by the options; instead we pick one and + * have the other one fold to it */ + + *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1; + Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8); + return LATIN_SMALL_LIGATURE_ST; } /* Note: @@ -3151,8 +3197,8 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m /* Note: * Returns the value of property/mapping C for the first character * of the string C. If C is true, the string C is - * assumed to be in utf8. If C is false, the string C is - * assumed to be in native 8-bit encoding. Caches the swatch in C. + * assumed to be in well-formed utf8. If C is false, the string C + * is assumed to be in native 8-bit encoding. Caches the swatch in C. * * A "swash" is a hash which contains initially the keys/values set up by * SWASHNEW. The purpose is to be able to completely represent a Unicode @@ -3194,8 +3240,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) const U8 *tmps = NULL; U32 bit; SV *swatch; - U8 tmputf8[2]; - const UV c = NATIVE_TO_ASCII(*ptr); + const U8 c = *ptr; PERL_ARGS_ASSERT_SWASH_FETCH; @@ -3208,28 +3253,58 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) : c); } - /* Convert to utf8 if not already */ - if (!do_utf8 && !UNI_IS_INVARIANT(c)) { - tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c); - tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c); - ptr = tmputf8; + /* We store the values in a "swatch" which is a vec() value in a swash + * hash. Code points 0-255 are a single vec() stored with key length + * (klen) 0. All other code points have a UTF-8 representation + * 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which + * share 0xAA..0xYY, which is the key in the hash to that vec. So the key + * length for them is the length of the encoded char - 1. ptr[klen] is the + * final byte in the sequence representing the character */ + if (!do_utf8 || UTF8_IS_INVARIANT(c)) { + klen = 0; + needents = 256; + off = c; } - /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ - * then the "swatch" is a vec() for all the chars which start - * with 0xAA..0xYY - * So the key in the hash (klen) is length of encoded char -1 - */ - klen = UTF8SKIP(ptr) - 1; - - if (klen == 0) { - /* If char is invariant then swatch is for all the invariant chars - * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK - */ - needents = UTF_CONTINUATION_MARK; - off = NATIVE_UTF8_TO_I8(ptr[klen]); + else if (UTF8_IS_DOWNGRADEABLE_START(c)) { + klen = 0; + needents = 256; + off = TWO_BYTE_UTF8_TO_NATIVE(c, *(ptr + 1)); } else { - /* If char is encoded then swatch is for the prefix */ + klen = UTF8SKIP(ptr) - 1; + + /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into + * the vec is the final byte in the sequence. (In EBCDIC this is + * converted to I8 to get consecutive values.) To help you visualize + * all this: + * Straight 1047 After final byte + * UTF-8 UTF-EBCDIC I8 transform + * U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0 + * U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1 + * ... + * U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9 + * U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA + * ... + * U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2 + * U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3 + * ... + * U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB + * U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC + * ... + * U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF + * U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41 + * + * (There are no discontinuities in the elided (...) entries.) + * The UTF-8 key for these 33 code points is '\xD0' (which also is the + * key for the next 31, up through U+043F, whose UTF-8 final byte is + * \xBF). Thus in UTF-8, each key is for a vec() for 64 code points. + * The final UTF-8 byte, which ranges between \x80 and \xBF, is an + * index into the vec() swatch (after subtracting 0x80, which we + * actually do with an '&'). + * In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32 + * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has + * dicontinuities which go away by transforming it into I8, and we + * effectively subtract 0xA0 to get the index. */ needents = (1 << UTF_ACCUMULATION_SHIFT); off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK; } @@ -3256,17 +3331,18 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) /* If not cached, generate it via swatch_get */ if (!svp || !SvPOK(*svp) - || !(tmps = (const U8*)SvPV_const(*svp, slen))) { - /* We use utf8n_to_uvuni() as we want an index into - Unicode tables, not a native character number. - */ - const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, - ckWARN(WARN_UTF8) ? - 0 : UTF8_ALLOW_ANY); - swatch = swatch_get(swash, - /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */ - (klen) ? (code_point & ~((UV)needents - 1)) : 0, - needents); + || !(tmps = (const U8*)SvPV_const(*svp, slen))) + { + if (klen) { + const UV code_point = valid_utf8_to_uvchr(ptr, NULL); + swatch = swatch_get(swash, + code_point & ~((UV)needents - 1), + needents); + } + else { /* For the first 256 code points, the swatch has a key of + length 0 */ + swatch = swatch_get(swash, 0, needents); + } if (IN_PERL_COMPILETIME) CopHINTS_set(PL_curcop, PL_hints); @@ -3366,30 +3442,19 @@ S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val, *max = *min; /* Non-binary tables have a third entry: what the first element of the - * range maps to */ + * range maps to. The map for those currently read here is in hex */ if (wants_value) { if (isBLANK(*l)) { ++l; - - /* The ToLc, etc table mappings are not in hex, and must be - * corrected by adding the code point to them */ - if (typeto) { - char *after_strtol = (char *) lend; - *val = Strtol((char *)l, &after_strtol, 10); - l = (U8 *) after_strtol; - } - else { /* Other tables are in hex, and are the correct result - without tweaking */ - flags = PERL_SCAN_SILENT_ILLDIGIT - | PERL_SCAN_DISALLOW_PREFIX - | PERL_SCAN_SILENT_NON_PORTABLE; - numlen = lend - l; - *val = grok_hex((char *)l, &numlen, &flags, NULL); - if (numlen) - l += numlen; - else - *val = 0; - } + flags = PERL_SCAN_SILENT_ILLDIGIT + | PERL_SCAN_DISALLOW_PREFIX + | PERL_SCAN_SILENT_NON_PORTABLE; + numlen = lend - l; + *val = grok_hex((char *)l, &numlen, &flags, NULL); + if (numlen) + l += numlen; + else + *val = 0; } else { *val = 0; @@ -3945,7 +4010,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) /* The key is the inverse mapping */ char key[UTF8_MAXBYTES+1]; - char* key_end = (char *) uvuni_to_utf8((U8*) key, val); + char* key_end = (char *) uvchr_to_utf8((U8*) key, val); STRLEN key_len = key_end - key; /* Get the list for the map */ @@ -4213,68 +4278,6 @@ Perl__get_swash_invlist(pTHX_ SV* const swash) return *ptr; } -/* -=for apidoc uvchr_to_utf8 - -Adds the UTF-8 representation of the Native code point C to the end -of the string C; C should have at least C free -bytes available. The return value is the pointer to the byte after the -end of the new character. In other words, - - d = uvchr_to_utf8(d, uv); - -is the recommended wide native character-aware way of saying - - *(d++) = uv; - -=cut -*/ - -/* On ASCII machines this is normally a macro but we want a - real function in case XS code wants it -*/ -U8 * -Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) -{ - PERL_ARGS_ASSERT_UVCHR_TO_UTF8; - - return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0); -} - -U8 * -Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) -{ - PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS; - - return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags); -} - -/* -=for apidoc utf8n_to_uvchr - -Returns the native character value of the first character in the string -C -which is assumed to be in UTF-8 encoding; C will be set to the -length, in bytes, of that character. - -C and C are the same as L(). - -=cut -*/ -/* On ASCII machines this is normally a macro but we want - a real function in case XS code wants it -*/ -UV -Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, -U32 flags) -{ - const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags); - - PERL_ARGS_ASSERT_UTF8N_TO_UVCHR; - - return UNI_TO_NATIVE(uv); -} - bool Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) { @@ -4507,10 +4510,18 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS; - /* The algorithm requires that input with the flags on the first line of - * the assert not be pre-folded. */ assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_UTF8_LOCALE)) - && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED)))); + && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED)))); + /* The algorithm is to trial the folds without regard to the flags on + * the first line of the above assert(), and then see if the result + * violates them. This means that the inputs can't be pre-folded to a + * violating result, hence the assert. This could be changed, with the + * addition of extra tests here for the already-folded case, which would + * slow it down. That cost is more than any possible gain for when these + * flags are specified, as the flags indicate /il or /iaa matching which + * is less common than /iu, and I (khw) also believe that real-world /il + * and /iaa matches are most likely to involve code points 0-255, and this + * function only under rare conditions gets called for 0-255. */ if (pe1) { e1 = *(U8**)pe1; @@ -4586,7 +4597,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c *foldbuf1 = *p1; } else { - *foldbuf1 = TWO_BYTE_UTF8_TO_UNI(*p1, *(p1 + 1)); + *foldbuf1 = TWO_BYTE_UTF8_TO_NATIVE(*p1, *(p1 + 1)); } n1 = 1; } @@ -4605,7 +4616,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c to_utf8_fold(p1, foldbuf1, &n1); } else { /* Not utf8, get utf8 fold */ - to_uni_fold(NATIVE_TO_LATIN1(*p1), foldbuf1, &n1); + to_uni_fold(*p1, foldbuf1, &n1); } f1 = foldbuf1; } @@ -4629,7 +4640,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c *foldbuf2 = *p2; } else { - *foldbuf2 = TWO_BYTE_UTF8_TO_UNI(*p2, *(p2 + 1)); + *foldbuf2 = TWO_BYTE_UTF8_TO_NATIVE(*p2, *(p2 + 1)); } /* Use another function to handle locale rules. We've made @@ -4650,7 +4661,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c to_utf8_fold(p2, foldbuf2, &n2); } else { - to_uni_fold(NATIVE_TO_LATIN1(*p2), foldbuf2, &n2); + to_uni_fold(*p2, foldbuf2, &n2); } f2 = foldbuf2; } @@ -4707,6 +4718,66 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c return 1; } +/* XXX The next four functions should likely be moved to mathoms.c once all + * occurrences of them are removed from the core; some cpan-upstream modules + * still use them */ + +U8 * +Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv) +{ + PERL_ARGS_ASSERT_UVUNI_TO_UTF8; + + return Perl_uvoffuni_to_utf8_flags(aTHX_ d, uv, 0); +} + +UV +Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) +{ + PERL_ARGS_ASSERT_UTF8N_TO_UVUNI; + + return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags)); +} + +/* +=for apidoc uvuni_to_utf8_flags + +Instead you almost certainly want to use L or +L>. + +This function is a deprecated synonym for L, +which itself, while not deprecated, should be used only in isolated +circumstances. These functions were useful for code that wanted to handle +both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl +v5.20, the distinctions between the platforms have mostly been made invisible +to most code, so this function is quite unlikely to be what you want. + +=cut +*/ + +U8 * +Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) +{ + PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS; + + return uvoffuni_to_utf8_flags(d, uv, flags); +} + +/* +=for apidoc utf8n_to_uvuni + +Instead use L, or rarely, L. + +This function was useful for code that wanted to handle both EBCDIC and +ASCII platforms with Unicode properties, but starting in Perl v5.20, the +distinctions between the platforms have mostly been made invisible to most +code, so this function is quite unlikely to be what you want. If you do need +this precise functionality, use instead +C> +or C>. + +=cut +*/ + /* * Local variables: * c-indentation-style: bsd