X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/b08f1bd50a5986cbd3d7473de7ec606dcade39f2..3cedd9d93070bd6f0cb074a3013165cd9a630fca:/utf8.c diff --git a/utf8.c b/utf8.c index 93e73a6..1b684f2 100644 --- a/utf8.c +++ b/utf8.c @@ -32,6 +32,7 @@ #define PERL_IN_UTF8_C #include "perl.h" #include "inline_invlist.c" +#include "charclass_invlists.h" static const char unees[] = "Malformed UTF-8 character (unexpected end of string)"; @@ -40,7 +41,7 @@ static const char unees[] = =head1 Unicode Support This file contains various utility functions for manipulating UTF8-encoded -strings. For the uninitiated, this is a method of representing arbitrary +strings. For the uninitiated, this is a method of representing arbitrary Unicode characters as a variable number of bytes, in such a way that characters in the ASCII range are unmodified, and a zero byte never appears within non-zero characters. @@ -56,7 +57,9 @@ or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That is, if they are invariant. On ASCII-ish machines, only ASCII characters fit this definition, hence the function's name. -If C is 0, it will be calculated using C. +If C is 0, it will be calculated using C, (which means if you +use this option, that C can't have embedded C characters and has to +have a terminating C byte). See also L(), L(), and L(). @@ -107,7 +110,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) /* The first problematic code point is the first surrogate */ if (uv >= UNICODE_SURROGATE_FIRST - && ckWARN4_d(WARN_UTF8, WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR)) + && ckWARN3_d(WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR)) { if (UNICODE_IS_SURROGATE(uv)) { if (flags & UNICODE_WARN_SURROGATE) { @@ -190,7 +193,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) *d++ = (U8)(( uv & 0x3f) | 0x80); return d; } -#ifdef HAS_QUAD +#ifdef UTF8_QUAD_MAX if (uv < UTF8_QUAD_MAX) #endif { @@ -203,7 +206,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) *d++ = (U8)(( uv & 0x3f) | 0x80); return d; } -#ifdef HAS_QUAD +#ifdef UTF8_QUAD_MAX { *d++ = 0xff; /* Can't match U+FFFE! */ *d++ = 0x80; /* 6 Reserved bits */ @@ -227,9 +230,9 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) =for apidoc uvchr_to_utf8 Adds the UTF-8 representation of the native code point C to the end -of the string C; C should have at least C free -bytes available. The return value is the pointer to the byte after the -end of the new character. In other words, +of the string C; C should have at least C (up to +C) free bytes available. The return value is the pointer to +the byte after the end of the new character. In other words, d = uvchr_to_utf8(d, uv); @@ -256,9 +259,9 @@ Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) =for apidoc uvchr_to_utf8_flags Adds the UTF-8 representation of the native code point C to the end -of the string C; C should have at least C free -bytes available. The return value is the pointer to the byte after the -end of the new character. In other words, +of the string C; C should have at least C (up to +C) free bytes available. The return value is the pointer to +the byte after the end of the new character. In other words, d = uvchr_to_utf8_flags(d, uv, flags); @@ -279,9 +282,9 @@ the function will raise a warning, provided UTF8 warnings are enabled. If inste UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL. If both flags are set, the function will both warn and return NULL. -The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly +The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags affect how the function handles a Unicode non-character. And likewise, the -UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, affect the handling of +UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags affect the handling of code points that are above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are even less portable) can be warned and/or disallowed even if other above-Unicode @@ -305,47 +308,9 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) } /* - -Tests if the first C bytes of string C form a valid UTF-8 -character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a -valid UTF-8 character. The number of bytes in the UTF-8 character -will be returned if it is valid, otherwise 0. - -This is the "slow" version as opposed to the "fast" version which is -the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed -difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four -or less you should use the IS_UTF8_CHAR(), for lengths of five or more -you should use the _slow(). In practice this means that the _slow() -will be used very rarely, since the maximum Unicode code point (as of -Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only -the "Perl extended UTF-8" (e.g, the infamous 'v-strings') will encode into -five bytes or more. - -=cut */ -PERL_STATIC_INLINE STRLEN -S_is_utf8_char_slow(const U8 *s, const STRLEN len) -{ - dTHX; /* The function called below requires thread context */ - - STRLEN actual_len; - - PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW; - - utf8n_to_uvchr(s, len, &actual_len, UTF8_CHECK_ONLY); - - return (actual_len == (STRLEN) -1) ? 0 : actual_len; -} - -/* =for apidoc is_utf8_char_buf -Returns the number of bytes that comprise the first UTF-8 encoded character in -buffer C. C should point to one position beyond the end of the -buffer. 0 is returned if C does not point to a complete, valid UTF-8 -encoded character. - -Note that an INVARIANT character (i.e. ASCII on non-EBCDIC -machines) is a valid UTF-8 character. +This is identical to the macro L. =cut */ @@ -353,24 +318,9 @@ STRLEN Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end) { - STRLEN len; - PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF; - if (buf_end <= buf) { - return 0; - } - - len = buf_end - buf; - if (len > UTF8SKIP(buf)) { - len = UTF8SKIP(buf); - } - -#ifdef IS_UTF8_CHAR - if (IS_UTF8_CHAR_FAST(len)) - return IS_UTF8_CHAR(buf, len) ? len : 0; -#endif /* #ifdef IS_UTF8_CHAR */ - return is_utf8_char_slow(buf, len); + return isUTF8_CHAR(buf, buf_end); } /* @@ -393,7 +343,7 @@ Perl_is_utf8_char(const U8 *s) PERL_ARGS_ASSERT_IS_UTF8_CHAR; /* Assumes we have enough space, which is why this is deprecated */ - return is_utf8_char_buf(s, s + UTF8SKIP(s)); + return isUTF8_CHAR(s, s + UTF8SKIP(s)); } @@ -402,9 +352,9 @@ Perl_is_utf8_char(const U8 *s) Returns true if the first C bytes of string C form a valid UTF-8 string, false otherwise. If C is 0, it will be calculated -using C (which means if you use this option, that C has to have a -terminating NUL byte). Note that all characters being ASCII constitute 'a -valid UTF-8 string'. +using C (which means if you use this option, that C can't have +embedded C characters and has to have a terminating C byte). Note +that all characters being ASCII constitute 'a valid UTF-8 string'. See also L(), L(), and L(). @@ -420,28 +370,11 @@ Perl_is_utf8_string(const U8 *s, STRLEN len) PERL_ARGS_ASSERT_IS_UTF8_STRING; while (x < send) { - /* Inline the easy bits of is_utf8_char() here for speed... */ - if (UTF8_IS_INVARIANT(*x)) { - x++; - } - else { - /* ... and call is_utf8_char() only if really needed. */ - const STRLEN c = UTF8SKIP(x); - const U8* const next_char_ptr = x + c; - - if (next_char_ptr > send) { - return FALSE; - } - - if (IS_UTF8_CHAR_FAST(c)) { - if (!IS_UTF8_CHAR(x, c)) - return FALSE; - } - else if (! is_utf8_char_slow(x, c)) { - return FALSE; - } - x = next_char_ptr; - } + STRLEN len = isUTF8_CHAR(x, send); + if (UNLIKELY(! len)) { + return FALSE; + } + x += len; } return TRUE; @@ -475,34 +408,17 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { const U8* const send = s + (len ? len : strlen((const char *)s)); const U8* x = s; - STRLEN c; STRLEN outlen = 0; PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; while (x < send) { - const U8* next_char_ptr; - - /* Inline the easy bits of is_utf8_char() here for speed... */ - if (UTF8_IS_INVARIANT(*x)) - next_char_ptr = x + 1; - else { - /* ... and call is_utf8_char() only if really needed. */ - c = UTF8SKIP(x); - next_char_ptr = c + x; - if (next_char_ptr > send) { - goto out; - } - if (IS_UTF8_CHAR_FAST(c)) { - if (!IS_UTF8_CHAR(x, c)) - c = 0; - } else - c = is_utf8_char_slow(x, c); - if (!c) - goto out; - } - x = next_char_ptr; - outlen++; + STRLEN len = isUTF8_CHAR(x, send); + if (UNLIKELY(! len)) { + goto out; + } + x += len; + outlen++; } out: @@ -549,11 +465,11 @@ flags) malformation is found. If this flag is set, the routine assumes that the caller will raise a warning, and this function will silently just set C to C<-1> (cast to C) and return zero. -Note that this API requires disambiguation between successful decoding a NUL +Note that this API requires disambiguation between successful decoding a C character, and an error return (unless the UTF8_CHECK_ONLY flag is set), as in both cases, 0 is returned. To disambiguate, upon a zero return, see if the -first byte of C is 0 as well. If so, the input was a NUL; if not, the input -had an error. +first byte of C is 0 as well. If so, the input was a C; if not, the +input had an error. Certain code points are considered problematic. These are Unicode surrogates, Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF. @@ -584,7 +500,8 @@ The UTF-8 encoding on ASCII platforms for these large code points begins with a byte containing 0xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to be treated as malformations, while allowing smaller above-Unicode code points. (Of course UTF8_DISALLOW_SUPER will treat all above-Unicode code points, -including these, as malformations.) Similarly, UTF8_WARN_FE_FF acts just like +including these, as malformations.) +Similarly, UTF8_WARN_FE_FF acts just like the other WARN flags, but applies just to these code points. All other code points corresponding to Unicode characters, including private @@ -630,7 +547,7 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) * We also should not consume too few bytes, otherwise someone could inject * things. For example, an input could be deliberately designed to * overflow, and if this code bailed out immediately upon discovering that, - * returning to the caller *retlen pointing to the very next byte (one + * returning to the caller C<*retlen> pointing to the very next byte (one * which is actually part of of the overflowing sequence), that could look * legitimate to the caller, which could discard the initial partial * sequence and process the rest, inappropriately */ @@ -779,32 +696,8 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) } } -#ifndef EBCDIC /* EBCDIC allows FE, FF, can't overflow */ - if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */ - && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF))) - { - /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary - * generation of the sv, since no warnings are raised under CHECK */ - if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) == UTF8_WARN_FE_FF - && ckWARN_d(WARN_UTF8)) - { - /* This message is deliberately not of the same syntax as the other - * messages for malformations, for backwards compatibility in the - * unlikely event that code is relying on its precise earlier text - */ - sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s Code point beginning with byte 0x%02X is not Unicode, and not portable", malformed_text, *s0)); - pack_warn = packWARN(WARN_UTF8); - } - if (flags & UTF8_DISALLOW_FE_FF) { - goto malformed; - } - } +#ifndef EBCDIC /* EBCDIC can't overflow */ if (UNLIKELY(overflowed)) { - - /* If the first byte is FF, it will overflow a 32-bit word. If the - * first byte is FE, it will overflow a signed 32-bit word. The - * above preserves backward compatibility, since its message was used - * in earlier versions of this code in preference to overflow */ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0)); goto malformed; } @@ -824,18 +717,21 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) goto malformed; } - /* Here, the input is considered to be well-formed , but could be a + /* Here, the input is considered to be well-formed, but it still could be a * problematic code point that is not allowed by the input parameters. */ if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */ && (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE |UTF8_WARN_ILLEGAL_INTERCHANGE))) { if (UNICODE_IS_SURROGATE(uv)) { + + /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary + * generation of the sv, since no warnings are raised under CHECK */ if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE - && ckWARN2_d(WARN_UTF8, WARN_SURROGATE)) + && ckWARN_d(WARN_SURROGATE)) { sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv)); - pack_warn = packWARN2(WARN_UTF8, WARN_SURROGATE); + pack_warn = packWARN(WARN_SURROGATE); } if (flags & UTF8_DISALLOW_SURROGATE) { goto disallowed; @@ -843,21 +739,42 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) } else if ((uv > PERL_UNICODE_MAX)) { if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER - && ckWARN2_d(WARN_UTF8, WARN_NON_UNICODE)) + && ckWARN_d(WARN_NON_UNICODE)) { sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv)); - pack_warn = packWARN2(WARN_UTF8, WARN_NON_UNICODE); + pack_warn = packWARN(WARN_NON_UNICODE); } +#ifndef EBCDIC /* EBCDIC always allows FE, FF */ + + /* The first byte being 0xFE or 0xFF is a subset of the SUPER code + * points. We test for these after the regular SUPER ones, and + * before possibly bailing out, so that the more dire warning + * overrides the regular one, if applicable */ + if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */ + && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF))) + { + if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) + == UTF8_WARN_FE_FF + && ckWARN_d(WARN_UTF8)) + { + sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%"UVXf" is not Unicode, and not portable", uv)); + pack_warn = packWARN(WARN_UTF8); + } + if (flags & UTF8_DISALLOW_FE_FF) { + goto disallowed; + } + } +#endif if (flags & UTF8_DISALLOW_SUPER) { goto disallowed; } } else if (UNICODE_IS_NONCHAR(uv)) { if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR - && ckWARN2_d(WARN_UTF8, WARN_NONCHAR)) + && ckWARN_d(WARN_NONCHAR)) { sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv)); - pack_warn = packWARN2(WARN_UTF8, WARN_NONCHAR); + pack_warn = packWARN(WARN_NONCHAR); } if (flags & UTF8_DISALLOW_NONCHAR) { goto disallowed; @@ -967,8 +884,7 @@ Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) /* Like L(), but should only be called when it is known that * there are no malformations in the input UTF-8 string C. surrogates, - * non-character code points, and non-Unicode code points are allowed. A macro - * in utf8.h is used to normally avoid this function wrapper */ + * non-character code points, and non-Unicode code points are allowed. */ UV Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) @@ -1220,12 +1136,14 @@ Perl_utf8_hop(pTHX_ const U8 *s, I32 off) =for apidoc bytes_cmp_utf8 Compares the sequence of characters (stored as octets) in C, C with the -sequence of characters (stored as UTF-8) in C, C. Returns 0 if they are +sequence of characters (stored as UTF-8) +in C, C. Returns 0 if they are equal, -1 or -2 if the first string is less than the second string, +1 or +2 if the first string is greater than the second string. -1 or +1 is returned if the shorter string was identical to the start of the -longer string. -2 or +2 is returned if the was a difference between characters +longer string. -2 or +2 is returned if +there was a difference between characters within the strings. =cut @@ -1340,7 +1258,7 @@ Converts a string C of length C from UTF-8 into native byte encoding. Unlike L but like L, returns a pointer to the newly-created string, and updates C to contain the new length. Returns the original string if no conversion occurs, C -is unchanged. Do nothing if C points to 0. Sets C to +is unchanged. Do nothing if C points to 0. Sets C to 0 if C is converted or consisted entirely of characters that are invariant in utf8 (i.e., US-ASCII on non-EBCDIC machines). @@ -1399,7 +1317,7 @@ UTF-8. Returns a pointer to the newly-created string, and sets C to reflect the new length in bytes. -A NUL character will be written after the end of the string. +A C character will be written after the end of the string. If you want to convert to UTF-8 from encodings other than the native (Latin1 or EBCDIC), @@ -1535,26 +1453,6 @@ Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) return _is_utf8_FOO(classnum, tmpbuf); } -/* for now these are all defined (inefficiently) in terms of the utf8 versions. - * Note that the macros in handy.h that call these short-circuit calling them - * for Latin-1 range inputs */ - -bool -Perl_is_uni_alnum(pTHX_ UV c) -{ - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return _is_utf8_FOO(_CC_WORDCHAR, tmpbuf); -} - -bool -Perl_is_uni_alnumc(pTHX_ UV c) -{ - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return _is_utf8_FOO(_CC_ALPHANUMERIC, tmpbuf); -} - /* Internal function so we can deprecate the external one, and call this one from other deprecated functions in this file */ @@ -1566,7 +1464,7 @@ S_is_utf8_idfirst(pTHX_ const U8 *p) if (*p == '_') return TRUE; /* is_utf8_idstart would be more logical. */ - return is_utf8_common(p, &PL_utf8_idstart, "IdStart"); + return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL); } bool @@ -1593,92 +1491,6 @@ Perl__is_uni_perl_idstart(pTHX_ UV c) return _is_utf8_perl_idstart(tmpbuf); } -bool -Perl_is_uni_alpha(pTHX_ UV c) -{ - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return _is_utf8_FOO(_CC_ALPHA, tmpbuf); -} - -bool -Perl_is_uni_ascii(pTHX_ UV c) -{ - return isASCII(c); -} - -bool -Perl_is_uni_blank(pTHX_ UV c) -{ - return isBLANK_uni(c); -} - -bool -Perl_is_uni_space(pTHX_ UV c) -{ - return isSPACE_uni(c); -} - -bool -Perl_is_uni_digit(pTHX_ UV c) -{ - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return _is_utf8_FOO(_CC_DIGIT, tmpbuf); -} - -bool -Perl_is_uni_upper(pTHX_ UV c) -{ - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return _is_utf8_FOO(_CC_UPPER, tmpbuf); -} - -bool -Perl_is_uni_lower(pTHX_ UV c) -{ - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return _is_utf8_FOO(_CC_LOWER, tmpbuf); -} - -bool -Perl_is_uni_cntrl(pTHX_ UV c) -{ - return isCNTRL_L1(c); -} - -bool -Perl_is_uni_graph(pTHX_ UV c) -{ - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return _is_utf8_FOO(_CC_GRAPH, tmpbuf); -} - -bool -Perl_is_uni_print(pTHX_ UV c) -{ - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return _is_utf8_FOO(_CC_PRINT, tmpbuf); -} - -bool -Perl_is_uni_punct(pTHX_ UV c) -{ - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return _is_utf8_FOO(_CC_PUNCT, tmpbuf); -} - -bool -Perl_is_uni_xdigit(pTHX_ UV c) -{ - return isXDIGIT_uni(c); -} - UV Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s) { @@ -1694,7 +1506,7 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_ assert(S_or_s == 'S' || S_or_s == 's'); - if (NATIVE_IS_INVARIANT(converted)) { /* No difference between the two for + if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for characters in this range */ *p = (U8) converted; *lenp = 1; @@ -1738,14 +1550,14 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_ * LENP will be set to the length in bytes of the string of changed characters * * The functions return the ordinal of the first character in the string of OUTP */ -#define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "utf8::ToSpecUc") -#define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "utf8::ToSpecTc") -#define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "utf8::ToSpecLc") +#define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "") +#define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "") +#define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "") /* This additionally has the input parameter SPECIALS, which if non-zero will * cause this to use the SPECIALS hash for folding (meaning get full case * folding); otherwise, when zero, this implies a simple case fold */ -#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "utf8::ToSpecCf" : NULL) +#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "" : NULL) UV Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) @@ -1795,7 +1607,7 @@ S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp) U8 converted = toLOWER_LATIN1(c); if (p != NULL) { - if (NATIVE_IS_INVARIANT(converted)) { + if (NATIVE_BYTE_IS_INVARIANT(converted)) { *p = converted; *lenp = 1; } @@ -1865,7 +1677,7 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f converted = toLOWER_LATIN1(c); } - if (NATIVE_IS_INVARIANT(converted)) { + if (UVCHR_IS_INVARIANT(converted)) { *p = (U8) converted; *lenp = 1; } @@ -1879,21 +1691,27 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f } UV -Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags) +Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) { /* Not currently externally documented, and subject to change * bits meanings: * FOLD_FLAGS_FULL iff full folding is to be used; - * FOLD_FLAGS_LOCALE iff in locale + * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying + * locale are to be used. * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited */ PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS; + /* Tread a UTF-8 locale as not being in locale at all */ + if (IN_UTF8_CTYPE_LOCALE) { + flags &= ~FOLD_FLAGS_LOCALE; + } + if (c < 256) { UV result = _to_fold_latin1((U8) c, p, lenp, - flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); + flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); /* It is illegal for the fold to cross the 255/256 boundary under * locale; in this case return the original */ return (result > 256 && flags & FOLD_FLAGS_LOCALE) @@ -1910,184 +1728,21 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags) the special flags. */ U8 utf8_c[UTF8_MAXBYTES + 1]; uvchr_to_utf8(utf8_c, c); - return _to_utf8_fold_flags(utf8_c, p, lenp, flags, NULL); - } -} - -bool -Perl_is_uni_alnum_lc(pTHX_ UV c) -{ - if (c < 256) { - return isALNUM_LC(c); - } - return _is_uni_FOO(_CC_WORDCHAR, c); -} - -bool -Perl_is_uni_alnumc_lc(pTHX_ UV c) -{ - if (c < 256) { - return isALPHANUMERIC_LC(c); - } - return _is_uni_FOO(_CC_ALPHANUMERIC, c); -} - -bool -Perl_is_uni_idfirst_lc(pTHX_ UV c) -{ - if (c < 256) { - return isIDFIRST_LC(c); - } - return _is_uni_perl_idstart(c); -} - -bool -Perl_is_uni_alpha_lc(pTHX_ UV c) -{ - if (c < 256) { - return isALPHA_LC(c); - } - return _is_uni_FOO(_CC_ALPHA, c); -} - -bool -Perl_is_uni_ascii_lc(pTHX_ UV c) -{ - if (c < 256) { - return isASCII_LC(c); + return _to_utf8_fold_flags(utf8_c, p, lenp, flags); } - return 0; -} - -bool -Perl_is_uni_blank_lc(pTHX_ UV c) -{ - if (c < 256) { - return isBLANK_LC(c); - } - return isBLANK_uni(c); -} - -bool -Perl_is_uni_space_lc(pTHX_ UV c) -{ - if (c < 256) { - return isSPACE_LC(c); - } - return isSPACE_uni(c); -} - -bool -Perl_is_uni_digit_lc(pTHX_ UV c) -{ - if (c < 256) { - return isDIGIT_LC(c); - } - return _is_uni_FOO(_CC_DIGIT, c); -} - -bool -Perl_is_uni_upper_lc(pTHX_ UV c) -{ - if (c < 256) { - return isUPPER_LC(c); - } - return _is_uni_FOO(_CC_UPPER, c); -} - -bool -Perl_is_uni_lower_lc(pTHX_ UV c) -{ - if (c < 256) { - return isLOWER_LC(c); - } - return _is_uni_FOO(_CC_LOWER, c); -} - -bool -Perl_is_uni_cntrl_lc(pTHX_ UV c) -{ - if (c < 256) { - return isCNTRL_LC(c); - } - return 0; -} - -bool -Perl_is_uni_graph_lc(pTHX_ UV c) -{ - if (c < 256) { - return isGRAPH_LC(c); - } - return _is_uni_FOO(_CC_GRAPH, c); -} - -bool -Perl_is_uni_print_lc(pTHX_ UV c) -{ - if (c < 256) { - return isPRINT_LC(c); - } - return _is_uni_FOO(_CC_PRINT, c); -} - -bool -Perl_is_uni_punct_lc(pTHX_ UV c) -{ - if (c < 256) { - return isPUNCT_LC(c); - } - return _is_uni_FOO(_CC_PUNCT, c); -} - -bool -Perl_is_uni_xdigit_lc(pTHX_ UV c) -{ - if (c < 256) { - return isXDIGIT_LC(c); - } - return isXDIGIT_uni(c); -} - -U32 -Perl_to_uni_upper_lc(pTHX_ U32 c) -{ - /* XXX returns only the first character -- do not use XXX */ - /* XXX no locale support yet */ - STRLEN len; - U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; - return (U32)to_uni_upper(c, tmpbuf, &len); -} - -U32 -Perl_to_uni_title_lc(pTHX_ U32 c) -{ - /* XXX returns only the first character XXX -- do not use XXX */ - /* XXX no locale support yet */ - STRLEN len; - U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; - return (U32)to_uni_title(c, tmpbuf, &len); -} - -U32 -Perl_to_uni_lower_lc(pTHX_ U32 c) -{ - /* XXX returns only the first character -- do not use XXX */ - /* XXX no locale support yet */ - STRLEN len; - U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; - return (U32)to_uni_lower(c, tmpbuf, &len); } PERL_STATIC_INLINE bool S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, - const char *const swashname) + const char *const swashname, SV* const invlist) { /* returns a boolean giving whether or not the UTF8-encoded character that * starts at

is in the swash indicated by . * contains a pointer to where the swash indicated by * is to be stored; which this routine will do, so that future calls will - * look at <*swash> and only generate a swash if it is not null + * look at <*swash> and only generate a swash if it is not null. + * is NULL or an inversion list that defines the swash. If not null, it + * saves time during initialization of the swash. * * Note that it is assumed that the buffer length of

is enough to * contain all the bytes that comprise the character. Thus, <*p> should @@ -2103,7 +1758,7 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, * as far as there being enough bytes available in it to accommodate the * character without reading beyond the end, and pass that number on to the * validating routine */ - if (! is_utf8_char_buf(p, p + UTF8SKIP(p))) { + if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) { if (ckWARN_d(WARN_UTF8)) { Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8), "Passing malformed UTF-8 to \"%s\" is deprecated", swashname); @@ -2116,7 +1771,13 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, } if (!*swash) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; - *swash = _core_swash_init("utf8", swashname, &PL_sv_undef, 1, 0, NULL, &flags); + *swash = _core_swash_init("utf8", + + /* Only use the name if there is no inversion + * list; otherwise will go out to disk */ + (invlist) ? "" : swashname, + + &PL_sv_undef, 1, 0, invlist, &flags); } return swash_fetch(*swash, p, TRUE) != 0; @@ -2131,30 +1792,10 @@ Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p) assert(classnum < _FIRST_NON_SWASH_CC); - return is_utf8_common(p, &PL_utf8_swash_ptrs[classnum], swash_property_names[classnum]); -} - -bool -Perl_is_utf8_alnum(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_ALNUM; - - /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true - * descendant of isalnum(3), in other words, it doesn't - * contain the '_'. --jhi */ - return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_WORDCHAR], "IsWord"); -} - -bool -Perl_is_utf8_alnumc(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_ALNUMC; - - return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_ALPHANUMERIC], "IsAlnum"); + return is_utf8_common(p, + &PL_utf8_swash_ptrs[classnum], + swash_property_names[classnum], + PL_XPosix_ptrs[classnum]); } bool @@ -2177,27 +1818,35 @@ Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */ if (*p == '_') return TRUE; /* is_utf8_idstart would be more logical. */ - return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart"); + return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL); } bool Perl__is_utf8_perl_idstart(pTHX_ const U8 *p) { dVAR; + SV* invlist = NULL; PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART; - return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart"); + if (! PL_utf8_perl_idstart) { + invlist = _new_invlist_C_array(_Perl_IDStart_invlist); + } + return is_utf8_common(p, &PL_utf8_perl_idstart, "", invlist); } bool Perl__is_utf8_perl_idcont(pTHX_ const U8 *p) { dVAR; + SV* invlist = NULL; PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT; - return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont"); + if (! PL_utf8_perl_idcont) { + invlist = _new_invlist_C_array(_Perl_IDCont_invlist); + } + return is_utf8_common(p, &PL_utf8_perl_idcont, "", invlist); } @@ -2208,7 +1857,7 @@ Perl_is_utf8_idcont(pTHX_ const U8 *p) PERL_ARGS_ASSERT_IS_UTF8_IDCONT; - return is_utf8_common(p, &PL_utf8_idcont, "IdContinue"); + return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL); } bool @@ -2218,165 +1867,7 @@ Perl_is_utf8_xidcont(pTHX_ const U8 *p) PERL_ARGS_ASSERT_IS_UTF8_XIDCONT; - return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue"); -} - -bool -Perl_is_utf8_alpha(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_ALPHA; - - return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_ALPHA], "IsAlpha"); -} - -bool -Perl_is_utf8_ascii(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_ASCII; - - /* ASCII characters are the same whether in utf8 or not. So the macro - * works on both utf8 and non-utf8 representations. */ - return isASCII(*p); -} - -bool -Perl_is_utf8_blank(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_BLANK; - - return isBLANK_utf8(p); -} - -bool -Perl_is_utf8_space(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_SPACE; - - return isSPACE_utf8(p); -} - -bool -Perl_is_utf8_perl_space(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE; - - /* Only true if is an ASCII space-like character, and ASCII is invariant - * under utf8, so can just use the macro */ - return isSPACE_A(*p); -} - -bool -Perl_is_utf8_perl_word(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD; - - /* Only true if is an ASCII word character, and ASCII is invariant - * under utf8, so can just use the macro */ - return isWORDCHAR_A(*p); -} - -bool -Perl_is_utf8_digit(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_DIGIT; - - return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_DIGIT], "IsDigit"); -} - -bool -Perl_is_utf8_posix_digit(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT; - - /* Only true if is an ASCII digit character, and ASCII is invariant - * under utf8, so can just use the macro */ - return isDIGIT_A(*p); -} - -bool -Perl_is_utf8_upper(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_UPPER; - - return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_UPPER], "IsUppercase"); -} - -bool -Perl_is_utf8_lower(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_LOWER; - - return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_LOWER], "IsLowercase"); -} - -bool -Perl_is_utf8_cntrl(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_CNTRL; - - return isCNTRL_utf8(p); -} - -bool -Perl_is_utf8_graph(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_GRAPH; - - return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_GRAPH], "IsGraph"); -} - -bool -Perl_is_utf8_print(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_PRINT; - - return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_PRINT], "IsPrint"); -} - -bool -Perl_is_utf8_punct(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_PUNCT; - - return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_PUNCT], "IsPunct"); -} - -bool -Perl_is_utf8_xdigit(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_XDIGIT; - - return is_XDIGIT_utf8(p); + return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL); } bool @@ -2386,42 +1877,32 @@ Perl__is_utf8_mark(pTHX_ const U8 *p) PERL_ARGS_ASSERT__IS_UTF8_MARK; - return is_utf8_common(p, &PL_utf8_mark, "IsM"); -} - - -bool -Perl_is_utf8_mark(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_MARK; - - return _is_utf8_mark(p); + return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL); } /* =for apidoc to_utf8_case -The C

contains the pointer to the UTF-8 string encoding +C

contains the pointer to the UTF-8 string encoding the character that is being converted. This routine assumes that the character at C

is well-formed. -The C is a pointer to the character buffer to put the -conversion result to. The C is a pointer to the length +C is a pointer to the character buffer to put the +conversion result to. C is a pointer to the length of the result. -The C is a pointer to the swash to use. +C is a pointer to the swash to use. Both the special and normal mappings are stored in F, -and loaded by SWASHNEW, using F. The C (usually, +and loaded by SWASHNEW, using F. C (usually, but not always, a multicharacter mapping), is tried first. -The C is a string like "utf8::ToSpecLower", which means the -hash %utf8::ToSpecLower. The access to the hash is through -Perl_to_utf8_case(). +C is a string, normally C or C<"">. C means to not use +any special mappings; C<""> means to use the special mappings. Values other +than these two are treated as the name of the hash containing the special +mappings, like C<"utf8::ToSpecLower">. -The C is a string like "ToLower" which means the swash +C is a string like "ToLower" which means the swash %utf8::ToLower. =cut */ @@ -2464,12 +1945,25 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, if (special) { /* It might be "special" (sometimes, but not always, * a multicharacter mapping) */ - HV * const hv = get_hv(special, 0); + HV *hv = NULL; SV **svp; - if (hv && - (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE)) && - (*svp)) { + /* If passed in the specials name, use that; otherwise use any + * given in the swash */ + if (*special != '\0') { + hv = get_hv(special, 0); + } + else { + svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0); + if (svp) { + hv = MUTABLE_HV(SvRV(*svp)); + } + } + + if (hv + && (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE)) + && (*svp)) + { const char *s; s = SvPV_const(*svp, len); @@ -2516,9 +2010,10 @@ STATIC UV S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp) { /* This is called when changing the case of a utf8-encoded character above - * the Latin1 range, and the operation is in locale. If the result - * contains a character that crosses the 255/256 boundary, disallow the - * change, and return the original code point. See L for why; + * the Latin1 range, and the operation is in a non-UTF-8 locale. If the + * result contains a character that crosses the 255/256 boundary, disallow + * the change, and return the original code point. See L for + * why; * * p points to the original string whose case was changed; assumed * by this routine to be well-formed @@ -2567,12 +2062,11 @@ Instead use L. =cut */ /* Not currently externally documented, and subject to change: - * is set iff locale semantics are to be used for code points < 256 - * if non-null, *tainted_ptr will be set TRUE iff locale rules - * were used in the calculation; otherwise unchanged. */ + * is set iff iff the rules from the current underlying locale are to + * be used. */ UV -Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr) +Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags) { dVAR; @@ -2580,6 +2074,10 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS; + if (flags && IN_UTF8_CTYPE_LOCALE) { + flags = FALSE; + } + if (UTF8_IS_INVARIANT(*p)) { if (flags) { result = toUPPER_LC(*p); @@ -2590,7 +2088,8 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool } else if UTF8_IS_DOWNGRADEABLE_START(*p) { if (flags) { - result = toUPPER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1))); + U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)); + result = toUPPER_LC(c); } else { return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), @@ -2612,14 +2111,11 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool *lenp = 1; } else { - *ustrp = UTF8_EIGHT_BIT_HI(result); - *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result); + *ustrp = UTF8_EIGHT_BIT_HI((U8) result); + *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); *lenp = 2; } - if (tainted_ptr) { - *tainted_ptr = TRUE; - } return result; } @@ -2631,14 +2127,13 @@ Instead use L. =cut */ /* Not currently externally documented, and subject to change: - * is set iff locale semantics are to be used for code points < 256 - * Since titlecase is not defined in POSIX, uppercase is used instead - * for these/ - * if non-null, *tainted_ptr will be set TRUE iff locale rules - * were used in the calculation; otherwise unchanged. */ + * is set iff the rules from the current underlying locale are to be + * used. Since titlecase is not defined in POSIX, for other than a + * UTF-8 locale, uppercase is used instead for code points < 256. + */ UV -Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr) +Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags) { dVAR; @@ -2646,6 +2141,10 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS; + if (flags && IN_UTF8_CTYPE_LOCALE) { + flags = FALSE; + } + if (UTF8_IS_INVARIANT(*p)) { if (flags) { result = toUPPER_LC(*p); @@ -2656,7 +2155,8 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool } else if UTF8_IS_DOWNGRADEABLE_START(*p) { if (flags) { - result = toUPPER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1))); + U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)); + result = toUPPER_LC(c); } else { return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), @@ -2678,14 +2178,11 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool *lenp = 1; } else { - *ustrp = UTF8_EIGHT_BIT_HI(result); - *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result); + *ustrp = UTF8_EIGHT_BIT_HI((U8) result); + *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); *lenp = 2; } - if (tainted_ptr) { - *tainted_ptr = TRUE; - } return result; } @@ -2697,12 +2194,12 @@ Instead use L. =cut */ /* Not currently externally documented, and subject to change: - * is set iff locale semantics are to be used for code points < 256 - * if non-null, *tainted_ptr will be set TRUE iff locale rules - * were used in the calculation; otherwise unchanged. */ + * is set iff iff the rules from the current underlying locale are to + * be used. + */ UV -Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr) +Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags) { UV result; @@ -2710,6 +2207,10 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS; + if (flags && IN_UTF8_CTYPE_LOCALE) { + flags = FALSE; + } + if (UTF8_IS_INVARIANT(*p)) { if (flags) { result = toLOWER_LC(*p); @@ -2720,7 +2221,8 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool } else if UTF8_IS_DOWNGRADEABLE_START(*p) { if (flags) { - result = toLOWER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1))); + U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)); + result = toLOWER_LC(c); } else { return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), @@ -2743,14 +2245,11 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool *lenp = 1; } else { - *ustrp = UTF8_EIGHT_BIT_HI(result); - *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result); + *ustrp = UTF8_EIGHT_BIT_HI((U8) result); + *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); *lenp = 2; } - if (tainted_ptr) { - *tainted_ptr = TRUE; - } return result; } @@ -2763,18 +2262,16 @@ Instead use L. /* Not currently externally documented, and subject to change, * in - * bit FOLD_FLAGS_LOCALE is set iff locale semantics are to be used for code - * points < 256. Since foldcase is not defined in - * POSIX, lowercase is used instead + * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying + * locale are to be used. * bit FOLD_FLAGS_FULL is set iff full case folds are to be used; * otherwise simple folds * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are * prohibited - * if non-null, *tainted_ptr will be set TRUE iff locale rules - * were used in the calculation; otherwise unchanged. */ + */ UV -Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, bool* tainted_ptr) +Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) { dVAR; @@ -2787,6 +2284,10 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b assert(p != ustrp); /* Otherwise overwrites */ + if (flags & FOLD_FLAGS_LOCALE && IN_UTF8_CTYPE_LOCALE) { + flags &= ~FOLD_FLAGS_LOCALE; + } + if (UTF8_IS_INVARIANT(*p)) { if (flags & FOLD_FLAGS_LOCALE) { result = toFOLD_LC(*p); @@ -2798,7 +2299,8 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b } else if UTF8_IS_DOWNGRADEABLE_START(*p) { if (flags & FOLD_FLAGS_LOCALE) { - result = toFOLD_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1))); + U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)); + result = toFOLD_LC(c); } else { return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), @@ -2811,8 +2313,8 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b if (flags & FOLD_FLAGS_LOCALE) { - /* Special case these characters, as what normally gets returned - * under locale doesn't work */ + /* Special case these two characters, as what normally gets + * returned under locale doesn't work */ if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1 && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8, sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1)) @@ -2873,14 +2375,11 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b *lenp = 1; } else { - *ustrp = UTF8_EIGHT_BIT_HI(result); - *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result); + *ustrp = UTF8_EIGHT_BIT_HI((U8) result); + *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); *lenp = 2; } - if (tainted_ptr) { - *tainted_ptr = TRUE; - } return result; return_long_s: @@ -2963,7 +2462,9 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m * Thus there are three possible inputs to find the swash: , * , and . At least one must be specified. The result * will be the union of the specified ones, although 's various - * actions can intersect, etc. what gives. + * actions can intersect, etc. what gives. To avoid going out to + * disk at all, should specify completely what the swash should + * have, and should be &PL_sv_undef and should be "". * * is only valid for binary properties */ @@ -3145,7 +2646,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m /* Here, we have computed the union of all the passed-in data. It may * be that there was an inversion list in the swash which didn't get - * touched; otherwise save the one computed one */ + * touched; otherwise save the computed one */ if (! invlist_in_swash_is_valid && (int) _invlist_len(swash_invlist) > invlist_swash_boundary) { @@ -3158,6 +2659,8 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m else SvREFCNT_inc_simple_void_NN(swash_invlist); } + SvREADONLY_on(swash_invlist); + /* Use the inversion list stand-alone if small enough */ if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) { SvREFCNT_dec(retval); @@ -3220,10 +2723,9 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) HV *const hv = MUTABLE_HV(SvRV(swash)); U32 klen; U32 off; - STRLEN slen; + STRLEN slen = 0; STRLEN needents; const U8 *tmps = NULL; - U32 bit; SV *swatch; const U8 c = *ptr; @@ -3353,17 +2855,21 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) switch ((int)((slen << 3) / needents)) { case 1: - bit = 1 << (off & 7); - off >>= 3; - return (tmps[off] & bit) != 0; + return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0; case 8: - return tmps[off]; + return ((UV) tmps[off]); case 16: off <<= 1; - return (tmps[off] << 8) + tmps[off + 1] ; + return + ((UV) tmps[off ] << 8) + + ((UV) tmps[off + 1]); case 32: off <<= 2; - return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ; + return + ((UV) tmps[off ] << 24) + + ((UV) tmps[off + 1] << 16) + + ((UV) tmps[off + 2] << 8) + + ((UV) tmps[off + 3]); } Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, " "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents); @@ -3401,9 +2907,12 @@ S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val, /* nl points to the next \n in the scan */ U8* const nl = (U8*)memchr(l, '\n', lend - l); + PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE; + /* Get the first number on the line: the range minimum */ numlen = lend - l; *min = grok_hex((char *)l, &numlen, &flags, NULL); + *max = *min; /* So can never return without setting max */ if (numlen) /* If found a hex number, position past it */ l += numlen; else if (nl) { /* Else, go handle next line, if any */ @@ -3427,30 +2936,19 @@ S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val, *max = *min; /* Non-binary tables have a third entry: what the first element of the - * range maps to */ + * range maps to. The map for those currently read here is in hex */ if (wants_value) { if (isBLANK(*l)) { ++l; - - /* The ToLc, etc table mappings are not in hex, and must be - * corrected by adding the code point to them */ - if (typeto) { - char *after_strtol = (char *) lend; - *val = Strtol((char *)l, &after_strtol, 10); - l = (U8 *) after_strtol; - } - else { /* Other tables are in hex, and are the correct result - without tweaking */ - flags = PERL_SCAN_SILENT_ILLDIGIT - | PERL_SCAN_DISALLOW_PREFIX - | PERL_SCAN_SILENT_NON_PORTABLE; - numlen = lend - l; - *val = grok_hex((char *)l, &numlen, &flags, NULL); - if (numlen) - l += numlen; - else - *val = 0; - } + flags = PERL_SCAN_SILENT_ILLDIGIT + | PERL_SCAN_DISALLOW_PREFIX + | PERL_SCAN_SILENT_NON_PORTABLE; + numlen = lend - l; + *val = grok_hex((char *)l, &numlen, &flags, NULL); + if (numlen) + l += numlen; + else + *val = 0; } else { *val = 0; @@ -3466,7 +2964,6 @@ S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val, } else { /* Nothing following range min, should be single element with no mapping expected */ - *max = *min; if (wants_value) { *val = 0; if (typeto) { @@ -3581,8 +3078,8 @@ S_swatch_get(pTHX_ SV* swash, UV start, UV span) lend = l + lcur; while (l < lend) { UV min, max, val, upper; - l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val, - cBOOL(octets), typestr); + l = swash_scan_list_line(l, lend, &min, &max, &val, + cBOOL(octets), typestr); if (l > lend) { break; } @@ -3822,6 +3319,9 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) * have two elements, the utf8 for itself, and for 004C. For 006B, there * would be three elements in its array, the utf8 for 006B, 004B and 212A. * + * Note that there are no elements in the hash for 004B, 004C, 212A. The + * keys are only code points that are folded-to, so it isn't a full closure. + * * Essentially, for any code point, it gives all the code points that map to * it, or the list of 'froms' for that point. * @@ -3941,12 +3441,12 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) while ((from_list = (AV *) hv_iternextsv(specials_inverse, &char_to, &to_len))) { - if (av_len(from_list) > 0) { + if (av_tindex(from_list) > 0) { SSize_t i; /* We iterate over all combinations of i,j to place each code * point on each list */ - for (i = 0; i <= av_len(from_list); i++) { + for (i = 0; i <= av_tindex(from_list); i++) { SSize_t j; AV* i_list = newAV(); SV** entryp = av_fetch(from_list, i, FALSE); @@ -3962,8 +3462,8 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); } - /* For debugging: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/ - for (j = 0; j <= av_len(from_list); j++) { + /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/ + for (j = 0; j <= av_tindex(from_list); j++) { entryp = av_fetch(from_list, j, FALSE); if (entryp == NULL) { Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed"); @@ -3990,8 +3490,8 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) while (l < lend) { UV min, max, val; UV inverse; - l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val, - cBOOL(octets), typestr); + l = swash_scan_list_line(l, lend, &min, &max, &val, + cBOOL(octets), typestr); if (l > lend) { break; } @@ -4022,7 +3522,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) /* Look through list to see if this inverse mapping already is * listed, or if there is a mapping to itself already */ - for (i = 0; i <= av_len(list); i++) { + for (i = 0; i <= av_tindex(list); i++) { SV** entryp = av_fetch(list, i, FALSE); SV* entry; if (entryp == NULL) { @@ -4133,43 +3633,76 @@ Perl__swash_to_invlist(pTHX_ SV* const swash) loc = (char *) l; lend = l + lcur; - /* Scan the input to count the number of lines to preallocate array size - * based on worst possible case, which is each line in the input creates 2 - * elements in the inversion list: 1) the beginning of a range in the list; - * 2) the beginning of a range not in the list. */ - while ((loc = (strchr(loc, '\n'))) != NULL) { - elements += 2; - loc++; - } + if (*l == 'V') { /* Inversion list format */ + char *after_strtol = (char *) lend; + UV element0; + UV* other_elements_ptr; - /* If the ending is somehow corrupt and isn't a new line, add another - * element for the final range that isn't in the inversion list */ - if (! (*lend == '\n' - || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n')))) - { - elements++; + /* The first number is a count of the rest */ + l++; + elements = Strtoul((char *)l, &after_strtol, 10); + if (elements == 0) { + invlist = _new_invlist(0); + } + else { + l = (U8 *) after_strtol; + + /* Get the 0th element, which is needed to setup the inversion list */ + element0 = (UV) Strtoul((char *)l, &after_strtol, 10); + l = (U8 *) after_strtol; + invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr); + elements--; + + /* Then just populate the rest of the input */ + while (elements-- > 0) { + if (l > lend) { + Perl_croak(aTHX_ "panic: Expecting %"UVuf" more elements than available", elements); + } + *other_elements_ptr++ = (UV) Strtoul((char *)l, &after_strtol, 10); + l = (U8 *) after_strtol; + } + } } + else { - invlist = _new_invlist(elements); + /* Scan the input to count the number of lines to preallocate array + * size based on worst possible case, which is each line in the input + * creates 2 elements in the inversion list: 1) the beginning of a + * range in the list; 2) the beginning of a range not in the list. */ + while ((loc = (strchr(loc, '\n'))) != NULL) { + elements += 2; + loc++; + } - /* Now go through the input again, adding each range to the list */ - while (l < lend) { - UV start, end; - UV val; /* Not used by this function */ + /* If the ending is somehow corrupt and isn't a new line, add another + * element for the final range that isn't in the inversion list */ + if (! (*lend == '\n' + || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n')))) + { + elements++; + } - l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val, - cBOOL(octets), typestr); + invlist = _new_invlist(elements); - if (l > lend) { - break; - } + /* Now go through the input again, adding each range to the list */ + while (l < lend) { + UV start, end; + UV val; /* Not used by this function */ - invlist = _add_range_to_invlist(invlist, start, end); + l = swash_scan_list_line(l, lend, &start, &end, &val, + cBOOL(octets), typestr); + + if (l > lend) { + break; + } + + invlist = _add_range_to_invlist(invlist, start, end); + } } /* Invert if the data says it should be */ if (invert_it_svp && SvUV(*invert_it_svp)) { - _invlist_invert_prop(invlist); + _invlist_invert(invlist); } /* This code is copied from swatch_get() @@ -4246,6 +3779,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash) sv_free(other); /* through with it! */ } + SvREADONLY_on(invlist); return invlist; } @@ -4480,13 +4014,11 @@ L (Case Mappings). * 0 for as-documented above * FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an ASCII one, to not match - * FOLDEQ_UTF8_LOCALE meaning that locale rules are to be used for code - * points below 256; unicode rules for above 255; and - * folds that cross those boundaries are disallowed, - * like the NOMIX_ASCII option - * FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this - * routine. This allows that step to be skipped. - * FOLDEQ_S2_ALREADY_FOLDED Similarly. + * FOLDEQ_LOCALE is set iff the rules from the current underlying + * locale are to be used. + * FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this + * routine. This allows that step to be skipped. + * FOLDEQ_S2_ALREADY_FOLDED Similarly. */ I32 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags) @@ -4506,7 +4038,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS; - assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_UTF8_LOCALE)) + assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE)) && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED)))); /* The algorithm is to trial the folds without regard to the flags on * the first line of the above assert(), and then see if the result @@ -4519,6 +4051,10 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c * and /iaa matches are most likely to involve code points 0-255, and this * function only under rare conditions gets called for 0-255. */ + if (IN_UTF8_CTYPE_LOCALE) { + flags &= ~FOLDEQ_LOCALE; + } + if (pe1) { e1 = *(U8**)pe1; } @@ -4579,7 +4115,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c /* If in locale matching, we use two sets of rules, depending * on if the code point is above or below 255. Here, we test * for and handle locale rules */ - if ((flags & FOLDEQ_UTF8_LOCALE) + if ((flags & FOLDEQ_LOCALE) && (! u1 || ! UTF8_IS_ABOVE_LATIN1(*p1))) { /* There is no mixing of code points above and below 255. */ @@ -4624,7 +4160,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c n2 = UTF8SKIP(f2); } else { - if ((flags & FOLDEQ_UTF8_LOCALE) + if ((flags & FOLDEQ_LOCALE) && (! u2 || ! UTF8_IS_ABOVE_LATIN1(*p2))) { /* Here, the next char in s2 is < 256. We've already @@ -4714,6 +4250,66 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c return 1; } +/* XXX The next four functions should likely be moved to mathoms.c once all + * occurrences of them are removed from the core; some cpan-upstream modules + * still use them */ + +U8 * +Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv) +{ + PERL_ARGS_ASSERT_UVUNI_TO_UTF8; + + return Perl_uvoffuni_to_utf8_flags(aTHX_ d, uv, 0); +} + +/* +=for apidoc utf8n_to_uvuni + +Instead use L, or rarely, L. + +This function was useful for code that wanted to handle both EBCDIC and +ASCII platforms with Unicode properties, but starting in Perl v5.20, the +distinctions between the platforms have mostly been made invisible to most +code, so this function is quite unlikely to be what you want. If you do need +this precise functionality, use instead +C> +or C>. + +=cut +*/ + +UV +Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) +{ + PERL_ARGS_ASSERT_UTF8N_TO_UVUNI; + + return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags)); +} + +/* +=for apidoc uvuni_to_utf8_flags + +Instead you almost certainly want to use L or +L>. + +This function is a deprecated synonym for L, +which itself, while not deprecated, should be used only in isolated +circumstances. These functions were useful for code that wanted to handle +both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl +v5.20, the distinctions between the platforms have mostly been made invisible +to most code, so this function is quite unlikely to be what you want. + +=cut +*/ + +U8 * +Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) +{ + PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS; + + return uvoffuni_to_utf8_flags(d, uv, flags); +} + /* * Local variables: * c-indentation-style: bsd