X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/659a7c2dc7d764cfa77666323ca9fc06c8e3ea78..3f6b66c14467c0f8c7459e32c576618155ca89f3:/utf8.c diff --git a/utf8.c b/utf8.c index 457b5e3..56d3322 100644 --- a/utf8.c +++ b/utf8.c @@ -36,9 +36,9 @@ static const char unees[] = "Malformed UTF-8 character (unexpected end of string)"; static const char cp_above_legal_max[] = - "It is deprecated to use code point 0x%"UVXf"; the permissible max is 0x%"UVXf""; + "Use of code point 0x%"UVXf" is deprecated; the permissible max is 0x%"UVXf""; -#define MAX_NON_DEPRECATED_CP (IV_MAX) +#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX)) /* =head1 Unicode Support @@ -141,9 +141,10 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) *d++ = LATIN1_TO_NATIVE(uv); return d; } + if (uv <= MAX_UTF8_TWO_BYTE) { - *d++ = UTF8_TWO_BYTE_HI(uv); - *d++ = UTF8_TWO_BYTE_LO(uv); + *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2)); + *d++ = I8_TO_NATIVE_UTF8(( uv & MASK) | MARK); return d; } @@ -1117,7 +1118,7 @@ on the first byte of character or just after the last byte of a character. */ U8 * -Perl_utf8_hop(const U8 *s, I32 off) +Perl_utf8_hop(const U8 *s, SSize_t off) { PERL_ARGS_ASSERT_UTF8_HOP; @@ -1557,14 +1558,14 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_ * LENP will be set to the length in bytes of the string of changed characters * * The functions return the ordinal of the first character in the string of OUTP */ -#define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "") -#define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "") -#define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "") +#define CALL_UPPER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "") +#define CALL_TITLE_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "") +#define CALL_LOWER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "") -/* This additionally has the input parameter SPECIALS, which if non-zero will - * cause this to use the SPECIALS hash for folding (meaning get full case +/* This additionally has the input parameter 'specials', which if non-zero will + * cause this to use the specials hash for folding (meaning get full case * folding); otherwise, when zero, this implies a simple case fold */ -#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "" : NULL) +#define CALL_FOLD_CASE(uv, s, d, lenp, specials) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL) UV Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) @@ -1584,7 +1585,7 @@ Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) } uvchr_to_utf8(p, c); - return CALL_UPPER_CASE(p, p, lenp); + return CALL_UPPER_CASE(c, p, p, lenp); } UV @@ -1597,7 +1598,7 @@ Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) } uvchr_to_utf8(p, c); - return CALL_TITLE_CASE(p, p, lenp); + return CALL_TITLE_CASE(c, p, p, lenp); } STATIC U8 @@ -1635,7 +1636,7 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) } uvchr_to_utf8(p, c); - return CALL_LOWER_CASE(p, p, lenp); + return CALL_LOWER_CASE(c, p, p, lenp); } UV @@ -1732,7 +1733,7 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) /* Here, above 255. If no special needs, just use the macro */ if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) { uvchr_to_utf8(p, c); - return CALL_FOLD_CASE(p, p, lenp, flags & FOLD_FLAGS_FULL); + return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL); } else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with the special flags. */ @@ -1869,6 +1870,11 @@ Perl__is_utf8_mark(pTHX_ const U8 *p) /* =for apidoc to_utf8_case +Instead use the appropriate one of L, +L, +L, +or L. + C

contains the pointer to the UTF-8 string encoding the character that is being converted. This routine assumes that the character at C

is well-formed. @@ -1900,37 +1906,114 @@ UV Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, const char *normal, const char *special) { + PERL_ARGS_ASSERT_TO_UTF8_CASE; + + return _to_utf8_case(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, swashp, normal, special); +} + + /* change namve uv1 to 'from' */ +STATIC UV +S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, + SV **swashp, const char *normal, const char *special) +{ STRLEN len = 0; - const UV uv1 = valid_utf8_to_uvchr(p, NULL); - PERL_ARGS_ASSERT_TO_UTF8_CASE; + PERL_ARGS_ASSERT__TO_UTF8_CASE; + + /* For code points that don't change case, we already know that the output + * of this function is the unchanged input, so we can skip doing look-ups + * for them. Unfortunately the case-changing code points are scattered + * around. But there are some long consecutive ranges where there are no + * case changing code points. By adding tests, we can eliminate the lookup + * for all the ones in such ranges. This is currently done here only for + * just a few cases where the scripts are in common use in modern commerce + * (and scripts adjacent to those which can be included without additional + * tests). */ + + if (uv1 >= 0x0590) { + /* This keeps from needing further processing the code points most + * likely to be used in the following non-cased scripts: Hebrew, + * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari, + * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada, + * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */ + if (uv1 < 0x10A0) { + goto cases_to_self; + } - /* Note that swash_fetch() doesn't output warnings for these because it - * assumes we will */ - if (uv1 >= UNICODE_SURROGATE_FIRST) { - if (UNLIKELY(uv1 <= UNICODE_SURROGATE_LAST)) { - if (ckWARN_d(WARN_SURROGATE)) { - const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; - Perl_warner(aTHX_ packWARN(WARN_SURROGATE), - "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1); - } - } - else if (UNLIKELY(UNICODE_IS_SUPER(uv1))) { - if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP) - && ckWARN_d(WARN_DEPRECATED)) - { - Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), - cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP); + /* The following largish code point ranges also don't have case + * changes, but khw didn't think they warranted extra tests to speed + * them up (which would slightly slow down everything else above them): + * 1100..139F Hangul Jamo, Ethiopic + * 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic, + * Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian, + * Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham, + * Combining Diacritical Marks Extended, Balinese, + * Sundanese, Batak, Lepcha, Ol Chiki + * 2000..206F General Punctuation + */ + + if (uv1 >= 0x2D30) { + + /* This keeps the from needing further processing the code points + * most likely to be used in the following non-cased major scripts: + * CJK, Katakana, Hiragana, plus some less-likely scripts. + * + * (0x2D30 above might have to be changed to 2F00 in the unlikely + * event that Unicode eventually allocates the unused block as of + * v8.0 2FE0..2FEF to code points that are cased. khw has verified + * that the test suite will start having failures to alert you + * should that happen) */ + if (uv1 < 0xA640) { + goto cases_to_self; } - if (ckWARN_d(WARN_NON_UNICODE)) { - const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; - Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), - "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1); - } - } + + if (uv1 >= 0xAC00) { + if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) { + if (ckWARN_d(WARN_SURROGATE)) { + const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; + Perl_warner(aTHX_ packWARN(WARN_SURROGATE), + "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1); + } + goto cases_to_self; + } + + /* AC00..FAFF Catches Hangul syllables and private use, plus + * some others */ + if (uv1 < 0xFB00) { + goto cases_to_self; + + } + + if (UNLIKELY(UNICODE_IS_SUPER(uv1))) { + if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP) + && ckWARN_d(WARN_DEPRECATED)) + { + Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), + cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP); + } + if (ckWARN_d(WARN_NON_UNICODE)) { + const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; + Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), + "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1); + } + goto cases_to_self; + } +#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C + if (UNLIKELY(uv1 + > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C)) + { + + /* As of this writing, this means we avoid swash creation + * for anything beyond low Plane 1 */ + goto cases_to_self; + } +#endif + } + } /* Note that non-characters are perfectly legal, so no warning should - * be given */ + * be given. There are so few of them, that it isn't worth the extra + * tests to avoid swash creation */ } if (!*swashp) /* load on-demand */ @@ -1988,6 +2071,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, /* Here, there was no mapping defined, which means that the code point maps * to itself. Return the inputs */ + cases_to_self: len = UTF8SKIP(p); if (p != ustrp) { /* Don't copy onto itself */ Copy(p, ustrp, len, U8); @@ -2104,7 +2188,7 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags } } else { /* UTF-8, ord above 255 */ - result = CALL_UPPER_CASE(p, ustrp, lenp); + result = CALL_UPPER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp); if (flags) { result = check_locale_boundary_crossing(p, result, ustrp, lenp); @@ -2175,7 +2259,7 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags } } else { /* UTF-8, ord above 255 */ - result = CALL_TITLE_CASE(p, ustrp, lenp); + result = CALL_TITLE_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp); if (flags) { result = check_locale_boundary_crossing(p, result, ustrp, lenp); @@ -2245,7 +2329,7 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags } } else { /* UTF-8, ord above 255 */ - result = CALL_LOWER_CASE(p, ustrp, lenp); + result = CALL_LOWER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp); if (flags) { result = check_locale_boundary_crossing(p, result, ustrp, lenp); @@ -2328,7 +2412,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) } } else { /* UTF-8, ord above 255 */ - result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL); + result = CALL_FOLD_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, flags & FOLD_FLAGS_FULL); if (flags & FOLD_FLAGS_LOCALE) { @@ -2656,7 +2740,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m CopHINTS_set(PL_curcop, PL_hints); } if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) { - if (SvPOK(retval)) + if (SvPOK(retval)) { /* If caller wants to handle missing properties, let them */ if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) { @@ -2666,6 +2750,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m "Can't find Unicode property definition \"%"SVf"\"", SVfARG(retval)); NOT_REACHED; /* NOTREACHED */ + } } } /* End of calling the module to find the swash */ @@ -2772,7 +2857,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is * the lower-level routine, and it is similarly broken for returning * multiple values. --jhi - * For those, you should use to_utf8_case() instead */ + * For those, you should use S__to_utf8_case() instead */ /* Now SWASHGET is recasted into S_swatch_get in this file. */ /* Note: @@ -3550,12 +3635,12 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) while ((from_list = (AV *) hv_iternextsv(specials_inverse, &char_to, &to_len))) { - if (av_tindex(from_list) > 0) { + if (av_tindex_nomg(from_list) > 0) { SSize_t i; /* We iterate over all combinations of i,j to place each code * point on each list */ - for (i = 0; i <= av_tindex(from_list); i++) { + for (i = 0; i <= av_tindex_nomg(from_list); i++) { SSize_t j; AV* i_list = newAV(); SV** entryp = av_fetch(from_list, i, FALSE); @@ -3572,7 +3657,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) } /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/ - for (j = 0; j <= av_tindex(from_list); j++) { + for (j = 0; j <= av_tindex_nomg(from_list); j++) { entryp = av_fetch(from_list, j, FALSE); if (entryp == NULL) { Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed"); @@ -3648,7 +3733,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) /* Look through list to see if this inverse mapping already is * listed, or if there is a mapping to itself already */ - for (i = 0; i <= av_tindex(list); i++) { + for (i = 0; i <= av_tindex_nomg(list); i++) { SV** entryp = av_fetch(list, i, FALSE); SV* entry; UV uv;