X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/02c85471b2808214d1c256b044b1d79c74d5d450..a160ac48dbc970429ac6220f3bce8eff27dbcd09:/utf8.c diff --git a/utf8.c b/utf8.c index bde7f87..bd729c2 100644 --- a/utf8.c +++ b/utf8.c @@ -90,7 +90,7 @@ Perl_is_ascii_string(const U8 *s, STRLEN len) /* =for apidoc uvuni_to_utf8_flags -Adds the UTF-8 representation of the code point C to the end +Adds the UTF-8 representation of the Unicode code point C to the end of the string C; C should have at least C free bytes available. The return value is the pointer to the byte after the end of the new character. In other words, @@ -109,6 +109,10 @@ This is the recommended Unicode-aware way of saying *(d++) = uv; +where uv is a code point expressed in Latin-1 or above, not the platform's +native character set. B +or L>. + This function will convert to UTF-8 (and not warn) even code points that aren't legal Unicode or are problematic, unless C contains one or more of the following flags: @@ -119,8 +123,9 @@ UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL. If both flags are set, the function will both warn and return NULL. The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly -affect how the function handles a Unicode non-character. And, likewise for the -UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, and code points that are +affect how the function handles a Unicode non-character. And likewise, the +UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, affect the handling of +code points that are above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are even less portable) can be warned and/or disallowed even if other above-Unicode code points are accepted by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF @@ -258,7 +263,7 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) return d; } #endif -#endif /* Loop style */ +#endif /* Non loop style */ } /* @@ -275,11 +280,11 @@ or less you should use the IS_UTF8_CHAR(), for lengths of five or more you should use the _slow(). In practice this means that the _slow() will be used very rarely, since the maximum Unicode code point (as of Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only -the "Perl extended UTF-8" (the infamous 'v-strings') will encode into +the "Perl extended UTF-8" (e.g, the infamous 'v-strings') will encode into five bytes or more. =cut */ -STATIC STRLEN +PERL_STATIC_INLINE STRLEN S_is_utf8_char_slow(const U8 *s, const STRLEN len) { dTHX; /* The function called below requires thread context */ @@ -333,8 +338,6 @@ Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end) /* =for apidoc is_utf8_char -DEPRECATED! - Tests if some arbitrary number of bytes begins in a valid UTF-8 character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines) character is a valid UTF-8 character. The actual number of bytes in the UTF-8 @@ -901,11 +904,12 @@ C<*retlen> will be set to the length, in bytes, of that character. If C does not point to a well-formed UTF-8 character and UTF8 warnings are enabled, zero is returned and C<*retlen> is set (if C isn't -NULL) to -1. If those warnings are off, the computed value if well-defined (or -the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> -is set (if C isn't NULL) so that (S + C<*retlen>>) is the -next possible position in C that could begin a non-malformed character. -See L for details on when the REPLACEMENT CHARACTER is returned. +NULL) to -1. If those warnings are off, the computed value, if well-defined +(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and +C<*retlen> is set (if C isn't NULL) so that (S + C<*retlen>>) is +the next possible position in C that could begin a non-malformed character. +See L for details on when the REPLACEMENT CHARACTER is +returned. =cut */ @@ -940,8 +944,6 @@ Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) /* =for apidoc utf8_to_uvchr -DEPRECATED! - Returns the native code point of the first character in the string C which is assumed to be in UTF-8 encoding; C will be set to the length, in bytes, of that character. @@ -1041,8 +1043,6 @@ Perl_valid_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen) /* =for apidoc utf8_to_uvuni -DEPRECATED! - Returns the Unicode code point of the first character in the string C which is assumed to be in UTF-8 encoding; C will be set to the length, in bytes, of that character. @@ -1424,17 +1424,22 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) *d++ = (U8)(( uv & 0x3f) | 0x80); continue; } - if (uv >= 0xd800 && uv <= 0xdbff) { /* surrogates */ +#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST +#define LAST_HIGH_SURROGATE 0xDBFF +#define FIRST_LOW_SURROGATE 0xDC00 +#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST + if (uv >= FIRST_HIGH_SURROGATE && uv <= LAST_HIGH_SURROGATE) { if (p >= pend) { Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); } else { UV low = (p[0] << 8) + p[1]; p += 2; - if (low < 0xdc00 || low > 0xdfff) + if (low < FIRST_LOW_SURROGATE || low > LAST_LOW_SURROGATE) Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); - uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000; + uv = ((uv - FIRST_HIGH_SURROGATE) << 10) + + (low - FIRST_LOW_SURROGATE) + 0x10000; } - } else if (uv >= 0xdc00 && uv <= 0xdfff) { + } else if (uv >= FIRST_LOW_SURROGATE && uv <= LAST_LOW_SURROGATE) { Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); } if (uv < 0x10000) { @@ -1478,6 +1483,14 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) return utf16_to_utf8(p, d, bytelen, newlen); } +bool +Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) +{ + U8 tmpbuf[UTF8_MAXBYTES+1]; + uvchr_to_utf8(tmpbuf, c); + return _is_utf8_FOO(classnum, tmpbuf); +} + /* for now these are all defined (inefficiently) in terms of the utf8 versions. * Note that the macros in handy.h that call these short-circuit calling them * for Latin-1 range inputs */ @@ -1487,7 +1500,29 @@ Perl_is_uni_alnum(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return is_utf8_alnum(tmpbuf); + return _is_utf8_FOO(_CC_WORDCHAR, tmpbuf); +} + +bool +Perl_is_uni_alnumc(pTHX_ UV c) +{ + U8 tmpbuf[UTF8_MAXBYTES+1]; + uvchr_to_utf8(tmpbuf, c); + return _is_utf8_FOO(_CC_ALPHANUMERIC, tmpbuf); +} + +/* Internal function so we can deprecate the external one, and call + this one from other deprecated functions in this file */ + +PERL_STATIC_INLINE bool +S_is_utf8_idfirst(pTHX_ const U8 *p) +{ + dVAR; + + if (*p == '_') + return TRUE; + /* is_utf8_idstart would be more logical. */ + return is_utf8_common(p, &PL_utf8_idstart, "IdStart"); } bool @@ -1495,7 +1530,23 @@ Perl_is_uni_idfirst(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return is_utf8_idfirst(tmpbuf); + return S_is_utf8_idfirst(aTHX_ tmpbuf); +} + +bool +Perl__is_uni_perl_idcont(pTHX_ UV c) +{ + U8 tmpbuf[UTF8_MAXBYTES+1]; + uvchr_to_utf8(tmpbuf, c); + return _is_utf8_perl_idcont(tmpbuf); +} + +bool +Perl__is_uni_perl_idstart(pTHX_ UV c) +{ + U8 tmpbuf[UTF8_MAXBYTES+1]; + uvchr_to_utf8(tmpbuf, c); + return _is_utf8_perl_idstart(tmpbuf); } bool @@ -1503,7 +1554,7 @@ Perl_is_uni_alpha(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return is_utf8_alpha(tmpbuf); + return _is_utf8_FOO(_CC_ALPHA, tmpbuf); } bool @@ -1515,17 +1566,13 @@ Perl_is_uni_ascii(pTHX_ UV c) bool Perl_is_uni_blank(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return is_utf8_blank(tmpbuf); + return isBLANK_uni(c); } bool Perl_is_uni_space(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return is_utf8_space(tmpbuf); + return isSPACE_uni(c); } bool @@ -1533,7 +1580,7 @@ Perl_is_uni_digit(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return is_utf8_digit(tmpbuf); + return _is_utf8_FOO(_CC_DIGIT, tmpbuf); } bool @@ -1541,7 +1588,7 @@ Perl_is_uni_upper(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return is_utf8_upper(tmpbuf); + return _is_utf8_FOO(_CC_UPPER, tmpbuf); } bool @@ -1549,7 +1596,7 @@ Perl_is_uni_lower(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return is_utf8_lower(tmpbuf); + return _is_utf8_FOO(_CC_LOWER, tmpbuf); } bool @@ -1563,7 +1610,7 @@ Perl_is_uni_graph(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return is_utf8_graph(tmpbuf); + return _is_utf8_FOO(_CC_GRAPH, tmpbuf); } bool @@ -1571,7 +1618,7 @@ Perl_is_uni_print(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return is_utf8_print(tmpbuf); + return _is_utf8_FOO(_CC_PRINT, tmpbuf); } bool @@ -1579,15 +1626,13 @@ Perl_is_uni_punct(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return is_utf8_punct(tmpbuf); + return _is_utf8_FOO(_CC_PUNCT, tmpbuf); } bool Perl_is_uni_xdigit(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; - uvchr_to_utf8(tmpbuf, c); - return is_utf8_xdigit(tmpbuf); + return isXDIGIT_uni(c); } UV @@ -1735,23 +1780,41 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) } UV -Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const bool flags) +Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags) { - /* Corresponds to to_lower_latin1(), is TRUE if to use full case - * folding */ + /* Corresponds to to_lower_latin1(); bits meanings: + * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited + * FOLD_FLAGS_FULL iff full folding is to be used; + * + * Not to be used for locale folds + */ UV converted; PERL_ARGS_ASSERT__TO_FOLD_LATIN1; + assert (! (flags & FOLD_FLAGS_LOCALE)); + if (c == MICRO_SIGN) { converted = GREEK_SMALL_LETTER_MU; } - else if (flags && c == LATIN_SMALL_LETTER_SHARP_S) { - *(p)++ = 's'; - *p = 's'; - *lenp = 2; - return 's'; + else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) { + + /* If can't cross 127/128 boundary, can't return "ss"; instead return + * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}") + * under those circumstances. */ + if (flags & FOLD_FLAGS_NOMIX_ASCII) { + *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2; + Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, + p, *lenp, U8); + return LATIN_SMALL_LETTER_LONG_S; + } + else { + *(p)++ = 's'; + *p = 's'; + *lenp = 2; + return 's'; + } } else { /* In this range the fold of all other characters is their lower case */ @@ -1786,12 +1849,7 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags) if (c < 256) { UV result = _to_fold_latin1((U8) c, p, lenp, - cBOOL(((flags & FOLD_FLAGS_FULL) - /* If ASCII-safe, don't allow full folding, - * as that could include SHARP S => ss; - * otherwise there is no crossing of - * ascii/non-ascii in the latin1 range */ - && ! (flags & FOLD_FLAGS_NOMIX_ASCII)))); + flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); /* It is illegal for the fold to cross the 255/256 boundary under * locale; in this case return the original */ return (result > 256 && flags & FOLD_FLAGS_LOCALE) @@ -1812,92 +1870,139 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags) } } -/* for now these all assume no locale info available for Unicode > 255; and - * the corresponding macros in handy.h (like isALNUM_LC_uvchr) should have been - * called instead, so that these don't get called for < 255 */ - bool Perl_is_uni_alnum_lc(pTHX_ UV c) { - return is_uni_alnum(c); /* XXX no locale support yet */ + if (c < 256) { + return isALNUM_LC(UNI_TO_NATIVE(c)); + } + return _is_uni_FOO(_CC_WORDCHAR, c); +} + +bool +Perl_is_uni_alnumc_lc(pTHX_ UV c) +{ + if (c < 256) { + return isALPHANUMERIC_LC(UNI_TO_NATIVE(c)); + } + return _is_uni_FOO(_CC_ALPHANUMERIC, c); } bool Perl_is_uni_idfirst_lc(pTHX_ UV c) { - return is_uni_idfirst(c); /* XXX no locale support yet */ + if (c < 256) { + return isIDFIRST_LC(UNI_TO_NATIVE(c)); + } + return _is_uni_perl_idstart(c); } bool Perl_is_uni_alpha_lc(pTHX_ UV c) { - return is_uni_alpha(c); /* XXX no locale support yet */ + if (c < 256) { + return isALPHA_LC(UNI_TO_NATIVE(c)); + } + return _is_uni_FOO(_CC_ALPHA, c); } bool Perl_is_uni_ascii_lc(pTHX_ UV c) { - return is_uni_ascii(c); /* XXX no locale support yet */ + if (c < 256) { + return isASCII_LC(UNI_TO_NATIVE(c)); + } + return 0; } bool Perl_is_uni_blank_lc(pTHX_ UV c) { - return is_uni_blank(c); /* XXX no locale support yet */ + if (c < 256) { + return isBLANK_LC(UNI_TO_NATIVE(c)); + } + return isBLANK_uni(c); } bool Perl_is_uni_space_lc(pTHX_ UV c) { - return is_uni_space(c); /* XXX no locale support yet */ + if (c < 256) { + return isSPACE_LC(UNI_TO_NATIVE(c)); + } + return isSPACE_uni(c); } bool Perl_is_uni_digit_lc(pTHX_ UV c) { - return is_uni_digit(c); /* XXX no locale support yet */ + if (c < 256) { + return isDIGIT_LC(UNI_TO_NATIVE(c)); + } + return _is_uni_FOO(_CC_DIGIT, c); } bool Perl_is_uni_upper_lc(pTHX_ UV c) { - return is_uni_upper(c); /* XXX no locale support yet */ + if (c < 256) { + return isUPPER_LC(UNI_TO_NATIVE(c)); + } + return _is_uni_FOO(_CC_UPPER, c); } bool Perl_is_uni_lower_lc(pTHX_ UV c) { - return is_uni_lower(c); /* XXX no locale support yet */ + if (c < 256) { + return isLOWER_LC(UNI_TO_NATIVE(c)); + } + return _is_uni_FOO(_CC_LOWER, c); } bool Perl_is_uni_cntrl_lc(pTHX_ UV c) { - return is_uni_cntrl(c); /* XXX no locale support yet */ + if (c < 256) { + return isCNTRL_LC(UNI_TO_NATIVE(c)); + } + return 0; } bool Perl_is_uni_graph_lc(pTHX_ UV c) { - return is_uni_graph(c); /* XXX no locale support yet */ + if (c < 256) { + return isGRAPH_LC(UNI_TO_NATIVE(c)); + } + return _is_uni_FOO(_CC_GRAPH, c); } bool Perl_is_uni_print_lc(pTHX_ UV c) { - return is_uni_print(c); /* XXX no locale support yet */ + if (c < 256) { + return isPRINT_LC(UNI_TO_NATIVE(c)); + } + return _is_uni_FOO(_CC_PRINT, c); } bool Perl_is_uni_punct_lc(pTHX_ UV c) { - return is_uni_punct(c); /* XXX no locale support yet */ + if (c < 256) { + return isPUNCT_LC(UNI_TO_NATIVE(c)); + } + return _is_uni_FOO(_CC_PUNCT, c); } bool Perl_is_uni_xdigit_lc(pTHX_ UV c) { - return is_uni_xdigit(c); /* XXX no locale support yet */ + if (c < 256) { + return isXDIGIT_LC(UNI_TO_NATIVE(c)); + } + return isXDIGIT_uni(c); } U32 @@ -1930,7 +2035,7 @@ Perl_to_uni_lower_lc(pTHX_ U32 c) return (U32)to_uni_lower(c, tmpbuf, &len); } -static bool +PERL_STATIC_INLINE bool S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, const char *const swashname) { @@ -1950,20 +2055,42 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, PERL_ARGS_ASSERT_IS_UTF8_COMMON; /* The API should have included a length for the UTF-8 character in

, - * but it doesn't. We therefor assume that p has been validated at least + * but it doesn't. We therefore assume that p has been validated at least * as far as there being enough bytes available in it to accommodate the * character without reading beyond the end, and pass that number on to the * validating routine */ - if (!is_utf8_char_buf(p, p + UTF8SKIP(p))) - return FALSE; + if (! is_utf8_char_buf(p, p + UTF8SKIP(p))) { + if (ckWARN_d(WARN_UTF8)) { + Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8), + "Passing malformed UTF-8 to \"%s\" is deprecated", swashname); + if (ckWARN(WARN_UTF8)) { /* This will output details as to the + what the malformation is */ + utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL); + } + } + return FALSE; + } if (!*swash) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; *swash = _core_swash_init("utf8", swashname, &PL_sv_undef, 1, 0, NULL, &flags); } + return swash_fetch(*swash, p, TRUE) != 0; } bool +Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT__IS_UTF8_FOO; + + assert(classnum < _FIRST_NON_SWASH_CC); + + return is_utf8_common(p, &PL_utf8_swash_ptrs[classnum], swash_property_names[classnum]); +} + +bool Perl_is_utf8_alnum(pTHX_ const U8 *p) { dVAR; @@ -1973,7 +2100,17 @@ Perl_is_utf8_alnum(pTHX_ const U8 *p) /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true * descendant of isalnum(3), in other words, it doesn't * contain the '_'. --jhi */ - return is_utf8_common(p, &PL_utf8_alnum, "IsWord"); + return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_WORDCHAR], "IsWord"); +} + +bool +Perl_is_utf8_alnumc(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_ALNUMC; + + return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_ALPHANUMERIC], "IsAlnum"); } bool @@ -1983,10 +2120,7 @@ Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */ PERL_ARGS_ASSERT_IS_UTF8_IDFIRST; - if (*p == '_') - return TRUE; - /* is_utf8_idstart would be more logical. */ - return is_utf8_common(p, &PL_utf8_idstart, "IdStart"); + return S_is_utf8_idfirst(aTHX_ p); } bool @@ -2003,16 +2137,27 @@ Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */ } bool -Perl__is_utf8__perl_idstart(pTHX_ const U8 *p) +Perl__is_utf8_perl_idstart(pTHX_ const U8 *p) { dVAR; - PERL_ARGS_ASSERT__IS_UTF8__PERL_IDSTART; + PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART; return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart"); } bool +Perl__is_utf8_perl_idcont(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT; + + return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont"); +} + + +bool Perl_is_utf8_idcont(pTHX_ const U8 *p) { dVAR; @@ -2039,7 +2184,7 @@ Perl_is_utf8_alpha(pTHX_ const U8 *p) PERL_ARGS_ASSERT_IS_UTF8_ALPHA; - return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha"); + return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_ALPHA], "IsAlpha"); } bool @@ -2061,7 +2206,7 @@ Perl_is_utf8_blank(pTHX_ const U8 *p) PERL_ARGS_ASSERT_IS_UTF8_BLANK; - return is_utf8_common(p, &PL_utf8_blank, "XPosixBlank"); + return isBLANK_utf8(p); } bool @@ -2071,7 +2216,7 @@ Perl_is_utf8_space(pTHX_ const U8 *p) PERL_ARGS_ASSERT_IS_UTF8_SPACE; - return is_utf8_common(p, &PL_utf8_space, "IsXPerlSpace"); + return isSPACE_utf8(p); } bool @@ -2105,7 +2250,7 @@ Perl_is_utf8_digit(pTHX_ const U8 *p) PERL_ARGS_ASSERT_IS_UTF8_DIGIT; - return is_utf8_common(p, &PL_utf8_digit, "IsDigit"); + return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_DIGIT], "IsDigit"); } bool @@ -2127,7 +2272,7 @@ Perl_is_utf8_upper(pTHX_ const U8 *p) PERL_ARGS_ASSERT_IS_UTF8_UPPER; - return is_utf8_common(p, &PL_utf8_upper, "IsUppercase"); + return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_UPPER], "IsUppercase"); } bool @@ -2137,7 +2282,7 @@ Perl_is_utf8_lower(pTHX_ const U8 *p) PERL_ARGS_ASSERT_IS_UTF8_LOWER; - return is_utf8_common(p, &PL_utf8_lower, "IsLowercase"); + return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_LOWER], "IsLowercase"); } bool @@ -2147,15 +2292,7 @@ Perl_is_utf8_cntrl(pTHX_ const U8 *p) PERL_ARGS_ASSERT_IS_UTF8_CNTRL; - if (isASCII(*p)) { - return isCNTRL_A(*p); - } - - /* All controls are in Latin1 */ - if (! UTF8_IS_DOWNGRADEABLE_START(*p)) { - return 0; - } - return isCNTRL_L1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1))); + return isCNTRL_utf8(p); } bool @@ -2165,7 +2302,7 @@ Perl_is_utf8_graph(pTHX_ const U8 *p) PERL_ARGS_ASSERT_IS_UTF8_GRAPH; - return is_utf8_common(p, &PL_utf8_graph, "IsGraph"); + return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_GRAPH], "IsGraph"); } bool @@ -2175,7 +2312,7 @@ Perl_is_utf8_print(pTHX_ const U8 *p) PERL_ARGS_ASSERT_IS_UTF8_PRINT; - return is_utf8_common(p, &PL_utf8_print, "IsPrint"); + return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_PRINT], "IsPrint"); } bool @@ -2185,7 +2322,7 @@ Perl_is_utf8_punct(pTHX_ const U8 *p) PERL_ARGS_ASSERT_IS_UTF8_PUNCT; - return is_utf8_common(p, &PL_utf8_punct, "IsPunct"); + return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_PUNCT], "IsPunct"); } bool @@ -2195,37 +2332,28 @@ Perl_is_utf8_xdigit(pTHX_ const U8 *p) PERL_ARGS_ASSERT_IS_UTF8_XDIGIT; - return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit"); + return is_XDIGIT_utf8(p); } bool -Perl_is_utf8_mark(pTHX_ const U8 *p) +Perl__is_utf8_mark(pTHX_ const U8 *p) { dVAR; - PERL_ARGS_ASSERT_IS_UTF8_MARK; + PERL_ARGS_ASSERT__IS_UTF8_MARK; return is_utf8_common(p, &PL_utf8_mark, "IsM"); } -bool -Perl_is_utf8_X_regular_begin(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_X_REGULAR_BEGIN; - - return is_utf8_common(p, &PL_utf8_X_regular_begin, "_X_Regular_Begin"); -} bool -Perl_is_utf8_X_extend(pTHX_ const U8 *p) +Perl_is_utf8_mark(pTHX_ const U8 *p) { dVAR; - PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND; + PERL_ARGS_ASSERT_IS_UTF8_MARK; - return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend"); + return _is_utf8_mark(p); } /* @@ -2395,7 +2523,7 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING; - assert(! UTF8_IS_INVARIANT(*p) && ! UTF8_IS_DOWNGRADEABLE_START(*p)); + assert(UTF8_IS_ABOVE_LATIN1(*p)); /* We know immediately if the first character in the string crosses the * boundary, so can skip */ @@ -2406,8 +2534,7 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c U8* s = ustrp + UTF8SKIP(ustrp); U8* e = ustrp + *lenp; while (s < e) { - if (UTF8_IS_INVARIANT(*s) || UTF8_IS_DOWNGRADEABLE_START(*s)) - { + if (! UTF8_IS_ABOVE_LATIN1(*s)) { goto bad_crossing; } s += UTF8SKIP(s); @@ -2428,15 +2555,7 @@ bad_crossing: /* =for apidoc to_utf8_upper -Convert the UTF-8 encoded character at C

to its uppercase version and -store that in UTF-8 in C and its length in bytes in C. Note -that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since -the uppercase version may be longer than the original character. - -The first character of the uppercased version is returned -(but note, as explained above, that there may be more.) - -The character at C

is assumed by this routine to be well-formed. +Instead use L. =cut */ @@ -2500,15 +2619,7 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool /* =for apidoc to_utf8_title -Convert the UTF-8 encoded character at C

to its titlecase version and -store that in UTF-8 in C and its length in bytes in C. Note -that the C needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the -titlecase version may be longer than the original character. - -The first character of the titlecased version is returned -(but note, as explained above, that there may be more.) - -The character at C

is assumed by this routine to be well-formed. +Instead use L. =cut */ @@ -2574,15 +2685,7 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool /* =for apidoc to_utf8_lower -Convert the UTF-8 encoded character at C

to its lowercase version and -store that in UTF-8 in ustrp and its length in bytes in C. Note -that the C needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the -lowercase version may be longer than the original character. - -The first character of the lowercased version is returned -(but note, as explained above, that there may be more.) - -The character at C

is assumed by this routine to be well-formed. +Instead use L. =cut */ @@ -2647,16 +2750,7 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool /* =for apidoc to_utf8_fold -Convert the UTF-8 encoded character at C

to its foldcase version and -store that in UTF-8 in C and its length in bytes in C. Note -that the C needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the -foldcase version may be longer than the original character (up to -three characters). - -The first character of the foldcased version is returned -(but note, as explained above, that there may be more.) - -The character at C

is assumed by this routine to be well-formed. +Instead use L. =cut */ @@ -2688,33 +2782,36 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b if (UTF8_IS_INVARIANT(*p)) { if (flags & FOLD_FLAGS_LOCALE) { - result = toLOWER_LC(*p); + result = toFOLD_LC(*p); } else { return _to_fold_latin1(*p, ustrp, lenp, - cBOOL(flags & FOLD_FLAGS_FULL)); + flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); } } else if UTF8_IS_DOWNGRADEABLE_START(*p) { if (flags & FOLD_FLAGS_LOCALE) { - result = toLOWER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1))); + result = toFOLD_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1))); } else { return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)), - ustrp, lenp, - cBOOL((flags & FOLD_FLAGS_FULL - /* If ASCII safe, don't allow full - * folding, as that could include SHARP - * S => ss; otherwise there is no - * crossing of ascii/non-ascii in the - * latin1 range */ - && ! (flags & FOLD_FLAGS_NOMIX_ASCII)))); + ustrp, lenp, + flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); } } else { /* utf8, ord above 255 */ result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL); - if ((flags & FOLD_FLAGS_LOCALE)) { + if (flags & FOLD_FLAGS_LOCALE) { + + /* Special case this character, as what normally gets returned + * under locale doesn't work */ + if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1 + && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8, + sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1)) + { + goto return_long_s; + } return check_locale_boundary_crossing(p, result, ustrp, lenp); } else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) { @@ -2735,6 +2832,12 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b if (isASCII(*s)) { /* Crossed, have to return the original */ original = valid_utf8_to_uvchr(p, lenp); + + /* But in this one instance, there is an alternative we can + * return that is valid */ + if (original == LATIN_CAPITAL_LETTER_SHARP_S) { + goto return_long_s; + } Copy(p, ustrp, *lenp, char); return original; } @@ -2761,6 +2864,18 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b *tainted_ptr = TRUE; } return result; + + return_long_s: + /* Certain folds to 'ss' are prohibited by the options, but they do allow + * folds to a string of two of these characters. By returning this + * instead, then, e.g., + * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}") + * works. */ + + *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2; + Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, + ustrp, *lenp, U8); + return LATIN_SMALL_LETTER_LONG_S; } /* Note: @@ -2788,8 +2903,8 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m * by calling utf8_heavy.pl in the general case. The returned value may be * the swash's inversion list instead if the input parameters allow it. * Which is returned should be immaterial to callers, as the only - * operations permitted on a swash, swash_fetch() and - * _get_swash_invlist(), handle both these transparently. + * operations permitted on a swash, swash_fetch(), _get_swash_invlist(), + * and swash_to_invlist() handle both these transparently. * * This interface should only be used by functions that won't destroy or * adversely change the swash, as doing so affects all other uses of the @@ -2863,8 +2978,8 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m method = gv_fetchmeth(stash, "SWASHNEW", 8, -1); if (!method) { /* demand load utf8 */ ENTER; - errsv_save = newSVsv(ERRSV); - SAVEFREESV(errsv_save); + if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save); + GvSV(PL_errgv) = NULL; /* It is assumed that callers of this routine are not passing in * any user derived data. */ /* Need to do this after save_re_context() as it will set @@ -2877,8 +2992,15 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m #endif Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len), NULL); - if (!SvTRUE(ERRSV)) - sv_setsv(ERRSV, errsv_save); + { + /* Not ERRSV, as there is no need to vivify a scalar we are + about to discard. */ + SV * const errsv = GvSV(PL_errgv); + if (!SvTRUE(errsv)) { + GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save); + SvREFCNT_dec(errsv); + } + } LEAVE; } SPAGAIN; @@ -2890,18 +3012,25 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m mPUSHi(minbits); mPUSHi(none); PUTBACK; - errsv_save = newSVsv(ERRSV); - SAVEFREESV(errsv_save); + if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save); + GvSV(PL_errgv) = NULL; /* If we already have a pointer to the method, no need to use * call_method() to repeat the lookup. */ - if (method ? call_sv(MUTABLE_SV(method), G_SCALAR) + if (method + ? call_sv(MUTABLE_SV(method), G_SCALAR) : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD)) { retval = *PL_stack_sp--; SvREFCNT_inc(retval); } - if (!SvTRUE(ERRSV)) - sv_setsv(ERRSV, errsv_save); + { + /* Not ERRSV. See above. */ + SV * const errsv = GvSV(PL_errgv); + if (!SvTRUE(errsv)) { + GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save); + SvREFCNT_dec(errsv); + } + } LEAVE; POPSTACK; if (IN_PERL_COMPILETIME) { @@ -2982,7 +3111,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m * we are going to return a swash */ if ((int) _invlist_len(invlist) > invlist_swash_boundary) { swash_hv = newHV(); - retval = newRV_inc(MUTABLE_SV(swash_hv)); + retval = newRV_noinc(MUTABLE_SV(swash_hv)); } swash_invlist = invlist; } @@ -2998,8 +3127,12 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m { Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); } + /* We just stole a reference count. */ + if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE; + else SvREFCNT_inc_simple_void_NN(swash_invlist); } + /* Use the inversion list stand-alone if small enough */ if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) { SvREFCNT_dec(retval); if (!swash_invlist_unclaimed) @@ -3092,7 +3225,6 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) * So the key in the hash (klen) is length of encoded char -1 */ klen = UTF8SKIP(ptr) - 1; - off = ptr[klen]; if (klen == 0) { /* If char is invariant then swatch is for all the invariant chars @@ -3631,7 +3763,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) * * The returned hash would have two keys, the utf8 for 006B and the utf8 for * 006C. The value for each key is an array. For 006C, the array would - * have a two elements, the utf8 for itself, and for 004C. For 006B, there + * have two elements, the utf8 for itself, and for 004C. For 006B, there * would be three elements in its array, the utf8 for 006B, 004B and 212A. * * Essentially, for any code point, it gives all the code points that map to @@ -3658,7 +3790,8 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) STRLEN lcur; HV *const hv = MUTABLE_HV(SvRV(swash)); - /* The string containing the main body of the table */ + /* The string containing the main body of the table. This will have its + * assertion fail if the swash has been converted to its inversion list */ SV** const listsvp = hv_fetchs(hv, "LIST", FALSE); SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE); @@ -3889,7 +4022,8 @@ SV* Perl__swash_to_invlist(pTHX_ SV* const swash) { - /* Subject to change or removal. For use only in one place in regcomp.c */ + /* Subject to change or removal. For use only in one place in regcomp.c. + * Ownership is given to one reference count in the returned SV* */ U8 *l, *lend; char *loc; @@ -3897,17 +4031,15 @@ Perl__swash_to_invlist(pTHX_ SV* const swash) HV *const hv = MUTABLE_HV(SvRV(swash)); UV elements = 0; /* Number of elements in the inversion list */ U8 empty[] = ""; + SV** listsvp; + SV** typesvp; + SV** bitssvp; + SV** extssvp; + SV** invert_it_svp; - /* The string containing the main body of the table */ - SV** const listsvp = hv_fetchs(hv, "LIST", FALSE); - SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE); - SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE); - SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE); - SV** const invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE); - - const U8* const typestr = (U8*)SvPV_nolen(*typesvp); - const STRLEN bits = SvUV(*bitssvp); - const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */ + U8* typestr; + STRLEN bits; + STRLEN octets; /* if bits == 1, then octets == 0 */ U8 *x, *xend; STRLEN xcur; @@ -3915,6 +4047,22 @@ Perl__swash_to_invlist(pTHX_ SV* const swash) PERL_ARGS_ASSERT__SWASH_TO_INVLIST; + /* If not a hash, it must be the swash's inversion list instead */ + if (SvTYPE(hv) != SVt_PVHV) { + return SvREFCNT_inc_simple_NN((SV*) hv); + } + + /* The string containing the main body of the table */ + listsvp = hv_fetchs(hv, "LIST", FALSE); + typesvp = hv_fetchs(hv, "TYPE", FALSE); + bitssvp = hv_fetchs(hv, "BITS", FALSE); + extssvp = hv_fetchs(hv, "EXTRAS", FALSE); + invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE); + + typestr = (U8*)SvPV_nolen(*typesvp); + bits = SvUV(*bitssvp); + octets = bits >> 3; /* if bits == 1, then octets == 0 */ + /* read $swash->{LIST} */ if (SvPOK(*listsvp)) { l = (U8*)SvPV(*listsvp, lcur); @@ -4028,8 +4176,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash) _invlist_union(invlist, other, &invlist); break; case '!': - _invlist_invert(other); - _invlist_union(invlist, other, &invlist); + _invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist); break; case '-': _invlist_subtract(invlist, other, &invlist); @@ -4134,7 +4281,7 @@ U32 flags) } bool -Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len) +Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) { /* May change: warns if surrogates, non-character code points, or * non-Unicode code points are in s which has length len bytes. Returns @@ -4281,9 +4428,12 @@ The pointer to the PV of the C is returned. char * Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags) { + const char * const ptr = + isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv); + PERL_ARGS_ASSERT_SV_UNI_DISPLAY; - return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv), + return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr, SvCUR(ssv), pvlim, flags); } @@ -4345,7 +4495,7 @@ L (Case Mappings). * FOLDEQ_S2_ALREADY_FOLDED Similarly. */ I32 -Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2, U32 flags) +Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags) { dVAR; const U8 *p1 = (const U8*)s1; /* Point to current char */ @@ -4423,19 +4573,15 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1 f1 = (U8 *) p1; n1 = UTF8SKIP(f1); } - else { /* If in locale matching, we use two sets of rules, depending * on if the code point is above or below 255. Here, we test * for and handle locale rules */ if ((flags & FOLDEQ_UTF8_LOCALE) - && (! u1 || UTF8_IS_INVARIANT(*p1) - || UTF8_IS_DOWNGRADEABLE_START(*p1))) + && (! u1 || ! UTF8_IS_ABOVE_LATIN1(*p1))) { /* There is no mixing of code points above and below 255. */ - if (u2 && (! UTF8_IS_INVARIANT(*p2) - && ! UTF8_IS_DOWNGRADEABLE_START(*p2))) - { + if (u2 && UTF8_IS_ABOVE_LATIN1(*p2)) { return 0; } @@ -4458,8 +4604,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1 return 0; } n1 = 1; - *foldbuf1 = toLOWER(*p1); /* Folds in the ASCII range are - just lowercased */ + *foldbuf1 = toFOLD(*p1); } else if (u1) { to_utf8_fold(p1, foldbuf1, &n1); @@ -4478,13 +4623,11 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1 } else { if ((flags & FOLDEQ_UTF8_LOCALE) - && (! u2 || UTF8_IS_INVARIANT(*p2) || UTF8_IS_DOWNGRADEABLE_START(*p2))) + && (! u2 || ! UTF8_IS_ABOVE_LATIN1(*p2))) { /* Here, the next char in s2 is < 256. We've already * worked on s1, and if it isn't also < 256, can't match */ - if (u1 && (! UTF8_IS_INVARIANT(*p1) - && ! UTF8_IS_DOWNGRADEABLE_START(*p1))) - { + if (u1 && UTF8_IS_ABOVE_LATIN1(*p1)) { return 0; } if (! u2 || UTF8_IS_INVARIANT(*p2)) { @@ -4506,7 +4649,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1 return 0; } n2 = 1; - *foldbuf2 = toLOWER(*p2); + *foldbuf2 = toFOLD(*p2); } else if (u2) { to_utf8_fold(p2, foldbuf2, &n2);