From 0dec74cdf5288e3984f799818ed991a90bc9675b Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Fri, 5 Jan 2018 14:09:40 -0700
Subject: [PATCH 1/1] locale.c: Revamp finding if locale is UTF-8

This changes how this functionality works for the LC_CTYPE locale.  On
systems that have nl_langinfo(), one can get a "definitive" answer from
that alone.  Otherwise (or if nl_langinfo() doesn't return a usable
answer), one can use mbtowc() to check whether the UTF-8 byte sequence
for the Unicode REPLACEMENT CHARACTER really is interpreted as that code
point.  This is also "definitive".

If the maximum byte length of a character in the locale is too short to
represent all of Unicode in UTF-8, we know without further checking that
this isn't a UTF-8 locale, and so can skip the mbtowc() check.

It turns out, from testing, that some locales are labelled UTF-8 by
nl_langinfo() even though they depart from UTF-8 at times; similarly for
mbtowc().  Perl assumes that such a locale doesn't depart from UTF-8,
and uses its own internal rules, which it knows are UTF-8.  A future
commit will warn when this happens.
---
 locale.c | 134 +++++++++++++++++++++++++++++++--------------------------------
 1 file changed, 66 insertions(+), 68 deletions(-)

diff --git a/locale.c b/locale.c
index 583bf12..3391925 100644
--- a/locale.c
+++ b/locale.c
@@ -41,6 +41,10 @@
 
 #include "reentr.h"
 
+#ifdef I_WCHAR
+#  include <wchar.h>
+#endif
+
 /* If the environment says to, we can output debugging information during
  * initialization.  This is done before option parsing, and before any thread
  * creation, so can be a file-level static */
@@ -3068,7 +3072,6 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
 
     const char *save_input_locale = NULL;
     bool is_utf8 = FALSE;                /* The return value */
-    STRLEN final_pos;
 
     /* The variables below are for the cache of previous lookups using this
      * function.  The cache is a C string, described at the definition for
@@ -3157,14 +3160,14 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
     /* Here we don't have stored the utf8ness for the input locale.  We have to
      * calculate it */
 
-#  if defined(USE_LOCALE_CTYPE)                                                \
-    && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET)))
-
-    {   /* Next try nl_langinfo or MB_CUR_MAX if available */
+#  if defined(USE_LOCALE_CTYPE)                                                \
+    && (   (defined(HAS_NL_LANGINFO) && defined(CODESET))                      \
+        || defined(HAS_MBTOWC))
 
-        char *save_ctype_locale = NULL;
+    {
+        const char *save_ctype_locale = NULL;
 
-        if (category != LC_CTYPE) {    /* These work only on LC_CTYPE */
+        if (category != LC_CTYPE) {    /* Get the current LC_CTYPE locale */
 
             save_ctype_locale = do_setlocale_c(LC_CTYPE, NULL);
 
@@ -3195,12 +3198,32 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
                            save_input_locale));
 
     /* Here the current LC_CTYPE is set to the locale of the category whose
-     * information is desired.  This means that nl_langinfo() and MB_CUR_MAX
+     * information is desired.  This means that nl_langinfo() and mbtowc()
      * should give the correct results */
 
+#  ifdef MB_CUR_MAX  /* But we can potentially rule out UTF-8ness, avoiding
+                        calling the functions if we have this */
+
+    /* Standard UTF-8 needs at least 4 bytes to represent the maximum
+     * Unicode code point. */
+
+    DEBUG_L(PerlIO_printf(Perl_debug_log, "\tMB_CUR_MAX=%d\n",
+                                          (int) MB_CUR_MAX));
+    if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) {
+        is_utf8 = FALSE;
+        goto finish_ctype;
+    }
+
+#  endif
 
 #  if defined(HAS_NL_LANGINFO) && defined(CODESET)
 
-    {   /* The task is easiest if the platform has this POSIX 2001 function */
+    {   /* The task is easiest if the platform has this POSIX 2001 function.
+           Except on some platforms it can wrongly return "", so have to have
+           a fallback.  And it can return that it's UTF-8, even if there are
+           variances from that.  For example, Turkish locales may use the
+           alternate dotted I rules, and sometimes it appears to be a
+           defective locale definition.  XXX We should probably check for
+           these in the Latin1 range and warn */
         const char *codeset = my_nl_langinfo(PERL_CODESET, FALSE);
                                        /* FALSE => already in dest locale */
@@ -3208,11 +3231,6 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
                 "\tnllanginfo returned CODESET '%s'\n", codeset));
 
         if (codeset && strNE(codeset, "")) {
-            /* If we switched LC_CTYPE, switch back */
-            if (save_ctype_locale) {
-                do_setlocale_c(LC_CTYPE, save_ctype_locale);
-                Safefree(save_ctype_locale);
-            }
 
             /* If the implementation of foldEQ() somehow were
              * to change to not go byte-by-byte, this could
@@ -3225,56 +3243,38 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
             DEBUG_L(PerlIO_printf(Perl_debug_log,
                    "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
                                                          codeset, is_utf8));
-            goto finish_and_return;
+            goto finish_ctype;
         }
     }
 
 #  endif
 
-#  ifdef MB_CUR_MAX
-
-    /* Here, either we don't have nl_langinfo, or it didn't return a
-     * codeset.  Try MB_CUR_MAX */
-
-    /* Standard UTF-8 needs at least 4 bytes to represent the maximum
-     * Unicode code point.  Since UTF-8 is the only non-single byte
-     * encoding we handle, we just say any such encoding is UTF-8, and if
-     * turns out to be wrong, other things will fail */
-    is_utf8 = (unsigned) MB_CUR_MAX >= STRLENs(MAX_UNICODE_UTF8);
-
-    DEBUG_L(PerlIO_printf(Perl_debug_log,
-                          "\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n",
-                          (int) MB_CUR_MAX, is_utf8));
+#  if defined(HAS_MBTOWC)
+    /* We can see if this is a UTF-8-like locale if have mbtowc().  It was a
+     * late adder to C89, so very likely to have it.  However, testing has
+     * shown that, like nl_langinfo() above, there are locales that are not
+     * strictly UTF-8 that this will return that they are */
 
-    Safefree(save_input_locale);
-
-#  ifdef HAS_MBTOWC
-
-    /* ... But, most system that have MB_CUR_MAX will also have mbtowc(),
-     * since they are both in the C99 standard.  We can feed a known byte
-     * string to the latter function, and check that it gives the expected
-     * result */
-    if (is_utf8) {
+    {
        wchar_t wc;
        int len;
 
+       /* mbtowc() converts a byte string to a wide character.  Feed a byte
+        * string to it and check that the result is the expected Unicode
+        * code point */
+
        PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
        errno = 0;
        len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
+       DEBUG_L(PerlIO_printf(Perl_debug_log,
+               "\treturn from mbtowc; len=%d; code_point=%x; errno=%d\n",
+               len, (unsigned int) wc, errno));
 
-       if (   len != STRLENs(REPLACEMENT_CHARACTER_UTF8)
-           || wc != (wchar_t) UNICODE_REPLACEMENT)
-       {
-           is_utf8 = FALSE;
-           DEBUG_L(PerlIO_printf(Perl_debug_log, "\replacement=U+%x\n",
-                                                 (unsigned int)wc));
-           DEBUG_L(PerlIO_printf(Perl_debug_log,
-                   "\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n",
-                   len, errno));
-       }
+       is_utf8 = cBOOL(   len == STRLENs(REPLACEMENT_CHARACTER_UTF8)
+                       && wc == (wchar_t) UNICODE_REPLACEMENT);
    }
 
-#  endif
+  finish_ctype:
 
    /* If we switched LC_CTYPE, switch back */
    if (save_ctype_locale) {
@@ -3283,21 +3283,17 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
        }
 
        goto finish_and_return;
-
-#  endif
-
    }
 
-#  else   /* nl_langinfo should work if available, so don't bother compiling this
-             fallback code.  The final fallback of looking at the name is
-             compiled, and will be executed if nl_langinfo fails */
+#  endif
+#  else
 
-    /* nl_langinfo not available or failed somehow.  Next try looking at the
-     * currency symbol to see if it disambiguates things.  Often that will be
-     * in the native script, and if the symbol isn't in UTF-8, we know that the
-     * locale isn't.  If it is non-ASCII UTF-8, we infer that the locale is
-     * too, as the odds of a non-UTF8 string being valid UTF-8 are quite small
-     * */
+    /* Here, we must have a C89 compiler that doesn't have mbtowc().  Next
+     * try looking at the currency symbol to see if it disambiguates
+     * things.  Often that will be in the native script, and if the symbol
+     * isn't in UTF-8, we know that the locale isn't.  If it is non-ASCII
+     * UTF-8, we infer that the locale is too, as the odds of a non-UTF8
+     * string being valid UTF-8 are quite small */
 
 #  ifdef HAS_LOCALECONV
 #    ifdef USE_LOCALE_MONETARY
@@ -3543,9 +3539,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
    }
 
 #  endif
-#  endif /* the code that is compiled when no nl_langinfo */
-
-#      ifndef EBCDIC  /* On os390, even if the name ends with "UTF-8', it isn't a
+#    ifndef EBCDIC  /* On os390, even if the name ends with "UTF-8', it isn't a
                          UTF-8 locale */
 
    /* As a last resort, look at the locale name to see if it matches
@@ -3555,7 +3549,9 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
     * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
     * be a UTF-8 locale.  Similarly for the other common names */
 
-    final_pos = strlen(save_input_locale) - 1;
+    {
+        const Size_t final_pos = strlen(save_input_locale) - 1;
+
    if (final_pos >= 3) {
        const char *name = save_input_locale;
 
@@ -3598,6 +3594,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
        is_utf8 = TRUE;
        goto finish_and_return;
    }
+    }
 
 #    endif
 #  endif
@@ -3606,7 +3603,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
     * since we are about to return FALSE anyway, there is no point in doing
     * this extra work */
 
-#      if 0
+#    if 0
    if (instr(save_input_locale, "8859")) {
        DEBUG_L(PerlIO_printf(Perl_debug_log,
                    "Locale %s has 8859 in name, not UTF-8 locale\n",
@@ -3614,13 +3611,15 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
        is_utf8 = FALSE;
        goto finish_and_return;
    }
-#      endif
+#    endif
 
    DEBUG_L(PerlIO_printf(Perl_debug_log,
                "Assuming locale %s is not a UTF-8 locale\n",
                                                  save_input_locale));
    is_utf8 = FALSE;
 
+#  endif /* the code that is compiled when no modern LC_CTYPE */
+
   finish_and_return:
 
    /* Cache this result so we don't have to go through all this next time. */
@@ -3690,7 +3689,6 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
 
 #endif
 
-
 bool
 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
 {
-- 
1.8.3.1
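
For experimenting with the same detection order outside of the Perl core, the
following is a minimal standalone sketch: rule UTF-8 out cheaply via
MB_CUR_MAX, trust nl_langinfo(CODESET) when it returns a non-empty name, and
otherwise probe mbtowc() with the UTF-8 bytes for U+FFFD REPLACEMENT
CHARACTER.  The file name utf8check.c, the helper name locale_is_utf8(), and
the hard-coded threshold of 4 are assumptions of this sketch only; the patch
itself uses Perl's internal STRLENs(MAX_UNICODE_UTF8),
REPLACEMENT_CHARACTER_UTF8, and my_nl_langinfo(), restores the caller's
LC_CTYPE, and handles non-LC_CTYPE categories and EBCDIC platforms, none of
which this toy program attempts.

/* utf8check.c - standalone illustration of the detection order; not part of
 * the patch itself */
#include <langinfo.h>   /* nl_langinfo(), CODESET */
#include <locale.h>     /* setlocale() */
#include <stdio.h>
#include <stdlib.h>     /* MB_CUR_MAX, mbtowc() */
#include <strings.h>    /* strcasecmp() */
#include <wchar.h>      /* wchar_t */

/* UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER; the patch uses Perl's
 * REPLACEMENT_CHARACTER_UTF8 macro instead of a hard-coded string */
static const char rep_utf8[] = "\xEF\xBF\xBD";

/* Returns 1 if the current LC_CTYPE locale looks like UTF-8.  The name and
 * structure are illustrative; Perl's real code is
 * Perl__is_cur_LC_category_utf8() in locale.c */
static int
locale_is_utf8(void)
{
    const char *codeset;
    wchar_t wc = 0;
    int len;

    /* A UTF-8 locale needs MB_CUR_MAX of at least 4 to represent the highest
     * Unicode code point; anything smaller cannot be UTF-8 */
    if ((unsigned) MB_CUR_MAX < 4)
        return 0;

    /* nl_langinfo(CODESET) is usually definitive, but may return "" on some
     * platforms, hence the mbtowc() fallback below */
    codeset = nl_langinfo(CODESET);
    if (codeset && *codeset)
        return strcasecmp(codeset, "UTF-8") == 0
            || strcasecmp(codeset, "utf8")  == 0;

    /* Fallback: in a UTF-8 locale, mbtowc() consumes all three bytes of the
     * U+FFFD sequence and yields exactly that code point */
    mbtowc(NULL, NULL, 0);                  /* reset any shift state */
    len = mbtowc(&wc, rep_utf8, sizeof(rep_utf8) - 1);
    return len == (int) (sizeof(rep_utf8) - 1) && wc == 0xFFFD;
}

int
main(int argc, char **argv)
{
    /* Use the locale named on the command line, or the environment's */
    setlocale(LC_CTYPE, argc > 1 ? argv[1] : "");
    printf("LC_CTYPE looks like a %sUTF-8 locale\n",
           locale_is_utf8() ? "" : "non-");
    return 0;
}

Compiling it with something like "cc utf8check.c -o utf8check" and running
"./utf8check en_US.UTF-8" versus "./utf8check C" should show the difference,
locale availability permitting.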