X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/6873aa47bf0efc8a323fec776705c5b196c05c12..7a3934240c41fbfc2d9bd119996fc8740d6a6973:/locale.c diff --git a/locale.c b/locale.c index 4e0015e..e29d0c4 100644 --- a/locale.c +++ b/locale.c @@ -74,24 +74,46 @@ static bool debug_initialization = FALSE; #ifdef USE_LOCALE -/* - * Standardize the locale name from a string returned by 'setlocale', possibly - * modifying that string. - * - * The typical return value of setlocale() is either - * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL - * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL - * (the space-separated values represent the various sublocales, - * in some unspecified order). This is not handled by this function. +/* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far + * looked up. This is in the form of a C string: */ + +#define UTF8NESS_SEP "\v" +#define UTF8NESS_PREFIX "\f" + +/* So, the string looks like: * - * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n", - * which is harmful for further use of the string in setlocale(). This - * function removes the trailing new line and everything up through the '=' + * \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0 * - */ + * where the digit 0 after the \a indicates that the locale starting just + * after the preceding \v is not UTF-8, and the digit 1 mean it is. */ + +STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1); +STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1); + +#define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \ + UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0" + +/* The cache is initialized to C_and_POSIX_utf8ness at start up. These are + * kept there always. The remining portion of the cache is LRU, with the + * oldest looked-up locale at the tail end */ + STATIC char * S_stdize_locale(pTHX_ char *locs) { + /* Standardize the locale name from a string returned by 'setlocale', + * possibly modifying that string. + * + * The typical return value of setlocale() is either + * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL + * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL + * (the space-separated values represent the various sublocales, + * in some unspecified order). This is not handled by this function. + * + * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n", + * which is harmful for further use of the string in setlocale(). This + * function removes the trailing new line and everything up through the '=' + * */ + const char * const s = strchr(locs, '='); bool okay = TRUE; @@ -372,7 +394,6 @@ S_set_numeric_radix(pTHX_ const bool use_locale) /* FALSE => already in dest locale */ /* ... and the character being used isn't a dot */ if (strNE(radix, ".")) { - const U8 * first_variant; if (PL_numeric_radix_sv) { sv_setpv(PL_numeric_radix_sv, radix); @@ -381,17 +402,10 @@ S_set_numeric_radix(pTHX_ const bool use_locale) PL_numeric_radix_sv = newSVpv(radix, 0); } - /* If there is a byte variant under UTF-8, and if the remainder of - * the string starting there is valid UTF-8, and we are in a UTF-8 - * locale, then mark the radix as being in UTF-8 */ - if ( ! is_utf8_invariant_string_loc( - (U8 *) SvPVX(PL_numeric_radix_sv), - SvCUR(PL_numeric_radix_sv), - &first_variant) - && is_utf8_string(first_variant, - SvCUR(PL_numeric_radix_sv) - - ((char *) first_variant - - SvPVX(PL_numeric_radix_sv))) + /* If this is valid UTF-8 that isn't totally ASCII, and we are in + * a UTF-8 locale, then mark the radix as being in UTF-8 */ + if (is_utf8_non_invariant_string((U8 *) SvPVX(PL_numeric_radix_sv), + SvCUR(PL_numeric_radix_sv)) && _is_cur_LC_category_utf8(LC_NUMERIC)) { SvUTF8_on(PL_numeric_radix_sv); @@ -560,7 +574,7 @@ Perl_set_numeric_underlying(pTHX) do_setlocale_c(LC_NUMERIC, PL_numeric_name); PL_numeric_standard = PL_numeric_underlying_is_standard; PL_numeric_underlying = TRUE; - set_numeric_radix(1); + set_numeric_radix(! PL_numeric_standard); # ifdef DEBUGGING @@ -1412,6 +1426,14 @@ S_my_nl_langinfo(const int item, bool toggle) { dTHX; + /* We only need to toggle into the underlying LC_NUMERIC locale for these + * two items, and only if not already there */ + if (toggle && (( item != PERL_RADIXCHAR && item != PERL_THOUSEP) + || PL_numeric_underlying)) + { + toggle = FALSE; + } + #if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */ #if ! defined(HAS_POSIX_2008_LOCALE) @@ -1420,46 +1442,48 @@ S_my_nl_langinfo(const int item, bool toggle) * switching back, as some systems destroy the buffer when setlocale() is * called */ - LOCALE_LOCK; + { + DECLARATION_FOR_LC_NUMERIC_MANIPULATION; - if (toggle) { - if ( ! PL_numeric_underlying - && (item == PERL_RADIXCHAR || item == PERL_THOUSEP)) - { - do_setlocale_c(LC_NUMERIC, PL_numeric_name); + if (toggle) { + STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); } - else { - toggle = FALSE; - } - } - save_to_buffer(nl_langinfo(item), &PL_langinfo_buf, &PL_langinfo_bufsize, 0); + LOCALE_LOCK; /* Prevent interference from another thread executing + this code section (the only call to nl_langinfo in + the core) */ - if (toggle) { - do_setlocale_c(LC_NUMERIC, "C"); - } + save_to_buffer(nl_langinfo(item), &PL_langinfo_buf, + &PL_langinfo_bufsize, 0); - LOCALE_UNLOCK; + LOCALE_UNLOCK; + + if (toggle) { + RESTORE_LC_NUMERIC(); + } + } # else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */ - bool do_free = FALSE; - locale_t cur = uselocale((locale_t) 0); + { + bool do_free = FALSE; + locale_t cur = uselocale((locale_t) 0); - if (cur == LC_GLOBAL_LOCALE) { - cur = duplocale(LC_GLOBAL_LOCALE); - do_free = TRUE; - } + if (cur == LC_GLOBAL_LOCALE) { + cur = duplocale(LC_GLOBAL_LOCALE); + do_free = TRUE; + } - if (toggle) { - cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur); - do_free = TRUE; - } + if (toggle) { + cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur); + do_free = TRUE; + } - save_to_buffer(nl_langinfo_l(item, cur), - &PL_langinfo_buf, &PL_langinfo_bufsize, 0); - if (do_free) { - freelocale(cur); + save_to_buffer(nl_langinfo_l(item, cur), + &PL_langinfo_buf, &PL_langinfo_bufsize, 0); + if (do_free) { + freelocale(cur); + } } # endif @@ -1482,6 +1506,7 @@ S_my_nl_langinfo(const int item, bool toggle) # ifdef HAS_LOCALECONV const struct lconv* lc; + DECLARATION_FOR_LC_NUMERIC_MANIPULATION; # endif # ifdef HAS_STRFTIME @@ -1521,11 +1546,12 @@ S_my_nl_langinfo(const int item, bool toggle) case PERL_CRNCYSTR: - LOCALE_LOCK; - /* We don't bother with localeconv_l() because any system that * has it is likely to also have nl_langinfo() */ + LOCALE_LOCK; /* Prevent interference with other threads + using localeconv() */ + lc = localeconv(); if ( ! lc || ! lc->currency_symbol @@ -1557,17 +1583,13 @@ S_my_nl_langinfo(const int item, bool toggle) case PERL_RADIXCHAR: case PERL_THOUSEP: - LOCALE_LOCK; - if (toggle) { - if (! PL_numeric_underlying) { - do_setlocale_c(LC_NUMERIC, PL_numeric_name); - } - else { - toggle = FALSE; - } + STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); } + LOCALE_LOCK; /* Prevent interference with other threads + using localeconv() */ + lc = localeconv(); if (! lc) { retval = ""; @@ -1584,12 +1606,12 @@ S_my_nl_langinfo(const int item, bool toggle) save_to_buffer(retval, &PL_langinfo_buf, &PL_langinfo_bufsize, 0); + LOCALE_UNLOCK; + if (toggle) { - do_setlocale_c(LC_NUMERIC, "C"); + RESTORE_LC_NUMERIC(); } - LOCALE_UNLOCK; - break; # endif @@ -2018,6 +2040,12 @@ Perl_init_i18nl10n(pTHX_ int printwarn) assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX); # endif # endif /* DEBUGGING */ + + /* Initialize the cache of the program's UTF-8ness for the always known + * locales C and POSIX */ + my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness, + sizeof(PL_locale_utf8ness)); + # ifdef LOCALE_ENVIRON_REQUIRED /* @@ -2374,19 +2402,36 @@ Perl_init_i18nl10n(pTHX_ int printwarn) # endif - for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + +# if defined(USE_ITHREADS) + + /* This caches whether each category's locale is UTF-8 or not. This + * may involve changing the locale. It is ok to do this at + * initialization time before any threads have started, but not later. + * Caching means that if the program heeds our dictate not to change + * locales in threaded applications, this data will remain valid, and + * it may get queried without changing locales. If the environment is + * such that all categories have the same locale, this isn't needed, as + * the code will not change the locale; but this handles the uncommon + * case where the environment has disparate locales for the categories + * */ + (void) _is_cur_LC_category_utf8(categories[i]); + +# endif + Safefree(curlocales[i]); } # if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE) /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE - * locale is UTF-8. If PL_utf8locale and PL_unicode (set by -C or by - * $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the - * PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open - * discipline. */ - PL_utf8locale = _is_cur_LC_category_utf8(LC_CTYPE); + * locale is UTF-8. The call to new_ctype() just above has already + * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If + * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE}) + * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on + * STDIN, STDOUT, STDERR, _and_ the default open discipline. */ + PL_utf8locale = PL_in_utf8_CTYPE_locale; /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO. This is an alternative to using the -C command line switch @@ -3023,39 +3068,105 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * English, it comes down to if the locale's name ends in something like * "UTF-8". It errs on the side of not being a UTF-8 locale. */ + /* Name of current locale corresponding to the input category */ const char *save_input_locale = NULL; + + bool is_utf8 = FALSE; /* The return value */ STRLEN final_pos; + /* The variables below are for the cache of previous lookups using this + * function. The cache is a C string, described at the definition for + * 'C_and_POSIX_utf8ness'. + * + * The first part of the cache is fixed, for the C and POSIX locales. The + * varying part starts just after them. */ + char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness); + + Size_t utf8ness_cache_size; /* Size of the varying portion */ + Size_t input_name_len; /* Length in bytes of save_input_locale */ + Size_t input_name_len_with_overhead; /* plus extra chars used to store + the name in the cache */ + char * delimited; /* The name plus the delimiters used to store + it in the cache */ + char * name_pos; /* position of 'delimited' in the cache, or 0 + if not there */ + + # ifdef LC_ALL assert(category != LC_ALL); # endif - /* First dispose of the trivial cases */ + /* Get the desired category's locale */ save_input_locale = do_setlocale_r(category, NULL); if (! save_input_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not find current locale for category %d\n", - category)); - return FALSE; /* XXX maybe should croak */ + Perl_croak(aTHX_ + "panic: %s: %d: Could not find current locale for %s\n", + __FILE__, __LINE__, category_name(category)); } + save_input_locale = stdize_locale(savepv(save_input_locale)); - if (isNAME_C_OR_POSIX(save_input_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Current locale for category %d is %s\n", - category, save_input_locale)); + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Current locale for %s is %s\n", + category_name(category), save_input_locale)); + + input_name_len = strlen(save_input_locale); + + /* In our cache, each name is accompanied by two delimiters and a single + * utf8ness digit */ + input_name_len_with_overhead = input_name_len + 3; + + /* Allocate and populate space for a copy of the name surrounded by the + * delimiters */ + Newx(delimited, input_name_len_with_overhead, char); + delimited[0] = UTF8NESS_SEP[0]; + Copy(save_input_locale, delimited + 1, input_name_len, char); + delimited[input_name_len+1] = UTF8NESS_PREFIX[0]; + delimited[input_name_len+2] = '\0'; + + /* And see if that is in the cache */ + name_pos = instr(PL_locale_utf8ness, delimited); + if (name_pos) { + is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0'; + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "UTF8ness for locale %s=%d, \n", + save_input_locale, is_utf8); + } + +# endif + + /* And, if not already in that position, move it to the beginning of + * the non-constant portion of the list, since it is the most recently + * used. (We don't have to worry about overflow, since just moving + * existing names around) */ + if (name_pos > utf8ness_cache) { + Move(utf8ness_cache, + utf8ness_cache + input_name_len_with_overhead, + name_pos - utf8ness_cache, char); + Copy(delimited, + utf8ness_cache, + input_name_len_with_overhead - 1, char); + utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0'; + } + + Safefree(delimited); Safefree(save_input_locale); - return FALSE; + return is_utf8; } + /* Here we don't have stored the utf8ness for the input locale. We have to + * calculate it */ + # if defined(USE_LOCALE_CTYPE) \ && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET))) { /* Next try nl_langinfo or MB_CUR_MAX if available */ char *save_ctype_locale = NULL; - bool is_utf8; if (category != LC_CTYPE) { /* These work only on LC_CTYPE */ @@ -3108,16 +3219,18 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) Safefree(save_ctype_locale); } - is_utf8 = ( ( strlen(codeset) == STRLENs("UTF-8") - && foldEQ(codeset, STR_WITH_LEN("UTF-8"))) - || ( strlen(codeset) == STRLENs("UTF8") - && foldEQ(codeset, STR_WITH_LEN("UTF8")))); + /* If the implementation of foldEQ() somehow were + * to change to not go byte-by-byte, this could + * read past end of string, as only one length is + * checked. But currently, a premature NUL will + * compare false, and it will stop there */ + is_utf8 = cBOOL( foldEQ(codeset, STR_WITH_LEN("UTF-8")) + || foldEQ(codeset, STR_WITH_LEN("UTF8"))); DEBUG_L(PerlIO_printf(Perl_debug_log, "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n", codeset, is_utf8)); - Safefree(save_input_locale); - return is_utf8; + goto finish_and_return; } } @@ -3174,7 +3287,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) Safefree(save_ctype_locale); } - return is_utf8; + goto finish_and_return; # endif @@ -3199,7 +3312,6 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) { char *save_monetary_locale = NULL; bool only_ascii = FALSE; - bool is_utf8 = FALSE; struct lconv* lc; /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of @@ -3256,8 +3368,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * is non-ascii UTF-8. */ DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n", save_input_locale, is_utf8)); - Safefree(save_input_locale); - return is_utf8; + goto finish_and_return; } } cant_use_monetary: @@ -3342,8 +3453,8 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n", save_input_locale, is_utf8_string((U8 *) formatted_time, 0))); - Safefree(save_input_locale); - return is_utf8_string((U8 *) formatted_time, 0); + is_utf8 = is_utf8_string((U8 *) formatted_time, 0); + goto finish_and_return; } /* Falling off the end of the loop indicates all the names were just @@ -3374,7 +3485,6 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * are much more likely to have been translated. */ { int e; - bool is_utf8 = FALSE; bool non_ascii = FALSE; char *save_messages_locale = NULL; const char * errmsg = NULL; @@ -3438,8 +3548,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n", save_input_locale, is_utf8)); - Safefree(save_input_locale); - return is_utf8; + goto finish_and_return; } DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); @@ -3483,8 +3592,8 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) DEBUG_L(PerlIO_printf(Perl_debug_log, "Locale %s ends with UTF-8 in name\n", save_input_locale)); - Safefree(save_input_locale); - return TRUE; + is_utf8 = TRUE; + goto finish_and_return; } } DEBUG_L(PerlIO_printf(Perl_debug_log, @@ -3492,19 +3601,19 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) save_input_locale)); } -# endif -# ifdef WIN32 +# ifdef WIN32 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */ if (memENDs(save_input_locale, final_pos, "65001")) { DEBUG_L(PerlIO_printf(Perl_debug_log, "Locale %s ends with 65001 in name, is UTF-8 locale\n", save_input_locale)); - Safefree(save_input_locale); - return TRUE; + is_utf8 = TRUE; + goto finish_and_return; } -# endif +# endif +# endif /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But * since we are about to return FALSE anyway, there is no point in doing @@ -3515,16 +3624,81 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) DEBUG_L(PerlIO_printf(Perl_debug_log, "Locale %s has 8859 in name, not UTF-8 locale\n", save_input_locale)); - Safefree(save_input_locale); - return FALSE; + is_utf8 = FALSE; + goto finish_and_return; } # endif DEBUG_L(PerlIO_printf(Perl_debug_log, "Assuming locale %s is not a UTF-8 locale\n", save_input_locale)); + is_utf8 = FALSE; + + finish_and_return: + + /* Cache this result so we don't have to go through all this next time. */ + utf8ness_cache_size = sizeof(PL_locale_utf8ness) + - (utf8ness_cache - PL_locale_utf8ness); + + /* But we can't save it if it is too large for the total space available */ + if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) { + Size_t utf8ness_cache_len = strlen(utf8ness_cache); + + /* Here it can fit, but we may need to clear out the oldest cached + * result(s) to do so. Check */ + if (utf8ness_cache_len + input_name_len_with_overhead + >= utf8ness_cache_size) + { + /* Here we have to clear something out to make room for this. + * Start looking at the rightmost place where it could fit and find + * the beginning of the entry that extends past that. */ + char * cutoff = (char *) my_memrchr(utf8ness_cache, + UTF8NESS_SEP[0], + utf8ness_cache_size + - input_name_len_with_overhead); + + assert(cutoff); + assert(cutoff >= utf8ness_cache); + + /* This and all subsequent entries must be removed */ + *cutoff = '\0'; + utf8ness_cache_len = strlen(utf8ness_cache); + } + + /* Make space for the new entry */ + Move(utf8ness_cache, + utf8ness_cache + input_name_len_with_overhead, + utf8ness_cache_len + 1 /* Incl. trailing NUL */, char); + + /* And insert it */ + Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char); + utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0'; + + if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1] + & (PERL_UINTMAX_T) ~1) != '0') + { + Perl_croak(aTHX_ + "panic: %s: %d: Corrupt utf8ness_cache=%s\nlen=%u," + " inserted_name=%s, its_len=%u\n", + __FILE__, __LINE__, + PL_locale_utf8ness, strlen(PL_locale_utf8ness), + delimited, input_name_len_with_overhead); + } + } + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, + "PL_locale_utf8ness is now %s; returning %d\n", + PL_locale_utf8ness, is_utf8); + } + +# endif + + Safefree(delimited); Safefree(save_input_locale); - return FALSE; + return is_utf8; } #endif