X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/50bf02bdec7e84f5ebd09f3f200e5939e80e4e19..bfa9f5ee70ce509f0e66dcff9e9fda131ea8a133:/locale.c diff --git a/locale.c b/locale.c index 50bcaea..2b123d1 100644 --- a/locale.c +++ b/locale.c @@ -32,6 +32,15 @@ * the desired behavior of those functions at the moment. And, LC_MESSAGES is * switched to the C locale for outputting the message unless within the scope * of 'use locale'. + * + * This code now has multi-thread-safe locale handling on systems that support + * that. This is completely transparent to most XS code. On earlier systems, + * it would be possible to emulate thread-safe locales, but this likely would + * involve a lot of locale switching, and would require XS code changes. + * Macros could be written so that the code wouldn't have to know which type of + * system is being used. It's unlikely that we would ever do that, since most + * modern systems support thread-safe locales, but there was code written to + * this end, and is retained, #ifdef'd out. */ #include "EXTERN.h" @@ -41,6 +50,13 @@ #include "reentr.h" +#ifdef I_WCHAR +# include +#endif +#ifdef I_WCTYPE +# include +#endif + /* If the environment says to, we can output debugging information during * initialization. This is done before option parsing, and before any thread * creation, so can be a file-level static */ @@ -52,6 +68,12 @@ static bool debug_initialization = FALSE; # define DEBUG_INITIALIZATION_set(v) (debug_initialization = v) #endif + +/* Returns the Unix errno portion; ignoring any others. This is a macro here + * instead of putting it into perl.h, because unclear to khw what should be + * done generally. */ +#define GET_ERRNO saved_errno + /* strlen() of a literal string constant. We might want this more general, * but using it in just this file for now. A problem with more generality is * the compiler warnings about comparing unlike signs */ @@ -74,24 +96,46 @@ static bool debug_initialization = FALSE; #ifdef USE_LOCALE -/* - * Standardize the locale name from a string returned by 'setlocale', possibly - * modifying that string. - * - * The typical return value of setlocale() is either - * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL - * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL - * (the space-separated values represent the various sublocales, - * in some unspecified order). This is not handled by this function. +/* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far + * looked up. This is in the form of a C string: */ + +#define UTF8NESS_SEP "\v" +#define UTF8NESS_PREFIX "\f" + +/* So, the string looks like: * - * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n", - * which is harmful for further use of the string in setlocale(). This - * function removes the trailing new line and everything up through the '=' + * \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0 * - */ + * where the digit 0 after the \a indicates that the locale starting just + * after the preceding \v is not UTF-8, and the digit 1 mean it is. */ + +STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1); +STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1); + +#define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \ + UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0" + +/* The cache is initialized to C_and_POSIX_utf8ness at start up. These are + * kept there always. The remining portion of the cache is LRU, with the + * oldest looked-up locale at the tail end */ + STATIC char * S_stdize_locale(pTHX_ char *locs) { + /* Standardize the locale name from a string returned by 'setlocale', + * possibly modifying that string. + * + * The typical return value of setlocale() is either + * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL + * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL + * (the space-separated values represent the various sublocales, + * in some unspecified order). This is not handled by this function. + * + * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n", + * which is harmful for further use of the string in setlocale(). This + * function removes the trailing new line and everything up through the '=' + * */ + const char * const s = strchr(locs, '='); bool okay = TRUE; @@ -166,7 +210,7 @@ const int categories[] = { /* The top-most real element is LC_ALL */ -const char * category_names[] = { +const char * const category_names[] = { # ifdef USE_LOCALE_NUMERIC "LC_NUMERIC", @@ -290,7 +334,7 @@ S_category_name(const int category) # define LC_COLLATE_INDEX _DUMMY_CTYPE + 1 # define _DUMMY_COLLATE LC_COLLATE_INDEX # else -# define _DUMMY_COLLATE _DUMMY_COLLATE +# define _DUMMY_COLLATE _DUMMY_CTYPE # endif # ifdef USE_LOCALE_TIME # define LC_TIME_INDEX _DUMMY_COLLATE + 1 @@ -346,301 +390,1186 @@ S_category_name(const int category) #endif /* ifdef USE_LOCALE */ /* Windows requres a customized base-level setlocale() */ -# ifdef WIN32 -# define my_setlocale(cat, locale) win32_setlocale(cat, locale) -# else -# define my_setlocale(cat, locale) setlocale(cat, locale) -# endif +#ifdef WIN32 +# define my_setlocale(cat, locale) win32_setlocale(cat, locale) +#else +# define my_setlocale(cat, locale) setlocale(cat, locale) +#endif -/* Just placeholders for now. "_c" is intended to be called when the category - * is a constant known at compile time; "_r", not known until run time */ -# define do_setlocale_c(category, locale) my_setlocale(category, locale) -# define do_setlocale_r(category, locale) my_setlocale(category, locale) +#ifndef USE_POSIX_2008_LOCALE -STATIC void -S_set_numeric_radix(pTHX_ const bool use_locale) -{ - /* If 'use_locale' is FALSE, set to use a dot for the radix character. If - * TRUE, use the radix character derived from the current locale */ +/* "do_setlocale_c" is intended to be called when the category is a constant + * known at compile time; "do_setlocale_r", not known until run time */ +# define do_setlocale_c(cat, locale) my_setlocale(cat, locale) +# define do_setlocale_r(cat, locale) my_setlocale(cat, locale) -#if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \ - || defined(HAS_NL_LANGINFO)) +#else /* Below uses POSIX 2008 */ - /* We only set up the radix SV if we are to use a locale radix ... */ - if (use_locale) { - const char * radix = my_nl_langinfo(PERL_RADIXCHAR, FALSE); - /* FALSE => already in dest locale */ - /* ... and the character being used isn't a dot */ - if (strNE(radix, ".")) { - const U8 * first_variant; +/* We emulate setlocale with our own function. LC_foo is not valid for the + * POSIX 2008 functions. Instead LC_foo_MASK is used, which we use an array + * lookup to convert to. At compile time we have defined LC_foo_INDEX as the + * proper offset into the array 'category_masks[]'. At runtime, we have to + * search through the array (as the actual numbers may not be small contiguous + * positive integers which would lend themselves to array lookup). */ +# define do_setlocale_c(cat, locale) \ + emulate_setlocale(cat, locale, cat ## _INDEX, TRUE) +# define do_setlocale_r(cat, locale) emulate_setlocale(cat, locale, 0, FALSE) - if (PL_numeric_radix_sv) { - sv_setpv(PL_numeric_radix_sv, radix); - } - else { - PL_numeric_radix_sv = newSVpv(radix, 0); - } - - /* If there is a byte variant under UTF-8, and if the remainder of - * the string starting there is valid UTF-8, and we are in a UTF-8 - * locale, then mark the radix as being in UTF-8 */ - if ( ! is_utf8_invariant_string_loc( - (U8 *) SvPVX(PL_numeric_radix_sv), - SvCUR(PL_numeric_radix_sv), - &first_variant) - && is_utf8_string(first_variant, - SvCUR(PL_numeric_radix_sv) - - ((char *) first_variant - - SvPVX(PL_numeric_radix_sv))) - && _is_cur_LC_category_utf8(LC_NUMERIC)) - { - SvUTF8_on(PL_numeric_radix_sv); - } - goto done; - } - } +/* A third array, parallel to the ones above to map from category to its + * equivalent mask */ +const int category_masks[] = { +# ifdef USE_LOCALE_NUMERIC + LC_NUMERIC_MASK, +# endif +# ifdef USE_LOCALE_CTYPE + LC_CTYPE_MASK, +# endif +# ifdef USE_LOCALE_COLLATE + LC_COLLATE_MASK, +# endif +# ifdef USE_LOCALE_TIME + LC_TIME_MASK, +# endif +# ifdef USE_LOCALE_MESSAGES + LC_MESSAGES_MASK, +# endif +# ifdef USE_LOCALE_MONETARY + LC_MONETARY_MASK, +# endif +# ifdef USE_LOCALE_ADDRESS + LC_ADDRESS_MASK, +# endif +# ifdef USE_LOCALE_IDENTIFICATION + LC_IDENTIFICATION_MASK, +# endif +# ifdef USE_LOCALE_MEASUREMENT + LC_MEASUREMENT_MASK, +# endif +# ifdef USE_LOCALE_PAPER + LC_PAPER_MASK, +# endif +# ifdef USE_LOCALE_TELEPHONE + LC_TELEPHONE_MASK, +# endif + /* LC_ALL can't be turned off by a Configure + * option, and in Posix 2008, should always be + * here, so compile it in unconditionally. + * This could catch some glitches at compile + * time */ + LC_ALL_MASK + }; - SvREFCNT_dec(PL_numeric_radix_sv); - PL_numeric_radix_sv = NULL; +STATIC const char * +S_emulate_setlocale(const int category, + const char * locale, + unsigned int index, + const bool is_index_valid + ) +{ + /* This function effectively performs a setlocale() on just the current + * thread; thus it is thread-safe. It does this by using the POSIX 2008 + * locale functions to emulate the behavior of setlocale(). Similar to + * regular setlocale(), the return from this function points to memory that + * can be overwritten by other system calls, so needs to be copied + * immediately if you need to retain it. The difference here is that + * system calls besides another setlocale() can overwrite it. + * + * By doing this, most locale-sensitive functions become thread-safe. The + * exceptions are mostly those that return a pointer to static memory. + * + * This function takes the same parameters, 'category' and 'locale', that + * the regular setlocale() function does, but it also takes two additional + * ones. This is because the 2008 functions don't use a category; instead + * they use a corresponding mask. Because this function operates in both + * worlds, it may need one or the other or both. This function can + * calculate the mask from the input category, but to avoid this + * calculation, if the caller knows at compile time what the mask is, it + * can pass it, setting 'is_index_valid' to TRUE; otherwise the mask + * parameter is ignored. + * + * POSIX 2008, for some sick reason, chose not to provide a method to find + * the category name of a locale. Some vendors have created a + * querylocale() function to do just that. This function is a lot simpler + * to implement on systems that have this. Otherwise, we have to keep + * track of what the locale has been set to, so that we can return its + * name to emulate setlocale(). It's also possible for C code in some + * library to change the locale without us knowing it, though as of + * September 2017, there are no occurrences in CPAN of uselocale(). Some + * libraries do use setlocale(), but that changes the global locale, and + * threads using per-thread locales will just ignore those changes. + * Another problem is that without querylocale(), we have to guess at what + * was meant by setting a locale of "". We handle this by not actually + * ever setting to "" (unless querylocale exists), but to emulate what we + * think should happen for "". + */ - done: ; + int mask; + locale_t old_obj; + locale_t new_obj; + dTHX; # ifdef DEBUGGING - if (DEBUG_L_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n", - (PL_numeric_radix_sv) - ? SvPVX(PL_numeric_radix_sv) - : "NULL", - (PL_numeric_radix_sv) - ? cBOOL(SvUTF8(PL_numeric_radix_sv)) - : 0); + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale input=%d (%s), \"%s\", %d, %d\n", __FILE__, __LINE__, category, category_name(category), locale, index, is_index_valid); } # endif -#endif /* USE_LOCALE_NUMERIC and can find the radix char */ -} + /* If the input mask might be incorrect, calculate the correct one */ + if (! is_index_valid) { + unsigned int i; +# ifdef DEBUGGING -void -Perl_new_numeric(pTHX_ const char *newnum) -{ + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: finding index of category %d (%s)\n", __FILE__, __LINE__, category, category_name(category)); + } -#ifndef USE_LOCALE_NUMERIC +# endif - PERL_UNUSED_ARG(newnum); + for (i = 0; i <= LC_ALL_INDEX; i++) { + if (category == categories[i]) { + index = i; + goto found_index; + } + } -#else + /* Here, we don't know about this category, so can't handle it. + * Fallback to the early POSIX usages */ + Perl_warner(aTHX_ packWARN(WARN_LOCALE), + "Unknown locale category %d; can't set it to %s\n", + category, locale); + return NULL; - /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell - * core Perl this and that 'newnum' is the name of the new locale. - * It installs this locale as the current underlying default. - * - * The default locale and the C locale can be toggled between by use of the - * set_numeric_underlying() and set_numeric_standard() functions, which - * should probably not be called directly, but only via macros like - * SET_NUMERIC_STANDARD() in perl.h. - * - * The toggling is necessary mainly so that a non-dot radix decimal point - * character can be output, while allowing internal calculations to use a - * dot. - * - * This sets several interpreter-level variables: - * PL_numeric_name The underlying locale's name: a copy of 'newnum' - * PL_numeric_underlying A boolean indicating if the toggled state is such - * that the current locale is the program's underlying - * locale - * PL_numeric_standard An int indicating if the toggled state is such - * that the current locale is the C locale or - * indistinguishable from the C locale. If non-zero, it - * is in C; if > 1, it means it may not be toggled away - * from C. - * PL_numeric_underlying_is_standard A bool kept by this function - * indicating that the underlying locale and the standard - * C locale are indistinguishable for the purposes of - * LC_NUMERIC. This happens when both of the above two - * variables are true at the same time. (Toggling is a - * no-op under these circumstances.) This variable is - * used to avoid having to recalculate. - * Any code changing the locale (outside this file) should use - * POSIX::setlocale, which calls this function. Therefore this function - * should be called directly only from this file and from - * POSIX::setlocale() */ + found_index: ; - char *save_newnum; +# ifdef DEBUGGING - if (! newnum) { - Safefree(PL_numeric_name); - PL_numeric_name = NULL; - PL_numeric_standard = TRUE; - PL_numeric_underlying = TRUE; - PL_numeric_underlying_is_standard = TRUE; - return; - } + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: index is %d for %s\n", __FILE__, __LINE__, index, category_name(category)); + } - save_newnum = stdize_locale(savepv(newnum)); - PL_numeric_underlying = TRUE; - PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum); +# endif - /* If its name isn't C nor POSIX, it could still be indistinguishable from - * them */ - if (! PL_numeric_standard) { - PL_numeric_standard = cBOOL(strEQ(".", my_nl_langinfo(PERL_RADIXCHAR, - FALSE /* Don't toggle locale */ )) - && strEQ("", my_nl_langinfo(PERL_THOUSEP, - FALSE))); } - /* Save the new name if it isn't the same as the previous one, if any */ - if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) { - Safefree(PL_numeric_name); - PL_numeric_name = save_newnum; - } - else { - Safefree(save_newnum); - } + mask = category_masks[index]; - PL_numeric_underlying_is_standard = PL_numeric_standard; +# ifdef DEBUGGING - if (DEBUG_L_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, "Called new_numeric with %s, PL_numeric_name=%s\n", newnum, PL_numeric_name); + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: category name is %s; mask is 0x%x\n", __FILE__, __LINE__, category_names[index], mask); } - /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't - * have to worry about the radix being a non-dot. (Core operations that - * need the underlying locale change to it temporarily). */ - set_numeric_standard(); +# endif -#endif /* USE_LOCALE_NUMERIC */ + /* If just querying what the existing locale is ... */ + if (locale == NULL) { + locale_t cur_obj = uselocale((locale_t) 0); -} +# ifdef DEBUGGING -void -Perl_set_numeric_standard(pTHX) -{ + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale querying %p\n", __FILE__, __LINE__, cur_obj); + } -#ifdef USE_LOCALE_NUMERIC +# endif - /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like - * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The - * macro avoids calling this routine if toggling isn't necessary according - * to our records (which could be wrong if some XS code has changed the - * locale behind our back) */ + if (cur_obj == LC_GLOBAL_LOCALE) { + return my_setlocale(category, NULL); + } - do_setlocale_c(LC_NUMERIC, "C"); - PL_numeric_standard = TRUE; - PL_numeric_underlying = PL_numeric_underlying_is_standard; - set_numeric_radix(0); +# ifdef HAS_QUERYLOCALE -# ifdef DEBUGGING + return (char *) querylocale(mask, cur_obj); - if (DEBUG_L_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, - "LC_NUMERIC locale now is standard C\n"); - } +# else -# endif -#endif /* USE_LOCALE_NUMERIC */ + /* If this assert fails, adjust the size of curlocales in intrpvar.h */ + STATIC_ASSERT_STMT(C_ARRAY_LENGTH(PL_curlocales) > LC_ALL_INDEX); -} +# if defined(_NL_LOCALE_NAME) \ + && defined(DEBUGGING) \ + && ! defined(SETLOCALE_ACCEPTS_ANY_LOCALE_NAME) + /* On systems that accept any locale name, the real underlying locale + * is often returned by this internal function, so we can't use it */ + { + /* Internal glibc for querylocale(), but doesn't handle + * empty-string ("") locale properly; who knows what other + * glitches. Check for it now, under debug. */ + + char * temp_name = nl_langinfo_l(_NL_LOCALE_NAME(category), + uselocale((locale_t) 0)); + /* + PerlIO_printf(Perl_debug_log, "%s:%d: temp_name=%s\n", __FILE__, __LINE__, temp_name ? temp_name : "NULL"); + PerlIO_printf(Perl_debug_log, "%s:%d: index=%d\n", __FILE__, __LINE__, index); + PerlIO_printf(Perl_debug_log, "%s:%d: PL_curlocales[index]=%s\n", __FILE__, __LINE__, PL_curlocales[index]); + */ + if (temp_name && PL_curlocales[index] && strNE(temp_name, "")) { + if ( strNE(PL_curlocales[index], temp_name) + && ! ( isNAME_C_OR_POSIX(temp_name) + && isNAME_C_OR_POSIX(PL_curlocales[index]))) { + +# ifdef USE_C_BACKTRACE + + dump_c_backtrace(Perl_debug_log, 20, 1); -void -Perl_set_numeric_underlying(pTHX) -{ +# endif -#ifdef USE_LOCALE_NUMERIC + Perl_croak(aTHX_ "panic: Mismatch between what Perl thinks %s is" + " (%s) and what internal glibc thinks" + " (%s)\n", category_names[index], + PL_curlocales[index], temp_name); + } - /* Toggle the LC_NUMERIC locale to the current underlying default. Most - * code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h - * instead of calling this directly. The macro avoids calling this routine - * if toggling isn't necessary according to our records (which could be - * wrong if some XS code has changed the locale behind our back) */ + return temp_name; + } + } - do_setlocale_c(LC_NUMERIC, PL_numeric_name); - PL_numeric_standard = PL_numeric_underlying_is_standard; - PL_numeric_underlying = TRUE; - set_numeric_radix(! PL_numeric_standard); +# endif -# ifdef DEBUGGING + /* Without querylocale(), we have to use our record-keeping we've + * done. */ - if (DEBUG_L_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, - "LC_NUMERIC locale now is %s\n", - PL_numeric_name); - } + if (category != LC_ALL) { -# endif -#endif /* USE_LOCALE_NUMERIC */ +# ifdef DEBUGGING -} + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, PL_curlocales[index]); + } -/* - * Set up for a new ctype locale. - */ -STATIC void -S_new_ctype(pTHX_ const char *newctype) -{ +# endif -#ifndef USE_LOCALE_CTYPE + return PL_curlocales[index]; + } + else { /* For LC_ALL */ + unsigned int i; + Size_t names_len = 0; + char * all_string; + bool are_all_categories_the_same_locale = TRUE; - PERL_ARGS_ASSERT_NEW_CTYPE; - PERL_UNUSED_ARG(newctype); - PERL_UNUSED_CONTEXT; + /* If we have a valid LC_ALL value, just return it */ + if (PL_curlocales[LC_ALL_INDEX]) { -#else +# ifdef DEBUGGING - /* Called after each libc setlocale() call affecting LC_CTYPE, to tell - * core Perl this and that 'newctype' is the name of the new locale. - * - * This function sets up the folding arrays for all 256 bytes, assuming - * that tofold() is tolc() since fold case is not a concept in POSIX, - * - * Any code changing the locale (outside this file) should use - * POSIX::setlocale, which calls this function. Therefore this function - * should be called directly only from this file and from - * POSIX::setlocale() */ + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, PL_curlocales[LC_ALL_INDEX]); + } - dVAR; - UV i; +# endif - PERL_ARGS_ASSERT_NEW_CTYPE; + return PL_curlocales[LC_ALL_INDEX]; + } - /* We will replace any bad locale warning with 1) nothing if the new one is - * ok; or 2) a new warning for the bad new locale */ - if (PL_warn_locale) { - SvREFCNT_dec_NN(PL_warn_locale); - PL_warn_locale = NULL; - } + /* Otherwise, we need to construct a string of name=value pairs. + * We use the glibc syntax, like + * LC_NUMERIC=C;LC_TIME=en_US.UTF-8;... + * First calculate the needed size. Along the way, check if all + * the locale names are the same */ + for (i = 0; i < LC_ALL_INDEX; i++) { - PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE); +# ifdef DEBUGGING - /* A UTF-8 locale gets standard rules. But note that code still has to - * handle this specially because of the three problematic code points */ - if (PL_in_utf8_CTYPE_locale) { - Copy(PL_fold_latin1, PL_fold_locale, 256, U8); - } - else { - /* Assume enough space for every character being bad. 4 spaces each - * for the 94 printable characters that are output like "'x' "; and 5 - * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating - * NUL */ - char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ]; + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale i=%d, name=%s, locale=%s\n", __FILE__, __LINE__, i, category_names[i], PL_curlocales[i]); + } - /* Don't check for problems if we are suppressing the warnings */ - bool check_for_problems = ckWARN_d(WARN_LOCALE) - || UNLIKELY(DEBUG_L_TEST); +# endif + + names_len += strlen(category_names[i]) + + 1 /* '=' */ + + strlen(PL_curlocales[i]) + + 1; /* ';' */ + + if (i > 0 && strNE(PL_curlocales[i], PL_curlocales[i-1])) { + are_all_categories_the_same_locale = FALSE; + } + } + + /* If they are the same, we don't actually have to construct the + * string; we just make the entry in LC_ALL_INDEX valid, and be + * that single name */ + if (are_all_categories_the_same_locale) { + PL_curlocales[LC_ALL_INDEX] = savepv(PL_curlocales[0]); + return PL_curlocales[LC_ALL_INDEX]; + } + + names_len++; /* Trailing '\0' */ + SAVEFREEPV(Newx(all_string, names_len, char)); + *all_string = '\0'; + + /* Then fill in the string */ + for (i = 0; i < LC_ALL_INDEX; i++) { + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale i=%d, name=%s, locale=%s\n", __FILE__, __LINE__, i, category_names[i], PL_curlocales[i]); + } + +# endif + + my_strlcat(all_string, category_names[i], names_len); + my_strlcat(all_string, "=", names_len); + my_strlcat(all_string, PL_curlocales[i], names_len); + my_strlcat(all_string, ";", names_len); + } + +# ifdef DEBUGGING + + if (DEBUG_L_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, all_string); + } + + #endif + + return all_string; + } + +# ifdef EINVAL + + SETERRNO(EINVAL, LIB_INVARG); + +# endif + + return NULL; + +# endif + + } + + /* Here, we are switching locales. */ + +# ifndef HAS_QUERYLOCALE + + if (strEQ(locale, "")) { + + /* For non-querylocale() systems, we do the setting of "" ourselves to + * be sure that we really know what's going on. We follow the Linux + * documented behavior (but if that differs from the actual behavior, + * this won't work exactly as the OS implements). We go out and + * examine the environment based on our understanding of how the system + * works, and use that to figure things out */ + + const char * const lc_all = PerlEnv_getenv("LC_ALL"); + + /* Use any "LC_ALL" environment variable, as it overrides everything + * else. */ + if (lc_all && strNE(lc_all, "")) { + locale = lc_all; + } + else { + + /* Otherwise, we need to dig deeper. Unless overridden, the + * default is the LANG environment variable; if it doesn't exist, + * then "C" */ + + const char * default_name; + + /* To minimize other threads messing with the environment, we copy + * the variable, making it a temporary. But this doesn't work upon + * program initialization before any scopes are created, and at + * this time, there's nothing else going on that would interfere. + * So skip the copy in that case */ + if (PL_scopestack_ix == 0) { + default_name = PerlEnv_getenv("LANG"); + } + else { + default_name = savepv(PerlEnv_getenv("LANG")); + } + + if (! default_name || strEQ(default_name, "")) { + default_name = "C"; + } + else if (PL_scopestack_ix != 0) { + SAVEFREEPV(default_name); + } + + if (category != LC_ALL) { + const char * const name = PerlEnv_getenv(category_names[index]); + + /* Here we are setting a single category. Assume will have the + * default name */ + locale = default_name; + + /* But then look for an overriding environment variable */ + if (name && strNE(name, "")) { + locale = name; + } + } + else { + bool did_override = FALSE; + unsigned int i; + + /* Here, we are getting LC_ALL. Any categories that don't have + * a corresponding environment variable set should be set to + * LANG, or to "C" if there is no LANG. If no individual + * categories differ from this, we can just set LC_ALL. This + * is buggy on systems that have extra categories that we don't + * know about. If there is an environment variable that sets + * that category, we won't know to look for it, and so our use + * of LANG or "C" improperly overrides it. On the other hand, + * if we don't do what is done here, and there is no + * environment variable, the category's locale should be set to + * LANG or "C". So there is no good solution. khw thinks the + * best is to look at systems to see what categories they have, + * and include them, and then to assume that we know the + * complete set */ + + for (i = 0; i < LC_ALL_INDEX; i++) { + const char * const env_override + = savepv(PerlEnv_getenv(category_names[i])); + const char * this_locale = ( env_override + && strNE(env_override, "")) + ? env_override + : default_name; + if (! emulate_setlocale(categories[i], this_locale, i, TRUE)) + { + Safefree(env_override); + return NULL; + } + + if (strNE(this_locale, default_name)) { + did_override = TRUE; + } + + Safefree(env_override); + } + + /* If all the categories are the same, we can set LC_ALL to + * that */ + if (! did_override) { + locale = default_name; + } + else { + + /* Here, LC_ALL is no longer valid, as some individual + * categories don't match it. We call ourselves + * recursively, as that will execute the code that + * generates the proper locale string for this situation. + * We don't do the remainder of this function, as that is + * to update our records, and we've just done that for the + * individual categories in the loop above, and doing so + * would cause LC_ALL to be done as well */ + return emulate_setlocale(LC_ALL, NULL, LC_ALL_INDEX, TRUE); + } + } + } + } + else if (strchr(locale, ';')) { + + /* LC_ALL may actually incude a conglomeration of various categories. + * Without querylocale, this code uses the glibc (as of this writing) + * syntax for representing that, but that is not a stable API, and + * other platforms do it differently, so we have to handle all cases + * ourselves */ + + unsigned int i; + const char * s = locale; + const char * e = locale + strlen(locale); + const char * p = s; + const char * category_end; + const char * name_start; + const char * name_end; + + /* If the string that gives what to set doesn't include all categories, + * the omitted ones get set to "C". To get this behavior, first set + * all the individual categories to "C", and override the furnished + * ones below */ + for (i = 0; i < LC_ALL_INDEX; i++) { + if (! emulate_setlocale(categories[i], "C", i, TRUE)) { + return NULL; + } + } + + while (s < e) { + + /* Parse through the category */ + while (isWORDCHAR(*p)) { + p++; + } + category_end = p; + + if (*p++ != '=') { + Perl_croak(aTHX_ + "panic: %s: %d: Unexpected character in locale name '%02X", + __FILE__, __LINE__, *(p-1)); + } + + /* Parse through the locale name */ + name_start = p; + while (p < e && *p != ';') { + if (! isGRAPH(*p)) { + Perl_croak(aTHX_ + "panic: %s: %d: Unexpected character in locale name '%02X", + __FILE__, __LINE__, *(p-1)); + } + p++; + } + name_end = p; + + /* Space past the semi-colon */ + if (p < e) { + p++; + } + + /* Find the index of the category name in our lists */ + for (i = 0; i < LC_ALL_INDEX; i++) { + char * individ_locale; + + /* Keep going if this isn't the index. The strnNE() avoids a + * Perl_form(), but would fail if ever a category name could be + * a substring of another one, like if there were a + * "LC_TIME_DATE" */ + if strnNE(s, category_names[i], category_end - s) { + continue; + } + + /* If this index is for the single category we're changing, we + * have found the locale to set it to. */ + if (category == categories[i]) { + locale = Perl_form(aTHX_ "%.*s", + (int) (name_end - name_start), + name_start); + goto ready_to_set; + } + + assert(category == LC_ALL); + individ_locale = Perl_form(aTHX_ "%.*s", + (int) (name_end - name_start), name_start); + if (! emulate_setlocale(categories[i], individ_locale, i, TRUE)) + { + return NULL; + } + } + + s = p; + } + + /* Here we have set all the individual categories by recursive calls. + * These collectively should have fixed up LC_ALL, so can just query + * what that now is */ + assert(category == LC_ALL); + + return do_setlocale_c(LC_ALL, NULL); + } + + ready_to_set: ; + + /* Here at the end of having to deal with the absence of querylocale(). + * Some cases have already been fully handled by recursive calls to this + * function. But at this point, we haven't dealt with those, but are now + * prepared to, knowing what the locale name to set this category to is. + * This would have come for free if this system had had querylocale() */ + +# endif /* end of ! querylocale */ + + assert(PL_C_locale_obj); + + /* Switching locales generally entails freeing the current one's space (at + * the C library's discretion). We need to stop using that locale before + * the switch. So switch to a known locale object that we don't otherwise + * mess with. This returns the locale object in effect at the time of the + * switch. */ + old_obj = uselocale(PL_C_locale_obj); + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale was using %p\n", __FILE__, __LINE__, old_obj); + } + +# endif + + if (! old_obj) { + +# ifdef DEBUGGING + + if (DEBUG_L_TEST || debug_initialization) { + dSAVE_ERRNO; + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale switching to C failed: %d\n", __FILE__, __LINE__, GET_ERRNO); + RESTORE_ERRNO; + } + +# endif + + return NULL; + } + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale now using %p\n", __FILE__, __LINE__, PL_C_locale_obj); + } + +# endif + + /* If we weren't in a thread safe locale, set so that newlocale() below + which uses 'old_obj', uses an empty one. Same for our reserved C object. + The latter is defensive coding, so that, even if there is some bug, we + will never end up trying to modify either of these, as if passed to + newlocale(), they can be. */ + if (old_obj == LC_GLOBAL_LOCALE || old_obj == PL_C_locale_obj) { + old_obj = (locale_t) 0; + } + + /* Ready to create a new locale by modification of the exising one */ + new_obj = newlocale(mask, locale, old_obj); + + if (! new_obj) { + dSAVE_ERRNO; + +# ifdef DEBUGGING + + if (DEBUG_L_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale creating new object failed: %d\n", __FILE__, __LINE__, GET_ERRNO); + } + +# endif + + if (! uselocale(old_obj)) { + +# ifdef DEBUGGING + + if (DEBUG_L_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: switching back failed: %d\n", __FILE__, __LINE__, GET_ERRNO); + } + +# endif + + } + RESTORE_ERRNO; + return NULL; + } + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale created %p; should have freed %p\n", __FILE__, __LINE__, new_obj, old_obj); + } + +# endif + + /* And switch into it */ + if (! uselocale(new_obj)) { + dSAVE_ERRNO; + +# ifdef DEBUGGING + + if (DEBUG_L_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale switching to new object failed\n", __FILE__, __LINE__); + } + +# endif + + if (! uselocale(old_obj)) { + +# ifdef DEBUGGING + + if (DEBUG_L_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: switching back failed: %d\n", __FILE__, __LINE__, GET_ERRNO); + } + +# endif + + } + freelocale(new_obj); + RESTORE_ERRNO; + return NULL; + } + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale now using %p\n", __FILE__, __LINE__, new_obj); + } + +# endif + + /* We are done, except for updating our records (if the system doesn't keep + * them) and in the case of locale "", we don't actually know what the + * locale that got switched to is, as it came from the environment. So + * have to find it */ + +# ifdef HAS_QUERYLOCALE + + if (strEQ(locale, "")) { + locale = querylocale(mask, new_obj); + } + +# else + + /* Here, 'locale' is the return value */ + + /* Without querylocale(), we have to update our records */ + + if (category == LC_ALL) { + unsigned int i; + + /* For LC_ALL, we change all individual categories to correspond */ + /* PL_curlocales is a parallel array, so has same + * length as 'categories' */ + for (i = 0; i <= LC_ALL_INDEX; i++) { + Safefree(PL_curlocales[i]); + PL_curlocales[i] = savepv(locale); + } + } + else { + + /* For a single category, if it's not the same as the one in LC_ALL, we + * nullify LC_ALL */ + + if (PL_curlocales[LC_ALL_INDEX] && strNE(PL_curlocales[LC_ALL_INDEX], locale)) { + Safefree(PL_curlocales[LC_ALL_INDEX]); + PL_curlocales[LC_ALL_INDEX] = NULL; + } + + /* Then update the category's record */ + Safefree(PL_curlocales[index]); + PL_curlocales[index] = savepv(locale); + } + +# endif + + return locale; +} + +#endif /* USE_POSIX_2008_LOCALE */ + +#if 0 /* Code that was to emulate thread-safe locales on platforms that + didn't natively support them */ + +/* The way this would work is that we would keep a per-thread list of the + * correct locale for that thread. Any operation that was locale-sensitive + * would have to be changed so that it would look like this: + * + * LOCALE_LOCK; + * setlocale to the correct locale for this operation + * do operation + * LOCALE_UNLOCK + * + * This leaves the global locale in the most recently used operation's, but it + * was locked long enough to get the result. If that result is static, it + * needs to be copied before the unlock. + * + * Macros could be written like SETUP_LOCALE_DEPENDENT_OP(category) that did + * the setup, but are no-ops when not needed, and similarly, + * END_LOCALE_DEPENDENT_OP for the tear-down + * + * But every call to a locale-sensitive function would have to be changed, and + * if a module didn't cooperate by using the mutex, things would break. + * + * This code was abandoned before being completed or tested, and is left as-is +*/ + +# define do_setlocale_c(cat, locale) locking_setlocale(cat, locale, cat ## _INDEX, TRUE) +# define do_setlocale_r(cat, locale) locking_setlocale(cat, locale, 0, FALSE) + +STATIC char * +S_locking_setlocale(pTHX_ + const int category, + const char * locale, + int index, + const bool is_index_valid + ) +{ + /* This function kind of performs a setlocale() on just the current thread; + * thus it is kind of thread-safe. It does this by keeping a thread-level + * array of the current locales for each category. Every time a locale is + * switched to, it does the switch globally, but updates the thread's + * array. A query as to what the current locale is just returns the + * appropriate element from the array, and doesn't actually call the system + * setlocale(). The saving into the array is done in an uninterruptible + * section of code, so is unaffected by whatever any other threads might be + * doing. + * + * All locale-sensitive operations must work by first starting a critical + * section, then switching to the thread's locale as kept by this function, + * and then doing the operation, then ending the critical section. Thus, + * each gets done in the appropriate locale. simulating thread-safety. + * + * This function takes the same parameters, 'category' and 'locale', that + * the regular setlocale() function does, but it also takes two additional + * ones. This is because as described earlier. If we know on input the + * index corresponding to the category into the array where we store the + * current locales, we don't have to calculate it. If the caller knows at + * compile time what the index is, it it can pass it, setting + * 'is_index_valid' to TRUE; otherwise the index parameter is ignored. + * + */ + + /* If the input index might be incorrect, calculate the correct one */ + if (! is_index_valid) { + unsigned int i; + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: converting category %d to index\n", __FILE__, __LINE__, category); + } + + for (i = 0; i <= LC_ALL_INDEX; i++) { + if (category == categories[i]) { + index = i; + goto found_index; + } + } + + /* Here, we don't know about this category, so can't handle it. + * XXX best we can do is to unsafely set this + * XXX warning */ + + return my_setlocale(category, locale); + + found_index: ; + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: index is 0x%x\n", __FILE__, __LINE__, index); + } + } + + /* For a query, just return what's in our records */ + if (new_locale == NULL) { + return curlocales[index]; + } + + + /* Otherwise, we need to do the switch, and save the result, all in a + * critical section */ + + Safefree(curlocales[[index]]); + + /* It might be that this is called from an already-locked section of code. + * We would have to detect and skip the LOCK/UNLOCK if so */ + LOCALE_LOCK; + + curlocales[index] = savepv(my_setlocale(category, new_locale)); + + if (strEQ(new_locale, "")) { + +#ifdef LC_ALL + + /* The locale values come from the environment, and may not all be the + * same, so for LC_ALL, we have to update all the others, while the + * mutex is still locked */ + + if (category == LC_ALL) { + unsigned int i; + for (i = 0; i < LC_ALL_INDEX) { + curlocales[i] = my_setlocale(categories[i], NULL); + } + } + } + +#endif + + LOCALE_UNLOCK; + + return curlocales[index]; +} + +#endif +#ifdef USE_LOCALE + +STATIC void +S_set_numeric_radix(pTHX_ const bool use_locale) +{ + /* If 'use_locale' is FALSE, set to use a dot for the radix character. If + * TRUE, use the radix character derived from the current locale */ + +#if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \ + || defined(HAS_NL_LANGINFO)) + + const char * radix = (use_locale) + ? my_nl_langinfo(RADIXCHAR, FALSE) + /* FALSE => already in dest locale */ + : "."; + + sv_setpv(PL_numeric_radix_sv, radix); + + /* If this is valid UTF-8 that isn't totally ASCII, and we are in + * a UTF-8 locale, then mark the radix as being in UTF-8 */ + if (is_utf8_non_invariant_string((U8 *) SvPVX(PL_numeric_radix_sv), + SvCUR(PL_numeric_radix_sv)) + && _is_cur_LC_category_utf8(LC_NUMERIC)) + { + SvUTF8_on(PL_numeric_radix_sv); + } + +# ifdef DEBUGGING + + if (DEBUG_L_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n", + SvPVX(PL_numeric_radix_sv), + cBOOL(SvUTF8(PL_numeric_radix_sv))); + } + +# endif +#else + + PERL_UNUSED_ARG(use_locale); + +#endif /* USE_LOCALE_NUMERIC and can find the radix char */ + +} + +STATIC void +S_new_numeric(pTHX_ const char *newnum) +{ + +#ifndef USE_LOCALE_NUMERIC + + PERL_UNUSED_ARG(newnum); + +#else + + /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell + * core Perl this and that 'newnum' is the name of the new locale. + * It installs this locale as the current underlying default. + * + * The default locale and the C locale can be toggled between by use of the + * set_numeric_underlying() and set_numeric_standard() functions, which + * should probably not be called directly, but only via macros like + * SET_NUMERIC_STANDARD() in perl.h. + * + * The toggling is necessary mainly so that a non-dot radix decimal point + * character can be output, while allowing internal calculations to use a + * dot. + * + * This sets several interpreter-level variables: + * PL_numeric_name The underlying locale's name: a copy of 'newnum' + * PL_numeric_underlying A boolean indicating if the toggled state is such + * that the current locale is the program's underlying + * locale + * PL_numeric_standard An int indicating if the toggled state is such + * that the current locale is the C locale or + * indistinguishable from the C locale. If non-zero, it + * is in C; if > 1, it means it may not be toggled away + * from C. + * PL_numeric_underlying_is_standard A bool kept by this function + * indicating that the underlying locale and the standard + * C locale are indistinguishable for the purposes of + * LC_NUMERIC. This happens when both of the above two + * variables are true at the same time. (Toggling is a + * no-op under these circumstances.) This variable is + * used to avoid having to recalculate. + */ + + char *save_newnum; + + if (! newnum) { + Safefree(PL_numeric_name); + PL_numeric_name = NULL; + PL_numeric_standard = TRUE; + PL_numeric_underlying = TRUE; + PL_numeric_underlying_is_standard = TRUE; + return; + } + + save_newnum = stdize_locale(savepv(newnum)); + PL_numeric_underlying = TRUE; + PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum); + +#ifndef TS_W32_BROKEN_LOCALECONV + + /* If its name isn't C nor POSIX, it could still be indistinguishable from + * them. But on broken Windows systems calling my_nl_langinfo() for + * THOUSEP can currently (but rarely) cause a race, so avoid doing that, + * and just always change the locale if not C nor POSIX on those systems */ + if (! PL_numeric_standard) { + PL_numeric_standard = cBOOL(strEQ(".", my_nl_langinfo(RADIXCHAR, + FALSE /* Don't toggle locale */ )) + && strEQ("", my_nl_langinfo(THOUSEP, FALSE))); + } + +#endif + + /* Save the new name if it isn't the same as the previous one, if any */ + if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) { + Safefree(PL_numeric_name); + PL_numeric_name = save_newnum; + } + else { + Safefree(save_newnum); + } + + PL_numeric_underlying_is_standard = PL_numeric_standard; + +# ifdef HAS_POSIX_2008_LOCALE + + PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK, + PL_numeric_name, + PL_underlying_numeric_obj); + +#endif + + if (DEBUG_L_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "Called new_numeric with %s, PL_numeric_name=%s\n", newnum, PL_numeric_name); + } + + /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't + * have to worry about the radix being a non-dot. (Core operations that + * need the underlying locale change to it temporarily). */ + if (PL_numeric_standard) { + set_numeric_radix(0); + } + else { + set_numeric_standard(); + } + +#endif /* USE_LOCALE_NUMERIC */ + +} + +void +Perl_set_numeric_standard(pTHX) +{ + +#ifdef USE_LOCALE_NUMERIC + + /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like + * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The + * macro avoids calling this routine if toggling isn't necessary according + * to our records (which could be wrong if some XS code has changed the + * locale behind our back) */ + +# ifdef DEBUGGING + + if (DEBUG_L_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, + "Setting LC_NUMERIC locale to standard C\n"); + } + +# endif + + do_setlocale_c(LC_NUMERIC, "C"); + PL_numeric_standard = TRUE; + PL_numeric_underlying = PL_numeric_underlying_is_standard; + set_numeric_radix(0); + +#endif /* USE_LOCALE_NUMERIC */ + +} + +void +Perl_set_numeric_underlying(pTHX) +{ + +#ifdef USE_LOCALE_NUMERIC + + /* Toggle the LC_NUMERIC locale to the current underlying default. Most + * code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h + * instead of calling this directly. The macro avoids calling this routine + * if toggling isn't necessary according to our records (which could be + * wrong if some XS code has changed the locale behind our back) */ + +# ifdef DEBUGGING + + if (DEBUG_L_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, + "Setting LC_NUMERIC locale to %s\n", + PL_numeric_name); + } + +# endif + + do_setlocale_c(LC_NUMERIC, PL_numeric_name); + PL_numeric_standard = PL_numeric_underlying_is_standard; + PL_numeric_underlying = TRUE; + set_numeric_radix(! PL_numeric_standard); + +#endif /* USE_LOCALE_NUMERIC */ + +} + +/* + * Set up for a new ctype locale. + */ +STATIC void +S_new_ctype(pTHX_ const char *newctype) +{ + +#ifndef USE_LOCALE_CTYPE + + PERL_UNUSED_ARG(newctype); + PERL_UNUSED_CONTEXT; + +#else + + /* Called after each libc setlocale() call affecting LC_CTYPE, to tell + * core Perl this and that 'newctype' is the name of the new locale. + * + * This function sets up the folding arrays for all 256 bytes, assuming + * that tofold() is tolc() since fold case is not a concept in POSIX, + * + * Any code changing the locale (outside this file) should use + * Perl_setlocale or POSIX::setlocale, which call this function. Therefore + * this function should be called directly only from this file and from + * POSIX::setlocale() */ + + dVAR; + unsigned int i; + + /* Don't check for problems if we are suppressing the warnings */ + bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST); + bool maybe_utf8_turkic = FALSE; + + PERL_ARGS_ASSERT_NEW_CTYPE; + + /* We will replace any bad locale warning with 1) nothing if the new one is + * ok; or 2) a new warning for the bad new locale */ + if (PL_warn_locale) { + SvREFCNT_dec_NN(PL_warn_locale); + PL_warn_locale = NULL; + } + + PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE); + + /* A UTF-8 locale gets standard rules. But note that code still has to + * handle this specially because of the three problematic code points */ + if (PL_in_utf8_CTYPE_locale) { + Copy(PL_fold_latin1, PL_fold_locale, 256, U8); + + /* UTF-8 locales can have special handling for 'I' and 'i' if they are + * Turkic. Make sure these two are the only anomalies. (We don't use + * towupper and towlower because they aren't in C89.) */ + +#if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER) + + if (towupper('i') == 0x130 && towlower('I') == 0x131) { + +#else + + if (toupper('i') == 'i' && tolower('I') == 'I') { + +#endif + check_for_problems = TRUE; + maybe_utf8_turkic = TRUE; + } + } + + /* We don't populate the other lists if a UTF-8 locale, but do check that + * everything works as expected, unless checking turned off */ + if (check_for_problems || ! PL_in_utf8_CTYPE_locale) { + /* Assume enough space for every character being bad. 4 spaces each + * for the 94 printable characters that are output like "'x' "; and 5 + * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating + * NUL */ + char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' }; bool multi_byte_locale = FALSE; /* Assume is a single-byte locale to start */ unsigned int bad_count = 0; /* Count of bad characters */ for (i = 0; i < 256; i++) { - if (isupper(i)) - PL_fold_locale[i] = (U8) tolower(i); - else if (islower(i)) - PL_fold_locale[i] = (U8) toupper(i); - else - PL_fold_locale[i] = (U8) i; + if (! PL_in_utf8_CTYPE_locale) { + if (isupper(i)) + PL_fold_locale[i] = (U8) tolower(i); + else if (islower(i)) + PL_fold_locale[i] = (U8) toupper(i); + else + PL_fold_locale[i] = (U8) i; + } /* If checking for locale problems, see if the native ASCII-range * printables plus \n and \t are in their expected categories in @@ -654,44 +1583,128 @@ S_new_ctype(pTHX_ const char *newctype) if ( check_for_problems && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n')) { - if ( cBOOL(isalnum(i)) != cBOOL(isALPHANUMERIC(i)) - || cBOOL(isalpha(i)) != cBOOL(isALPHA_A(i)) - || cBOOL(isdigit(i)) != cBOOL(isDIGIT_A(i)) - || cBOOL(isgraph(i)) != cBOOL(isGRAPH_A(i)) - || cBOOL(islower(i)) != cBOOL(isLOWER_A(i)) - || cBOOL(isprint(i)) != cBOOL(isPRINT_A(i)) - || cBOOL(ispunct(i)) != cBOOL(isPUNCT_A(i)) - || cBOOL(isspace(i)) != cBOOL(isSPACE_A(i)) - || cBOOL(isupper(i)) != cBOOL(isUPPER_A(i)) - || cBOOL(isxdigit(i))!= cBOOL(isXDIGIT_A(i)) - || tolower(i) != (int) toLOWER_A(i) - || toupper(i) != (int) toUPPER_A(i) - || (i == '\n' && ! isCNTRL_LC(i))) - { - if (bad_count) { /* Separate multiple entries with a - blank */ - bad_chars_list[bad_count++] = ' '; - } - bad_chars_list[bad_count++] = '\''; - if (isPRINT_A(i)) { - bad_chars_list[bad_count++] = (char) i; - } - else { - bad_chars_list[bad_count++] = '\\'; - if (i == '\n') { - bad_chars_list[bad_count++] = 'n'; - } - else { - assert(i == '\t'); - bad_chars_list[bad_count++] = 't'; - } + bool is_bad = FALSE; + char name[4] = { '\0' }; + + /* Convert the name into a string */ + if (isGRAPH_A(i)) { + name[0] = i; + name[1] = '\0'; + } + else if (i == '\n') { + my_strlcpy(name, "\\n", sizeof(name)); + } + else if (i == '\t') { + my_strlcpy(name, "\\t", sizeof(name)); + } + else { + assert(i == ' '); + my_strlcpy(name, "' '", sizeof(name)); + } + + /* Check each possibe class */ + if (UNLIKELY(cBOOL(isalnum(i)) != cBOOL(isALPHANUMERIC_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "isalnum('%s') unexpectedly is %d\n", + name, cBOOL(isalnum(i)))); + } + if (UNLIKELY(cBOOL(isalpha(i)) != cBOOL(isALPHA_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "isalpha('%s') unexpectedly is %d\n", + name, cBOOL(isalpha(i)))); + } + if (UNLIKELY(cBOOL(isdigit(i)) != cBOOL(isDIGIT_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "isdigit('%s') unexpectedly is %d\n", + name, cBOOL(isdigit(i)))); + } + if (UNLIKELY(cBOOL(isgraph(i)) != cBOOL(isGRAPH_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "isgraph('%s') unexpectedly is %d\n", + name, cBOOL(isgraph(i)))); + } + if (UNLIKELY(cBOOL(islower(i)) != cBOOL(isLOWER_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "islower('%s') unexpectedly is %d\n", + name, cBOOL(islower(i)))); + } + if (UNLIKELY(cBOOL(isprint(i)) != cBOOL(isPRINT_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "isprint('%s') unexpectedly is %d\n", + name, cBOOL(isprint(i)))); + } + if (UNLIKELY(cBOOL(ispunct(i)) != cBOOL(isPUNCT_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "ispunct('%s') unexpectedly is %d\n", + name, cBOOL(ispunct(i)))); + } + if (UNLIKELY(cBOOL(isspace(i)) != cBOOL(isSPACE_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "isspace('%s') unexpectedly is %d\n", + name, cBOOL(isspace(i)))); + } + if (UNLIKELY(cBOOL(isupper(i)) != cBOOL(isUPPER_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "isupper('%s') unexpectedly is %d\n", + name, cBOOL(isupper(i)))); + } + if (UNLIKELY(cBOOL(isxdigit(i))!= cBOOL(isXDIGIT_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "isxdigit('%s') unexpectedly is %d\n", + name, cBOOL(isxdigit(i)))); + } + if (UNLIKELY(tolower(i) != (int) toLOWER_A(i))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "tolower('%s')=0x%x instead of the expected 0x%x\n", + name, tolower(i), (int) toLOWER_A(i))); + } + if (UNLIKELY(toupper(i) != (int) toUPPER_A(i))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "toupper('%s')=0x%x instead of the expected 0x%x\n", + name, toupper(i), (int) toUPPER_A(i))); + } + if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "'\\n' (=%02X) is not a control\n", (int) i)); + } + + /* Add to the list; Separate multiple entries with a blank */ + if (is_bad) { + if (bad_count) { + my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list)); } - bad_chars_list[bad_count++] = '\''; - bad_chars_list[bad_count] = '\0'; + my_strlcat(bad_chars_list, name, sizeof(bad_chars_list)); + bad_count++; } } } + if (bad_count == 2 && maybe_utf8_turkic) { + bad_count = 0; + *bad_chars_list = '\0'; + PL_fold_locale['I'] = 'I'; + PL_fold_locale['i'] = 'i'; + PL_in_utf8_turkic_locale = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s is turkic\n", + __FILE__, __LINE__, newctype)); + } + else { + PL_in_utf8_turkic_locale = FALSE; + } + # ifdef MB_CUR_MAX /* We only handle single-byte locales (outside of UTF-8 ones; so if @@ -701,7 +1714,8 @@ S_new_ctype(pTHX_ const char *newctype) "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n", __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX)); - if (check_for_problems && MB_CUR_MAX > 1 + if ( check_for_problems && MB_CUR_MAX > 1 + && ! PL_in_utf8_CTYPE_locale /* Some platforms return MB_CUR_MAX > 1 for even the "C" * locale. Just assume that the implementation for them (plus @@ -716,8 +1730,19 @@ S_new_ctype(pTHX_ const char *newctype) # endif - if (bad_count || multi_byte_locale) { - PL_warn_locale = Perl_newSVpvf(aTHX_ + /* If we found problems and we want them output, do so */ + if ( (UNLIKELY(bad_count) || UNLIKELY(multi_byte_locale)) + && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST))) + { + if (UNLIKELY(bad_count) && PL_in_utf8_CTYPE_locale) { + PL_warn_locale = Perl_newSVpvf(aTHX_ + "Locale '%s' contains (at least) the following characters" + " which have\nunexpected meanings: %s\nThe Perl program" + " will use the expected meanings", + newctype, bad_chars_list); + } + else { + PL_warn_locale = Perl_newSVpvf(aTHX_ "Locale '%s' may not work well.%s%s%s\n", newctype, (multi_byte_locale) @@ -733,6 +1758,18 @@ S_new_ctype(pTHX_ const char *newctype) ? bad_chars_list : "" ); + } + +# ifdef HAS_NL_LANGINFO + + Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s", + /* parameter FALSE is a don't care here */ + my_nl_langinfo(CODESET, FALSE)); + +# endif + + Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n"); + /* If we are actually in the scope of the locale or are debugging, * output the message now. If not in that scope, we save the * message to be output at the first operation using this locale, @@ -740,20 +1777,9 @@ S_new_ctype(pTHX_ const char *newctype) * they are immune to bad ones. */ if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) { - /* We have to save 'newctype' because the setlocale() just - * below may destroy it. The next setlocale() further down - * should restore it properly so that the intermediate change - * here is transparent to this function's caller */ - const char * const badlocale = savepv(newctype); - - do_setlocale_c(LC_CTYPE, "C"); - /* The '0' below suppresses a bogus gcc compiler warning */ Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0); - do_setlocale_c(LC_CTYPE, badlocale); - Safefree(badlocale); - if (IN_LC(LC_CTYPE)) { SvREFCNT_dec_NN(PL_warn_locale); PL_warn_locale = NULL; @@ -1012,6 +2038,8 @@ S_new_collate(pTHX_ const char *newcoll) } +#endif + #ifdef WIN32 STATIC char * @@ -1072,8 +2100,12 @@ S_win32_setlocale(pTHX_ int category, const char* locale) } result = setlocale(category, locale); - DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, - setlocale_debug_string(category, locale, result))); + DEBUG_L(STMT_START { + dSAVE_ERRNO; + PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, + setlocale_debug_string(category, locale, result)); + RESTORE_ERRNO; + } STMT_END); if (! override_LC_ALL) { return result; @@ -1096,40 +2128,97 @@ S_win32_setlocale(pTHX_ int category, const char* locale) } result = setlocale(LC_ALL, NULL); - DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", + DEBUG_L(STMT_START { + dSAVE_ERRNO; + PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, - setlocale_debug_string(LC_ALL, NULL, result))); + setlocale_debug_string(LC_ALL, NULL, result)); + RESTORE_ERRNO; + } STMT_END); return result; } #endif -char * -Perl_setlocale(int category, const char * locale) +/* + +=head1 Locale-related functions and macros + +=for apidoc Perl_setlocale + +This is an (almost) drop-in replacement for the system L>, +taking the same parameters, and returning the same information, except that it +returns the correct underlying C locale. Regular C will +instead return C if the underlying locale has a non-dot decimal point +character, or a non-empty thousands separator for displaying floating point +numbers. This is because perl keeps that locale category such that it has a +dot and empty separator, changing the locale briefly during the operations +where the underlying one is required. C knows about this, and +compensates; regular C doesn't. + +Another reason it isn't completely a drop-in replacement is that it is +declared to return S>, whereas the system setlocale omits the +C (presumably because its API was specified long ago, and can't be +updated; it is illegal to change the information C returns; doing +so leads to segfaults.) + +Finally, C works under all circumstances, whereas plain +C can be completely ineffective on some platforms under some +configurations. + +C should not be used to change the locale except on systems +where the predefined variable C<${^SAFE_LOCALES}> is 1. On some such systems, +the system C is ineffective, returning the wrong information, and +failing to actually change the locale. C, however works +properly in all circumstances. + +The return points to a per-thread static buffer, which is overwritten the next +time C is called from the same thread. + +=cut + +*/ + +const char * +Perl_setlocale(const int category, const char * locale) { /* This wraps POSIX::setlocale() */ - char * retval; - char * newlocale; +#ifdef NO_LOCALE + + PERL_UNUSED_ARG(category); + PERL_UNUSED_ARG(locale); + + return "C"; + +#else + + const char * retval; + const char * newlocale; + dSAVEDERRNO; dTHX; + DECLARATION_FOR_LC_NUMERIC_MANIPULATION; #ifdef USE_LOCALE_NUMERIC /* A NULL locale means only query what the current one is. We have the * LC_NUMERIC name saved, because we are normally switched into the C - * locale for it. For an LC_ALL query, switch back to get the correct - * results. All other categories don't require special handling */ + * (or equivalent) locale for it. For an LC_ALL query, switch back to get + * the correct results. All other categories don't require special + * handling */ if (locale == NULL) { if (category == LC_NUMERIC) { - return savepv(PL_numeric_name); + + /* We don't have to copy this return value, as it is a per-thread + * variable, and won't change until a future setlocale */ + return PL_numeric_name; } # ifdef LC_ALL - else if (category == LC_ALL && ! PL_numeric_underlying) { - - SET_NUMERIC_UNDERLYING(); + else if (category == LC_ALL) { + STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); } # endif @@ -1138,26 +2227,30 @@ Perl_setlocale(int category, const char * locale) #endif - /* Save retval since subsequent setlocale() calls may overwrite it. */ - retval = savepv(do_setlocale_r(category, locale)); + retval = save_to_buffer(do_setlocale_r(category, locale), + &PL_setlocale_buf, &PL_setlocale_bufsize, 0); + SAVE_ERRNO; + +#if defined(USE_LOCALE_NUMERIC) && defined(LC_ALL) + + if (locale == NULL && category == LC_ALL) { + RESTORE_LC_NUMERIC(); + } + +#endif DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, setlocale_debug_string(category, locale, retval))); - if (! retval) { - /* Should never happen that a query would return an error, but be - * sure and reset to C locale */ - if (locale == 0) { - SET_NUMERIC_STANDARD(); - } + RESTORE_ERRNO; + + if (! retval) { return NULL; } - /* If locale == NULL, we are just querying the state, but may have switched - * to NUMERIC_UNDERLYING. Switch back before returning. */ + /* If locale == NULL, we are just querying the state */ if (locale == NULL) { - SET_NUMERIC_STANDARD(); return retval; } @@ -1197,20 +2290,23 @@ Perl_setlocale(int category, const char * locale) # ifdef USE_LOCALE_CTYPE - newlocale = do_setlocale_c(LC_CTYPE, NULL); + newlocale = savepv(do_setlocale_c(LC_CTYPE, NULL)); new_ctype(newlocale); + Safefree(newlocale); # endif /* USE_LOCALE_CTYPE */ # ifdef USE_LOCALE_COLLATE - newlocale = do_setlocale_c(LC_COLLATE, NULL); + newlocale = savepv(do_setlocale_c(LC_COLLATE, NULL)); new_collate(newlocale); + Safefree(newlocale); # endif # ifdef USE_LOCALE_NUMERIC - newlocale = do_setlocale_c(LC_NUMERIC, NULL); + newlocale = savepv(do_setlocale_c(LC_NUMERIC, NULL)); new_numeric(newlocale); + Safefree(newlocale); # endif /* USE_LOCALE_NUMERIC */ #endif /* LC_ALL */ @@ -1221,6 +2317,7 @@ Perl_setlocale(int category, const char * locale) return retval; +#endif } @@ -1230,10 +2327,16 @@ S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size', * growing it if necessary */ - const Size_t string_size = strlen(string) + offset + 1; + Size_t string_size; PERL_ARGS_ASSERT_SAVE_TO_BUFFER; + if (! string) { + return NULL; + } + + string_size = strlen(string) + offset + 1; + if (*buf_size == 0) { Newx(*buf, string_size, char); *buf_size = string_size; @@ -1249,11 +2352,9 @@ S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t /* -=head1 Locale-related functions and macros - =for apidoc Perl_langinfo -This is an (almost ª) drop-in replacement for the system C>, +This is an (almost) drop-in replacement for the system C>, taking the same C parameter values, and returning the same information. But it is more thread-safe than regular C, and hides the quirks of Perl's locale handling from your code, and can be used on systems that lack @@ -1265,88 +2366,66 @@ Expanding on these: =item * -It delivers the correct results for the C and C items, -without you having to write extra code. The reason for the extra code would be -because these are from the C locale category, which is normally -kept set to the C locale by Perl, no matter what the underlying locale is -supposed to be, and so to get the expected results, you have to temporarily -toggle into the underlying locale, and later toggle back. (You could use -plain C and C> for this -but then you wouldn't get the other advantages of C; not -keeping C in the C locale would break a lot of CPAN, which is -expecting the radix (decimal point) character to be a dot.) +The reason it isn't quite a drop-in replacement is actually an advantage. The +only difference is that it returns S>, whereas plain +C returns S>, but you are (only by documentation) +forbidden to write into the buffer. By declaring this C, the compiler +enforces this restriction, so if it is violated, you know at compilation time, +rather than getting segfaults at runtime. =item * -Depending on C, it works on systems that don't have C, hence -makes your code more portable. Of the fifty-some possible items specified by -the POSIX 2008 standard, -L, -only two are completely unimplemented. It uses various techniques to recover -the other items, including calling C>, and C>, -both of which are specified in C89, so should be always be available. Later -C versions have additional capabilities; C<""> is returned for -those not available on your system. - -The details for those items which may differ from what this emulation returns -and what a native C would return are: - -=over - -=item C - -=item C - -Unimplemented, so returns C<"">. - -=item C - -=item C - -=item C - -=item C - -Only the values for English are returned. C and C have been -removed from POSIX 2008, and are retained for backwards compatibility. Your -platform's C may not support them. - -=item C - -Always evaluates to C<%x>, the locale's appropriate date representation. - -=item C - -Always evaluates to C<%X>, the locale's appropriate time representation. - -=item C - -Always evaluates to C<%c>, the locale's appropriate date and time -representation. - -=item C - -The return may be incorrect for those rare locales where the currency symbol -replaces the radix character. -Send email to L if you have examples of it needing -to work differently. - -=item C +It delivers the correct results for the C and C items, +without you having to write extra code. The reason for the extra code would be +because these are from the C locale category, which is normally +kept set by Perl so that the radix is a dot, and the separator is the empty +string, no matter what the underlying locale is supposed to be, and so to get +the expected results, you have to temporarily toggle into the underlying +locale, and later toggle back. (You could use plain C and +C> for this but then you wouldn't get +the other advantages of C; not keeping C in the C +(or equivalent) locale would break a lot of CPAN, which is expecting the radix +(decimal point) character to be a dot.) -Currently this gives the same results as Linux does. -Send email to L if you have examples of it needing -to work differently. +=item * -=item C +The system function it replaces can have its static return buffer trashed, +not only by a subesequent call to that function, but by a C, +C, or other locale change. The returned buffer of this function is +not changed until the next call to it, so the buffer is never in a trashed +state. -=item C +=item * -=item C +Its return buffer is per-thread, so it also is never overwritten by a call to +this function from another thread; unlike the function it replaces. -=item C +=item * -These are derived by using C, and not all versions of that function -know about them. C<""> is returned for these on such systems. +But most importantly, it works on systems that don't have C, such +as Windows, hence makes your code more portable. Of the fifty-some possible +items specified by the POSIX 2008 standard, +L, +only one is completely unimplemented, though on non-Windows platforms, another +significant one is also not implemented). It uses various techniques to +recover the other items, including calling C>, and +C>, both of which are specified in C89, so should be always be +available. Later C versions have additional capabilities; C<""> is +returned for those not available on your system. + +It is important to note that when called with an item that is recovered by +using C, the buffer from any previous explicit call to +C will be overwritten. This means you must save that buffer's +contents if you need to access them after a call to this function. (But note +that you might not want to be using C directly anyway, because of +issues like the ones listed in the second item of this list (above) for +C and C. You can use the methods given in L to +call L and avoid all the issues, but then you have a hash to +unpack). + +The details for those items which may deviate from what this emulation returns +and what a native C would return are specified in +L. =back @@ -1357,29 +2436,8 @@ C, you must before the C C<#include>. You can replace your C C<#include> with this one. (Doing it this way keeps out the symbols that plain -C imports into the namespace for code that doesn't need it.) - -You also should not use the bare C item names, but should preface -them with C, so use C instead of plain C. -The C> versions will also work for this function on systems that do -have a native C. - -=item * - -It is thread-friendly, returning its result in a buffer that won't be -overwritten by another thread, so you don't have to code for that possibility. -The buffer can be overwritten by the next call to C or -C in the same thread. - -=item * - -ª It returns S>, whereas plain C returns S>, but you are (only by documentation) forbidden to write into the buffer. -By declaring this C, the compiler enforces this restriction. The extra -C is why this isn't an unequivocal drop-in replacement for -C. - -=back +C would try to import into the namespace for code that doesn't need +it.) The original impetus for C was so that code that needs to find out the current currency symbol, floating point radix character, or digit @@ -1403,7 +2461,7 @@ Perl_langinfo(const int item) return my_nl_langinfo(item, TRUE); } -const char * +STATIC const char * #ifdef HAS_NL_LANGINFO S_my_nl_langinfo(const nl_item item, bool toggle) #else @@ -1411,69 +2469,101 @@ S_my_nl_langinfo(const int item, bool toggle) #endif { dTHX; + const char * retval; + +#ifdef USE_LOCALE_NUMERIC + + /* We only need to toggle into the underlying LC_NUMERIC locale for these + * two items, and only if not already there */ + if (toggle && (( item != RADIXCHAR && item != THOUSEP) + || PL_numeric_underlying)) + +#endif /* No toggling needed if not using LC_NUMERIC */ + + toggle = FALSE; #if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */ -#if ! defined(HAS_POSIX_2008_LOCALE) +# if ! defined(HAS_THREAD_SAFE_NL_LANGINFO_L) \ + || ! defined(HAS_POSIX_2008_LOCALE) \ + || ! defined(DUPLOCALE) /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC * for those items dependent on it. This must be copied to a buffer before * switching back, as some systems destroy the buffer when setlocale() is * called */ - LOCALE_LOCK; + { + DECLARATION_FOR_LC_NUMERIC_MANIPULATION; - if (toggle) { - if ( ! PL_numeric_underlying - && (item == PERL_RADIXCHAR || item == PERL_THOUSEP)) - { - do_setlocale_c(LC_NUMERIC, PL_numeric_name); + if (toggle) { + STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); } - else { - toggle = FALSE; + + LOCALE_LOCK; /* Prevent interference from another thread executing + this code section (the only call to nl_langinfo in + the core) */ + + + /* Copy to a per-thread buffer, which is also one that won't be + * destroyed by a subsequent setlocale(), such as the + * RESTORE_LC_NUMERIC may do just below. */ + retval = save_to_buffer(nl_langinfo(item), + &PL_langinfo_buf, &PL_langinfo_bufsize, 0); + + LOCALE_UNLOCK; + + if (toggle) { + RESTORE_LC_NUMERIC(); } } - save_to_buffer(nl_langinfo(item), &PL_langinfo_buf, &PL_langinfo_bufsize, 0); +# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */ - if (toggle) { - do_setlocale_c(LC_NUMERIC, "C"); - } + { + bool do_free = FALSE; + locale_t cur = uselocale((locale_t) 0); - LOCALE_UNLOCK; + if (cur == LC_GLOBAL_LOCALE) { + cur = duplocale(LC_GLOBAL_LOCALE); + do_free = TRUE; + } -# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */ +# ifdef USE_LOCALE_NUMERIC - bool do_free = FALSE; - locale_t cur = uselocale((locale_t) 0); + if (toggle) { + if (PL_underlying_numeric_obj) { + cur = PL_underlying_numeric_obj; + } + else { + cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur); + do_free = TRUE; + } + } - if (cur == LC_GLOBAL_LOCALE) { - cur = duplocale(LC_GLOBAL_LOCALE); - do_free = TRUE; - } +# endif - if (toggle) { - cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur); - do_free = TRUE; - } + /* We have to save it to a buffer, because the freelocale() just below + * can invalidate the internal one */ + retval = save_to_buffer(nl_langinfo_l(item, cur), + &PL_langinfo_buf, &PL_langinfo_bufsize, 0); - save_to_buffer(nl_langinfo_l(item, cur), - &PL_langinfo_buf, &PL_langinfo_bufsize, 0); - if (do_free) { - freelocale(cur); + if (do_free) { + freelocale(cur); + } } # endif - if (strEQ(PL_langinfo_buf, "")) { - if (item == PERL_YESSTR) { + if (strEQ(retval, "")) { + if (item == YESSTR) { return "yes"; } - if (item == PERL_NOSTR) { + if (item == NOSTR) { return "no"; } } - return PL_langinfo_buf; + return retval; #else /* Below, emulate nl_langinfo as best we can */ @@ -1482,7 +2572,19 @@ S_my_nl_langinfo(const int item, bool toggle) # ifdef HAS_LOCALECONV const struct lconv* lc; + const char * temp; + DECLARATION_FOR_LC_NUMERIC_MANIPULATION; + +# ifdef TS_W32_BROKEN_LOCALECONV + const char * save_global; + const char * save_thread; + int needed_size; + char * ptr; + char * e; + char * item_start; + +# endif # endif # ifdef HAS_STRFTIME @@ -1502,94 +2604,276 @@ S_my_nl_langinfo(const int item, bool toggle) switch (item) { Size_t len; - const char * retval; - /* These 2 are unimplemented */ - case PERL_CODESET: - case PERL_ERA: /* For use with strftime() %E modifier */ + /* This is unimplemented */ + case ERA: /* For use with strftime() %E modifier */ + + default: + return ""; + + /* We use only an English set, since we don't know any more */ + case YESEXPR: return "^[+1yY]"; + case YESSTR: return "yes"; + case NOEXPR: return "^[-0nN]"; + case NOSTR: return "no"; + + case CODESET: + +# ifndef WIN32 + + /* On non-windows, this is unimplemented, in part because of + * inconsistencies between vendors. The Darwin native + * nl_langinfo() implementation simply looks at everything past + * any dot in the name, but that doesn't work for other + * vendors. Many Linux locales that don't have UTF-8 in their + * names really are UTF-8, for example; z/OS locales that do + * have UTF-8 in their names, aren't really UTF-8 */ + return ""; + +# else + + { /* But on Windows, the name does seem to be consistent, so + use that. */ + const char * p; + const char * first; + Size_t offset = 0; + const char * name = my_setlocale(LC_CTYPE, NULL); + + if (isNAME_C_OR_POSIX(name)) { + return "ANSI_X3.4-1968"; + } + + /* Find the dot in the locale name */ + first = (const char *) strchr(name, '.'); + if (! first) { + first = name; + goto has_nondigit; + } + + /* Look at everything past the dot */ + first++; + p = first; + + while (*p) { + if (! isDIGIT(*p)) { + goto has_nondigit; + } + + p++; + } + + /* Here everything past the dot is a digit. Treat it as a + * code page */ + retval = save_to_buffer("CP", &PL_langinfo_buf, + &PL_langinfo_bufsize, 0); + offset = STRLENs("CP"); + + has_nondigit: + + retval = save_to_buffer(first, &PL_langinfo_buf, + &PL_langinfo_bufsize, offset); + } + + break; + +# endif +# ifdef HAS_LOCALECONV + + case CRNCYSTR: - default: - return ""; + /* We don't bother with localeconv_l() because any system that + * has it is likely to also have nl_langinfo() */ - /* We use only an English set, since we don't know any more */ - case PERL_YESEXPR: return "^[+1yY]"; - case PERL_YESSTR: return "yes"; - case PERL_NOEXPR: return "^[-0nN]"; - case PERL_NOSTR: return "no"; + LOCALE_LOCK_V; /* Prevent interference with other threads + using localeconv() */ -# ifdef HAS_LOCALECONV +# ifdef TS_W32_BROKEN_LOCALECONV - case PERL_CRNCYSTR: + /* This is a workaround for a Windows bug prior to VS 15. + * What we do here is, while locked, switch to the global + * locale so localeconv() works; then switch back just before + * the unlock. This can screw things up if some thread is + * already using the global locale while assuming no other is. + * A different workaround would be to call GetCurrencyFormat on + * a known value, and parse it; patches welcome + * + * We have to use LC_ALL instead of LC_MONETARY because of + * another bug in Windows */ - LOCALE_LOCK; + save_thread = savepv(my_setlocale(LC_ALL, NULL)); + _configthreadlocale(_DISABLE_PER_THREAD_LOCALE); + save_global= savepv(my_setlocale(LC_ALL, NULL)); + my_setlocale(LC_ALL, save_thread); - /* We don't bother with localeconv_l() because any system that - * has it is likely to also have nl_langinfo() */ +# endif lc = localeconv(); if ( ! lc || ! lc->currency_symbol || strEQ("", lc->currency_symbol)) { - LOCALE_UNLOCK; + LOCALE_UNLOCK_V; return ""; } /* Leave the first spot empty to be filled in below */ - save_to_buffer(lc->currency_symbol, &PL_langinfo_buf, - &PL_langinfo_bufsize, 1); + retval = save_to_buffer(lc->currency_symbol, &PL_langinfo_buf, + &PL_langinfo_bufsize, 1); if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, "")) { /* khw couldn't figure out how the localedef specifications would show that the $ should replace the radix; this is just a guess as to how it might work.*/ - *PL_langinfo_buf = '.'; + PL_langinfo_buf[0] = '.'; } else if (lc->p_cs_precedes) { - *PL_langinfo_buf = '-'; + PL_langinfo_buf[0] = '-'; } else { - *PL_langinfo_buf = '+'; + PL_langinfo_buf[0] = '+'; } - LOCALE_UNLOCK; +# ifdef TS_W32_BROKEN_LOCALECONV + + my_setlocale(LC_ALL, save_global); + _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); + my_setlocale(LC_ALL, save_thread); + Safefree(save_global); + Safefree(save_thread); + +# endif + + LOCALE_UNLOCK_V; break; - case PERL_RADIXCHAR: - case PERL_THOUSEP: +# ifdef TS_W32_BROKEN_LOCALECONV - LOCALE_LOCK; + case RADIXCHAR: + + /* For this, we output a known simple floating point number to + * a buffer, and parse it, looking for the radix */ if (toggle) { - if (! PL_numeric_underlying) { - do_setlocale_c(LC_NUMERIC, PL_numeric_name); - } - else { - toggle = FALSE; - } + STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); + } + + if (PL_langinfo_bufsize < 10) { + PL_langinfo_bufsize = 10; + Renew(PL_langinfo_buf, PL_langinfo_bufsize, char); + } + + needed_size = my_snprintf(PL_langinfo_buf, PL_langinfo_bufsize, + "%.1f", 1.5); + if (needed_size >= (int) PL_langinfo_bufsize) { + PL_langinfo_bufsize = needed_size + 1; + Renew(PL_langinfo_buf, PL_langinfo_bufsize, char); + needed_size = my_snprintf(PL_langinfo_buf, PL_langinfo_bufsize, + "%.1f", 1.5); + assert(needed_size < (int) PL_langinfo_bufsize); + } + + ptr = PL_langinfo_buf; + e = PL_langinfo_buf + PL_langinfo_bufsize; + + /* Find the '1' */ + while (ptr < e && *ptr != '1') { + ptr++; + } + ptr++; + + /* Find the '5' */ + item_start = ptr; + while (ptr < e && *ptr != '5') { + ptr++; + } + + /* Everything in between is the radix string */ + if (ptr >= e) { + PL_langinfo_buf[0] = '?'; + PL_langinfo_buf[1] = '\0'; + } + else { + *ptr = '\0'; + Move(item_start, PL_langinfo_buf, ptr - PL_langinfo_buf, char); + } + + if (toggle) { + RESTORE_LC_NUMERIC(); + } + + retval = PL_langinfo_buf; + break; + +# else + + case RADIXCHAR: /* No special handling needed */ + +# endif + + case THOUSEP: + + if (toggle) { + STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); } + LOCALE_LOCK_V; /* Prevent interference with other threads + using localeconv() */ + +# ifdef TS_W32_BROKEN_LOCALECONV + + /* This should only be for the thousands separator. A + * different work around would be to use GetNumberFormat on a + * known value and parse the result to find the separator */ + save_thread = savepv(my_setlocale(LC_ALL, NULL)); + _configthreadlocale(_DISABLE_PER_THREAD_LOCALE); + save_global = savepv(my_setlocale(LC_ALL, NULL)); + my_setlocale(LC_ALL, save_thread); +# if 0 + /* This is the start of code that for broken Windows replaces + * the above and below code, and instead calls + * GetNumberFormat() and then would parse that to find the + * thousands separator. It needs to handle UTF-16 vs -8 + * issues. */ + + needed_size = GetNumberFormatEx(PL_numeric_name, 0, "1234.5", NULL, PL_langinfo_buf, PL_langinfo_bufsize); + DEBUG_L(PerlIO_printf(Perl_debug_log, + "%s: %d: return from GetNumber, count=%d, val=%s\n", + __FILE__, __LINE__, needed_size, PL_langinfo_buf)); + +# endif +# endif + lc = localeconv(); if (! lc) { - retval = ""; + temp = ""; } else { - retval = (item == PERL_RADIXCHAR) + temp = (item == RADIXCHAR) ? lc->decimal_point : lc->thousands_sep; - if (! retval) { - retval = ""; + if (! temp) { + temp = ""; } } - save_to_buffer(retval, &PL_langinfo_buf, - &PL_langinfo_bufsize, 0); + retval = save_to_buffer(temp, &PL_langinfo_buf, + &PL_langinfo_bufsize, 0); + +# ifdef TS_W32_BROKEN_LOCALECONV + + my_setlocale(LC_ALL, save_global); + _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); + my_setlocale(LC_ALL, save_thread); + Safefree(save_global); + Safefree(save_thread); + +# endif + + LOCALE_UNLOCK_V; if (toggle) { - do_setlocale_c(LC_NUMERIC, "C"); + RESTORE_LC_NUMERIC(); } - LOCALE_UNLOCK; - break; # endif @@ -1601,30 +2885,26 @@ S_my_nl_langinfo(const int item, bool toggle) * for someone using them as formats (as opposed to trying to parse * them to figure out what the locale says). The other format * items are actually tested to verify they work on the platform */ - case PERL_D_FMT: return "%x"; - case PERL_T_FMT: return "%X"; - case PERL_D_T_FMT: return "%c"; + case D_FMT: return "%x"; + case T_FMT: return "%X"; + case D_T_FMT: return "%c"; /* These formats are only available in later strfmtime's */ - case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT: - case PERL_T_FMT_AMPM: + case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM: /* The rest can be gotten from most versions of strftime(). */ - case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3: - case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6: - case PERL_ABDAY_7: - case PERL_ALT_DIGITS: - case PERL_AM_STR: case PERL_PM_STR: - case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3: - case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6: - case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9: - case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12: - case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4: - case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7: - case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4: - case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8: - case PERL_MON_9: case PERL_MON_10: case PERL_MON_11: - case PERL_MON_12: + case ABDAY_1: case ABDAY_2: case ABDAY_3: + case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7: + case ALT_DIGITS: + case AM_STR: case PM_STR: + case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4: + case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8: + case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12: + case DAY_1: case DAY_2: case DAY_3: case DAY_4: + case DAY_5: case DAY_6: case DAY_7: + case MON_1: case MON_2: case MON_3: case MON_4: + case MON_5: case MON_6: case MON_7: case MON_8: + case MON_9: case MON_10: case MON_11: case MON_12: LOCALE_LOCK; @@ -1643,82 +2923,82 @@ S_my_nl_langinfo(const int item, bool toggle) __FILE__, __LINE__, item); NOT_REACHED; /* NOTREACHED */ - case PERL_PM_STR: tm.tm_hour = 18; - case PERL_AM_STR: + case PM_STR: tm.tm_hour = 18; + case AM_STR: format = "%p"; break; - case PERL_ABDAY_7: tm.tm_wday++; - case PERL_ABDAY_6: tm.tm_wday++; - case PERL_ABDAY_5: tm.tm_wday++; - case PERL_ABDAY_4: tm.tm_wday++; - case PERL_ABDAY_3: tm.tm_wday++; - case PERL_ABDAY_2: tm.tm_wday++; - case PERL_ABDAY_1: + case ABDAY_7: tm.tm_wday++; + case ABDAY_6: tm.tm_wday++; + case ABDAY_5: tm.tm_wday++; + case ABDAY_4: tm.tm_wday++; + case ABDAY_3: tm.tm_wday++; + case ABDAY_2: tm.tm_wday++; + case ABDAY_1: format = "%a"; break; - case PERL_DAY_7: tm.tm_wday++; - case PERL_DAY_6: tm.tm_wday++; - case PERL_DAY_5: tm.tm_wday++; - case PERL_DAY_4: tm.tm_wday++; - case PERL_DAY_3: tm.tm_wday++; - case PERL_DAY_2: tm.tm_wday++; - case PERL_DAY_1: + case DAY_7: tm.tm_wday++; + case DAY_6: tm.tm_wday++; + case DAY_5: tm.tm_wday++; + case DAY_4: tm.tm_wday++; + case DAY_3: tm.tm_wday++; + case DAY_2: tm.tm_wday++; + case DAY_1: format = "%A"; break; - case PERL_ABMON_12: tm.tm_mon++; - case PERL_ABMON_11: tm.tm_mon++; - case PERL_ABMON_10: tm.tm_mon++; - case PERL_ABMON_9: tm.tm_mon++; - case PERL_ABMON_8: tm.tm_mon++; - case PERL_ABMON_7: tm.tm_mon++; - case PERL_ABMON_6: tm.tm_mon++; - case PERL_ABMON_5: tm.tm_mon++; - case PERL_ABMON_4: tm.tm_mon++; - case PERL_ABMON_3: tm.tm_mon++; - case PERL_ABMON_2: tm.tm_mon++; - case PERL_ABMON_1: + case ABMON_12: tm.tm_mon++; + case ABMON_11: tm.tm_mon++; + case ABMON_10: tm.tm_mon++; + case ABMON_9: tm.tm_mon++; + case ABMON_8: tm.tm_mon++; + case ABMON_7: tm.tm_mon++; + case ABMON_6: tm.tm_mon++; + case ABMON_5: tm.tm_mon++; + case ABMON_4: tm.tm_mon++; + case ABMON_3: tm.tm_mon++; + case ABMON_2: tm.tm_mon++; + case ABMON_1: format = "%b"; break; - case PERL_MON_12: tm.tm_mon++; - case PERL_MON_11: tm.tm_mon++; - case PERL_MON_10: tm.tm_mon++; - case PERL_MON_9: tm.tm_mon++; - case PERL_MON_8: tm.tm_mon++; - case PERL_MON_7: tm.tm_mon++; - case PERL_MON_6: tm.tm_mon++; - case PERL_MON_5: tm.tm_mon++; - case PERL_MON_4: tm.tm_mon++; - case PERL_MON_3: tm.tm_mon++; - case PERL_MON_2: tm.tm_mon++; - case PERL_MON_1: + case MON_12: tm.tm_mon++; + case MON_11: tm.tm_mon++; + case MON_10: tm.tm_mon++; + case MON_9: tm.tm_mon++; + case MON_8: tm.tm_mon++; + case MON_7: tm.tm_mon++; + case MON_6: tm.tm_mon++; + case MON_5: tm.tm_mon++; + case MON_4: tm.tm_mon++; + case MON_3: tm.tm_mon++; + case MON_2: tm.tm_mon++; + case MON_1: format = "%B"; break; - case PERL_T_FMT_AMPM: + case T_FMT_AMPM: format = "%r"; return_format = TRUE; break; - case PERL_ERA_D_FMT: + case ERA_D_FMT: format = "%Ex"; return_format = TRUE; break; - case PERL_ERA_T_FMT: + case ERA_T_FMT: format = "%EX"; return_format = TRUE; break; - case PERL_ERA_D_T_FMT: + case ERA_D_T_FMT: format = "%Ec"; return_format = TRUE; break; - case PERL_ALT_DIGITS: + case ALT_DIGITS: tm.tm_wday = 0; format = "%Ow"; /* Find the alternate digit for 0 */ break; @@ -1786,7 +3066,7 @@ S_my_nl_langinfo(const int item, bool toggle) * alternate format for wday 0. If the value is the same as * the normal 0, there isn't an alternate, so clear the buffer. * */ - if ( item == PERL_ALT_DIGITS + if ( item == ALT_DIGITS && strEQ(PL_langinfo_buf, "0")) { *PL_langinfo_buf = '\0'; @@ -1815,6 +3095,8 @@ S_my_nl_langinfo(const int item, bool toggle) LOCALE_UNLOCK; + retval = PL_langinfo_buf; + /* If to return the format, not the value, overwrite the buffer * with it. But some strftime()s will keep the original format * if illegal, so change those to "" */ @@ -1823,8 +3105,8 @@ S_my_nl_langinfo(const int item, bool toggle) *PL_langinfo_buf = '\0'; } else { - save_to_buffer(format, &PL_langinfo_buf, - &PL_langinfo_bufsize, 0); + retval = save_to_buffer(format, &PL_langinfo_buf, + &PL_langinfo_bufsize, 0); } } @@ -1835,7 +3117,7 @@ S_my_nl_langinfo(const int item, bool toggle) } } - return PL_langinfo_buf; + return retval; #endif @@ -1895,6 +3177,8 @@ Perl_init_i18nl10n(pTHX_ int printwarn) * values for our db, instead of trying to change them. * */ + dVAR; + int ok = 1; #ifndef USE_LOCALE @@ -1971,53 +3255,127 @@ Perl_init_i18nl10n(pTHX_ int printwarn) # ifdef USE_LOCALE_NUMERIC assert(categories[LC_NUMERIC_INDEX] == LC_NUMERIC); assert(strEQ(category_names[LC_NUMERIC_INDEX], "LC_NUMERIC")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_NUMERIC_INDEX] == LC_NUMERIC_MASK); +# endif # endif # ifdef USE_LOCALE_CTYPE assert(categories[LC_CTYPE_INDEX] == LC_CTYPE); assert(strEQ(category_names[LC_CTYPE_INDEX], "LC_CTYPE")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_CTYPE_INDEX] == LC_CTYPE_MASK); +# endif # endif # ifdef USE_LOCALE_COLLATE assert(categories[LC_COLLATE_INDEX] == LC_COLLATE); assert(strEQ(category_names[LC_COLLATE_INDEX], "LC_COLLATE")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_COLLATE_INDEX] == LC_COLLATE_MASK); +# endif # endif # ifdef USE_LOCALE_TIME assert(categories[LC_TIME_INDEX] == LC_TIME); assert(strEQ(category_names[LC_TIME_INDEX], "LC_TIME")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_TIME_INDEX] == LC_TIME_MASK); +# endif # endif # ifdef USE_LOCALE_MESSAGES assert(categories[LC_MESSAGES_INDEX] == LC_MESSAGES); assert(strEQ(category_names[LC_MESSAGES_INDEX], "LC_MESSAGES")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_MESSAGES_INDEX] == LC_MESSAGES_MASK); +# endif # endif # ifdef USE_LOCALE_MONETARY assert(categories[LC_MONETARY_INDEX] == LC_MONETARY); assert(strEQ(category_names[LC_MONETARY_INDEX], "LC_MONETARY")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_MONETARY_INDEX] == LC_MONETARY_MASK); +# endif # endif # ifdef USE_LOCALE_ADDRESS assert(categories[LC_ADDRESS_INDEX] == LC_ADDRESS); assert(strEQ(category_names[LC_ADDRESS_INDEX], "LC_ADDRESS")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_ADDRESS_INDEX] == LC_ADDRESS_MASK); +# endif # endif # ifdef USE_LOCALE_IDENTIFICATION assert(categories[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION); assert(strEQ(category_names[LC_IDENTIFICATION_INDEX], "LC_IDENTIFICATION")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION_MASK); +# endif # endif # ifdef USE_LOCALE_MEASUREMENT assert(categories[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT); assert(strEQ(category_names[LC_MEASUREMENT_INDEX], "LC_MEASUREMENT")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT_MASK); +# endif # endif # ifdef USE_LOCALE_PAPER assert(categories[LC_PAPER_INDEX] == LC_PAPER); assert(strEQ(category_names[LC_PAPER_INDEX], "LC_PAPER")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_PAPER_INDEX] == LC_PAPER_MASK); +# endif # endif # ifdef USE_LOCALE_TELEPHONE assert(categories[LC_TELEPHONE_INDEX] == LC_TELEPHONE); assert(strEQ(category_names[LC_TELEPHONE_INDEX], "LC_TELEPHONE")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_TELEPHONE_INDEX] == LC_TELEPHONE_MASK); +# endif # endif # ifdef LC_ALL assert(categories[LC_ALL_INDEX] == LC_ALL); assert(strEQ(category_names[LC_ALL_INDEX], "LC_ALL")); assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_ALL_INDEX] == LC_ALL_MASK); +# endif # endif # endif /* DEBUGGING */ + + /* Initialize the cache of the program's UTF-8ness for the always known + * locales C and POSIX */ + my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness, + sizeof(PL_locale_utf8ness)); + +# ifdef USE_THREAD_SAFE_LOCALE +# ifdef WIN32 + + _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); + +# endif +# endif +# ifdef USE_POSIX_2008_LOCALE + + PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0); + if (! PL_C_locale_obj) { + Perl_croak_nocontext( + "panic: Cannot create POSIX 2008 C locale object; errno=%d", errno); + } + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: created C object %p\n", __FILE__, __LINE__, PL_C_locale_obj); + } + +# endif + +# ifdef USE_LOCALE_NUMERIC + + PL_numeric_radix_sv = newSVpvs("."); + +# endif + +# if defined(USE_POSIX_2008_LOCALE) && ! defined(HAS_QUERYLOCALE) + + /* Initialize our records. If we have POSIX 2008, we have LC_ALL */ + do_setlocale_c(LC_ALL, my_setlocale(LC_ALL, NULL)); + +# endif # ifdef LOCALE_ENVIRON_REQUIRED /* @@ -2374,8 +3732,25 @@ Perl_init_i18nl10n(pTHX_ int printwarn) # endif - for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + +# if defined(USE_ITHREADS) && ! defined(USE_THREAD_SAFE_LOCALE) + + /* This caches whether each category's locale is UTF-8 or not. This + * may involve changing the locale. It is ok to do this at + * initialization time before any threads have started, but not later + * unless thread-safe operations are used. + * Caching means that if the program heeds our dictate not to change + * locales in threaded applications, this data will remain valid, and + * it may get queried without having to change locales. If the + * environment is such that all categories have the same locale, this + * isn't needed, as the code will not change the locale; but this + * handles the uncommon case where the environment has disparate + * locales for the categories */ + (void) _is_cur_LC_category_utf8(categories[i]); + +# endif + Safefree(curlocales[i]); } @@ -2612,6 +3987,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, /* Make sure the UTF8ness of the string and locale match */ if (utf8 != PL_in_utf8_COLLATE_locale) { + /* XXX convert above Unicode to 10FFFF? */ const char * const t = s; /* Temporary so we can later find where the input was */ @@ -2973,6 +4349,11 @@ S_print_collxfrm_input_and_return(pTHX_ PerlIO_printf(Perl_debug_log, "'\n"); } +# endif /* DEBUGGING */ +#endif /* USE_LOCALE_COLLATE */ +#ifdef USE_LOCALE +# ifdef DEBUGGING + STATIC void S_print_bytes_for_locale(pTHX_ const char * const s, @@ -3009,9 +4390,91 @@ S_print_bytes_for_locale(pTHX_ } # endif /* #ifdef DEBUGGING */ -#endif /* USE_LOCALE_COLLATE */ -#ifdef USE_LOCALE +STATIC const char * +S_switch_category_locale_to_template(pTHX_ const int switch_category, const int template_category, const char * template_locale) +{ + /* Changes the locale for LC_'switch_category" to that of + * LC_'template_category', if they aren't already the same. If not NULL, + * 'template_locale' is the locale that 'template_category' is in. + * + * Returns a copy of the name of the original locale for 'switch_category' + * so can be switched back to with the companion function + * restore_switched_locale(), (NULL if no restoral is necessary.) */ + + char * restore_to_locale = NULL; + + if (switch_category == template_category) { /* No changes needed */ + return NULL; + } + + /* Find the original locale of the category we may need to change, so that + * it can be restored to later */ + restore_to_locale = stdize_locale(savepv(do_setlocale_r(switch_category, + NULL))); + if (! restore_to_locale) { + Perl_croak(aTHX_ + "panic: %s: %d: Could not find current %s locale, errno=%d\n", + __FILE__, __LINE__, category_name(switch_category), errno); + } + + /* If the locale of the template category wasn't passed in, find it now */ + if (template_locale == NULL) { + template_locale = do_setlocale_r(template_category, NULL); + if (! template_locale) { + Perl_croak(aTHX_ + "panic: %s: %d: Could not find current %s locale, errno=%d\n", + __FILE__, __LINE__, category_name(template_category), errno); + } + } + + /* It the locales are the same, there's nothing to do */ + if (strEQ(restore_to_locale, template_locale)) { + Safefree(restore_to_locale); + + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n", + category_name(switch_category), restore_to_locale)); + + return NULL; + } + + /* Finally, change the locale to the template one */ + if (! do_setlocale_r(switch_category, template_locale)) { + Perl_croak(aTHX_ + "panic: %s: %d: Could not change %s locale to %s, errno=%d\n", + __FILE__, __LINE__, category_name(switch_category), + template_locale, errno); + } + + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale switched to %s\n", + category_name(switch_category), template_locale)); + + return restore_to_locale; +} + +STATIC void +S_restore_switched_locale(pTHX_ const int category, const char * const original_locale) +{ + /* Restores the locale for LC_'category' to 'original_locale' (which is a + * copy that will be freed by this function), or do nothing if the latter + * parameter is NULL */ + + if (original_locale == NULL) { + return; + } + + if (! do_setlocale_r(category, original_locale)) { + Perl_croak(aTHX_ + "panic: %s: %d: setlocale %s restore to %s failed, errno=%d\n", + __FILE__, __LINE__, + category_name(category), original_locale, errno); + } + + Safefree(original_locale); +} + +/* is_cur_LC_category_utf8 uses a small char buffer to avoid malloc/free */ +#define CUR_LC_BUFFER_SIZE 64 bool Perl__is_cur_LC_category_utf8(pTHX_ int category) @@ -3022,10 +4485,38 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * could give the wrong result. The result will very likely be correct for * languages that have commonly used non-ASCII characters, but for notably * English, it comes down to if the locale's name ends in something like - * "UTF-8". It errs on the side of not being a UTF-8 locale. */ - + * "UTF-8". It errs on the side of not being a UTF-8 locale. + * + * If the platform is early C89, not containing mbtowc(), or we are + * compiled to not pay attention to LC_CTYPE, this employs heuristics. + * These work very well for non-Latin locales or those whose currency + * symbol isn't a '$' nor plain ASCII text. But without LC_CTYPE and at + * least MB_CUR_MAX, English locales with an ASCII currency symbol depend + * on the name containing UTF-8 or not. */ + + /* Name of current locale corresponding to the input category */ const char *save_input_locale = NULL; - STRLEN final_pos; + + bool is_utf8 = FALSE; /* The return value */ + + /* The variables below are for the cache of previous lookups using this + * function. The cache is a C string, described at the definition for + * 'C_and_POSIX_utf8ness'. + * + * The first part of the cache is fixed, for the C and POSIX locales. The + * varying part starts just after them. */ + char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness); + + Size_t utf8ness_cache_size; /* Size of the varying portion */ + Size_t input_name_len; /* Length in bytes of save_input_locale */ + Size_t input_name_len_with_overhead; /* plus extra chars used to store + the name in the cache */ + char * delimited; /* The name plus the delimiters used to store + it in the cache */ + char buffer[CUR_LC_BUFFER_SIZE]; /* small buffer */ + char * name_pos; /* position of 'delimited' in the cache, or 0 + if not there */ + # ifdef LC_ALL @@ -3033,424 +4524,373 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) # endif - /* First dispose of the trivial cases */ - save_input_locale = do_setlocale_r(category, NULL); + /* Get the desired category's locale */ + save_input_locale = stdize_locale(savepv(do_setlocale_r(category, NULL))); if (! save_input_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not find current locale for category %d\n", - category)); - return FALSE; /* XXX maybe should croak */ + Perl_croak(aTHX_ + "panic: %s: %d: Could not find current %s locale, errno=%d\n", + __FILE__, __LINE__, category_name(category), errno); } - save_input_locale = stdize_locale(savepv(save_input_locale)); - if (isNAME_C_OR_POSIX(save_input_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Current locale for category %d is %s\n", - category, save_input_locale)); - Safefree(save_input_locale); - return FALSE; + + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Current locale for %s is %s\n", + category_name(category), save_input_locale)); + + input_name_len = strlen(save_input_locale); + + /* In our cache, each name is accompanied by two delimiters and a single + * utf8ness digit */ + input_name_len_with_overhead = input_name_len + 3; + + if ( input_name_len_with_overhead <= CUR_LC_BUFFER_SIZE ) { + /* we can use the buffer, avoid a malloc */ + delimited = buffer; + } else { /* need a malloc */ + /* Allocate and populate space for a copy of the name surrounded by the + * delimiters */ + Newx(delimited, input_name_len_with_overhead, char); } -# if defined(USE_LOCALE_CTYPE) \ - && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET))) + delimited[0] = UTF8NESS_SEP[0]; + Copy(save_input_locale, delimited + 1, input_name_len, char); + delimited[input_name_len+1] = UTF8NESS_PREFIX[0]; + delimited[input_name_len+2] = '\0'; - { /* Next try nl_langinfo or MB_CUR_MAX if available */ + /* And see if that is in the cache */ + name_pos = instr(PL_locale_utf8ness, delimited); + if (name_pos) { + is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0'; - char *save_ctype_locale = NULL; - bool is_utf8; +# ifdef DEBUGGING - if (category != LC_CTYPE) { /* These work only on LC_CTYPE */ + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "UTF8ness for locale %s=%d, \n", + save_input_locale, is_utf8); + } - /* Get the current LC_CTYPE locale */ - save_ctype_locale = do_setlocale_c(LC_CTYPE, NULL); - if (! save_ctype_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not find current locale for LC_CTYPE\n")); - goto cant_use_nllanginfo; - } - save_ctype_locale = stdize_locale(savepv(save_ctype_locale)); +# endif - /* If LC_CTYPE and the desired category use the same locale, this - * means that finding the value for LC_CTYPE is the same as finding - * the value for the desired category. Otherwise, switch LC_CTYPE - * to the desired category's locale */ - if (strEQ(save_ctype_locale, save_input_locale)) { - Safefree(save_ctype_locale); - save_ctype_locale = NULL; - } - else if (! do_setlocale_c(LC_CTYPE, save_input_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not change LC_CTYPE locale to %s\n", - save_input_locale)); - Safefree(save_ctype_locale); - goto cant_use_nllanginfo; - } + /* And, if not already in that position, move it to the beginning of + * the non-constant portion of the list, since it is the most recently + * used. (We don't have to worry about overflow, since just moving + * existing names around) */ + if (name_pos > utf8ness_cache) { + Move(utf8ness_cache, + utf8ness_cache + input_name_len_with_overhead, + name_pos - utf8ness_cache, char); + Copy(delimited, + utf8ness_cache, + input_name_len_with_overhead - 1, char); + utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0'; } - DEBUG_L(PerlIO_printf(Perl_debug_log, "Current LC_CTYPE locale=%s\n", - save_input_locale)); + /* free only when not using the buffer */ + if ( delimited != buffer ) Safefree(delimited); + Safefree(save_input_locale); + return is_utf8; + } + + /* Here we don't have stored the utf8ness for the input locale. We have to + * calculate it */ + +# if defined(USE_LOCALE_CTYPE) \ + && ( defined(HAS_NL_LANGINFO) \ + || (defined(HAS_MBTOWC) || defined(HAS_MBRTOWC))) + + { + const char *original_ctype_locale + = switch_category_locale_to_template(LC_CTYPE, + category, + save_input_locale); /* Here the current LC_CTYPE is set to the locale of the category whose - * information is desired. This means that nl_langinfo() and MB_CUR_MAX + * information is desired. This means that nl_langinfo() and mbtowc() * should give the correct results */ -# if defined(HAS_NL_LANGINFO) && defined(CODESET) +# ifdef MB_CUR_MAX /* But we can potentially rule out UTF-8ness, avoiding + calling the functions if we have this */ + + /* Standard UTF-8 needs at least 4 bytes to represent the maximum + * Unicode code point. */ + + DEBUG_L(PerlIO_printf(Perl_debug_log, "%s: %d: MB_CUR_MAX=%d\n", + __FILE__, __LINE__, (int) MB_CUR_MAX)); + if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) { + is_utf8 = FALSE; + restore_switched_locale(LC_CTYPE, original_ctype_locale); + goto finish_and_return; + } - { /* The task is easiest if the platform has this POSIX 2001 function */ - const char *codeset = my_nl_langinfo(PERL_CODESET, FALSE); +# endif +# if defined(HAS_NL_LANGINFO) + + { /* The task is easiest if the platform has this POSIX 2001 function. + Except on some platforms it can wrongly return "", so have to have + a fallback. And it can return that it's UTF-8, even if there are + variances from that. For example, Turkish locales may use the + alternate dotted I rules, and sometimes it appears to be a + defective locale definition. XXX We should probably check for + these in the Latin1 range and warn (but on glibc, requires + iswalnum() etc. due to their not handling 80-FF correctly */ + const char *codeset = my_nl_langinfo(CODESET, FALSE); /* FALSE => already in dest locale */ - DEBUG_L(PerlIO_printf(Perl_debug_log, + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\tnllanginfo returned CODESET '%s'\n", codeset)); if (codeset && strNE(codeset, "")) { - /* If we switched LC_CTYPE, switch back */ - if (save_ctype_locale) { - do_setlocale_c(LC_CTYPE, save_ctype_locale); - Safefree(save_ctype_locale); - } - is_utf8 = ( ( strlen(codeset) == STRLENs("UTF-8") - && foldEQ(codeset, STR_WITH_LEN("UTF-8"))) - || ( strlen(codeset) == STRLENs("UTF8") - && foldEQ(codeset, STR_WITH_LEN("UTF8")))); + /* If the implementation of foldEQ() somehow were + * to change to not go byte-by-byte, this could + * read past end of string, as only one length is + * checked. But currently, a premature NUL will + * compare false, and it will stop there */ + is_utf8 = cBOOL( foldEQ(codeset, STR_WITH_LEN("UTF-8")) + || foldEQ(codeset, STR_WITH_LEN("UTF8"))); DEBUG_L(PerlIO_printf(Perl_debug_log, "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n", codeset, is_utf8)); - Safefree(save_input_locale); - return is_utf8; + restore_switched_locale(LC_CTYPE, original_ctype_locale); + goto finish_and_return; } } # endif -# ifdef MB_CUR_MAX - - /* Here, either we don't have nl_langinfo, or it didn't return a - * codeset. Try MB_CUR_MAX */ - - /* Standard UTF-8 needs at least 4 bytes to represent the maximum - * Unicode code point. Since UTF-8 is the only non-single byte - * encoding we handle, we just say any such encoding is UTF-8, and if - * turns out to be wrong, other things will fail */ - is_utf8 = (unsigned) MB_CUR_MAX >= STRLENs(MAX_UNICODE_UTF8); - - DEBUG_L(PerlIO_printf(Perl_debug_log, - "\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n", - (int) MB_CUR_MAX, is_utf8)); - - Safefree(save_input_locale); - -# ifdef HAS_MBTOWC +# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC) + /* We can see if this is a UTF-8-like locale if have mbtowc(). It was a + * late adder to C89, so very likely to have it. However, testing has + * shown that, like nl_langinfo() above, there are locales that are not + * strictly UTF-8 that this will return that they are */ - /* ... But, most system that have MB_CUR_MAX will also have mbtowc(), - * since they are both in the C99 standard. We can feed a known byte - * string to the latter function, and check that it gives the expected - * result */ - if (is_utf8) { + { wchar_t wc; int len; + dSAVEDERRNO; - PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */ - errno = 0; - len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8)); - +# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS) - if ( len != STRLENs(REPLACEMENT_CHARACTER_UTF8) - || wc != (wchar_t) UNICODE_REPLACEMENT) - { - is_utf8 = FALSE; - DEBUG_L(PerlIO_printf(Perl_debug_log, "\replacement=U+%x\n", - (unsigned int)wc)); - DEBUG_L(PerlIO_printf(Perl_debug_log, - "\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n", - len, errno)); - } - } + mbstate_t ps; # endif - /* If we switched LC_CTYPE, switch back */ - if (save_ctype_locale) { - do_setlocale_c(LC_CTYPE, save_ctype_locale); - Safefree(save_ctype_locale); - } - - return is_utf8; - -# endif - - } - - cant_use_nllanginfo: - -# else /* nl_langinfo should work if available, so don't bother compiling this - fallback code. The final fallback of looking at the name is - compiled, and will be executed if nl_langinfo fails */ - - /* nl_langinfo not available or failed somehow. Next try looking at the - * currency symbol to see if it disambiguates things. Often that will be - * in the native script, and if the symbol isn't in UTF-8, we know that the - * locale isn't. If it is non-ASCII UTF-8, we infer that the locale is - * too, as the odds of a non-UTF8 string being valid UTF-8 are quite small - * */ - -# ifdef HAS_LOCALECONV -# ifdef USE_LOCALE_MONETARY - - { - char *save_monetary_locale = NULL; - bool only_ascii = FALSE; - bool is_utf8 = FALSE; - struct lconv* lc; + /* mbrtowc() and mbtowc() convert a byte string to a wide + * character. Feed a byte string to one of them and check that the + * result is the expected Unicode code point */ - /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of - * the desired category, if it isn't that locale already */ +# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS) + /* Prefer this function if available, as it's reentrant */ - if (category != LC_MONETARY) { + memset(&ps, 0, sizeof(ps));; + PERL_UNUSED_RESULT(mbrtowc(&wc, NULL, 0, &ps)); /* Reset any shift + state */ + SETERRNO(0, 0); + len = mbrtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8), &ps); + SAVE_ERRNO; - save_monetary_locale = do_setlocale_c(LC_MONETARY, NULL); - if (! save_monetary_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not find current locale for LC_MONETARY\n")); - goto cant_use_monetary; - } - save_monetary_locale = stdize_locale(savepv(save_monetary_locale)); +# else - if (strEQ(save_monetary_locale, save_input_locale)) { - Safefree(save_monetary_locale); - save_monetary_locale = NULL; - } - else if (! do_setlocale_c(LC_MONETARY, save_input_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not change LC_MONETARY locale to %s\n", - save_input_locale)); - Safefree(save_monetary_locale); - goto cant_use_monetary; - } - } + LOCALE_LOCK; + PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */ + SETERRNO(0, 0); + len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8)); + SAVE_ERRNO; + LOCALE_UNLOCK; - /* Here the current LC_MONETARY is set to the locale of the category - * whose information is desired. */ +# endif - lc = localeconv(); - if (! lc - || ! lc->currency_symbol - || is_utf8_invariant_string((U8 *) lc->currency_symbol, 0)) - { - DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); - only_ascii = TRUE; - } - else { - is_utf8 = is_utf8_string((U8 *) lc->currency_symbol, 0); - } + RESTORE_ERRNO; + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "\treturn from mbtowc; len=%d; code_point=%x; errno=%d\n", + len, (unsigned int) wc, GET_ERRNO)); - /* If we changed it, restore LC_MONETARY to its original locale */ - if (save_monetary_locale) { - do_setlocale_c(LC_MONETARY, save_monetary_locale); - Safefree(save_monetary_locale); + is_utf8 = cBOOL( len == STRLENs(REPLACEMENT_CHARACTER_UTF8) + && wc == (wchar_t) UNICODE_REPLACEMENT); } - if (! only_ascii) { +# endif - /* It isn't a UTF-8 locale if the symbol is not legal UTF-8; - * otherwise assume the locale is UTF-8 if and only if the symbol - * is non-ascii UTF-8. */ - DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n", - save_input_locale, is_utf8)); - Safefree(save_input_locale); - return is_utf8; - } + restore_switched_locale(LC_CTYPE, original_ctype_locale); + goto finish_and_return; } - cant_use_monetary: -# endif /* USE_LOCALE_MONETARY */ -# endif /* HAS_LOCALECONV */ +# else -# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME) + /* Here, we must have a C89 compiler that doesn't have mbtowc(). Next + * try looking at the currency symbol to see if it disambiguates + * things. Often that will be in the native script, and if the symbol + * isn't in UTF-8, we know that the locale isn't. If it is non-ASCII + * UTF-8, we infer that the locale is too, as the odds of a non-UTF8 + * string being valid UTF-8 are quite small */ -/* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try - * the names of the months and weekdays, timezone, and am/pm indicator */ - { - char *save_time_locale = NULL; - int hour = 10; - bool is_dst = FALSE; - int dom = 1; - int month = 0; - int i; - char * formatted_time; +# ifdef USE_LOCALE_MONETARY + /* If have LC_MONETARY, we can look at the currency symbol. Often that + * will be in the native script. We do this one first because there is + * just one string to examine, so potentially avoids work */ - /* Like above for LC_MONETARY, we set LC_TIME to the locale of the - * desired category, if it isn't that locale already */ + { + const char *original_monetary_locale + = switch_category_locale_to_template(LC_MONETARY, + category, + save_input_locale); + bool only_ascii = FALSE; + const U8 * currency_string + = (const U8 *) my_nl_langinfo(CRNCYSTR, FALSE); + /* 2nd param not relevant for this item */ + const U8 * first_variant; - if (category != LC_TIME) { + assert( *currency_string == '-' + || *currency_string == '+' + || *currency_string == '.'); - save_time_locale = do_setlocale_c(LC_TIME, NULL); - if (! save_time_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not find current locale for LC_TIME\n")); - goto cant_use_time; - } - save_time_locale = stdize_locale(savepv(save_time_locale)); + currency_string++; - if (strEQ(save_time_locale, save_input_locale)) { - Safefree(save_time_locale); - save_time_locale = NULL; + if (is_utf8_invariant_string_loc(currency_string, 0, &first_variant)) + { + DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); + only_ascii = TRUE; } - else if (! do_setlocale_c(LC_TIME, save_input_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not change LC_TIME locale to %s\n", - save_input_locale)); - Safefree(save_time_locale); - goto cant_use_time; + else { + is_utf8 = is_strict_utf8_string(first_variant, 0); } - } - - /* Here the current LC_TIME is set to the locale of the category - * whose information is desired. Look at all the days of the week and - * month names, and the timezone and am/pm indicator for UTF-8 variant - * characters. The first such a one found will tell us if the locale - * is UTF-8 or not */ - for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */ - formatted_time = my_strftime("%A %B %Z %p", - 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst); - if ( ! formatted_time - || is_utf8_invariant_string((U8 *) formatted_time, 0)) - { + restore_switched_locale(LC_MONETARY, original_monetary_locale); - /* Here, we didn't find a non-ASCII. Try the next time through - * with the complemented dst and am/pm, and try with the next - * weekday. After we have gotten all weekdays, try the next - * month */ - is_dst = ! is_dst; - hour = (hour + 12) % 24; - dom++; - if (i > 6) { - month++; - } - continue; - } + if (! only_ascii) { - /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8; - * false otherwise. But first, restore LC_TIME to its original - * locale if we changed it */ - if (save_time_locale) { - do_setlocale_c(LC_TIME, save_time_locale); - Safefree(save_time_locale); + /* It isn't a UTF-8 locale if the symbol is not legal UTF-8; + * otherwise assume the locale is UTF-8 if and only if the symbol + * is non-ascii UTF-8. */ + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n", + save_input_locale, is_utf8)); + goto finish_and_return; } - - DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n", - save_input_locale, - is_utf8_string((U8 *) formatted_time, 0))); - Safefree(save_input_locale); - return is_utf8_string((U8 *) formatted_time, 0); } - /* Falling off the end of the loop indicates all the names were just - * ASCII. Go on to the next test. If we changed it, restore LC_TIME - * to its original locale */ - if (save_time_locale) { - do_setlocale_c(LC_TIME, save_time_locale); - Safefree(save_time_locale); - } - DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); - } - cant_use_time: - -# endif - -# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST) +# endif /* USE_LOCALE_MONETARY */ +# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME) -/* This code is ifdefd out because it was found to not be necessary in testing - * on our dromedary test machine, which has over 700 locales. There, this - * added no value to looking at the currency symbol and the time strings. I - * left it in so as to avoid rewriting it if real-world experience indicates - * that dromedary is an outlier. Essentially, instead of returning abpve if we - * haven't found illegal utf8, we continue on and examine all the strerror() - * messages on the platform for utf8ness. If all are ASCII, we still don't - * know the answer; but otherwise we have a pretty good indication of the - * utf8ness. The reason this doesn't help much is that the messages may not - * have been translated into the locale. The currency symbol and time strings - * are much more likely to have been translated. */ - { - int e; - bool is_utf8 = FALSE; - bool non_ascii = FALSE; - char *save_messages_locale = NULL; - const char * errmsg = NULL; + /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try + * the names of the months and weekdays, timezone, and am/pm indicator */ + { + const char *original_time_locale + = switch_category_locale_to_template(LC_TIME, + category, + save_input_locale); + int hour = 10; + bool is_dst = FALSE; + int dom = 1; + int month = 0; + int i; + char * formatted_time; + + /* Here the current LC_TIME is set to the locale of the category + * whose information is desired. Look at all the days of the week and + * month names, and the timezone and am/pm indicator for UTF-8 variant + * characters. The first such a one found will tell us if the locale + * is UTF-8 or not */ + + for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */ + formatted_time = my_strftime("%A %B %Z %p", + 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst); + if ( ! formatted_time + || is_utf8_invariant_string((U8 *) formatted_time, 0)) + { - /* Like above, we set LC_MESSAGES to the locale of the desired - * category, if it isn't that locale already */ + /* Here, we didn't find a non-ASCII. Try the next time through + * with the complemented dst and am/pm, and try with the next + * weekday. After we have gotten all weekdays, try the next + * month */ + is_dst = ! is_dst; + hour = (hour + 12) % 24; + dom++; + if (i > 6) { + month++; + } + continue; + } - if (category != LC_MESSAGES) { + /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8; + * false otherwise. But first, restore LC_TIME to its original + * locale if we changed it */ + restore_switched_locale(LC_TIME, original_time_locale); - save_messages_locale = do_setlocale_c(LC_MESSAGES, NULL); - if (! save_messages_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not find current locale for LC_MESSAGES\n")); - goto cant_use_messages; + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n", + save_input_locale, + is_utf8_string((U8 *) formatted_time, 0))); + is_utf8 = is_utf8_string((U8 *) formatted_time, 0); + goto finish_and_return; } - save_messages_locale = stdize_locale(savepv(save_messages_locale)); - if (strEQ(save_messages_locale, save_input_locale)) { - Safefree(save_messages_locale); - save_messages_locale = NULL; - } - else if (! do_setlocale_c(LC_MESSAGES, save_input_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not change LC_MESSAGES locale to %s\n", - save_input_locale)); - Safefree(save_messages_locale); - goto cant_use_messages; - } + /* Falling off the end of the loop indicates all the names were just + * ASCII. Go on to the next test. If we changed it, restore LC_TIME + * to its original locale */ + restore_switched_locale(LC_TIME, original_time_locale); + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); } - /* Here the current LC_MESSAGES is set to the locale of the category - * whose information is desired. Look through all the messages. We - * can't use Strerror() here because it may expand to code that - * segfaults in miniperl */ +# endif - for (e = 0; e <= sys_nerr; e++) { - errno = 0; - errmsg = sys_errlist[e]; - if (errno || !errmsg) { - break; - } - errmsg = savepv(errmsg); - if (! is_utf8_invariant_string((U8 *) errmsg, 0)) { - non_ascii = TRUE; - is_utf8 = is_utf8_string((U8 *) errmsg, 0); - break; +# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST) + + /* This code is ifdefd out because it was found to not be necessary in testing + * on our dromedary test machine, which has over 700 locales. There, this + * added no value to looking at the currency symbol and the time strings. I + * left it in so as to avoid rewriting it if real-world experience indicates + * that dromedary is an outlier. Essentially, instead of returning abpve if we + * haven't found illegal utf8, we continue on and examine all the strerror() + * messages on the platform for utf8ness. If all are ASCII, we still don't + * know the answer; but otherwise we have a pretty good indication of the + * utf8ness. The reason this doesn't help much is that the messages may not + * have been translated into the locale. The currency symbol and time strings + * are much more likely to have been translated. */ + { + int e; + bool non_ascii = FALSE; + const char *original_messages_locale + = switch_category_locale_to_template(LC_MESSAGES, + category, + save_input_locale); + const char * errmsg = NULL; + + /* Here the current LC_MESSAGES is set to the locale of the category + * whose information is desired. Look through all the messages. We + * can't use Strerror() here because it may expand to code that + * segfaults in miniperl */ + + for (e = 0; e <= sys_nerr; e++) { + errno = 0; + errmsg = sys_errlist[e]; + if (errno || !errmsg) { + break; + } + errmsg = savepv(errmsg); + if (! is_utf8_invariant_string((U8 *) errmsg, 0)) { + non_ascii = TRUE; + is_utf8 = is_utf8_string((U8 *) errmsg, 0); + break; + } } - } - Safefree(errmsg); + Safefree(errmsg); - /* And, if we changed it, restore LC_MESSAGES to its original locale */ - if (save_messages_locale) { - do_setlocale_c(LC_MESSAGES, save_messages_locale); - Safefree(save_messages_locale); - } + restore_switched_locale(LC_MESSAGES, original_messages_locale); - if (non_ascii) { + if (non_ascii) { - /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid, - * any non-ascii means it is one; otherwise we assume it isn't */ - DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n", - save_input_locale, - is_utf8)); - Safefree(save_input_locale); - return is_utf8; - } + /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid, + * any non-ascii means it is one; otherwise we assume it isn't */ + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n", + save_input_locale, + is_utf8)); + goto finish_and_return; + } - DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); - } - cant_use_messages: + DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); + } # endif -# endif /* the code that is compiled when no nl_langinfo */ - -# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a +# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a UTF-8 locale */ /* As a last resort, look at the locale name to see if it matches @@ -3460,77 +4900,194 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to * be a UTF-8 locale. Similarly for the other common names */ - final_pos = strlen(save_input_locale) - 1; - if (final_pos >= 3) { - const char *name = save_input_locale; + { + const Size_t final_pos = strlen(save_input_locale) - 1; - /* Find next 'U' or 'u' and look from there */ - while ((name += strcspn(name, "Uu") + 1) - <= save_input_locale + final_pos - 2) - { - if ( isALPHA_FOLD_NE(*name, 't') - || isALPHA_FOLD_NE(*(name + 1), 'f')) + if (final_pos >= 3) { + const char *name = save_input_locale; + + /* Find next 'U' or 'u' and look from there */ + while ((name += strcspn(name, "Uu") + 1) + <= save_input_locale + final_pos - 2) { - continue; - } - name += 2; - if (*(name) == '-') { - if ((name > save_input_locale + final_pos - 1)) { - break; + if ( isALPHA_FOLD_NE(*name, 't') + || isALPHA_FOLD_NE(*(name + 1), 'f')) + { + continue; + } + name += 2; + if (*(name) == '-') { + if ((name > save_input_locale + final_pos - 1)) { + break; + } + name++; + } + if (*(name) == '8') { + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Locale %s ends with UTF-8 in name\n", + save_input_locale)); + is_utf8 = TRUE; + goto finish_and_return; } - name++; - } - if (*(name) == '8') { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Locale %s ends with UTF-8 in name\n", - save_input_locale)); - Safefree(save_input_locale); - return TRUE; } + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Locale %s doesn't end with UTF-8 in name\n", + save_input_locale)); } - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Locale %s doesn't end with UTF-8 in name\n", - save_input_locale)); - } -# endif -# ifdef WIN32 +# ifdef WIN32 - /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */ - if (memENDs(save_input_locale, final_pos, "65001")) { - DEBUG_L(PerlIO_printf(Perl_debug_log, + /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */ + if (memENDs(save_input_locale, final_pos, "65001")) { + DEBUG_L(PerlIO_printf(Perl_debug_log, "Locale %s ends with 65001 in name, is UTF-8 locale\n", save_input_locale)); - Safefree(save_input_locale); - return TRUE; - } + is_utf8 = TRUE; + goto finish_and_return; + } -# endif +# endif + } +# endif /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But * since we are about to return FALSE anyway, there is no point in doing * this extra work */ -# if 0 +# if 0 if (instr(save_input_locale, "8859")) { DEBUG_L(PerlIO_printf(Perl_debug_log, "Locale %s has 8859 in name, not UTF-8 locale\n", save_input_locale)); - Safefree(save_input_locale); - return FALSE; + is_utf8 = FALSE; + goto finish_and_return; } -# endif +# endif DEBUG_L(PerlIO_printf(Perl_debug_log, "Assuming locale %s is not a UTF-8 locale\n", save_input_locale)); + is_utf8 = FALSE; + +# endif /* the code that is compiled when no modern LC_CTYPE */ + + finish_and_return: + + /* Cache this result so we don't have to go through all this next time. */ + utf8ness_cache_size = sizeof(PL_locale_utf8ness) + - (utf8ness_cache - PL_locale_utf8ness); + + /* But we can't save it if it is too large for the total space available */ + if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) { + Size_t utf8ness_cache_len = strlen(utf8ness_cache); + + /* Here it can fit, but we may need to clear out the oldest cached + * result(s) to do so. Check */ + if (utf8ness_cache_len + input_name_len_with_overhead + >= utf8ness_cache_size) + { + /* Here we have to clear something out to make room for this. + * Start looking at the rightmost place where it could fit and find + * the beginning of the entry that extends past that. */ + char * cutoff = (char *) my_memrchr(utf8ness_cache, + UTF8NESS_SEP[0], + utf8ness_cache_size + - input_name_len_with_overhead); + + assert(cutoff); + assert(cutoff >= utf8ness_cache); + + /* This and all subsequent entries must be removed */ + *cutoff = '\0'; + utf8ness_cache_len = strlen(utf8ness_cache); + } + + /* Make space for the new entry */ + Move(utf8ness_cache, + utf8ness_cache + input_name_len_with_overhead, + utf8ness_cache_len + 1 /* Incl. trailing NUL */, char); + + /* And insert it */ + Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char); + utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0'; + + if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1] + & (PERL_UINTMAX_T) ~1) != '0') + { + Perl_croak(aTHX_ + "panic: %s: %d: Corrupt utf8ness_cache=%s\nlen=%zu," + " inserted_name=%s, its_len=%zu\n", + __FILE__, __LINE__, + PL_locale_utf8ness, strlen(PL_locale_utf8ness), + delimited, input_name_len_with_overhead); + } + } + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST) { + const char * s = PL_locale_utf8ness; + + /* Audit the structure */ + while (s < PL_locale_utf8ness + strlen(PL_locale_utf8ness)) { + const char *e; + + if (*s != UTF8NESS_SEP[0]) { + Perl_croak(aTHX_ + "panic: %s: %d: Corrupt utf8ness_cache: missing" + " separator %.*s<-- HERE %s\n", + __FILE__, __LINE__, + (int) (s - PL_locale_utf8ness), PL_locale_utf8ness, + s); + } + s++; + e = strchr(s, UTF8NESS_PREFIX[0]); + if (! e) { + Perl_croak(aTHX_ + "panic: %s: %d: Corrupt utf8ness_cache: missing" + " separator %.*s<-- HERE %s\n", + __FILE__, __LINE__, + (int) (e - PL_locale_utf8ness), PL_locale_utf8ness, + e); + } + e++; + if (*e != '0' && *e != '1') { + Perl_croak(aTHX_ + "panic: %s: %d: Corrupt utf8ness_cache: utf8ness" + " must be [01] %.*s<-- HERE %s\n", + __FILE__, __LINE__, + (int) (e + 1 - PL_locale_utf8ness), + PL_locale_utf8ness, e + 1); + } + if (ninstr(PL_locale_utf8ness, s, s-1, e)) { + Perl_croak(aTHX_ + "panic: %s: %d: Corrupt utf8ness_cache: entry" + " has duplicate %.*s<-- HERE %s\n", + __FILE__, __LINE__, + (int) (e - PL_locale_utf8ness), PL_locale_utf8ness, + e); + } + s = e + 1; + } + } + + if (DEBUG_Lv_TEST || debug_initialization) { + + PerlIO_printf(Perl_debug_log, + "PL_locale_utf8ness is now %s; returning %d\n", + PL_locale_utf8ness, is_utf8); + } + +# endif + + /* free only when not using the buffer */ + if ( delimited != buffer ) Safefree(delimited); Safefree(save_input_locale); - return FALSE; + return is_utf8; } #endif - bool Perl__is_in_locale_category(pTHX_ const bool compiling, const int category) { @@ -3541,15 +5098,15 @@ Perl__is_in_locale_category(pTHX_ const bool compiling, const int category) const COP * const cop = (compiling) ? &PL_compiling : PL_curcop; - SV *categories = cop_hints_fetch_pvs(cop, "locale", 0); - if (! categories || categories == &PL_sv_placeholder) { + SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0); + if (! these_categories || these_categories == &PL_sv_placeholder) { return FALSE; } /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get * a valid unsigned */ assert(category >= -1); - return cBOOL(SvUV(categories) & (1U << (category + 1))); + return cBOOL(SvUV(these_categories) & (1U << (category + 1))); } char * @@ -3578,19 +5135,34 @@ Perl_my_strerror(pTHX_ const int errnum) const bool within_locale_scope = IN_LC(LC_MESSAGES); -# if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L) +# ifndef USE_ITHREADS - /* This function is trivial if we don't have to worry about thread safety - * and have strerror_l(), as it handles the switch of locales so we don't - * have to deal with that. We don't have to worry about thread safety if - * this is an unthreaded build, or if strerror_r() is also available. Both - * it and strerror_l() are thread-safe. Plain strerror() isn't thread - * safe. But on threaded builds when strerror_r() is available, the - * apparent call to strerror() below is actually a macro that - * behind-the-scenes calls strerror_r(). - */ + /* This function is trivial without threads. */ + if (within_locale_scope) { + errstr = savepv(strerror(errnum)); + } + else { + const char * save_locale = savepv(do_setlocale_c(LC_MESSAGES, NULL)); + + do_setlocale_c(LC_MESSAGES, "C"); + errstr = savepv(strerror(errnum)); + do_setlocale_c(LC_MESSAGES, save_locale); + Safefree(save_locale); + } + +# elif defined(HAS_POSIX_2008_LOCALE) \ + && defined(HAS_STRERROR_L) \ + && defined(HAS_DUPLOCALE) -# if ! defined(USE_ITHREADS) || defined(HAS_STRERROR_R) + /* This function is also trivial if we don't have to worry about thread + * safety and have strerror_l(), as it handles the switch of locales so we + * don't have to deal with that. We don't have to worry about thread + * safety if strerror_r() is also available. Both it and strerror_l() are + * thread-safe. Plain strerror() isn't thread safe. But on threaded + * builds when strerror_r() is available, the apparent call to strerror() + * below is actually a macro that behind-the-scenes calls strerror_r(). */ + +# ifdef HAS_STRERROR_R if (within_locale_scope) { errstr = savepv(strerror(errnum)); @@ -3628,48 +5200,23 @@ Perl_my_strerror(pTHX_ const int errnum) # endif # else /* Doesn't have strerror_l() */ -# ifdef USE_POSIX_2008_LOCALE - - locale_t save_locale = NULL; - -# else - const char * save_locale = NULL; bool locale_is_C = FALSE; - /* We have a critical section to prevent another thread from changing the - * locale out from under us (or zapping the buffer returned from - * setlocale() ) */ + /* We have a critical section to prevent another thread from executing this + * same code at the same time. (On thread-safe perls, the LOCK is a + * no-op.) Since this is the only place in core that changes LC_MESSAGES + * (unless the user has called setlocale(), this works to prevent races. */ LOCALE_LOCK; -# endif - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "my_strerror called with errnum %d\n", errnum)); if (! within_locale_scope) { - errno = 0; - -# ifdef USE_POSIX_2008_LOCALE /* Use the thread-safe locale functions */ - - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "Not within locale scope, about to call" - " uselocale(0x%p)\n", PL_C_locale_obj)); - save_locale = uselocale(PL_C_locale_obj); - if (! save_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "uselocale failed, errno=%d\n", errno)); - } - else { - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "uselocale returned 0x%p\n", save_locale)); - } - -# else /* Not thread-safe build */ - save_locale = do_setlocale_c(LC_MESSAGES, NULL); if (! save_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "setlocale failed, errno=%d\n", errno)); + Perl_croak(aTHX_ + "panic: %s: %d: Could not find current LC_MESSAGES locale," + " errno=%d\n", __FILE__, __LINE__, errno); } else { locale_is_C = isNAME_C_OR_POSIX(save_locale); @@ -3683,9 +5230,6 @@ Perl_my_strerror(pTHX_ const int errnum) do_setlocale_c(LC_MESSAGES, "C"); } } - -# endif - } /* end of ! within_locale_scope */ else { DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n", @@ -3697,25 +5241,11 @@ Perl_my_strerror(pTHX_ const int errnum) errstr = savepv(Strerror(errnum)); if (! within_locale_scope) { - errno = 0; - -# ifdef USE_POSIX_2008_LOCALE - - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "%s: %d: not within locale scope, restoring the locale\n", - __FILE__, __LINE__)); - if (save_locale && ! uselocale(save_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "uselocale restore failed, errno=%d\n", errno)); - } - } - -# else - if (save_locale && ! locale_is_C) { if (! do_setlocale_c(LC_MESSAGES, save_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "setlocale restore failed, errno=%d\n", errno)); + Perl_croak(aTHX_ + "panic: %s: %d: setlocale restore failed, errno=%d\n", + __FILE__, __LINE__, errno); } Safefree(save_locale); } @@ -3723,11 +5253,8 @@ Perl_my_strerror(pTHX_ const int errnum) LOCALE_UNLOCK; -# endif # endif /* End of doesn't have strerror_l */ -#endif /* End of does have locale messages */ - -#ifdef DEBUGGING +# ifdef DEBUGGING if (DEBUG_Lv_TEST) { PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '"); @@ -3735,7 +5262,8 @@ Perl_my_strerror(pTHX_ const int errnum) PerlIO_printf(Perl_debug_log, "'\n"); } -#endif +# endif +#endif /* End of does have locale messages */ SAVEFREEPV(errstr); return errstr; @@ -3743,48 +5271,200 @@ Perl_my_strerror(pTHX_ const int errnum) /* -=for apidoc sync_locale +=for apidoc switch_to_global_locale + +On systems without locale support, or on single-threaded builds, or on +platforms that do not support per-thread locale operations, this function does +nothing. On such systems that do have locale support, only a locale global to +the whole program is available. + +On multi-threaded builds on systems that do have per-thread locale operations, +this function converts the thread it is running in to use the global locale. +This is for code that has not yet or cannot be updated to handle multi-threaded +locale operation. As long as only a single thread is so-converted, everything +works fine, as all the other threads continue to ignore the global one, so only +this thread looks at it. + +However, on Windows systems this isn't quite true prior to Visual Studio 15, +at which point Microsoft fixed a bug. A race can occur if you use the +following operations on earlier Windows platforms: + +=over + +=item L -Changing the program's locale should be avoided by XS code. Nevertheless, -certain non-Perl libraries called from XS, such as C do so. When this -happens, Perl needs to be told that the locale has changed. Use this function -to do so, before returning to Perl. +=item L, items C and C + +=item L, items C and C + +=back + +The first item is not fixable (except by upgrading to a later Visual Studio +release), but it would be possible to work around the latter two items by using +the Windows API functions C and C; patches +welcome. + +Without this function call, threads that use the L> system +function will not work properly, as all the locale-sensitive functions will +look at the per-thread locale, and C will have no effect on this +thread. + +Perl code should convert to either call +L|perlapi/Perl_setlocale> (which is a drop-in for the system +C) or use the methods given in L to call +L|POSIX/setlocale>. Either one will transparently properly +handle all cases of single- vs multi-thread, POSIX 2008-supported or not. + +Non-Perl libraries, such as C, that call the system C can +continue to work if this function is called before transferring control to the +library. + +Upon return from the code that needs to use the global locale, +L|perlapi/sync_locale> should be called to restore the safe +multi-thread operation. =cut */ void -Perl_sync_locale(pTHX) +Perl_switch_to_global_locale() { - char * newlocale; -#ifdef USE_LOCALE_CTYPE +#ifdef USE_THREAD_SAFE_LOCALE +# ifdef WIN32 + + _configthreadlocale(_DISABLE_PER_THREAD_LOCALE); + +# else +# ifdef HAS_QUERYLOCALE + + setlocale(LC_ALL, querylocale(LC_ALL_MASK, uselocale((locale_t) 0))); + +# else + + { + unsigned int i; + + for (i = 0; i < LC_ALL_INDEX; i++) { + setlocale(categories[i], do_setlocale_r(categories[i], NULL)); + } + } + +# endif + + uselocale(LC_GLOBAL_LOCALE); + +# endif +#endif + +} + +/* + +=for apidoc sync_locale + +L|perlapi/Perl_setlocale> can be used at any time to query or +change the locale (though changing the locale is antisocial and dangerous on +multi-threaded systems that don't have multi-thread safe locale operations. +(See L). Using the system +L> should be avoided. Nevertheless, certain non-Perl libraries +called from XS, such as C do so, and this can't be changed. When the +locale is changed by XS code that didn't use +L|perlapi/Perl_setlocale>, Perl needs to be told that the +locale has changed. Use this function to do so, before returning to Perl. + +The return value is a boolean: TRUE if the global locale at the time of call +was in effect; and FALSE if a per-thread locale was in effect. This can be +used by the caller that needs to restore things as-they-were to decide whether +or not to call +L|perlapi/switch_to_global_locale>. + +=cut +*/ + +bool +Perl_sync_locale() +{ + +#ifndef USE_LOCALE + + return TRUE; + +#else + + const char * newlocale; + dTHX; + +# ifdef USE_POSIX_2008_LOCALE + + bool was_in_global_locale = FALSE; + locale_t cur_obj = uselocale((locale_t) 0); + + /* On Windows, unless the foreign code has turned off the thread-safe + * locale setting, any plain setlocale() will have affected what we see, so + * no need to worry. Otherwise, If the foreign code has done a plain + * setlocale(), it will only affect the global locale on POSIX systems, but + * will affect the */ + if (cur_obj == LC_GLOBAL_LOCALE) { + +# ifdef HAS_QUERY_LOCALE + + do_setlocale_c(LC_ALL, setlocale(LC_ALL, NULL)); + +# else + + unsigned int i; + + /* We can't trust that we can read the LC_ALL format on the + * platform, so do them individually */ + for (i = 0; i < LC_ALL_INDEX; i++) { + do_setlocale_r(categories[i], setlocale(categories[i], NULL)); + } + +# endif + + was_in_global_locale = TRUE; + } + +# else - newlocale = do_setlocale_c(LC_CTYPE, NULL); + bool was_in_global_locale = TRUE; + +# endif +# ifdef USE_LOCALE_CTYPE + + newlocale = savepv(do_setlocale_c(LC_CTYPE, NULL)); DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, setlocale_debug_string(LC_CTYPE, NULL, newlocale))); new_ctype(newlocale); + Safefree(newlocale); -#endif /* USE_LOCALE_CTYPE */ -#ifdef USE_LOCALE_COLLATE +# endif /* USE_LOCALE_CTYPE */ +# ifdef USE_LOCALE_COLLATE - newlocale = do_setlocale_c(LC_COLLATE, NULL); + newlocale = savepv(do_setlocale_c(LC_COLLATE, NULL)); DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, setlocale_debug_string(LC_COLLATE, NULL, newlocale))); new_collate(newlocale); + Safefree(newlocale); -#endif -#ifdef USE_LOCALE_NUMERIC +# endif +# ifdef USE_LOCALE_NUMERIC - newlocale = do_setlocale_c(LC_NUMERIC, NULL); + newlocale = savepv(do_setlocale_c(LC_NUMERIC, NULL)); DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, setlocale_debug_string(LC_NUMERIC, NULL, newlocale))); new_numeric(newlocale); + Safefree(newlocale); -#endif /* USE_LOCALE_NUMERIC */ +# endif /* USE_LOCALE_NUMERIC */ + + return was_in_global_locale; + +#endif } @@ -3806,7 +5486,7 @@ S_setlocale_debug_string(const int category, /* category number, /* initialise to a non-null value to keep it out of BSS and so keep * -DPERL_GLOBAL_STRUCT_PRIVATE happy */ - static char ret[128] = "If you can read this, thank your buggy C" + static char ret[256] = "If you can read this, thank your buggy C" " library strlcpy(), and change your hints file" " to undef it"; @@ -3841,6 +5521,58 @@ S_setlocale_debug_string(const int category, /* category number, #endif +void +Perl_thread_locale_init() +{ + /* Called from a thread on startup*/ + +#ifdef USE_THREAD_SAFE_LOCALE + + dTHX_DEBUGGING; + + /* C starts the new thread in the global C locale. If we are thread-safe, + * we want to not be in the global locale */ + + DEBUG_L(PerlIO_printf(Perl_debug_log, + "%s:%d: new thread, initial locale is %s; calling setlocale\n", + __FILE__, __LINE__, setlocale(LC_ALL, NULL))); + +# ifdef WIN32 + + _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); + +# else + + Perl_setlocale(LC_ALL, "C"); + +# endif +#endif + +} + +void +Perl_thread_locale_term() +{ + /* Called from a thread as it gets ready to terminate */ + +#ifdef USE_THREAD_SAFE_LOCALE + + /* C starts the new thread in the global C locale. If we are thread-safe, + * we want to not be in the global locale */ + +# ifndef WIN32 + + { /* Free up */ + locale_t cur_obj = uselocale(LC_GLOBAL_LOCALE); + if (cur_obj != LC_GLOBAL_LOCALE) { + freelocale(cur_obj); + } + } + +# endif +#endif + +} /* * ex: set ts=8 sts=4 sw=4 et: