X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/892e646556f1997b0384dc508d3223c2c2a7be74..36c2b2aa3853cfaf4237aec66eccdc6faaf9fe58:/locale.c diff --git a/locale.c b/locale.c index 36522ee..e7348e1 100644 --- a/locale.c +++ b/locale.c @@ -32,87 +32,1239 @@ * the desired behavior of those functions at the moment. And, LC_MESSAGES is * switched to the C locale for outputting the message unless within the scope * of 'use locale'. + * + * This code now has multi-thread-safe locale handling on systems that support + * that. This is completely transparent to most XS code. On earlier systems, + * it would be possible to emulate thread-safe locales, but this likely would + * involve a lot of locale switching, and would require XS code changes. + * Macros could be written so that the code wouldn't have to know which type of + * system is being used. It's unlikely that we would ever do that, since most + * modern systems support thread-safe locales, but there was code written to + * this end, and is retained, #ifdef'd out. */ -#include "EXTERN.h" -#define PERL_IN_LOCALE_C -#include "perl_langinfo.h" -#include "perl.h" +#include "EXTERN.h" +#define PERL_IN_LOCALE_C +#include "perl_langinfo.h" +#include "perl.h" + +#include "reentr.h" + +#ifdef I_WCHAR +# include +#endif + +/* If the environment says to, we can output debugging information during + * initialization. This is done before option parsing, and before any thread + * creation, so can be a file-level static */ +#if ! defined(DEBUGGING) || defined(PERL_GLOBAL_STRUCT) +# define debug_initialization 0 +# define DEBUG_INITIALIZATION_set(v) +#else +static bool debug_initialization = FALSE; +# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v) +#endif + + +/* Returns the Unix errno portion; ignoring any others. This is a macro here + * instead of putting it into perl.h, because unclear to khw what should be + * done generally. */ +#define GET_ERRNO saved_errno + +/* strlen() of a literal string constant. We might want this more general, + * but using it in just this file for now. A problem with more generality is + * the compiler warnings about comparing unlike signs */ +#define STRLENs(s) (sizeof("" s "") - 1) + +/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the + * return of setlocale(), then this is extremely likely to be the C or POSIX + * locale. However, the output of setlocale() is documented to be opaque, but + * the odds are extremely small that it would return these two strings for some + * other locale. Note that VMS in these two locales includes many non-ASCII + * characters as controls and punctuation (below are hex bytes): + * cntrl: 84-97 9B-9F + * punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD + * Oddly, none there are listed as alphas, though some represent alphabetics + * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */ +#define isNAME_C_OR_POSIX(name) \ + ( (name) != NULL \ + && (( *(name) == 'C' && (*(name + 1)) == '\0') \ + || strEQ((name), "POSIX"))) + +#ifdef USE_LOCALE + +/* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far + * looked up. This is in the form of a C string: */ + +#define UTF8NESS_SEP "\v" +#define UTF8NESS_PREFIX "\f" + +/* So, the string looks like: + * + * \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0 + * + * where the digit 0 after the \a indicates that the locale starting just + * after the preceding \v is not UTF-8, and the digit 1 mean it is. */ + +STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1); +STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1); + +#define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \ + UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0" + +/* The cache is initialized to C_and_POSIX_utf8ness at start up. These are + * kept there always. The remining portion of the cache is LRU, with the + * oldest looked-up locale at the tail end */ + +STATIC char * +S_stdize_locale(pTHX_ char *locs) +{ + /* Standardize the locale name from a string returned by 'setlocale', + * possibly modifying that string. + * + * The typical return value of setlocale() is either + * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL + * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL + * (the space-separated values represent the various sublocales, + * in some unspecified order). This is not handled by this function. + * + * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n", + * which is harmful for further use of the string in setlocale(). This + * function removes the trailing new line and everything up through the '=' + * */ + + const char * const s = strchr(locs, '='); + bool okay = TRUE; + + PERL_ARGS_ASSERT_STDIZE_LOCALE; + + if (s) { + const char * const t = strchr(s, '.'); + okay = FALSE; + if (t) { + const char * const u = strchr(t, '\n'); + if (u && (u[1] == 0)) { + const STRLEN len = u - s; + Move(s + 1, locs, len, char); + locs[len] = 0; + okay = TRUE; + } + } + } + + if (!okay) + Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs); + + return locs; +} + +/* Two parallel arrays; first the locale categories Perl uses on this system; + * the second array is their names. These arrays are in mostly arbitrary + * order. */ + +const int categories[] = { + +# ifdef USE_LOCALE_NUMERIC + LC_NUMERIC, +# endif +# ifdef USE_LOCALE_CTYPE + LC_CTYPE, +# endif +# ifdef USE_LOCALE_COLLATE + LC_COLLATE, +# endif +# ifdef USE_LOCALE_TIME + LC_TIME, +# endif +# ifdef USE_LOCALE_MESSAGES + LC_MESSAGES, +# endif +# ifdef USE_LOCALE_MONETARY + LC_MONETARY, +# endif +# ifdef USE_LOCALE_ADDRESS + LC_ADDRESS, +# endif +# ifdef USE_LOCALE_IDENTIFICATION + LC_IDENTIFICATION, +# endif +# ifdef USE_LOCALE_MEASUREMENT + LC_MEASUREMENT, +# endif +# ifdef USE_LOCALE_PAPER + LC_PAPER, +# endif +# ifdef USE_LOCALE_TELEPHONE + LC_TELEPHONE, +# endif +# ifdef LC_ALL + LC_ALL, +# endif + -1 /* Placeholder because C doesn't allow a + trailing comma, and it would get complicated + with all the #ifdef's */ +}; + +/* The top-most real element is LC_ALL */ + +const char * category_names[] = { + +# ifdef USE_LOCALE_NUMERIC + "LC_NUMERIC", +# endif +# ifdef USE_LOCALE_CTYPE + "LC_CTYPE", +# endif +# ifdef USE_LOCALE_COLLATE + "LC_COLLATE", +# endif +# ifdef USE_LOCALE_TIME + "LC_TIME", +# endif +# ifdef USE_LOCALE_MESSAGES + "LC_MESSAGES", +# endif +# ifdef USE_LOCALE_MONETARY + "LC_MONETARY", +# endif +# ifdef USE_LOCALE_ADDRESS + "LC_ADDRESS", +# endif +# ifdef USE_LOCALE_IDENTIFICATION + "LC_IDENTIFICATION", +# endif +# ifdef USE_LOCALE_MEASUREMENT + "LC_MEASUREMENT", +# endif +# ifdef USE_LOCALE_PAPER + "LC_PAPER", +# endif +# ifdef USE_LOCALE_TELEPHONE + "LC_TELEPHONE", +# endif +# ifdef LC_ALL + "LC_ALL", +# endif + NULL /* Placeholder */ + }; + +# ifdef LC_ALL + + /* On systems with LC_ALL, it is kept in the highest index position. (-2 + * to account for the final unused placeholder element.) */ +# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2) + +# else + + /* On systems without LC_ALL, we pretend it is there, one beyond the real + * top element, hence in the unused placeholder element. */ +# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1) + +# endif + +/* Pretending there is an LC_ALL element just above allows us to avoid most + * special cases. Most loops through these arrays in the code below are + * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'. They will work + * on either type of system. But the code must be written to not access the + * element at 'LC_ALL_INDEX' except on platforms that have it. This can be + * checked for at compile time by using the #define LC_ALL_INDEX which is only + * defined if we do have LC_ALL. */ + +STATIC const char * +S_category_name(const int category) +{ + unsigned int i; + +#ifdef LC_ALL + + if (category == LC_ALL) { + return "LC_ALL"; + } + +#endif + + for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + if (category == categories[i]) { + return category_names[i]; + } + } + + { + const char suffix[] = " (unknown)"; + int temp = category; + Size_t length = sizeof(suffix) + 1; + char * unknown; + dTHX; + + if (temp < 0) { + length++; + temp = - temp; + } + + /* Calculate the number of digits */ + while (temp >= 10) { + temp /= 10; + length++; + } + + Newx(unknown, length, char); + my_snprintf(unknown, length, "%d%s", category, suffix); + SAVEFREEPV(unknown); + return unknown; + } +} + +/* Now create LC_foo_INDEX #defines for just those categories on this system */ +# ifdef USE_LOCALE_NUMERIC +# define LC_NUMERIC_INDEX 0 +# define _DUMMY_NUMERIC LC_NUMERIC_INDEX +# else +# define _DUMMY_NUMERIC -1 +# endif +# ifdef USE_LOCALE_CTYPE +# define LC_CTYPE_INDEX _DUMMY_NUMERIC + 1 +# define _DUMMY_CTYPE LC_CTYPE_INDEX +# else +# define _DUMMY_CTYPE _DUMMY_NUMERIC +# endif +# ifdef USE_LOCALE_COLLATE +# define LC_COLLATE_INDEX _DUMMY_CTYPE + 1 +# define _DUMMY_COLLATE LC_COLLATE_INDEX +# else +# define _DUMMY_COLLATE _DUMMY_CTYPE +# endif +# ifdef USE_LOCALE_TIME +# define LC_TIME_INDEX _DUMMY_COLLATE + 1 +# define _DUMMY_TIME LC_TIME_INDEX +# else +# define _DUMMY_TIME _DUMMY_COLLATE +# endif +# ifdef USE_LOCALE_MESSAGES +# define LC_MESSAGES_INDEX _DUMMY_TIME + 1 +# define _DUMMY_MESSAGES LC_MESSAGES_INDEX +# else +# define _DUMMY_MESSAGES _DUMMY_TIME +# endif +# ifdef USE_LOCALE_MONETARY +# define LC_MONETARY_INDEX _DUMMY_MESSAGES + 1 +# define _DUMMY_MONETARY LC_MONETARY_INDEX +# else +# define _DUMMY_MONETARY _DUMMY_MESSAGES +# endif +# ifdef USE_LOCALE_ADDRESS +# define LC_ADDRESS_INDEX _DUMMY_MONETARY + 1 +# define _DUMMY_ADDRESS LC_ADDRESS_INDEX +# else +# define _DUMMY_ADDRESS _DUMMY_MONETARY +# endif +# ifdef USE_LOCALE_IDENTIFICATION +# define LC_IDENTIFICATION_INDEX _DUMMY_ADDRESS + 1 +# define _DUMMY_IDENTIFICATION LC_IDENTIFICATION_INDEX +# else +# define _DUMMY_IDENTIFICATION _DUMMY_ADDRESS +# endif +# ifdef USE_LOCALE_MEASUREMENT +# define LC_MEASUREMENT_INDEX _DUMMY_IDENTIFICATION + 1 +# define _DUMMY_MEASUREMENT LC_MEASUREMENT_INDEX +# else +# define _DUMMY_MEASUREMENT _DUMMY_IDENTIFICATION +# endif +# ifdef USE_LOCALE_PAPER +# define LC_PAPER_INDEX _DUMMY_MEASUREMENT + 1 +# define _DUMMY_PAPER LC_PAPER_INDEX +# else +# define _DUMMY_PAPER _DUMMY_MEASUREMENT +# endif +# ifdef USE_LOCALE_TELEPHONE +# define LC_TELEPHONE_INDEX _DUMMY_PAPER + 1 +# define _DUMMY_TELEPHONE LC_TELEPHONE_INDEX +# else +# define _DUMMY_TELEPHONE _DUMMY_PAPER +# endif +# ifdef LC_ALL +# define LC_ALL_INDEX _DUMMY_TELEPHONE + 1 +# endif +#endif /* ifdef USE_LOCALE */ + +/* Windows requres a customized base-level setlocale() */ +#ifdef WIN32 +# define my_setlocale(cat, locale) win32_setlocale(cat, locale) +#else +# define my_setlocale(cat, locale) setlocale(cat, locale) +#endif + +#ifndef USE_POSIX_2008_LOCALE + +/* "do_setlocale_c" is intended to be called when the category is a constant + * known at compile time; "do_setlocale_r", not known until run time */ +# define do_setlocale_c(cat, locale) my_setlocale(cat, locale) +# define do_setlocale_r(cat, locale) my_setlocale(cat, locale) + +#else /* Below uses POSIX 2008 */ + +/* We emulate setlocale with our own function. LC_foo is not valid for the + * POSIX 2008 functions. Instead LC_foo_MASK is used, which we use an array + * lookup to convert to. At compile time we have defined LC_foo_INDEX as the + * proper offset into the array 'category_masks[]'. At runtime, we have to + * search through the array (as the actual numbers may not be small contiguous + * positive integers which would lend themselves to array lookup). */ +# define do_setlocale_c(cat, locale) \ + emulate_setlocale(cat, locale, cat ## _INDEX, TRUE) +# define do_setlocale_r(cat, locale) emulate_setlocale(cat, locale, 0, FALSE) + +/* A third array, parallel to the ones above to map from category to its + * equivalent mask */ +const int category_masks[] = { +# ifdef USE_LOCALE_NUMERIC + LC_NUMERIC_MASK, +# endif +# ifdef USE_LOCALE_CTYPE + LC_CTYPE_MASK, +# endif +# ifdef USE_LOCALE_COLLATE + LC_COLLATE_MASK, +# endif +# ifdef USE_LOCALE_TIME + LC_TIME_MASK, +# endif +# ifdef USE_LOCALE_MESSAGES + LC_MESSAGES_MASK, +# endif +# ifdef USE_LOCALE_MONETARY + LC_MONETARY_MASK, +# endif +# ifdef USE_LOCALE_ADDRESS + LC_ADDRESS_MASK, +# endif +# ifdef USE_LOCALE_IDENTIFICATION + LC_IDENTIFICATION_MASK, +# endif +# ifdef USE_LOCALE_MEASUREMENT + LC_MEASUREMENT_MASK, +# endif +# ifdef USE_LOCALE_PAPER + LC_PAPER_MASK, +# endif +# ifdef USE_LOCALE_TELEPHONE + LC_TELEPHONE_MASK, +# endif + /* LC_ALL can't be turned off by a Configure + * option, and in Posix 2008, should always be + * here, so compile it in unconditionally. + * This could catch some glitches at compile + * time */ + LC_ALL_MASK + }; + +STATIC const char * +S_emulate_setlocale(const int category, + const char * locale, + unsigned int index, + const bool is_index_valid + ) +{ + /* This function effectively performs a setlocale() on just the current + * thread; thus it is thread-safe. It does this by using the POSIX 2008 + * locale functions to emulate the behavior of setlocale(). Similar to + * regular setlocale(), the return from this function points to memory that + * can be overwritten by other system calls, so needs to be copied + * immediately if you need to retain it. The difference here is that + * system calls besides another setlocale() can overwrite it. + * + * By doing this, most locale-sensitive functions become thread-safe. The + * exceptions are mostly those that return a pointer to static memory. + * + * This function takes the same parameters, 'category' and 'locale', that + * the regular setlocale() function does, but it also takes two additional + * ones. This is because the 2008 functions don't use a category; instead + * they use a corresponding mask. Because this function operates in both + * worlds, it may need one or the other or both. This function can + * calculate the mask from the input category, but to avoid this + * calculation, if the caller knows at compile time what the mask is, it + * can pass it, setting 'is_index_valid' to TRUE; otherwise the mask + * parameter is ignored. + * + * POSIX 2008, for some sick reason, chose not to provide a method to find + * the category name of a locale. Some vendors have created a + * querylocale() function to do just that. This function is a lot simpler + * to implement on systems that have this. Otherwise, we have to keep + * track of what the locale has been set to, so that we can return its + * name to emulate setlocale(). It's also possible for C code in some + * library to change the locale without us knowing it, though as of + * September 2017, there are no occurrences in CPAN of uselocale(). Some + * libraries do use setlocale(), but that changes the global locale, and + * threads using per-thread locales will just ignore those changes. + * Another problem is that without querylocale(), we have to guess at what + * was meant by setting a locale of "". We handle this by not actually + * ever setting to "" (unless querylocale exists), but to emulate what we + * think should happen for "". + */ + + int mask; + locale_t old_obj; + locale_t new_obj; + dTHX; + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale input=%d (%s), \"%s\", %d, %d\n", __FILE__, __LINE__, category, category_name(category), locale, index, is_index_valid); + } + +# endif + + /* If the input mask might be incorrect, calculate the correct one */ + if (! is_index_valid) { + unsigned int i; + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: finding index of category %d (%s)\n", __FILE__, __LINE__, category, category_name(category)); + } + +# endif + + for (i = 0; i <= LC_ALL_INDEX; i++) { + if (category == categories[i]) { + index = i; + goto found_index; + } + } + + /* Here, we don't know about this category, so can't handle it. + * Fallback to the early POSIX usages */ + Perl_warner(aTHX_ packWARN(WARN_LOCALE), + "Unknown locale category %d; can't set it to %s\n", + category, locale); + return NULL; + + found_index: ; + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: index is %d for %s\n", __FILE__, __LINE__, index, category_name(category)); + } + +# endif + + } + + mask = category_masks[index]; + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: category name is %s; mask is 0x%x\n", __FILE__, __LINE__, category_names[index], mask); + } + +# endif + + /* If just querying what the existing locale is ... */ + if (locale == NULL) { + locale_t cur_obj = uselocale((locale_t) 0); + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale querying %p\n", __FILE__, __LINE__, cur_obj); + } + +# endif + + if (cur_obj == LC_GLOBAL_LOCALE) { + return my_setlocale(category, NULL); + } + +# ifdef HAS_QUERYLOCALE + + return (char *) querylocale(mask, cur_obj); + +# else + + /* If this assert fails, adjust the size of curlocales in intrpvar.h */ + STATIC_ASSERT_STMT(C_ARRAY_LENGTH(PL_curlocales) > LC_ALL_INDEX); + +# if defined(_NL_LOCALE_NAME) && defined(DEBUGGING) + + { + /* Internal glibc for querylocale(), but doesn't handle + * empty-string ("") locale properly; who knows what other + * glitches. Check it for now, under debug. */ + + char * temp_name = nl_langinfo_l(_NL_LOCALE_NAME(category), + uselocale((locale_t) 0)); + /* + PerlIO_printf(Perl_debug_log, "%s:%d: temp_name=%s\n", __FILE__, __LINE__, temp_name ? temp_name : "NULL"); + PerlIO_printf(Perl_debug_log, "%s:%d: index=%d\n", __FILE__, __LINE__, index); + PerlIO_printf(Perl_debug_log, "%s:%d: PL_curlocales[index]=%s\n", __FILE__, __LINE__, PL_curlocales[index]); + */ + if (temp_name && PL_curlocales[index] && strNE(temp_name, "")) { + if ( strNE(PL_curlocales[index], temp_name) + && ! ( isNAME_C_OR_POSIX(temp_name) + && isNAME_C_OR_POSIX(PL_curlocales[index]))) { + +# ifdef USE_C_BACKTRACE + + dump_c_backtrace(Perl_debug_log, 20, 1); + +# endif + + Perl_croak(aTHX_ "panic: Mismatch between what Perl thinks %s is" + " (%s) and what internal glibc thinks" + " (%s)\n", category_names[index], + PL_curlocales[index], temp_name); + } + + return temp_name; + } + } + +# endif + + /* Without querylocale(), we have to use our record-keeping we've + * done. */ + + if (category != LC_ALL) { + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, PL_curlocales[index]); + } + +# endif + + return PL_curlocales[index]; + } + else { /* For LC_ALL */ + unsigned int i; + Size_t names_len = 0; + char * all_string; + bool are_all_categories_the_same_locale = TRUE; + + /* If we have a valid LC_ALL value, just return it */ + if (PL_curlocales[LC_ALL_INDEX]) { + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, PL_curlocales[LC_ALL_INDEX]); + } + +# endif + + return PL_curlocales[LC_ALL_INDEX]; + } + + /* Otherwise, we need to construct a string of name=value pairs. + * We use the glibc syntax, like + * LC_NUMERIC=C;LC_TIME=en_US.UTF-8;... + * First calculate the needed size. Along the way, check if all + * the locale names are the same */ + for (i = 0; i < LC_ALL_INDEX; i++) { + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale i=%d, name=%s, locale=%s\n", __FILE__, __LINE__, i, category_names[i], PL_curlocales[i]); + } + +# endif + + names_len += strlen(category_names[i]) + + 1 /* '=' */ + + strlen(PL_curlocales[i]) + + 1; /* ';' */ + + if (i > 0 && strNE(PL_curlocales[i], PL_curlocales[i-1])) { + are_all_categories_the_same_locale = FALSE; + } + } + + /* If they are the same, we don't actually have to construct the + * string; we just make the entry in LC_ALL_INDEX valid, and be + * that single name */ + if (are_all_categories_the_same_locale) { + PL_curlocales[LC_ALL_INDEX] = savepv(PL_curlocales[0]); + return PL_curlocales[LC_ALL_INDEX]; + } + + names_len++; /* Trailing '\0' */ + SAVEFREEPV(Newx(all_string, names_len, char)); + *all_string = '\0'; + + /* Then fill in the string */ + for (i = 0; i < LC_ALL_INDEX; i++) { + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale i=%d, name=%s, locale=%s\n", __FILE__, __LINE__, i, category_names[i], PL_curlocales[i]); + } + +# endif + + my_strlcat(all_string, category_names[i], names_len); + my_strlcat(all_string, "=", names_len); + my_strlcat(all_string, PL_curlocales[i], names_len); + my_strlcat(all_string, ";", names_len); + } + +# ifdef DEBUGGING + + if (DEBUG_L_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, all_string); + } + + #endif + + return all_string; + } + +# ifdef EINVAL + + SETERRNO(EINVAL, LIB_INVARG); + +# endif + + return NULL; + +# endif + + } + + /* Here, we are switching locales. */ + +# ifndef HAS_QUERYLOCALE + + if (strEQ(locale, "")) { + + /* For non-querylocale() systems, we do the setting of "" ourselves to + * be sure that we really know what's going on. We follow the Linux + * documented behavior (but if that differs from the actual behavior, + * this won't work exactly as the OS implements). We go out and + * examine the environment based on our understanding of how the system + * works, and use that to figure things out */ + + const char * const lc_all = PerlEnv_getenv("LC_ALL"); + + /* Use any "LC_ALL" environment variable, as it overrides everything + * else. */ + if (lc_all && strNE(lc_all, "")) { + locale = lc_all; + } + else { + + /* Otherwise, we need to dig deeper. Unless overridden, the + * default is the LANG environment variable; if it doesn't exist, + * then "C" */ + + const char * default_name; + + /* To minimize other threads messing with the environment, we copy + * the variable, making it a temporary. But this doesn't work upon + * program initialization before any scopes are created, and at + * this time, there's nothing else going on that would interfere. + * So skip the copy in that case */ + if (PL_scopestack_ix == 0) { + default_name = PerlEnv_getenv("LANG"); + } + else { + default_name = savepv(PerlEnv_getenv("LANG")); + } + + if (! default_name || strEQ(default_name, "")) { + default_name = "C"; + } + else if (PL_scopestack_ix != 0) { + SAVEFREEPV(default_name); + } + + if (category != LC_ALL) { + const char * const name = PerlEnv_getenv(category_names[index]); + + /* Here we are setting a single category. Assume will have the + * default name */ + locale = default_name; + + /* But then look for an overriding environment variable */ + if (name && strNE(name, "")) { + locale = name; + } + } + else { + bool did_override = FALSE; + unsigned int i; + + /* Here, we are getting LC_ALL. Any categories that don't have + * a corresponding environment variable set should be set to + * LANG, or to "C" if there is no LANG. If no individual + * categories differ from this, we can just set LC_ALL. This + * is buggy on systems that have extra categories that we don't + * know about. If there is an environment variable that sets + * that category, we won't know to look for it, and so our use + * of LANG or "C" improperly overrides it. On the other hand, + * if we don't do what is done here, and there is no + * environment variable, the category's locale should be set to + * LANG or "C". So there is no good solution. khw thinks the + * best is to look at systems to see what categories they have, + * and include them, and then to assume that we know the + * complete set */ + + for (i = 0; i < LC_ALL_INDEX; i++) { + const char * const env_override + = savepv(PerlEnv_getenv(category_names[i])); + const char * this_locale = ( env_override + && strNE(env_override, "")) + ? env_override + : default_name; + if (! emulate_setlocale(categories[i], this_locale, i, TRUE)) + { + Safefree(env_override); + return NULL; + } + + if (strNE(this_locale, default_name)) { + did_override = TRUE; + } + + Safefree(env_override); + } + + /* If all the categories are the same, we can set LC_ALL to + * that */ + if (! did_override) { + locale = default_name; + } + else { + + /* Here, LC_ALL is no longer valid, as some individual + * categories don't match it. We call ourselves + * recursively, as that will execute the code that + * generates the proper locale string for this situation. + * We don't do the remainder of this function, as that is + * to update our records, and we've just done that for the + * individual categories in the loop above, and doing so + * would cause LC_ALL to be done as well */ + return emulate_setlocale(LC_ALL, NULL, LC_ALL_INDEX, TRUE); + } + } + } + } + else if (strchr(locale, ';')) { + + /* LC_ALL may actually incude a conglomeration of various categories. + * Without querylocale, this code uses the glibc (as of this writing) + * syntax for representing that, but that is not a stable API, and + * other platforms do it differently, so we have to handle all cases + * ourselves */ + + unsigned int i; + const char * s = locale; + const char * e = locale + strlen(locale); + const char * p = s; + const char * category_end; + const char * name_start; + const char * name_end; + + /* If the string that gives what to set doesn't include all categories, + * the omitted ones get set to "C". To get this behavior, first set + * all the individual categories to "C", and override the furnished + * ones below */ + for (i = 0; i < LC_ALL_INDEX; i++) { + if (! emulate_setlocale(categories[i], "C", i, TRUE)) { + return NULL; + } + } + + while (s < e) { + + /* Parse through the category */ + while (isWORDCHAR(*p)) { + p++; + } + category_end = p; + + if (*p++ != '=') { + Perl_croak(aTHX_ + "panic: %s: %d: Unexpected character in locale name '%02X", + __FILE__, __LINE__, *(p-1)); + } + + /* Parse through the locale name */ + name_start = p; + while (p < e && *p != ';') { + if (! isGRAPH(*p)) { + Perl_croak(aTHX_ + "panic: %s: %d: Unexpected character in locale name '%02X", + __FILE__, __LINE__, *(p-1)); + } + p++; + } + name_end = p; + + /* Space past the semi-colon */ + if (p < e) { + p++; + } + + /* Find the index of the category name in our lists */ + for (i = 0; i < LC_ALL_INDEX; i++) { + char * individ_locale; + + /* Keep going if this isn't the index. The strnNE() avoids a + * Perl_form(), but would fail if ever a category name could be + * a substring of another one, like if there were a + * "LC_TIME_DATE" */ + if strnNE(s, category_names[i], category_end - s) { + continue; + } + + /* If this index is for the single category we're changing, we + * have found the locale to set it to. */ + if (category == categories[i]) { + locale = Perl_form(aTHX_ "%.*s", + (int) (name_end - name_start), + name_start); + goto ready_to_set; + } + + assert(category == LC_ALL); + individ_locale = Perl_form(aTHX_ "%.*s", + (int) (name_end - name_start), name_start); + if (! emulate_setlocale(categories[i], individ_locale, i, TRUE)) + { + return NULL; + } + } + + s = p; + } + + /* Here we have set all the individual categories by recursive calls. + * These collectively should have fixed up LC_ALL, so can just query + * what that now is */ + assert(category == LC_ALL); + + return do_setlocale_c(LC_ALL, NULL); + } + + ready_to_set: ; + + /* Here at the end of having to deal with the absence of querylocale(). + * Some cases have already been fully handled by recursive calls to this + * function. But at this point, we haven't dealt with those, but are now + * prepared to, knowing what the locale name to set this category to is. + * This would have come for free if this system had had querylocale() */ + +# endif /* end of ! querylocale */ + + assert(PL_C_locale_obj); + + /* Switching locales generally entails freeing the current one's space (at + * the C library's discretion). We need to stop using that locale before + * the switch. So switch to a known locale object that we don't otherwise + * mess with. This returns the locale object in effect at the time of the + * switch. */ + old_obj = uselocale(PL_C_locale_obj); + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale was using %p\n", __FILE__, __LINE__, old_obj); + } + +# endif + + if (! old_obj) { + +# ifdef DEBUGGING + + if (DEBUG_L_TEST || debug_initialization) { + dSAVE_ERRNO; + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale switching to C failed: %d\n", __FILE__, __LINE__, GET_ERRNO); + RESTORE_ERRNO; + } + +# endif + + return NULL; + } + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale now using %p\n", __FILE__, __LINE__, PL_C_locale_obj); + } + +# endif + + /* If we weren't in a thread safe locale, set so that newlocale() below + which uses 'old_obj', uses an empty one. Same for our reserved C object. + The latter is defensive coding, so that, even if there is some bug, we + will never end up trying to modify either of these, as if passed to + newlocale(), they can be. */ + if (old_obj == LC_GLOBAL_LOCALE || old_obj == PL_C_locale_obj) { + old_obj = (locale_t) 0; + } + + /* Ready to create a new locale by modification of the exising one */ + new_obj = newlocale(mask, locale, old_obj); -#include "reentr.h" + if (! new_obj) { + dSAVE_ERRNO; + +# ifdef DEBUGGING + + if (DEBUG_L_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale creating new object failed: %d\n", __FILE__, __LINE__, GET_ERRNO); + } + +# endif + + if (! uselocale(old_obj)) { + +# ifdef DEBUGGING + + if (DEBUG_L_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: switching back failed: %d\n", __FILE__, __LINE__, GET_ERRNO); + } + +# endif + + } + RESTORE_ERRNO; + return NULL; + } + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale created %p; should have freed %p\n", __FILE__, __LINE__, new_obj, old_obj); + } + +# endif + + /* And switch into it */ + if (! uselocale(new_obj)) { + dSAVE_ERRNO; + +# ifdef DEBUGGING + + if (DEBUG_L_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale switching to new object failed\n", __FILE__, __LINE__); + } + +# endif + + if (! uselocale(old_obj)) { + +# ifdef DEBUGGING + + if (DEBUG_L_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: switching back failed: %d\n", __FILE__, __LINE__, GET_ERRNO); + } + +# endif + + } + freelocale(new_obj); + RESTORE_ERRNO; + return NULL; + } + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale now using %p\n", __FILE__, __LINE__, new_obj); + } + +# endif + + /* We are done, except for updating our records (if the system doesn't keep + * them) and in the case of locale "", we don't actually know what the + * locale that got switched to is, as it came from the environment. So + * have to find it */ + +# ifdef HAS_QUERYLOCALE + + if (strEQ(locale, "")) { + locale = querylocale(mask, new_obj); + } -/* If the environment says to, we can output debugging information during - * initialization. This is done before option parsing, and before any thread - * creation, so can be a file-level static */ -#ifdef DEBUGGING -# ifdef PERL_GLOBAL_STRUCT - /* no global syms allowed */ -# define debug_initialization 0 -# define DEBUG_INITIALIZATION_set(v) # else -static bool debug_initialization = FALSE; -# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v) + + /* Here, 'locale' is the return value */ + + /* Without querylocale(), we have to update our records */ + + if (category == LC_ALL) { + unsigned int i; + + /* For LC_ALL, we change all individual categories to correspond */ + /* PL_curlocales is a parallel array, so has same + * length as 'categories' */ + for (i = 0; i <= LC_ALL_INDEX; i++) { + Safefree(PL_curlocales[i]); + PL_curlocales[i] = savepv(locale); + } + } + else { + + /* For a single category, if it's not the same as the one in LC_ALL, we + * nullify LC_ALL */ + + if (PL_curlocales[LC_ALL_INDEX] && strNE(PL_curlocales[LC_ALL_INDEX], locale)) { + Safefree(PL_curlocales[LC_ALL_INDEX]); + PL_curlocales[LC_ALL_INDEX] = NULL; + } + + /* Then update the category's record */ + Safefree(PL_curlocales[index]); + PL_curlocales[index] = savepv(locale); + } + # endif -#endif -#ifdef USE_LOCALE + return locale; +} -/* - * Standardize the locale name from a string returned by 'setlocale', possibly - * modifying that string. +#endif /* USE_POSIX_2008_LOCALE */ + +#if 0 /* Code that was to emulate thread-safe locales on platforms that + didn't natively support them */ + +/* The way this would work is that we would keep a per-thread list of the + * correct locale for that thread. Any operation that was locale-sensitive + * would have to be changed so that it would look like this: * - * The typical return value of setlocale() is either - * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL - * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL - * (the space-separated values represent the various sublocales, - * in some unspecified order). This is not handled by this function. + * LOCALE_LOCK; + * setlocale to the correct locale for this operation + * do operation + * LOCALE_UNLOCK * - * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n", - * which is harmful for further use of the string in setlocale(). This - * function removes the trailing new line and everything up through the '=' + * This leaves the global locale in the most recently used operation's, but it + * was locked long enough to get the result. If that result is static, it + * needs to be copied before the unlock. * - */ + * Macros could be written like SETUP_LOCALE_DEPENDENT_OP(category) that did + * the setup, but are no-ops when not needed, and similarly, + * END_LOCALE_DEPENDENT_OP for the tear-down + * + * But every call to a locale-sensitive function would have to be changed, and + * if a module didn't cooperate by using the mutex, things would break. + * + * This code was abandoned before being completed or tested, and is left as-is +*/ + +# define do_setlocale_c(cat, locale) locking_setlocale(cat, locale, cat ## _INDEX, TRUE) +# define do_setlocale_r(cat, locale) locking_setlocale(cat, locale, 0, FALSE) + STATIC char * -S_stdize_locale(pTHX_ char *locs) +S_locking_setlocale(pTHX_ + const int category, + const char * locale, + int index, + const bool is_index_valid + ) { - const char * const s = strchr(locs, '='); - bool okay = TRUE; + /* This function kind of performs a setlocale() on just the current thread; + * thus it is kind of thread-safe. It does this by keeping a thread-level + * array of the current locales for each category. Every time a locale is + * switched to, it does the switch globally, but updates the thread's + * array. A query as to what the current locale is just returns the + * appropriate element from the array, and doesn't actually call the system + * setlocale(). The saving into the array is done in an uninterruptible + * section of code, so is unaffected by whatever any other threads might be + * doing. + * + * All locale-sensitive operations must work by first starting a critical + * section, then switching to the thread's locale as kept by this function, + * and then doing the operation, then ending the critical section. Thus, + * each gets done in the appropriate locale. simulating thread-safety. + * + * This function takes the same parameters, 'category' and 'locale', that + * the regular setlocale() function does, but it also takes two additional + * ones. This is because as described earlier. If we know on input the + * index corresponding to the category into the array where we store the + * current locales, we don't have to calculate it. If the caller knows at + * compile time what the index is, it it can pass it, setting + * 'is_index_valid' to TRUE; otherwise the index parameter is ignored. + * + */ - PERL_ARGS_ASSERT_STDIZE_LOCALE; + /* If the input index might be incorrect, calculate the correct one */ + if (! is_index_valid) { + unsigned int i; - if (s) { - const char * const t = strchr(s, '.'); - okay = FALSE; - if (t) { - const char * const u = strchr(t, '\n'); - if (u && (u[1] == 0)) { - const STRLEN len = u - s; - Move(s + 1, locs, len, char); - locs[len] = 0; - okay = TRUE; - } - } + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: converting category %d to index\n", __FILE__, __LINE__, category); + } + + for (i = 0; i <= LC_ALL_INDEX; i++) { + if (category == categories[i]) { + index = i; + goto found_index; + } + } + + /* Here, we don't know about this category, so can't handle it. + * XXX best we can do is to unsafely set this + * XXX warning */ + + return my_setlocale(category, locale); + + found_index: ; + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: index is 0x%x\n", __FILE__, __LINE__, index); + } } - if (!okay) - Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs); + /* For a query, just return what's in our records */ + if (new_locale == NULL) { + return curlocales[index]; + } - return locs; -} + + /* Otherwise, we need to do the switch, and save the result, all in a + * critical section */ + + Safefree(curlocales[[index]]); + + /* It might be that this is called from an already-locked section of code. + * We would have to detect and skip the LOCK/UNLOCK if so */ + LOCALE_LOCK; + + curlocales[index] = savepv(my_setlocale(category, new_locale)); + + if (strEQ(new_locale, "")) { + +#ifdef LC_ALL + + /* The locale values come from the environment, and may not all be the + * same, so for LC_ALL, we have to update all the others, while the + * mutex is still locked */ + + if (category == LC_ALL) { + unsigned int i; + for (i = 0; i < LC_ALL_INDEX) { + curlocales[i] = my_setlocale(categories[i], NULL); + } + } + } #endif -/* Windows requres a customized base-level setlocale() */ -# ifdef WIN32 -# define my_setlocale(cat, locale) win32_setlocale(cat, locale) -# else -# define my_setlocale(cat, locale) setlocale(cat, locale) -# endif + LOCALE_UNLOCK; + + return curlocales[index]; +} -/* Just placeholders for now. "_c" is intended to be called when the category - * is a constant known at compile time; "_r", not known until run time */ -# define do_setlocale_c(category, locale) my_setlocale(category, locale) -# define do_setlocale_r(category, locale) my_setlocale(category, locale) +#endif +#ifdef USE_LOCALE STATIC void S_set_numeric_radix(pTHX_ const bool use_locale) @@ -123,71 +1275,41 @@ S_set_numeric_radix(pTHX_ const bool use_locale) #if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \ || defined(HAS_NL_LANGINFO)) - /* We only set up the radix SV if we are to use a locale radix ... */ - if (use_locale) { - const char * radix = my_nl_langinfo(PERL_RADIXCHAR, FALSE); - /* FALSE => already in dest locale */ + const char * radix = (use_locale) + ? my_nl_langinfo(RADIXCHAR, FALSE) + /* FALSE => already in dest locale */ + : "."; - /* ... and the character being used isn't a dot */ - if (strNE(radix, ".")) { - if (PL_numeric_radix_sv) { - sv_setpv(PL_numeric_radix_sv, radix); - } - else { - PL_numeric_radix_sv = newSVpv(radix, 0); - } + sv_setpv(PL_numeric_radix_sv, radix); - if ( ! is_utf8_invariant_string( - (U8 *) SvPVX(PL_numeric_radix_sv), SvCUR(PL_numeric_radix_sv)) - && is_utf8_string( - (U8 *) SvPVX(PL_numeric_radix_sv), SvCUR(PL_numeric_radix_sv)) - && _is_cur_LC_category_utf8(LC_NUMERIC)) - { - SvUTF8_on(PL_numeric_radix_sv); - } - goto done; - } + /* If this is valid UTF-8 that isn't totally ASCII, and we are in + * a UTF-8 locale, then mark the radix as being in UTF-8 */ + if (is_utf8_non_invariant_string((U8 *) SvPVX(PL_numeric_radix_sv), + SvCUR(PL_numeric_radix_sv)) + && _is_cur_LC_category_utf8(LC_NUMERIC)) + { + SvUTF8_on(PL_numeric_radix_sv); } - SvREFCNT_dec(PL_numeric_radix_sv); - PL_numeric_radix_sv = NULL; - - done: ; - # ifdef DEBUGGING if (DEBUG_L_TEST || debug_initialization) { PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n", - (PL_numeric_radix_sv) - ? SvPVX(PL_numeric_radix_sv) - : "NULL", - (PL_numeric_radix_sv) - ? cBOOL(SvUTF8(PL_numeric_radix_sv)) - : 0); + SvPVX(PL_numeric_radix_sv), + cBOOL(SvUTF8(PL_numeric_radix_sv))); } # endif +#else + + PERL_UNUSED_ARG(use_locale); + #endif /* USE_LOCALE_NUMERIC and can find the radix char */ } -/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the - * return of setlocale(), then this is extremely likely to be the C or POSIX - * locale. However, the output of setlocale() is documented to be opaque, but - * the odds are extremely small that it would return these two strings for some - * other locale. Note that VMS in these two locales includes many non-ASCII - * characters as controls and punctuation (below are hex bytes): - * cntrl: 00-1F 7F 84-97 9B-9F - * punct: 21-2F 3A-40 5B-60 7B-7E A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD - * Oddly, none there are listed as alphas, though some represent alphabetics - * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */ -#define isNAME_C_OR_POSIX(name) \ - ( (name) != NULL \ - && (( *(name) == 'C' && (*(name + 1)) == '\0') \ - || strEQ((name), "POSIX"))) - -void -Perl_new_numeric(pTHX_ const char *newnum) +STATIC void +S_new_numeric(pTHX_ const char *newnum) { #ifndef USE_LOCALE_NUMERIC @@ -196,13 +1318,13 @@ Perl_new_numeric(pTHX_ const char *newnum) #else - /* Called after all libc setlocale() calls affecting LC_NUMERIC, to tell + /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell * core Perl this and that 'newnum' is the name of the new locale. * It installs this locale as the current underlying default. * * The default locale and the C locale can be toggled between by use of the - * set_numeric_local() and set_numeric_standard() functions, which should - * probably not be called directly, but only via macros like + * set_numeric_underlying() and set_numeric_standard() functions, which + * should probably not be called directly, but only via macros like * SET_NUMERIC_STANDARD() in perl.h. * * The toggling is necessary mainly so that a non-dot radix decimal point @@ -215,17 +1337,18 @@ Perl_new_numeric(pTHX_ const char *newnum) * that the current locale is the program's underlying * locale * PL_numeric_standard An int indicating if the toggled state is such - * that the current locale is the C locale. If non-zero, - * it is in C; if > 1, it means it may not be toggled away + * that the current locale is the C locale or + * indistinguishable from the C locale. If non-zero, it + * is in C; if > 1, it means it may not be toggled away * from C. - * Note that both of the last two variables can be true at the same time, - * if the underlying locale is C. (Toggling is a no-op under these - * circumstances.) - * - * Any code changing the locale (outside this file) should use - * POSIX::setlocale, which calls this function. Therefore this function - * should be called directly only from this file and from - * POSIX::setlocale() */ + * PL_numeric_underlying_is_standard A bool kept by this function + * indicating that the underlying locale and the standard + * C locale are indistinguishable for the purposes of + * LC_NUMERIC. This happens when both of the above two + * variables are true at the same time. (Toggling is a + * no-op under these circumstances.) This variable is + * used to avoid having to recalculate. + */ char *save_newnum; @@ -234,14 +1357,29 @@ Perl_new_numeric(pTHX_ const char *newnum) PL_numeric_name = NULL; PL_numeric_standard = TRUE; PL_numeric_underlying = TRUE; + PL_numeric_underlying_is_standard = TRUE; return; } save_newnum = stdize_locale(savepv(newnum)); - - PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum); PL_numeric_underlying = TRUE; + PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum); + +#ifndef TS_W32_BROKEN_LOCALECONV + + /* If its name isn't C nor POSIX, it could still be indistinguishable from + * them. But on broken Windows systems calling my_nl_langinfo() for + * THOUSEP can currently (but rarely) cause a race, so avoid doing that, + * and just always change the locale if not C nor POSIX on those systems */ + if (! PL_numeric_standard) { + PL_numeric_standard = cBOOL(strEQ(".", my_nl_langinfo(RADIXCHAR, + FALSE /* Don't toggle locale */ )) + && strEQ("", my_nl_langinfo(THOUSEP, FALSE))); + } + +#endif + /* Save the new name if it isn't the same as the previous one, if any */ if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) { Safefree(PL_numeric_name); PL_numeric_name = save_newnum; @@ -250,10 +1388,29 @@ Perl_new_numeric(pTHX_ const char *newnum) Safefree(save_newnum); } + PL_numeric_underlying_is_standard = PL_numeric_standard; + +# ifdef HAS_POSIX_2008_LOCALE + + PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK, + PL_numeric_name, + PL_underlying_numeric_obj); + +#endif + + if (DEBUG_L_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "Called new_numeric with %s, PL_numeric_name=%s\n", newnum, PL_numeric_name); + } + /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't * have to worry about the radix being a non-dot. (Core operations that * need the underlying locale change to it temporarily). */ - set_numeric_standard(); + if (PL_numeric_standard) { + set_numeric_radix(0); + } + else { + set_numeric_standard(); + } #endif /* USE_LOCALE_NUMERIC */ @@ -271,25 +1428,26 @@ Perl_set_numeric_standard(pTHX) * to our records (which could be wrong if some XS code has changed the * locale behind our back) */ - do_setlocale_c(LC_NUMERIC, "C"); - PL_numeric_standard = TRUE; - PL_numeric_underlying = isNAME_C_OR_POSIX(PL_numeric_name); - set_numeric_radix(0); - # ifdef DEBUGGING if (DEBUG_L_TEST || debug_initialization) { PerlIO_printf(Perl_debug_log, - "Underlying LC_NUMERIC locale now is C\n"); + "Setting LC_NUMERIC locale to standard C\n"); } # endif + + do_setlocale_c(LC_NUMERIC, "C"); + PL_numeric_standard = TRUE; + PL_numeric_underlying = PL_numeric_underlying_is_standard; + set_numeric_radix(0); + #endif /* USE_LOCALE_NUMERIC */ } void -Perl_set_numeric_local(pTHX) +Perl_set_numeric_underlying(pTHX) { #ifdef USE_LOCALE_NUMERIC @@ -300,20 +1458,21 @@ Perl_set_numeric_local(pTHX) * if toggling isn't necessary according to our records (which could be * wrong if some XS code has changed the locale behind our back) */ - do_setlocale_c(LC_NUMERIC, PL_numeric_name); - PL_numeric_standard = isNAME_C_OR_POSIX(PL_numeric_name); - PL_numeric_underlying = TRUE; - set_numeric_radix(1); - # ifdef DEBUGGING if (DEBUG_L_TEST || debug_initialization) { PerlIO_printf(Perl_debug_log, - "Underlying LC_NUMERIC locale now is %s\n", + "Setting LC_NUMERIC locale to %s\n", PL_numeric_name); } # endif + + do_setlocale_c(LC_NUMERIC, PL_numeric_name); + PL_numeric_standard = PL_numeric_underlying_is_standard; + PL_numeric_underlying = TRUE; + set_numeric_radix(! PL_numeric_standard); + #endif /* USE_LOCALE_NUMERIC */ } @@ -327,25 +1486,28 @@ S_new_ctype(pTHX_ const char *newctype) #ifndef USE_LOCALE_CTYPE - PERL_ARGS_ASSERT_NEW_CTYPE; PERL_UNUSED_ARG(newctype); PERL_UNUSED_CONTEXT; #else - /* Called after all libc setlocale() calls affecting LC_CTYPE, to tell + /* Called after each libc setlocale() call affecting LC_CTYPE, to tell * core Perl this and that 'newctype' is the name of the new locale. * * This function sets up the folding arrays for all 256 bytes, assuming * that tofold() is tolc() since fold case is not a concept in POSIX, * * Any code changing the locale (outside this file) should use - * POSIX::setlocale, which calls this function. Therefore this function - * should be called directly only from this file and from + * Perl_setlocale or POSIX::setlocale, which call this function. Therefore + * this function should be called directly only from this file and from * POSIX::setlocale() */ dVAR; - UV i; + unsigned int i; + + /* Don't check for problems if we are suppressing the warnings */ + bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST); + bool maybe_utf8_turkic = FALSE; PERL_ARGS_ASSERT_NEW_CTYPE; @@ -362,28 +1524,37 @@ S_new_ctype(pTHX_ const char *newctype) * handle this specially because of the three problematic code points */ if (PL_in_utf8_CTYPE_locale) { Copy(PL_fold_latin1, PL_fold_locale, 256, U8); + + /* UTF-8 locales can have special handling for 'I' and 'i' if they are + * Turkic. Make sure these two are the only anomalies. (We don't use + * towupper and towlower because they aren't in C89.) */ + if (toupper('i') == 'i' && tolower('I') == 'I') { + check_for_problems = TRUE; + maybe_utf8_turkic = TRUE; + } } - else { + + /* We don't populate the other lists if a UTF-8 locale, but do check that + * everything works as expected, unless checking turned off */ + if (check_for_problems || ! PL_in_utf8_CTYPE_locale) { /* Assume enough space for every character being bad. 4 spaces each * for the 94 printable characters that are output like "'x' "; and 5 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating * NUL */ - char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ]; - - /* Don't check for problems if we are suppressing the warnings */ - bool check_for_problems = ckWARN_d(WARN_LOCALE) - || UNLIKELY(DEBUG_L_TEST); + char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' }; bool multi_byte_locale = FALSE; /* Assume is a single-byte locale to start */ unsigned int bad_count = 0; /* Count of bad characters */ for (i = 0; i < 256; i++) { - if (isUPPER_LC((U8) i)) - PL_fold_locale[i] = (U8) toLOWER_LC((U8) i); - else if (isLOWER_LC((U8) i)) - PL_fold_locale[i] = (U8) toUPPER_LC((U8) i); - else - PL_fold_locale[i] = (U8) i; + if (! PL_in_utf8_CTYPE_locale) { + if (isupper(i)) + PL_fold_locale[i] = (U8) tolower(i); + else if (islower(i)) + PL_fold_locale[i] = (U8) toupper(i); + else + PL_fold_locale[i] = (U8) i; + } /* If checking for locale problems, see if the native ASCII-range * printables plus \n and \t are in their expected categories in @@ -397,35 +1568,128 @@ S_new_ctype(pTHX_ const char *newctype) if ( check_for_problems && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n')) { - if (( isALPHANUMERIC_A(i) && ! isALPHANUMERIC_LC(i)) - || (isPUNCT_A(i) && ! isPUNCT_LC(i)) - || (isBLANK_A(i) && ! isBLANK_LC(i)) - || (i == '\n' && ! isCNTRL_LC(i))) - { - if (bad_count) { /* Separate multiple entries with a - blank */ - bad_chars_list[bad_count++] = ' '; - } - bad_chars_list[bad_count++] = '\''; - if (isPRINT_A(i)) { - bad_chars_list[bad_count++] = (char) i; - } - else { - bad_chars_list[bad_count++] = '\\'; - if (i == '\n') { - bad_chars_list[bad_count++] = 'n'; - } - else { - assert(i == '\t'); - bad_chars_list[bad_count++] = 't'; - } + bool is_bad = FALSE; + char name[4] = { '\0' }; + + /* Convert the name into a string */ + if (isGRAPH_A(i)) { + name[0] = i; + name[1] = '\0'; + } + else if (i == '\n') { + my_strlcpy(name, "\\n", sizeof(name)); + } + else if (i == '\t') { + my_strlcpy(name, "\\t", sizeof(name)); + } + else { + assert(i == ' '); + my_strlcpy(name, "' '", sizeof(name)); + } + + /* Check each possibe class */ + if (UNLIKELY(cBOOL(isalnum(i)) != cBOOL(isALPHANUMERIC_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "isalnum('%s') unexpectedly is %d\n", + name, cBOOL(isalnum(i)))); + } + if (UNLIKELY(cBOOL(isalpha(i)) != cBOOL(isALPHA_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "isalpha('%s') unexpectedly is %d\n", + name, cBOOL(isalpha(i)))); + } + if (UNLIKELY(cBOOL(isdigit(i)) != cBOOL(isDIGIT_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "isdigit('%s') unexpectedly is %d\n", + name, cBOOL(isdigit(i)))); + } + if (UNLIKELY(cBOOL(isgraph(i)) != cBOOL(isGRAPH_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "isgraph('%s') unexpectedly is %d\n", + name, cBOOL(isgraph(i)))); + } + if (UNLIKELY(cBOOL(islower(i)) != cBOOL(isLOWER_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "islower('%s') unexpectedly is %d\n", + name, cBOOL(islower(i)))); + } + if (UNLIKELY(cBOOL(isprint(i)) != cBOOL(isPRINT_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "isprint('%s') unexpectedly is %d\n", + name, cBOOL(isprint(i)))); + } + if (UNLIKELY(cBOOL(ispunct(i)) != cBOOL(isPUNCT_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "ispunct('%s') unexpectedly is %d\n", + name, cBOOL(ispunct(i)))); + } + if (UNLIKELY(cBOOL(isspace(i)) != cBOOL(isSPACE_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "isspace('%s') unexpectedly is %d\n", + name, cBOOL(isspace(i)))); + } + if (UNLIKELY(cBOOL(isupper(i)) != cBOOL(isUPPER_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "isupper('%s') unexpectedly is %d\n", + name, cBOOL(isupper(i)))); + } + if (UNLIKELY(cBOOL(isxdigit(i))!= cBOOL(isXDIGIT_A(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "isxdigit('%s') unexpectedly is %d\n", + name, cBOOL(isxdigit(i)))); + } + if (UNLIKELY(tolower(i) != (int) toLOWER_A(i))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "tolower('%s')=0x%x instead of the expected 0x%x\n", + name, tolower(i), (int) toLOWER_A(i))); + } + if (UNLIKELY(toupper(i) != (int) toUPPER_A(i))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "toupper('%s')=0x%x instead of the expected 0x%x\n", + name, toupper(i), (int) toUPPER_A(i))); + } + if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i)))) { + is_bad = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "'\\n' (=%02X) is not a control\n", (int) i)); + } + + /* Add to the list; Separate multiple entries with a blank */ + if (is_bad) { + if (bad_count) { + my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list)); } - bad_chars_list[bad_count++] = '\''; - bad_chars_list[bad_count] = '\0'; + my_strlcat(bad_chars_list, name, sizeof(bad_chars_list)); + bad_count++; } } } + if (bad_count == 2 && maybe_utf8_turkic) { + bad_count = 0; + *bad_chars_list = '\0'; + PL_fold_locale['I'] = 'I'; + PL_fold_locale['i'] = 'i'; + PL_in_utf8_turkic_locale = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s is turkic\n", + __FILE__, __LINE__, newctype)); + } + else { + PL_in_utf8_turkic_locale = FALSE; + } + # ifdef MB_CUR_MAX /* We only handle single-byte locales (outside of UTF-8 ones; so if @@ -435,7 +1699,8 @@ S_new_ctype(pTHX_ const char *newctype) "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n", __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX)); - if (check_for_problems && MB_CUR_MAX > 1 + if ( check_for_problems && MB_CUR_MAX > 1 + && ! PL_in_utf8_CTYPE_locale /* Some platforms return MB_CUR_MAX > 1 for even the "C" * locale. Just assume that the implementation for them (plus @@ -450,8 +1715,19 @@ S_new_ctype(pTHX_ const char *newctype) # endif - if (bad_count || multi_byte_locale) { - PL_warn_locale = Perl_newSVpvf(aTHX_ + /* If we found problems and we want them output, do so */ + if ( (UNLIKELY(bad_count) || UNLIKELY(multi_byte_locale)) + && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST))) + { + if (UNLIKELY(bad_count) && PL_in_utf8_CTYPE_locale) { + PL_warn_locale = Perl_newSVpvf(aTHX_ + "Locale '%s' contains (at least) the following characters" + " which have\nunexpected meanings: %s\nThe Perl program" + " will use the expected meanings", + newctype, bad_chars_list); + } + else { + PL_warn_locale = Perl_newSVpvf(aTHX_ "Locale '%s' may not work well.%s%s%s\n", newctype, (multi_byte_locale) @@ -467,6 +1743,18 @@ S_new_ctype(pTHX_ const char *newctype) ? bad_chars_list : "" ); + } + +# ifdef HAS_NL_LANGINFO + + Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s", + /* parameter FALSE is a don't care here */ + my_nl_langinfo(CODESET, FALSE)); + +# endif + + Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n"); + /* If we are actually in the scope of the locale or are debugging, * output the message now. If not in that scope, we save the * message to be output at the first operation using this locale, @@ -474,20 +1762,9 @@ S_new_ctype(pTHX_ const char *newctype) * they are immune to bad ones. */ if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) { - /* We have to save 'newctype' because the setlocale() just - * below may destroy it. The next setlocale() further down - * should restore it properly so that the intermediate change - * here is transparent to this function's caller */ - const char * const badlocale = savepv(newctype); - - do_setlocale_c(LC_CTYPE, "C"); - /* The '0' below suppresses a bogus gcc compiler warning */ Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0); - do_setlocale_c(LC_CTYPE, badlocale); - Safefree(badlocale); - if (IN_LC(LC_CTYPE)) { SvREFCNT_dec_NN(PL_warn_locale); PL_warn_locale = NULL; @@ -513,11 +1790,9 @@ Perl__warn_problematic_locale() * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */ if (PL_warn_locale) { - /*GCC_DIAG_IGNORE(-Wformat-security); Didn't work */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0 /* dummy to avoid compiler warning */ ); - /* GCC_DIAG_RESTORE; */ SvREFCNT_dec_NN(PL_warn_locale); PL_warn_locale = NULL; } @@ -537,7 +1812,7 @@ S_new_collate(pTHX_ const char *newcoll) #else - /* Called after all libc setlocale() calls affecting LC_COLLATE, to tell + /* Called after each libc setlocale() call affecting LC_COLLATE, to tell * core Perl this and that 'newcoll' is the name of the new locale. * * The design of locale collation is that every locale change is given an @@ -748,6 +2023,8 @@ S_new_collate(pTHX_ const char *newcoll) } +#endif + #ifdef WIN32 STATIC char * @@ -769,6 +2046,7 @@ S_win32_setlocale(pTHX_ int category, const char* locale) bool override_LC_ALL = FALSE; char * result; + unsigned int i; if (locale && strEQ(locale, "")) { @@ -776,73 +2054,30 @@ S_win32_setlocale(pTHX_ int category, const char* locale) locale = PerlEnv_getenv("LC_ALL"); if (! locale) { + if (category == LC_ALL) { + override_LC_ALL = TRUE; + } + else { # endif - switch (category) { - -# ifdef LC_ALL - case LC_ALL: - override_LC_ALL = TRUE; - break; /* We already know its variable isn't set */ - -# endif -# ifdef USE_LOCALE_TIME - - case LC_TIME: - locale = PerlEnv_getenv("LC_TIME"); - break; - -# endif -# ifdef USE_LOCALE_CTYPE - - case LC_CTYPE: - locale = PerlEnv_getenv("LC_CTYPE"); - break; - -# endif -# ifdef USE_LOCALE_COLLATE - - case LC_COLLATE: - locale = PerlEnv_getenv("LC_COLLATE"); - break; - -# endif -# ifdef USE_LOCALE_MONETARY - - case LC_MONETARY: - locale = PerlEnv_getenv("LC_MONETARY"); - break; - -# endif -# ifdef USE_LOCALE_NUMERIC - - case LC_NUMERIC: - locale = PerlEnv_getenv("LC_NUMERIC"); - break; - -# endif -# ifdef USE_LOCALE_MESSAGES - - case LC_MESSAGES: - locale = PerlEnv_getenv("LC_MESSAGES"); - break; -# endif + for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + if (category == categories[i]) { + locale = PerlEnv_getenv(category_names[i]); + goto found_locale; + } + } - default: - /* This is a category, like PAPER_SIZE that we don't - * know about; and so can't provide a wrapper. */ - break; - } - if (! locale) { locale = PerlEnv_getenv("LANG"); if (! locale) { locale = ""; } - } + + found_locale: ; # ifdef LC_ALL + } } # endif @@ -850,8 +2085,12 @@ S_win32_setlocale(pTHX_ int category, const char* locale) } result = setlocale(category, locale); - DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, - setlocale_debug_string(category, locale, result))); + DEBUG_L(STMT_START { + dSAVE_ERRNO; + PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, + setlocale_debug_string(category, locale, result)); + RESTORE_ERRNO; + } STMT_END); if (! override_LC_ALL) { return result; @@ -863,108 +2102,108 @@ S_win32_setlocale(pTHX_ int category, const char* locale) * one that is set. (If they are set to "", it means to use the same thing * we just set LC_ALL to, so can skip) */ -# ifdef USE_LOCALE_TIME - - result = PerlEnv_getenv("LC_TIME"); - if (result && strNE(result, "")) { - setlocale(LC_TIME, result); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - setlocale_debug_string(LC_TIME, result, "not captured"))); - } - -# endif -# ifdef USE_LOCALE_CTYPE - - result = PerlEnv_getenv("LC_CTYPE"); - if (result && strNE(result, "")) { - setlocale(LC_CTYPE, result); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - setlocale_debug_string(LC_CTYPE, result, "not captured"))); + for (i = 0; i < LC_ALL_INDEX; i++) { + result = PerlEnv_getenv(category_names[i]); + if (result && strNE(result, "")) { + setlocale(categories[i], result); + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", + __FILE__, __LINE__, + setlocale_debug_string(categories[i], result, "not captured"))); + } } -# endif -# ifdef USE_LOCALE_COLLATE + result = setlocale(LC_ALL, NULL); + DEBUG_L(STMT_START { + dSAVE_ERRNO; + PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", + __FILE__, __LINE__, + setlocale_debug_string(LC_ALL, NULL, result)); + RESTORE_ERRNO; + } STMT_END); - result = PerlEnv_getenv("LC_COLLATE"); - if (result && strNE(result, "")) { - setlocale(LC_COLLATE, result); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - setlocale_debug_string(LC_COLLATE, result, "not captured"))); - } + return result; +} -# endif -# ifdef USE_LOCALE_MONETARY +#endif - result = PerlEnv_getenv("LC_MONETARY"); - if (result && strNE(result, "")) { - setlocale(LC_MONETARY, result); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - setlocale_debug_string(LC_MONETARY, result, "not captured"))); - } +/* -# endif -# ifdef USE_LOCALE_NUMERIC +=head1 Locale-related functions and macros - result = PerlEnv_getenv("LC_NUMERIC"); - if (result && strNE(result, "")) { - setlocale(LC_NUMERIC, result); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - setlocale_debug_string(LC_NUMERIC, result, "not captured"))); - } +=for apidoc Perl_setlocale + +This is an (almost) drop-in replacement for the system L>, +taking the same parameters, and returning the same information, except that it +returns the correct underlying C locale. Regular C will +instead return C if the underlying locale has a non-dot decimal point +character, or a non-empty thousands separator for displaying floating point +numbers. This is because perl keeps that locale category such that it has a +dot and empty separator, changing the locale briefly during the operations +where the underlying one is required. C knows about this, and +compensates; regular C doesn't. + +Another reason it isn't completely a drop-in replacement is that it is +declared to return S>, whereas the system setlocale omits the +C (presumably because its API was specified long ago, and can't be +updated; it is illegal to change the information C returns; doing +so leads to segfaults.) + +Finally, C works under all circumstances, whereas plain +C can be completely ineffective on some platforms under some +configurations. + +C should not be used to change the locale except on systems +where the predefined variable C<${^SAFE_LOCALES}> is 1. On some such systems, +the system C is ineffective, returning the wrong information, and +failing to actually change the locale. C, however works +properly in all circumstances. + +The return points to a per-thread static buffer, which is overwritten the next +time C is called from the same thread. -# endif -# ifdef USE_LOCALE_MESSAGES +=cut - result = PerlEnv_getenv("LC_MESSAGES"); - if (result && strNE(result, "")) { - setlocale(LC_MESSAGES, result); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - setlocale_debug_string(LC_MESSAGES, result, "not captured"))); - } +*/ -# endif +const char * +Perl_setlocale(const int category, const char * locale) +{ + /* This wraps POSIX::setlocale() */ - result = setlocale(LC_ALL, NULL); - DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - setlocale_debug_string(LC_ALL, NULL, result))); +#ifdef NO_LOCALE - return result; -} + PERL_UNUSED_ARG(category); + PERL_UNUSED_ARG(locale); -#endif + return "C"; -char * -Perl_setlocale(int category, const char * locale) -{ - /* This wraps POSIX::setlocale() */ +#else - char * retval; - char * newlocale; + const char * retval; + const char * newlocale; + dSAVEDERRNO; dTHX; + DECLARATION_FOR_LC_NUMERIC_MANIPULATION; #ifdef USE_LOCALE_NUMERIC - /* A NULL locale means only query what the current one is. We - * have the LC_NUMERIC name saved, because we are normally switched - * into the C locale for it. Switch back so an LC_ALL query will yield - * the correct results; all other categories don't require special + /* A NULL locale means only query what the current one is. We have the + * LC_NUMERIC name saved, because we are normally switched into the C + * (or equivalent) locale for it. For an LC_ALL query, switch back to get + * the correct results. All other categories don't require special * handling */ if (locale == NULL) { if (category == LC_NUMERIC) { - return savepv(PL_numeric_name); + + /* We don't have to copy this return value, as it is a per-thread + * variable, and won't change until a future setlocale */ + return PL_numeric_name; } # ifdef LC_ALL else if (category == LC_ALL) { - SET_NUMERIC_UNDERLYING(); + STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); } # endif @@ -973,28 +2212,30 @@ Perl_setlocale(int category, const char * locale) #endif - retval = do_setlocale_r(category, locale); + retval = save_to_buffer(do_setlocale_r(category, locale), + &PL_setlocale_buf, &PL_setlocale_bufsize, 0); + SAVE_ERRNO; + +#if defined(USE_LOCALE_NUMERIC) && defined(LC_ALL) + + if (locale == NULL && category == LC_ALL) { + RESTORE_LC_NUMERIC(); + } + +#endif DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, setlocale_debug_string(category, locale, retval))); - if (! retval) { - /* Should never happen that a query would return an error, but be - * sure and reset to C locale */ - if (locale == 0) { - SET_NUMERIC_STANDARD(); - } + RESTORE_ERRNO; + + if (! retval) { return NULL; } - /* Save retval since subsequent setlocale() calls may overwrite it. */ - retval = savepv(retval); - - /* If locale == NULL, we are just querying the state, but may have switched - * to NUMERIC_UNDERLYING. Switch back before returning. */ + /* If locale == NULL, we are just querying the state */ if (locale == NULL) { - SET_NUMERIC_STANDARD(); return retval; } @@ -1034,20 +2275,23 @@ Perl_setlocale(int category, const char * locale) # ifdef USE_LOCALE_CTYPE - newlocale = do_setlocale_c(LC_CTYPE, NULL); + newlocale = savepv(do_setlocale_c(LC_CTYPE, NULL)); new_ctype(newlocale); + Safefree(newlocale); # endif /* USE_LOCALE_CTYPE */ # ifdef USE_LOCALE_COLLATE - newlocale = do_setlocale_c(LC_COLLATE, NULL); + newlocale = savepv(do_setlocale_c(LC_COLLATE, NULL)); new_collate(newlocale); + Safefree(newlocale); # endif # ifdef USE_LOCALE_NUMERIC - newlocale = do_setlocale_c(LC_NUMERIC, NULL); + newlocale = savepv(do_setlocale_c(LC_NUMERIC, NULL)); new_numeric(newlocale); + Safefree(newlocale); # endif /* USE_LOCALE_NUMERIC */ #endif /* LC_ALL */ @@ -1058,6 +2302,7 @@ Perl_setlocale(int category, const char * locale) return retval; +#endif } @@ -1067,10 +2312,16 @@ S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size', * growing it if necessary */ - const Size_t string_size = strlen(string) + offset + 1; + Size_t string_size; PERL_ARGS_ASSERT_SAVE_TO_BUFFER; + if (! string) { + return NULL; + } + + string_size = strlen(string) + offset + 1; + if (*buf_size == 0) { Newx(*buf, string_size, char); *buf_size = string_size; @@ -1086,11 +2337,9 @@ S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t /* -=head1 Locale-related functions and macros - =for apidoc Perl_langinfo -This is an (almost ª) drop-in replacement for the system C>, +This is an (almost) drop-in replacement for the system C>, taking the same C parameter values, and returning the same information. But it is more thread-safe than regular C, and hides the quirks of Perl's locale handling from your code, and can be used on systems that lack @@ -1102,84 +2351,66 @@ Expanding on these: =item * -It delivers the correct results for the C and C items, -without you having to write extra code. The reason for the extra code would be -because these are from the C locale category, which is normally -kept set to the C locale by Perl, no matter what the underlying locale is -supposed to be, and so to get the expected results, you have to temporarily -toggle into the underlying locale, and later toggle back. (You could use -plain C and C> for this -but then you wouldn't get the other advantages of C; not -keeping C in the C locale would break a lot of CPAN, which is -expecting the radix (decimal point) character to be a dot.) +The reason it isn't quite a drop-in replacement is actually an advantage. The +only difference is that it returns S>, whereas plain +C returns S>, but you are (only by documentation) +forbidden to write into the buffer. By declaring this C, the compiler +enforces this restriction, so if it is violated, you know at compilation time, +rather than getting segfaults at runtime. =item * -Depending on C, it works on systems that don't have C, hence -makes your code more portable. Of the fifty-some possible items specified by -the POSIX 2008 standard, -L, -only two are completely unimplemented. It uses various techniques to recover -the other items, including calling C>, and C>, -both of which are specified in C89, so should be always be available. Later -C versions have additional capabilities; C<""> is returned for -those not available on your system. - -The details for those items which may differ from what this emulation returns -and what a native C would return are: - -=over - -=item C - -=item C - -Unimplemented, so returns C<"">. - -=item C - -=item C - -Only the values for English are returned. Earlier POSIX standards also -specified C and C, but these have been removed from POSIX 2008, -and aren't supported by C. - -=item C - -Always evaluates to C<%x>, the locale's appropriate date representation. - -=item C - -Always evaluates to C<%X>, the locale's appropriate time representation. - -=item C - -Always evaluates to C<%c>, the locale's appropriate date and time -representation. - -=item C - -The return may be incorrect for those rare locales where the currency symbol -replaces the radix character. -Send email to L if you have examples of it needing -to work differently. - -=item C +It delivers the correct results for the C and C items, +without you having to write extra code. The reason for the extra code would be +because these are from the C locale category, which is normally +kept set by Perl so that the radix is a dot, and the separator is the empty +string, no matter what the underlying locale is supposed to be, and so to get +the expected results, you have to temporarily toggle into the underlying +locale, and later toggle back. (You could use plain C and +C> for this but then you wouldn't get +the other advantages of C; not keeping C in the C +(or equivalent) locale would break a lot of CPAN, which is expecting the radix +(decimal point) character to be a dot.) -Currently this gives the same results as Linux does. -Send email to L if you have examples of it needing -to work differently. +=item * -=item C +The system function it replaces can have its static return buffer trashed, +not only by a subesequent call to that function, but by a C, +C, or other locale change. The returned buffer of this function is +not changed until the next call to it, so the buffer is never in a trashed +state. -=item C +=item * -=item C +Its return buffer is per-thread, so it also is never overwritten by a call to +this function from another thread; unlike the function it replaces. -=item C +=item * -These are derived by using C, and not all versions of that function -know about them. C<""> is returned for these on such systems. +But most importantly, it works on systems that don't have C, such +as Windows, hence makes your code more portable. Of the fifty-some possible +items specified by the POSIX 2008 standard, +L, +only one is completely unimplemented, though on non-Windows platforms, another +significant one is also not implemented). It uses various techniques to +recover the other items, including calling C>, and +C>, both of which are specified in C89, so should be always be +available. Later C versions have additional capabilities; C<""> is +returned for those not available on your system. + +It is important to note that when called with an item that is recovered by +using C, the buffer from any previous explicit call to +C will be overwritten. This means you must save that buffer's +contents if you need to access them after a call to this function. (But note +that you might not want to be using C directly anyway, because of +issues like the ones listed in the second item of this list (above) for +C and C. You can use the methods given in L to +call L and avoid all the issues, but then you have a hash to +unpack). + +The details for those items which may deviate from what this emulation returns +and what a native C would return are specified in +L. =back @@ -1190,29 +2421,8 @@ C, you must before the C C<#include>. You can replace your C C<#include> with this one. (Doing it this way keeps out the symbols that plain -C imports into the namespace for code that doesn't need it.) - -You also should not use the bare C item names, but should preface -them with C, so use C instead of plain C. -The C> versions will also work for this function on systems that do -have a native C. - -=item * - -It is thread-friendly, returning its result in a buffer that won't be -overwritten by another thread, so you don't have to code for that possibility. -The buffer can be overwritten by the next call to C or -C in the same thread. - -=item * - -ª It returns S>, whereas plain C returns S>, but you are (only by documentation) forbidden to write into the buffer. -By declaring this C, the compiler enforces this restriction. The extra -C is why this isn't an unequivocal drop-in replacement for -C. - -=back +C would try to import into the namespace for code that doesn't need +it.) The original impetus for C was so that code that needs to find out the current currency symbol, floating point radix character, or digit @@ -1236,7 +2446,7 @@ Perl_langinfo(const int item) return my_nl_langinfo(item, TRUE); } -const char * +STATIC const char * #ifdef HAS_NL_LANGINFO S_my_nl_langinfo(const nl_item item, bool toggle) #else @@ -1244,415 +2454,655 @@ S_my_nl_langinfo(const int item, bool toggle) #endif { dTHX; + const char * retval; + +#ifdef USE_LOCALE_NUMERIC + + /* We only need to toggle into the underlying LC_NUMERIC locale for these + * two items, and only if not already there */ + if (toggle && (( item != RADIXCHAR && item != THOUSEP) + || PL_numeric_underlying)) + +#endif /* No toggling needed if not using LC_NUMERIC */ + + toggle = FALSE; + +#if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */ +# if ! defined(HAS_THREAD_SAFE_NL_LANGINFO_L) \ + || ! defined(HAS_POSIX_2008_LOCALE) \ + || ! defined(DUPLOCALE) + + /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC + * for those items dependent on it. This must be copied to a buffer before + * switching back, as some systems destroy the buffer when setlocale() is + * called */ + + { + DECLARATION_FOR_LC_NUMERIC_MANIPULATION; + + if (toggle) { + STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); + } + + LOCALE_LOCK; /* Prevent interference from another thread executing + this code section (the only call to nl_langinfo in + the core) */ + + + /* Copy to a per-thread buffer, which is also one that won't be + * destroyed by a subsequent setlocale(), such as the + * RESTORE_LC_NUMERIC may do just below. */ + retval = save_to_buffer(nl_langinfo(item), + &PL_langinfo_buf, &PL_langinfo_bufsize, 0); + + LOCALE_UNLOCK; + + if (toggle) { + RESTORE_LC_NUMERIC(); + } + } + +# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */ + + { + bool do_free = FALSE; + locale_t cur = uselocale((locale_t) 0); + + if (cur == LC_GLOBAL_LOCALE) { + cur = duplocale(LC_GLOBAL_LOCALE); + do_free = TRUE; + } + +# ifdef USE_LOCALE_NUMERIC + + if (toggle) { + if (PL_underlying_numeric_obj) { + cur = PL_underlying_numeric_obj; + } + else { + cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur); + do_free = TRUE; + } + } + +# endif + + /* We have to save it to a buffer, because the freelocale() just below + * can invalidate the internal one */ + retval = save_to_buffer(nl_langinfo_l(item, cur), + &PL_langinfo_buf, &PL_langinfo_bufsize, 0); + + if (do_free) { + freelocale(cur); + } + } + +# endif + + if (strEQ(retval, "")) { + if (item == YESSTR) { + return "yes"; + } + if (item == NOSTR) { + return "no"; + } + } + + return retval; + +#else /* Below, emulate nl_langinfo as best we can */ + + { + +# ifdef HAS_LOCALECONV + + const struct lconv* lc; + const char * temp; + DECLARATION_FOR_LC_NUMERIC_MANIPULATION; + +# ifdef TS_W32_BROKEN_LOCALECONV + + const char * save_global; + const char * save_thread; + int needed_size; + char * ptr; + char * e; + char * item_start; + +# endif +# endif +# ifdef HAS_STRFTIME + + struct tm tm; + bool return_format = FALSE; /* Return the %format, not the value */ + const char * format; + +# endif + + /* We copy the results to a per-thread buffer, even if not + * multi-threaded. This is in part to simplify this code, and partly + * because we need a buffer anyway for strftime(), and partly because a + * call of localeconv() could otherwise wipe out the buffer, and the + * programmer would not be expecting this, as this is a nl_langinfo() + * substitute after all, so s/he might be thinking their localeconv() + * is safe until another localeconv() call. */ + + switch (item) { + Size_t len; + + /* This is unimplemented */ + case ERA: /* For use with strftime() %E modifier */ + + default: + return ""; + + /* We use only an English set, since we don't know any more */ + case YESEXPR: return "^[+1yY]"; + case YESSTR: return "yes"; + case NOEXPR: return "^[-0nN]"; + case NOSTR: return "no"; + + case CODESET: + +# ifndef WIN32 + + /* On non-windows, this is unimplemented, in part because of + * inconsistencies between vendors. The Darwin native + * nl_langinfo() implementation simply looks at everything past + * any dot in the name, but that doesn't work for other + * vendors. Many Linux locales that don't have UTF-8 in their + * names really are UTF-8, for example; z/OS locales that do + * have UTF-8 in their names, aren't really UTF-8 */ + return ""; + +# else + + { /* But on Windows, the name does seem to be consistent, so + use that. */ + const char * p; + const char * first; + Size_t offset = 0; + const char * name = my_setlocale(LC_CTYPE, NULL); + + if (isNAME_C_OR_POSIX(name)) { + return "ANSI_X3.4-1968"; + } + + /* Find the dot in the locale name */ + first = (const char *) strchr(name, '.'); + if (! first) { + first = name; + goto has_nondigit; + } -#if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */ -#if ! defined(HAS_POSIX_2008_LOCALE) + /* Look at everything past the dot */ + first++; + p = first; - /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC - * for those items dependent on it. This must be copied to a buffer before - * switching back, as some systems destroy the buffer when setlocale() is - * called */ + while (*p) { + if (! isDIGIT(*p)) { + goto has_nondigit; + } - LOCALE_LOCK; + p++; + } - if (toggle) { - if (item == PERL_RADIXCHAR || item == PERL_THOUSEP) { - do_setlocale_c(LC_NUMERIC, PL_numeric_name); - } - else { - toggle = FALSE; - } - } + /* Here everything past the dot is a digit. Treat it as a + * code page */ + retval = save_to_buffer("CP", &PL_langinfo_buf, + &PL_langinfo_bufsize, 0); + offset = STRLENs("CP"); - save_to_buffer(nl_langinfo(item), &PL_langinfo_buf, &PL_langinfo_bufsize, 0); + has_nondigit: - if (toggle) { - do_setlocale_c(LC_NUMERIC, "C"); - } + retval = save_to_buffer(first, &PL_langinfo_buf, + &PL_langinfo_bufsize, offset); + } - LOCALE_UNLOCK; + break; - return PL_langinfo_buf; +# endif +# ifdef HAS_LOCALECONV -# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */ + case CRNCYSTR: - bool do_free = FALSE; - locale_t cur = uselocale((locale_t) 0); + /* We don't bother with localeconv_l() because any system that + * has it is likely to also have nl_langinfo() */ - if (cur == LC_GLOBAL_LOCALE) { - cur = duplocale(LC_GLOBAL_LOCALE); - do_free = TRUE; - } + LOCALE_LOCK_V; /* Prevent interference with other threads + using localeconv() */ - if ( toggle - && (item == PERL_RADIXCHAR || item == PERL_THOUSEP)) - { - cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur); - do_free = TRUE; - } +# ifdef TS_W32_BROKEN_LOCALECONV - save_to_buffer(nl_langinfo_l(item, cur), - &PL_langinfo_buf, &PL_langinfo_bufsize, 0); - if (do_free) { - freelocale(cur); - } + /* This is a workaround for a Windows bug prior to VS 15. + * What we do here is, while locked, switch to the global + * locale so localeconv() works; then switch back just before + * the unlock. This can screw things up if some thread is + * already using the global locale while assuming no other is. + * A different workaround would be to call GetCurrencyFormat on + * a known value, and parse it; patches welcome + * + * We have to use LC_ALL instead of LC_MONETARY because of + * another bug in Windows */ - return PL_langinfo_buf; + save_thread = savepv(my_setlocale(LC_ALL, NULL)); + _configthreadlocale(_DISABLE_PER_THREAD_LOCALE); + save_global= savepv(my_setlocale(LC_ALL, NULL)); + my_setlocale(LC_ALL, save_thread); # endif -#else /* Below, emulate nl_langinfo as best we can */ -# ifdef HAS_LOCALECONV - const struct lconv* lc; - -# endif -# ifdef HAS_STRFTIME + lc = localeconv(); + if ( ! lc + || ! lc->currency_symbol + || strEQ("", lc->currency_symbol)) + { + LOCALE_UNLOCK_V; + return ""; + } - struct tm tm; - bool return_format = FALSE; /* Return the %format, not the value */ - const char * format; + /* Leave the first spot empty to be filled in below */ + retval = save_to_buffer(lc->currency_symbol, &PL_langinfo_buf, + &PL_langinfo_bufsize, 1); + if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, "")) + { /* khw couldn't figure out how the localedef specifications + would show that the $ should replace the radix; this is + just a guess as to how it might work.*/ + PL_langinfo_buf[0] = '.'; + } + else if (lc->p_cs_precedes) { + PL_langinfo_buf[0] = '-'; + } + else { + PL_langinfo_buf[0] = '+'; + } -# endif +# ifdef TS_W32_BROKEN_LOCALECONV - /* We copy the results to a per-thread buffer, even if not multi-threaded. - * This is in part to simplify this code, and partly because we need a - * buffer anyway for strftime(), and partly because a call of localeconv() - * could otherwise wipe out the buffer, and the programmer would not be - * expecting this, as this is a nl_langinfo() substitute after all, so s/he - * might be thinking their localeconv() is safe until another localeconv() - * call. */ + my_setlocale(LC_ALL, save_global); + _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); + my_setlocale(LC_ALL, save_thread); + Safefree(save_global); + Safefree(save_thread); - switch (item) { - Size_t len; - const char * retval; +# endif - /* These 2 are unimplemented */ - case PERL_CODESET: - case PERL_ERA: /* For use with strftime() %E modifier */ + LOCALE_UNLOCK_V; + break; - default: - return ""; +# ifdef TS_W32_BROKEN_LOCALECONV - /* We use only an English set, since we don't know any more */ - case PERL_YESEXPR: return "^[+1yY]"; - case PERL_NOEXPR: return "^[-0nN]"; + case RADIXCHAR: -# ifdef HAS_LOCALECONV + /* For this, we output a known simple floating point number to + * a buffer, and parse it, looking for the radix */ - case PERL_CRNCYSTR: + if (toggle) { + STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); + } - LOCALE_LOCK; + if (PL_langinfo_bufsize < 10) { + PL_langinfo_bufsize = 10; + Renew(PL_langinfo_buf, PL_langinfo_bufsize, char); + } - lc = localeconv(); - if (! lc || ! lc->currency_symbol || strEQ("", lc->currency_symbol)) - { - LOCALE_UNLOCK; - return ""; - } + needed_size = my_snprintf(PL_langinfo_buf, PL_langinfo_bufsize, + "%.1f", 1.5); + if (needed_size >= (int) PL_langinfo_bufsize) { + PL_langinfo_bufsize = needed_size + 1; + Renew(PL_langinfo_buf, PL_langinfo_bufsize, char); + needed_size = my_snprintf(PL_langinfo_buf, PL_langinfo_bufsize, + "%.1f", 1.5); + assert(needed_size < (int) PL_langinfo_bufsize); + } - /* Leave the first spot empty to be filled in below */ - save_to_buffer(lc->currency_symbol, &PL_langinfo_buf, - &PL_langinfo_bufsize, 1); - if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, "")) - { /* khw couldn't figure out how the localedef specifications - would show that the $ should replace the radix; this is - just a guess as to how it might work.*/ - *PL_langinfo_buf = '.'; - } - else if (lc->p_cs_precedes) { - *PL_langinfo_buf = '-'; - } - else { - *PL_langinfo_buf = '+'; - } + ptr = PL_langinfo_buf; + e = PL_langinfo_buf + PL_langinfo_bufsize; - LOCALE_UNLOCK; - break; + /* Find the '1' */ + while (ptr < e && *ptr != '1') { + ptr++; + } + ptr++; - case PERL_RADIXCHAR: - case PERL_THOUSEP: + /* Find the '5' */ + item_start = ptr; + while (ptr < e && *ptr != '5') { + ptr++; + } - LOCALE_LOCK; + /* Everything in between is the radix string */ + if (ptr >= e) { + PL_langinfo_buf[0] = '?'; + PL_langinfo_buf[1] = '\0'; + } + else { + *ptr = '\0'; + Move(item_start, PL_langinfo_buf, ptr - PL_langinfo_buf, char); + } - if (toggle) { - do_setlocale_c(LC_NUMERIC, PL_numeric_name); - } + if (toggle) { + RESTORE_LC_NUMERIC(); + } - lc = localeconv(); - if (! lc) { - retval = ""; - } - else switch (item) { - case PERL_RADIXCHAR: - if (! lc->decimal_point) { - retval = ""; - } - else { - retval = lc->decimal_point; - } - break; + retval = PL_langinfo_buf; + break; - case PERL_THOUSEP: - if (! lc->thousands_sep || strEQ("", lc->thousands_sep)) { - retval = ""; - } - else { - retval = lc->thousands_sep; - } - break; +# else - default: - LOCALE_UNLOCK; - Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem", - __FILE__, __LINE__, item); - } + case RADIXCHAR: /* No special handling needed */ - save_to_buffer(retval, &PL_langinfo_buf, &PL_langinfo_bufsize, 0); +# endif - if (toggle) { - do_setlocale_c(LC_NUMERIC, "C"); - } + case THOUSEP: - LOCALE_UNLOCK; + if (toggle) { + STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); + } - break; + LOCALE_LOCK_V; /* Prevent interference with other threads + using localeconv() */ + +# ifdef TS_W32_BROKEN_LOCALECONV + + /* This should only be for the thousands separator. A + * different work around would be to use GetNumberFormat on a + * known value and parse the result to find the separator */ + save_thread = savepv(my_setlocale(LC_ALL, NULL)); + _configthreadlocale(_DISABLE_PER_THREAD_LOCALE); + save_global = savepv(my_setlocale(LC_ALL, NULL)); + my_setlocale(LC_ALL, save_thread); +# if 0 + /* This is the start of code that for broken Windows replaces + * the above and below code, and instead calls + * GetNumberFormat() and then would parse that to find the + * thousands separator. It needs to handle UTF-16 vs -8 + * issues. */ + + needed_size = GetNumberFormatEx(PL_numeric_name, 0, "1234.5", NULL, PL_langinfo_buf, PL_langinfo_bufsize); + DEBUG_L(PerlIO_printf(Perl_debug_log, + "%s: %d: return from GetNumber, count=%d, val=%s\n", + __FILE__, __LINE__, needed_size, PL_langinfo_buf)); -# endif -# ifdef HAS_STRFTIME +# endif +# endif - /* These are defined by C89, so we assume that strftime supports them, - * and so are returned unconditionally; they may not be what the locale - * actually says, but should give good enough results for someone using - * them as formats (as opposed to trying to parse them to figure out - * what the locale says). The other format items are actually tested to - * verify they work on the platform */ - case PERL_D_FMT: return "%x"; - case PERL_T_FMT: return "%X"; - case PERL_D_T_FMT: return "%c"; - - /* These formats are only available in later strfmtime's */ - case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT: - case PERL_T_FMT_AMPM: - - /* The rest can be gotten from most versions of strftime(). */ - case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3: - case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6: - case PERL_ABDAY_7: - case PERL_ALT_DIGITS: - case PERL_AM_STR: case PERL_PM_STR: - case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3: - case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6: - case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9: - case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12: - case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4: - case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7: - case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4: - case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8: - case PERL_MON_9: case PERL_MON_10: case PERL_MON_11: case PERL_MON_12: + lc = localeconv(); + if (! lc) { + temp = ""; + } + else { + temp = (item == RADIXCHAR) + ? lc->decimal_point + : lc->thousands_sep; + if (! temp) { + temp = ""; + } + } - LOCALE_LOCK; + retval = save_to_buffer(temp, &PL_langinfo_buf, + &PL_langinfo_bufsize, 0); - init_tm(&tm); /* Precaution against core dumps */ - tm.tm_sec = 30; - tm.tm_min = 30; - tm.tm_hour = 6; - tm.tm_year = 2017 - 1900; - tm.tm_wday = 0; - tm.tm_mon = 0; - switch (item) { - default: - LOCALE_UNLOCK; - Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem", - __FILE__, __LINE__, item); - NOT_REACHED; /* NOTREACHED */ - - case PERL_PM_STR: tm.tm_hour = 18; - case PERL_AM_STR: - format = "%p"; - break; +# ifdef TS_W32_BROKEN_LOCALECONV - case PERL_ABDAY_7: tm.tm_wday++; - case PERL_ABDAY_6: tm.tm_wday++; - case PERL_ABDAY_5: tm.tm_wday++; - case PERL_ABDAY_4: tm.tm_wday++; - case PERL_ABDAY_3: tm.tm_wday++; - case PERL_ABDAY_2: tm.tm_wday++; - case PERL_ABDAY_1: - format = "%a"; - break; + my_setlocale(LC_ALL, save_global); + _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); + my_setlocale(LC_ALL, save_thread); + Safefree(save_global); + Safefree(save_thread); - case PERL_DAY_7: tm.tm_wday++; - case PERL_DAY_6: tm.tm_wday++; - case PERL_DAY_5: tm.tm_wday++; - case PERL_DAY_4: tm.tm_wday++; - case PERL_DAY_3: tm.tm_wday++; - case PERL_DAY_2: tm.tm_wday++; - case PERL_DAY_1: - format = "%A"; - break; +# endif - case PERL_ABMON_12: tm.tm_mon++; - case PERL_ABMON_11: tm.tm_mon++; - case PERL_ABMON_10: tm.tm_mon++; - case PERL_ABMON_9: tm.tm_mon++; - case PERL_ABMON_8: tm.tm_mon++; - case PERL_ABMON_7: tm.tm_mon++; - case PERL_ABMON_6: tm.tm_mon++; - case PERL_ABMON_5: tm.tm_mon++; - case PERL_ABMON_4: tm.tm_mon++; - case PERL_ABMON_3: tm.tm_mon++; - case PERL_ABMON_2: tm.tm_mon++; - case PERL_ABMON_1: - format = "%b"; - break; + LOCALE_UNLOCK_V; - case PERL_MON_12: tm.tm_mon++; - case PERL_MON_11: tm.tm_mon++; - case PERL_MON_10: tm.tm_mon++; - case PERL_MON_9: tm.tm_mon++; - case PERL_MON_8: tm.tm_mon++; - case PERL_MON_7: tm.tm_mon++; - case PERL_MON_6: tm.tm_mon++; - case PERL_MON_5: tm.tm_mon++; - case PERL_MON_4: tm.tm_mon++; - case PERL_MON_3: tm.tm_mon++; - case PERL_MON_2: tm.tm_mon++; - case PERL_MON_1: - format = "%B"; - break; + if (toggle) { + RESTORE_LC_NUMERIC(); + } - case PERL_T_FMT_AMPM: - format = "%r"; - return_format = TRUE; - break; + break; - case PERL_ERA_D_FMT: - format = "%Ex"; - return_format = TRUE; - break; +# endif +# ifdef HAS_STRFTIME - case PERL_ERA_T_FMT: - format = "%EX"; - return_format = TRUE; - break; + /* These are defined by C89, so we assume that strftime supports + * them, and so are returned unconditionally; they may not be what + * the locale actually says, but should give good enough results + * for someone using them as formats (as opposed to trying to parse + * them to figure out what the locale says). The other format + * items are actually tested to verify they work on the platform */ + case D_FMT: return "%x"; + case T_FMT: return "%X"; + case D_T_FMT: return "%c"; + + /* These formats are only available in later strfmtime's */ + case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM: + + /* The rest can be gotten from most versions of strftime(). */ + case ABDAY_1: case ABDAY_2: case ABDAY_3: + case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7: + case ALT_DIGITS: + case AM_STR: case PM_STR: + case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4: + case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8: + case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12: + case DAY_1: case DAY_2: case DAY_3: case DAY_4: + case DAY_5: case DAY_6: case DAY_7: + case MON_1: case MON_2: case MON_3: case MON_4: + case MON_5: case MON_6: case MON_7: case MON_8: + case MON_9: case MON_10: case MON_11: case MON_12: + + LOCALE_LOCK; + + init_tm(&tm); /* Precaution against core dumps */ + tm.tm_sec = 30; + tm.tm_min = 30; + tm.tm_hour = 6; + tm.tm_year = 2017 - 1900; + tm.tm_wday = 0; + tm.tm_mon = 0; + switch (item) { + default: + LOCALE_UNLOCK; + Perl_croak(aTHX_ + "panic: %s: %d: switch case: %d problem", + __FILE__, __LINE__, item); + NOT_REACHED; /* NOTREACHED */ + + case PM_STR: tm.tm_hour = 18; + case AM_STR: + format = "%p"; + break; + + case ABDAY_7: tm.tm_wday++; + case ABDAY_6: tm.tm_wday++; + case ABDAY_5: tm.tm_wday++; + case ABDAY_4: tm.tm_wday++; + case ABDAY_3: tm.tm_wday++; + case ABDAY_2: tm.tm_wday++; + case ABDAY_1: + format = "%a"; + break; + + case DAY_7: tm.tm_wday++; + case DAY_6: tm.tm_wday++; + case DAY_5: tm.tm_wday++; + case DAY_4: tm.tm_wday++; + case DAY_3: tm.tm_wday++; + case DAY_2: tm.tm_wday++; + case DAY_1: + format = "%A"; + break; + + case ABMON_12: tm.tm_mon++; + case ABMON_11: tm.tm_mon++; + case ABMON_10: tm.tm_mon++; + case ABMON_9: tm.tm_mon++; + case ABMON_8: tm.tm_mon++; + case ABMON_7: tm.tm_mon++; + case ABMON_6: tm.tm_mon++; + case ABMON_5: tm.tm_mon++; + case ABMON_4: tm.tm_mon++; + case ABMON_3: tm.tm_mon++; + case ABMON_2: tm.tm_mon++; + case ABMON_1: + format = "%b"; + break; + + case MON_12: tm.tm_mon++; + case MON_11: tm.tm_mon++; + case MON_10: tm.tm_mon++; + case MON_9: tm.tm_mon++; + case MON_8: tm.tm_mon++; + case MON_7: tm.tm_mon++; + case MON_6: tm.tm_mon++; + case MON_5: tm.tm_mon++; + case MON_4: tm.tm_mon++; + case MON_3: tm.tm_mon++; + case MON_2: tm.tm_mon++; + case MON_1: + format = "%B"; + break; + + case T_FMT_AMPM: + format = "%r"; + return_format = TRUE; + break; + + case ERA_D_FMT: + format = "%Ex"; + return_format = TRUE; + break; + + case ERA_T_FMT: + format = "%EX"; + return_format = TRUE; + break; + + case ERA_D_T_FMT: + format = "%Ec"; + return_format = TRUE; + break; + + case ALT_DIGITS: + tm.tm_wday = 0; + format = "%Ow"; /* Find the alternate digit for 0 */ + break; + } - case PERL_ERA_D_T_FMT: - format = "%Ec"; - return_format = TRUE; - break; + /* We can't use my_strftime() because it doesn't look at + * tm_wday */ + while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize, + format, &tm)) + { + /* A zero return means one of: + * a) there wasn't enough space in PL_langinfo_buf + * b) the format, like a plain %p, returns empty + * c) it was an illegal format, though some + * implementations of strftime will just return the + * illegal format as a plain character sequence. + * + * To quickly test for case 'b)', try again but precede + * the format with a plain character. If that result is + * still empty, the problem is either 'a)' or 'c)' */ + + Size_t format_size = strlen(format) + 1; + Size_t mod_size = format_size + 1; + char * mod_format; + char * temp_result; + + Newx(mod_format, mod_size, char); + Newx(temp_result, PL_langinfo_bufsize, char); + *mod_format = ' '; + my_strlcpy(mod_format + 1, format, mod_size); + len = strftime(temp_result, + PL_langinfo_bufsize, + mod_format, &tm); + Safefree(mod_format); + Safefree(temp_result); + + /* If 'len' is non-zero, it means that we had a case like + * %p which means the current locale doesn't use a.m. or + * p.m., and that is valid */ + if (len == 0) { + + /* Here, still didn't work. If we get well beyond a + * reasonable size, bail out to prevent an infinite + * loop. */ + + if (PL_langinfo_bufsize > 100 * format_size) { + *PL_langinfo_buf = '\0'; + } + else { + /* Double the buffer size to retry; Add 1 in case + * original was 0, so we aren't stuck at 0. */ + PL_langinfo_bufsize *= 2; + PL_langinfo_bufsize++; + Renew(PL_langinfo_buf, PL_langinfo_bufsize, char); + continue; + } + } - case PERL_ALT_DIGITS: - tm.tm_wday = 0; - format = "%Ow"; /* Find the alternate digit for 0 */ break; - } + } - /* We can't use my_strftime() because it doesn't look at tm_wday */ - while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize, - format, &tm)) - { - /* A zero return means one of: - * a) there wasn't enough space in PL_langinfo_buf - * b) the format, like a plain %p, returns empty - * c) it was an illegal format, though some implementations of - * strftime will just return the illegal format as a plain - * character sequence. + /* Here, we got a result. * - * To quickly test for case 'b)', try again but precede the - * format with a plain character. If that result is still - * empty, the problem is either 'a)' or 'c)' */ - - Size_t format_size = strlen(format) + 1; - Size_t mod_size = format_size + 1; - char * mod_format; - char * temp_result; - - Newx(mod_format, mod_size, char); - Newx(temp_result, PL_langinfo_bufsize, char); - *mod_format = '\a'; - my_strlcpy(mod_format + 1, format, mod_size); - len = strftime(temp_result, - PL_langinfo_bufsize, - mod_format, &tm); - Safefree(mod_format); - Safefree(temp_result); - - /* If 'len' is non-zero, it means that we had a case like %p - * which means the current locale doesn't use a.m. or p.m., and - * that is valid */ - if (len == 0) { - - /* Here, still didn't work. If we get well beyond a - * reasonable size, bail out to prevent an infinite loop. */ - - if (PL_langinfo_bufsize > 100 * format_size) { - *PL_langinfo_buf = '\0'; - } - else { /* Double the buffer size to retry; Add 1 in case - original was 0, so we aren't stuck at 0. */ - PL_langinfo_bufsize *= 2; - PL_langinfo_bufsize++; - Renew(PL_langinfo_buf, PL_langinfo_bufsize, char); - continue; - } + * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the + * alternate format for wday 0. If the value is the same as + * the normal 0, there isn't an alternate, so clear the buffer. + * */ + if ( item == ALT_DIGITS + && strEQ(PL_langinfo_buf, "0")) + { + *PL_langinfo_buf = '\0'; } - break; - } + /* ALT_DIGITS is problematic. Experiments on it showed that + * strftime() did not always work properly when going from + * alt-9 to alt-10. Only a few locales have this item defined, + * and in all of them on Linux that khw was able to find, + * nl_langinfo() merely returned the alt-0 character, possibly + * doubled. Most Unicode digits are in blocks of 10 + * consecutive code points, so that is sufficient information + * for those scripts, as we can infer alt-1, alt-2, .... But + * for a Japanese locale, a CJK ideographic 0 is returned, and + * the CJK digits are not in code point order, so you can't + * really infer anything. The localedef for this locale did + * specify the succeeding digits, so that strftime() works + * properly on them, without needing to infer anything. But + * the nl_langinfo() return did not give sufficient information + * for the caller to understand what's going on. So until + * there is evidence that it should work differently, this + * returns the alt-0 string for ALT_DIGITS. + * + * wday was chosen because its range is all a single digit. + * Things like tm_sec have two digits as the minimum: '00' */ - /* Here, we got a result. - * - * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the - * alternate format for wday 0. If the value is the same as the - * normal 0, there isn't an alternate, so clear the buffer. */ - if ( item == PERL_ALT_DIGITS - && strEQ(PL_langinfo_buf, "0")) - { - *PL_langinfo_buf = '\0'; - } - - /* ALT_DIGITS is problematic. Experiments on it showed that - * strftime() did not always work properly when going from alt-9 to - * alt-10. Only a few locales have this item defined, and in all - * of them on Linux that khw was able to find, nl_langinfo() merely - * returned the alt-0 character, possibly doubled. Most Unicode - * digits are in blocks of 10 consecutive code points, so that is - * sufficient information for those scripts, as we can infer alt-1, - * alt-2, .... But for a Japanese locale, a CJK ideographic 0 is - * returned, and the CJK digits are not in code point order, so you - * can't really infer anything. The localedef for this locale did - * specify the succeeding digits, so that strftime() works properly - * on them, without needing to infer anything. But the - * nl_langinfo() return did not give sufficient information for the - * caller to understand what's going on. So until there is - * evidence that it should work differently, this returns the alt-0 - * string for ALT_DIGITS. - * - * wday was chosen because its range is all a single digit. Things - * like tm_sec have two digits as the minimum: '00' */ + LOCALE_UNLOCK; - LOCALE_UNLOCK; + retval = PL_langinfo_buf; - /* If to return the format, not the value, overwrite the buffer - * with it. But some strftime()s will keep the original format if - * illegal, so change those to "" */ - if (return_format) { - if (strEQ(PL_langinfo_buf, format)) { - *PL_langinfo_buf = '\0'; - } - else { - save_to_buffer(format, &PL_langinfo_buf, - &PL_langinfo_bufsize, 0); + /* If to return the format, not the value, overwrite the buffer + * with it. But some strftime()s will keep the original format + * if illegal, so change those to "" */ + if (return_format) { + if (strEQ(PL_langinfo_buf, format)) { + *PL_langinfo_buf = '\0'; + } + else { + retval = save_to_buffer(format, &PL_langinfo_buf, + &PL_langinfo_bufsize, 0); + } } - } - break; + break; # endif + } } - return PL_langinfo_buf; + return retval; #endif @@ -1719,21 +3169,6 @@ Perl_init_i18nl10n(pTHX_ int printwarn) PERL_UNUSED_ARG(printwarn); #else /* USE_LOCALE */ -# ifdef USE_LOCALE_CTYPE - - char *curctype = NULL; - -# endif /* USE_LOCALE_CTYPE */ -# ifdef USE_LOCALE_COLLATE - - char *curcoll = NULL; - -# endif /* USE_LOCALE_COLLATE */ -# ifdef USE_LOCALE_NUMERIC - - char *curnum = NULL; - -# endif /* USE_LOCALE_NUMERIC */ # ifdef __GLIBC__ const char * const language = savepv(PerlEnv_getenv("LANGUAGE")); @@ -1755,15 +3190,19 @@ Perl_init_i18nl10n(pTHX_ int printwarn) const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG"); const bool locwarn = (printwarn > 1 - || (printwarn - && (! bad_lang_use_once + || ( printwarn + && ( ! bad_lang_use_once || ( - /* disallow with "" or "0" */ - *bad_lang_use_once - && strNE("0", bad_lang_use_once))))); - bool done = FALSE; - char * sl_result; /* return from setlocale() */ - char * locale_param; + /* disallow with "" or "0" */ + *bad_lang_use_once + && strNE("0", bad_lang_use_once))))); + + /* setlocale() return vals; not copied so must be looked at immediately */ + const char * sl_result[NOMINAL_LC_ALL_INDEX + 1]; + + /* current locale for given category; should have been copied so aren't + * volatile */ + const char * curlocales[NOMINAL_LC_ALL_INDEX + 1]; # ifdef WIN32 @@ -1776,7 +3215,10 @@ Perl_init_i18nl10n(pTHX_ int printwarn) const char *system_default_locale = NULL; # endif -# ifdef DEBUGGING + +# ifndef DEBUGGING +# define DEBUG_LOCALE_INIT(a,b,c) +# else DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT"))); @@ -1792,102 +3234,169 @@ Perl_init_i18nl10n(pTHX_ int printwarn) } \ } STMT_END -# else -# define DEBUG_LOCALE_INIT(a,b,c) +/* Make sure the parallel arrays are properly set up */ +# ifdef USE_LOCALE_NUMERIC + assert(categories[LC_NUMERIC_INDEX] == LC_NUMERIC); + assert(strEQ(category_names[LC_NUMERIC_INDEX], "LC_NUMERIC")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_NUMERIC_INDEX] == LC_NUMERIC_MASK); +# endif +# endif +# ifdef USE_LOCALE_CTYPE + assert(categories[LC_CTYPE_INDEX] == LC_CTYPE); + assert(strEQ(category_names[LC_CTYPE_INDEX], "LC_CTYPE")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_CTYPE_INDEX] == LC_CTYPE_MASK); +# endif +# endif +# ifdef USE_LOCALE_COLLATE + assert(categories[LC_COLLATE_INDEX] == LC_COLLATE); + assert(strEQ(category_names[LC_COLLATE_INDEX], "LC_COLLATE")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_COLLATE_INDEX] == LC_COLLATE_MASK); +# endif +# endif +# ifdef USE_LOCALE_TIME + assert(categories[LC_TIME_INDEX] == LC_TIME); + assert(strEQ(category_names[LC_TIME_INDEX], "LC_TIME")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_TIME_INDEX] == LC_TIME_MASK); +# endif +# endif +# ifdef USE_LOCALE_MESSAGES + assert(categories[LC_MESSAGES_INDEX] == LC_MESSAGES); + assert(strEQ(category_names[LC_MESSAGES_INDEX], "LC_MESSAGES")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_MESSAGES_INDEX] == LC_MESSAGES_MASK); +# endif +# endif +# ifdef USE_LOCALE_MONETARY + assert(categories[LC_MONETARY_INDEX] == LC_MONETARY); + assert(strEQ(category_names[LC_MONETARY_INDEX], "LC_MONETARY")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_MONETARY_INDEX] == LC_MONETARY_MASK); +# endif +# endif +# ifdef USE_LOCALE_ADDRESS + assert(categories[LC_ADDRESS_INDEX] == LC_ADDRESS); + assert(strEQ(category_names[LC_ADDRESS_INDEX], "LC_ADDRESS")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_ADDRESS_INDEX] == LC_ADDRESS_MASK); +# endif +# endif +# ifdef USE_LOCALE_IDENTIFICATION + assert(categories[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION); + assert(strEQ(category_names[LC_IDENTIFICATION_INDEX], "LC_IDENTIFICATION")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION_MASK); +# endif +# endif +# ifdef USE_LOCALE_MEASUREMENT + assert(categories[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT); + assert(strEQ(category_names[LC_MEASUREMENT_INDEX], "LC_MEASUREMENT")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT_MASK); +# endif +# endif +# ifdef USE_LOCALE_PAPER + assert(categories[LC_PAPER_INDEX] == LC_PAPER); + assert(strEQ(category_names[LC_PAPER_INDEX], "LC_PAPER")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_PAPER_INDEX] == LC_PAPER_MASK); +# endif +# endif +# ifdef USE_LOCALE_TELEPHONE + assert(categories[LC_TELEPHONE_INDEX] == LC_TELEPHONE); + assert(strEQ(category_names[LC_TELEPHONE_INDEX], "LC_TELEPHONE")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_TELEPHONE_INDEX] == LC_TELEPHONE_MASK); +# endif +# endif +# ifdef LC_ALL + assert(categories[LC_ALL_INDEX] == LC_ALL); + assert(strEQ(category_names[LC_ALL_INDEX], "LC_ALL")); + assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_ALL_INDEX] == LC_ALL_MASK); +# endif +# endif +# endif /* DEBUGGING */ + + /* Initialize the cache of the program's UTF-8ness for the always known + * locales C and POSIX */ + my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness, + sizeof(PL_locale_utf8ness)); + +# ifdef USE_THREAD_SAFE_LOCALE +# ifdef WIN32 + + _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); + +# endif # endif +# ifdef USE_POSIX_2008_LOCALE -# ifndef LOCALE_ENVIRON_REQUIRED + PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0); + if (! PL_C_locale_obj) { + Perl_croak_nocontext( + "panic: Cannot create POSIX 2008 C locale object; errno=%d", errno); + } + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "%s:%d: created C object %p\n", __FILE__, __LINE__, PL_C_locale_obj); + } - PERL_UNUSED_VAR(done); - PERL_UNUSED_VAR(locale_param); +# endif -# else +# ifdef USE_LOCALE_NUMERIC + + PL_numeric_radix_sv = newSVpvs("."); + +# endif + +# if defined(USE_POSIX_2008_LOCALE) && ! defined(HAS_QUERYLOCALE) + + /* Initialize our records. If we have POSIX 2008, we have LC_ALL */ + do_setlocale_c(LC_ALL, my_setlocale(LC_ALL, NULL)); + +# endif +# ifdef LOCALE_ENVIRON_REQUIRED /* * Ultrix setlocale(..., "") fails if there are no environment * variables from which to get a locale name. */ -# ifdef LC_ALL - - if (lang) { - sl_result = do_setlocale_c(LC_ALL, setlocale_init); - DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result); - if (sl_result) - done = TRUE; - else - setlocale_failure = TRUE; - } - if (! setlocale_failure) { - -# ifdef USE_LOCALE_CTYPE - - locale_param = (! done && (lang || PerlEnv_getenv("LC_CTYPE"))) - ? setlocale_init - : NULL; - curctype = do_setlocale_c(LC_CTYPE, locale_param); - DEBUG_LOCALE_INIT(LC_CTYPE, locale_param, sl_result); - if (! curctype) - setlocale_failure = TRUE; - else - curctype = savepv(curctype); - -# endif /* USE_LOCALE_CTYPE */ -# ifdef USE_LOCALE_COLLATE - - locale_param = (! done && (lang || PerlEnv_getenv("LC_COLLATE"))) - ? setlocale_init - : NULL; - curcoll = do_setlocale_c(LC_COLLATE, locale_param); - DEBUG_LOCALE_INIT(LC_COLLATE, locale_param, sl_result); - if (! curcoll) - setlocale_failure = TRUE; - else - curcoll = savepv(curcoll); - -# endif /* USE_LOCALE_COLLATE */ -# ifdef USE_LOCALE_NUMERIC - - locale_param = (! done && (lang || PerlEnv_getenv("LC_NUMERIC"))) - ? setlocale_init - : NULL; - curnum = do_setlocale_c(LC_NUMERIC, locale_param); - DEBUG_LOCALE_INIT(LC_NUMERIC, locale_param, sl_result); - if (! curnum) - setlocale_failure = TRUE; - else - curnum = savepv(curnum); - -# endif /* USE_LOCALE_NUMERIC */ -# ifdef USE_LOCALE_MESSAGES - - locale_param = (! done && (lang || PerlEnv_getenv("LC_MESSAGES"))) - ? setlocale_init - : NULL; - sl_result = do_setlocale_c(LC_MESSAGES, locale_param); - DEBUG_LOCALE_INIT(LC_MESSAGES, locale_param, sl_result); - if (! sl_result) { - setlocale_failure = TRUE; - } - -# endif /* USE_LOCALE_MESSAGES */ -# ifdef USE_LOCALE_MONETARY - - locale_param = (! done && (lang || PerlEnv_getenv("LC_MONETARY"))) - ? setlocale_init - : NULL; - sl_result = do_setlocale_c(LC_MONETARY, locale_param); - DEBUG_LOCALE_INIT(LC_MONETARY, locale_param, sl_result); - if (! sl_result) { - setlocale_failure = TRUE; - } - -# endif /* USE_LOCALE_MONETARY */ +# ifndef LC_ALL +# error Ultrix without LC_ALL not implemented +# else + { + bool done = FALSE; + if (lang) { + sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, setlocale_init); + DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result[LC_ALL_INDEX]); + if (sl_result[LC_ALL_INDEX]) + done = TRUE; + else + setlocale_failure = TRUE; + } + if (! setlocale_failure) { + const char * locale_param; + for (i = 0; i < LC_ALL_INDEX; i++) { + locale_param = (! done && (lang || PerlEnv_getenv(category_names[i]))) + ? setlocale_init + : NULL; + sl_result[i] = do_setlocale_r(categories[i], locale_param); + if (! sl_result[i]) { + setlocale_failure = TRUE; + } + DEBUG_LOCALE_INIT(categories[i], locale_param, sl_result[i]); + } + } } # endif /* LC_ALL */ -# endif /* !LOCALE_ENVIRON_REQUIRED */ +# endif /* LOCALE_ENVIRON_REQUIRED */ /* We try each locale in the list until we get one that works, or exhaust * the list. Normally the loop is executed just once. But if setting the @@ -1907,7 +3416,7 @@ Perl_init_i18nl10n(pTHX_ int printwarn) setlocale_failure = FALSE; # ifdef SYSTEM_DEFAULT_LOCALE -# ifdef WIN32 +# ifdef WIN32 /* Note that assumes Win32 has LC_ALL */ /* On Windows machines, an entry of "" after the 0th means to use * the system default locale, which we now proceed to get. */ @@ -1932,84 +3441,46 @@ Perl_init_i18nl10n(pTHX_ int printwarn) trial_locale = system_default_locale; } -# endif /* WIN32 */ +# else +# error SYSTEM_DEFAULT_LOCALE only implemented for Win32 +# endif # endif /* SYSTEM_DEFAULT_LOCALE */ - } + + } /* For i > 0 */ # ifdef LC_ALL - sl_result = do_setlocale_c(LC_ALL, trial_locale); - DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result); - if (! sl_result) { + sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, trial_locale); + DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result[LC_ALL_INDEX]); + if (! sl_result[LC_ALL_INDEX]) { setlocale_failure = TRUE; } else { /* Since LC_ALL succeeded, it should have changed all the other * categories it can to its value; so we massage things so that the - * setlocales below just return their category's current values. - * This adequately handles the case in NetBSD where LC_COLLATE may - * not be defined for a locale, and setting it individually will - * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to - * the POSIX locale. */ - trial_locale = NULL; - } - -# endif /* LC_ALL */ - - if (!setlocale_failure) { - -# ifdef USE_LOCALE_CTYPE - - Safefree(curctype); - curctype = do_setlocale_c(LC_CTYPE, trial_locale); - DEBUG_LOCALE_INIT(LC_CTYPE, trial_locale, curctype); - if (! curctype) - setlocale_failure = TRUE; - else - curctype = savepv(curctype); - -# endif /* USE_LOCALE_CTYPE */ -# ifdef USE_LOCALE_COLLATE - - Safefree(curcoll); - curcoll = do_setlocale_c(LC_COLLATE, trial_locale); - DEBUG_LOCALE_INIT(LC_COLLATE, trial_locale, curcoll); - if (! curcoll) - setlocale_failure = TRUE; - else - curcoll = savepv(curcoll); - -# endif /* USE_LOCALE_COLLATE */ -# ifdef USE_LOCALE_NUMERIC - - Safefree(curnum); - curnum = do_setlocale_c(LC_NUMERIC, trial_locale); - DEBUG_LOCALE_INIT(LC_NUMERIC, trial_locale, curnum); - if (! curnum) - setlocale_failure = TRUE; - else - curnum = savepv(curnum); - -# endif /* USE_LOCALE_NUMERIC */ -# ifdef USE_LOCALE_MESSAGES - - sl_result = do_setlocale_c(LC_MESSAGES, trial_locale); - DEBUG_LOCALE_INIT(LC_MESSAGES, trial_locale, sl_result); - if (! (sl_result)) - setlocale_failure = TRUE; - -# endif /* USE_LOCALE_MESSAGES */ -# ifdef USE_LOCALE_MONETARY + * setlocales below just return their category's current values. + * This adequately handles the case in NetBSD where LC_COLLATE may + * not be defined for a locale, and setting it individually will + * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to + * the POSIX locale. */ + trial_locale = NULL; + } - sl_result = do_setlocale_c(LC_MONETARY, trial_locale); - DEBUG_LOCALE_INIT(LC_MONETARY, trial_locale, sl_result); - if (! (sl_result)) - setlocale_failure = TRUE; +# endif /* LC_ALL */ -# endif /* USE_LOCALE_MONETARY */ + if (! setlocale_failure) { + unsigned int j; + for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) { + curlocales[j] + = savepv(do_setlocale_r(categories[j], trial_locale)); + if (! curlocales[j]) { + setlocale_failure = TRUE; + } + DEBUG_LOCALE_INIT(categories[j], trial_locale, curlocales[j]); + } - if (! setlocale_failure) { /* Success */ - break; + if (! setlocale_failure) { /* All succeeded */ + break; /* Exit trial_locales loop */ } } @@ -2031,25 +3502,14 @@ Perl_init_i18nl10n(pTHX_ int printwarn) PerlIO_printf(Perl_error_log, "perl: warning: Setting locale failed for the categories:\n\t"); -# ifdef USE_LOCALE_CTYPE - - if (! curctype) - PerlIO_printf(Perl_error_log, "LC_CTYPE "); - -# endif /* USE_LOCALE_CTYPE */ -# ifdef USE_LOCALE_COLLATE - if (! curcoll) - PerlIO_printf(Perl_error_log, "LC_COLLATE "); - -# endif /* USE_LOCALE_COLLATE */ -# ifdef USE_LOCALE_NUMERIC - - if (! curnum) - PerlIO_printf(Perl_error_log, "LC_NUMERIC "); - -# endif /* USE_LOCALE_NUMERIC */ - - PerlIO_printf(Perl_error_log, "and possibly others\n"); + for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) { + if (! curlocales[j]) { + PerlIO_printf(Perl_error_log, category_names[j]); + } + else { + Safefree(curlocales[j]); + } + } # endif /* LC_ALL */ @@ -2185,6 +3645,7 @@ Perl_init_i18nl10n(pTHX_ int printwarn) msg = "Falling back to"; } else { /* fallback failed */ + unsigned int j; /* We dropped off the end of the loop, so have to decrement i to * get back to the value the last time through */ @@ -2195,28 +3656,11 @@ Perl_init_i18nl10n(pTHX_ int printwarn) /* To continue, we should use whatever values we've got */ -# ifdef USE_LOCALE_CTYPE - - Safefree(curctype); - curctype = savepv(do_setlocale_c(LC_CTYPE, NULL)); - DEBUG_LOCALE_INIT(LC_CTYPE, NULL, curctype); - -# endif /* USE_LOCALE_CTYPE */ -# ifdef USE_LOCALE_COLLATE - - Safefree(curcoll); - curcoll = savepv(do_setlocale_c(LC_COLLATE, NULL)); - DEBUG_LOCALE_INIT(LC_COLLATE, NULL, curcoll); - -# endif /* USE_LOCALE_COLLATE */ -# ifdef USE_LOCALE_NUMERIC - - Safefree(curnum); - curnum = savepv(do_setlocale_c(LC_NUMERIC, NULL)); - DEBUG_LOCALE_INIT(LC_NUMERIC, NULL, curnum); - -# endif /* USE_LOCALE_NUMERIC */ - + for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) { + Safefree(curlocales[j]); + curlocales[j] = savepv(do_setlocale_r(categories[j], NULL)); + DEBUG_LOCALE_INIT(categories[j], NULL, curlocales[j]); + } } if (locwarn) { @@ -2253,29 +3697,55 @@ Perl_init_i18nl10n(pTHX_ int printwarn) } } /* End of tried to fallback */ + /* Done with finding the locales; update our records */ + # ifdef USE_LOCALE_CTYPE - new_ctype(curctype); + new_ctype(curlocales[LC_CTYPE_INDEX]); -# endif /* USE_LOCALE_CTYPE */ +# endif # ifdef USE_LOCALE_COLLATE - new_collate(curcoll); + new_collate(curlocales[LC_COLLATE_INDEX]); -# endif /* USE_LOCALE_COLLATE */ +# endif # ifdef USE_LOCALE_NUMERIC - new_numeric(curnum); + new_numeric(curlocales[LC_NUMERIC_INDEX]); + +# endif + + for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + +# if defined(USE_ITHREADS) && ! defined(USE_THREAD_SAFE_LOCALE) + + /* This caches whether each category's locale is UTF-8 or not. This + * may involve changing the locale. It is ok to do this at + * initialization time before any threads have started, but not later + * unless thread-safe operations are used. + * Caching means that if the program heeds our dictate not to change + * locales in threaded applications, this data will remain valid, and + * it may get queried without having to change locales. If the + * environment is such that all categories have the same locale, this + * isn't needed, as the code will not change the locale; but this + * handles the uncommon case where the environment has disparate + * locales for the categories */ + (void) _is_cur_LC_category_utf8(categories[i]); + +# endif + + Safefree(curlocales[i]); + } -# endif /* USE_LOCALE_NUMERIC */ # if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE) /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE - * locale is UTF-8. If PL_utf8locale and PL_unicode (set by -C or by - * $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the - * PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open - * discipline. */ - PL_utf8locale = _is_cur_LC_category_utf8(LC_CTYPE); + * locale is UTF-8. The call to new_ctype() just above has already + * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If + * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE}) + * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on + * STDIN, STDOUT, STDERR, _and_ the default open discipline. */ + PL_utf8locale = PL_in_utf8_CTYPE_locale; /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO. This is an alternative to using the -C command line switch @@ -2288,22 +3758,6 @@ Perl_init_i18nl10n(pTHX_ int printwarn) } # endif -# ifdef USE_LOCALE_CTYPE - - Safefree(curctype); - -# endif /* USE_LOCALE_CTYPE */ -# ifdef USE_LOCALE_COLLATE - - Safefree(curcoll); - -# endif /* USE_LOCALE_COLLATE */ -# ifdef USE_LOCALE_NUMERIC - - Safefree(curnum); - -# endif /* USE_LOCALE_NUMERIC */ - # ifdef __GLIBC__ Safefree(language); @@ -2472,7 +3926,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, } /* End of determining the character that is to replace NULs */ /* If the replacement is variant under UTF-8, it must match the - * UTF8-ness as the original */ + * UTF8-ness of the original */ if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) { this_replacement_char[0] = UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement); @@ -2516,6 +3970,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, /* Make sure the UTF8ness of the string and locale match */ if (utf8 != PL_in_utf8_COLLATE_locale) { + /* XXX convert above Unicode to 10FFFF? */ const char * const t = s; /* Temporary so we can later find where the input was */ @@ -2877,6 +4332,11 @@ S_print_collxfrm_input_and_return(pTHX_ PerlIO_printf(Perl_debug_log, "'\n"); } +# endif /* DEBUGGING */ +#endif /* USE_LOCALE_COLLATE */ +#ifdef USE_LOCALE +# ifdef DEBUGGING + STATIC void S_print_bytes_for_locale(pTHX_ const char * const s, @@ -2913,9 +4373,91 @@ S_print_bytes_for_locale(pTHX_ } # endif /* #ifdef DEBUGGING */ -#endif /* USE_LOCALE_COLLATE */ -#ifdef USE_LOCALE +STATIC const char * +S_switch_category_locale_to_template(pTHX_ const int switch_category, const int template_category, const char * template_locale) +{ + /* Changes the locale for LC_'switch_category" to that of + * LC_'template_category', if they aren't already the same. If not NULL, + * 'template_locale' is the locale that 'template_category' is in. + * + * Returns a copy of the name of the original locale for 'switch_category' + * so can be switched back to with the companion function + * restore_switched_locale(), (NULL if no restoral is necessary.) */ + + char * restore_to_locale = NULL; + + if (switch_category == template_category) { /* No changes needed */ + return NULL; + } + + /* Find the original locale of the category we may need to change, so that + * it can be restored to later */ + restore_to_locale = stdize_locale(savepv(do_setlocale_r(switch_category, + NULL))); + if (! restore_to_locale) { + Perl_croak(aTHX_ + "panic: %s: %d: Could not find current %s locale, errno=%d\n", + __FILE__, __LINE__, category_name(switch_category), errno); + } + + /* If the locale of the template category wasn't passed in, find it now */ + if (template_locale == NULL) { + template_locale = do_setlocale_r(template_category, NULL); + if (! template_locale) { + Perl_croak(aTHX_ + "panic: %s: %d: Could not find current %s locale, errno=%d\n", + __FILE__, __LINE__, category_name(template_category), errno); + } + } + + /* It the locales are the same, there's nothing to do */ + if (strEQ(restore_to_locale, template_locale)) { + Safefree(restore_to_locale); + + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n", + category_name(switch_category), restore_to_locale)); + + return NULL; + } + + /* Finally, change the locale to the template one */ + if (! do_setlocale_r(switch_category, template_locale)) { + Perl_croak(aTHX_ + "panic: %s: %d: Could not change %s locale to %s, errno=%d\n", + __FILE__, __LINE__, category_name(switch_category), + template_locale, errno); + } + + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale switched to %s\n", + category_name(switch_category), template_locale)); + + return restore_to_locale; +} + +STATIC void +S_restore_switched_locale(pTHX_ const int category, const char * const original_locale) +{ + /* Restores the locale for LC_'category' to 'original_locale' (which is a + * copy that will be freed by this function), or do nothing if the latter + * parameter is NULL */ + + if (original_locale == NULL) { + return; + } + + if (! do_setlocale_r(category, original_locale)) { + Perl_croak(aTHX_ + "panic: %s: %d: setlocale %s restore to %s failed, errno=%d\n", + __FILE__, __LINE__, + category_name(category), original_locale, errno); + } + + Safefree(original_locale); +} + +/* is_cur_LC_category_utf8 uses a small char buffer to avoid malloc/free */ +#define CUR_LC_BUFFER_SIZE 64 bool Perl__is_cur_LC_category_utf8(pTHX_ int category) @@ -2926,10 +4468,38 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * could give the wrong result. The result will very likely be correct for * languages that have commonly used non-ASCII characters, but for notably * English, it comes down to if the locale's name ends in something like - * "UTF-8". It errs on the side of not being a UTF-8 locale. */ + * "UTF-8". It errs on the side of not being a UTF-8 locale. + * + * If the platform is early C89, not containing mbtowc(), or we are + * compiled to not pay attention to LC_CTYPE, this employs heuristics. + * These work very well for non-Latin locales or those whose currency + * symbol isn't a '$' nor plain ASCII text. But without LC_CTYPE and at + * least MB_CUR_MAX, English locales with an ASCII currency symbol depend + * on the name containing UTF-8 or not. */ + + /* Name of current locale corresponding to the input category */ + const char *save_input_locale = NULL; + + bool is_utf8 = FALSE; /* The return value */ + + /* The variables below are for the cache of previous lookups using this + * function. The cache is a C string, described at the definition for + * 'C_and_POSIX_utf8ness'. + * + * The first part of the cache is fixed, for the C and POSIX locales. The + * varying part starts just after them. */ + char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness); + + Size_t utf8ness_cache_size; /* Size of the varying portion */ + Size_t input_name_len; /* Length in bytes of save_input_locale */ + Size_t input_name_len_with_overhead; /* plus extra chars used to store + the name in the cache */ + char * delimited; /* The name plus the delimiters used to store + it in the cache */ + char buffer[CUR_LC_BUFFER_SIZE]; /* small buffer */ + char * name_pos; /* position of 'delimited' in the cache, or 0 + if not there */ - char *save_input_locale = NULL; - STRLEN final_pos; # ifdef LC_ALL @@ -2937,418 +4507,373 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) # endif - /* First dispose of the trivial cases */ - save_input_locale = do_setlocale_r(category, NULL); + /* Get the desired category's locale */ + save_input_locale = stdize_locale(savepv(do_setlocale_r(category, NULL))); if (! save_input_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not find current locale for category %d\n", - category)); - return FALSE; /* XXX maybe should croak */ + Perl_croak(aTHX_ + "panic: %s: %d: Could not find current %s locale, errno=%d\n", + __FILE__, __LINE__, category_name(category), errno); } - save_input_locale = stdize_locale(savepv(save_input_locale)); - if (isNAME_C_OR_POSIX(save_input_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Current locale for category %d is %s\n", - category, save_input_locale)); - Safefree(save_input_locale); - return FALSE; + + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Current locale for %s is %s\n", + category_name(category), save_input_locale)); + + input_name_len = strlen(save_input_locale); + + /* In our cache, each name is accompanied by two delimiters and a single + * utf8ness digit */ + input_name_len_with_overhead = input_name_len + 3; + + if ( input_name_len_with_overhead <= CUR_LC_BUFFER_SIZE ) { + /* we can use the buffer, avoid a malloc */ + delimited = buffer; + } else { /* need a malloc */ + /* Allocate and populate space for a copy of the name surrounded by the + * delimiters */ + Newx(delimited, input_name_len_with_overhead, char); } -# if defined(USE_LOCALE_CTYPE) \ - && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET))) + delimited[0] = UTF8NESS_SEP[0]; + Copy(save_input_locale, delimited + 1, input_name_len, char); + delimited[input_name_len+1] = UTF8NESS_PREFIX[0]; + delimited[input_name_len+2] = '\0'; - { /* Next try nl_langinfo or MB_CUR_MAX if available */ + /* And see if that is in the cache */ + name_pos = instr(PL_locale_utf8ness, delimited); + if (name_pos) { + is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0'; - char *save_ctype_locale = NULL; - bool is_utf8; +# ifdef DEBUGGING - if (category != LC_CTYPE) { /* These work only on LC_CTYPE */ + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "UTF8ness for locale %s=%d, \n", + save_input_locale, is_utf8); + } - /* Get the current LC_CTYPE locale */ - save_ctype_locale = do_setlocale_c(LC_CTYPE, NULL); - if (! save_ctype_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not find current locale for LC_CTYPE\n")); - goto cant_use_nllanginfo; - } - save_ctype_locale = stdize_locale(savepv(save_ctype_locale)); +# endif - /* If LC_CTYPE and the desired category use the same locale, this - * means that finding the value for LC_CTYPE is the same as finding - * the value for the desired category. Otherwise, switch LC_CTYPE - * to the desired category's locale */ - if (strEQ(save_ctype_locale, save_input_locale)) { - Safefree(save_ctype_locale); - save_ctype_locale = NULL; - } - else if (! do_setlocale_c(LC_CTYPE, save_input_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not change LC_CTYPE locale to %s\n", - save_input_locale)); - Safefree(save_ctype_locale); - goto cant_use_nllanginfo; - } + /* And, if not already in that position, move it to the beginning of + * the non-constant portion of the list, since it is the most recently + * used. (We don't have to worry about overflow, since just moving + * existing names around) */ + if (name_pos > utf8ness_cache) { + Move(utf8ness_cache, + utf8ness_cache + input_name_len_with_overhead, + name_pos - utf8ness_cache, char); + Copy(delimited, + utf8ness_cache, + input_name_len_with_overhead - 1, char); + utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0'; } - DEBUG_L(PerlIO_printf(Perl_debug_log, "Current LC_CTYPE locale=%s\n", - save_input_locale)); + /* free only when not using the buffer */ + if ( delimited != buffer ) Safefree(delimited); + Safefree(save_input_locale); + return is_utf8; + } + + /* Here we don't have stored the utf8ness for the input locale. We have to + * calculate it */ + +# if defined(USE_LOCALE_CTYPE) \ + && ( defined(HAS_NL_LANGINFO) \ + || (defined(HAS_MBTOWC) || defined(HAS_MBRTOWC))) + + { + const char *original_ctype_locale + = switch_category_locale_to_template(LC_CTYPE, + category, + save_input_locale); /* Here the current LC_CTYPE is set to the locale of the category whose - * information is desired. This means that nl_langinfo() and MB_CUR_MAX + * information is desired. This means that nl_langinfo() and mbtowc() * should give the correct results */ -# if defined(HAS_NL_LANGINFO) && defined(CODESET) - /* The task is easiest if has this POSIX 2001 function */ +# ifdef MB_CUR_MAX /* But we can potentially rule out UTF-8ness, avoiding + calling the functions if we have this */ - { - const char *codeset = my_nl_langinfo(PERL_CODESET, FALSE); + /* Standard UTF-8 needs at least 4 bytes to represent the maximum + * Unicode code point. */ + + DEBUG_L(PerlIO_printf(Perl_debug_log, "%s: %d: MB_CUR_MAX=%d\n", + __FILE__, __LINE__, (int) MB_CUR_MAX)); + if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) { + is_utf8 = FALSE; + restore_switched_locale(LC_CTYPE, original_ctype_locale); + goto finish_and_return; + } + +# endif +# if defined(HAS_NL_LANGINFO) + + { /* The task is easiest if the platform has this POSIX 2001 function. + Except on some platforms it can wrongly return "", so have to have + a fallback. And it can return that it's UTF-8, even if there are + variances from that. For example, Turkish locales may use the + alternate dotted I rules, and sometimes it appears to be a + defective locale definition. XXX We should probably check for + these in the Latin1 range and warn (but on glibc, requires + iswalnum() etc. due to their not handling 80-FF correctly */ + const char *codeset = my_nl_langinfo(CODESET, FALSE); /* FALSE => already in dest locale */ - DEBUG_L(PerlIO_printf(Perl_debug_log, + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\tnllanginfo returned CODESET '%s'\n", codeset)); if (codeset && strNE(codeset, "")) { - /* If we switched LC_CTYPE, switch back */ - if (save_ctype_locale) { - do_setlocale_c(LC_CTYPE, save_ctype_locale); - Safefree(save_ctype_locale); - } - is_utf8 = foldEQ(codeset, STR_WITH_LEN("UTF-8")) - || foldEQ(codeset, STR_WITH_LEN("UTF8")); + /* If the implementation of foldEQ() somehow were + * to change to not go byte-by-byte, this could + * read past end of string, as only one length is + * checked. But currently, a premature NUL will + * compare false, and it will stop there */ + is_utf8 = cBOOL( foldEQ(codeset, STR_WITH_LEN("UTF-8")) + || foldEQ(codeset, STR_WITH_LEN("UTF8"))); DEBUG_L(PerlIO_printf(Perl_debug_log, "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n", codeset, is_utf8)); - Safefree(save_input_locale); - return is_utf8; + restore_switched_locale(LC_CTYPE, original_ctype_locale); + goto finish_and_return; } } # endif -# ifdef MB_CUR_MAX - - /* Here, either we don't have nl_langinfo, or it didn't return a - * codeset. Try MB_CUR_MAX */ - - /* Standard UTF-8 needs at least 4 bytes to represent the maximum - * Unicode code point. Since UTF-8 is the only non-single byte - * encoding we handle, we just say any such encoding is UTF-8, and if - * turns out to be wrong, other things will fail */ - is_utf8 = MB_CUR_MAX >= 4; - - DEBUG_L(PerlIO_printf(Perl_debug_log, - "\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n", - (int) MB_CUR_MAX, is_utf8)); - - Safefree(save_input_locale); - -# ifdef HAS_MBTOWC +# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC) + /* We can see if this is a UTF-8-like locale if have mbtowc(). It was a + * late adder to C89, so very likely to have it. However, testing has + * shown that, like nl_langinfo() above, there are locales that are not + * strictly UTF-8 that this will return that they are */ - /* ... But, most system that have MB_CUR_MAX will also have mbtowc(), - * since they are both in the C99 standard. We can feed a known byte - * string to the latter function, and check that it gives the expected - * result */ - if (is_utf8) { + { wchar_t wc; - PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */ - errno = 0; - if ((size_t)mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8)) - != strlen(HYPHEN_UTF8) - || wc != (wchar_t) 0x2010) - { - is_utf8 = FALSE; - DEBUG_L(PerlIO_printf(Perl_debug_log, "\thyphen=U+%x\n", (unsigned int)wc)); - DEBUG_L(PerlIO_printf(Perl_debug_log, - "\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n", - mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8)), errno)); - } - } - -# endif - - /* If we switched LC_CTYPE, switch back */ - if (save_ctype_locale) { - do_setlocale_c(LC_CTYPE, save_ctype_locale); - Safefree(save_ctype_locale); - } - - return is_utf8; - -# endif - - } - - cant_use_nllanginfo: + int len; + dSAVEDERRNO; -# else /* nl_langinfo should work if available, so don't bother compiling this - fallback code. The final fallback of looking at the name is - compiled, and will be executed if nl_langinfo fails */ +# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS) - /* nl_langinfo not available or failed somehow. Next try looking at the - * currency symbol to see if it disambiguates things. Often that will be - * in the native script, and if the symbol isn't in UTF-8, we know that the - * locale isn't. If it is non-ASCII UTF-8, we infer that the locale is - * too, as the odds of a non-UTF8 string being valid UTF-8 are quite small - * */ + mbstate_t ps; -# ifdef HAS_LOCALECONV -# ifdef USE_LOCALE_MONETARY +# endif - { - char *save_monetary_locale = NULL; - bool only_ascii = FALSE; - bool is_utf8 = FALSE; - struct lconv* lc; + /* mbrtowc() and mbtowc() convert a byte string to a wide + * character. Feed a byte string to one of them and check that the + * result is the expected Unicode code point */ - /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of - * the desired category, if it isn't that locale already */ +# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS) + /* Prefer this function if available, as it's reentrant */ - if (category != LC_MONETARY) { + memset(&ps, 0, sizeof(ps));; + PERL_UNUSED_RESULT(mbrtowc(&wc, NULL, 0, &ps)); /* Reset any shift + state */ + SETERRNO(0, 0); + len = mbrtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8), &ps); + SAVE_ERRNO; - save_monetary_locale = do_setlocale_c(LC_MONETARY, NULL); - if (! save_monetary_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not find current locale for LC_MONETARY\n")); - goto cant_use_monetary; - } - save_monetary_locale = stdize_locale(savepv(save_monetary_locale)); +# else - if (strEQ(save_monetary_locale, save_input_locale)) { - Safefree(save_monetary_locale); - save_monetary_locale = NULL; - } - else if (! do_setlocale_c(LC_MONETARY, save_input_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not change LC_MONETARY locale to %s\n", - save_input_locale)); - Safefree(save_monetary_locale); - goto cant_use_monetary; - } - } + LOCALE_LOCK; + PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */ + SETERRNO(0, 0); + len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8)); + SAVE_ERRNO; + LOCALE_UNLOCK; - /* Here the current LC_MONETARY is set to the locale of the category - * whose information is desired. */ +# endif - lc = localeconv(); - if (! lc - || ! lc->currency_symbol - || is_utf8_invariant_string((U8 *) lc->currency_symbol, 0)) - { - DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); - only_ascii = TRUE; - } - else { - is_utf8 = is_utf8_string((U8 *) lc->currency_symbol, 0); - } + RESTORE_ERRNO; + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "\treturn from mbtowc; len=%d; code_point=%x; errno=%d\n", + len, (unsigned int) wc, GET_ERRNO)); - /* If we changed it, restore LC_MONETARY to its original locale */ - if (save_monetary_locale) { - do_setlocale_c(LC_MONETARY, save_monetary_locale); - Safefree(save_monetary_locale); + is_utf8 = cBOOL( len == STRLENs(REPLACEMENT_CHARACTER_UTF8) + && wc == (wchar_t) UNICODE_REPLACEMENT); } - if (! only_ascii) { +# endif - /* It isn't a UTF-8 locale if the symbol is not legal UTF-8; - * otherwise assume the locale is UTF-8 if and only if the symbol - * is non-ascii UTF-8. */ - DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n", - save_input_locale, is_utf8)); - Safefree(save_input_locale); - return is_utf8; - } + restore_switched_locale(LC_CTYPE, original_ctype_locale); + goto finish_and_return; } - cant_use_monetary: -# endif /* USE_LOCALE_MONETARY */ -# endif /* HAS_LOCALECONV */ +# else -# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME) + /* Here, we must have a C89 compiler that doesn't have mbtowc(). Next + * try looking at the currency symbol to see if it disambiguates + * things. Often that will be in the native script, and if the symbol + * isn't in UTF-8, we know that the locale isn't. If it is non-ASCII + * UTF-8, we infer that the locale is too, as the odds of a non-UTF8 + * string being valid UTF-8 are quite small */ -/* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try - * the names of the months and weekdays, timezone, and am/pm indicator */ - { - char *save_time_locale = NULL; - int hour = 10; - bool is_dst = FALSE; - int dom = 1; - int month = 0; - int i; - char * formatted_time; +# ifdef USE_LOCALE_MONETARY + /* If have LC_MONETARY, we can look at the currency symbol. Often that + * will be in the native script. We do this one first because there is + * just one string to examine, so potentially avoids work */ - /* Like above for LC_MONETARY, we set LC_TIME to the locale of the - * desired category, if it isn't that locale already */ + { + const char *original_monetary_locale + = switch_category_locale_to_template(LC_MONETARY, + category, + save_input_locale); + bool only_ascii = FALSE; + const U8 * currency_string + = (const U8 *) my_nl_langinfo(CRNCYSTR, FALSE); + /* 2nd param not relevant for this item */ + const U8 * first_variant; + + assert( *currency_string == '-' + || *currency_string == '+' + || *currency_string == '.'); + + currency_string++; + + if (is_utf8_invariant_string_loc(currency_string, 0, &first_variant)) + { + DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); + only_ascii = TRUE; + } + else { + is_utf8 = is_strict_utf8_string(first_variant, 0); + } - if (category != LC_TIME) { + restore_switched_locale(LC_MONETARY, original_monetary_locale); - save_time_locale = do_setlocale_c(LC_TIME, NULL); - if (! save_time_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not find current locale for LC_TIME\n")); - goto cant_use_time; - } - save_time_locale = stdize_locale(savepv(save_time_locale)); + if (! only_ascii) { - if (strEQ(save_time_locale, save_input_locale)) { - Safefree(save_time_locale); - save_time_locale = NULL; - } - else if (! do_setlocale_c(LC_TIME, save_input_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not change LC_TIME locale to %s\n", - save_input_locale)); - Safefree(save_time_locale); - goto cant_use_time; + /* It isn't a UTF-8 locale if the symbol is not legal UTF-8; + * otherwise assume the locale is UTF-8 if and only if the symbol + * is non-ascii UTF-8. */ + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n", + save_input_locale, is_utf8)); + goto finish_and_return; } } - /* Here the current LC_TIME is set to the locale of the category - * whose information is desired. Look at all the days of the week and - * month names, and the timezone and am/pm indicator for UTF-8 variant - * characters. The first such a one found will tell us if the locale - * is UTF-8 or not */ +# endif /* USE_LOCALE_MONETARY */ +# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME) - for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */ - formatted_time = my_strftime("%A %B %Z %p", - 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst); - if ( ! formatted_time - || is_utf8_invariant_string((U8 *) formatted_time, 0)) - { + /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try + * the names of the months and weekdays, timezone, and am/pm indicator */ + { + const char *original_time_locale + = switch_category_locale_to_template(LC_TIME, + category, + save_input_locale); + int hour = 10; + bool is_dst = FALSE; + int dom = 1; + int month = 0; + int i; + char * formatted_time; + + /* Here the current LC_TIME is set to the locale of the category + * whose information is desired. Look at all the days of the week and + * month names, and the timezone and am/pm indicator for UTF-8 variant + * characters. The first such a one found will tell us if the locale + * is UTF-8 or not */ + + for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */ + formatted_time = my_strftime("%A %B %Z %p", + 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst); + if ( ! formatted_time + || is_utf8_invariant_string((U8 *) formatted_time, 0)) + { - /* Here, we didn't find a non-ASCII. Try the next time through - * with the complemented dst and am/pm, and try with the next - * weekday. After we have gotten all weekdays, try the next - * month */ - is_dst = ! is_dst; - hour = (hour + 12) % 24; - dom++; - if (i > 6) { - month++; + /* Here, we didn't find a non-ASCII. Try the next time through + * with the complemented dst and am/pm, and try with the next + * weekday. After we have gotten all weekdays, try the next + * month */ + is_dst = ! is_dst; + hour = (hour + 12) % 24; + dom++; + if (i > 6) { + month++; + } + continue; } - continue; - } - /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8; - * false otherwise. But first, restore LC_TIME to its original - * locale if we changed it */ - if (save_time_locale) { - do_setlocale_c(LC_TIME, save_time_locale); - Safefree(save_time_locale); - } + /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8; + * false otherwise. But first, restore LC_TIME to its original + * locale if we changed it */ + restore_switched_locale(LC_TIME, original_time_locale); - DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n", - save_input_locale, - is_utf8_string((U8 *) formatted_time, 0))); - Safefree(save_input_locale); - return is_utf8_string((U8 *) formatted_time, 0); - } + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n", + save_input_locale, + is_utf8_string((U8 *) formatted_time, 0))); + is_utf8 = is_utf8_string((U8 *) formatted_time, 0); + goto finish_and_return; + } - /* Falling off the end of the loop indicates all the names were just - * ASCII. Go on to the next test. If we changed it, restore LC_TIME - * to its original locale */ - if (save_time_locale) { - do_setlocale_c(LC_TIME, save_time_locale); - Safefree(save_time_locale); + /* Falling off the end of the loop indicates all the names were just + * ASCII. Go on to the next test. If we changed it, restore LC_TIME + * to its original locale */ + restore_switched_locale(LC_TIME, original_time_locale); + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); } - DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); - } - cant_use_time: # endif # if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST) -/* This code is ifdefd out because it was found to not be necessary in testing - * on our dromedary test machine, which has over 700 locales. There, this - * added no value to looking at the currency symbol and the time strings. I - * left it in so as to avoid rewriting it if real-world experience indicates - * that dromedary is an outlier. Essentially, instead of returning abpve if we - * haven't found illegal utf8, we continue on and examine all the strerror() - * messages on the platform for utf8ness. If all are ASCII, we still don't - * know the answer; but otherwise we have a pretty good indication of the - * utf8ness. The reason this doesn't help much is that the messages may not - * have been translated into the locale. The currency symbol and time strings - * are much more likely to have been translated. */ - { - int e; - bool is_utf8 = FALSE; - bool non_ascii = FALSE; - char *save_messages_locale = NULL; - const char * errmsg = NULL; - - /* Like above, we set LC_MESSAGES to the locale of the desired - * category, if it isn't that locale already */ - - if (category != LC_MESSAGES) { - - save_messages_locale = do_setlocale_c(LC_MESSAGES, NULL); - if (! save_messages_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not find current locale for LC_MESSAGES\n")); - goto cant_use_messages; + /* This code is ifdefd out because it was found to not be necessary in testing + * on our dromedary test machine, which has over 700 locales. There, this + * added no value to looking at the currency symbol and the time strings. I + * left it in so as to avoid rewriting it if real-world experience indicates + * that dromedary is an outlier. Essentially, instead of returning abpve if we + * haven't found illegal utf8, we continue on and examine all the strerror() + * messages on the platform for utf8ness. If all are ASCII, we still don't + * know the answer; but otherwise we have a pretty good indication of the + * utf8ness. The reason this doesn't help much is that the messages may not + * have been translated into the locale. The currency symbol and time strings + * are much more likely to have been translated. */ + { + int e; + bool non_ascii = FALSE; + const char *original_messages_locale + = switch_category_locale_to_template(LC_MESSAGES, + category, + save_input_locale); + const char * errmsg = NULL; + + /* Here the current LC_MESSAGES is set to the locale of the category + * whose information is desired. Look through all the messages. We + * can't use Strerror() here because it may expand to code that + * segfaults in miniperl */ + + for (e = 0; e <= sys_nerr; e++) { + errno = 0; + errmsg = sys_errlist[e]; + if (errno || !errmsg) { + break; + } + errmsg = savepv(errmsg); + if (! is_utf8_invariant_string((U8 *) errmsg, 0)) { + non_ascii = TRUE; + is_utf8 = is_utf8_string((U8 *) errmsg, 0); + break; + } } - save_messages_locale = stdize_locale(savepv(save_messages_locale)); + Safefree(errmsg); - if (strEQ(save_messages_locale, save_input_locale)) { - Safefree(save_messages_locale); - save_messages_locale = NULL; - } - else if (! do_setlocale_c(LC_MESSAGES, save_input_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not change LC_MESSAGES locale to %s\n", - save_input_locale)); - Safefree(save_messages_locale); - goto cant_use_messages; - } - } + restore_switched_locale(LC_MESSAGES, original_messages_locale); - /* Here the current LC_MESSAGES is set to the locale of the category - * whose information is desired. Look through all the messages. We - * can't use Strerror() here because it may expand to code that - * segfaults in miniperl */ + if (non_ascii) { - for (e = 0; e <= sys_nerr; e++) { - errno = 0; - errmsg = sys_errlist[e]; - if (errno || !errmsg) { - break; - } - errmsg = savepv(errmsg); - if (! is_utf8_invariant_string((U8 *) errmsg, 0)) { - non_ascii = TRUE; - is_utf8 = is_utf8_string((U8 *) errmsg, 0); - break; + /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid, + * any non-ascii means it is one; otherwise we assume it isn't */ + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n", + save_input_locale, + is_utf8)); + goto finish_and_return; } - } - Safefree(errmsg); - - /* And, if we changed it, restore LC_MESSAGES to its original locale */ - if (save_messages_locale) { - do_setlocale_c(LC_MESSAGES, save_messages_locale); - Safefree(save_messages_locale); - } - if (non_ascii) { - - /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid, - * any non-ascii means it is one; otherwise we assume it isn't */ - DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n", - save_input_locale, - is_utf8)); - Safefree(save_input_locale); - return is_utf8; + DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); } - DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); - } - cant_use_messages: - # endif -# endif /* the code that is compiled when no nl_langinfo */ - -# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a +# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a UTF-8 locale */ /* As a last resort, look at the locale name to see if it matches @@ -3358,77 +4883,194 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to * be a UTF-8 locale. Similarly for the other common names */ - final_pos = strlen(save_input_locale) - 1; - if (final_pos >= 3) { - char *name = save_input_locale; + { + const Size_t final_pos = strlen(save_input_locale) - 1; - /* Find next 'U' or 'u' and look from there */ - while ((name += strcspn(name, "Uu") + 1) - <= save_input_locale + final_pos - 2) - { - if ( isALPHA_FOLD_NE(*name, 't') - || isALPHA_FOLD_NE(*(name + 1), 'f')) + if (final_pos >= 3) { + const char *name = save_input_locale; + + /* Find next 'U' or 'u' and look from there */ + while ((name += strcspn(name, "Uu") + 1) + <= save_input_locale + final_pos - 2) { - continue; - } - name += 2; - if (*(name) == '-') { - if ((name > save_input_locale + final_pos - 1)) { - break; + if ( isALPHA_FOLD_NE(*name, 't') + || isALPHA_FOLD_NE(*(name + 1), 'f')) + { + continue; + } + name += 2; + if (*(name) == '-') { + if ((name > save_input_locale + final_pos - 1)) { + break; + } + name++; + } + if (*(name) == '8') { + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Locale %s ends with UTF-8 in name\n", + save_input_locale)); + is_utf8 = TRUE; + goto finish_and_return; } - name++; - } - if (*(name) == '8') { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Locale %s ends with UTF-8 in name\n", - save_input_locale)); - Safefree(save_input_locale); - return TRUE; } + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Locale %s doesn't end with UTF-8 in name\n", + save_input_locale)); } - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Locale %s doesn't end with UTF-8 in name\n", - save_input_locale)); - } -# endif -# ifdef WIN32 +# ifdef WIN32 - /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */ - if (memENDs(save_input_locale, final_pos, "65001")) { - DEBUG_L(PerlIO_printf(Perl_debug_log, + /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */ + if (memENDs(save_input_locale, final_pos, "65001")) { + DEBUG_L(PerlIO_printf(Perl_debug_log, "Locale %s ends with 65001 in name, is UTF-8 locale\n", save_input_locale)); - Safefree(save_input_locale); - return TRUE; + is_utf8 = TRUE; + goto finish_and_return; + } + +# endif } - -# endif +# endif /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But * since we are about to return FALSE anyway, there is no point in doing * this extra work */ -# if 0 +# if 0 if (instr(save_input_locale, "8859")) { DEBUG_L(PerlIO_printf(Perl_debug_log, "Locale %s has 8859 in name, not UTF-8 locale\n", save_input_locale)); - Safefree(save_input_locale); - return FALSE; + is_utf8 = FALSE; + goto finish_and_return; } -# endif +# endif DEBUG_L(PerlIO_printf(Perl_debug_log, "Assuming locale %s is not a UTF-8 locale\n", save_input_locale)); + is_utf8 = FALSE; + +# endif /* the code that is compiled when no modern LC_CTYPE */ + + finish_and_return: + + /* Cache this result so we don't have to go through all this next time. */ + utf8ness_cache_size = sizeof(PL_locale_utf8ness) + - (utf8ness_cache - PL_locale_utf8ness); + + /* But we can't save it if it is too large for the total space available */ + if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) { + Size_t utf8ness_cache_len = strlen(utf8ness_cache); + + /* Here it can fit, but we may need to clear out the oldest cached + * result(s) to do so. Check */ + if (utf8ness_cache_len + input_name_len_with_overhead + >= utf8ness_cache_size) + { + /* Here we have to clear something out to make room for this. + * Start looking at the rightmost place where it could fit and find + * the beginning of the entry that extends past that. */ + char * cutoff = (char *) my_memrchr(utf8ness_cache, + UTF8NESS_SEP[0], + utf8ness_cache_size + - input_name_len_with_overhead); + + assert(cutoff); + assert(cutoff >= utf8ness_cache); + + /* This and all subsequent entries must be removed */ + *cutoff = '\0'; + utf8ness_cache_len = strlen(utf8ness_cache); + } + + /* Make space for the new entry */ + Move(utf8ness_cache, + utf8ness_cache + input_name_len_with_overhead, + utf8ness_cache_len + 1 /* Incl. trailing NUL */, char); + + /* And insert it */ + Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char); + utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0'; + + if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1] + & (PERL_UINTMAX_T) ~1) != '0') + { + Perl_croak(aTHX_ + "panic: %s: %d: Corrupt utf8ness_cache=%s\nlen=%zu," + " inserted_name=%s, its_len=%zu\n", + __FILE__, __LINE__, + PL_locale_utf8ness, strlen(PL_locale_utf8ness), + delimited, input_name_len_with_overhead); + } + } + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST) { + const char * s = PL_locale_utf8ness; + + /* Audit the structure */ + while (s < PL_locale_utf8ness + strlen(PL_locale_utf8ness)) { + const char *e; + + if (*s != UTF8NESS_SEP[0]) { + Perl_croak(aTHX_ + "panic: %s: %d: Corrupt utf8ness_cache: missing" + " separator %.*s<-- HERE %s\n", + __FILE__, __LINE__, + (int) (s - PL_locale_utf8ness), PL_locale_utf8ness, + s); + } + s++; + e = strchr(s, UTF8NESS_PREFIX[0]); + if (! e) { + Perl_croak(aTHX_ + "panic: %s: %d: Corrupt utf8ness_cache: missing" + " separator %.*s<-- HERE %s\n", + __FILE__, __LINE__, + (int) (e - PL_locale_utf8ness), PL_locale_utf8ness, + e); + } + e++; + if (*e != '0' && *e != '1') { + Perl_croak(aTHX_ + "panic: %s: %d: Corrupt utf8ness_cache: utf8ness" + " must be [01] %.*s<-- HERE %s\n", + __FILE__, __LINE__, + (int) (e + 1 - PL_locale_utf8ness), + PL_locale_utf8ness, e + 1); + } + if (ninstr(PL_locale_utf8ness, s, s-1, e)) { + Perl_croak(aTHX_ + "panic: %s: %d: Corrupt utf8ness_cache: entry" + " has duplicate %.*s<-- HERE %s\n", + __FILE__, __LINE__, + (int) (e - PL_locale_utf8ness), PL_locale_utf8ness, + e); + } + s = e + 1; + } + } + + if (DEBUG_Lv_TEST || debug_initialization) { + + PerlIO_printf(Perl_debug_log, + "PL_locale_utf8ness is now %s; returning %d\n", + PL_locale_utf8ness, is_utf8); + } + +# endif + + /* free only when not using the buffer */ + if ( delimited != buffer ) Safefree(delimited); Safefree(save_input_locale); - return FALSE; + return is_utf8; } #endif - bool Perl__is_in_locale_category(pTHX_ const bool compiling, const int category) { @@ -3439,15 +5081,15 @@ Perl__is_in_locale_category(pTHX_ const bool compiling, const int category) const COP * const cop = (compiling) ? &PL_compiling : PL_curcop; - SV *categories = cop_hints_fetch_pvs(cop, "locale", 0); - if (! categories || categories == &PL_sv_placeholder) { + SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0); + if (! these_categories || these_categories == &PL_sv_placeholder) { return FALSE; } /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get * a valid unsigned */ assert(category >= -1); - return cBOOL(SvUV(categories) & (1U << (category + 1))); + return cBOOL(SvUV(these_categories) & (1U << (category + 1))); } char * @@ -3476,63 +5118,88 @@ Perl_my_strerror(pTHX_ const int errnum) const bool within_locale_scope = IN_LC(LC_MESSAGES); -# if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L) - - /* This function is trivial if we have strerror_l() */ +# ifndef USE_ITHREADS + /* This function is trivial without threads. */ if (within_locale_scope) { - errstr = strerror(errnum); + errstr = savepv(strerror(errnum)); } else { - errstr = strerror_l(errnum, PL_C_locale_obj); + const char * save_locale = savepv(do_setlocale_c(LC_MESSAGES, NULL)); + + do_setlocale_c(LC_MESSAGES, "C"); + errstr = savepv(strerror(errnum)); + do_setlocale_c(LC_MESSAGES, save_locale); + Safefree(save_locale); } - errstr = savepv(errstr); +# elif defined(HAS_POSIX_2008_LOCALE) \ + && defined(HAS_STRERROR_L) \ + && defined(HAS_DUPLOCALE) -# else /* Doesn't have strerror_l(). */ + /* This function is also trivial if we don't have to worry about thread + * safety and have strerror_l(), as it handles the switch of locales so we + * don't have to deal with that. We don't have to worry about thread + * safety if strerror_r() is also available. Both it and strerror_l() are + * thread-safe. Plain strerror() isn't thread safe. But on threaded + * builds when strerror_r() is available, the apparent call to strerror() + * below is actually a macro that behind-the-scenes calls strerror_r(). */ -# ifdef USE_POSIX_2008_LOCALE +# ifdef HAS_STRERROR_R - locale_t save_locale = NULL; + if (within_locale_scope) { + errstr = savepv(strerror(errnum)); + } + else { + errstr = savepv(strerror_l(errnum, PL_C_locale_obj)); + } # else - char * save_locale = NULL; - bool locale_is_C = FALSE; + /* Here we have strerror_l(), but not strerror_r() and we are on a + * threaded-build. We use strerror_l() for everything, constructing a + * locale to pass to it if necessary */ - /* We have a critical section to prevent another thread from changing the - * locale out from under us (or zapping the buffer returned from - * setlocale() ) */ - LOCALE_LOCK; + bool do_free = FALSE; + locale_t locale_to_use; -# endif + if (within_locale_scope) { + locale_to_use = uselocale((locale_t) 0); + if (locale_to_use == LC_GLOBAL_LOCALE) { + locale_to_use = duplocale(LC_GLOBAL_LOCALE); + do_free = TRUE; + } + } + else { /* Use C locale if not within 'use locale' scope */ + locale_to_use = PL_C_locale_obj; + } - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "my_strerror called with errnum %d\n", errnum)); - if (! within_locale_scope) { - errno = 0; + errstr = savepv(strerror_l(errnum, locale_to_use)); -# ifdef USE_POSIX_2008_LOCALE /* Use the thread-safe locale functions */ + if (do_free) { + freelocale(locale_to_use); + } - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "Not within locale scope, about to call" - " uselocale(0x%p)\n", PL_C_locale_obj)); - save_locale = uselocale(PL_C_locale_obj); - if (! save_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "uselocale failed, errno=%d\n", errno)); - } - else { - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "uselocale returned 0x%p\n", save_locale)); - } +# endif +# else /* Doesn't have strerror_l() */ + + const char * save_locale = NULL; + bool locale_is_C = FALSE; -# else /* Not thread-safe build */ + /* We have a critical section to prevent another thread from executing this + * same code at the same time. (On thread-safe perls, the LOCK is a + * no-op.) Since this is the only place in core that changes LC_MESSAGES + * (unless the user has called setlocale(), this works to prevent races. */ + LOCALE_LOCK; + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "my_strerror called with errnum %d\n", errnum)); + if (! within_locale_scope) { save_locale = do_setlocale_c(LC_MESSAGES, NULL); if (! save_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "setlocale failed, errno=%d\n", errno)); + Perl_croak(aTHX_ + "panic: %s: %d: Could not find current LC_MESSAGES locale," + " errno=%d\n", __FILE__, __LINE__, errno); } else { locale_is_C = isNAME_C_OR_POSIX(save_locale); @@ -3546,9 +5213,6 @@ Perl_my_strerror(pTHX_ const int errnum) do_setlocale_c(LC_MESSAGES, "C"); } } - -# endif - } /* end of ! within_locale_scope */ else { DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n", @@ -3560,25 +5224,11 @@ Perl_my_strerror(pTHX_ const int errnum) errstr = savepv(Strerror(errnum)); if (! within_locale_scope) { - errno = 0; - -# ifdef USE_POSIX_2008_LOCALE - - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "%s: %d: not within locale scope, restoring the locale\n", - __FILE__, __LINE__)); - if (save_locale && ! uselocale(save_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "uselocale restore failed, errno=%d\n", errno)); - } - } - -# else - if (save_locale && ! locale_is_C) { if (! do_setlocale_c(LC_MESSAGES, save_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "setlocale restore failed, errno=%d\n", errno)); + Perl_croak(aTHX_ + "panic: %s: %d: setlocale restore failed, errno=%d\n", + __FILE__, __LINE__, errno); } Safefree(save_locale); } @@ -3586,11 +5236,8 @@ Perl_my_strerror(pTHX_ const int errnum) LOCALE_UNLOCK; -# endif # endif /* End of doesn't have strerror_l */ -#endif /* End of does have locale messages */ - -#ifdef DEBUGGING +# ifdef DEBUGGING if (DEBUG_Lv_TEST) { PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '"); @@ -3598,7 +5245,8 @@ Perl_my_strerror(pTHX_ const int errnum) PerlIO_printf(Perl_debug_log, "'\n"); } -#endif +# endif +#endif /* End of does have locale messages */ SAVEFREEPV(errstr); return errstr; @@ -3606,48 +5254,200 @@ Perl_my_strerror(pTHX_ const int errnum) /* -=for apidoc sync_locale +=for apidoc switch_to_global_locale -Changing the program's locale should be avoided by XS code. Nevertheless, -certain non-Perl libraries called from XS, such as C do so. When this -happens, Perl needs to be told that the locale has changed. Use this function -to do so, before returning to Perl. +On systems without locale support, or on single-threaded builds, or on +platforms that do not support per-thread locale operations, this function does +nothing. On such systems that do have locale support, only a locale global to +the whole program is available. + +On multi-threaded builds on systems that do have per-thread locale operations, +this function converts the thread it is running in to use the global locale. +This is for code that has not yet or cannot be updated to handle multi-threaded +locale operation. As long as only a single thread is so-converted, everything +works fine, as all the other threads continue to ignore the global one, so only +this thread looks at it. + +However, on Windows systems this isn't quite true prior to Visual Studio 15, +at which point Microsoft fixed a bug. A race can occur if you use the +following operations on earlier Windows platforms: + +=over + +=item L + +=item L, items C and C + +=item L, items C and C + +=back + +The first item is not fixable (except by upgrading to a later Visual Studio +release), but it would be possible to work around the latter two items by using +the Windows API functions C and C; patches +welcome. + +Without this function call, threads that use the L> system +function will not work properly, as all the locale-sensitive functions will +look at the per-thread locale, and C will have no effect on this +thread. + +Perl code should convert to either call +L|perlapi/Perl_setlocale> (which is a drop-in for the system +C) or use the methods given in L to call +L|POSIX/setlocale>. Either one will transparently properly +handle all cases of single- vs multi-thread, POSIX 2008-supported or not. + +Non-Perl libraries, such as C, that call the system C can +continue to work if this function is called before transferring control to the +library. + +Upon return from the code that needs to use the global locale, +L|perlapi/sync_locale> should be called to restore the safe +multi-thread operation. =cut */ void -Perl_sync_locale(pTHX) +Perl_switch_to_global_locale() { - char * newlocale; -#ifdef USE_LOCALE_CTYPE +#ifdef USE_THREAD_SAFE_LOCALE +# ifdef WIN32 + + _configthreadlocale(_DISABLE_PER_THREAD_LOCALE); + +# else +# ifdef HAS_QUERYLOCALE + + setlocale(LC_ALL, querylocale(LC_ALL_MASK, uselocale((locale_t) 0))); + +# else + + { + unsigned int i; + + for (i = 0; i < LC_ALL_INDEX; i++) { + setlocale(categories[i], do_setlocale_r(categories[i], NULL)); + } + } + +# endif + + uselocale(LC_GLOBAL_LOCALE); + +# endif +#endif + +} + +/* + +=for apidoc sync_locale + +L|perlapi/Perl_setlocale> can be used at any time to query or +change the locale (though changing the locale is antisocial and dangerous on +multi-threaded systems that don't have multi-thread safe locale operations. +(See L). Using the system +L> should be avoided. Nevertheless, certain non-Perl libraries +called from XS, such as C do so, and this can't be changed. When the +locale is changed by XS code that didn't use +L|perlapi/Perl_setlocale>, Perl needs to be told that the +locale has changed. Use this function to do so, before returning to Perl. + +The return value is a boolean: TRUE if the global locale at the time of call +was in effect; and FALSE if a per-thread locale was in effect. This can be +used by the caller that needs to restore things as-they-were to decide whether +or not to call +L|perlapi/switch_to_global_locale>. + +=cut +*/ + +bool +Perl_sync_locale() +{ + +#ifndef USE_LOCALE + + return TRUE; + +#else + + const char * newlocale; + dTHX; + +# ifdef USE_POSIX_2008_LOCALE + + bool was_in_global_locale = FALSE; + locale_t cur_obj = uselocale((locale_t) 0); + + /* On Windows, unless the foreign code has turned off the thread-safe + * locale setting, any plain setlocale() will have affected what we see, so + * no need to worry. Otherwise, If the foreign code has done a plain + * setlocale(), it will only affect the global locale on POSIX systems, but + * will affect the */ + if (cur_obj == LC_GLOBAL_LOCALE) { + +# ifdef HAS_QUERY_LOCALE + + do_setlocale_c(LC_ALL, setlocale(LC_ALL, NULL)); + +# else + + unsigned int i; + + /* We can't trust that we can read the LC_ALL format on the + * platform, so do them individually */ + for (i = 0; i < LC_ALL_INDEX; i++) { + do_setlocale_r(categories[i], setlocale(categories[i], NULL)); + } - newlocale = do_setlocale_c(LC_CTYPE, NULL); +# endif + + was_in_global_locale = TRUE; + } + +# else + + bool was_in_global_locale = TRUE; + +# endif +# ifdef USE_LOCALE_CTYPE + + newlocale = savepv(do_setlocale_c(LC_CTYPE, NULL)); DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, setlocale_debug_string(LC_CTYPE, NULL, newlocale))); new_ctype(newlocale); + Safefree(newlocale); -#endif /* USE_LOCALE_CTYPE */ -#ifdef USE_LOCALE_COLLATE +# endif /* USE_LOCALE_CTYPE */ +# ifdef USE_LOCALE_COLLATE - newlocale = do_setlocale_c(LC_COLLATE, NULL); + newlocale = savepv(do_setlocale_c(LC_COLLATE, NULL)); DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, setlocale_debug_string(LC_COLLATE, NULL, newlocale))); new_collate(newlocale); + Safefree(newlocale); -#endif -#ifdef USE_LOCALE_NUMERIC +# endif +# ifdef USE_LOCALE_NUMERIC - newlocale = do_setlocale_c(LC_NUMERIC, NULL); + newlocale = savepv(do_setlocale_c(LC_NUMERIC, NULL)); DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, setlocale_debug_string(LC_NUMERIC, NULL, newlocale))); new_numeric(newlocale); + Safefree(newlocale); -#endif /* USE_LOCALE_NUMERIC */ +# endif /* USE_LOCALE_NUMERIC */ + + return was_in_global_locale; + +#endif } @@ -3669,68 +5469,12 @@ S_setlocale_debug_string(const int category, /* category number, /* initialise to a non-null value to keep it out of BSS and so keep * -DPERL_GLOBAL_STRUCT_PRIVATE happy */ - static char ret[128] = "If you can read this, thank your buggy C" + static char ret[256] = "If you can read this, thank your buggy C" " library strlcpy(), and change your hints file" " to undef it"; - my_strlcpy(ret, "setlocale(", sizeof(ret)); - - switch (category) { - default: - my_snprintf(ret, sizeof(ret), "%s? %d", ret, category); - break; - -# ifdef LC_ALL - - case LC_ALL: - my_strlcat(ret, "LC_ALL", sizeof(ret)); - break; - -# endif -# ifdef LC_CTYPE - - case LC_CTYPE: - my_strlcat(ret, "LC_CTYPE", sizeof(ret)); - break; - -# endif -# ifdef LC_NUMERIC - - case LC_NUMERIC: - my_strlcat(ret, "LC_NUMERIC", sizeof(ret)); - break; - -# endif -# ifdef LC_COLLATE - - case LC_COLLATE: - my_strlcat(ret, "LC_COLLATE", sizeof(ret)); - break; - -# endif -# ifdef LC_TIME - - case LC_TIME: - my_strlcat(ret, "LC_TIME", sizeof(ret)); - break; - -# endif -# ifdef LC_MONETARY - - case LC_MONETARY: - my_strlcat(ret, "LC_MONETARY", sizeof(ret)); - break; - -# endif -# ifdef LC_MESSAGES - - case LC_MESSAGES: - my_strlcat(ret, "LC_MESSAGES", sizeof(ret)); - break; - -# endif - - } + my_strlcpy(ret, "setlocale(", sizeof(ret)); + my_strlcat(ret, category_name(category), sizeof(ret)); my_strlcat(ret, ", ", sizeof(ret)); if (locale) { @@ -3760,6 +5504,58 @@ S_setlocale_debug_string(const int category, /* category number, #endif +void +Perl_thread_locale_init() +{ + /* Called from a thread on startup*/ + +#ifdef USE_THREAD_SAFE_LOCALE + + dTHX_DEBUGGING; + + /* C starts the new thread in the global C locale. If we are thread-safe, + * we want to not be in the global locale */ + + DEBUG_L(PerlIO_printf(Perl_debug_log, + "%s:%d: new thread, initial locale is %s; calling setlocale\n", + __FILE__, __LINE__, setlocale(LC_ALL, NULL))); + +# ifdef WIN32 + + _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); + +# else + + Perl_setlocale(LC_ALL, "C"); + +# endif +#endif + +} + +void +Perl_thread_locale_term() +{ + /* Called from a thread as it gets ready to terminate */ + +#ifdef USE_THREAD_SAFE_LOCALE + + /* C starts the new thread in the global C locale. If we are thread-safe, + * we want to not be in the global locale */ + +# ifndef WIN32 + + { /* Free up */ + locale_t cur_obj = uselocale(LC_GLOBAL_LOCALE); + if (cur_obj != LC_GLOBAL_LOCALE) { + freelocale(cur_obj); + } + } + +# endif +#endif + +} /* * ex: set ts=8 sts=4 sw=4 et: