X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/b1d4925c105b6e865b16a8914b6819899889d639..4d8d465a12b7ff56709b3e3cdfc9377a57f02bed:/locale.c diff --git a/locale.c b/locale.c index 99d42e0..8a3e461 100644 --- a/locale.c +++ b/locale.c @@ -44,41 +44,76 @@ /* If the environment says to, we can output debugging information during * initialization. This is done before option parsing, and before any thread * creation, so can be a file-level static */ -#ifdef DEBUGGING -# ifdef PERL_GLOBAL_STRUCT - /* no global syms allowed */ -# define debug_initialization 0 -# define DEBUG_INITIALIZATION_set(v) -# else +#if ! defined(DEBUGGING) || defined(PERL_GLOBAL_STRUCT) +# define debug_initialization 0 +# define DEBUG_INITIALIZATION_set(v) +#else static bool debug_initialization = FALSE; -# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v) -# endif +# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v) #endif -/* strlen() of a literal string constant. XXX We might want this more general, - * but using it in just this file for now */ +/* strlen() of a literal string constant. We might want this more general, + * but using it in just this file for now. A problem with more generality is + * the compiler warnings about comparing unlike signs */ #define STRLENs(s) (sizeof("" s "") - 1) +/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the + * return of setlocale(), then this is extremely likely to be the C or POSIX + * locale. However, the output of setlocale() is documented to be opaque, but + * the odds are extremely small that it would return these two strings for some + * other locale. Note that VMS in these two locales includes many non-ASCII + * characters as controls and punctuation (below are hex bytes): + * cntrl: 84-97 9B-9F + * punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD + * Oddly, none there are listed as alphas, though some represent alphabetics + * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */ +#define isNAME_C_OR_POSIX(name) \ + ( (name) != NULL \ + && (( *(name) == 'C' && (*(name + 1)) == '\0') \ + || strEQ((name), "POSIX"))) + #ifdef USE_LOCALE -/* - * Standardize the locale name from a string returned by 'setlocale', possibly - * modifying that string. - * - * The typical return value of setlocale() is either - * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL - * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL - * (the space-separated values represent the various sublocales, - * in some unspecified order). This is not handled by this function. +/* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far + * looked up. This is in the form of a C string: */ + +#define UTF8NESS_SEP "\v" +#define UTF8NESS_PREFIX "\f" + +/* So, the string looks like: * - * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n", - * which is harmful for further use of the string in setlocale(). This - * function removes the trailing new line and everything up through the '=' + * \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0 * - */ + * where the digit 0 after the \a indicates that the locale starting just + * after the preceding \v is not UTF-8, and the digit 1 mean it is. */ + +STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1); +STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1); + +#define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \ + UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0" + +/* The cache is initialized to C_and_POSIX_utf8ness at start up. These are + * kept there always. The remining portion of the cache is LRU, with the + * oldest looked-up locale at the tail end */ + STATIC char * S_stdize_locale(pTHX_ char *locs) { + /* Standardize the locale name from a string returned by 'setlocale', + * possibly modifying that string. + * + * The typical return value of setlocale() is either + * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL + * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL + * (the space-separated values represent the various sublocales, + * in some unspecified order). This is not handled by this function. + * + * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n", + * which is harmful for further use of the string in setlocale(). This + * function removes the trailing new line and everything up through the '=' + * */ + const char * const s = strchr(locs, '='); bool okay = TRUE; @@ -104,8 +139,234 @@ S_stdize_locale(pTHX_ char *locs) return locs; } +/* Two parallel arrays; first the locale categories Perl uses on this system; + * the second array is their names. These arrays are in mostly arbitrary + * order. */ + +const int categories[] = { + +# ifdef USE_LOCALE_NUMERIC + LC_NUMERIC, +# endif +# ifdef USE_LOCALE_CTYPE + LC_CTYPE, +# endif +# ifdef USE_LOCALE_COLLATE + LC_COLLATE, +# endif +# ifdef USE_LOCALE_TIME + LC_TIME, +# endif +# ifdef USE_LOCALE_MESSAGES + LC_MESSAGES, +# endif +# ifdef USE_LOCALE_MONETARY + LC_MONETARY, +# endif +# ifdef USE_LOCALE_ADDRESS + LC_ADDRESS, +# endif +# ifdef USE_LOCALE_IDENTIFICATION + LC_IDENTIFICATION, +# endif +# ifdef USE_LOCALE_MEASUREMENT + LC_MEASUREMENT, +# endif +# ifdef USE_LOCALE_PAPER + LC_PAPER, +# endif +# ifdef USE_LOCALE_TELEPHONE + LC_TELEPHONE, +# endif +# ifdef LC_ALL + LC_ALL, +# endif + -1 /* Placeholder because C doesn't allow a + trailing comma, and it would get complicated + with all the #ifdef's */ +}; + +/* The top-most real element is LC_ALL */ + +const char * category_names[] = { + +# ifdef USE_LOCALE_NUMERIC + "LC_NUMERIC", +# endif +# ifdef USE_LOCALE_CTYPE + "LC_CTYPE", +# endif +# ifdef USE_LOCALE_COLLATE + "LC_COLLATE", +# endif +# ifdef USE_LOCALE_TIME + "LC_TIME", +# endif +# ifdef USE_LOCALE_MESSAGES + "LC_MESSAGES", +# endif +# ifdef USE_LOCALE_MONETARY + "LC_MONETARY", +# endif +# ifdef USE_LOCALE_ADDRESS + "LC_ADDRESS", +# endif +# ifdef USE_LOCALE_IDENTIFICATION + "LC_IDENTIFICATION", +# endif +# ifdef USE_LOCALE_MEASUREMENT + "LC_MEASUREMENT", +# endif +# ifdef USE_LOCALE_PAPER + "LC_PAPER", +# endif +# ifdef USE_LOCALE_TELEPHONE + "LC_TELEPHONE", +# endif +# ifdef LC_ALL + "LC_ALL", +# endif + NULL /* Placeholder */ + }; + +# ifdef LC_ALL + + /* On systems with LC_ALL, it is kept in the highest index position. (-2 + * to account for the final unused placeholder element.) */ +# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2) + +# else + + /* On systems without LC_ALL, we pretend it is there, one beyond the real + * top element, hence in the unused placeholder element. */ +# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1) + +# endif + +/* Pretending there is an LC_ALL element just above allows us to avoid most + * special cases. Most loops through these arrays in the code below are + * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'. They will work + * on either type of system. But the code must be written to not access the + * element at 'LC_ALL_INDEX' except on platforms that have it. This can be + * checked for at compile time by using the #define LC_ALL_INDEX which is only + * defined if we do have LC_ALL. */ + +STATIC const char * +S_category_name(const int category) +{ + unsigned int i; + +#ifdef LC_ALL + + if (category == LC_ALL) { + return "LC_ALL"; + } + #endif + for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + if (category == categories[i]) { + return category_names[i]; + } + } + + { + const char suffix[] = " (unknown)"; + int temp = category; + Size_t length = sizeof(suffix) + 1; + char * unknown; + dTHX; + + if (temp < 0) { + length++; + temp = - temp; + } + + /* Calculate the number of digits */ + while (temp >= 10) { + temp /= 10; + length++; + } + + Newx(unknown, length, char); + my_snprintf(unknown, length, "%d%s", category, suffix); + SAVEFREEPV(unknown); + return unknown; + } +} + +/* Now create LC_foo_INDEX #defines for just those categories on this system */ +# ifdef USE_LOCALE_NUMERIC +# define LC_NUMERIC_INDEX 0 +# define _DUMMY_NUMERIC LC_NUMERIC_INDEX +# else +# define _DUMMY_NUMERIC -1 +# endif +# ifdef USE_LOCALE_CTYPE +# define LC_CTYPE_INDEX _DUMMY_NUMERIC + 1 +# define _DUMMY_CTYPE LC_CTYPE_INDEX +# else +# define _DUMMY_CTYPE _DUMMY_NUMERIC +# endif +# ifdef USE_LOCALE_COLLATE +# define LC_COLLATE_INDEX _DUMMY_CTYPE + 1 +# define _DUMMY_COLLATE LC_COLLATE_INDEX +# else +# define _DUMMY_COLLATE _DUMMY_COLLATE +# endif +# ifdef USE_LOCALE_TIME +# define LC_TIME_INDEX _DUMMY_COLLATE + 1 +# define _DUMMY_TIME LC_TIME_INDEX +# else +# define _DUMMY_TIME _DUMMY_COLLATE +# endif +# ifdef USE_LOCALE_MESSAGES +# define LC_MESSAGES_INDEX _DUMMY_TIME + 1 +# define _DUMMY_MESSAGES LC_MESSAGES_INDEX +# else +# define _DUMMY_MESSAGES _DUMMY_TIME +# endif +# ifdef USE_LOCALE_MONETARY +# define LC_MONETARY_INDEX _DUMMY_MESSAGES + 1 +# define _DUMMY_MONETARY LC_MONETARY_INDEX +# else +# define _DUMMY_MONETARY _DUMMY_MESSAGES +# endif +# ifdef USE_LOCALE_ADDRESS +# define LC_ADDRESS_INDEX _DUMMY_MONETARY + 1 +# define _DUMMY_ADDRESS LC_ADDRESS_INDEX +# else +# define _DUMMY_ADDRESS _DUMMY_MONETARY +# endif +# ifdef USE_LOCALE_IDENTIFICATION +# define LC_IDENTIFICATION_INDEX _DUMMY_ADDRESS + 1 +# define _DUMMY_IDENTIFICATION LC_IDENTIFICATION_INDEX +# else +# define _DUMMY_IDENTIFICATION _DUMMY_ADDRESS +# endif +# ifdef USE_LOCALE_MEASUREMENT +# define LC_MEASUREMENT_INDEX _DUMMY_IDENTIFICATION + 1 +# define _DUMMY_MEASUREMENT LC_MEASUREMENT_INDEX +# else +# define _DUMMY_MEASUREMENT _DUMMY_IDENTIFICATION +# endif +# ifdef USE_LOCALE_PAPER +# define LC_PAPER_INDEX _DUMMY_MEASUREMENT + 1 +# define _DUMMY_PAPER LC_PAPER_INDEX +# else +# define _DUMMY_PAPER _DUMMY_MEASUREMENT +# endif +# ifdef USE_LOCALE_TELEPHONE +# define LC_TELEPHONE_INDEX _DUMMY_PAPER + 1 +# define _DUMMY_TELEPHONE LC_TELEPHONE_INDEX +# else +# define _DUMMY_TELEPHONE _DUMMY_PAPER +# endif +# ifdef LC_ALL +# define LC_ALL_INDEX _DUMMY_TELEPHONE + 1 +# endif +#endif /* ifdef USE_LOCALE */ + /* Windows requres a customized base-level setlocale() */ # ifdef WIN32 # define my_setlocale(cat, locale) win32_setlocale(cat, locale) @@ -131,9 +392,10 @@ S_set_numeric_radix(pTHX_ const bool use_locale) if (use_locale) { const char * radix = my_nl_langinfo(PERL_RADIXCHAR, FALSE); /* FALSE => already in dest locale */ - /* ... and the character being used isn't a dot */ if (strNE(radix, ".")) { + const U8 * first_variant; + if (PL_numeric_radix_sv) { sv_setpv(PL_numeric_radix_sv, radix); } @@ -141,10 +403,17 @@ S_set_numeric_radix(pTHX_ const bool use_locale) PL_numeric_radix_sv = newSVpv(radix, 0); } - if ( ! is_utf8_invariant_string( - (U8 *) SvPVX(PL_numeric_radix_sv), SvCUR(PL_numeric_radix_sv)) - && is_utf8_string( - (U8 *) SvPVX(PL_numeric_radix_sv), SvCUR(PL_numeric_radix_sv)) + /* If there is a byte variant under UTF-8, and if the remainder of + * the string starting there is valid UTF-8, and we are in a UTF-8 + * locale, then mark the radix as being in UTF-8 */ + if ( ! is_utf8_invariant_string_loc( + (U8 *) SvPVX(PL_numeric_radix_sv), + SvCUR(PL_numeric_radix_sv), + &first_variant) + && is_utf8_string(first_variant, + SvCUR(PL_numeric_radix_sv) + - ((char *) first_variant + - SvPVX(PL_numeric_radix_sv))) && _is_cur_LC_category_utf8(LC_NUMERIC)) { SvUTF8_on(PL_numeric_radix_sv); @@ -175,20 +444,6 @@ S_set_numeric_radix(pTHX_ const bool use_locale) } -/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the - * return of setlocale(), then this is extremely likely to be the C or POSIX - * locale. However, the output of setlocale() is documented to be opaque, but - * the odds are extremely small that it would return these two strings for some - * other locale. Note that VMS in these two locales includes many non-ASCII - * characters as controls and punctuation (below are hex bytes): - * cntrl: 84-97 9B-9F - * punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD - * Oddly, none there are listed as alphas, though some represent alphabetics - * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */ -#define isNAME_C_OR_POSIX(name) \ - ( (name) != NULL \ - && (( *(name) == 'C' && (*(name + 1)) == '\0') \ - || strEQ((name), "POSIX"))) void Perl_new_numeric(pTHX_ const char *newnum) @@ -200,7 +455,7 @@ Perl_new_numeric(pTHX_ const char *newnum) #else - /* Called after all libc setlocale() calls affecting LC_NUMERIC, to tell + /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell * core Perl this and that 'newnum' is the name of the new locale. * It installs this locale as the current underlying default. * @@ -219,13 +474,17 @@ Perl_new_numeric(pTHX_ const char *newnum) * that the current locale is the program's underlying * locale * PL_numeric_standard An int indicating if the toggled state is such - * that the current locale is the C locale. If non-zero, - * it is in C; if > 1, it means it may not be toggled away + * that the current locale is the C locale or + * indistinguishable from the C locale. If non-zero, it + * is in C; if > 1, it means it may not be toggled away * from C. - * Note that both of the last two variables can be true at the same time, - * if the underlying locale is C. (Toggling is a no-op under these - * circumstances.) - * + * PL_numeric_underlying_is_standard A bool kept by this function + * indicating that the underlying locale and the standard + * C locale are indistinguishable for the purposes of + * LC_NUMERIC. This happens when both of the above two + * variables are true at the same time. (Toggling is a + * no-op under these circumstances.) This variable is + * used to avoid having to recalculate. * Any code changing the locale (outside this file) should use * POSIX::setlocale, which calls this function. Therefore this function * should be called directly only from this file and from @@ -238,14 +497,24 @@ Perl_new_numeric(pTHX_ const char *newnum) PL_numeric_name = NULL; PL_numeric_standard = TRUE; PL_numeric_underlying = TRUE; + PL_numeric_underlying_is_standard = TRUE; return; } save_newnum = stdize_locale(savepv(newnum)); - - PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum); PL_numeric_underlying = TRUE; + PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum); + /* If its name isn't C nor POSIX, it could still be indistinguishable from + * them */ + if (! PL_numeric_standard) { + PL_numeric_standard = cBOOL(strEQ(".", my_nl_langinfo(PERL_RADIXCHAR, + FALSE /* Don't toggle locale */ )) + && strEQ("", my_nl_langinfo(PERL_THOUSEP, + FALSE))); + } + + /* Save the new name if it isn't the same as the previous one, if any */ if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) { Safefree(PL_numeric_name); PL_numeric_name = save_newnum; @@ -254,6 +523,12 @@ Perl_new_numeric(pTHX_ const char *newnum) Safefree(save_newnum); } + PL_numeric_underlying_is_standard = PL_numeric_standard; + + if (DEBUG_L_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "Called new_numeric with %s, PL_numeric_name=%s\n", newnum, PL_numeric_name); + } + /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't * have to worry about the radix being a non-dot. (Core operations that * need the underlying locale change to it temporarily). */ @@ -277,14 +552,14 @@ Perl_set_numeric_standard(pTHX) do_setlocale_c(LC_NUMERIC, "C"); PL_numeric_standard = TRUE; - PL_numeric_underlying = isNAME_C_OR_POSIX(PL_numeric_name); + PL_numeric_underlying = PL_numeric_underlying_is_standard; set_numeric_radix(0); # ifdef DEBUGGING if (DEBUG_L_TEST || debug_initialization) { PerlIO_printf(Perl_debug_log, - "Underlying LC_NUMERIC locale now is C\n"); + "LC_NUMERIC locale now is standard C\n"); } # endif @@ -305,15 +580,15 @@ Perl_set_numeric_underlying(pTHX) * wrong if some XS code has changed the locale behind our back) */ do_setlocale_c(LC_NUMERIC, PL_numeric_name); - PL_numeric_standard = isNAME_C_OR_POSIX(PL_numeric_name); + PL_numeric_standard = PL_numeric_underlying_is_standard; PL_numeric_underlying = TRUE; - set_numeric_radix(1); + set_numeric_radix(! PL_numeric_standard); # ifdef DEBUGGING if (DEBUG_L_TEST || debug_initialization) { PerlIO_printf(Perl_debug_log, - "Underlying LC_NUMERIC locale now is %s\n", + "LC_NUMERIC locale now is %s\n", PL_numeric_name); } @@ -337,7 +612,7 @@ S_new_ctype(pTHX_ const char *newctype) #else - /* Called after all libc setlocale() calls affecting LC_CTYPE, to tell + /* Called after each libc setlocale() call affecting LC_CTYPE, to tell * core Perl this and that 'newctype' is the name of the new locale. * * This function sets up the folding arrays for all 256 bytes, assuming @@ -382,10 +657,10 @@ S_new_ctype(pTHX_ const char *newctype) unsigned int bad_count = 0; /* Count of bad characters */ for (i = 0; i < 256; i++) { - if (isUPPER_LC((U8) i)) - PL_fold_locale[i] = (U8) toLOWER_LC((U8) i); - else if (isLOWER_LC((U8) i)) - PL_fold_locale[i] = (U8) toUPPER_LC((U8) i); + if (isupper(i)) + PL_fold_locale[i] = (U8) tolower(i); + else if (islower(i)) + PL_fold_locale[i] = (U8) toupper(i); else PL_fold_locale[i] = (U8) i; @@ -401,10 +676,19 @@ S_new_ctype(pTHX_ const char *newctype) if ( check_for_problems && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n')) { - if (( isALPHANUMERIC_A(i) && ! isALPHANUMERIC_LC(i)) - || (isPUNCT_A(i) && ! isPUNCT_LC(i)) - || (isBLANK_A(i) && ! isBLANK_LC(i)) - || (i == '\n' && ! isCNTRL_LC(i))) + if ( cBOOL(isalnum(i)) != cBOOL(isALPHANUMERIC(i)) + || cBOOL(isalpha(i)) != cBOOL(isALPHA_A(i)) + || cBOOL(isdigit(i)) != cBOOL(isDIGIT_A(i)) + || cBOOL(isgraph(i)) != cBOOL(isGRAPH_A(i)) + || cBOOL(islower(i)) != cBOOL(isLOWER_A(i)) + || cBOOL(isprint(i)) != cBOOL(isPRINT_A(i)) + || cBOOL(ispunct(i)) != cBOOL(isPUNCT_A(i)) + || cBOOL(isspace(i)) != cBOOL(isSPACE_A(i)) + || cBOOL(isupper(i)) != cBOOL(isUPPER_A(i)) + || cBOOL(isxdigit(i))!= cBOOL(isXDIGIT_A(i)) + || tolower(i) != (int) toLOWER_A(i) + || toupper(i) != (int) toUPPER_A(i) + || (i == '\n' && ! isCNTRL_LC(i))) { if (bad_count) { /* Separate multiple entries with a blank */ @@ -517,11 +801,9 @@ Perl__warn_problematic_locale() * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */ if (PL_warn_locale) { - /*GCC_DIAG_IGNORE(-Wformat-security); Didn't work */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0 /* dummy to avoid compiler warning */ ); - /* GCC_DIAG_RESTORE; */ SvREFCNT_dec_NN(PL_warn_locale); PL_warn_locale = NULL; } @@ -541,7 +823,7 @@ S_new_collate(pTHX_ const char *newcoll) #else - /* Called after all libc setlocale() calls affecting LC_COLLATE, to tell + /* Called after each libc setlocale() call affecting LC_COLLATE, to tell * core Perl this and that 'newcoll' is the name of the new locale. * * The design of locale collation is that every locale change is given an @@ -773,6 +1055,7 @@ S_win32_setlocale(pTHX_ int category, const char* locale) bool override_LC_ALL = FALSE; char * result; + unsigned int i; if (locale && strEQ(locale, "")) { @@ -780,73 +1063,30 @@ S_win32_setlocale(pTHX_ int category, const char* locale) locale = PerlEnv_getenv("LC_ALL"); if (! locale) { + if (category == LC_ALL) { + override_LC_ALL = TRUE; + } + else { # endif - switch (category) { - -# ifdef LC_ALL - case LC_ALL: - override_LC_ALL = TRUE; - break; /* We already know its variable isn't set */ - -# endif -# ifdef USE_LOCALE_TIME - - case LC_TIME: - locale = PerlEnv_getenv("LC_TIME"); - break; - -# endif -# ifdef USE_LOCALE_CTYPE - - case LC_CTYPE: - locale = PerlEnv_getenv("LC_CTYPE"); - break; - -# endif -# ifdef USE_LOCALE_COLLATE - - case LC_COLLATE: - locale = PerlEnv_getenv("LC_COLLATE"); - break; - -# endif -# ifdef USE_LOCALE_MONETARY - - case LC_MONETARY: - locale = PerlEnv_getenv("LC_MONETARY"); - break; - -# endif -# ifdef USE_LOCALE_NUMERIC - - case LC_NUMERIC: - locale = PerlEnv_getenv("LC_NUMERIC"); - break; - -# endif -# ifdef USE_LOCALE_MESSAGES - - case LC_MESSAGES: - locale = PerlEnv_getenv("LC_MESSAGES"); - break; -# endif + for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + if (category == categories[i]) { + locale = PerlEnv_getenv(category_names[i]); + goto found_locale; + } + } - default: - /* This is a category, like PAPER_SIZE that we don't - * know about; and so can't provide a wrapper. */ - break; - } - if (! locale) { locale = PerlEnv_getenv("LANG"); if (! locale) { locale = ""; } - } + + found_locale: ; # ifdef LC_ALL + } } # endif @@ -867,73 +1107,16 @@ S_win32_setlocale(pTHX_ int category, const char* locale) * one that is set. (If they are set to "", it means to use the same thing * we just set LC_ALL to, so can skip) */ -# ifdef USE_LOCALE_TIME - - result = PerlEnv_getenv("LC_TIME"); - if (result && strNE(result, "")) { - setlocale(LC_TIME, result); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - setlocale_debug_string(LC_TIME, result, "not captured"))); - } - -# endif -# ifdef USE_LOCALE_CTYPE - - result = PerlEnv_getenv("LC_CTYPE"); - if (result && strNE(result, "")) { - setlocale(LC_CTYPE, result); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - setlocale_debug_string(LC_CTYPE, result, "not captured"))); - } - -# endif -# ifdef USE_LOCALE_COLLATE - - result = PerlEnv_getenv("LC_COLLATE"); - if (result && strNE(result, "")) { - setlocale(LC_COLLATE, result); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - setlocale_debug_string(LC_COLLATE, result, "not captured"))); - } - -# endif -# ifdef USE_LOCALE_MONETARY - - result = PerlEnv_getenv("LC_MONETARY"); - if (result && strNE(result, "")) { - setlocale(LC_MONETARY, result); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - setlocale_debug_string(LC_MONETARY, result, "not captured"))); - } - -# endif -# ifdef USE_LOCALE_NUMERIC - - result = PerlEnv_getenv("LC_NUMERIC"); - if (result && strNE(result, "")) { - setlocale(LC_NUMERIC, result); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - setlocale_debug_string(LC_NUMERIC, result, "not captured"))); - } - -# endif -# ifdef USE_LOCALE_MESSAGES - - result = PerlEnv_getenv("LC_MESSAGES"); - if (result && strNE(result, "")) { - setlocale(LC_MESSAGES, result); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - setlocale_debug_string(LC_MESSAGES, result, "not captured"))); + for (i = 0; i < LC_ALL_INDEX; i++) { + result = PerlEnv_getenv(category_names[i]); + if (result && strNE(result, "")) { + setlocale(categories[i], result); + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", + __FILE__, __LINE__, + setlocale_debug_string(categories[i], result, "not captured"))); + } } -# endif - result = setlocale(LC_ALL, NULL); DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, @@ -955,11 +1138,10 @@ Perl_setlocale(int category, const char * locale) #ifdef USE_LOCALE_NUMERIC - /* A NULL locale means only query what the current one is. We - * have the LC_NUMERIC name saved, because we are normally switched - * into the C locale for it. Switch back so an LC_ALL query will yield - * the correct results; all other categories don't require special - * handling */ + /* A NULL locale means only query what the current one is. We have the + * LC_NUMERIC name saved, because we are normally switched into the C + * locale for it. For an LC_ALL query, switch back to get the correct + * results. All other categories don't require special handling */ if (locale == NULL) { if (category == LC_NUMERIC) { return savepv(PL_numeric_name); @@ -967,7 +1149,8 @@ Perl_setlocale(int category, const char * locale) # ifdef LC_ALL - else if (category == LC_ALL) { + else if (category == LC_ALL && ! PL_numeric_underlying) { + SET_NUMERIC_UNDERLYING(); } @@ -977,7 +1160,8 @@ Perl_setlocale(int category, const char * locale) #endif - retval = do_setlocale_r(category, locale); + /* Save retval since subsequent setlocale() calls may overwrite it. */ + retval = savepv(do_setlocale_r(category, locale)); DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, @@ -992,9 +1176,6 @@ Perl_setlocale(int category, const char * locale) return NULL; } - /* Save retval since subsequent setlocale() calls may overwrite it. */ - retval = savepv(retval); - /* If locale == NULL, we are just querying the state, but may have switched * to NUMERIC_UNDERLYING. Switch back before returning. */ if (locale == NULL) { @@ -1142,11 +1323,15 @@ Unimplemented, so returns C<"">. =item C +=item C + =item C -Only the values for English are returned. Earlier POSIX standards also -specified C and C, but these have been removed from POSIX 2008, -and aren't supported by C. +=item C + +Only the values for English are returned. C and C have been +removed from POSIX 2008, and are retained for backwards compatibility. Your +platform's C may not support them. =item C @@ -1260,7 +1445,9 @@ S_my_nl_langinfo(const int item, bool toggle) LOCALE_LOCK; if (toggle) { - if (item == PERL_RADIXCHAR || item == PERL_THOUSEP) { + if ( ! PL_numeric_underlying + && (item == PERL_RADIXCHAR || item == PERL_THOUSEP)) + { do_setlocale_c(LC_NUMERIC, PL_numeric_name); } else { @@ -1276,8 +1463,6 @@ S_my_nl_langinfo(const int item, bool toggle) LOCALE_UNLOCK; - return PL_langinfo_buf; - # else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */ bool do_free = FALSE; @@ -1288,9 +1473,7 @@ S_my_nl_langinfo(const int item, bool toggle) do_free = TRUE; } - if ( toggle - && (item == PERL_RADIXCHAR || item == PERL_THOUSEP)) - { + if (toggle) { cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur); do_free = TRUE; } @@ -1301,359 +1484,377 @@ S_my_nl_langinfo(const int item, bool toggle) freelocale(cur); } +# endif + + if (strEQ(PL_langinfo_buf, "")) { + if (item == PERL_YESSTR) { + return "yes"; + } + if (item == PERL_NOSTR) { + return "no"; + } + } + return PL_langinfo_buf; -# endif #else /* Below, emulate nl_langinfo as best we can */ + + { + # ifdef HAS_LOCALECONV - const struct lconv* lc; + const struct lconv* lc; # endif # ifdef HAS_STRFTIME - struct tm tm; - bool return_format = FALSE; /* Return the %format, not the value */ - const char * format; + struct tm tm; + bool return_format = FALSE; /* Return the %format, not the value */ + const char * format; # endif - /* We copy the results to a per-thread buffer, even if not multi-threaded. - * This is in part to simplify this code, and partly because we need a - * buffer anyway for strftime(), and partly because a call of localeconv() - * could otherwise wipe out the buffer, and the programmer would not be - * expecting this, as this is a nl_langinfo() substitute after all, so s/he - * might be thinking their localeconv() is safe until another localeconv() - * call. */ + /* We copy the results to a per-thread buffer, even if not + * multi-threaded. This is in part to simplify this code, and partly + * because we need a buffer anyway for strftime(), and partly because a + * call of localeconv() could otherwise wipe out the buffer, and the + * programmer would not be expecting this, as this is a nl_langinfo() + * substitute after all, so s/he might be thinking their localeconv() + * is safe until another localeconv() call. */ - switch (item) { - Size_t len; - const char * retval; + switch (item) { + Size_t len; + const char * retval; - /* These 2 are unimplemented */ - case PERL_CODESET: - case PERL_ERA: /* For use with strftime() %E modifier */ + /* These 2 are unimplemented */ + case PERL_CODESET: + case PERL_ERA: /* For use with strftime() %E modifier */ - default: - return ""; + default: + return ""; - /* We use only an English set, since we don't know any more */ - case PERL_YESEXPR: return "^[+1yY]"; - case PERL_NOEXPR: return "^[-0nN]"; + /* We use only an English set, since we don't know any more */ + case PERL_YESEXPR: return "^[+1yY]"; + case PERL_YESSTR: return "yes"; + case PERL_NOEXPR: return "^[-0nN]"; + case PERL_NOSTR: return "no"; # ifdef HAS_LOCALECONV - case PERL_CRNCYSTR: + case PERL_CRNCYSTR: - LOCALE_LOCK; + LOCALE_LOCK; - lc = localeconv(); - if (! lc || ! lc->currency_symbol || strEQ("", lc->currency_symbol)) - { - LOCALE_UNLOCK; - return ""; - } + /* We don't bother with localeconv_l() because any system that + * has it is likely to also have nl_langinfo() */ - /* Leave the first spot empty to be filled in below */ - save_to_buffer(lc->currency_symbol, &PL_langinfo_buf, - &PL_langinfo_bufsize, 1); - if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, "")) - { /* khw couldn't figure out how the localedef specifications - would show that the $ should replace the radix; this is - just a guess as to how it might work.*/ - *PL_langinfo_buf = '.'; - } - else if (lc->p_cs_precedes) { - *PL_langinfo_buf = '-'; - } - else { - *PL_langinfo_buf = '+'; - } + lc = localeconv(); + if ( ! lc + || ! lc->currency_symbol + || strEQ("", lc->currency_symbol)) + { + LOCALE_UNLOCK; + return ""; + } - LOCALE_UNLOCK; - break; + /* Leave the first spot empty to be filled in below */ + save_to_buffer(lc->currency_symbol, &PL_langinfo_buf, + &PL_langinfo_bufsize, 1); + if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, "")) + { /* khw couldn't figure out how the localedef specifications + would show that the $ should replace the radix; this is + just a guess as to how it might work.*/ + *PL_langinfo_buf = '.'; + } + else if (lc->p_cs_precedes) { + *PL_langinfo_buf = '-'; + } + else { + *PL_langinfo_buf = '+'; + } - case PERL_RADIXCHAR: - case PERL_THOUSEP: + LOCALE_UNLOCK; + break; - LOCALE_LOCK; + case PERL_RADIXCHAR: + case PERL_THOUSEP: - if (toggle) { - do_setlocale_c(LC_NUMERIC, PL_numeric_name); - } + LOCALE_LOCK; - lc = localeconv(); - if (! lc) { - retval = ""; - } - else switch (item) { - case PERL_RADIXCHAR: - if (! lc->decimal_point) { - retval = ""; + if (toggle) { + if (! PL_numeric_underlying) { + do_setlocale_c(LC_NUMERIC, PL_numeric_name); } else { - retval = lc->decimal_point; + toggle = FALSE; } - break; + } - case PERL_THOUSEP: - if (! lc->thousands_sep || strEQ("", lc->thousands_sep)) { + lc = localeconv(); + if (! lc) { + retval = ""; + } + else { + retval = (item == PERL_RADIXCHAR) + ? lc->decimal_point + : lc->thousands_sep; + if (! retval) { retval = ""; } - else { - retval = lc->thousands_sep; - } - break; - - default: - LOCALE_UNLOCK; - Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem", - __FILE__, __LINE__, item); - } + } - save_to_buffer(retval, &PL_langinfo_buf, &PL_langinfo_bufsize, 0); + save_to_buffer(retval, &PL_langinfo_buf, + &PL_langinfo_bufsize, 0); - if (toggle) { - do_setlocale_c(LC_NUMERIC, "C"); - } + if (toggle) { + do_setlocale_c(LC_NUMERIC, "C"); + } - LOCALE_UNLOCK; + LOCALE_UNLOCK; - break; + break; # endif # ifdef HAS_STRFTIME - /* These are defined by C89, so we assume that strftime supports them, - * and so are returned unconditionally; they may not be what the locale - * actually says, but should give good enough results for someone using - * them as formats (as opposed to trying to parse them to figure out - * what the locale says). The other format items are actually tested to - * verify they work on the platform */ - case PERL_D_FMT: return "%x"; - case PERL_T_FMT: return "%X"; - case PERL_D_T_FMT: return "%c"; - - /* These formats are only available in later strfmtime's */ - case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT: - case PERL_T_FMT_AMPM: - - /* The rest can be gotten from most versions of strftime(). */ - case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3: - case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6: - case PERL_ABDAY_7: - case PERL_ALT_DIGITS: - case PERL_AM_STR: case PERL_PM_STR: - case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3: - case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6: - case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9: - case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12: - case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4: - case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7: - case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4: - case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8: - case PERL_MON_9: case PERL_MON_10: case PERL_MON_11: case PERL_MON_12: - - LOCALE_LOCK; - - init_tm(&tm); /* Precaution against core dumps */ - tm.tm_sec = 30; - tm.tm_min = 30; - tm.tm_hour = 6; - tm.tm_year = 2017 - 1900; - tm.tm_wday = 0; - tm.tm_mon = 0; - switch (item) { - default: - LOCALE_UNLOCK; - Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem", - __FILE__, __LINE__, item); - NOT_REACHED; /* NOTREACHED */ - - case PERL_PM_STR: tm.tm_hour = 18; - case PERL_AM_STR: - format = "%p"; - break; - - case PERL_ABDAY_7: tm.tm_wday++; - case PERL_ABDAY_6: tm.tm_wday++; - case PERL_ABDAY_5: tm.tm_wday++; - case PERL_ABDAY_4: tm.tm_wday++; - case PERL_ABDAY_3: tm.tm_wday++; - case PERL_ABDAY_2: tm.tm_wday++; - case PERL_ABDAY_1: - format = "%a"; - break; - - case PERL_DAY_7: tm.tm_wday++; - case PERL_DAY_6: tm.tm_wday++; - case PERL_DAY_5: tm.tm_wday++; - case PERL_DAY_4: tm.tm_wday++; - case PERL_DAY_3: tm.tm_wday++; - case PERL_DAY_2: tm.tm_wday++; - case PERL_DAY_1: - format = "%A"; - break; - - case PERL_ABMON_12: tm.tm_mon++; - case PERL_ABMON_11: tm.tm_mon++; - case PERL_ABMON_10: tm.tm_mon++; - case PERL_ABMON_9: tm.tm_mon++; - case PERL_ABMON_8: tm.tm_mon++; - case PERL_ABMON_7: tm.tm_mon++; - case PERL_ABMON_6: tm.tm_mon++; - case PERL_ABMON_5: tm.tm_mon++; - case PERL_ABMON_4: tm.tm_mon++; - case PERL_ABMON_3: tm.tm_mon++; - case PERL_ABMON_2: tm.tm_mon++; - case PERL_ABMON_1: - format = "%b"; - break; - - case PERL_MON_12: tm.tm_mon++; - case PERL_MON_11: tm.tm_mon++; - case PERL_MON_10: tm.tm_mon++; - case PERL_MON_9: tm.tm_mon++; - case PERL_MON_8: tm.tm_mon++; - case PERL_MON_7: tm.tm_mon++; - case PERL_MON_6: tm.tm_mon++; - case PERL_MON_5: tm.tm_mon++; - case PERL_MON_4: tm.tm_mon++; - case PERL_MON_3: tm.tm_mon++; - case PERL_MON_2: tm.tm_mon++; - case PERL_MON_1: - format = "%B"; - break; + /* These are defined by C89, so we assume that strftime supports + * them, and so are returned unconditionally; they may not be what + * the locale actually says, but should give good enough results + * for someone using them as formats (as opposed to trying to parse + * them to figure out what the locale says). The other format + * items are actually tested to verify they work on the platform */ + case PERL_D_FMT: return "%x"; + case PERL_T_FMT: return "%X"; + case PERL_D_T_FMT: return "%c"; + + /* These formats are only available in later strfmtime's */ + case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT: + case PERL_T_FMT_AMPM: + + /* The rest can be gotten from most versions of strftime(). */ + case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3: + case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6: + case PERL_ABDAY_7: + case PERL_ALT_DIGITS: + case PERL_AM_STR: case PERL_PM_STR: + case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3: + case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6: + case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9: + case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12: + case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4: + case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7: + case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4: + case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8: + case PERL_MON_9: case PERL_MON_10: case PERL_MON_11: + case PERL_MON_12: + + LOCALE_LOCK; + + init_tm(&tm); /* Precaution against core dumps */ + tm.tm_sec = 30; + tm.tm_min = 30; + tm.tm_hour = 6; + tm.tm_year = 2017 - 1900; + tm.tm_wday = 0; + tm.tm_mon = 0; + switch (item) { + default: + LOCALE_UNLOCK; + Perl_croak(aTHX_ + "panic: %s: %d: switch case: %d problem", + __FILE__, __LINE__, item); + NOT_REACHED; /* NOTREACHED */ + + case PERL_PM_STR: tm.tm_hour = 18; + case PERL_AM_STR: + format = "%p"; + break; + + case PERL_ABDAY_7: tm.tm_wday++; + case PERL_ABDAY_6: tm.tm_wday++; + case PERL_ABDAY_5: tm.tm_wday++; + case PERL_ABDAY_4: tm.tm_wday++; + case PERL_ABDAY_3: tm.tm_wday++; + case PERL_ABDAY_2: tm.tm_wday++; + case PERL_ABDAY_1: + format = "%a"; + break; + + case PERL_DAY_7: tm.tm_wday++; + case PERL_DAY_6: tm.tm_wday++; + case PERL_DAY_5: tm.tm_wday++; + case PERL_DAY_4: tm.tm_wday++; + case PERL_DAY_3: tm.tm_wday++; + case PERL_DAY_2: tm.tm_wday++; + case PERL_DAY_1: + format = "%A"; + break; + + case PERL_ABMON_12: tm.tm_mon++; + case PERL_ABMON_11: tm.tm_mon++; + case PERL_ABMON_10: tm.tm_mon++; + case PERL_ABMON_9: tm.tm_mon++; + case PERL_ABMON_8: tm.tm_mon++; + case PERL_ABMON_7: tm.tm_mon++; + case PERL_ABMON_6: tm.tm_mon++; + case PERL_ABMON_5: tm.tm_mon++; + case PERL_ABMON_4: tm.tm_mon++; + case PERL_ABMON_3: tm.tm_mon++; + case PERL_ABMON_2: tm.tm_mon++; + case PERL_ABMON_1: + format = "%b"; + break; + + case PERL_MON_12: tm.tm_mon++; + case PERL_MON_11: tm.tm_mon++; + case PERL_MON_10: tm.tm_mon++; + case PERL_MON_9: tm.tm_mon++; + case PERL_MON_8: tm.tm_mon++; + case PERL_MON_7: tm.tm_mon++; + case PERL_MON_6: tm.tm_mon++; + case PERL_MON_5: tm.tm_mon++; + case PERL_MON_4: tm.tm_mon++; + case PERL_MON_3: tm.tm_mon++; + case PERL_MON_2: tm.tm_mon++; + case PERL_MON_1: + format = "%B"; + break; + + case PERL_T_FMT_AMPM: + format = "%r"; + return_format = TRUE; + break; + + case PERL_ERA_D_FMT: + format = "%Ex"; + return_format = TRUE; + break; + + case PERL_ERA_T_FMT: + format = "%EX"; + return_format = TRUE; + break; + + case PERL_ERA_D_T_FMT: + format = "%Ec"; + return_format = TRUE; + break; + + case PERL_ALT_DIGITS: + tm.tm_wday = 0; + format = "%Ow"; /* Find the alternate digit for 0 */ + break; + } - case PERL_T_FMT_AMPM: - format = "%r"; - return_format = TRUE; - break; + /* We can't use my_strftime() because it doesn't look at + * tm_wday */ + while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize, + format, &tm)) + { + /* A zero return means one of: + * a) there wasn't enough space in PL_langinfo_buf + * b) the format, like a plain %p, returns empty + * c) it was an illegal format, though some + * implementations of strftime will just return the + * illegal format as a plain character sequence. + * + * To quickly test for case 'b)', try again but precede + * the format with a plain character. If that result is + * still empty, the problem is either 'a)' or 'c)' */ + + Size_t format_size = strlen(format) + 1; + Size_t mod_size = format_size + 1; + char * mod_format; + char * temp_result; + + Newx(mod_format, mod_size, char); + Newx(temp_result, PL_langinfo_bufsize, char); + *mod_format = ' '; + my_strlcpy(mod_format + 1, format, mod_size); + len = strftime(temp_result, + PL_langinfo_bufsize, + mod_format, &tm); + Safefree(mod_format); + Safefree(temp_result); + + /* If 'len' is non-zero, it means that we had a case like + * %p which means the current locale doesn't use a.m. or + * p.m., and that is valid */ + if (len == 0) { + + /* Here, still didn't work. If we get well beyond a + * reasonable size, bail out to prevent an infinite + * loop. */ + + if (PL_langinfo_bufsize > 100 * format_size) { + *PL_langinfo_buf = '\0'; + } + else { + /* Double the buffer size to retry; Add 1 in case + * original was 0, so we aren't stuck at 0. */ + PL_langinfo_bufsize *= 2; + PL_langinfo_bufsize++; + Renew(PL_langinfo_buf, PL_langinfo_bufsize, char); + continue; + } + } - case PERL_ERA_D_FMT: - format = "%Ex"; - return_format = TRUE; break; + } - case PERL_ERA_T_FMT: - format = "%EX"; - return_format = TRUE; - break; + /* Here, we got a result. + * + * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the + * alternate format for wday 0. If the value is the same as + * the normal 0, there isn't an alternate, so clear the buffer. + * */ + if ( item == PERL_ALT_DIGITS + && strEQ(PL_langinfo_buf, "0")) + { + *PL_langinfo_buf = '\0'; + } - case PERL_ERA_D_T_FMT: - format = "%Ec"; - return_format = TRUE; - break; + /* ALT_DIGITS is problematic. Experiments on it showed that + * strftime() did not always work properly when going from + * alt-9 to alt-10. Only a few locales have this item defined, + * and in all of them on Linux that khw was able to find, + * nl_langinfo() merely returned the alt-0 character, possibly + * doubled. Most Unicode digits are in blocks of 10 + * consecutive code points, so that is sufficient information + * for those scripts, as we can infer alt-1, alt-2, .... But + * for a Japanese locale, a CJK ideographic 0 is returned, and + * the CJK digits are not in code point order, so you can't + * really infer anything. The localedef for this locale did + * specify the succeeding digits, so that strftime() works + * properly on them, without needing to infer anything. But + * the nl_langinfo() return did not give sufficient information + * for the caller to understand what's going on. So until + * there is evidence that it should work differently, this + * returns the alt-0 string for ALT_DIGITS. + * + * wday was chosen because its range is all a single digit. + * Things like tm_sec have two digits as the minimum: '00' */ - case PERL_ALT_DIGITS: - tm.tm_wday = 0; - format = "%Ow"; /* Find the alternate digit for 0 */ - break; - } + LOCALE_UNLOCK; - /* We can't use my_strftime() because it doesn't look at tm_wday */ - while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize, - format, &tm)) - { - /* A zero return means one of: - * a) there wasn't enough space in PL_langinfo_buf - * b) the format, like a plain %p, returns empty - * c) it was an illegal format, though some implementations of - * strftime will just return the illegal format as a plain - * character sequence. - * - * To quickly test for case 'b)', try again but precede the - * format with a plain character. If that result is still - * empty, the problem is either 'a)' or 'c)' */ - - Size_t format_size = strlen(format) + 1; - Size_t mod_size = format_size + 1; - char * mod_format; - char * temp_result; - - Newx(mod_format, mod_size, char); - Newx(temp_result, PL_langinfo_bufsize, char); - *mod_format = '\a'; - my_strlcpy(mod_format + 1, format, mod_size); - len = strftime(temp_result, - PL_langinfo_bufsize, - mod_format, &tm); - Safefree(mod_format); - Safefree(temp_result); - - /* If 'len' is non-zero, it means that we had a case like %p - * which means the current locale doesn't use a.m. or p.m., and - * that is valid */ - if (len == 0) { - - /* Here, still didn't work. If we get well beyond a - * reasonable size, bail out to prevent an infinite loop. */ - - if (PL_langinfo_bufsize > 100 * format_size) { + /* If to return the format, not the value, overwrite the buffer + * with it. But some strftime()s will keep the original format + * if illegal, so change those to "" */ + if (return_format) { + if (strEQ(PL_langinfo_buf, format)) { *PL_langinfo_buf = '\0'; } - else { /* Double the buffer size to retry; Add 1 in case - original was 0, so we aren't stuck at 0. */ - PL_langinfo_bufsize *= 2; - PL_langinfo_bufsize++; - Renew(PL_langinfo_buf, PL_langinfo_bufsize, char); - continue; + else { + save_to_buffer(format, &PL_langinfo_buf, + &PL_langinfo_bufsize, 0); } } - break; - } - - /* Here, we got a result. - * - * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the - * alternate format for wday 0. If the value is the same as the - * normal 0, there isn't an alternate, so clear the buffer. */ - if ( item == PERL_ALT_DIGITS - && strEQ(PL_langinfo_buf, "0")) - { - *PL_langinfo_buf = '\0'; - } - - /* ALT_DIGITS is problematic. Experiments on it showed that - * strftime() did not always work properly when going from alt-9 to - * alt-10. Only a few locales have this item defined, and in all - * of them on Linux that khw was able to find, nl_langinfo() merely - * returned the alt-0 character, possibly doubled. Most Unicode - * digits are in blocks of 10 consecutive code points, so that is - * sufficient information for those scripts, as we can infer alt-1, - * alt-2, .... But for a Japanese locale, a CJK ideographic 0 is - * returned, and the CJK digits are not in code point order, so you - * can't really infer anything. The localedef for this locale did - * specify the succeeding digits, so that strftime() works properly - * on them, without needing to infer anything. But the - * nl_langinfo() return did not give sufficient information for the - * caller to understand what's going on. So until there is - * evidence that it should work differently, this returns the alt-0 - * string for ALT_DIGITS. - * - * wday was chosen because its range is all a single digit. Things - * like tm_sec have two digits as the minimum: '00' */ - - LOCALE_UNLOCK; - - /* If to return the format, not the value, overwrite the buffer - * with it. But some strftime()s will keep the original format if - * illegal, so change those to "" */ - if (return_format) { - if (strEQ(PL_langinfo_buf, format)) { - *PL_langinfo_buf = '\0'; - } - else { - save_to_buffer(format, &PL_langinfo_buf, - &PL_langinfo_bufsize, 0); - } - } - - break; + break; # endif + } } return PL_langinfo_buf; @@ -1723,21 +1924,6 @@ Perl_init_i18nl10n(pTHX_ int printwarn) PERL_UNUSED_ARG(printwarn); #else /* USE_LOCALE */ -# ifdef USE_LOCALE_CTYPE - - char *curctype = NULL; - -# endif /* USE_LOCALE_CTYPE */ -# ifdef USE_LOCALE_COLLATE - - char *curcoll = NULL; - -# endif /* USE_LOCALE_COLLATE */ -# ifdef USE_LOCALE_NUMERIC - - char *curnum = NULL; - -# endif /* USE_LOCALE_NUMERIC */ # ifdef __GLIBC__ const char * const language = savepv(PerlEnv_getenv("LANGUAGE")); @@ -1759,15 +1945,19 @@ Perl_init_i18nl10n(pTHX_ int printwarn) const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG"); const bool locwarn = (printwarn > 1 - || (printwarn - && (! bad_lang_use_once + || ( printwarn + && ( ! bad_lang_use_once || ( - /* disallow with "" or "0" */ - *bad_lang_use_once - && strNE("0", bad_lang_use_once))))); - bool done = FALSE; - char * sl_result; /* return from setlocale() */ - char * locale_param; + /* disallow with "" or "0" */ + *bad_lang_use_once + && strNE("0", bad_lang_use_once))))); + + /* setlocale() return vals; not copied so must be looked at immediately */ + const char * sl_result[NOMINAL_LC_ALL_INDEX + 1]; + + /* current locale for given category; should have been copied so aren't + * volatile */ + const char * curlocales[NOMINAL_LC_ALL_INDEX + 1]; # ifdef WIN32 @@ -1780,7 +1970,10 @@ Perl_init_i18nl10n(pTHX_ int printwarn) const char *system_default_locale = NULL; # endif -# ifdef DEBUGGING + +# ifndef DEBUGGING +# define DEBUG_LOCALE_INIT(a,b,c) +# else DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT"))); @@ -1796,102 +1989,101 @@ Perl_init_i18nl10n(pTHX_ int printwarn) } \ } STMT_END -# else -# define DEBUG_LOCALE_INIT(a,b,c) -# endif - -# ifndef LOCALE_ENVIRON_REQUIRED +/* Make sure the parallel arrays are properly set up */ +# ifdef USE_LOCALE_NUMERIC + assert(categories[LC_NUMERIC_INDEX] == LC_NUMERIC); + assert(strEQ(category_names[LC_NUMERIC_INDEX], "LC_NUMERIC")); +# endif +# ifdef USE_LOCALE_CTYPE + assert(categories[LC_CTYPE_INDEX] == LC_CTYPE); + assert(strEQ(category_names[LC_CTYPE_INDEX], "LC_CTYPE")); +# endif +# ifdef USE_LOCALE_COLLATE + assert(categories[LC_COLLATE_INDEX] == LC_COLLATE); + assert(strEQ(category_names[LC_COLLATE_INDEX], "LC_COLLATE")); +# endif +# ifdef USE_LOCALE_TIME + assert(categories[LC_TIME_INDEX] == LC_TIME); + assert(strEQ(category_names[LC_TIME_INDEX], "LC_TIME")); +# endif +# ifdef USE_LOCALE_MESSAGES + assert(categories[LC_MESSAGES_INDEX] == LC_MESSAGES); + assert(strEQ(category_names[LC_MESSAGES_INDEX], "LC_MESSAGES")); +# endif +# ifdef USE_LOCALE_MONETARY + assert(categories[LC_MONETARY_INDEX] == LC_MONETARY); + assert(strEQ(category_names[LC_MONETARY_INDEX], "LC_MONETARY")); +# endif +# ifdef USE_LOCALE_ADDRESS + assert(categories[LC_ADDRESS_INDEX] == LC_ADDRESS); + assert(strEQ(category_names[LC_ADDRESS_INDEX], "LC_ADDRESS")); +# endif +# ifdef USE_LOCALE_IDENTIFICATION + assert(categories[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION); + assert(strEQ(category_names[LC_IDENTIFICATION_INDEX], "LC_IDENTIFICATION")); +# endif +# ifdef USE_LOCALE_MEASUREMENT + assert(categories[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT); + assert(strEQ(category_names[LC_MEASUREMENT_INDEX], "LC_MEASUREMENT")); +# endif +# ifdef USE_LOCALE_PAPER + assert(categories[LC_PAPER_INDEX] == LC_PAPER); + assert(strEQ(category_names[LC_PAPER_INDEX], "LC_PAPER")); +# endif +# ifdef USE_LOCALE_TELEPHONE + assert(categories[LC_TELEPHONE_INDEX] == LC_TELEPHONE); + assert(strEQ(category_names[LC_TELEPHONE_INDEX], "LC_TELEPHONE")); +# endif +# ifdef LC_ALL + assert(categories[LC_ALL_INDEX] == LC_ALL); + assert(strEQ(category_names[LC_ALL_INDEX], "LC_ALL")); + assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX); +# endif +# endif /* DEBUGGING */ - PERL_UNUSED_VAR(done); - PERL_UNUSED_VAR(locale_param); + /* Initialize the cache of the program's UTF-8ness for the always known + * locales C and POSIX */ + my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness, + sizeof(PL_locale_utf8ness)); -# else +# ifdef LOCALE_ENVIRON_REQUIRED /* * Ultrix setlocale(..., "") fails if there are no environment * variables from which to get a locale name. */ -# ifdef LC_ALL - - if (lang) { - sl_result = do_setlocale_c(LC_ALL, setlocale_init); - DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result); - if (sl_result) - done = TRUE; - else - setlocale_failure = TRUE; - } - if (! setlocale_failure) { - -# ifdef USE_LOCALE_CTYPE - - locale_param = (! done && (lang || PerlEnv_getenv("LC_CTYPE"))) - ? setlocale_init - : NULL; - curctype = do_setlocale_c(LC_CTYPE, locale_param); - DEBUG_LOCALE_INIT(LC_CTYPE, locale_param, sl_result); - if (! curctype) - setlocale_failure = TRUE; - else - curctype = savepv(curctype); - -# endif /* USE_LOCALE_CTYPE */ -# ifdef USE_LOCALE_COLLATE - - locale_param = (! done && (lang || PerlEnv_getenv("LC_COLLATE"))) - ? setlocale_init - : NULL; - curcoll = do_setlocale_c(LC_COLLATE, locale_param); - DEBUG_LOCALE_INIT(LC_COLLATE, locale_param, sl_result); - if (! curcoll) - setlocale_failure = TRUE; - else - curcoll = savepv(curcoll); - -# endif /* USE_LOCALE_COLLATE */ -# ifdef USE_LOCALE_NUMERIC - - locale_param = (! done && (lang || PerlEnv_getenv("LC_NUMERIC"))) - ? setlocale_init - : NULL; - curnum = do_setlocale_c(LC_NUMERIC, locale_param); - DEBUG_LOCALE_INIT(LC_NUMERIC, locale_param, sl_result); - if (! curnum) - setlocale_failure = TRUE; - else - curnum = savepv(curnum); - -# endif /* USE_LOCALE_NUMERIC */ -# ifdef USE_LOCALE_MESSAGES - - locale_param = (! done && (lang || PerlEnv_getenv("LC_MESSAGES"))) - ? setlocale_init - : NULL; - sl_result = do_setlocale_c(LC_MESSAGES, locale_param); - DEBUG_LOCALE_INIT(LC_MESSAGES, locale_param, sl_result); - if (! sl_result) { - setlocale_failure = TRUE; - } - -# endif /* USE_LOCALE_MESSAGES */ -# ifdef USE_LOCALE_MONETARY +# ifndef LC_ALL +# error Ultrix without LC_ALL not implemented +# else - locale_param = (! done && (lang || PerlEnv_getenv("LC_MONETARY"))) - ? setlocale_init - : NULL; - sl_result = do_setlocale_c(LC_MONETARY, locale_param); - DEBUG_LOCALE_INIT(LC_MONETARY, locale_param, sl_result); - if (! sl_result) { - setlocale_failure = TRUE; + { + bool done = FALSE; + if (lang) { + sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, setlocale_init); + DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result[LC_ALL_INDEX]); + if (sl_result[LC_ALL_INDEX]) + done = TRUE; + else + setlocale_failure = TRUE; + } + if (! setlocale_failure) { + const char * locale_param; + for (i = 0; i < LC_ALL_INDEX; i++) { + locale_param = (! done && (lang || PerlEnv_getenv(category_names[i]))) + ? setlocale_init + : NULL; + sl_result[i] = do_setlocale_r(categories[i], locale_param); + if (! sl_result[i]) { + setlocale_failure = TRUE; + } + DEBUG_LOCALE_INIT(categories[i], locale_param, sl_result[i]); + } } - -# endif /* USE_LOCALE_MONETARY */ - } # endif /* LC_ALL */ -# endif /* !LOCALE_ENVIRON_REQUIRED */ +# endif /* LOCALE_ENVIRON_REQUIRED */ /* We try each locale in the list until we get one that works, or exhaust * the list. Normally the loop is executed just once. But if setting the @@ -1911,7 +2103,7 @@ Perl_init_i18nl10n(pTHX_ int printwarn) setlocale_failure = FALSE; # ifdef SYSTEM_DEFAULT_LOCALE -# ifdef WIN32 +# ifdef WIN32 /* Note that assumes Win32 has LC_ALL */ /* On Windows machines, an entry of "" after the 0th means to use * the system default locale, which we now proceed to get. */ @@ -1936,15 +2128,18 @@ Perl_init_i18nl10n(pTHX_ int printwarn) trial_locale = system_default_locale; } -# endif /* WIN32 */ +# else +# error SYSTEM_DEFAULT_LOCALE only implemented for Win32 +# endif # endif /* SYSTEM_DEFAULT_LOCALE */ - } + + } /* For i > 0 */ # ifdef LC_ALL - sl_result = do_setlocale_c(LC_ALL, trial_locale); - DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result); - if (! sl_result) { + sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, trial_locale); + DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result[LC_ALL_INDEX]); + if (! sl_result[LC_ALL_INDEX]) { setlocale_failure = TRUE; } else { @@ -1960,60 +2155,19 @@ Perl_init_i18nl10n(pTHX_ int printwarn) # endif /* LC_ALL */ - if (!setlocale_failure) { - -# ifdef USE_LOCALE_CTYPE - - Safefree(curctype); - curctype = do_setlocale_c(LC_CTYPE, trial_locale); - DEBUG_LOCALE_INIT(LC_CTYPE, trial_locale, curctype); - if (! curctype) - setlocale_failure = TRUE; - else - curctype = savepv(curctype); - -# endif /* USE_LOCALE_CTYPE */ -# ifdef USE_LOCALE_COLLATE - - Safefree(curcoll); - curcoll = do_setlocale_c(LC_COLLATE, trial_locale); - DEBUG_LOCALE_INIT(LC_COLLATE, trial_locale, curcoll); - if (! curcoll) - setlocale_failure = TRUE; - else - curcoll = savepv(curcoll); - -# endif /* USE_LOCALE_COLLATE */ -# ifdef USE_LOCALE_NUMERIC - - Safefree(curnum); - curnum = do_setlocale_c(LC_NUMERIC, trial_locale); - DEBUG_LOCALE_INIT(LC_NUMERIC, trial_locale, curnum); - if (! curnum) - setlocale_failure = TRUE; - else - curnum = savepv(curnum); - -# endif /* USE_LOCALE_NUMERIC */ -# ifdef USE_LOCALE_MESSAGES - - sl_result = do_setlocale_c(LC_MESSAGES, trial_locale); - DEBUG_LOCALE_INIT(LC_MESSAGES, trial_locale, sl_result); - if (! (sl_result)) - setlocale_failure = TRUE; - -# endif /* USE_LOCALE_MESSAGES */ -# ifdef USE_LOCALE_MONETARY - - sl_result = do_setlocale_c(LC_MONETARY, trial_locale); - DEBUG_LOCALE_INIT(LC_MONETARY, trial_locale, sl_result); - if (! (sl_result)) - setlocale_failure = TRUE; - -# endif /* USE_LOCALE_MONETARY */ + if (! setlocale_failure) { + unsigned int j; + for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) { + curlocales[j] + = savepv(do_setlocale_r(categories[j], trial_locale)); + if (! curlocales[j]) { + setlocale_failure = TRUE; + } + DEBUG_LOCALE_INIT(categories[j], trial_locale, curlocales[j]); + } - if (! setlocale_failure) { /* Success */ - break; + if (! setlocale_failure) { /* All succeeded */ + break; /* Exit trial_locales loop */ } } @@ -2035,25 +2189,14 @@ Perl_init_i18nl10n(pTHX_ int printwarn) PerlIO_printf(Perl_error_log, "perl: warning: Setting locale failed for the categories:\n\t"); -# ifdef USE_LOCALE_CTYPE - - if (! curctype) - PerlIO_printf(Perl_error_log, "LC_CTYPE "); - -# endif /* USE_LOCALE_CTYPE */ -# ifdef USE_LOCALE_COLLATE - if (! curcoll) - PerlIO_printf(Perl_error_log, "LC_COLLATE "); - -# endif /* USE_LOCALE_COLLATE */ -# ifdef USE_LOCALE_NUMERIC - - if (! curnum) - PerlIO_printf(Perl_error_log, "LC_NUMERIC "); - -# endif /* USE_LOCALE_NUMERIC */ - - PerlIO_printf(Perl_error_log, "and possibly others\n"); + for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) { + if (! curlocales[j]) { + PerlIO_printf(Perl_error_log, category_names[j]); + } + else { + Safefree(curlocales[j]); + } + } # endif /* LC_ALL */ @@ -2189,6 +2332,7 @@ Perl_init_i18nl10n(pTHX_ int printwarn) msg = "Falling back to"; } else { /* fallback failed */ + unsigned int j; /* We dropped off the end of the loop, so have to decrement i to * get back to the value the last time through */ @@ -2199,28 +2343,11 @@ Perl_init_i18nl10n(pTHX_ int printwarn) /* To continue, we should use whatever values we've got */ -# ifdef USE_LOCALE_CTYPE - - Safefree(curctype); - curctype = savepv(do_setlocale_c(LC_CTYPE, NULL)); - DEBUG_LOCALE_INIT(LC_CTYPE, NULL, curctype); - -# endif /* USE_LOCALE_CTYPE */ -# ifdef USE_LOCALE_COLLATE - - Safefree(curcoll); - curcoll = savepv(do_setlocale_c(LC_COLLATE, NULL)); - DEBUG_LOCALE_INIT(LC_COLLATE, NULL, curcoll); - -# endif /* USE_LOCALE_COLLATE */ -# ifdef USE_LOCALE_NUMERIC - - Safefree(curnum); - curnum = savepv(do_setlocale_c(LC_NUMERIC, NULL)); - DEBUG_LOCALE_INIT(LC_NUMERIC, NULL, curnum); - -# endif /* USE_LOCALE_NUMERIC */ - + for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) { + Safefree(curlocales[j]); + curlocales[j] = savepv(do_setlocale_r(categories[j], NULL)); + DEBUG_LOCALE_INIT(categories[j], NULL, curlocales[j]); + } } if (locwarn) { @@ -2257,29 +2384,54 @@ Perl_init_i18nl10n(pTHX_ int printwarn) } } /* End of tried to fallback */ + /* Done with finding the locales; update our records */ + # ifdef USE_LOCALE_CTYPE - new_ctype(curctype); + new_ctype(curlocales[LC_CTYPE_INDEX]); -# endif /* USE_LOCALE_CTYPE */ +# endif # ifdef USE_LOCALE_COLLATE - new_collate(curcoll); + new_collate(curlocales[LC_COLLATE_INDEX]); -# endif /* USE_LOCALE_COLLATE */ +# endif # ifdef USE_LOCALE_NUMERIC - new_numeric(curnum); + new_numeric(curlocales[LC_NUMERIC_INDEX]); + +# endif + + for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + +# if defined(USE_ITHREADS) + + /* This caches whether each category's locale is UTF-8 or not. This + * may involve changing the locale. It is ok to do this at + * initialization time before any threads have started, but not later. + * Caching means that if the program heeds our dictate not to change + * locales in threaded applications, this data will remain valid, and + * it may get queried without changing locales. If the environment is + * such that all categories have the same locale, this isn't needed, as + * the code will not change the locale; but this handles the uncommon + * case where the environment has disparate locales for the categories + * */ + (void) _is_cur_LC_category_utf8(categories[i]); + +# endif + + Safefree(curlocales[i]); + } -# endif /* USE_LOCALE_NUMERIC */ # if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE) /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE - * locale is UTF-8. If PL_utf8locale and PL_unicode (set by -C or by - * $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the - * PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open - * discipline. */ - PL_utf8locale = _is_cur_LC_category_utf8(LC_CTYPE); + * locale is UTF-8. The call to new_ctype() just above has already + * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If + * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE}) + * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on + * STDIN, STDOUT, STDERR, _and_ the default open discipline. */ + PL_utf8locale = PL_in_utf8_CTYPE_locale; /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO. This is an alternative to using the -C command line switch @@ -2292,22 +2444,6 @@ Perl_init_i18nl10n(pTHX_ int printwarn) } # endif -# ifdef USE_LOCALE_CTYPE - - Safefree(curctype); - -# endif /* USE_LOCALE_CTYPE */ -# ifdef USE_LOCALE_COLLATE - - Safefree(curcoll); - -# endif /* USE_LOCALE_COLLATE */ -# ifdef USE_LOCALE_NUMERIC - - Safefree(curnum); - -# endif /* USE_LOCALE_NUMERIC */ - # ifdef __GLIBC__ Safefree(language); @@ -2476,7 +2612,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, } /* End of determining the character that is to replace NULs */ /* If the replacement is variant under UTF-8, it must match the - * UTF8-ness as the original */ + * UTF8-ness of the original */ if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) { this_replacement_char[0] = UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement); @@ -2932,39 +3068,105 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * English, it comes down to if the locale's name ends in something like * "UTF-8". It errs on the side of not being a UTF-8 locale. */ - char *save_input_locale = NULL; + /* Name of current locale corresponding to the input category */ + const char *save_input_locale = NULL; + + bool is_utf8 = FALSE; /* The return value */ STRLEN final_pos; + /* The variables below are for the cache of previous lookups using this + * function. The cache is a C string, described at the definition for + * 'C_and_POSIX_utf8ness'. + * + * The first part of the cache is fixed, for the C and POSIX locales. The + * varying part starts just after them. */ + char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness); + + Size_t utf8ness_cache_size; /* Size of the varying portion */ + Size_t input_name_len; /* Length in bytes of save_input_locale */ + Size_t input_name_len_with_overhead; /* plus extra chars used to store + the name in the cache */ + char * delimited; /* The name plus the delimiters used to store + it in the cache */ + char * name_pos; /* position of 'delimited' in the cache, or 0 + if not there */ + + # ifdef LC_ALL assert(category != LC_ALL); # endif - /* First dispose of the trivial cases */ + /* Get the desired category's locale */ save_input_locale = do_setlocale_r(category, NULL); if (! save_input_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not find current locale for category %d\n", - category)); - return FALSE; /* XXX maybe should croak */ + Perl_croak(aTHX_ + "panic: %s: %d: Could not find current locale for %s\n", + __FILE__, __LINE__, category_name(category)); } + save_input_locale = stdize_locale(savepv(save_input_locale)); - if (isNAME_C_OR_POSIX(save_input_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Current locale for category %d is %s\n", - category, save_input_locale)); + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Current locale for %s is %s\n", + category_name(category), save_input_locale)); + + input_name_len = strlen(save_input_locale); + + /* In our cache, each name is accompanied by two delimiters and a single + * utf8ness digit */ + input_name_len_with_overhead = input_name_len + 3; + + /* Allocate and populate space for a copy of the name surrounded by the + * delimiters */ + Newx(delimited, input_name_len_with_overhead, char); + delimited[0] = UTF8NESS_SEP[0]; + Copy(save_input_locale, delimited + 1, input_name_len, char); + delimited[input_name_len+1] = UTF8NESS_PREFIX[0]; + delimited[input_name_len+2] = '\0'; + + /* And see if that is in the cache */ + name_pos = instr(PL_locale_utf8ness, delimited); + if (name_pos) { + is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0'; + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "UTF8ness for locale %s=%d, \n", + save_input_locale, is_utf8); + } + +# endif + + /* And, if not already in that position, move it to the beginning of + * the non-constant portion of the list, since it is the most recently + * used. (We don't have to worry about overflow, since just moving + * existing names around) */ + if (name_pos > utf8ness_cache) { + Move(utf8ness_cache, + utf8ness_cache + input_name_len_with_overhead, + name_pos - utf8ness_cache, char); + Copy(delimited, + utf8ness_cache, + input_name_len_with_overhead - 1, char); + utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0'; + } + + Safefree(delimited); Safefree(save_input_locale); - return FALSE; + return is_utf8; } + /* Here we don't have stored the utf8ness for the input locale. We have to + * calculate it */ + # if defined(USE_LOCALE_CTYPE) \ && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET))) { /* Next try nl_langinfo or MB_CUR_MAX if available */ char *save_ctype_locale = NULL; - bool is_utf8; if (category != LC_CTYPE) { /* These work only on LC_CTYPE */ @@ -3002,9 +3204,8 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * should give the correct results */ # if defined(HAS_NL_LANGINFO) && defined(CODESET) - /* The task is easiest if has this POSIX 2001 function */ - { + { /* The task is easiest if the platform has this POSIX 2001 function */ const char *codeset = my_nl_langinfo(PERL_CODESET, FALSE); /* FALSE => already in dest locale */ @@ -3018,14 +3219,18 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) Safefree(save_ctype_locale); } - is_utf8 = foldEQ(codeset, STR_WITH_LEN("UTF-8")) - || foldEQ(codeset, STR_WITH_LEN("UTF8")); + /* If the implementation of foldEQ() somehow were + * to change to not go byte-by-byte, this could + * read past end of string, as only one length is + * checked. But currently, a premature NUL will + * compare false, and it will stop there */ + is_utf8 = cBOOL( foldEQ(codeset, STR_WITH_LEN("UTF-8")) + || foldEQ(codeset, STR_WITH_LEN("UTF8"))); DEBUG_L(PerlIO_printf(Perl_debug_log, "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n", codeset, is_utf8)); - Safefree(save_input_locale); - return is_utf8; + goto finish_and_return; } } @@ -3039,7 +3244,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * Unicode code point. Since UTF-8 is the only non-single byte * encoding we handle, we just say any such encoding is UTF-8, and if * turns out to be wrong, other things will fail */ - is_utf8 = MB_CUR_MAX >= 4; + is_utf8 = (unsigned) MB_CUR_MAX >= STRLENs(MAX_UNICODE_UTF8); DEBUG_L(PerlIO_printf(Perl_debug_log, "\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n", @@ -3063,7 +3268,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) if ( len != STRLENs(REPLACEMENT_CHARACTER_UTF8) - || wc != (wchar_t) 0xFFFD) + || wc != (wchar_t) UNICODE_REPLACEMENT) { is_utf8 = FALSE; DEBUG_L(PerlIO_printf(Perl_debug_log, "\replacement=U+%x\n", @@ -3082,7 +3287,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) Safefree(save_ctype_locale); } - return is_utf8; + goto finish_and_return; # endif @@ -3107,7 +3312,6 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) { char *save_monetary_locale = NULL; bool only_ascii = FALSE; - bool is_utf8 = FALSE; struct lconv* lc; /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of @@ -3164,8 +3368,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * is non-ascii UTF-8. */ DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n", save_input_locale, is_utf8)); - Safefree(save_input_locale); - return is_utf8; + goto finish_and_return; } } cant_use_monetary: @@ -3250,8 +3453,8 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n", save_input_locale, is_utf8_string((U8 *) formatted_time, 0))); - Safefree(save_input_locale); - return is_utf8_string((U8 *) formatted_time, 0); + is_utf8 = is_utf8_string((U8 *) formatted_time, 0); + goto finish_and_return; } /* Falling off the end of the loop indicates all the names were just @@ -3282,7 +3485,6 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * are much more likely to have been translated. */ { int e; - bool is_utf8 = FALSE; bool non_ascii = FALSE; char *save_messages_locale = NULL; const char * errmsg = NULL; @@ -3346,8 +3548,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n", save_input_locale, is_utf8)); - Safefree(save_input_locale); - return is_utf8; + goto finish_and_return; } DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); @@ -3369,7 +3570,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) final_pos = strlen(save_input_locale) - 1; if (final_pos >= 3) { - char *name = save_input_locale; + const char *name = save_input_locale; /* Find next 'U' or 'u' and look from there */ while ((name += strcspn(name, "Uu") + 1) @@ -3391,8 +3592,8 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) DEBUG_L(PerlIO_printf(Perl_debug_log, "Locale %s ends with UTF-8 in name\n", save_input_locale)); - Safefree(save_input_locale); - return TRUE; + is_utf8 = TRUE; + goto finish_and_return; } } DEBUG_L(PerlIO_printf(Perl_debug_log, @@ -3400,19 +3601,19 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) save_input_locale)); } -# endif -# ifdef WIN32 +# ifdef WIN32 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */ if (memENDs(save_input_locale, final_pos, "65001")) { DEBUG_L(PerlIO_printf(Perl_debug_log, "Locale %s ends with 65001 in name, is UTF-8 locale\n", save_input_locale)); - Safefree(save_input_locale); - return TRUE; + is_utf8 = TRUE; + goto finish_and_return; } -# endif +# endif +# endif /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But * since we are about to return FALSE anyway, there is no point in doing @@ -3423,16 +3624,81 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) DEBUG_L(PerlIO_printf(Perl_debug_log, "Locale %s has 8859 in name, not UTF-8 locale\n", save_input_locale)); - Safefree(save_input_locale); - return FALSE; + is_utf8 = FALSE; + goto finish_and_return; } # endif DEBUG_L(PerlIO_printf(Perl_debug_log, "Assuming locale %s is not a UTF-8 locale\n", save_input_locale)); + is_utf8 = FALSE; + + finish_and_return: + + /* Cache this result so we don't have to go through all this next time. */ + utf8ness_cache_size = sizeof(PL_locale_utf8ness) + - (utf8ness_cache - PL_locale_utf8ness); + + /* But we can't save it if it is too large for the total space available */ + if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) { + Size_t utf8ness_cache_len = strlen(utf8ness_cache); + + /* Here it can fit, but we may need to clear out the oldest cached + * result(s) to do so. Check */ + if (utf8ness_cache_len + input_name_len_with_overhead + >= utf8ness_cache_size) + { + /* Here we have to clear something out to make room for this. + * Start looking at the rightmost place where it could fit and find + * the beginning of the entry that extends past that. */ + char * cutoff = (char *) my_memrchr(utf8ness_cache, + UTF8NESS_SEP[0], + utf8ness_cache_size + - input_name_len_with_overhead); + + assert(cutoff); + assert(cutoff >= utf8ness_cache); + + /* This and all subsequent entries must be removed */ + *cutoff = '\0'; + utf8ness_cache_len = strlen(utf8ness_cache); + } + + /* Make space for the new entry */ + Move(utf8ness_cache, + utf8ness_cache + input_name_len_with_overhead, + utf8ness_cache_len + 1 /* Incl. trailing NUL */, char); + + /* And insert it */ + Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char); + utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0'; + + if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1] + & (PERL_UINTMAX_T) ~1) != '0') + { + Perl_croak(aTHX_ + "panic: %s: %d: Corrupt utf8ness_cache=%s\nlen=%u," + " inserted_name=%s, its_len=%u\n", + __FILE__, __LINE__, + PL_locale_utf8ness, strlen(PL_locale_utf8ness), + delimited, input_name_len_with_overhead); + } + } + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, + "PL_locale_utf8ness is now %s; returning %d\n", + PL_locale_utf8ness, is_utf8); + } + +# endif + + Safefree(delimited); Safefree(save_input_locale); - return FALSE; + return is_utf8; } #endif @@ -3487,18 +3753,53 @@ Perl_my_strerror(pTHX_ const int errnum) # if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L) - /* This function is trivial if we have strerror_l() */ + /* This function is trivial if we don't have to worry about thread safety + * and have strerror_l(), as it handles the switch of locales so we don't + * have to deal with that. We don't have to worry about thread safety if + * this is an unthreaded build, or if strerror_r() is also available. Both + * it and strerror_l() are thread-safe. Plain strerror() isn't thread + * safe. But on threaded builds when strerror_r() is available, the + * apparent call to strerror() below is actually a macro that + * behind-the-scenes calls strerror_r(). + */ + +# if ! defined(USE_ITHREADS) || defined(HAS_STRERROR_R) if (within_locale_scope) { - errstr = strerror(errnum); + errstr = savepv(strerror(errnum)); } else { - errstr = strerror_l(errnum, PL_C_locale_obj); + errstr = savepv(strerror_l(errnum, PL_C_locale_obj)); + } + +# else + + /* Here we have strerror_l(), but not strerror_r() and we are on a + * threaded-build. We use strerror_l() for everything, constructing a + * locale to pass to it if necessary */ + + bool do_free = FALSE; + locale_t locale_to_use; + + if (within_locale_scope) { + locale_to_use = uselocale((locale_t) 0); + if (locale_to_use == LC_GLOBAL_LOCALE) { + locale_to_use = duplocale(LC_GLOBAL_LOCALE); + do_free = TRUE; + } + } + else { /* Use C locale if not within 'use locale' scope */ + locale_to_use = PL_C_locale_obj; } - errstr = savepv(errstr); + errstr = savepv(strerror_l(errnum, locale_to_use)); + + if (do_free) { + freelocale(locale_to_use); + } -# else /* Doesn't have strerror_l(). */ +# endif +# else /* Doesn't have strerror_l() */ # ifdef USE_POSIX_2008_LOCALE @@ -3506,7 +3807,7 @@ Perl_my_strerror(pTHX_ const int errnum) # else - char * save_locale = NULL; + const char * save_locale = NULL; bool locale_is_C = FALSE; /* We have a critical section to prevent another thread from changing the @@ -3681,65 +3982,9 @@ S_setlocale_debug_string(const int category, /* category number, static char ret[128] = "If you can read this, thank your buggy C" " library strlcpy(), and change your hints file" " to undef it"; - my_strlcpy(ret, "setlocale(", sizeof(ret)); - - switch (category) { - default: - my_snprintf(ret, sizeof(ret), "%s? %d", ret, category); - break; - -# ifdef LC_ALL - - case LC_ALL: - my_strlcat(ret, "LC_ALL", sizeof(ret)); - break; - -# endif -# ifdef LC_CTYPE - - case LC_CTYPE: - my_strlcat(ret, "LC_CTYPE", sizeof(ret)); - break; - -# endif -# ifdef LC_NUMERIC - - case LC_NUMERIC: - my_strlcat(ret, "LC_NUMERIC", sizeof(ret)); - break; - -# endif -# ifdef LC_COLLATE - - case LC_COLLATE: - my_strlcat(ret, "LC_COLLATE", sizeof(ret)); - break; - -# endif -# ifdef LC_TIME - - case LC_TIME: - my_strlcat(ret, "LC_TIME", sizeof(ret)); - break; - -# endif -# ifdef LC_MONETARY - - case LC_MONETARY: - my_strlcat(ret, "LC_MONETARY", sizeof(ret)); - break; - -# endif -# ifdef LC_MESSAGES - - case LC_MESSAGES: - my_strlcat(ret, "LC_MESSAGES", sizeof(ret)); - break; - -# endif - - } + my_strlcpy(ret, "setlocale(", sizeof(ret)); + my_strlcat(ret, category_name(category), sizeof(ret)); my_strlcat(ret, ", ", sizeof(ret)); if (locale) {