X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/c342d20eb18956826cfda8e89f2e6940436a9379..94646a694b90b98eae52c029474265026e05f891:/locale.c diff --git a/locale.c b/locale.c index 8521ffd..66c64cc 100644 --- a/locale.c +++ b/locale.c @@ -23,8 +23,8 @@ /* utility functions for handling locale-specific stuff like what * character represents the decimal point. * - * All C programs have an underlying locale. Perl generally doesn't pay any - * attention to it except within the scope of a 'use locale'. For most + * All C programs have an underlying locale. Perl code generally doesn't pay + * any attention to it except within the scope of a 'use locale'. For most * categories, it accomplishes this by just using different operations if it is * in such scope than if not. However, various libc functions called by Perl * are affected by the LC_NUMERIC category, so there are macros in perl.h that @@ -36,48 +36,88 @@ #include "EXTERN.h" #define PERL_IN_LOCALE_C +#include "perl_langinfo.h" #include "perl.h" -#ifdef I_LANGINFO -# include -#endif - #include "reentr.h" +#ifdef I_WCHAR +# include +#endif + /* If the environment says to, we can output debugging information during * initialization. This is done before option parsing, and before any thread * creation, so can be a file-level static */ -#ifdef DEBUGGING -# ifdef PERL_GLOBAL_STRUCT - /* no global syms allowed */ +#if ! defined(DEBUGGING) || defined(PERL_GLOBAL_STRUCT) # define debug_initialization 0 # define DEBUG_INITIALIZATION_set(v) -# else +#else static bool debug_initialization = FALSE; # define DEBUG_INITIALIZATION_set(v) (debug_initialization = v) -# endif #endif +/* strlen() of a literal string constant. We might want this more general, + * but using it in just this file for now. A problem with more generality is + * the compiler warnings about comparing unlike signs */ +#define STRLENs(s) (sizeof("" s "") - 1) + +/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the + * return of setlocale(), then this is extremely likely to be the C or POSIX + * locale. However, the output of setlocale() is documented to be opaque, but + * the odds are extremely small that it would return these two strings for some + * other locale. Note that VMS in these two locales includes many non-ASCII + * characters as controls and punctuation (below are hex bytes): + * cntrl: 84-97 9B-9F + * punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD + * Oddly, none there are listed as alphas, though some represent alphabetics + * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */ +#define isNAME_C_OR_POSIX(name) \ + ( (name) != NULL \ + && (( *(name) == 'C' && (*(name + 1)) == '\0') \ + || strEQ((name), "POSIX"))) + #ifdef USE_LOCALE -/* - * Standardize the locale name from a string returned by 'setlocale', possibly - * modifying that string. - * - * The typical return value of setlocale() is either - * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL - * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL - * (the space-separated values represent the various sublocales, - * in some unspecified order). This is not handled by this function. +/* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far + * looked up. This is in the form of a C string: */ + +#define UTF8NESS_SEP "\v" +#define UTF8NESS_PREFIX "\f" + +/* So, the string looks like: * - * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n", - * which is harmful for further use of the string in setlocale(). This - * function removes the trailing new line and everything up through the '=' + * \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0 * - */ + * where the digit 0 after the \a indicates that the locale starting just + * after the preceding \v is not UTF-8, and the digit 1 mean it is. */ + +STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1); +STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1); + +#define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \ + UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0" + +/* The cache is initialized to C_and_POSIX_utf8ness at start up. These are + * kept there always. The remining portion of the cache is LRU, with the + * oldest looked-up locale at the tail end */ + STATIC char * S_stdize_locale(pTHX_ char *locs) { + /* Standardize the locale name from a string returned by 'setlocale', + * possibly modifying that string. + * + * The typical return value of setlocale() is either + * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL + * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL + * (the space-separated values represent the various sublocales, + * in some unspecified order). This is not handled by this function. + * + * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n", + * which is harmful for further use of the string in setlocale(). This + * function removes the trailing new line and everything up through the '=' + * */ + const char * const s = strchr(locs, '='); bool okay = TRUE; @@ -103,78 +143,302 @@ S_stdize_locale(pTHX_ char *locs) return locs; } +/* Two parallel arrays; first the locale categories Perl uses on this system; + * the second array is their names. These arrays are in mostly arbitrary + * order. */ + +const int categories[] = { + +# ifdef USE_LOCALE_NUMERIC + LC_NUMERIC, +# endif +# ifdef USE_LOCALE_CTYPE + LC_CTYPE, +# endif +# ifdef USE_LOCALE_COLLATE + LC_COLLATE, +# endif +# ifdef USE_LOCALE_TIME + LC_TIME, +# endif +# ifdef USE_LOCALE_MESSAGES + LC_MESSAGES, +# endif +# ifdef USE_LOCALE_MONETARY + LC_MONETARY, +# endif +# ifdef USE_LOCALE_ADDRESS + LC_ADDRESS, +# endif +# ifdef USE_LOCALE_IDENTIFICATION + LC_IDENTIFICATION, +# endif +# ifdef USE_LOCALE_MEASUREMENT + LC_MEASUREMENT, +# endif +# ifdef USE_LOCALE_PAPER + LC_PAPER, +# endif +# ifdef USE_LOCALE_TELEPHONE + LC_TELEPHONE, +# endif +# ifdef LC_ALL + LC_ALL, +# endif + -1 /* Placeholder because C doesn't allow a + trailing comma, and it would get complicated + with all the #ifdef's */ +}; + +/* The top-most real element is LC_ALL */ + +const char * category_names[] = { + +# ifdef USE_LOCALE_NUMERIC + "LC_NUMERIC", +# endif +# ifdef USE_LOCALE_CTYPE + "LC_CTYPE", +# endif +# ifdef USE_LOCALE_COLLATE + "LC_COLLATE", +# endif +# ifdef USE_LOCALE_TIME + "LC_TIME", +# endif +# ifdef USE_LOCALE_MESSAGES + "LC_MESSAGES", +# endif +# ifdef USE_LOCALE_MONETARY + "LC_MONETARY", +# endif +# ifdef USE_LOCALE_ADDRESS + "LC_ADDRESS", +# endif +# ifdef USE_LOCALE_IDENTIFICATION + "LC_IDENTIFICATION", +# endif +# ifdef USE_LOCALE_MEASUREMENT + "LC_MEASUREMENT", +# endif +# ifdef USE_LOCALE_PAPER + "LC_PAPER", +# endif +# ifdef USE_LOCALE_TELEPHONE + "LC_TELEPHONE", +# endif +# ifdef LC_ALL + "LC_ALL", +# endif + NULL /* Placeholder */ + }; + +# ifdef LC_ALL + + /* On systems with LC_ALL, it is kept in the highest index position. (-2 + * to account for the final unused placeholder element.) */ +# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2) + +# else + + /* On systems without LC_ALL, we pretend it is there, one beyond the real + * top element, hence in the unused placeholder element. */ +# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1) + +# endif + +/* Pretending there is an LC_ALL element just above allows us to avoid most + * special cases. Most loops through these arrays in the code below are + * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'. They will work + * on either type of system. But the code must be written to not access the + * element at 'LC_ALL_INDEX' except on platforms that have it. This can be + * checked for at compile time by using the #define LC_ALL_INDEX which is only + * defined if we do have LC_ALL. */ + +STATIC const char * +S_category_name(const int category) +{ + unsigned int i; + +#ifdef LC_ALL + + if (category == LC_ALL) { + return "LC_ALL"; + } + #endif -void -Perl_set_numeric_radix(pTHX) + for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + if (category == categories[i]) { + return category_names[i]; + } + } + + { + const char suffix[] = " (unknown)"; + int temp = category; + Size_t length = sizeof(suffix) + 1; + char * unknown; + dTHX; + + if (temp < 0) { + length++; + temp = - temp; + } + + /* Calculate the number of digits */ + while (temp >= 10) { + temp /= 10; + length++; + } + + Newx(unknown, length, char); + my_snprintf(unknown, length, "%d%s", category, suffix); + SAVEFREEPV(unknown); + return unknown; + } +} + +/* Now create LC_foo_INDEX #defines for just those categories on this system */ +# ifdef USE_LOCALE_NUMERIC +# define LC_NUMERIC_INDEX 0 +# define _DUMMY_NUMERIC LC_NUMERIC_INDEX +# else +# define _DUMMY_NUMERIC -1 +# endif +# ifdef USE_LOCALE_CTYPE +# define LC_CTYPE_INDEX _DUMMY_NUMERIC + 1 +# define _DUMMY_CTYPE LC_CTYPE_INDEX +# else +# define _DUMMY_CTYPE _DUMMY_NUMERIC +# endif +# ifdef USE_LOCALE_COLLATE +# define LC_COLLATE_INDEX _DUMMY_CTYPE + 1 +# define _DUMMY_COLLATE LC_COLLATE_INDEX +# else +# define _DUMMY_COLLATE _DUMMY_COLLATE +# endif +# ifdef USE_LOCALE_TIME +# define LC_TIME_INDEX _DUMMY_COLLATE + 1 +# define _DUMMY_TIME LC_TIME_INDEX +# else +# define _DUMMY_TIME _DUMMY_COLLATE +# endif +# ifdef USE_LOCALE_MESSAGES +# define LC_MESSAGES_INDEX _DUMMY_TIME + 1 +# define _DUMMY_MESSAGES LC_MESSAGES_INDEX +# else +# define _DUMMY_MESSAGES _DUMMY_TIME +# endif +# ifdef USE_LOCALE_MONETARY +# define LC_MONETARY_INDEX _DUMMY_MESSAGES + 1 +# define _DUMMY_MONETARY LC_MONETARY_INDEX +# else +# define _DUMMY_MONETARY _DUMMY_MESSAGES +# endif +# ifdef USE_LOCALE_ADDRESS +# define LC_ADDRESS_INDEX _DUMMY_MONETARY + 1 +# define _DUMMY_ADDRESS LC_ADDRESS_INDEX +# else +# define _DUMMY_ADDRESS _DUMMY_MONETARY +# endif +# ifdef USE_LOCALE_IDENTIFICATION +# define LC_IDENTIFICATION_INDEX _DUMMY_ADDRESS + 1 +# define _DUMMY_IDENTIFICATION LC_IDENTIFICATION_INDEX +# else +# define _DUMMY_IDENTIFICATION _DUMMY_ADDRESS +# endif +# ifdef USE_LOCALE_MEASUREMENT +# define LC_MEASUREMENT_INDEX _DUMMY_IDENTIFICATION + 1 +# define _DUMMY_MEASUREMENT LC_MEASUREMENT_INDEX +# else +# define _DUMMY_MEASUREMENT _DUMMY_IDENTIFICATION +# endif +# ifdef USE_LOCALE_PAPER +# define LC_PAPER_INDEX _DUMMY_MEASUREMENT + 1 +# define _DUMMY_PAPER LC_PAPER_INDEX +# else +# define _DUMMY_PAPER _DUMMY_MEASUREMENT +# endif +# ifdef USE_LOCALE_TELEPHONE +# define LC_TELEPHONE_INDEX _DUMMY_PAPER + 1 +# define _DUMMY_TELEPHONE LC_TELEPHONE_INDEX +# else +# define _DUMMY_TELEPHONE _DUMMY_PAPER +# endif +# ifdef LC_ALL +# define LC_ALL_INDEX _DUMMY_TELEPHONE + 1 +# endif +#endif /* ifdef USE_LOCALE */ + +/* Windows requres a customized base-level setlocale() */ +# ifdef WIN32 +# define my_setlocale(cat, locale) win32_setlocale(cat, locale) +# else +# define my_setlocale(cat, locale) setlocale(cat, locale) +# endif + +/* Just placeholders for now. "_c" is intended to be called when the category + * is a constant known at compile time; "_r", not known until run time */ +# define do_setlocale_c(category, locale) my_setlocale(category, locale) +# define do_setlocale_r(category, locale) my_setlocale(category, locale) + +STATIC void +S_set_numeric_radix(pTHX_ const bool use_locale) { -#ifdef USE_LOCALE_NUMERIC -# ifdef HAS_LOCALECONV - const struct lconv* const lc = localeconv(); + /* If 'use_locale' is FALSE, set to use a dot for the radix character. If + * TRUE, use the radix character derived from the current locale */ - if (lc && lc->decimal_point) { - if (lc->decimal_point[0] == '.' && lc->decimal_point[1] == 0) { - SvREFCNT_dec(PL_numeric_radix_sv); - PL_numeric_radix_sv = NULL; - } - else { - if (PL_numeric_radix_sv) - sv_setpv(PL_numeric_radix_sv, lc->decimal_point); - else - PL_numeric_radix_sv = newSVpv(lc->decimal_point, 0); - if (! is_utf8_invariant_string((U8 *) lc->decimal_point, 0) - && is_utf8_string((U8 *) lc->decimal_point, 0) - && _is_cur_LC_category_utf8(LC_NUMERIC)) - { - SvUTF8_on(PL_numeric_radix_sv); - } - } +#if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \ + || defined(HAS_NL_LANGINFO)) + + const char * radix = (use_locale) + ? my_nl_langinfo(PERL_RADIXCHAR, FALSE) + /* FALSE => already in dest locale */ + : "."; + + sv_setpv(PL_numeric_radix_sv, radix); + + /* If this is valid UTF-8 that isn't totally ASCII, and we are in + * a UTF-8 locale, then mark the radix as being in UTF-8 */ + if (is_utf8_non_invariant_string((U8 *) SvPVX(PL_numeric_radix_sv), + SvCUR(PL_numeric_radix_sv)) + && _is_cur_LC_category_utf8(LC_NUMERIC)) + { + SvUTF8_on(PL_numeric_radix_sv); } - else - PL_numeric_radix_sv = NULL; -#ifdef DEBUGGING +# ifdef DEBUGGING + if (DEBUG_L_TEST || debug_initialization) { PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n", - (PL_numeric_radix_sv) - ? SvPVX(PL_numeric_radix_sv) - : "NULL", - (PL_numeric_radix_sv) - ? cBOOL(SvUTF8(PL_numeric_radix_sv)) - : 0); + SvPVX(PL_numeric_radix_sv), + cBOOL(SvUTF8(PL_numeric_radix_sv))); } -#endif -# endif /* HAS_LOCALECONV */ -#endif /* USE_LOCALE_NUMERIC */ +# endif +#endif /* USE_LOCALE_NUMERIC and can find the radix char */ + } -/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the - * return of setlocale(), then this is extremely likely to be the C or POSIX - * locale. However, the output of setlocale() is documented to be opaque, but - * the odds are extremely small that it would return these two strings for some - * other locale. Note that VMS in these two locales includes many non-ASCII - * characters as controls and punctuation (below are hex bytes): - * cntrl: 00-1F 7F 84-97 9B-9F - * punct: 21-2F 3A-40 5B-60 7B-7E A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD - * Oddly, none there are listed as alphas, though some represent alphabetics - * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */ -#define isNAME_C_OR_POSIX(name) ((name) != NULL \ - && ((*(name) == 'C' && (*(name + 1)) == '\0') \ - || strEQ((name), "POSIX"))) void Perl_new_numeric(pTHX_ const char *newnum) { -#ifdef USE_LOCALE_NUMERIC - /* Called after all libc setlocale() calls affecting LC_NUMERIC, to tell +#ifndef USE_LOCALE_NUMERIC + + PERL_UNUSED_ARG(newnum); + +#else + + /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell * core Perl this and that 'newnum' is the name of the new locale. * It installs this locale as the current underlying default. * * The default locale and the C locale can be toggled between by use of the - * set_numeric_local() and set_numeric_standard() functions, which should - * probably not be called directly, but only via macros like + * set_numeric_underlying() and set_numeric_standard() functions, which + * should probably not be called directly, but only via macros like * SET_NUMERIC_STANDARD() in perl.h. * * The toggling is necessary mainly so that a non-dot radix decimal point @@ -183,17 +447,21 @@ Perl_new_numeric(pTHX_ const char *newnum) * * This sets several interpreter-level variables: * PL_numeric_name The underlying locale's name: a copy of 'newnum' - * PL_numeric_local A boolean indicating if the toggled state is such + * PL_numeric_underlying A boolean indicating if the toggled state is such * that the current locale is the program's underlying * locale * PL_numeric_standard An int indicating if the toggled state is such - * that the current locale is the C locale. If non-zero, - * it is in C; if > 1, it means it may not be toggled away + * that the current locale is the C locale or + * indistinguishable from the C locale. If non-zero, it + * is in C; if > 1, it means it may not be toggled away * from C. - * Note that both of the last two variables can be true at the same time, - * if the underlying locale is C. (Toggling is a no-op under these - * circumstances.) - * + * PL_numeric_underlying_is_standard A bool kept by this function + * indicating that the underlying locale and the standard + * C locale are indistinguishable for the purposes of + * LC_NUMERIC. This happens when both of the above two + * variables are true at the same time. (Toggling is a + * no-op under these circumstances.) This variable is + * used to avoid having to recalculate. * Any code changing the locale (outside this file) should use * POSIX::setlocale, which calls this function. Therefore this function * should be called directly only from this file and from @@ -205,15 +473,25 @@ Perl_new_numeric(pTHX_ const char *newnum) Safefree(PL_numeric_name); PL_numeric_name = NULL; PL_numeric_standard = TRUE; - PL_numeric_local = TRUE; + PL_numeric_underlying = TRUE; + PL_numeric_underlying_is_standard = TRUE; return; } save_newnum = stdize_locale(savepv(newnum)); - + PL_numeric_underlying = TRUE; PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum); - PL_numeric_local = TRUE; + /* If its name isn't C nor POSIX, it could still be indistinguishable from + * them */ + if (! PL_numeric_standard) { + PL_numeric_standard = cBOOL(strEQ(".", my_nl_langinfo(PERL_RADIXCHAR, + FALSE /* Don't toggle locale */ )) + && strEQ("", my_nl_langinfo(PERL_THOUSEP, + FALSE))); + } + + /* Save the new name if it isn't the same as the previous one, if any */ if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) { Safefree(PL_numeric_name); PL_numeric_name = save_newnum; @@ -222,76 +500,104 @@ Perl_new_numeric(pTHX_ const char *newnum) Safefree(save_newnum); } + PL_numeric_underlying_is_standard = PL_numeric_standard; + +# if defined(HAS_NEWLOCALE) && ! defined(NO_POSIX_2008_LOCALE) + + PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK, + PL_numeric_name, + PL_underlying_numeric_obj); + +#endif + + if (DEBUG_L_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "Called new_numeric with %s, PL_numeric_name=%s\n", newnum, PL_numeric_name); + } + /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't * have to worry about the radix being a non-dot. (Core operations that * need the underlying locale change to it temporarily). */ set_numeric_standard(); - set_numeric_radix(); - -#else - PERL_UNUSED_ARG(newnum); #endif /* USE_LOCALE_NUMERIC */ + } void Perl_set_numeric_standard(pTHX) { + #ifdef USE_LOCALE_NUMERIC + /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The * macro avoids calling this routine if toggling isn't necessary according * to our records (which could be wrong if some XS code has changed the * locale behind our back) */ - setlocale(LC_NUMERIC, "C"); + do_setlocale_c(LC_NUMERIC, "C"); PL_numeric_standard = TRUE; - PL_numeric_local = isNAME_C_OR_POSIX(PL_numeric_name); - set_numeric_radix(); -#ifdef DEBUGGING + PL_numeric_underlying = PL_numeric_underlying_is_standard; + set_numeric_radix(0); + +# ifdef DEBUGGING + if (DEBUG_L_TEST || debug_initialization) { PerlIO_printf(Perl_debug_log, - "Underlying LC_NUMERIC locale now is C\n"); + "LC_NUMERIC locale now is standard C\n"); } -#endif +# endif #endif /* USE_LOCALE_NUMERIC */ + } void -Perl_set_numeric_local(pTHX) +Perl_set_numeric_underlying(pTHX) { + #ifdef USE_LOCALE_NUMERIC + /* Toggle the LC_NUMERIC locale to the current underlying default. Most - * code should use the macros like SET_NUMERIC_LOCAL() in perl.h instead of - * calling this directly. The macro avoids calling this routine if - * toggling isn't necessary according to our records (which could be wrong - * if some XS code has changed the locale behind our back) */ - - setlocale(LC_NUMERIC, PL_numeric_name); - PL_numeric_standard = isNAME_C_OR_POSIX(PL_numeric_name); - PL_numeric_local = TRUE; - set_numeric_radix(); -#ifdef DEBUGGING + * code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h + * instead of calling this directly. The macro avoids calling this routine + * if toggling isn't necessary according to our records (which could be + * wrong if some XS code has changed the locale behind our back) */ + + do_setlocale_c(LC_NUMERIC, PL_numeric_name); + PL_numeric_standard = PL_numeric_underlying_is_standard; + PL_numeric_underlying = TRUE; + set_numeric_radix(! PL_numeric_standard); + +# ifdef DEBUGGING + if (DEBUG_L_TEST || debug_initialization) { PerlIO_printf(Perl_debug_log, - "Underlying LC_NUMERIC locale now is %s\n", + "LC_NUMERIC locale now is %s\n", PL_numeric_name); } -#endif +# endif #endif /* USE_LOCALE_NUMERIC */ + } /* * Set up for a new ctype locale. */ -void -Perl_new_ctype(pTHX_ const char *newctype) +STATIC void +S_new_ctype(pTHX_ const char *newctype) { -#ifdef USE_LOCALE_CTYPE - /* Called after all libc setlocale() calls affecting LC_CTYPE, to tell +#ifndef USE_LOCALE_CTYPE + + PERL_ARGS_ASSERT_NEW_CTYPE; + PERL_UNUSED_ARG(newctype); + PERL_UNUSED_CONTEXT; + +#else + + /* Called after each libc setlocale() call affecting LC_CTYPE, to tell * core Perl this and that 'newctype' is the name of the new locale. * * This function sets up the folding arrays for all 256 bytes, assuming @@ -336,10 +642,10 @@ Perl_new_ctype(pTHX_ const char *newctype) unsigned int bad_count = 0; /* Count of bad characters */ for (i = 0; i < 256; i++) { - if (isUPPER_LC((U8) i)) - PL_fold_locale[i] = (U8) toLOWER_LC((U8) i); - else if (isLOWER_LC((U8) i)) - PL_fold_locale[i] = (U8) toUPPER_LC((U8) i); + if (isupper(i)) + PL_fold_locale[i] = (U8) tolower(i); + else if (islower(i)) + PL_fold_locale[i] = (U8) toupper(i); else PL_fold_locale[i] = (U8) i; @@ -352,13 +658,22 @@ Perl_new_ctype(pTHX_ const char *newctype) * nowadays. It isn't a problem for most controls to be changed * into something else; we check only \n and \t, though perhaps \r * could be an issue as well. */ - if (check_for_problems + if ( check_for_problems && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n')) { - if ((isALPHANUMERIC_A(i) && ! isALPHANUMERIC_LC(i)) - || (isPUNCT_A(i) && ! isPUNCT_LC(i)) - || (isBLANK_A(i) && ! isBLANK_LC(i)) - || (i == '\n' && ! isCNTRL_LC(i))) + if ( cBOOL(isalnum(i)) != cBOOL(isALPHANUMERIC(i)) + || cBOOL(isalpha(i)) != cBOOL(isALPHA_A(i)) + || cBOOL(isdigit(i)) != cBOOL(isDIGIT_A(i)) + || cBOOL(isgraph(i)) != cBOOL(isGRAPH_A(i)) + || cBOOL(islower(i)) != cBOOL(isLOWER_A(i)) + || cBOOL(isprint(i)) != cBOOL(isPRINT_A(i)) + || cBOOL(ispunct(i)) != cBOOL(isPUNCT_A(i)) + || cBOOL(isspace(i)) != cBOOL(isSPACE_A(i)) + || cBOOL(isupper(i)) != cBOOL(isUPPER_A(i)) + || cBOOL(isxdigit(i))!= cBOOL(isXDIGIT_A(i)) + || tolower(i) != (int) toLOWER_A(i) + || toupper(i) != (int) toUPPER_A(i) + || (i == '\n' && ! isCNTRL_LC(i))) { if (bad_count) { /* Separate multiple entries with a blank */ @@ -384,10 +699,15 @@ Perl_new_ctype(pTHX_ const char *newctype) } } -#ifdef MB_CUR_MAX +# ifdef MB_CUR_MAX + /* We only handle single-byte locales (outside of UTF-8 ones; so if * this locale requires more than one byte, there are going to be * problems. */ + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n", + __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX)); + if (check_for_problems && MB_CUR_MAX > 1 /* Some platforms return MB_CUR_MAX > 1 for even the "C" @@ -400,7 +720,8 @@ Perl_new_ctype(pTHX_ const char *newctype) { multi_byte_locale = TRUE; } -#endif + +# endif if (bad_count || multi_byte_locale) { PL_warn_locale = Perl_newSVpvf(aTHX_ @@ -432,12 +753,12 @@ Perl_new_ctype(pTHX_ const char *newctype) * here is transparent to this function's caller */ const char * const badlocale = savepv(newctype); - setlocale(LC_CTYPE, "C"); + do_setlocale_c(LC_CTYPE, "C"); /* The '0' below suppresses a bogus gcc compiler warning */ Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0); - setlocale(LC_CTYPE, badlocale); + do_setlocale_c(LC_CTYPE, badlocale); Safefree(badlocale); if (IN_LC(LC_CTYPE)) { @@ -449,9 +770,7 @@ Perl_new_ctype(pTHX_ const char *newctype) } #endif /* USE_LOCALE_CTYPE */ - PERL_ARGS_ASSERT_NEW_CTYPE; - PERL_UNUSED_ARG(newctype); - PERL_UNUSED_CONTEXT; + } void @@ -467,11 +786,9 @@ Perl__warn_problematic_locale() * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */ if (PL_warn_locale) { - /*GCC_DIAG_IGNORE(-Wformat-security); Didn't work */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0 /* dummy to avoid compiler warning */ ); - /* GCC_DIAG_RESTORE; */ SvREFCNT_dec_NN(PL_warn_locale); PL_warn_locale = NULL; } @@ -480,19 +797,20 @@ Perl__warn_problematic_locale() } -void -Perl_new_collate(pTHX_ const char *newcoll) +STATIC void +S_new_collate(pTHX_ const char *newcoll) { -#ifdef USE_LOCALE_COLLATE - /* Called after all libc setlocale() calls affecting LC_COLLATE, to tell +#ifndef USE_LOCALE_COLLATE + + PERL_UNUSED_ARG(newcoll); + PERL_UNUSED_CONTEXT; + +#else + + /* Called after each libc setlocale() call affecting LC_COLLATE, to tell * core Perl this and that 'newcoll' is the name of the new locale. * - * Any code changing the locale (outside this file) should use - * POSIX::setlocale, which calls this function. Therefore this function - * should be called directly only from this file and from - * POSIX::setlocale(). - * * The design of locale collation is that every locale change is given an * index 'PL_collation_ix'. The first time a string particpates in an * operation that requires collation while locale collation is active, it @@ -680,7 +998,8 @@ Perl_new_collate(pTHX_ const char *newcoll) PL_collxfrm_base = base + 1; } -#ifdef DEBUGGING +# ifdef DEBUGGING + if (DEBUG_L_TEST || debug_initialization) { PerlIO_printf(Perl_debug_log, "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, " @@ -691,95 +1010,77 @@ Perl_new_collate(pTHX_ const char *newcoll) x_len_shorter, x_len_longer, PL_collxfrm_mult, PL_collxfrm_base); } -#endif +# endif + } } -#else - PERL_UNUSED_ARG(newcoll); #endif /* USE_LOCALE_COLLATE */ + } #ifdef WIN32 -char * -Perl_my_setlocale(pTHX_ int category, const char* locale) +STATIC char * +S_win32_setlocale(pTHX_ int category, const char* locale) { /* This, for Windows, emulates POSIX setlocale() behavior. There is no - * difference unless the input locale is "", which means on Windows to get - * the machine default, which is set via the computer's "Regional and - * Language Options" (or its current equivalent). In POSIX, it instead - * means to find the locale from the user's environment. This routine - * looks in the environment, and, if anything is found, uses that instead - * of going to the machine default. If there is no environment override, - * the machine default is used, as normal, by calling the real setlocale() - * with "". The POSIX behavior is to use the LC_ALL variable if set; - * otherwise to use the particular category's variable if set; otherwise to - * use the LANG variable. */ + * difference between the two unless the input locale is "", which normally + * means on Windows to get the machine default, which is set via the + * computer's "Regional and Language Options" (or its current equivalent). + * In POSIX, it instead means to find the locale from the user's + * environment. This routine changes the Windows behavior to first look in + * the environment, and, if anything is found, use that instead of going to + * the machine default. If there is no environment override, the machine + * default is used, by calling the real setlocale() with "". + * + * The POSIX behavior is to use the LC_ALL variable if set; otherwise to + * use the particular category's variable if set; otherwise to use the LANG + * variable. */ bool override_LC_ALL = FALSE; char * result; + unsigned int i; if (locale && strEQ(locale, "")) { -# ifdef LC_ALL + +# ifdef LC_ALL + locale = PerlEnv_getenv("LC_ALL"); if (! locale) { -#endif - switch (category) { -# ifdef LC_ALL - case LC_ALL: - override_LC_ALL = TRUE; - break; /* We already know its variable isn't set */ -# endif -# ifdef USE_LOCALE_TIME - case LC_TIME: - locale = PerlEnv_getenv("LC_TIME"); - break; -# endif -# ifdef USE_LOCALE_CTYPE - case LC_CTYPE: - locale = PerlEnv_getenv("LC_CTYPE"); - break; -# endif -# ifdef USE_LOCALE_COLLATE - case LC_COLLATE: - locale = PerlEnv_getenv("LC_COLLATE"); - break; -# endif -# ifdef USE_LOCALE_MONETARY - case LC_MONETARY: - locale = PerlEnv_getenv("LC_MONETARY"); - break; -# endif -# ifdef USE_LOCALE_NUMERIC - case LC_NUMERIC: - locale = PerlEnv_getenv("LC_NUMERIC"); - break; -# endif -# ifdef USE_LOCALE_MESSAGES - case LC_MESSAGES: - locale = PerlEnv_getenv("LC_MESSAGES"); - break; -# endif - default: - /* This is a category, like PAPER_SIZE that we don't - * know about; and so can't provide a wrapper. */ - break; + if (category == LC_ALL) { + override_LC_ALL = TRUE; } - if (! locale) { + else { + +# endif + + for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + if (category == categories[i]) { + locale = PerlEnv_getenv(category_names[i]); + goto found_locale; + } + } + locale = PerlEnv_getenv("LANG"); if (! locale) { locale = ""; } + + found_locale: ; + +# ifdef LC_ALL + } -# ifdef LC_ALL } -# endif + +# endif + } result = setlocale(category, locale); DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, - _setlocale_debug_string(category, locale, result))); + setlocale_debug_string(category, locale, result))); if (! override_LC_ALL) { return result; @@ -790,108 +1091,813 @@ Perl_my_setlocale(pTHX_ int category, const char* locale) * lower priority than the other LC_foo variables, so override it for each * one that is set. (If they are set to "", it means to use the same thing * we just set LC_ALL to, so can skip) */ -# ifdef USE_LOCALE_TIME - result = PerlEnv_getenv("LC_TIME"); - if (result && strNE(result, "")) { - setlocale(LC_TIME, result); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - _setlocale_debug_string(LC_TIME, result, "not captured"))); - } -# endif -# ifdef USE_LOCALE_CTYPE - result = PerlEnv_getenv("LC_CTYPE"); - if (result && strNE(result, "")) { - setlocale(LC_CTYPE, result); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - _setlocale_debug_string(LC_CTYPE, result, "not captured"))); - } -# endif -# ifdef USE_LOCALE_COLLATE - result = PerlEnv_getenv("LC_COLLATE"); - if (result && strNE(result, "")) { - setlocale(LC_COLLATE, result); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - _setlocale_debug_string(LC_COLLATE, result, "not captured"))); - } -# endif -# ifdef USE_LOCALE_MONETARY - result = PerlEnv_getenv("LC_MONETARY"); - if (result && strNE(result, "")) { - setlocale(LC_MONETARY, result); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - _setlocale_debug_string(LC_MONETARY, result, "not captured"))); - } -# endif -# ifdef USE_LOCALE_NUMERIC - result = PerlEnv_getenv("LC_NUMERIC"); - if (result && strNE(result, "")) { - setlocale(LC_NUMERIC, result); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - _setlocale_debug_string(LC_NUMERIC, result, "not captured"))); - } -# endif -# ifdef USE_LOCALE_MESSAGES - result = PerlEnv_getenv("LC_MESSAGES"); - if (result && strNE(result, "")) { - setlocale(LC_MESSAGES, result); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - _setlocale_debug_string(LC_MESSAGES, result, "not captured"))); - } -# endif + + for (i = 0; i < LC_ALL_INDEX; i++) { + result = PerlEnv_getenv(category_names[i]); + if (result && strNE(result, "")) { + setlocale(categories[i], result); + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", + __FILE__, __LINE__, + setlocale_debug_string(categories[i], result, "not captured"))); + } + } result = setlocale(LC_ALL, NULL); DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, - _setlocale_debug_string(LC_ALL, NULL, result))); + setlocale_debug_string(LC_ALL, NULL, result))); return result; } #endif - -/* - * Initialize locale awareness. - */ -int -Perl_init_i18nl10n(pTHX_ int printwarn) +char * +Perl_setlocale(int category, const char * locale) { - /* printwarn is - * - * 0 if not to output warning when setup locale is bad - * 1 if to output warning based on value of PERL_BADLANG - * >1 if to output regardless of PERL_BADLANG - * - * returns - * 1 = set ok or not applicable, - * 0 = fallback to a locale of lower priority - * -1 = fallback to all locales failed, not even to the C locale - * - * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is - * set, debugging information is output. - * - * This looks more complicated than it is, mainly due to the #ifdefs. - * - * We try to set LC_ALL to the value determined by the environment. If - * there is no LC_ALL on this platform, we try the individual categories we - * know about. If this works, we are done. - * - * But if it doesn't work, we have to do something else. We search the - * environment variables ourselves instead of relying on the system to do - * it. We look at, in order, LC_ALL, LANG, a system default locale (if we - * think there is one), and the ultimate fallback "C". This is all done in - * the same loop as above to avoid duplicating code, but it makes things - * more complex. After the original failure, we add the fallback - * possibilities to the list of locales to try, and iterate the loop - * through them all until one succeeds. - * - * On Ultrix, the locale MUST come from the environment, so there is + /* This wraps POSIX::setlocale() */ + + char * retval; + char * newlocale; + dTHX; + +#ifdef USE_LOCALE_NUMERIC + + /* A NULL locale means only query what the current one is. We have the + * LC_NUMERIC name saved, because we are normally switched into the C + * locale for it. For an LC_ALL query, switch back to get the correct + * results. All other categories don't require special handling */ + if (locale == NULL) { + if (category == LC_NUMERIC) { + return savepv(PL_numeric_name); + } + +# ifdef LC_ALL + + else if (category == LC_ALL && ! PL_numeric_underlying) { + + SET_NUMERIC_UNDERLYING(); + } + +# endif + + } + +#endif + + /* Save retval since subsequent setlocale() calls may overwrite it. */ + retval = savepv(do_setlocale_r(category, locale)); + + DEBUG_L(PerlIO_printf(Perl_debug_log, + "%s:%d: %s\n", __FILE__, __LINE__, + setlocale_debug_string(category, locale, retval))); + if (! retval) { + /* Should never happen that a query would return an error, but be + * sure and reset to C locale */ + if (locale == 0) { + SET_NUMERIC_STANDARD(); + } + + return NULL; + } + + /* If locale == NULL, we are just querying the state, but may have switched + * to NUMERIC_UNDERLYING. Switch back before returning. */ + if (locale == NULL) { + SET_NUMERIC_STANDARD(); + return retval; + } + + /* Now that have switched locales, we have to update our records to + * correspond. */ + + switch (category) { + +#ifdef USE_LOCALE_CTYPE + + case LC_CTYPE: + new_ctype(retval); + break; + +#endif +#ifdef USE_LOCALE_COLLATE + + case LC_COLLATE: + new_collate(retval); + break; + +#endif +#ifdef USE_LOCALE_NUMERIC + + case LC_NUMERIC: + new_numeric(retval); + break; + +#endif +#ifdef LC_ALL + + case LC_ALL: + + /* LC_ALL updates all the things we care about. The values may not + * be the same as 'retval', as the locale "" may have set things + * individually */ + +# ifdef USE_LOCALE_CTYPE + + newlocale = do_setlocale_c(LC_CTYPE, NULL); + new_ctype(newlocale); + +# endif /* USE_LOCALE_CTYPE */ +# ifdef USE_LOCALE_COLLATE + + newlocale = do_setlocale_c(LC_COLLATE, NULL); + new_collate(newlocale); + +# endif +# ifdef USE_LOCALE_NUMERIC + + newlocale = do_setlocale_c(LC_NUMERIC, NULL); + new_numeric(newlocale); + +# endif /* USE_LOCALE_NUMERIC */ +#endif /* LC_ALL */ + + default: + break; + } + + return retval; + + +} + +PERL_STATIC_INLINE const char * +S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset) +{ + /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size', + * growing it if necessary */ + + const Size_t string_size = strlen(string) + offset + 1; + + PERL_ARGS_ASSERT_SAVE_TO_BUFFER; + + if (*buf_size == 0) { + Newx(*buf, string_size, char); + *buf_size = string_size; + } + else if (string_size > *buf_size) { + Renew(*buf, string_size, char); + *buf_size = string_size; + } + + Copy(string, *buf + offset, string_size - offset, char); + return *buf; +} + +/* + +=head1 Locale-related functions and macros + +=for apidoc Perl_langinfo + +This is an (almost ª) drop-in replacement for the system C>, +taking the same C parameter values, and returning the same information. +But it is more thread-safe than regular C, and hides the quirks +of Perl's locale handling from your code, and can be used on systems that lack +a native C. + +Expanding on these: + +=over + +=item * + +It delivers the correct results for the C and C items, +without you having to write extra code. The reason for the extra code would be +because these are from the C locale category, which is normally +kept set to the C locale by Perl, no matter what the underlying locale is +supposed to be, and so to get the expected results, you have to temporarily +toggle into the underlying locale, and later toggle back. (You could use +plain C and C> for this +but then you wouldn't get the other advantages of C; not +keeping C in the C locale would break a lot of CPAN, which is +expecting the radix (decimal point) character to be a dot.) + +=item * + +Depending on C, it works on systems that don't have C, hence +makes your code more portable. Of the fifty-some possible items specified by +the POSIX 2008 standard, +L, +only two are completely unimplemented. It uses various techniques to recover +the other items, including calling C>, and C>, +both of which are specified in C89, so should be always be available. Later +C versions have additional capabilities; C<""> is returned for +those not available on your system. + +The details for those items which may differ from what this emulation returns +and what a native C would return are: + +=over + +=item C + +=item C + +Unimplemented, so returns C<"">. + +=item C + +=item C + +=item C + +=item C + +Only the values for English are returned. C and C have been +removed from POSIX 2008, and are retained for backwards compatibility. Your +platform's C may not support them. + +=item C + +Always evaluates to C<%x>, the locale's appropriate date representation. + +=item C + +Always evaluates to C<%X>, the locale's appropriate time representation. + +=item C + +Always evaluates to C<%c>, the locale's appropriate date and time +representation. + +=item C + +The return may be incorrect for those rare locales where the currency symbol +replaces the radix character. +Send email to L if you have examples of it needing +to work differently. + +=item C + +Currently this gives the same results as Linux does. +Send email to L if you have examples of it needing +to work differently. + +=item C + +=item C + +=item C + +=item C + +These are derived by using C, and not all versions of that function +know about them. C<""> is returned for these on such systems. + +=back + +When using C on systems that don't have a native +C, you must + + #include "perl_langinfo.h" + +before the C C<#include>. You can replace your C +C<#include> with this one. (Doing it this way keeps out the symbols that plain +C imports into the namespace for code that doesn't need it.) + +You also should not use the bare C item names, but should preface +them with C, so use C instead of plain C. +The C> versions will also work for this function on systems that do +have a native C. + +=item * + +It is thread-friendly, returning its result in a buffer that won't be +overwritten by another thread, so you don't have to code for that possibility. +The buffer can be overwritten by the next call to C or +C in the same thread. + +=item * + +ª It returns S>, whereas plain C returns S>, but you are (only by documentation) forbidden to write into the buffer. +By declaring this C, the compiler enforces this restriction. The extra +C is why this isn't an unequivocal drop-in replacement for +C. + +=back + +The original impetus for C was so that code that needs to +find out the current currency symbol, floating point radix character, or digit +grouping separator can use, on all systems, the simpler and more +thread-friendly C API instead of C> which is a +pain to make thread-friendly. For other fields returned by C, it +is better to use the methods given in L to call +L|POSIX/localeconv>, which is thread-friendly. + +=cut + +*/ + +const char * +#ifdef HAS_NL_LANGINFO +Perl_langinfo(const nl_item item) +#else +Perl_langinfo(const int item) +#endif +{ + return my_nl_langinfo(item, TRUE); +} + +const char * +#ifdef HAS_NL_LANGINFO +S_my_nl_langinfo(const nl_item item, bool toggle) +#else +S_my_nl_langinfo(const int item, bool toggle) +#endif +{ + dTHX; + + /* We only need to toggle into the underlying LC_NUMERIC locale for these + * two items, and only if not already there */ + if (toggle && (( item != PERL_RADIXCHAR && item != PERL_THOUSEP) + || PL_numeric_underlying)) + { + toggle = FALSE; + } + +#if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */ +#if ! defined(HAS_POSIX_2008_LOCALE) + + /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC + * for those items dependent on it. This must be copied to a buffer before + * switching back, as some systems destroy the buffer when setlocale() is + * called */ + + { + DECLARATION_FOR_LC_NUMERIC_MANIPULATION; + + if (toggle) { + STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); + } + + LOCALE_LOCK; /* Prevent interference from another thread executing + this code section (the only call to nl_langinfo in + the core) */ + + save_to_buffer(nl_langinfo(item), &PL_langinfo_buf, + &PL_langinfo_bufsize, 0); + + LOCALE_UNLOCK; + + if (toggle) { + RESTORE_LC_NUMERIC(); + } + } + +# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */ + + { + bool do_free = FALSE; + locale_t cur = uselocale((locale_t) 0); + + if (cur == LC_GLOBAL_LOCALE) { + cur = duplocale(LC_GLOBAL_LOCALE); + do_free = TRUE; + } + + if (toggle) { + if (PL_underlying_numeric_obj) { + cur = PL_underlying_numeric_obj; + } + else { + cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur); + do_free = TRUE; + } + } + + save_to_buffer(nl_langinfo_l(item, cur), + &PL_langinfo_buf, &PL_langinfo_bufsize, 0); + if (do_free) { + freelocale(cur); + } + } + +# endif + + if (strEQ(PL_langinfo_buf, "")) { + if (item == PERL_YESSTR) { + return "yes"; + } + if (item == PERL_NOSTR) { + return "no"; + } + } + + return PL_langinfo_buf; + +#else /* Below, emulate nl_langinfo as best we can */ + + { + +# ifdef HAS_LOCALECONV + + const struct lconv* lc; + DECLARATION_FOR_LC_NUMERIC_MANIPULATION; + +# endif +# ifdef HAS_STRFTIME + + struct tm tm; + bool return_format = FALSE; /* Return the %format, not the value */ + const char * format; + +# endif + + /* We copy the results to a per-thread buffer, even if not + * multi-threaded. This is in part to simplify this code, and partly + * because we need a buffer anyway for strftime(), and partly because a + * call of localeconv() could otherwise wipe out the buffer, and the + * programmer would not be expecting this, as this is a nl_langinfo() + * substitute after all, so s/he might be thinking their localeconv() + * is safe until another localeconv() call. */ + + switch (item) { + Size_t len; + const char * retval; + + /* These 2 are unimplemented */ + case PERL_CODESET: + case PERL_ERA: /* For use with strftime() %E modifier */ + + default: + return ""; + + /* We use only an English set, since we don't know any more */ + case PERL_YESEXPR: return "^[+1yY]"; + case PERL_YESSTR: return "yes"; + case PERL_NOEXPR: return "^[-0nN]"; + case PERL_NOSTR: return "no"; + +# ifdef HAS_LOCALECONV + + case PERL_CRNCYSTR: + + /* We don't bother with localeconv_l() because any system that + * has it is likely to also have nl_langinfo() */ + + LOCALE_LOCK; /* Prevent interference with other threads + using localeconv() */ + + lc = localeconv(); + if ( ! lc + || ! lc->currency_symbol + || strEQ("", lc->currency_symbol)) + { + LOCALE_UNLOCK; + return ""; + } + + /* Leave the first spot empty to be filled in below */ + save_to_buffer(lc->currency_symbol, &PL_langinfo_buf, + &PL_langinfo_bufsize, 1); + if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, "")) + { /* khw couldn't figure out how the localedef specifications + would show that the $ should replace the radix; this is + just a guess as to how it might work.*/ + *PL_langinfo_buf = '.'; + } + else if (lc->p_cs_precedes) { + *PL_langinfo_buf = '-'; + } + else { + *PL_langinfo_buf = '+'; + } + + LOCALE_UNLOCK; + break; + + case PERL_RADIXCHAR: + case PERL_THOUSEP: + + if (toggle) { + STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); + } + + LOCALE_LOCK; /* Prevent interference with other threads + using localeconv() */ + + lc = localeconv(); + if (! lc) { + retval = ""; + } + else { + retval = (item == PERL_RADIXCHAR) + ? lc->decimal_point + : lc->thousands_sep; + if (! retval) { + retval = ""; + } + } + + save_to_buffer(retval, &PL_langinfo_buf, + &PL_langinfo_bufsize, 0); + + LOCALE_UNLOCK; + + if (toggle) { + RESTORE_LC_NUMERIC(); + } + + break; + +# endif +# ifdef HAS_STRFTIME + + /* These are defined by C89, so we assume that strftime supports + * them, and so are returned unconditionally; they may not be what + * the locale actually says, but should give good enough results + * for someone using them as formats (as opposed to trying to parse + * them to figure out what the locale says). The other format + * items are actually tested to verify they work on the platform */ + case PERL_D_FMT: return "%x"; + case PERL_T_FMT: return "%X"; + case PERL_D_T_FMT: return "%c"; + + /* These formats are only available in later strfmtime's */ + case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT: + case PERL_T_FMT_AMPM: + + /* The rest can be gotten from most versions of strftime(). */ + case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3: + case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6: + case PERL_ABDAY_7: + case PERL_ALT_DIGITS: + case PERL_AM_STR: case PERL_PM_STR: + case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3: + case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6: + case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9: + case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12: + case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4: + case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7: + case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4: + case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8: + case PERL_MON_9: case PERL_MON_10: case PERL_MON_11: + case PERL_MON_12: + + LOCALE_LOCK; + + init_tm(&tm); /* Precaution against core dumps */ + tm.tm_sec = 30; + tm.tm_min = 30; + tm.tm_hour = 6; + tm.tm_year = 2017 - 1900; + tm.tm_wday = 0; + tm.tm_mon = 0; + switch (item) { + default: + LOCALE_UNLOCK; + Perl_croak(aTHX_ + "panic: %s: %d: switch case: %d problem", + __FILE__, __LINE__, item); + NOT_REACHED; /* NOTREACHED */ + + case PERL_PM_STR: tm.tm_hour = 18; + case PERL_AM_STR: + format = "%p"; + break; + + case PERL_ABDAY_7: tm.tm_wday++; + case PERL_ABDAY_6: tm.tm_wday++; + case PERL_ABDAY_5: tm.tm_wday++; + case PERL_ABDAY_4: tm.tm_wday++; + case PERL_ABDAY_3: tm.tm_wday++; + case PERL_ABDAY_2: tm.tm_wday++; + case PERL_ABDAY_1: + format = "%a"; + break; + + case PERL_DAY_7: tm.tm_wday++; + case PERL_DAY_6: tm.tm_wday++; + case PERL_DAY_5: tm.tm_wday++; + case PERL_DAY_4: tm.tm_wday++; + case PERL_DAY_3: tm.tm_wday++; + case PERL_DAY_2: tm.tm_wday++; + case PERL_DAY_1: + format = "%A"; + break; + + case PERL_ABMON_12: tm.tm_mon++; + case PERL_ABMON_11: tm.tm_mon++; + case PERL_ABMON_10: tm.tm_mon++; + case PERL_ABMON_9: tm.tm_mon++; + case PERL_ABMON_8: tm.tm_mon++; + case PERL_ABMON_7: tm.tm_mon++; + case PERL_ABMON_6: tm.tm_mon++; + case PERL_ABMON_5: tm.tm_mon++; + case PERL_ABMON_4: tm.tm_mon++; + case PERL_ABMON_3: tm.tm_mon++; + case PERL_ABMON_2: tm.tm_mon++; + case PERL_ABMON_1: + format = "%b"; + break; + + case PERL_MON_12: tm.tm_mon++; + case PERL_MON_11: tm.tm_mon++; + case PERL_MON_10: tm.tm_mon++; + case PERL_MON_9: tm.tm_mon++; + case PERL_MON_8: tm.tm_mon++; + case PERL_MON_7: tm.tm_mon++; + case PERL_MON_6: tm.tm_mon++; + case PERL_MON_5: tm.tm_mon++; + case PERL_MON_4: tm.tm_mon++; + case PERL_MON_3: tm.tm_mon++; + case PERL_MON_2: tm.tm_mon++; + case PERL_MON_1: + format = "%B"; + break; + + case PERL_T_FMT_AMPM: + format = "%r"; + return_format = TRUE; + break; + + case PERL_ERA_D_FMT: + format = "%Ex"; + return_format = TRUE; + break; + + case PERL_ERA_T_FMT: + format = "%EX"; + return_format = TRUE; + break; + + case PERL_ERA_D_T_FMT: + format = "%Ec"; + return_format = TRUE; + break; + + case PERL_ALT_DIGITS: + tm.tm_wday = 0; + format = "%Ow"; /* Find the alternate digit for 0 */ + break; + } + + /* We can't use my_strftime() because it doesn't look at + * tm_wday */ + while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize, + format, &tm)) + { + /* A zero return means one of: + * a) there wasn't enough space in PL_langinfo_buf + * b) the format, like a plain %p, returns empty + * c) it was an illegal format, though some + * implementations of strftime will just return the + * illegal format as a plain character sequence. + * + * To quickly test for case 'b)', try again but precede + * the format with a plain character. If that result is + * still empty, the problem is either 'a)' or 'c)' */ + + Size_t format_size = strlen(format) + 1; + Size_t mod_size = format_size + 1; + char * mod_format; + char * temp_result; + + Newx(mod_format, mod_size, char); + Newx(temp_result, PL_langinfo_bufsize, char); + *mod_format = ' '; + my_strlcpy(mod_format + 1, format, mod_size); + len = strftime(temp_result, + PL_langinfo_bufsize, + mod_format, &tm); + Safefree(mod_format); + Safefree(temp_result); + + /* If 'len' is non-zero, it means that we had a case like + * %p which means the current locale doesn't use a.m. or + * p.m., and that is valid */ + if (len == 0) { + + /* Here, still didn't work. If we get well beyond a + * reasonable size, bail out to prevent an infinite + * loop. */ + + if (PL_langinfo_bufsize > 100 * format_size) { + *PL_langinfo_buf = '\0'; + } + else { + /* Double the buffer size to retry; Add 1 in case + * original was 0, so we aren't stuck at 0. */ + PL_langinfo_bufsize *= 2; + PL_langinfo_bufsize++; + Renew(PL_langinfo_buf, PL_langinfo_bufsize, char); + continue; + } + } + + break; + } + + /* Here, we got a result. + * + * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the + * alternate format for wday 0. If the value is the same as + * the normal 0, there isn't an alternate, so clear the buffer. + * */ + if ( item == PERL_ALT_DIGITS + && strEQ(PL_langinfo_buf, "0")) + { + *PL_langinfo_buf = '\0'; + } + + /* ALT_DIGITS is problematic. Experiments on it showed that + * strftime() did not always work properly when going from + * alt-9 to alt-10. Only a few locales have this item defined, + * and in all of them on Linux that khw was able to find, + * nl_langinfo() merely returned the alt-0 character, possibly + * doubled. Most Unicode digits are in blocks of 10 + * consecutive code points, so that is sufficient information + * for those scripts, as we can infer alt-1, alt-2, .... But + * for a Japanese locale, a CJK ideographic 0 is returned, and + * the CJK digits are not in code point order, so you can't + * really infer anything. The localedef for this locale did + * specify the succeeding digits, so that strftime() works + * properly on them, without needing to infer anything. But + * the nl_langinfo() return did not give sufficient information + * for the caller to understand what's going on. So until + * there is evidence that it should work differently, this + * returns the alt-0 string for ALT_DIGITS. + * + * wday was chosen because its range is all a single digit. + * Things like tm_sec have two digits as the minimum: '00' */ + + LOCALE_UNLOCK; + + /* If to return the format, not the value, overwrite the buffer + * with it. But some strftime()s will keep the original format + * if illegal, so change those to "" */ + if (return_format) { + if (strEQ(PL_langinfo_buf, format)) { + *PL_langinfo_buf = '\0'; + } + else { + save_to_buffer(format, &PL_langinfo_buf, + &PL_langinfo_bufsize, 0); + } + } + + break; + +# endif + + } + } + + return PL_langinfo_buf; + +#endif + +} + +/* + * Initialize locale awareness. + */ +int +Perl_init_i18nl10n(pTHX_ int printwarn) +{ + /* printwarn is + * + * 0 if not to output warning when setup locale is bad + * 1 if to output warning based on value of PERL_BADLANG + * >1 if to output regardless of PERL_BADLANG + * + * returns + * 1 = set ok or not applicable, + * 0 = fallback to a locale of lower priority + * -1 = fallback to all locales failed, not even to the C locale + * + * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is + * set, debugging information is output. + * + * This looks more complicated than it is, mainly due to the #ifdefs. + * + * We try to set LC_ALL to the value determined by the environment. If + * there is no LC_ALL on this platform, we try the individual categories we + * know about. If this works, we are done. + * + * But if it doesn't work, we have to do something else. We search the + * environment variables ourselves instead of relying on the system to do + * it. We look at, in order, LC_ALL, LANG, a system default locale (if we + * think there is one), and the ultimate fallback "C". This is all done in + * the same loop as above to avoid duplicating code, but it makes things + * more complex. The 'trial_locales' array is initialized with just one + * element; it causes the behavior described in the paragraph above this to + * happen. If that fails, we add elements to 'trial_locales', and do extra + * loop iterations to cause the behavior described in this paragraph. + * + * On Ultrix, the locale MUST come from the environment, so there is * preliminary code to set it. I (khw) am not sure that it is necessary, * and that this couldn't be folded into the loop, but barring any real * platforms to test on, it's staying as-is @@ -911,19 +1917,16 @@ Perl_init_i18nl10n(pTHX_ int printwarn) int ok = 1; -#if defined(USE_LOCALE) -#ifdef USE_LOCALE_CTYPE - char *curctype = NULL; -#endif /* USE_LOCALE_CTYPE */ -#ifdef USE_LOCALE_COLLATE - char *curcoll = NULL; -#endif /* USE_LOCALE_COLLATE */ -#ifdef USE_LOCALE_NUMERIC - char *curnum = NULL; -#endif /* USE_LOCALE_NUMERIC */ -#ifdef __GLIBC__ +#ifndef USE_LOCALE + + PERL_UNUSED_ARG(printwarn); + +#else /* USE_LOCALE */ +# ifdef __GLIBC__ + const char * const language = savepv(PerlEnv_getenv("LANGUAGE")); -#endif + +# endif /* NULL uses the existing already set up locale */ const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT")) @@ -935,127 +1938,152 @@ Perl_init_i18nl10n(pTHX_ int printwarn) const char * const lang = savepv(PerlEnv_getenv("LANG")); bool setlocale_failure = FALSE; unsigned int i; - char *p; /* A later getenv() could zap this, so only use here */ const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG"); const bool locwarn = (printwarn > 1 - || (printwarn - && (! bad_lang_use_once + || ( printwarn + && ( ! bad_lang_use_once || ( - /* disallow with "" or "0" */ - *bad_lang_use_once - && strNE("0", bad_lang_use_once))))); - bool done = FALSE; - char * sl_result; /* return from setlocale() */ - char * locale_param; -#ifdef WIN32 + /* disallow with "" or "0" */ + *bad_lang_use_once + && strNE("0", bad_lang_use_once))))); + + /* setlocale() return vals; not copied so must be looked at immediately */ + const char * sl_result[NOMINAL_LC_ALL_INDEX + 1]; + + /* current locale for given category; should have been copied so aren't + * volatile */ + const char * curlocales[NOMINAL_LC_ALL_INDEX + 1]; + +# ifdef WIN32 + /* In some systems you can find out the system default locale * and use that as the fallback locale. */ -# define SYSTEM_DEFAULT_LOCALE -#endif -#ifdef SYSTEM_DEFAULT_LOCALE +# define SYSTEM_DEFAULT_LOCALE +# endif +# ifdef SYSTEM_DEFAULT_LOCALE + const char *system_default_locale = NULL; -#endif -#ifdef DEBUGGING - DEBUG_INITIALIZATION_set((PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")) - ? TRUE - : FALSE); -# define DEBUG_LOCALE_INIT(category, locale, result) \ +# endif + +# ifndef DEBUGGING +# define DEBUG_LOCALE_INIT(a,b,c) +# else + + DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT"))); + +# define DEBUG_LOCALE_INIT(category, locale, result) \ STMT_START { \ if (debug_initialization) { \ PerlIO_printf(Perl_debug_log, \ "%s:%d: %s\n", \ __FILE__, __LINE__, \ - _setlocale_debug_string(category, \ + setlocale_debug_string(category, \ locale, \ result)); \ } \ } STMT_END -#else -# define DEBUG_LOCALE_INIT(a,b,c) -#endif -#ifndef LOCALE_ENVIRON_REQUIRED - PERL_UNUSED_VAR(done); - PERL_UNUSED_VAR(locale_param); -#else +/* Make sure the parallel arrays are properly set up */ +# ifdef USE_LOCALE_NUMERIC + assert(categories[LC_NUMERIC_INDEX] == LC_NUMERIC); + assert(strEQ(category_names[LC_NUMERIC_INDEX], "LC_NUMERIC")); +# endif +# ifdef USE_LOCALE_CTYPE + assert(categories[LC_CTYPE_INDEX] == LC_CTYPE); + assert(strEQ(category_names[LC_CTYPE_INDEX], "LC_CTYPE")); +# endif +# ifdef USE_LOCALE_COLLATE + assert(categories[LC_COLLATE_INDEX] == LC_COLLATE); + assert(strEQ(category_names[LC_COLLATE_INDEX], "LC_COLLATE")); +# endif +# ifdef USE_LOCALE_TIME + assert(categories[LC_TIME_INDEX] == LC_TIME); + assert(strEQ(category_names[LC_TIME_INDEX], "LC_TIME")); +# endif +# ifdef USE_LOCALE_MESSAGES + assert(categories[LC_MESSAGES_INDEX] == LC_MESSAGES); + assert(strEQ(category_names[LC_MESSAGES_INDEX], "LC_MESSAGES")); +# endif +# ifdef USE_LOCALE_MONETARY + assert(categories[LC_MONETARY_INDEX] == LC_MONETARY); + assert(strEQ(category_names[LC_MONETARY_INDEX], "LC_MONETARY")); +# endif +# ifdef USE_LOCALE_ADDRESS + assert(categories[LC_ADDRESS_INDEX] == LC_ADDRESS); + assert(strEQ(category_names[LC_ADDRESS_INDEX], "LC_ADDRESS")); +# endif +# ifdef USE_LOCALE_IDENTIFICATION + assert(categories[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION); + assert(strEQ(category_names[LC_IDENTIFICATION_INDEX], "LC_IDENTIFICATION")); +# endif +# ifdef USE_LOCALE_MEASUREMENT + assert(categories[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT); + assert(strEQ(category_names[LC_MEASUREMENT_INDEX], "LC_MEASUREMENT")); +# endif +# ifdef USE_LOCALE_PAPER + assert(categories[LC_PAPER_INDEX] == LC_PAPER); + assert(strEQ(category_names[LC_PAPER_INDEX], "LC_PAPER")); +# endif +# ifdef USE_LOCALE_TELEPHONE + assert(categories[LC_TELEPHONE_INDEX] == LC_TELEPHONE); + assert(strEQ(category_names[LC_TELEPHONE_INDEX], "LC_TELEPHONE")); +# endif +# ifdef LC_ALL + assert(categories[LC_ALL_INDEX] == LC_ALL); + assert(strEQ(category_names[LC_ALL_INDEX], "LC_ALL")); + assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX); +# endif +# endif /* DEBUGGING */ + + /* Initialize the cache of the program's UTF-8ness for the always known + * locales C and POSIX */ + my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness, + sizeof(PL_locale_utf8ness)); + + PL_numeric_radix_sv = newSVpvs("."); + +# ifdef LOCALE_ENVIRON_REQUIRED /* * Ultrix setlocale(..., "") fails if there are no environment * variables from which to get a locale name. */ -# ifdef LC_ALL - if (lang) { - sl_result = my_setlocale(LC_ALL, setlocale_init); - DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result); - if (sl_result) - done = TRUE; - else - setlocale_failure = TRUE; - } - if (! setlocale_failure) { -# ifdef USE_LOCALE_CTYPE - locale_param = (! done && (lang || PerlEnv_getenv("LC_CTYPE"))) - ? setlocale_init - : NULL; - curctype = my_setlocale(LC_CTYPE, locale_param); - DEBUG_LOCALE_INIT(LC_CTYPE, locale_param, sl_result); - if (! curctype) - setlocale_failure = TRUE; - else - curctype = savepv(curctype); -# endif /* USE_LOCALE_CTYPE */ -# ifdef USE_LOCALE_COLLATE - locale_param = (! done && (lang || PerlEnv_getenv("LC_COLLATE"))) - ? setlocale_init - : NULL; - curcoll = my_setlocale(LC_COLLATE, locale_param); - DEBUG_LOCALE_INIT(LC_COLLATE, locale_param, sl_result); - if (! curcoll) - setlocale_failure = TRUE; - else - curcoll = savepv(curcoll); -# endif /* USE_LOCALE_COLLATE */ -# ifdef USE_LOCALE_NUMERIC - locale_param = (! done && (lang || PerlEnv_getenv("LC_NUMERIC"))) - ? setlocale_init - : NULL; - curnum = my_setlocale(LC_NUMERIC, locale_param); - DEBUG_LOCALE_INIT(LC_NUMERIC, locale_param, sl_result); - if (! curnum) - setlocale_failure = TRUE; - else - curnum = savepv(curnum); -# endif /* USE_LOCALE_NUMERIC */ -# ifdef USE_LOCALE_MESSAGES - locale_param = (! done && (lang || PerlEnv_getenv("LC_MESSAGES"))) - ? setlocale_init - : NULL; - sl_result = my_setlocale(LC_MESSAGES, locale_param); - DEBUG_LOCALE_INIT(LC_MESSAGES, locale_param, sl_result); - if (! sl_result) { - setlocale_failure = TRUE; - } -# endif /* USE_LOCALE_MESSAGES */ -# ifdef USE_LOCALE_MONETARY - locale_param = (! done && (lang || PerlEnv_getenv("LC_MONETARY"))) - ? setlocale_init - : NULL; - sl_result = my_setlocale(LC_MONETARY, locale_param); - DEBUG_LOCALE_INIT(LC_MONETARY, locale_param, sl_result); - if (! sl_result) { - setlocale_failure = TRUE; - } -# endif /* USE_LOCALE_MONETARY */ - } - -# endif /* LC_ALL */ - -#endif /* !LOCALE_ENVIRON_REQUIRED */ +# ifndef LC_ALL +# error Ultrix without LC_ALL not implemented +# else + + { + bool done = FALSE; + if (lang) { + sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, setlocale_init); + DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result[LC_ALL_INDEX]); + if (sl_result[LC_ALL_INDEX]) + done = TRUE; + else + setlocale_failure = TRUE; + } + if (! setlocale_failure) { + const char * locale_param; + for (i = 0; i < LC_ALL_INDEX; i++) { + locale_param = (! done && (lang || PerlEnv_getenv(category_names[i]))) + ? setlocale_init + : NULL; + sl_result[i] = do_setlocale_r(categories[i], locale_param); + if (! sl_result[i]) { + setlocale_failure = TRUE; + } + DEBUG_LOCALE_INIT(categories[i], locale_param, sl_result[i]); + } + } + } + +# endif /* LC_ALL */ +# endif /* LOCALE_ENVIRON_REQUIRED */ /* We try each locale in the list until we get one that works, or exhaust * the list. Normally the loop is executed just once. But if setting the @@ -1063,6 +2091,7 @@ Perl_init_i18nl10n(pTHX_ int printwarn) * will execute the loop multiple times */ trial_locales[0] = setlocale_init; trial_locales_count = 1; + for (i= 0; i < trial_locales_count; i++) { const char * trial_locale = trial_locales[i]; @@ -1073,8 +2102,9 @@ Perl_init_i18nl10n(pTHX_ int printwarn) * sense */ setlocale_failure = FALSE; -#ifdef SYSTEM_DEFAULT_LOCALE -# ifdef WIN32 +# ifdef SYSTEM_DEFAULT_LOCALE +# ifdef WIN32 /* Note that assumes Win32 has LC_ALL */ + /* On Windows machines, an entry of "" after the 0th means to use * the system default locale, which we now proceed to get. */ if (strEQ(trial_locale, "")) { @@ -1082,10 +2112,10 @@ Perl_init_i18nl10n(pTHX_ int printwarn) /* Note that this may change the locale, but we are going to do * that anyway just below */ - system_default_locale = setlocale(LC_ALL, ""); + system_default_locale = do_setlocale_c(LC_ALL, ""); DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale); - /* Skip if invalid or it's already on the list of locales to + /* Skip if invalid or if it's already on the list of locales to * try */ if (! system_default_locale) { goto next_iteration; @@ -1098,14 +2128,18 @@ Perl_init_i18nl10n(pTHX_ int printwarn) trial_locale = system_default_locale; } -# endif /* WIN32 */ -#endif /* SYSTEM_DEFAULT_LOCALE */ - } +# else +# error SYSTEM_DEFAULT_LOCALE only implemented for Win32 +# endif +# endif /* SYSTEM_DEFAULT_LOCALE */ -#ifdef LC_ALL - sl_result = my_setlocale(LC_ALL, trial_locale); - DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result); - if (! sl_result) { + } /* For i > 0 */ + +# ifdef LC_ALL + + sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, trial_locale); + DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result[LC_ALL_INDEX]); + if (! sl_result[LC_ALL_INDEX]) { setlocale_failure = TRUE; } else { @@ -1114,55 +2148,26 @@ Perl_init_i18nl10n(pTHX_ int printwarn) * setlocales below just return their category's current values. * This adequately handles the case in NetBSD where LC_COLLATE may * not be defined for a locale, and setting it individually will - * fail, whereas setting LC_ALL suceeds, leaving LC_COLLATE set to + * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to * the POSIX locale. */ trial_locale = NULL; } -#endif /* LC_ALL */ - if (!setlocale_failure) { -#ifdef USE_LOCALE_CTYPE - Safefree(curctype); - curctype = my_setlocale(LC_CTYPE, trial_locale); - DEBUG_LOCALE_INIT(LC_CTYPE, trial_locale, curctype); - if (! curctype) - setlocale_failure = TRUE; - else - curctype = savepv(curctype); -#endif /* USE_LOCALE_CTYPE */ -#ifdef USE_LOCALE_COLLATE - Safefree(curcoll); - curcoll = my_setlocale(LC_COLLATE, trial_locale); - DEBUG_LOCALE_INIT(LC_COLLATE, trial_locale, curcoll); - if (! curcoll) - setlocale_failure = TRUE; - else - curcoll = savepv(curcoll); -#endif /* USE_LOCALE_COLLATE */ -#ifdef USE_LOCALE_NUMERIC - Safefree(curnum); - curnum = my_setlocale(LC_NUMERIC, trial_locale); - DEBUG_LOCALE_INIT(LC_NUMERIC, trial_locale, curnum); - if (! curnum) - setlocale_failure = TRUE; - else - curnum = savepv(curnum); -#endif /* USE_LOCALE_NUMERIC */ -#ifdef USE_LOCALE_MESSAGES - sl_result = my_setlocale(LC_MESSAGES, trial_locale); - DEBUG_LOCALE_INIT(LC_MESSAGES, trial_locale, sl_result); - if (! (sl_result)) - setlocale_failure = TRUE; -#endif /* USE_LOCALE_MESSAGES */ -#ifdef USE_LOCALE_MONETARY - sl_result = my_setlocale(LC_MONETARY, trial_locale); - DEBUG_LOCALE_INIT(LC_MONETARY, trial_locale, sl_result); - if (! (sl_result)) - setlocale_failure = TRUE; -#endif /* USE_LOCALE_MONETARY */ +# endif /* LC_ALL */ - if (! setlocale_failure) { /* Success */ - break; + if (! setlocale_failure) { + unsigned int j; + for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) { + curlocales[j] + = savepv(do_setlocale_r(categories[j], trial_locale)); + if (! curlocales[j]) { + setlocale_failure = TRUE; + } + DEBUG_LOCALE_INIT(categories[j], trial_locale, curlocales[j]); + } + + if (! setlocale_failure) { /* All succeeded */ + break; /* Exit trial_locales loop */ } } @@ -1173,41 +2178,39 @@ Perl_init_i18nl10n(pTHX_ int printwarn) unsigned int j; if (locwarn) { /* Output failure info only on the first one */ -#ifdef LC_ALL + +# ifdef LC_ALL PerlIO_printf(Perl_error_log, "perl: warning: Setting locale failed.\n"); -#else /* !LC_ALL */ +# else /* !LC_ALL */ PerlIO_printf(Perl_error_log, "perl: warning: Setting locale failed for the categories:\n\t"); -# ifdef USE_LOCALE_CTYPE - if (! curctype) - PerlIO_printf(Perl_error_log, "LC_CTYPE "); -# endif /* USE_LOCALE_CTYPE */ -# ifdef USE_LOCALE_COLLATE - if (! curcoll) - PerlIO_printf(Perl_error_log, "LC_COLLATE "); -# endif /* USE_LOCALE_COLLATE */ -# ifdef USE_LOCALE_NUMERIC - if (! curnum) - PerlIO_printf(Perl_error_log, "LC_NUMERIC "); -# endif /* USE_LOCALE_NUMERIC */ - PerlIO_printf(Perl_error_log, "and possibly others\n"); -#endif /* LC_ALL */ + for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) { + if (! curlocales[j]) { + PerlIO_printf(Perl_error_log, category_names[j]); + } + else { + Safefree(curlocales[j]); + } + } + +# endif /* LC_ALL */ PerlIO_printf(Perl_error_log, "perl: warning: Please check that your locale settings:\n"); -#ifdef __GLIBC__ +# ifdef __GLIBC__ + PerlIO_printf(Perl_error_log, "\tLANGUAGE = %c%s%c,\n", language ? '"' : '(', language ? language : "unset", language ? '"' : ')'); -#endif +# endif PerlIO_printf(Perl_error_log, "\tLC_ALL = %c%s%c,\n", @@ -1215,21 +2218,38 @@ Perl_init_i18nl10n(pTHX_ int printwarn) lc_all ? lc_all : "unset", lc_all ? '"' : ')'); -#if defined(USE_ENVIRON_ARRAY) +# if defined(USE_ENVIRON_ARRAY) + { - char **e; - for (e = environ; *e; e++) { - if (strEQs(*e, "LC_") - && strNEs(*e, "LC_ALL=") - && (p = strchr(*e, '='))) - PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n", - (int)(p - *e), *e, p + 1); - } + char **e; + + /* Look through the environment for any variables of the + * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was + * already handled above. These are assumed to be locale + * settings. Output them and their values. */ + for (e = environ; *e; e++) { + const STRLEN prefix_len = sizeof("LC_") - 1; + STRLEN uppers_len; + + if ( strBEGINs(*e, "LC_") + && ! strBEGINs(*e, "LC_ALL=") + && (uppers_len = strspn(*e + prefix_len, + "ABCDEFGHIJKLMNOPQRSTUVWXYZ")) + && ((*e)[prefix_len + uppers_len] == '=')) + { + PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n", + (int) (prefix_len + uppers_len), *e, + *e + prefix_len + uppers_len + 1); + } + } } -#else + +# else + PerlIO_printf(Perl_error_log, "\t(possibly more locale environment variables)\n"); -#endif + +# endif PerlIO_printf(Perl_error_log, "\tLANG = %c%s%c\n", @@ -1276,7 +2296,8 @@ Perl_init_i18nl10n(pTHX_ int printwarn) } done_lang: -#if defined(WIN32) && defined(LC_ALL) +# if defined(WIN32) && defined(LC_ALL) + /* For Windows, we also try the system default locale before "C". * (If there exists a Windows without LC_ALL we skip this because * it gets too complicated. For those, the "C" is the next @@ -1284,7 +2305,8 @@ Perl_init_i18nl10n(pTHX_ int printwarn) * the array, but the code at the loop above knows to treat it * differently when not the 0th */ trial_locales[trial_locales_count++] = ""; -#endif + +# endif for (j = 0; j < trial_locales_count; j++) { if (strEQ("C", trial_locales[j])) { @@ -1296,9 +2318,11 @@ Perl_init_i18nl10n(pTHX_ int printwarn) done_C: ; } /* end of first time through the loop */ -#ifdef WIN32 +# ifdef WIN32 + next_iteration: ; -#endif + +# endif } /* end of looping through the trial locales */ @@ -1308,6 +2332,7 @@ Perl_init_i18nl10n(pTHX_ int printwarn) msg = "Falling back to"; } else { /* fallback failed */ + unsigned int j; /* We dropped off the end of the loop, so have to decrement i to * get back to the value the last time through */ @@ -1317,21 +2342,12 @@ Perl_init_i18nl10n(pTHX_ int printwarn) msg = "Failed to fall back to"; /* To continue, we should use whatever values we've got */ -#ifdef USE_LOCALE_CTYPE - Safefree(curctype); - curctype = savepv(setlocale(LC_CTYPE, NULL)); - DEBUG_LOCALE_INIT(LC_CTYPE, NULL, curctype); -#endif /* USE_LOCALE_CTYPE */ -#ifdef USE_LOCALE_COLLATE - Safefree(curcoll); - curcoll = savepv(setlocale(LC_COLLATE, NULL)); - DEBUG_LOCALE_INIT(LC_COLLATE, NULL, curcoll); -#endif /* USE_LOCALE_COLLATE */ -#ifdef USE_LOCALE_NUMERIC - Safefree(curnum); - curnum = savepv(setlocale(LC_NUMERIC, NULL)); - DEBUG_LOCALE_INIT(LC_NUMERIC, NULL, curnum); -#endif /* USE_LOCALE_NUMERIC */ + + for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) { + Safefree(curlocales[j]); + curlocales[j] = savepv(do_setlocale_r(categories[j], NULL)); + DEBUG_LOCALE_INIT(categories[j], NULL, curlocales[j]); + } } if (locwarn) { @@ -1341,14 +2357,18 @@ Perl_init_i18nl10n(pTHX_ int printwarn) description = "the standard locale"; name = "C"; } -#ifdef SYSTEM_DEFAULT_LOCALE + +# ifdef SYSTEM_DEFAULT_LOCALE + else if (strEQ(trial_locales[i], "")) { description = "the system default locale"; if (system_default_locale) { name = system_default_locale; } } -#endif /* SYSTEM_DEFAULT_LOCALE */ + +# endif /* SYSTEM_DEFAULT_LOCALE */ + else { description = "a fallback locale"; name = trial_locales[i]; @@ -1364,25 +2384,54 @@ Perl_init_i18nl10n(pTHX_ int printwarn) } } /* End of tried to fallback */ -#ifdef USE_LOCALE_CTYPE - new_ctype(curctype); -#endif /* USE_LOCALE_CTYPE */ + /* Done with finding the locales; update our records */ -#ifdef USE_LOCALE_COLLATE - new_collate(curcoll); -#endif /* USE_LOCALE_COLLATE */ +# ifdef USE_LOCALE_CTYPE -#ifdef USE_LOCALE_NUMERIC - new_numeric(curnum); -#endif /* USE_LOCALE_NUMERIC */ + new_ctype(curlocales[LC_CTYPE_INDEX]); + +# endif +# ifdef USE_LOCALE_COLLATE + + new_collate(curlocales[LC_COLLATE_INDEX]); + +# endif +# ifdef USE_LOCALE_NUMERIC + + new_numeric(curlocales[LC_NUMERIC_INDEX]); + +# endif + + for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + +# if defined(USE_ITHREADS) + + /* This caches whether each category's locale is UTF-8 or not. This + * may involve changing the locale. It is ok to do this at + * initialization time before any threads have started, but not later. + * Caching means that if the program heeds our dictate not to change + * locales in threaded applications, this data will remain valid, and + * it may get queried without changing locales. If the environment is + * such that all categories have the same locale, this isn't needed, as + * the code will not change the locale; but this handles the uncommon + * case where the environment has disparate locales for the categories + * */ + (void) _is_cur_LC_category_utf8(categories[i]); + +# endif + + Safefree(curlocales[i]); + } + +# if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE) -#if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE) /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE - * locale is UTF-8. If PL_utf8locale and PL_unicode (set by -C or by - * $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the - * PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open - * discipline. */ - PL_utf8locale = _is_cur_LC_category_utf8(LC_CTYPE); + * locale is UTF-8. The call to new_ctype() just above has already + * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If + * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE}) + * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on + * STDIN, STDOUT, STDERR, _and_ the default open discipline. */ + PL_utf8locale = PL_in_utf8_CTYPE_locale; /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO. This is an alternative to using the -C command line switch @@ -1393,32 +2442,23 @@ Perl_init_i18nl10n(pTHX_ int printwarn) if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG) PL_utf8cache = -1; } -#endif -#ifdef USE_LOCALE_CTYPE - Safefree(curctype); -#endif /* USE_LOCALE_CTYPE */ -#ifdef USE_LOCALE_COLLATE - Safefree(curcoll); -#endif /* USE_LOCALE_COLLATE */ -#ifdef USE_LOCALE_NUMERIC - Safefree(curnum); -#endif /* USE_LOCALE_NUMERIC */ +# endif +# ifdef __GLIBC__ -#ifdef __GLIBC__ Safefree(language); -#endif + +# endif Safefree(lc_all); Safefree(lang); -#else /* !USE_LOCALE */ - PERL_UNUSED_ARG(printwarn); #endif /* USE_LOCALE */ - #ifdef DEBUGGING + /* So won't continue to output stuff */ DEBUG_INITIALIZATION_set(FALSE); + #endif return ok; @@ -1469,11 +2509,11 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, * otherwise contain that character, but otherwise there may be * less-than-perfect results with that character and NUL. This is * unavoidable unless we replace strxfrm with our own implementation. */ - if (s_strlen < len) { /* Only execute if there is an embedded NUL */ + if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded + NUL */ char * e = s + len; char * sans_nuls; STRLEN sans_nuls_len; - STRLEN sans_nuls_pos; int try_non_controls; char this_replacement_char[] = "?\0"; /* Room for a two-byte string, making sure 2nd byte is NUL. @@ -1507,10 +2547,10 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, char * x; /* j's xfrm plus collation index */ STRLEN x_len; /* length of 'x' */ STRLEN trial_len = 1; + char cur_source[] = { '\0', '\0' }; - /* Create a 1 byte string of the current code point */ - char cur_source[] = { (char) j, '\0' }; - + /* Skip non-controls the first time through the loop. The + * controls in a UTF-8 locale are the L1 ones */ if (! try_non_controls && (PL_in_utf8_COLLATE_locale) ? ! isCNTRL_L1(j) : ! isCNTRL_LC(j)) @@ -1518,6 +2558,9 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, continue; } + /* Create a 1-char string of the current code point */ + cur_source[0] = (char) j; + /* Then transform it */ x = _mem_collxfrm(cur_source, trial_len, &x_len, 0 /* The string is not in UTF-8 */); @@ -1569,7 +2612,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, } /* End of determining the character that is to replace NULs */ /* If the replacement is variant under UTF-8, it must match the - * UTF8-ness as the original */ + * UTF8-ness of the original */ if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) { this_replacement_char[0] = UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement); @@ -1589,19 +2632,14 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, sans_nuls_len = (len * this_replacement_len) + 1; Newx(sans_nuls, sans_nuls_len, char); *sans_nuls = '\0'; - sans_nuls_pos = 0; /* Replace each NUL with the lowest collating control. Loop until have * exhausted all the NULs */ while (s + s_strlen < e) { - sans_nuls_pos = my_strlcat(sans_nuls + sans_nuls_pos, - s, - sans_nuls_len); + my_strlcat(sans_nuls, s, sans_nuls_len); /* Do the actual replacement */ - sans_nuls_pos = my_strlcat(sans_nuls + sans_nuls_pos, - this_replacement_char, - sans_nuls_len); + my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len); /* Move past the input NUL */ s += s_strlen + 1; @@ -1609,7 +2647,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, } /* And add anything that trails the final NUL */ - my_strlcat(sans_nuls + sans_nuls_pos, s, sans_nuls_len); + my_strlcat(sans_nuls, s, sans_nuls_len); /* Switch so below we transform this modified string */ s = sans_nuls; @@ -1673,9 +2711,10 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, for (j = 1; j < 256; j++) { char * x; STRLEN x_len; + char cur_source[] = { '\0', '\0' }; - /* Create a 1-char string of the current code point. */ - char cur_source[] = { (char) j, '\0' }; + /* Create a 1-char string of the current code point */ + cur_source[0] = (char) j; /* Then transform it */ x = _mem_collxfrm(cur_source, 1, &x_len, FALSE); @@ -1820,10 +2859,14 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, * length 1 strings, as we can't be sure that it's a real slope * change */ if (length_in_chars > 1 && new_m > PL_collxfrm_mult) { -#ifdef DEBUGGING + +# ifdef DEBUGGING + STRLEN old_m = PL_collxfrm_mult; STRLEN old_b = PL_collxfrm_base; -#endif + +# endif + PL_collxfrm_mult = new_m; PL_collxfrm_base = 1; /* +1 For trailing NUL */ computed_guess = PL_collxfrm_base @@ -1882,7 +2925,8 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, xAlloc += (xAlloc / 4) + 1; PL_strxfrm_is_behaved = FALSE; -#ifdef DEBUGGING +# ifdef DEBUGGING + if (DEBUG_Lv_TEST || debug_initialization) { PerlIO_printf(Perl_debug_log, "_mem_collxfrm required more space than previously calculated" @@ -1890,7 +2934,9 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, PL_collation_name, (int) COLLXFRM_HDR_LEN, xAlloc - COLLXFRM_HDR_LEN); } -#endif + +# endif + } Renew(xbuf, xAlloc, char); @@ -1904,18 +2950,18 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, } -#ifdef DEBUGGING +# ifdef DEBUGGING + if (DEBUG_Lv_TEST || debug_initialization) { - Size_t i; print_collxfrm_input_and_return(s, s + len, xlen, utf8); PerlIO_printf(Perl_debug_log, "Its xfrm is:"); - for (i = COLLXFRM_HDR_LEN; i < *xlen + COLLXFRM_HDR_LEN; i++) { - PerlIO_printf(Perl_debug_log, " %02x", (U8) xbuf[i]); - } - PerlIO_printf(Perl_debug_log, "\n"); + PerlIO_printf(Perl_debug_log, "%s\n", + _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN, + *xlen, 1)); } -#endif + +# endif /* Free up unneeded space; retain ehough for trailing NUL */ Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char); @@ -1932,15 +2978,19 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, Safefree(s); } *xlen = 0; -#ifdef DEBUGGING + +# ifdef DEBUGGING + if (DEBUG_Lv_TEST || debug_initialization) { print_collxfrm_input_and_return(s, s + len, NULL, utf8); } -#endif + +# endif + return NULL; } -#ifdef DEBUGGING +# ifdef DEBUGGING STATIC void S_print_collxfrm_input_and_return(pTHX_ @@ -1949,22 +2999,35 @@ S_print_collxfrm_input_and_return(pTHX_ const STRLEN * const xlen, const bool is_utf8) { - const char * t = s; - bool prev_was_printable = TRUE; - bool first_time = TRUE; PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN; - PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%u]: returning ", - PL_collation_ix); + PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ", + (UV)PL_collation_ix); if (xlen) { - PerlIO_printf(Perl_debug_log, "%" UVuf, (UV) *xlen); + PerlIO_printf(Perl_debug_log, "%zu", *xlen); } else { PerlIO_printf(Perl_debug_log, "NULL"); } PerlIO_printf(Perl_debug_log, " for locale '%s', string='", PL_collation_name); + print_bytes_for_locale(s, e, is_utf8); + + PerlIO_printf(Perl_debug_log, "'\n"); +} + +STATIC void +S_print_bytes_for_locale(pTHX_ + const char * const s, + const char * const e, + const bool is_utf8) +{ + const char * t = s; + bool prev_was_printable = TRUE; + bool first_time = TRUE; + + PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE; while (t < e) { UV cp = (is_utf8) @@ -1987,12 +3050,9 @@ S_print_collxfrm_input_and_return(pTHX_ t += (is_utf8) ? UTF8SKIP(t) : 1; first_time = FALSE; } - - PerlIO_printf(Perl_debug_log, "'\n"); } -#endif /* #ifdef DEBUGGING */ - +# endif /* #ifdef DEBUGGING */ #endif /* USE_LOCALE_COLLATE */ #ifdef USE_LOCALE @@ -2008,46 +3068,113 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * English, it comes down to if the locale's name ends in something like * "UTF-8". It errs on the side of not being a UTF-8 locale. */ - char *save_input_locale = NULL; - STRLEN final_pos; + /* Name of current locale corresponding to the input category */ + const char *save_input_locale = NULL; + + bool is_utf8 = FALSE; /* The return value */ + + /* The variables below are for the cache of previous lookups using this + * function. The cache is a C string, described at the definition for + * 'C_and_POSIX_utf8ness'. + * + * The first part of the cache is fixed, for the C and POSIX locales. The + * varying part starts just after them. */ + char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness); + + Size_t utf8ness_cache_size; /* Size of the varying portion */ + Size_t input_name_len; /* Length in bytes of save_input_locale */ + Size_t input_name_len_with_overhead; /* plus extra chars used to store + the name in the cache */ + char * delimited; /* The name plus the delimiters used to store + it in the cache */ + char * name_pos; /* position of 'delimited' in the cache, or 0 + if not there */ + + +# ifdef LC_ALL -#ifdef LC_ALL assert(category != LC_ALL); -#endif - /* First dispose of the trivial cases */ - save_input_locale = setlocale(category, NULL); +# endif + + /* Get the desired category's locale */ + save_input_locale = do_setlocale_r(category, NULL); if (! save_input_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not find current locale for category %d\n", - category)); - return FALSE; /* XXX maybe should croak */ + Perl_croak(aTHX_ + "panic: %s: %d: Could not find current %s locale, errno=%d\n", + __FILE__, __LINE__, category_name(category), errno); } + save_input_locale = stdize_locale(savepv(save_input_locale)); - if (isNAME_C_OR_POSIX(save_input_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Current locale for category %d is %s\n", - category, save_input_locale)); + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Current locale for %s is %s\n", + category_name(category), save_input_locale)); + + input_name_len = strlen(save_input_locale); + + /* In our cache, each name is accompanied by two delimiters and a single + * utf8ness digit */ + input_name_len_with_overhead = input_name_len + 3; + + /* Allocate and populate space for a copy of the name surrounded by the + * delimiters */ + Newx(delimited, input_name_len_with_overhead, char); + delimited[0] = UTF8NESS_SEP[0]; + Copy(save_input_locale, delimited + 1, input_name_len, char); + delimited[input_name_len+1] = UTF8NESS_PREFIX[0]; + delimited[input_name_len+2] = '\0'; + + /* And see if that is in the cache */ + name_pos = instr(PL_locale_utf8ness, delimited); + if (name_pos) { + is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0'; + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, "UTF8ness for locale %s=%d, \n", + save_input_locale, is_utf8); + } + +# endif + + /* And, if not already in that position, move it to the beginning of + * the non-constant portion of the list, since it is the most recently + * used. (We don't have to worry about overflow, since just moving + * existing names around) */ + if (name_pos > utf8ness_cache) { + Move(utf8ness_cache, + utf8ness_cache + input_name_len_with_overhead, + name_pos - utf8ness_cache, char); + Copy(delimited, + utf8ness_cache, + input_name_len_with_overhead - 1, char); + utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0'; + } + + Safefree(delimited); Safefree(save_input_locale); - return FALSE; + return is_utf8; } -#if defined(USE_LOCALE_CTYPE) \ - && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET))) + /* Here we don't have stored the utf8ness for the input locale. We have to + * calculate it */ - { /* Next try nl_langinfo or MB_CUR_MAX if available */ +# if defined(USE_LOCALE_CTYPE) \ + && ( (defined(HAS_NL_LANGINFO) && defined(CODESET)) \ + || (defined(HAS_MBTOWC) || defined(HAS_MBRTOWC))) - char *save_ctype_locale = NULL; - bool is_utf8; + { + const char *save_ctype_locale = NULL; - if (category != LC_CTYPE) { /* These work only on LC_CTYPE */ + if (category != LC_CTYPE) { /* Get the current LC_CTYPE locale */ - save_ctype_locale = setlocale(LC_CTYPE, NULL); + save_ctype_locale = do_setlocale_c(LC_CTYPE, NULL); if (! save_ctype_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not find current locale for LC_CTYPE\n")); - goto cant_use_nllanginfo; + Perl_croak(aTHX_ + "panic: %s: %d: Could not find current LC_CTYPE locale," + " errno=%d\n", __FILE__, __LINE__, errno); } save_ctype_locale = stdize_locale(savepv(save_ctype_locale)); @@ -2059,12 +3186,11 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) Safefree(save_ctype_locale); save_ctype_locale = NULL; } - else if (! setlocale(LC_CTYPE, save_input_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not change LC_CTYPE locale to %s\n", - save_input_locale)); + else if (! do_setlocale_c(LC_CTYPE, save_input_locale)) { Safefree(save_ctype_locale); - goto cant_use_nllanginfo; + Perl_croak(aTHX_ + "panic: %s: %d: Could not change LC_CTYPE locale to %s," + " errno=%d\n", __FILE__, __LINE__, save_input_locale, errno); } } @@ -2072,103 +3198,128 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) save_input_locale)); /* Here the current LC_CTYPE is set to the locale of the category whose - * information is desired. This means that nl_langinfo() and MB_CUR_MAX + * information is desired. This means that nl_langinfo() and mbtowc() * should give the correct results */ -# if defined(HAS_NL_LANGINFO) && defined(CODESET) - { - char *codeset = nl_langinfo(CODESET); - if (codeset && strNE(codeset, "")) { - codeset = savepv(codeset); +# ifdef MB_CUR_MAX /* But we can potentially rule out UTF-8ness, avoiding + calling the functions if we have this */ - /* If we switched LC_CTYPE, switch back */ - if (save_ctype_locale) { - setlocale(LC_CTYPE, save_ctype_locale); - Safefree(save_ctype_locale); - } + /* Standard UTF-8 needs at least 4 bytes to represent the maximum + * Unicode code point. */ + + DEBUG_L(PerlIO_printf(Perl_debug_log, "\tMB_CUR_MAX=%d\n", + (int) MB_CUR_MAX)); + if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) { + is_utf8 = FALSE; + goto finish_ctype; + } + +# endif +# if defined(HAS_NL_LANGINFO) && defined(CODESET) + + { /* The task is easiest if the platform has this POSIX 2001 function. + Except on some platforms it can wrongly return "", so have to have + a fallback. And it can return that it's UTF-8, even if there are + variances from that. For example, Turkish locales may use the + alternate dotted I rules, and sometimes it appears to be a + defective locale definition. XXX We should probably check for + these in the Latin1 range and warn */ + const char *codeset = my_nl_langinfo(PERL_CODESET, FALSE); + /* FALSE => already in dest locale */ + + DEBUG_L(PerlIO_printf(Perl_debug_log, + "\tnllanginfo returned CODESET '%s'\n", codeset)); + + if (codeset && strNE(codeset, "")) { - is_utf8 = foldEQ(codeset, STR_WITH_LEN("UTF-8")) - || foldEQ(codeset, STR_WITH_LEN("UTF8")); + /* If the implementation of foldEQ() somehow were + * to change to not go byte-by-byte, this could + * read past end of string, as only one length is + * checked. But currently, a premature NUL will + * compare false, and it will stop there */ + is_utf8 = cBOOL( foldEQ(codeset, STR_WITH_LEN("UTF-8")) + || foldEQ(codeset, STR_WITH_LEN("UTF8"))); DEBUG_L(PerlIO_printf(Perl_debug_log, "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n", codeset, is_utf8)); - Safefree(codeset); - Safefree(save_input_locale); - return is_utf8; + goto finish_ctype; } } -# endif -# ifdef MB_CUR_MAX +# endif +# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC) + /* We can see if this is a UTF-8-like locale if have mbtowc(). It was a + * late adder to C89, so very likely to have it. However, testing has + * shown that, like nl_langinfo() above, there are locales that are not + * strictly UTF-8 that this will return that they are */ - /* Here, either we don't have nl_langinfo, or it didn't return a - * codeset. Try MB_CUR_MAX */ + { + wchar_t wc; + int len; - /* Standard UTF-8 needs at least 4 bytes to represent the maximum - * Unicode code point. Since UTF-8 is the only non-single byte - * encoding we handle, we just say any such encoding is UTF-8, and if - * turns out to be wrong, other things will fail */ - is_utf8 = MB_CUR_MAX >= 4; +# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS) - DEBUG_L(PerlIO_printf(Perl_debug_log, - "\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n", - (int) MB_CUR_MAX, is_utf8)); + mbstate_t ps; - Safefree(save_input_locale); +# endif -# ifdef HAS_MBTOWC + /* mbrtowc() and mbtowc() convert a byte string to a wide + * character. Feed a byte string to one of them and check that the + * result is the expected Unicode code point */ + +# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS) + /* Prefer this function if available, as it's reentrant */ + + memset(&ps, 0, sizeof(ps));; + PERL_UNUSED_RESULT(mbrtowc(&wc, NULL, 0, &ps)); /* Reset any shift + state */ + errno = 0; + len = mbrtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8), &ps); + +# else - /* ... But, most system that have MB_CUR_MAX will also have mbtowc(), - * since they are both in the C99 standard. We can feed a known byte - * string to the latter function, and check that it gives the expected - * result */ - if (is_utf8) { - wchar_t wc; PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */ errno = 0; - if ((size_t)mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8)) - != strlen(HYPHEN_UTF8) - || wc != (wchar_t) 0x2010) - { - is_utf8 = FALSE; - DEBUG_L(PerlIO_printf(Perl_debug_log, "\thyphen=U+%x\n", (unsigned int)wc)); - DEBUG_L(PerlIO_printf(Perl_debug_log, - "\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n", - mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8)), errno)); - } + len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8)); + +# endif + + DEBUG_L(PerlIO_printf(Perl_debug_log, + "\treturn from mbtowc; len=%d; code_point=%x; errno=%d\n", + len, (unsigned int) wc, errno)); + + is_utf8 = cBOOL( len == STRLENs(REPLACEMENT_CHARACTER_UTF8) + && wc == (wchar_t) UNICODE_REPLACEMENT); } -# endif + + finish_ctype: /* If we switched LC_CTYPE, switch back */ if (save_ctype_locale) { - setlocale(LC_CTYPE, save_ctype_locale); + do_setlocale_c(LC_CTYPE, save_ctype_locale); Safefree(save_ctype_locale); } - return is_utf8; -# endif + goto finish_and_return; } - cant_use_nllanginfo: +# endif +# else -#else /* nl_langinfo should work if available, so don't bother compiling this - fallback code. The final fallback of looking at the name is - compiled, and will be executed if nl_langinfo fails */ + /* Here, we must have a C89 compiler that doesn't have mbtowc(). Next + * try looking at the currency symbol to see if it disambiguates + * things. Often that will be in the native script, and if the symbol + * isn't in UTF-8, we know that the locale isn't. If it is non-ASCII + * UTF-8, we infer that the locale is too, as the odds of a non-UTF8 + * string being valid UTF-8 are quite small */ - /* nl_langinfo not available or failed somehow. Next try looking at the - * currency symbol to see if it disambiguates things. Often that will be - * in the native script, and if the symbol isn't in UTF-8, we know that the - * locale isn't. If it is non-ASCII UTF-8, we infer that the locale is - * too, as the odds of a non-UTF8 string being valid UTF-8 are quite small - * */ +# ifdef HAS_LOCALECONV +# ifdef USE_LOCALE_MONETARY -#ifdef HAS_LOCALECONV -# ifdef USE_LOCALE_MONETARY { char *save_monetary_locale = NULL; bool only_ascii = FALSE; - bool is_utf8 = FALSE; struct lconv* lc; /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of @@ -2176,11 +3327,11 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) if (category != LC_MONETARY) { - save_monetary_locale = setlocale(LC_MONETARY, NULL); + save_monetary_locale = do_setlocale_c(LC_MONETARY, NULL); if (! save_monetary_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not find current locale for LC_MONETARY\n")); - goto cant_use_monetary; + Perl_croak(aTHX_ + "panic: %s: %d: Could not find current LC_MONETARY locale," + " errno=%d\n", __FILE__, __LINE__, errno); } save_monetary_locale = stdize_locale(savepv(save_monetary_locale)); @@ -2188,12 +3339,11 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) Safefree(save_monetary_locale); save_monetary_locale = NULL; } - else if (! setlocale(LC_MONETARY, save_input_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not change LC_MONETARY locale to %s\n", - save_input_locale)); + else if (! do_setlocale_c(LC_MONETARY, save_input_locale)) { Safefree(save_monetary_locale); - goto cant_use_monetary; + Perl_croak(aTHX_ + "panic: %s: %d: Could not change LC_MONETARY locale to %s," + " errno=%d\n", __FILE__, __LINE__, save_input_locale, errno); } } @@ -2214,7 +3364,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) /* If we changed it, restore LC_MONETARY to its original locale */ if (save_monetary_locale) { - setlocale(LC_MONETARY, save_monetary_locale); + do_setlocale_c(LC_MONETARY, save_monetary_locale); Safefree(save_monetary_locale); } @@ -2225,16 +3375,14 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * is non-ascii UTF-8. */ DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n", save_input_locale, is_utf8)); - Safefree(save_input_locale); - return is_utf8; + goto finish_and_return; } } - cant_use_monetary: -# endif /* USE_LOCALE_MONETARY */ -#endif /* HAS_LOCALECONV */ +# endif /* USE_LOCALE_MONETARY */ +# endif /* HAS_LOCALECONV */ -#if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME) +# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME) /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try * the names of the months and weekdays, timezone, and am/pm indicator */ @@ -2253,11 +3401,11 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) if (category != LC_TIME) { - save_time_locale = setlocale(LC_TIME, NULL); + save_time_locale = do_setlocale_c(LC_TIME, NULL); if (! save_time_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not find current locale for LC_TIME\n")); - goto cant_use_time; + Perl_croak(aTHX_ + "panic: %s: %d: Could not find current LC_TIME locale," + " errno=%d\n", __FILE__, __LINE__, errno); } save_time_locale = stdize_locale(savepv(save_time_locale)); @@ -2265,12 +3413,11 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) Safefree(save_time_locale); save_time_locale = NULL; } - else if (! setlocale(LC_TIME, save_input_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not change LC_TIME locale to %s\n", - save_input_locale)); + else if (! do_setlocale_c(LC_TIME, save_input_locale)) { Safefree(save_time_locale); - goto cant_use_time; + Perl_croak(aTHX_ + "panic: %s: %d: Could not change LC_TIME locale to %s," + " errno=%d\n", __FILE__, __LINE__, save_input_locale, errno); } } @@ -2282,7 +3429,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */ formatted_time = my_strftime("%A %B %Z %p", - 0, 0, hour, dom, month, 112, 0, 0, is_dst); + 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst); if ( ! formatted_time || is_utf8_invariant_string((U8 *) formatted_time, 0)) { @@ -2304,31 +3451,30 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * false otherwise. But first, restore LC_TIME to its original * locale if we changed it */ if (save_time_locale) { - setlocale(LC_TIME, save_time_locale); + do_setlocale_c(LC_TIME, save_time_locale); Safefree(save_time_locale); } DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n", save_input_locale, is_utf8_string((U8 *) formatted_time, 0))); - Safefree(save_input_locale); - return is_utf8_string((U8 *) formatted_time, 0); + is_utf8 = is_utf8_string((U8 *) formatted_time, 0); + goto finish_and_return; } /* Falling off the end of the loop indicates all the names were just * ASCII. Go on to the next test. If we changed it, restore LC_TIME * to its original locale */ if (save_time_locale) { - setlocale(LC_TIME, save_time_locale); + do_setlocale_c(LC_TIME, save_time_locale); Safefree(save_time_locale); } DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); } - cant_use_time: -#endif +# endif -#if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST) +# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST) /* This code is ifdefd out because it was found to not be necessary in testing * on our dromedary test machine, which has over 700 locales. There, this @@ -2343,7 +3489,6 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * are much more likely to have been translated. */ { int e; - bool is_utf8 = FALSE; bool non_ascii = FALSE; char *save_messages_locale = NULL; const char * errmsg = NULL; @@ -2353,11 +3498,11 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) if (category != LC_MESSAGES) { - save_messages_locale = setlocale(LC_MESSAGES, NULL); + save_messages_locale = do_setlocale_c(LC_MESSAGES, NULL); if (! save_messages_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not find current locale for LC_MESSAGES\n")); - goto cant_use_messages; + Perl_croak(aTHX_ + "panic: %s: %d: Could not find current LC_MESSAGES locale," + " errno=%d\n", __FILE__, __LINE__, errno); } save_messages_locale = stdize_locale(savepv(save_messages_locale)); @@ -2365,12 +3510,11 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) Safefree(save_messages_locale); save_messages_locale = NULL; } - else if (! setlocale(LC_MESSAGES, save_input_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Could not change LC_MESSAGES locale to %s\n", - save_input_locale)); + else if (! do_setlocale_c(LC_MESSAGES, save_input_locale)) { Safefree(save_messages_locale); - goto cant_use_messages; + Perl_croak(aTHX_ + "panic: %s: %d: Could not change LC_MESSAGES locale to %s," + " errno=%d\n", __FILE__, __LINE__, save_input_locale, errno); } } @@ -2396,7 +3540,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) /* And, if we changed it, restore LC_MESSAGES to its original locale */ if (save_messages_locale) { - setlocale(LC_MESSAGES, save_messages_locale); + do_setlocale_c(LC_MESSAGES, save_messages_locale); Safefree(save_messages_locale); } @@ -2407,20 +3551,16 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n", save_input_locale, is_utf8)); - Safefree(save_input_locale); - return is_utf8; + goto finish_and_return; } DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); } - cant_use_messages: -#endif - -#endif /* the code that is compiled when no nl_langinfo */ - -#ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a +# endif +# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a UTF-8 locale */ + /* As a last resort, look at the locale name to see if it matches * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the * return of setlocale(), is actually defined to be opaque, so we can't @@ -2428,80 +3568,146 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to * be a UTF-8 locale. Similarly for the other common names */ - final_pos = strlen(save_input_locale) - 1; - if (final_pos >= 3) { - char *name = save_input_locale; + { + const Size_t final_pos = strlen(save_input_locale) - 1; - /* Find next 'U' or 'u' and look from there */ - while ((name += strcspn(name, "Uu") + 1) - <= save_input_locale + final_pos - 2) - { - if (!isALPHA_FOLD_NE(*name, 't') - || isALPHA_FOLD_NE(*(name + 1), 'f')) + if (final_pos >= 3) { + const char *name = save_input_locale; + + /* Find next 'U' or 'u' and look from there */ + while ((name += strcspn(name, "Uu") + 1) + <= save_input_locale + final_pos - 2) { - continue; - } - name += 2; - if (*(name) == '-') { - if ((name > save_input_locale + final_pos - 1)) { - break; + if ( isALPHA_FOLD_NE(*name, 't') + || isALPHA_FOLD_NE(*(name + 1), 'f')) + { + continue; + } + name += 2; + if (*(name) == '-') { + if ((name > save_input_locale + final_pos - 1)) { + break; + } + name++; + } + if (*(name) == '8') { + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Locale %s ends with UTF-8 in name\n", + save_input_locale)); + is_utf8 = TRUE; + goto finish_and_return; } - name++; - } - if (*(name) == '8') { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Locale %s ends with UTF-8 in name\n", - save_input_locale)); - Safefree(save_input_locale); - return TRUE; } + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Locale %s doesn't end with UTF-8 in name\n", + save_input_locale)); } - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Locale %s doesn't end with UTF-8 in name\n", - save_input_locale)); - } -#endif -#ifdef WIN32 - /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */ - if (final_pos >= 4 - && *(save_input_locale + final_pos - 0) == '1' - && *(save_input_locale + final_pos - 1) == '0' - && *(save_input_locale + final_pos - 2) == '0' - && *(save_input_locale + final_pos - 3) == '5' - && *(save_input_locale + final_pos - 4) == '6') - { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Locale %s ends with 10056 in name, is UTF-8 locale\n", +# ifdef WIN32 + + /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */ + if (memENDs(save_input_locale, final_pos, "65001")) { + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Locale %s ends with 65001 in name, is UTF-8 locale\n", save_input_locale)); - Safefree(save_input_locale); - return TRUE; + is_utf8 = TRUE; + goto finish_and_return; + } } -#endif + +# endif +# endif /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But * since we are about to return FALSE anyway, there is no point in doing * this extra work */ -#if 0 + +# if 0 if (instr(save_input_locale, "8859")) { DEBUG_L(PerlIO_printf(Perl_debug_log, "Locale %s has 8859 in name, not UTF-8 locale\n", save_input_locale)); - Safefree(save_input_locale); - return FALSE; + is_utf8 = FALSE; + goto finish_and_return; } -#endif +# endif DEBUG_L(PerlIO_printf(Perl_debug_log, "Assuming locale %s is not a UTF-8 locale\n", save_input_locale)); + is_utf8 = FALSE; + +# endif /* the code that is compiled when no modern LC_CTYPE */ + + finish_and_return: + + /* Cache this result so we don't have to go through all this next time. */ + utf8ness_cache_size = sizeof(PL_locale_utf8ness) + - (utf8ness_cache - PL_locale_utf8ness); + + /* But we can't save it if it is too large for the total space available */ + if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) { + Size_t utf8ness_cache_len = strlen(utf8ness_cache); + + /* Here it can fit, but we may need to clear out the oldest cached + * result(s) to do so. Check */ + if (utf8ness_cache_len + input_name_len_with_overhead + >= utf8ness_cache_size) + { + /* Here we have to clear something out to make room for this. + * Start looking at the rightmost place where it could fit and find + * the beginning of the entry that extends past that. */ + char * cutoff = (char *) my_memrchr(utf8ness_cache, + UTF8NESS_SEP[0], + utf8ness_cache_size + - input_name_len_with_overhead); + + assert(cutoff); + assert(cutoff >= utf8ness_cache); + + /* This and all subsequent entries must be removed */ + *cutoff = '\0'; + utf8ness_cache_len = strlen(utf8ness_cache); + } + + /* Make space for the new entry */ + Move(utf8ness_cache, + utf8ness_cache + input_name_len_with_overhead, + utf8ness_cache_len + 1 /* Incl. trailing NUL */, char); + + /* And insert it */ + Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char); + utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0'; + + if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1] + & (PERL_UINTMAX_T) ~1) != '0') + { + Perl_croak(aTHX_ + "panic: %s: %d: Corrupt utf8ness_cache=%s\nlen=%u," + " inserted_name=%s, its_len=%u\n", + __FILE__, __LINE__, + PL_locale_utf8ness, strlen(PL_locale_utf8ness), + delimited, input_name_len_with_overhead); + } + } + +# ifdef DEBUGGING + + if (DEBUG_Lv_TEST || debug_initialization) { + PerlIO_printf(Perl_debug_log, + "PL_locale_utf8ness is now %s; returning %d\n", + PL_locale_utf8ness, is_utf8); + } + +# endif + + Safefree(delimited); Safefree(save_input_locale); - return FALSE; + return is_utf8; } #endif - bool Perl__is_in_locale_category(pTHX_ const bool compiling, const int category) { @@ -2536,18 +3742,76 @@ Perl_my_strerror(pTHX_ const int errnum) * to the C locale */ char *errstr; + dVAR; + +#ifndef USE_LOCALE_MESSAGES + + /* If platform doesn't have messages category, we don't do any switching to + * the C locale; we just use whatever strerror() returns */ + + errstr = savepv(Strerror(errnum)); + +#else /* Has locale messages */ -#ifdef USE_LOCALE_MESSAGES /* If platform doesn't have messages category, we - don't do any switching to the C locale; we just - use whatever strerror() returns */ const bool within_locale_scope = IN_LC(LC_MESSAGES); - dVAR; +# if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L) + + /* This function is trivial if we don't have to worry about thread safety + * and have strerror_l(), as it handles the switch of locales so we don't + * have to deal with that. We don't have to worry about thread safety if + * this is an unthreaded build, or if strerror_r() is also available. Both + * it and strerror_l() are thread-safe. Plain strerror() isn't thread + * safe. But on threaded builds when strerror_r() is available, the + * apparent call to strerror() below is actually a macro that + * behind-the-scenes calls strerror_r(). + */ + +# if ! defined(USE_ITHREADS) || defined(HAS_STRERROR_R) + + if (within_locale_scope) { + errstr = savepv(strerror(errnum)); + } + else { + errstr = savepv(strerror_l(errnum, PL_C_locale_obj)); + } + +# else + + /* Here we have strerror_l(), but not strerror_r() and we are on a + * threaded-build. We use strerror_l() for everything, constructing a + * locale to pass to it if necessary */ + + bool do_free = FALSE; + locale_t locale_to_use; + + if (within_locale_scope) { + locale_to_use = uselocale((locale_t) 0); + if (locale_to_use == LC_GLOBAL_LOCALE) { + locale_to_use = duplocale(LC_GLOBAL_LOCALE); + do_free = TRUE; + } + } + else { /* Use C locale if not within 'use locale' scope */ + locale_to_use = PL_C_locale_obj; + } + + errstr = savepv(strerror_l(errnum, locale_to_use)); + + if (do_free) { + freelocale(locale_to_use); + } + +# endif +# else /* Doesn't have strerror_l() */ + +# ifdef USE_POSIX_2008_LOCALE -# ifdef USE_THREAD_SAFE_LOCALE locale_t save_locale = NULL; -# else - char * save_locale = NULL; + +# else + + const char * save_locale = NULL; bool locale_is_C = FALSE; /* We have a critical section to prevent another thread from changing the @@ -2555,25 +3819,35 @@ Perl_my_strerror(pTHX_ const int errnum) * setlocale() ) */ LOCALE_LOCK; -# endif +# endif + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "my_strerror called with errnum %d\n", errnum)); if (! within_locale_scope) { errno = 0; -# ifdef USE_THREAD_SAFE_LOCALE /* Use the thread-safe locale functions */ +# ifdef USE_POSIX_2008_LOCALE /* Use the thread-safe locale functions */ + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "Not within locale scope, about to call" + " uselocale(0x%p)\n", PL_C_locale_obj)); save_locale = uselocale(PL_C_locale_obj); if (! save_locale) { DEBUG_L(PerlIO_printf(Perl_debug_log, - "uselocale failed, errno=%d\n", errno)); + "uselocale failed, errno=%d\n", errno)); + } + else { + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "uselocale returned 0x%p\n", save_locale)); } -# else /* Not thread-safe build */ +# else /* Not thread-safe build */ - save_locale = setlocale(LC_MESSAGES, NULL); + save_locale = do_setlocale_c(LC_MESSAGES, NULL); if (! save_locale) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "setlocale failed, errno=%d\n", errno)); + Perl_croak(aTHX_ + "panic: %s: %d: Could not find current LC_MESSAGES locale," + " errno=%d\n", __FILE__, __LINE__, errno); } else { locale_is_C = isNAME_C_OR_POSIX(save_locale); @@ -2584,41 +3858,43 @@ Perl_my_strerror(pTHX_ const int errnum) /* The setlocale() just below likely will zap 'save_locale', so * create a copy. */ save_locale = savepv(save_locale); - setlocale(LC_MESSAGES, "C"); + do_setlocale_c(LC_MESSAGES, "C"); } } -# endif +# endif } /* end of ! within_locale_scope */ - -#endif - - errstr = Strerror(errnum); - if (errstr) { - errstr = savepv(errstr); - SAVEFREEPV(errstr); + else { + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n", + __FILE__, __LINE__)); } -#ifdef USE_LOCALE_MESSAGES + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "Any locale change has been done; about to call Strerror\n")); + errstr = savepv(Strerror(errnum)); if (! within_locale_scope) { errno = 0; -# ifdef USE_THREAD_SAFE_LOCALE +# ifdef USE_POSIX_2008_LOCALE + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "%s: %d: not within locale scope, restoring the locale\n", + __FILE__, __LINE__)); if (save_locale && ! uselocale(save_locale)) { DEBUG_L(PerlIO_printf(Perl_debug_log, "uselocale restore failed, errno=%d\n", errno)); } } -# else +# else if (save_locale && ! locale_is_C) { - if (! setlocale(LC_MESSAGES, save_locale)) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "setlocale restore failed, errno=%d\n", errno)); + if (! do_setlocale_c(LC_MESSAGES, save_locale)) { + Perl_croak(aTHX_ + "panic: %s: %d: setlocale restore failed, errno=%d\n", + __FILE__, __LINE__, errno); } Safefree(save_locale); } @@ -2626,16 +3902,26 @@ Perl_my_strerror(pTHX_ const int errnum) LOCALE_UNLOCK; -# endif +# endif +# endif /* End of doesn't have strerror_l */ +#endif /* End of does have locale messages */ + +#ifdef DEBUGGING + + if (DEBUG_Lv_TEST) { + PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '"); + print_bytes_for_locale(errstr, errstr + strlen(errstr), 0); + PerlIO_printf(Perl_debug_log, "'\n"); + } + #endif + SAVEFREEPV(errstr); return errstr; } /* -=head1 Locale-related functions and macros - =for apidoc sync_locale Changing the program's locale should be avoided by XS code. Nevertheless, @@ -2649,26 +3935,42 @@ to do so, before returning to Perl. void Perl_sync_locale(pTHX) { + char * newlocale; #ifdef USE_LOCALE_CTYPE - new_ctype(setlocale(LC_CTYPE, NULL)); -#endif /* USE_LOCALE_CTYPE */ + newlocale = do_setlocale_c(LC_CTYPE, NULL); + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "%s:%d: %s\n", __FILE__, __LINE__, + setlocale_debug_string(LC_CTYPE, NULL, newlocale))); + new_ctype(newlocale); + +#endif /* USE_LOCALE_CTYPE */ #ifdef USE_LOCALE_COLLATE - new_collate(setlocale(LC_COLLATE, NULL)); -#endif + newlocale = do_setlocale_c(LC_COLLATE, NULL); + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "%s:%d: %s\n", __FILE__, __LINE__, + setlocale_debug_string(LC_COLLATE, NULL, newlocale))); + new_collate(newlocale); + +#endif #ifdef USE_LOCALE_NUMERIC - set_numeric_local(); /* Switch from "C" to underlying LC_NUMERIC */ - new_numeric(setlocale(LC_NUMERIC, NULL)); + + newlocale = do_setlocale_c(LC_NUMERIC, NULL); + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "%s:%d: %s\n", __FILE__, __LINE__, + setlocale_debug_string(LC_NUMERIC, NULL, newlocale))); + new_numeric(newlocale); + #endif /* USE_LOCALE_NUMERIC */ } #if defined(DEBUGGING) && defined(USE_LOCALE) -char * -Perl__setlocale_debug_string(const int category, /* category number, +STATIC char * +S_setlocale_debug_string(const int category, /* category number, like LC_ALL */ const char* const locale, /* locale name */ @@ -2686,49 +3988,9 @@ Perl__setlocale_debug_string(const int category, /* category number, static char ret[128] = "If you can read this, thank your buggy C" " library strlcpy(), and change your hints file" " to undef it"; - my_strlcpy(ret, "setlocale(", sizeof(ret)); - - switch (category) { - default: - my_snprintf(ret, sizeof(ret), "%s? %d", ret, category); - break; -# ifdef LC_ALL - case LC_ALL: - my_strlcat(ret, "LC_ALL", sizeof(ret)); - break; -# endif -# ifdef LC_CTYPE - case LC_CTYPE: - my_strlcat(ret, "LC_CTYPE", sizeof(ret)); - break; -# endif -# ifdef LC_NUMERIC - case LC_NUMERIC: - my_strlcat(ret, "LC_NUMERIC", sizeof(ret)); - break; -# endif -# ifdef LC_COLLATE - case LC_COLLATE: - my_strlcat(ret, "LC_COLLATE", sizeof(ret)); - break; -# endif -# ifdef LC_TIME - case LC_TIME: - my_strlcat(ret, "LC_TIME", sizeof(ret)); - break; -# endif -# ifdef LC_MONETARY - case LC_MONETARY: - my_strlcat(ret, "LC_MONETARY", sizeof(ret)); - break; -# endif -# ifdef LC_MESSAGES - case LC_MESSAGES: - my_strlcat(ret, "LC_MESSAGES", sizeof(ret)); - break; -# endif - } + my_strlcpy(ret, "setlocale(", sizeof(ret)); + my_strlcat(ret, category_name(category), sizeof(ret)); my_strlcat(ret, ", ", sizeof(ret)); if (locale) {