X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/4ac71550d23cca4632a2bcdfcb1d83a6bf705e45..a9421703c81675c866e764cc79ff61a061faf9d6:/locale.c diff --git a/locale.c b/locale.c index f7cb79c..31aa592 100644 --- a/locale.c +++ b/locale.c @@ -10,9 +10,9 @@ /* * A Elbereth Gilthoniel, - * silivren penna míriel + * silivren penna míriel * o menel aglar elenath! - * Na-chaered palan-díriel + * Na-chaered palan-díriel * o galadhremmin ennorath, * Fanuilos, le linnathon * nef aear, si nef aearon! @@ -28,28 +28,26 @@ #define PERL_IN_LOCALE_C #include "perl.h" -#ifdef I_LOCALE -# include -#endif - #ifdef I_LANGINFO # include #endif #include "reentr.h" -#if defined(USE_LOCALE_NUMERIC) || defined(USE_LOCALE_COLLATE) +#ifdef USE_LOCALE + /* * Standardize the locale name from a string returned by 'setlocale'. * - * The standard return value of setlocale() is either + * The typical return value of setlocale() is either * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL * (the space-separated values represent the various sublocales, - * in some unspecificed order) + * in some unspecified order). This is not handled by this function. * * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n", - * which is harmful for further use of the string in setlocale(). + * which is harmful for further use of the string in setlocale(). This + * function removes the trailing new line and everything up through the '=' * */ STATIC char * @@ -79,6 +77,7 @@ S_stdize_locale(pTHX_ char *locs) return locs; } + #endif void @@ -99,6 +98,12 @@ Perl_set_numeric_radix(pTHX) sv_setpv(PL_numeric_radix_sv, lc->decimal_point); else PL_numeric_radix_sv = newSVpv(lc->decimal_point, 0); + if (! is_ascii_string((U8 *) lc->decimal_point, 0) + && is_utf8_string((U8 *) lc->decimal_point, 0) + && is_cur_LC_category_utf8(LC_NUMERIC)) + { + SvUTF8_on(PL_numeric_radix_sv); + } } } else @@ -114,6 +119,7 @@ void Perl_new_numeric(pTHX_ const char *newnum) { #ifdef USE_LOCALE_NUMERIC + char *save_newnum; dVAR; if (! newnum) { @@ -124,14 +130,18 @@ Perl_new_numeric(pTHX_ const char *newnum) return; } - if (! PL_numeric_name || strNE(PL_numeric_name, newnum)) { + save_newnum = stdize_locale(savepv(newnum)); + if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) { Safefree(PL_numeric_name); - PL_numeric_name = stdize_locale(savepv(newnum)); - PL_numeric_standard = ((*newnum == 'C' && newnum[1] == '\0') - || strEQ(newnum, "POSIX")); + PL_numeric_name = save_newnum; + PL_numeric_standard = ((*save_newnum == 'C' && save_newnum[1] == '\0') + || strEQ(save_newnum, "POSIX")); PL_numeric_local = TRUE; set_numeric_radix(); } + else { + Safefree(save_newnum); + } #endif /* USE_LOCALE_NUMERIC */ } @@ -176,15 +186,15 @@ Perl_new_ctype(pTHX_ const char *newctype) { #ifdef USE_LOCALE_CTYPE dVAR; - int i; + UV i; PERL_ARGS_ASSERT_NEW_CTYPE; for (i = 0; i < 256; i++) { - if (isUPPER_LC(i)) - PL_fold_locale[i] = toLOWER_LC(i); - else if (isLOWER_LC(i)) - PL_fold_locale[i] = toUPPER_LC(i); + if (isUPPER_LC((U8) i)) + PL_fold_locale[i] = toLOWER_LC((U8) i); + else if (isLOWER_LC((U8) i)) + PL_fold_locale[i] = toUPPER_LC((U8) i); else PL_fold_locale[i] = i; } @@ -231,8 +241,9 @@ Perl_new_collate(pTHX_ const char *newcoll) const Size_t fa = strxfrm(xbuf, "a", XFRMBUFSIZE); const Size_t fb = strxfrm(xbuf, "ab", XFRMBUFSIZE); const SSize_t mult = fb - fa; - if (mult < 1) - Perl_croak(aTHX_ "strxfrm() gets absurd"); + if (mult < 1 && !(fa == 0 && fb == 0)) + Perl_croak(aTHX_ "panic: strxfrm() gets absurd - a => %"UVuf", ab => %"UVuf, + (UV) fa, (UV) fb); PL_collxfrm_base = (fa > (Size_t)mult) ? (fa - mult) : 0; PL_collxfrm_mult = mult; } @@ -269,6 +280,10 @@ Perl_init_i18nl10n(pTHX_ int printwarn) #ifdef __GLIBC__ char * const language = PerlEnv_getenv("LANGUAGE"); #endif + /* NULL uses the existing already set up locale */ + const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT")) + ? NULL + : ""; char * const lc_all = PerlEnv_getenv("LC_ALL"); char * const lang = PerlEnv_getenv("LANG"); bool setlocale_failure = FALSE; @@ -282,73 +297,73 @@ Perl_init_i18nl10n(pTHX_ int printwarn) bool done = FALSE; -#ifdef LC_ALL +# ifdef LC_ALL if (lang) { - if (setlocale(LC_ALL, "")) + if (setlocale(LC_ALL, setlocale_init)) done = TRUE; else setlocale_failure = TRUE; } if (!setlocale_failure) { -#ifdef USE_LOCALE_CTYPE +# ifdef USE_LOCALE_CTYPE Safefree(curctype); if (! (curctype = setlocale(LC_CTYPE, (!done && (lang || PerlEnv_getenv("LC_CTYPE"))) - ? "" : NULL))) + ? setlocale_init : NULL))) setlocale_failure = TRUE; else curctype = savepv(curctype); -#endif /* USE_LOCALE_CTYPE */ -#ifdef USE_LOCALE_COLLATE +# endif /* USE_LOCALE_CTYPE */ +# ifdef USE_LOCALE_COLLATE Safefree(curcoll); if (! (curcoll = setlocale(LC_COLLATE, (!done && (lang || PerlEnv_getenv("LC_COLLATE"))) - ? "" : NULL))) + ? setlocale_init : NULL))) setlocale_failure = TRUE; else curcoll = savepv(curcoll); -#endif /* USE_LOCALE_COLLATE */ -#ifdef USE_LOCALE_NUMERIC +# endif /* USE_LOCALE_COLLATE */ +# ifdef USE_LOCALE_NUMERIC Safefree(curnum); if (! (curnum = setlocale(LC_NUMERIC, (!done && (lang || PerlEnv_getenv("LC_NUMERIC"))) - ? "" : NULL))) + ? setlocale_init : NULL))) setlocale_failure = TRUE; else curnum = savepv(curnum); -#endif /* USE_LOCALE_NUMERIC */ +# endif /* USE_LOCALE_NUMERIC */ } -#endif /* LC_ALL */ +# endif /* LC_ALL */ #endif /* !LOCALE_ENVIRON_REQUIRED */ #ifdef LC_ALL - if (! setlocale(LC_ALL, "")) + if (! setlocale(LC_ALL, setlocale_init)) setlocale_failure = TRUE; #endif /* LC_ALL */ if (!setlocale_failure) { #ifdef USE_LOCALE_CTYPE Safefree(curctype); - if (! (curctype = setlocale(LC_CTYPE, ""))) + if (! (curctype = setlocale(LC_CTYPE, setlocale_init))) setlocale_failure = TRUE; else curctype = savepv(curctype); #endif /* USE_LOCALE_CTYPE */ #ifdef USE_LOCALE_COLLATE Safefree(curcoll); - if (! (curcoll = setlocale(LC_COLLATE, ""))) + if (! (curcoll = setlocale(LC_COLLATE, setlocale_init))) setlocale_failure = TRUE; else curcoll = savepv(curcoll); #endif /* USE_LOCALE_COLLATE */ #ifdef USE_LOCALE_NUMERIC Safefree(curnum); - if (! (curnum = setlocale(LC_NUMERIC, ""))) + if (! (curnum = setlocale(LC_NUMERIC, setlocale_init))) setlocale_failure = TRUE; else curnum = savepv(curnum); @@ -496,53 +511,15 @@ Perl_init_i18nl10n(pTHX_ int printwarn) } -#endif /* USE_LOCALE */ - -#ifdef USE_PERLIO +#if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE) { /* Set PL_utf8locale to TRUE if using PerlIO _and_ - any of the following are true: - - nl_langinfo(CODESET) contains /^utf-?8/i - - $ENV{LC_ALL} contains /^utf-?8/i - - $ENV{LC_CTYPE} contains /^utf-?8/i - - $ENV{LANG} contains /^utf-?8/i - The LC_ALL, LC_CTYPE, LANG obey the usual override - hierarchy of locale environment variables. (LANGUAGE - affects only LC_MESSAGES only under glibc.) (If present, - it overrides LC_MESSAGES for GNU gettext, and it also - can have more than one locale, separated by spaces, - in case you need to know.) + the current LC_CTYPE locale is UTF-8. If PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open discipline. */ - bool utf8locale = FALSE; - char *codeset = NULL; -#if defined(HAS_NL_LANGINFO) && defined(CODESET) - codeset = nl_langinfo(CODESET); -#endif - if (codeset) - utf8locale = (Perl_ibcmp(aTHX_ codeset, STR_WITH_LEN("UTF-8")) == 0 || - Perl_ibcmp(aTHX_ codeset, STR_WITH_LEN("UTF8") ) == 0); -#if defined(USE_LOCALE) - else { /* nl_langinfo(CODESET) is supposed to correctly - * interpret the locale environment variables, - * but just in case it fails, let's do this manually. */ - if (lang) - utf8locale = (Perl_ibcmp(aTHX_ lang, STR_WITH_LEN("UTF-8")) == 0 || - Perl_ibcmp(aTHX_ lang, STR_WITH_LEN("UTF8") ) == 0); -#ifdef USE_LOCALE_CTYPE - if (curctype) - utf8locale = (Perl_ibcmp(aTHX_ curctype, STR_WITH_LEN("UTF-8")) == 0 || - Perl_ibcmp(aTHX_ curctype, STR_WITH_LEN("UTF8") ) == 0); -#endif - if (lc_all) - utf8locale = (Perl_ibcmp(aTHX_ lc_all, STR_WITH_LEN("UTF-8")) == 0 || - Perl_ibcmp(aTHX_ lc_all, STR_WITH_LEN("UTF8") ) == 0); - } -#endif /* USE_LOCALE */ - if (utf8locale) - PL_utf8locale = TRUE; + PL_utf8locale = is_cur_LC_category_utf8(LC_CTYPE); } /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO. This is an alternative to using the -C command line switch @@ -564,6 +541,9 @@ Perl_init_i18nl10n(pTHX_ int printwarn) #ifdef USE_LOCALE_NUMERIC Safefree(curnum); #endif /* USE_LOCALE_NUMERIC */ + +#endif /* USE_LOCALE */ + return ok; } @@ -630,12 +610,300 @@ Perl_mem_collxfrm(pTHX_ const char *s, STRLEN len, STRLEN *xlen) #endif /* USE_LOCALE_COLLATE */ +#ifdef USE_LOCALE + +STATIC bool +S_is_cur_LC_category_utf8(pTHX_ int category) +{ + /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE + * otherwise. 'category' may not be LC_ALL. If the platform doesn't have + * nl_langinfo(), this employs a heuristic, which hence could give the + * wrong result. It errs on the side of not being a UTF-8 locale. */ + + char *save_input_locale = NULL; + STRLEN final_pos; + +#ifdef LC_ALL + assert(category != LC_ALL); +#endif + + /* First dispose of the trivial cases */ + save_input_locale = stdize_locale(setlocale(category, NULL)); + if (! save_input_locale) { + return FALSE; /* XXX maybe should croak */ + } + if ((*save_input_locale == 'C' && save_input_locale[1] == '\0') + || strEQ(save_input_locale, "POSIX")) + { + return FALSE; + } + + save_input_locale = savepv(save_input_locale); + +#if defined(HAS_NL_LANGINFO) && defined(CODESET) && defined(USE_LOCALE_CTYPE) + + { /* Next try nl_langinfo if available */ + + char *save_ctype_locale = NULL; + char *codeset = NULL; + + if (category != LC_CTYPE) { /* nl_langinfo works only on LC_CTYPE */ + + /* Get the current LC_CTYPE locale */ + save_ctype_locale = stdize_locale(savepv(setlocale(LC_CTYPE, NULL))); + if (! save_ctype_locale) { + goto cant_use_nllanginfo; + } + + /* If LC_CTYPE and the desired category use the same locale, this + * means that finding the value for LC_CTYPE is the same as finding + * the value for the desired category. Otherwise, switch LC_CTYPE + * to the desired category's locale */ + if (strEQ(save_ctype_locale, save_input_locale)) { + Safefree(save_ctype_locale); + save_ctype_locale = NULL; + } + else if (! setlocale(LC_CTYPE, save_input_locale)) { + Safefree(save_ctype_locale); + goto cant_use_nllanginfo; + } + } + + /* Here the current LC_CTYPE is set to the locale of the category whose + * information is desired. This means that nl_langinfo() should give + * the correct results */ + codeset = savepv(nl_langinfo(CODESET)); + if (codeset) { + bool is_utf8; + + /* If we switched LC_CTYPE, switch back */ + if (save_ctype_locale) { + setlocale(LC_CTYPE, save_ctype_locale); + Safefree(save_ctype_locale); + } + + is_utf8 = foldEQ(codeset, STR_WITH_LEN("UTF-8")) + || foldEQ(codeset, STR_WITH_LEN("UTF8")); + + Safefree(codeset); + Safefree(save_input_locale); + return is_utf8; + } + + } + cant_use_nllanginfo: + +#endif /* HAS_NL_LANGINFO etc */ + + /* nl_langinfo not available or failed somehow. Look at the locale name to + * see if it matches qr/UTF -? 8 /ix */ + + final_pos = strlen(save_input_locale) - 1; + if (final_pos >= 3) { + char *name = save_input_locale; + + /* Find next 'U' or 'u' and look from there */ + while ((name += strcspn(name, "Uu") + 1) + <= save_input_locale + final_pos - 2) + { + if (toFOLD(*(name)) != 't' + || toFOLD(*(name + 1)) != 'f') + { + continue; + } + name += 2; + if (*(name) == '-') { + if ((name > save_input_locale + final_pos - 1)) { + break; + } + name++; + } + if (*(name) == '8') { + Safefree(save_input_locale); + return TRUE; + } + } + } + +#ifdef WIN32 + /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */ + if (final_pos >= 4 + && *(save_input_locale + final_pos - 0) == '1' + && *(save_input_locale + final_pos - 1) == '0' + && *(save_input_locale + final_pos - 2) == '0' + && *(save_input_locale + final_pos - 3) == '5' + && *(save_input_locale + final_pos - 4) == '6') + { + Safefree(save_input_locale); + return TRUE; + } +#endif + + /* Other common encodings are the ISO 8859 series, which aren't UTF-8 */ + if (instr(save_input_locale, "8859")) { + Safefree(save_input_locale); + return FALSE; + } + +#ifdef HAS_LOCALECONV + +# ifdef USE_LOCALE_MONETARY + + /* Here, there is nothing in the locale name to indicate whether the locale + * is UTF-8 or not. This "name", the return of setlocale(), is actually + * defined to be opaque, so we can't really rely on the absence of various + * substrings in the name to indicate its UTF-8ness. Look at the locale's + * currency symbol. Often that will be in the native script, and if the + * symbol isn't in UTF-8, we know that the locale isn't. If it is + * non-ASCII UTF-8, we infer that the locale is too. + * To do this, like above for LC_CTYPE, we first set LC_MONETARY to the + * locale of the desired category, if it isn't that locale already */ + + { + char *save_monetary_locale = NULL; + bool illegal_utf8 = FALSE; + bool only_ascii = FALSE; + const struct lconv* const lc = localeconv(); + + if (category != LC_MONETARY) { + + save_monetary_locale = stdize_locale(savepv(setlocale(LC_MONETARY, + NULL))); + if (! save_monetary_locale) { + goto cant_use_monetary; + } + + if (strNE(save_monetary_locale, save_input_locale)) { + if (! setlocale(LC_MONETARY, save_input_locale)) { + Safefree(save_monetary_locale); + goto cant_use_monetary; + } + } + } + + /* Here the current LC_MONETARY is set to the locale of the category + * whose information is desired. */ + + if (lc && lc->currency_symbol) { + if (! is_utf8_string((U8 *) lc->currency_symbol, 0)) { + illegal_utf8 = TRUE; + } + else if (is_ascii_string((U8 *) lc->currency_symbol, 0)) { + only_ascii = TRUE; + } + } + + /* If we changed it, restore LC_MONETARY to its original locale */ + if (save_monetary_locale) { + setlocale(LC_MONETARY, save_monetary_locale); + Safefree(save_monetary_locale); + } + + Safefree(save_input_locale); + + /* It isn't a UTF-8 locale if the symbol is not legal UTF-8; otherwise + * assume the locale is UTF-8 if and only if the symbol is non-ascii + * UTF-8. (We can't really tell if the locale is UTF-8 or not if the + * symbol is just a '$', so we err on the side of it not being UTF-8) + * */ + return (illegal_utf8) + ? FALSE + : ! only_ascii; + + } + cant_use_monetary: + +# endif /* USE_LOCALE_MONETARY */ +#endif /* HAS_LOCALECONV */ + +#if 0 && defined(HAS_STRERROR) && defined(USE_LOCALE_MESSAGES) + +/* This code is ifdefd out because it was found to not be necessary in testing + * on our dromedary test machine, which has over 700 locales. There, looking + * at just the currency symbol gave essentially the same results as doing this + * extra work. Executing this also caused segfaults in miniperl. I left it in + * so as to avoid rewriting it if real-world experience indicates that + * dromedary is an outlier. Essentially, instead of returning abpve if we + * haven't found illegal utf8, we continue on and examine all the strerror() + * messages on the platform for utf8ness. If all are ASCII, we still don't + * know the answer; but otherwise we have a pretty good indication of the + * utf8ness. The reason this doesn't necessarily help much is that the + * messages may not have been translated into the locale. The currency symbol + * is much more likely to have been translated. The code below would need to + * be altered somewhat to just be a continuation of testing the currency + * symbol. */ + int e; + unsigned int failures = 0, non_ascii = 0; + char *save_messages_locale = NULL; + + /* Like above for LC_CTYPE, we set LC_MESSAGES to the locale of the + * desired category, if it isn't that locale already */ + + if (category != LC_MESSAGES) { + + save_messages_locale = stdize_locale(savepv(setlocale(LC_MESSAGES, + NULL))); + if (! save_messages_locale) { + goto cant_use_messages; + } + + if (strEQ(save_messages_locale, save_input_locale)) { + Safefree(save_input_locale); + } + else if (! setlocale(LC_MESSAGES, save_input_locale)) { + Safefree(save_messages_locale); + goto cant_use_messages; + } + } + + /* Here the current LC_MESSAGES is set to the locale of the category + * whose information is desired. Look through all the messages */ + + for (e = 0; +#ifdef HAS_SYS_ERRLIST + e <= sys_nerr +#endif + ; e++) + { + const U8* const errmsg = (U8 *) Strerror(e) ; + if (!errmsg) + break; + if (! is_utf8_string(errmsg, 0)) { + failures++; + break; + } + else if (! is_ascii_string(errmsg, 0)) { + non_ascii++; + } + } + + /* And, if we changed it, restore LC_MESSAGES to its original locale */ + if (save_messages_locale) { + setlocale(LC_MESSAGES, save_messages_locale); + Safefree(save_messages_locale); + } + + /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid, + * any non-ascii means it is one; otherwise we assume it isn't */ + return (failures) ? FALSE : non_ascii; + + } + cant_use_messages: + +#endif + + Safefree(save_input_locale); + return FALSE; +} + +#endif + /* * Local variables: * c-indentation-style: bsd * c-basic-offset: 4 - * indent-tabs-mode: t + * indent-tabs-mode: nil * End: * - * ex: set ts=8 sts=4 sw=4 noet: + * ex: set ts=8 sts=4 sw=4 et: */