X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/5857e934cfe6888b4c2a28640ad5677cc4e268e3..0d42058edbaed311ab41cba6c791941297388c3b:/locale.c diff --git a/locale.c b/locale.c index 20edb19..a1fe449 100644 --- a/locale.c +++ b/locale.c @@ -106,7 +106,7 @@ Perl_set_numeric_radix(pTHX) sv_setpv(PL_numeric_radix_sv, lc->decimal_point); else PL_numeric_radix_sv = newSVpv(lc->decimal_point, 0); - if (! is_ascii_string((U8 *) lc->decimal_point, 0) + if (! is_invariant_string((U8 *) lc->decimal_point, 0) && is_utf8_string((U8 *) lc->decimal_point, 0) && _is_cur_LC_category_utf8(LC_NUMERIC)) { @@ -159,7 +159,7 @@ Perl_new_numeric(pTHX_ const char *newnum) * dot. * * This sets several interpreter-level variables: - * PL_numeric_name The default locale's name: a copy of 'newnum' + * PL_numeric_name The underlying locale's name: a copy of 'newnum' * PL_numeric_local A boolean indicating if the toggled state is such * that the current locale is the program's underlying * locale @@ -272,6 +272,13 @@ Perl_new_ctype(pTHX_ const char *newctype) PERL_ARGS_ASSERT_NEW_CTYPE; + /* We will replace any bad locale warning with 1) nothing if the new one is + * ok; or 2) a new warning for the bad new locale */ + if (PL_warn_locale) { + SvREFCNT_dec_NN(PL_warn_locale); + PL_warn_locale = NULL; + } + PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE); /* A UTF-8 locale gets standard rules. But note that code still has to @@ -280,6 +287,18 @@ Perl_new_ctype(pTHX_ const char *newctype) Copy(PL_fold_latin1, PL_fold_locale, 256, U8); } else { + /* Assume enough space for every character being bad. 4 spaces each + * for the 94 printable characters that are output like "'x' "; and 5 + * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating + * NUL */ + char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ]; + + bool check_for_problems = ckWARN_d(WARN_LOCALE); /* No warnings means + no check */ + bool multi_byte_locale = FALSE; /* Assume is a single-byte locale + to start */ + unsigned int bad_count = 0; /* Count of bad characters */ + for (i = 0; i < 256; i++) { if (isUPPER_LC((U8) i)) PL_fold_locale[i] = (U8) toLOWER_LC((U8) i); @@ -287,6 +306,104 @@ Perl_new_ctype(pTHX_ const char *newctype) PL_fold_locale[i] = (U8) toUPPER_LC((U8) i); else PL_fold_locale[i] = (U8) i; + + /* If checking for locale problems, see if the native ASCII-range + * printables plus \n and \t are in their expected categories in + * the new locale. If not, this could mean big trouble, upending + * Perl's and most programs' assumptions, like having a + * metacharacter with special meaning become a \w. Fortunately, + * it's very rare to find locales that aren't supersets of ASCII + * nowadays. It isn't a problem for most controls to be changed + * into something else; we check only \n and \t, though perhaps \r + * could be an issue as well. */ + if (check_for_problems + && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n')) + { + if ((isALPHANUMERIC_A(i) && ! isALPHANUMERIC_LC(i)) + || (isPUNCT_A(i) && ! isPUNCT_LC(i)) + || (isBLANK_A(i) && ! isBLANK_LC(i)) + || (i == '\n' && ! isCNTRL_LC(i))) + { + if (bad_count) { /* Separate multiple entries with a + blank */ + bad_chars_list[bad_count++] = ' '; + } + bad_chars_list[bad_count++] = '\''; + if (isPRINT_A(i)) { + bad_chars_list[bad_count++] = (char) i; + } + else { + bad_chars_list[bad_count++] = '\\'; + if (i == '\n') { + bad_chars_list[bad_count++] = 'n'; + } + else { + assert(i == '\t'); + bad_chars_list[bad_count++] = 't'; + } + } + bad_chars_list[bad_count++] = '\''; + bad_chars_list[bad_count] = '\0'; + } + } + } + +#ifdef MB_CUR_MAX + /* We only handle single-byte locales (outside of UTF-8 ones; so if + * this locale requires than one byte, there are going to be + * problems. */ + if (check_for_problems && MB_CUR_MAX > 1 + + /* Some platforms return MB_CUR_MAX > 1 for even the "C" + * locale. Just assume that the implementation for them (plus + * for POSIX) is correct and the > 1 value is spurious. (Since + * these are specially handled to never be considered UTF-8 + * locales, as long as this is the only problem, everything + * should work fine */ + && strNE(newctype, "C") && strNE(newctype, "POSIX")) + { + multi_byte_locale = TRUE; + } +#endif + + if (bad_count || multi_byte_locale) { + PL_warn_locale = Perl_newSVpvf(aTHX_ + "Locale '%s' may not work well.%s%s%s\n", + newctype, + (multi_byte_locale) + ? " Some characters in it are not recognized by" + " Perl." + : "", + (bad_count) + ? "\nThe following characters (and maybe others)" + " may not have the same meaning as the Perl" + " program expects:\n" + : "", + (bad_count) + ? bad_chars_list + : "" + ); + /* If we are actually in the scope of the locale, output the + * message now. Otherwise we save it to be output at the first + * operation using this locale, if that actually happens. Most + * programs don't use locales, so they are immune to bad ones */ + if (IN_LC(LC_CTYPE)) { + + /* We have to save 'newctype' because the setlocale() just + * below may destroy it. The next setlocale() further down + * should restore it properly so that the intermediate change + * here is transparent to this function's caller */ + const char * const badlocale = savepv(newctype); + + setlocale(LC_CTYPE, "C"); + + /* The '0' below suppresses a bogus gcc compiler warning */ + Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0); + setlocale(LC_CTYPE, badlocale); + Safefree(badlocale); + SvREFCNT_dec_NN(PL_warn_locale); + PL_warn_locale = NULL; + } } } @@ -297,6 +414,32 @@ Perl_new_ctype(pTHX_ const char *newctype) } void +Perl__warn_problematic_locale() +{ + +#ifdef USE_LOCALE_CTYPE + + dTHX; + + /* Internal-to-core function that outputs the message in PL_warn_locale, + * and then NULLS it. Should be called only through the macro + * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */ + + if (PL_warn_locale) { + /*GCC_DIAG_IGNORE(-Wformat-security); Didn't work */ + Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), + SvPVX(PL_warn_locale), + 0 /* dummy to avoid compiler warning */ ); + /* GCC_DIAG_RESTORE; */ + SvREFCNT_dec_NN(PL_warn_locale); + PL_warn_locale = NULL; + } + +#endif + +} + +void Perl_new_collate(pTHX_ const char *newcoll) { #ifdef USE_LOCALE_COLLATE @@ -365,7 +508,7 @@ Perl_my_setlocale(pTHX_ int category, const char* locale) * otherwise to use the particular category's variable if set; otherwise to * use the LANG variable. */ - bool override_LC_ALL = 0; + bool override_LC_ALL = FALSE; char * result; if (locale && strEQ(locale, "")) { @@ -431,7 +574,7 @@ Perl_my_setlocale(pTHX_ int category, const char* locale) return result; } - /* Here the input locale was LC_ALL, and we have set it to what is in the + /* Here the input category was LC_ALL, and we have set it to what is in the * LANG variable or the system default if there is no LANG. But these have * lower priority than the other LC_foo variables, so override it for each * one that is set. (If they are set to "", it means to use the same thing @@ -511,7 +654,7 @@ Perl_init_i18nl10n(pTHX_ int printwarn) char *curnum = NULL; #endif /* USE_LOCALE_NUMERIC */ #ifdef __GLIBC__ - char * const language = PerlEnv_getenv("LANGUAGE"); + const char * const language = savepv(PerlEnv_getenv("LANGUAGE")); #endif /* NULL uses the existing already set up locale */ @@ -520,14 +663,22 @@ Perl_init_i18nl10n(pTHX_ int printwarn) : ""; const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */ unsigned int trial_locales_count; - char * const lc_all = PerlEnv_getenv("LC_ALL"); - char * const lang = PerlEnv_getenv("LANG"); + const char * const lc_all = savepv(PerlEnv_getenv("LC_ALL")); + const char * const lang = savepv(PerlEnv_getenv("LANG")); bool setlocale_failure = FALSE; unsigned int i; char *p; - const bool locwarn = (printwarn > 1 || - (printwarn && - (!(p = PerlEnv_getenv("PERL_BADLANG")) || atoi(p)))); + + /* A later getenv() could zap this, so only use here */ + const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG"); + + const bool locwarn = (printwarn > 1 + || (printwarn + && (! bad_lang_use_once + || ( + /* disallow with "" or "0" */ + *bad_lang_use_once + && strNE("0", bad_lang_use_once))))); bool done = FALSE; #ifdef WIN32 /* In some systems you can find out the system default locale @@ -777,7 +928,7 @@ Perl_init_i18nl10n(pTHX_ int printwarn) } /* Calculate what fallback locales to try. We have avoided this - * until we have to, becuase failure is quite unlikely. This will + * until we have to, because failure is quite unlikely. This will * usually change the upper bound of the loop we are in. * * Since the system's default way of setting the locale has not @@ -932,6 +1083,13 @@ Perl_init_i18nl10n(pTHX_ int printwarn) Safefree(curnum); #endif /* USE_LOCALE_NUMERIC */ +#ifdef __GLIBC__ + Safefree(language); +#endif + + Safefree(lc_all); + Safefree(lang); + #else /* !USE_LOCALE */ PERL_UNUSED_ARG(printwarn); #endif /* USE_LOCALE */ @@ -1010,8 +1168,10 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE * otherwise. 'category' may not be LC_ALL. If the platform doesn't have * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence - * could give the wrong result. It errs on the side of not being a UTF-8 - * locale. */ + * could give the wrong result. The result will very likely be correct for + * languages that have commonly used non-ASCII characters, but for notably + * English, it comes down to if the locale's name ends in something like + * "UTF-8". It errs on the side of not being a UTF-8 locale. */ char *save_input_locale = NULL; STRLEN final_pos; @@ -1130,16 +1290,14 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * result */ if (is_utf8) { wchar_t wc; - GCC_DIAG_IGNORE(-Wunused-result); - (void) mbtowc(&wc, NULL, 0); /* Reset any shift state */ - GCC_DIAG_RESTORE; + PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */ errno = 0; if ((size_t)mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8)) != strlen(HYPHEN_UTF8) || wc != (wchar_t) 0x2010) { is_utf8 = FALSE; - DEBUG_L(PerlIO_printf(Perl_debug_log, "\thyphen=U+%x\n", wc)); + DEBUG_L(PerlIO_printf(Perl_debug_log, "\thyphen=U+%x\n", (unsigned int)wc)); DEBUG_L(PerlIO_printf(Perl_debug_log, "\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n", mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8)), errno)); @@ -1167,7 +1325,8 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) * currency symbol to see if it disambiguates things. Often that will be * in the native script, and if the symbol isn't in UTF-8, we know that the * locale isn't. If it is non-ASCII UTF-8, we infer that the locale is - * too. */ + * too, as the odds of a non-UTF8 string being valid UTF-8 are quite small + * */ #ifdef HAS_LOCALECONV # ifdef USE_LOCALE_MONETARY @@ -1209,7 +1368,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) lc = localeconv(); if (! lc || ! lc->currency_symbol - || is_ascii_string((U8 *) lc->currency_symbol, 0)) + || is_invariant_string((U8 *) lc->currency_symbol, 0)) { DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); only_ascii = TRUE; @@ -1282,14 +1441,14 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) /* Here the current LC_TIME is set to the locale of the category * whose information is desired. Look at all the days of the week and - * month names, and the timezone and am/pm indicator for non-ASCII + * month names, and the timezone and am/pm indicator for UTF-8 variant * characters. The first such a one found will tell us if the locale * is UTF-8 or not */ for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */ formatted_time = my_strftime("%A %B %Z %p", 0, 0, hour, dom, month, 112, 0, 0, is_dst); - if (! formatted_time || is_ascii_string((U8 *) formatted_time, 0)) { + if (! formatted_time || is_invariant_string((U8 *) formatted_time, 0)) { /* Here, we didn't find a non-ASCII. Try the next time through * with the complemented dst and am/pm, and try with the next @@ -1390,7 +1549,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) break; } errmsg = savepv(errmsg); - if (! is_ascii_string((U8 *) errmsg, 0)) { + if (! is_invariant_string((U8 *) errmsg, 0)) { non_ascii = TRUE; is_utf8 = is_utf8_string((U8 *) errmsg, 0); break; @@ -1423,6 +1582,8 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) #endif /* the code that is compiled when no nl_langinfo */ +#ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a + UTF-8 locale */ /* As a last resort, look at the locale name to see if it matches * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the * return of setlocale(), is actually defined to be opaque, so we can't @@ -1438,8 +1599,8 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) while ((name += strcspn(name, "Uu") + 1) <= save_input_locale + final_pos - 2) { - if (toFOLD(*(name)) != 't' - || toFOLD(*(name + 1)) != 'f') + if (!isALPHA_FOLD_NE(*name, 't') + || isALPHA_FOLD_NE(*(name + 1), 'f')) { continue; } @@ -1462,6 +1623,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) "Locale %s doesn't end with UTF-8 in name\n", save_input_locale)); } +#endif #ifdef WIN32 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */ @@ -1556,6 +1718,41 @@ Perl_my_strerror(pTHX_ const int errnum) { } /* + +=head1 Locale-related functions and macros + +=for apidoc sync_locale + +Changing the program's locale should be avoided by XS code. Nevertheless, +certain non-Perl libraries called from XS, such as C do so. When this +happens, Perl needs to be told that the locale has changed. Use this function +to do so, before returning to Perl. + +=cut +*/ + +void +Perl_sync_locale(pTHX) +{ + +#ifdef USE_LOCALE_CTYPE + new_ctype(setlocale(LC_CTYPE, NULL)); +#endif /* USE_LOCALE_CTYPE */ + +#ifdef USE_LOCALE_COLLATE + new_collate(setlocale(LC_COLLATE, NULL)); +#endif + +#ifdef USE_LOCALE_NUMERIC + set_numeric_local(); /* Switch from "C" to underlying LC_NUMERIC */ + new_numeric(setlocale(LC_NUMERIC, NULL)); +#endif /* USE_LOCALE_NUMERIC */ + +} + + + +/* * Local variables: * c-indentation-style: bsd * c-basic-offset: 4