+ if (! default_name || strEQ(default_name, "")) {
+ default_name = "C";
+ }
+ else if (PL_scopestack_ix != 0) {
+ SAVEFREEPV(default_name);
+ }
+
+ if (category != LC_ALL) {
+ const char * const name = PerlEnv_getenv(category_names[index]);
+
+ /* Here we are setting a single category. Assume will have the
+ * default name */
+ locale = default_name;
+
+ /* But then look for an overriding environment variable */
+ if (name && strNE(name, "")) {
+ locale = name;
+ }
+ }
+ else {
+ bool did_override = FALSE;
+ unsigned int i;
+
+ /* Here, we are getting LC_ALL. Any categories that don't have
+ * a corresponding environment variable set should be set to
+ * LANG, or to "C" if there is no LANG. If no individual
+ * categories differ from this, we can just set LC_ALL. This
+ * is buggy on systems that have extra categories that we don't
+ * know about. If there is an environment variable that sets
+ * that category, we won't know to look for it, and so our use
+ * of LANG or "C" improperly overrides it. On the other hand,
+ * if we don't do what is done here, and there is no
+ * environment variable, the category's locale should be set to
+ * LANG or "C". So there is no good solution. khw thinks the
+ * best is to look at systems to see what categories they have,
+ * and include them, and then to assume that we know the
+ * complete set */
+
+ for (i = 0; i < LC_ALL_INDEX; i++) {
+ const char * const env_override
+ = savepv(PerlEnv_getenv(category_names[i]));
+ const char * this_locale = ( env_override
+ && strNE(env_override, ""))
+ ? env_override
+ : default_name;
+ if (! emulate_setlocale(categories[i], this_locale, i, TRUE))
+ {
+ Safefree(env_override);
+ return NULL;
+ }
+
+ if (strNE(this_locale, default_name)) {
+ did_override = TRUE;
+ }
+
+ Safefree(env_override);
+ }
+
+ /* If all the categories are the same, we can set LC_ALL to
+ * that */
+ if (! did_override) {
+ locale = default_name;
+ }
+ else {
+
+ /* Here, LC_ALL is no longer valid, as some individual
+ * categories don't match it. We call ourselves
+ * recursively, as that will execute the code that
+ * generates the proper locale string for this situation.
+ * We don't do the remainder of this function, as that is
+ * to update our records, and we've just done that for the
+ * individual categories in the loop above, and doing so
+ * would cause LC_ALL to be done as well */
+ return emulate_setlocale(LC_ALL, NULL, LC_ALL_INDEX, TRUE);
+ }
+ }
+ }
+ }
+ else if (strchr(locale, ';')) {
+
+ /* LC_ALL may actually include a conglomeration of various categories.
+ * Without querylocale, this code uses the glibc (as of this writing)
+ * syntax for representing that, but that is not a stable API, and
+ * other platforms do it differently, so we have to handle all cases
+ * ourselves */
+
+ unsigned int i;
+ const char * s = locale;
+ const char * e = locale + strlen(locale);
+ const char * p = s;
+ const char * category_end;
+ const char * name_start;
+ const char * name_end;
+
+ /* If the string that gives what to set doesn't include all categories,
+ * the omitted ones get set to "C". To get this behavior, first set
+ * all the individual categories to "C", and override the furnished
+ * ones below */
+ for (i = 0; i < LC_ALL_INDEX; i++) {
+ if (! emulate_setlocale(categories[i], "C", i, TRUE)) {
+ return NULL;
+ }
+ }
+
+ while (s < e) {
+
+ /* Parse through the category */
+ while (isWORDCHAR(*p)) {
+ p++;
+ }
+ category_end = p;
+
+ if (*p++ != '=') {
+ Perl_croak(aTHX_
+ "panic: %s: %d: Unexpected character in locale name '%02X",
+ __FILE__, __LINE__, *(p-1));
+ }
+
+ /* Parse through the locale name */
+ name_start = p;
+ while (p < e && *p != ';') {
+ if (! isGRAPH(*p)) {
+ Perl_croak(aTHX_
+ "panic: %s: %d: Unexpected character in locale name '%02X",
+ __FILE__, __LINE__, *(p-1));
+ }
+ p++;
+ }
+ name_end = p;
+
+ /* Space past the semi-colon */
+ if (p < e) {
+ p++;
+ }
+
+ /* Find the index of the category name in our lists */
+ for (i = 0; i < LC_ALL_INDEX; i++) {
+ char * individ_locale;
+
+ /* Keep going if this isn't the index. The strnNE() avoids a
+ * Perl_form(), but would fail if ever a category name could be
+ * a substring of another one, like if there were a
+ * "LC_TIME_DATE" */
+ if strnNE(s, category_names[i], category_end - s) {
+ continue;
+ }
+
+ /* If this index is for the single category we're changing, we
+ * have found the locale to set it to. */
+ if (category == categories[i]) {
+ locale = Perl_form(aTHX_ "%.*s",
+ (int) (name_end - name_start),
+ name_start);
+ goto ready_to_set;
+ }
+
+ assert(category == LC_ALL);
+ individ_locale = Perl_form(aTHX_ "%.*s",
+ (int) (name_end - name_start), name_start);
+ if (! emulate_setlocale(categories[i], individ_locale, i, TRUE))
+ {
+ return NULL;
+ }
+ }
+
+ s = p;
+ }
+
+ /* Here we have set all the individual categories by recursive calls.
+ * These collectively should have fixed up LC_ALL, so can just query
+ * what that now is */
+ assert(category == LC_ALL);
+
+ return do_setlocale_c(LC_ALL, NULL);
+ }
+
+ ready_to_set: ;
+
+ /* Here at the end of having to deal with the absence of querylocale().
+ * Some cases have already been fully handled by recursive calls to this
+ * function. But at this point, we haven't dealt with those, but are now
+ * prepared to, knowing what the locale name to set this category to is.
+ * This would have come for free if this system had had querylocale() */
+
+# endif /* end of ! querylocale */
+
+ assert(PL_C_locale_obj);
+
+ /* Switching locales generally entails freeing the current one's space (at
+ * the C library's discretion). We need to stop using that locale before
+ * the switch. So switch to a known locale object that we don't otherwise
+ * mess with. This returns the locale object in effect at the time of the
+ * switch. */
+ old_obj = uselocale(PL_C_locale_obj);
+
+# ifdef DEBUGGING
+
+ if (DEBUG_Lv_TEST || debug_initialization) {
+ PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale was using %p\n", __FILE__, __LINE__, old_obj);
+ }
+
+# endif
+
+ if (! old_obj) {
+
+# ifdef DEBUGGING
+
+ if (DEBUG_L_TEST || debug_initialization) {
+ dSAVE_ERRNO;
+ PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale switching to C failed: %d\n", __FILE__, __LINE__, GET_ERRNO);
+ RESTORE_ERRNO;
+ }
+
+# endif
+
+ return NULL;
+ }
+
+# ifdef DEBUGGING
+
+ if (DEBUG_Lv_TEST || debug_initialization) {
+ PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale now using %p\n", __FILE__, __LINE__, PL_C_locale_obj);
+ }
+
+# endif
+
+ /* If we weren't in a thread safe locale, set so that newlocale() below
+ which uses 'old_obj', uses an empty one. Same for our reserved C object.
+ The latter is defensive coding, so that, even if there is some bug, we
+ will never end up trying to modify either of these, as if passed to
+ newlocale(), they can be. */
+ if (old_obj == LC_GLOBAL_LOCALE || old_obj == PL_C_locale_obj) {
+ old_obj = (locale_t) 0;
+ }
+
+ /* Ready to create a new locale by modification of the existing one */
+ new_obj = newlocale(mask, locale, old_obj);
+
+ if (! new_obj) {
+ dSAVE_ERRNO;
+
+# ifdef DEBUGGING
+
+ if (DEBUG_L_TEST || debug_initialization) {
+ PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale creating new object failed: %d\n", __FILE__, __LINE__, GET_ERRNO);
+ }
+
+# endif
+
+ if (! uselocale(old_obj)) {
+
+# ifdef DEBUGGING
+
+ if (DEBUG_L_TEST || debug_initialization) {
+ PerlIO_printf(Perl_debug_log, "%s:%d: switching back failed: %d\n", __FILE__, __LINE__, GET_ERRNO);
+ }
+
+# endif
+
+ }
+ RESTORE_ERRNO;
+ return NULL;
+ }
+
+# ifdef DEBUGGING
+
+ if (DEBUG_Lv_TEST || debug_initialization) {
+ PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale created %p; should have freed %p\n", __FILE__, __LINE__, new_obj, old_obj);
+ }
+
+# endif
+
+ /* And switch into it */
+ if (! uselocale(new_obj)) {
+ dSAVE_ERRNO;
+
+# ifdef DEBUGGING
+
+ if (DEBUG_L_TEST || debug_initialization) {
+ PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale switching to new object failed\n", __FILE__, __LINE__);
+ }
+
+# endif
+
+ if (! uselocale(old_obj)) {
+
+# ifdef DEBUGGING
+
+ if (DEBUG_L_TEST || debug_initialization) {
+ PerlIO_printf(Perl_debug_log, "%s:%d: switching back failed: %d\n", __FILE__, __LINE__, GET_ERRNO);
+ }
+
+# endif
+
+ }
+ freelocale(new_obj);
+ RESTORE_ERRNO;
+ return NULL;
+ }
+
+# ifdef DEBUGGING
+
+ if (DEBUG_Lv_TEST || debug_initialization) {
+ PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale now using %p\n", __FILE__, __LINE__, new_obj);
+ }
+
+# endif
+
+ /* We are done, except for updating our records (if the system doesn't keep
+ * them) and in the case of locale "", we don't actually know what the
+ * locale that got switched to is, as it came from the environment. So
+ * have to find it */
+
+# ifdef HAS_QUERYLOCALE
+
+ if (strEQ(locale, "")) {
+ locale = querylocale(mask, new_obj);
+ }
+
+# else
+
+ /* Here, 'locale' is the return value */
+
+ /* Without querylocale(), we have to update our records */
+
+ if (category == LC_ALL) {
+ unsigned int i;
+
+ /* For LC_ALL, we change all individual categories to correspond */
+ /* PL_curlocales is a parallel array, so has same
+ * length as 'categories' */
+ for (i = 0; i <= LC_ALL_INDEX; i++) {
+ Safefree(PL_curlocales[i]);
+ PL_curlocales[i] = savepv(locale);
+ }
+ }
+ else {
+
+ /* For a single category, if it's not the same as the one in LC_ALL, we
+ * nullify LC_ALL */
+
+ if (PL_curlocales[LC_ALL_INDEX] && strNE(PL_curlocales[LC_ALL_INDEX], locale)) {
+ Safefree(PL_curlocales[LC_ALL_INDEX]);
+ PL_curlocales[LC_ALL_INDEX] = NULL;
+ }
+
+ /* Then update the category's record */
+ Safefree(PL_curlocales[index]);
+ PL_curlocales[index] = savepv(locale);
+ }
+
+# endif
+
+ return locale;
+}
+
+#endif /* USE_POSIX_2008_LOCALE */
+
+#if 0 /* Code that was to emulate thread-safe locales on platforms that
+ didn't natively support them */
+
+/* The way this would work is that we would keep a per-thread list of the
+ * correct locale for that thread. Any operation that was locale-sensitive
+ * would have to be changed so that it would look like this:
+ *
+ * LOCALE_LOCK;
+ * setlocale to the correct locale for this operation
+ * do operation
+ * LOCALE_UNLOCK
+ *
+ * This leaves the global locale in the most recently used operation's, but it
+ * was locked long enough to get the result. If that result is static, it
+ * needs to be copied before the unlock.
+ *
+ * Macros could be written like SETUP_LOCALE_DEPENDENT_OP(category) that did
+ * the setup, but are no-ops when not needed, and similarly,
+ * END_LOCALE_DEPENDENT_OP for the tear-down
+ *
+ * But every call to a locale-sensitive function would have to be changed, and
+ * if a module didn't cooperate by using the mutex, things would break.
+ *
+ * This code was abandoned before being completed or tested, and is left as-is
+ *
+ * The two macros that follow route do_setlocale_c() and do_setlocale_r() to
+ * locking_setlocale() (presumably the short name for S_locking_setlocale()
+ * below -- confirm against the generated embedding headers):
+ *
+ * do_setlocale_c(cat, locale) token-pastes "_INDEX" onto 'cat' to form
+ * the third argument and passes TRUE as the
+ * fourth, so 'cat' must be spelled as a
+ * literal category name at the call site.
+ * do_setlocale_r(cat, locale) passes 0 as the third argument and FALSE
+ * as the fourth.
+*/
+
+# define do_setlocale_c(cat, locale) locking_setlocale(cat, locale, cat ## _INDEX, TRUE)
+# define do_setlocale_r(cat, locale) locking_setlocale(cat, locale, 0, FALSE)
+
+STATIC char *
+S_locking_setlocale(pTHX_
+ const int category,
+ const char * locale,
+ int index,
+ const bool is_index_valid
+ )
+{
+ /* This function kind of performs a setlocale() on just the current thread;
+ * thus it is kind of thread-safe. It does this by keeping a thread-level
+ * array of the current locales for each category. Every time a locale is
+ * switched to, it does the switch globally, but updates the thread's
+ * array. A query as to what the current locale is just returns the
+ * appropriate element from the array, and doesn't actually call the system
+ * setlocale(). The saving into the array is done in an uninterruptible
+ * section of code, so is unaffected by whatever any other threads might be
+ * doing.
+ *
+ * All locale-sensitive operations must work by first starting a critical
+ * section, then switching to the thread's locale as kept by this function,
+ * and then doing the operation, then ending the critical section. Thus,
+ * each gets done in the appropriate locale, simulating thread-safety.
+ *
+ * This function takes the same parameters, 'category' and 'locale', that
+ * the regular setlocale() function does, but it also takes two additional
+ * ones, 'index' and 'is_index_valid'. If we know on input the index
+ * corresponding to the category into the array where we store the current
+ * locales, we don't have to calculate it. If the caller knows at compile
+ * time what the index is, it can pass it, setting 'is_index_valid' to
+ * TRUE; otherwise the index parameter is ignored and the index is found by
+ * a linear search of 'categories'.
+ *
+ * Returns the array element for the category, or, for a category that is
+ * not in 'categories', whatever my_setlocale() returns.
+ *
+ * NOTE(review): this block sits inside '#if 0' and would not compile if
+ * that guard were removed. Things a future reviver needs to address:
+ * - the body refers to 'new_locale', but the parameter is named 'locale'
+ * and nothing called 'new_locale' is declared in this function;
+ * - 'curlocales' is not declared in this function (presumably it is meant
+ * to be the per-thread array described above -- confirm);
+ * - 'Safefree(curlocales[[index]])' has doubled brackets;
+ * - the 'for' header in the LC_ALL branch has only two clauses, so it is
+ * a syntax error and would never advance 'i';
+ * - the closing brace of the 'if (strEQ(new_locale, ""))' is inside
+ * '#ifdef LC_ALL' while its opening brace is outside, so the braces
+ * are unbalanced when LC_ALL is not defined;
+ * - the single-category store wraps the my_setlocale() result in
+ * savepv(), and the old entry is Safefree()d, but the LC_ALL loop
+ * stores the raw my_setlocale() return without either;
+ * - the Safefree() of the old entry happens before LOCALE_LOCK is taken.
+ */
+
+ /* If the input index might be incorrect, calculate the correct one */
+ if (! is_index_valid) {
+ unsigned int i;
+
+ if (DEBUG_Lv_TEST || debug_initialization) {
+ PerlIO_printf(Perl_debug_log, "%s:%d: converting category %d to index\n", __FILE__, __LINE__, category);
+ }
+
+ for (i = 0; i <= LC_ALL_INDEX; i++) {
+ if (category == categories[i]) {
+ index = i;
+ goto found_index;
+ }
+ }
+
+ /* Here, we don't know about this category, so can't handle it.
+ * XXX best we can do is to unsafely set this
+ * XXX warning */
+
+ return my_setlocale(category, locale);
+
+ found_index: ;
+
+ if (DEBUG_Lv_TEST || debug_initialization) {
+ PerlIO_printf(Perl_debug_log, "%s:%d: index is 0x%x\n", __FILE__, __LINE__, index);
+ }
+ }
+
+ /* For a query, just return what's in our records */
+ if (new_locale == NULL) {
+ return curlocales[index];
+ }
+
+
+ /* Otherwise, we need to do the switch, and save the result, all in a
+ * critical section */
+
+ Safefree(curlocales[[index]]);
+
+ /* It might be that this is called from an already-locked section of code.
+ * We would have to detect and skip the LOCK/UNLOCK if so */
+ LOCALE_LOCK;
+
+ curlocales[index] = savepv(my_setlocale(category, new_locale));
+
+ if (strEQ(new_locale, "")) {
+
+#ifdef LC_ALL
+
+ /* The locale values come from the environment, and may not all be the
+ * same, so for LC_ALL, we have to update all the others, while the
+ * mutex is still locked */
+
+ if (category == LC_ALL) {
+ unsigned int i;
+ for (i = 0; i < LC_ALL_INDEX) {
+ curlocales[i] = my_setlocale(categories[i], NULL);
+ }
+ }
+ }
+
+#endif
+
+ LOCALE_UNLOCK;
+
+ return curlocales[index];
+}
+
+#endif
+#ifdef USE_LOCALE
+
+STATIC void
+S_set_numeric_radix(pTHX_ const bool use_locale)
+{
+    /* Store the radix (decimal point) string into PL_numeric_radix_sv.
+     *
+     *  use_locale  FALSE => store a plain dot
+     *              TRUE  => store whatever my_nl_langinfo() reports as
+     *                       RADIXCHAR for the locale currently in effect
+     *
+     * When the platform offers no way to find the radix character (or
+     * LC_NUMERIC handling is compiled out) this function does nothing. */
+
+#if defined(USE_LOCALE_NUMERIC) && (defined(HAS_LOCALECONV) || defined(HAS_NL_LANGINFO))
+
+    const char * radix_str = ".";
+
+    if (use_locale) {
+
+        /* FALSE => we are already in the destination locale, so no toggling
+         * is wanted */
+        radix_str = my_nl_langinfo(RADIXCHAR, FALSE);
+    }
+
+    sv_setpv(PL_numeric_radix_sv, radix_str);
+
+    /* Flag the SV as UTF-8 only when both hold: the stored bytes are
+     * well-formed UTF-8 containing at least one non-ASCII character, and
+     * LC_NUMERIC is a UTF-8 locale.  The locale query is made only if the
+     * string test passes. */
+    if (is_utf8_non_invariant_string((U8 *) SvPVX(PL_numeric_radix_sv),
+                                     SvCUR(PL_numeric_radix_sv)))
+    {
+        if (_is_cur_LC_category_utf8(LC_NUMERIC)) {
+            SvUTF8_on(PL_numeric_radix_sv);
+        }
+    }
+
+#  ifdef DEBUGGING
+
+    if (DEBUG_L_TEST || debug_initialization) {
+        PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
+                                      SvPVX(PL_numeric_radix_sv),
+                                      cBOOL(SvUTF8(PL_numeric_radix_sv)));
+    }
+
+#  endif
+#else
+
+    PERL_UNUSED_ARG(use_locale);
+
+#endif /* USE_LOCALE_NUMERIC and can find the radix char */
+
+}
+
+STATIC void
+S_new_numeric(pTHX_ const char *newnum)
+{
+
+#ifndef USE_LOCALE_NUMERIC
+
+    PERL_UNUSED_ARG(newnum);
+
+#else
+
+    /* To be called after every libc setlocale() that affects LC_NUMERIC;
+     * 'newnum' is the name of the locale just installed, which becomes the
+     * program's underlying LC_NUMERIC locale.  A NULL 'newnum' discards the
+     * recorded name and marks everything as standard.
+     *
+     * Perl flips LC_NUMERIC between that underlying locale and C (see
+     * set_numeric_underlying() and set_numeric_standard(), normally reached
+     * through the SET_NUMERIC_xxx() macros in perl.h) so that a non-dot radix
+     * can be output while internal calculations still see a dot.
+     *
+     * Interpreter variables maintained here:
+     *  PL_numeric_name         Private copy of the (standardized) locale
+     *                          name
+     *  PL_numeric_underlying   TRUE when the toggle is currently on the
+     *                          underlying locale
+     *  PL_numeric_standard     Non-zero when the toggle is currently on C,
+     *                          or on something indistinguishable from C; a
+     *                          value > 1 means it may not be toggled away
+     *  PL_numeric_underlying_is_standard
+     *                          Cached answer to "is the underlying locale
+     *                          indistinguishable from C for LC_NUMERIC?",
+     *                          in which case toggling is a no-op
+     */
+
+    char *std_name;
+
+    if (newnum == NULL) {
+        Safefree(PL_numeric_name);
+        PL_numeric_name = NULL;
+        PL_numeric_underlying_is_standard = TRUE;
+        PL_numeric_underlying = TRUE;
+        PL_numeric_standard = TRUE;
+        return;
+    }
+
+    std_name = stdize_locale(savepv(newnum));
+    PL_numeric_underlying = TRUE;
+    PL_numeric_standard = isNAME_C_OR_POSIX(std_name);
+
+#ifndef TS_W32_BROKEN_LOCALECONV
+
+    /* A locale not named C or POSIX may nonetheless behave identically for
+     * our purposes: dot for the radix and no thousands separator.  This
+     * probe is skipped on broken Windows systems, where asking
+     * my_nl_langinfo() for THOUSEP can (rarely) race; there any locale not
+     * named C/POSIX is simply treated as different.  The FALSE arguments
+     * mean "don't toggle the locale". */
+    if (   ! PL_numeric_standard
+        && strEQ(".", my_nl_langinfo(RADIXCHAR, FALSE))
+        && strEQ("", my_nl_langinfo(THOUSEP, FALSE)))
+    {
+        PL_numeric_standard = TRUE;
+    }
+
+#endif
+
+    /* Adopt the new copy only when the name actually changed; otherwise the
+     * copy is surplus and is released */
+    if (PL_numeric_name && strEQ(PL_numeric_name, std_name)) {
+        Safefree(std_name);
+    }
+    else {
+        Safefree(PL_numeric_name);
+        PL_numeric_name = std_name;
+    }
+
+    PL_numeric_underlying_is_standard = PL_numeric_standard;
+
+#  ifdef HAS_POSIX_2008_LOCALE
+
+    PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
+                                          PL_numeric_name,
+                                          PL_underlying_numeric_obj);
+
+#endif
+
+    if (DEBUG_L_TEST || debug_initialization) {
+        PerlIO_printf(Perl_debug_log, "Called new_numeric with %s, PL_numeric_name=%s\n", newnum, PL_numeric_name);
+    }
+
+    /* LC_NUMERIC is left in the C locale so that XS modules need not worry
+     * about a non-dot radix; core operations that want the underlying locale
+     * switch to it temporarily.  When the underlying locale is not
+     * equivalent to C we toggle to C; otherwise only the radix is set. */
+    if (! PL_numeric_standard) {
+        set_numeric_standard();
+    }
+    else {
+        set_numeric_radix(0);
+    }
+
+#endif /* USE_LOCALE_NUMERIC */
+
+}
+
+void
+Perl_set_numeric_standard(pTHX)
+{
+
+#ifdef USE_LOCALE_NUMERIC
+
+    /* Put LC_NUMERIC into the C locale and bring the toggle records and the
+     * radix SV into line with that.  Callers should normally go through the
+     * SET_NUMERIC_STANDARD() family of macros in perl.h, which skip the call
+     * when our records say no toggle is needed (records that can be wrong if
+     * XS code changed the locale behind our back). */
+
+#  ifdef DEBUGGING
+
+    if (DEBUG_L_TEST || debug_initialization) {
+        PerlIO_printf(Perl_debug_log,
+                      "Setting LC_NUMERIC locale to standard C\n");
+    }
+
+#  endif
+
+    do_setlocale_c(LC_NUMERIC, "C");
+
+    /* We are on the underlying locale only if it is indistinguishable from
+     * C */
+    PL_numeric_underlying = PL_numeric_underlying_is_standard;
+    PL_numeric_standard = TRUE;
+
+    set_numeric_radix(FALSE);
+
+#endif /* USE_LOCALE_NUMERIC */
+
+}
+
+void
+Perl_set_numeric_underlying(pTHX)
+{
+
+#ifdef USE_LOCALE_NUMERIC
+
+    /* Put LC_NUMERIC into the locale named by PL_numeric_name and bring the
+     * toggle records and the radix SV into line with that.  Callers should
+     * normally go through the SET_NUMERIC_UNDERLYING() family of macros in
+     * perl.h, which skip the call when our records say no toggle is needed
+     * (records that can be wrong if XS code changed the locale behind our
+     * back). */
+
+#  ifdef DEBUGGING
+
+    if (DEBUG_L_TEST || debug_initialization) {
+        PerlIO_printf(Perl_debug_log,
+                      "Setting LC_NUMERIC locale to %s\n",
+                      PL_numeric_name);
+    }
+
+#  endif
+
+    do_setlocale_c(LC_NUMERIC, PL_numeric_name);
+
+    PL_numeric_underlying = TRUE;
+
+    /* We count as being in the standard locale too when the underlying one
+     * is indistinguishable from C */
+    PL_numeric_standard = PL_numeric_underlying_is_standard;
+
+    /* Only a locale that differs from C needs its radix looked up */
+    set_numeric_radix(! PL_numeric_underlying_is_standard);
+
+#endif /* USE_LOCALE_NUMERIC */
+
+}
+
+/*
+ * Set up for a new ctype locale.
+ */
+STATIC void
+S_new_ctype(pTHX_ const char *newctype)
+{
+
+#ifndef USE_LOCALE_CTYPE
+
+    PERL_UNUSED_ARG(newctype);
+    PERL_UNUSED_CONTEXT;
+
+#else
+
+    /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
+     * core Perl this and that 'newctype' is the name of the new locale.
+     *
+     * This function sets up the folding arrays for all 256 bytes, assuming
+     * that tofold() is tolc() since fold case is not a concept in POSIX,
+     *
+     * Any code changing the locale (outside this file) should use
+     * Perl_setlocale or POSIX::setlocale, which call this function.  Therefore
+     * this function should be called directly only from this file and from
+     * POSIX::setlocale()
+     *
+     * Interpreter state written here:
+     *  PL_in_utf8_CTYPE_locale   whether LC_CTYPE is now a UTF-8 locale
+     *  PL_in_utf8_turkic_locale  whether it is additionally a Turkic one;
+     *                            always recomputed, so a value left over
+     *                            from a previous locale cannot survive
+     *  PL_fold_locale[]          the 256-entry fold table
+     *  PL_warn_locale            NULL, or an SV holding a warning about the
+     *                            new locale that has not yet been output */
+
+    dVAR;
+    unsigned int i;
+
+    /* Don't check for problems if we are suppressing the warnings */
+    bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
+    bool maybe_utf8_turkic = FALSE;
+
+    PERL_ARGS_ASSERT_NEW_CTYPE;
+
+    /* We will replace any bad locale warning with 1) nothing if the new one is
+     * ok; or 2) a new warning for the bad new locale */
+    if (PL_warn_locale) {
+        SvREFCNT_dec_NN(PL_warn_locale);
+        PL_warn_locale = NULL;
+    }
+
+    PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
+
+    /* A UTF-8 locale gets standard rules.  But note that code still has to
+     * handle this specially because of the three problematic code points */
+    if (PL_in_utf8_CTYPE_locale) {
+        Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
+
+        /* UTF-8 locales can have special handling for 'I' and 'i' if they are
+         * Turkic.  Make sure these two are the only anomalies.  (We don't use
+         * towupper and towlower because they aren't in C89.) */
+        if (toupper('i') == 'i' && tolower('I') == 'I') {
+            check_for_problems = TRUE;
+            maybe_utf8_turkic = TRUE;
+        }
+    }
+
+    /* We don't populate the other lists if a UTF-8 locale, but do check that
+     * everything works as expected, unless checking turned off */
+    if (check_for_problems || ! PL_in_utf8_CTYPE_locale) {
+        /* Assume enough space for every character being bad.  4 spaces each
+         * for the 94 printable characters that are output like "'x' "; and 5
+         * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
+         * NUL.  (The entries actually appended below are shorter than that,
+         * so this is a generous upper bound; and the appends are bounded by
+         * sizeof(bad_chars_list) in any case.) */
+        char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
+        bool multi_byte_locale = FALSE;     /* Assume is a single-byte locale
+                                               to start */
+        unsigned int bad_count = 0;         /* Count of bad characters */
+
+        for (i = 0; i < 256; i++) {
+            if (! PL_in_utf8_CTYPE_locale) {
+                if (isupper(i))
+                    PL_fold_locale[i] = (U8) tolower(i);
+                else if (islower(i))
+                    PL_fold_locale[i] = (U8) toupper(i);
+                else
+                    PL_fold_locale[i] = (U8) i;
+            }
+
+            /* If checking for locale problems, see if the native ASCII-range
+             * printables plus \n and \t are in their expected categories in
+             * the new locale.  If not, this could mean big trouble, upending
+             * Perl's and most programs' assumptions, like having a
+             * metacharacter with special meaning become a \w.  Fortunately,
+             * it's very rare to find locales that aren't supersets of ASCII
+             * nowadays.  It isn't a problem for most controls to be changed
+             * into something else; we check only \n and \t, though perhaps \r
+             * could be an issue as well. */
+            if (    check_for_problems
+                && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
+            {
+                bool is_bad = FALSE;
+                char name[4] = { '\0' };
+
+                /* Convert the name into a string */
+                if (isGRAPH_A(i)) {
+                    name[0] = i;
+                    name[1] = '\0';
+                }
+                else if (i == '\n') {
+                    my_strlcpy(name, "\\n", sizeof(name));
+                }
+                else if (i == '\t') {
+                    my_strlcpy(name, "\\t", sizeof(name));
+                }
+                else {
+                    assert(i == ' ');
+                    my_strlcpy(name, "' '", sizeof(name));
+                }
+
+                /* Check each possible class */
+                if (UNLIKELY(cBOOL(isalnum(i)) != cBOOL(isALPHANUMERIC_A(i)))) {
+                    is_bad = TRUE;
+                    DEBUG_L(PerlIO_printf(Perl_debug_log,
+                                          "isalnum('%s') unexpectedly is %d\n",
+                                          name, cBOOL(isalnum(i))));
+                }
+                if (UNLIKELY(cBOOL(isalpha(i)) != cBOOL(isALPHA_A(i)))) {
+                    is_bad = TRUE;
+                    DEBUG_L(PerlIO_printf(Perl_debug_log,
+                                          "isalpha('%s') unexpectedly is %d\n",
+                                          name, cBOOL(isalpha(i))));
+                }
+                if (UNLIKELY(cBOOL(isdigit(i)) != cBOOL(isDIGIT_A(i)))) {
+                    is_bad = TRUE;
+                    DEBUG_L(PerlIO_printf(Perl_debug_log,
+                                          "isdigit('%s') unexpectedly is %d\n",
+                                          name, cBOOL(isdigit(i))));
+                }
+                if (UNLIKELY(cBOOL(isgraph(i)) != cBOOL(isGRAPH_A(i)))) {
+                    is_bad = TRUE;
+                    DEBUG_L(PerlIO_printf(Perl_debug_log,
+                                          "isgraph('%s') unexpectedly is %d\n",
+                                          name, cBOOL(isgraph(i))));
+                }
+                if (UNLIKELY(cBOOL(islower(i)) != cBOOL(isLOWER_A(i)))) {
+                    is_bad = TRUE;
+                    DEBUG_L(PerlIO_printf(Perl_debug_log,
+                                          "islower('%s') unexpectedly is %d\n",
+                                          name, cBOOL(islower(i))));
+                }
+                if (UNLIKELY(cBOOL(isprint(i)) != cBOOL(isPRINT_A(i)))) {
+                    is_bad = TRUE;
+                    DEBUG_L(PerlIO_printf(Perl_debug_log,
+                                          "isprint('%s') unexpectedly is %d\n",
+                                          name, cBOOL(isprint(i))));
+                }
+                if (UNLIKELY(cBOOL(ispunct(i)) != cBOOL(isPUNCT_A(i)))) {
+                    is_bad = TRUE;
+                    DEBUG_L(PerlIO_printf(Perl_debug_log,
+                                          "ispunct('%s') unexpectedly is %d\n",
+                                          name, cBOOL(ispunct(i))));
+                }
+                if (UNLIKELY(cBOOL(isspace(i)) != cBOOL(isSPACE_A(i)))) {
+                    is_bad = TRUE;
+                    DEBUG_L(PerlIO_printf(Perl_debug_log,
+                                          "isspace('%s') unexpectedly is %d\n",
+                                          name, cBOOL(isspace(i))));
+                }
+                if (UNLIKELY(cBOOL(isupper(i)) != cBOOL(isUPPER_A(i)))) {
+                    is_bad = TRUE;
+                    DEBUG_L(PerlIO_printf(Perl_debug_log,
+                                          "isupper('%s') unexpectedly is %d\n",
+                                          name, cBOOL(isupper(i))));
+                }
+                if (UNLIKELY(cBOOL(isxdigit(i))!= cBOOL(isXDIGIT_A(i)))) {
+                    is_bad = TRUE;
+                    DEBUG_L(PerlIO_printf(Perl_debug_log,
+                                          "isxdigit('%s') unexpectedly is %d\n",
+                                          name, cBOOL(isxdigit(i))));
+                }
+                if (UNLIKELY(tolower(i) != (int) toLOWER_A(i))) {
+                    is_bad = TRUE;
+                    DEBUG_L(PerlIO_printf(Perl_debug_log,
+                            "tolower('%s')=0x%x instead of the expected 0x%x\n",
+                            name, tolower(i), (int) toLOWER_A(i)));
+                }
+                if (UNLIKELY(toupper(i) != (int) toUPPER_A(i))) {
+                    is_bad = TRUE;
+                    DEBUG_L(PerlIO_printf(Perl_debug_log,
+                            "toupper('%s')=0x%x instead of the expected 0x%x\n",
+                            name, toupper(i), (int) toUPPER_A(i)));
+                }
+                if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i)))) {
+                    is_bad = TRUE;
+                    DEBUG_L(PerlIO_printf(Perl_debug_log,
+                                "'\\n' (=%02X) is not a control\n", (int) i));
+                }
+
+                /* Add to the list; Separate multiple entries with a blank */
+                if (is_bad) {
+                    if (bad_count) {
+                        my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list));
+                    }
+                    my_strlcat(bad_chars_list, name, sizeof(bad_chars_list));
+                    bad_count++;
+                }
+            }
+        }
+
+        /* In a UTF-8 locale whose toupper('i')/tolower('I') looked Turkic,
+         * exactly two bad characters are taken to be just 'I' and 'i'; that
+         * is an expected Turkic locale, not a broken one, so forget the
+         * complaints and make those two fold to themselves */
+        if (bad_count == 2 && maybe_utf8_turkic) {
+            bad_count = 0;
+            *bad_chars_list = '\0';
+            PL_fold_locale['I'] = 'I';
+            PL_fold_locale['i'] = 'i';
+            PL_in_utf8_turkic_locale = TRUE;
+            DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s is turkic\n",
+                                                 __FILE__, __LINE__, newctype));
+        }
+        else {
+            PL_in_utf8_turkic_locale = FALSE;
+        }
+
+#  ifdef MB_CUR_MAX
+
+        /* We only handle single-byte locales (outside of UTF-8 ones; so if
+         * this locale requires more than one byte, there are going to be
+         * problems. */
+        DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+                 "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
+                 __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
+
+        if (   check_for_problems && MB_CUR_MAX > 1
+            && ! PL_in_utf8_CTYPE_locale
+
+               /* Some platforms return MB_CUR_MAX > 1 for even the "C"
+                * locale.  Just assume that the implementation for them (plus
+                * for POSIX) is correct and the > 1 value is spurious.  (Since
+                * these are specially handled to never be considered UTF-8
+                * locales, as long as this is the only problem, everything
+                * should work fine */
+            && strNE(newctype, "C") && strNE(newctype, "POSIX"))
+        {
+            multi_byte_locale = TRUE;
+        }
+
+#  endif
+
+        /* If we found problems and we want them output, do so */
+        if (   (UNLIKELY(bad_count) || UNLIKELY(multi_byte_locale))
+            && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
+        {
+            if (UNLIKELY(bad_count) && PL_in_utf8_CTYPE_locale) {
+                PL_warn_locale = Perl_newSVpvf(aTHX_
+                     "Locale '%s' contains (at least) the following characters"
+                     " which have\nunexpected meanings: %s\nThe Perl program"
+                     " will use the expected meanings",
+                      newctype, bad_chars_list);
+            }
+            else {
+                PL_warn_locale = Perl_newSVpvf(aTHX_
+                             "Locale '%s' may not work well.%s%s%s\n",
+                             newctype,
+                             (multi_byte_locale)
+                              ? " Some characters in it are not recognized by"
+                                " Perl."
+                              : "",
+                             (bad_count)
+                              ? "\nThe following characters (and maybe others)"
+                                " may not have the same meaning as the Perl"
+                                " program expects:\n"
+                              : "",
+                             (bad_count)
+                              ? bad_chars_list
+                              : ""
+                            );
+            }
+
+#  ifdef HAS_NL_LANGINFO
+
+            Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s",
+                                    /* parameter FALSE is a don't care here */
+                                    my_nl_langinfo(CODESET, FALSE));
+
+#  endif
+
+            Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n");
+
+            /* If we are actually in the scope of the locale or are debugging,
+             * output the message now.  If not in that scope, we save the
+             * message to be output at the first operation using this locale,
+             * if that actually happens.  Most programs don't use locales, so
+             * they are immune to bad ones.  */
+            if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
+
+                /* The text was assembled above from the locale name and the
+                 * list of misbehaving characters, either of which may
+                 * contain a '%'.  It is therefore passed as the argument to
+                 * a fixed "%s" pattern, never as the pattern itself. */
+                Perl_warner(aTHX_ packWARN(WARN_LOCALE),
+                                  "%s", SvPVX(PL_warn_locale));
+
+                if (IN_LC(LC_CTYPE)) {
+                    SvREFCNT_dec_NN(PL_warn_locale);
+                    PL_warn_locale = NULL;
+                }
+            }
+        }
+    }
+    else {
+
+        /* Getting here means this is a UTF-8 locale that did not look Turkic
+         * above (looking Turkic forces check_for_problems on), so make sure
+         * a TRUE left by a previously installed locale doesn't linger */
+        PL_in_utf8_turkic_locale = FALSE;
+    }
+
+#endif /* USE_LOCALE_CTYPE */
+
+}
+
+void
+Perl__warn_problematic_locale()
+{
+
+#ifdef USE_LOCALE_CTYPE
+
+    dTHX;
+
+    /* Internal-to-core function that outputs the message in PL_warn_locale,
+     * and then NULLS it.  Should be called only through the macro
+     * _CHECK_AND_WARN_PROBLEMATIC_LOCALE.  Does nothing when there is no
+     * pending message. */
+
+    if (PL_warn_locale) {
+
+        /* The saved text is data, not a pattern: it can contain a '%' that
+         * must not be interpreted as a conversion, so it is handed over as
+         * the argument to a fixed "%s" */
+        Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+                             "%s", SvPVX(PL_warn_locale));
+        SvREFCNT_dec_NN(PL_warn_locale);
+        PL_warn_locale = NULL;
+    }
+
+#endif
+
+}
+
+STATIC void
+S_new_collate(pTHX_ const char *newcoll)
+{
+
+#ifndef USE_LOCALE_COLLATE
+
+ PERL_UNUSED_ARG(newcoll);
+ PERL_UNUSED_CONTEXT;
+
+#else
+
+ /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
+ * core Perl this and that 'newcoll' is the name of the new locale.
+ *
+ * The design of locale collation is that every locale change is given an
+ * index 'PL_collation_ix'. The first time a string participates in an
+ * operation that requires collation while locale collation is active, it
+ * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
+ * magic includes the collation index, and the transformation of the string
+ * by strxfrm(), q.v. That transformation is used when doing comparisons,
+ * instead of the string itself. If a string changes, the magic is
+ * cleared. The next time the locale changes, the index is incremented,
+ * and so we know during a comparison that the transformation is not
+ * necessarily still valid, and so is recomputed. Note that if the locale
+ * changes enough times, the index could wrap (a U32), and it is possible
+ * that a transformation would improperly be considered valid, leading to
+ * an unlikely bug */
+
+ if (! newcoll) {
+ if (PL_collation_name) {
+ ++PL_collation_ix;
+ Safefree(PL_collation_name);
+ PL_collation_name = NULL;
+ }
+ PL_collation_standard = TRUE;
+ is_standard_collation:
+ PL_collxfrm_base = 0;
+ PL_collxfrm_mult = 2;
+ PL_in_utf8_COLLATE_locale = FALSE;
+ PL_strxfrm_NUL_replacement = '\0';
+ PL_strxfrm_max_cp = 0;
+ return;
+ }
+
+ /* If this is not the same locale as currently, set the new one up */
+ if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
+ ++PL_collation_ix;
+ Safefree(PL_collation_name);
+ PL_collation_name = stdize_locale(savepv(newcoll));
+ PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
+ if (PL_collation_standard) {
+ goto is_standard_collation;
+ }
+
+ PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
+ PL_strxfrm_NUL_replacement = '\0';
+ PL_strxfrm_max_cp = 0;
+
+ /* A locale collation definition includes primary, secondary, tertiary,
+ * etc. weights for each character. To sort, the primary weights are
+ * used, and only if they compare equal, then the secondary weights are
+ * used, and only if they compare equal, then the tertiary, etc.
+ *
+ * strxfrm() works by taking the input string, say ABC, and creating an
+ * output transformed string consisting of first the primary weights,
+ * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
+ * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
+ * may not have weights at every level. In our example, let's say B
+ * doesn't have a tertiary weight, and A doesn't have a secondary
+ * weight. The constructed string is then going to be
+ * A¹B¹C¹ B²C² A³C³ ....
+ * This has the desired effect that strcmp() will look at the secondary
+ * or tertiary weights only if the strings compare equal at all higher
+ * priority weights. The spaces shown here, like in
+ * "A¹B¹C¹ A²B²C² "
+ * are not just for readability. In the general case, these must
+ * actually be bytes, which we will call here 'separator weights'; and
+ * they must be smaller than any other weight value, but since these
+ * are C strings, only the terminating one can be a NUL (some
+ * implementations may include a non-NUL separator weight just before
+ * the NUL). Implementations tend to reserve 01 for the separator
+ * weights. They are needed so that a shorter string's secondary
+ * weights won't be misconstrued as primary weights of a longer string,
+ * etc. By making them smaller than any other weight, the shorter
+ * string will sort first. (Actually, if all secondary weights are
+ * smaller than all primary ones, there is no need for a separator
+ * weight between those two levels, etc.)
+ *
+ * The length of the transformed string is roughly a linear function of
+ * the input string. It's not exactly linear because some characters
+ * don't have weights at all levels. When we call strxfrm() we have to
+ * allocate some memory to hold the transformed string. The
+ * calculations below try to find coefficients 'm' and 'b' for this
+ * locale so that m*x + b equals how much space we need, given the size
+ * of the input string in 'x'. If we calculate too small, we increase
+ * the size as needed, and call strxfrm() again, but it is better to
+ * get it right the first time to avoid wasted expensive string
+ * transformations. */
+
+ {
+ /* We use the string below to find how long the transformation of it
+ * is. Almost all locales are supersets of ASCII, or at least the
+ * ASCII letters. We use all of them, half upper half lower,
+ * because if we used fewer, we might hit just the ones that are
+ * outliers in a particular locale. Most of the strings being
+ * collated will contain a preponderance of letters, and even if
+ * they are above-ASCII, they are likely to have the same number of
+ * weight levels as the ASCII ones. It turns out that digits tend
+ * to have fewer levels, and some punctuation has more, but those
+ * are relatively sparse in text, and khw believes this gives a
+ * reasonable result, but it could be changed if experience so
+ * dictates. */
+ const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
+ char * x_longer; /* Transformed 'longer' */
+ Size_t x_len_longer; /* Length of 'x_longer' */
+
+ char * x_shorter; /* We also transform a substring of 'longer' */
+ Size_t x_len_shorter;
+
+ /* _mem_collxfrm() is used to get the transformation (though here we
+ * are interested only in its length). It is used because it has
+ * the intelligence to handle all cases, but to work, it needs some
+ * values of 'm' and 'b' to get it started. For the purposes of
+ * this calculation we use a very conservative estimate of 'm' and
+ * 'b'. This assumes a weight can be multiple bytes, enough to
+ * hold any UV on the platform, and there are 5 levels, 4 weight
+ * bytes, and a trailing NUL. */
+ PL_collxfrm_base = 5;
+ PL_collxfrm_mult = 5 * sizeof(UV);
+
+ /* Find out how long the transformation really is */
+ x_longer = _mem_collxfrm(longer,
+ sizeof(longer) - 1,
+ &x_len_longer,
+
+ /* We avoid converting to UTF-8 in the
+ * called function by telling it the
+ * string is in UTF-8 if the locale is a
+ * UTF-8 one. Since the string passed
+ * here is invariant under UTF-8, we can
+ * claim it's UTF-8 even though it isn't.
+ * */
+ PL_in_utf8_COLLATE_locale);
+ Safefree(x_longer);
+
+ /* Find out how long the transformation of a substring of 'longer'
+ * is. Together the lengths of these transformations are
+ * sufficient to calculate 'm' and 'b'. The substring is all of
+ * 'longer' except the first character. This minimizes the chances
+ * of being swayed by outliers */
+ x_shorter = _mem_collxfrm(longer + 1,
+ sizeof(longer) - 2,
+ &x_len_shorter,
+ PL_in_utf8_COLLATE_locale);
+ Safefree(x_shorter);
+
+ /* If the results are nonsensical for this simple test, the whole
+ * locale definition is suspect. Mark it so that locale collation
+ * is not active at all for it. XXX Should we warn? */
+ if ( x_len_shorter == 0
+ || x_len_longer == 0
+ || x_len_shorter >= x_len_longer)
+ {
+ PL_collxfrm_mult = 0;
+ PL_collxfrm_base = 0;
+ }
+ else {
+ SSize_t base; /* Temporary */
+
+ /* We have both: m * strlen(longer) + b = x_len_longer
+ * m * strlen(shorter) + b = x_len_shorter;
+ * subtracting yields:
+ * m * (strlen(longer) - strlen(shorter))
+ * = x_len_longer - x_len_shorter
+ * But we have set things up so that 'shorter' is 1 byte smaller
+ * than 'longer'. Hence:
+ * m = x_len_longer - x_len_shorter
+ *
+ * But if something went wrong, make sure the multiplier is at
+ * least 1.