* setlocale() knows about, there is a layer to cope with that.
* b) stdized_setlocale() is a layer above a) that fixes some vagaries in
* the return value of the libc setlocale(). On most platforms this
- * layer is empty; it requires perl to be Configured with a parameter
- * indicating the platform's defect, in order to be activated. The
+ * layer is empty; in order to be activated, it requires perl to be
+ * Configured with a parameter indicating the platform's defect. The
* current ones are listed at the definition of the macro.
*
* 2) An implementation that adds a minimal layer above implementation 1),
a single instance, so is a #define */
static const char C_decimal_point[] = ".";
+#if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO)
+# define HAS_SOME_LANGINFO
+#endif
+
#if (defined(USE_LOCALE_NUMERIC) && ! defined(TS_W32_BROKEN_LOCALECONV)) \
|| ! ( defined(USE_LOCALE_NUMERIC) \
&& (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV)))
&& (( *(name) == 'C' && (*(name + 1)) == '\0') \
|| strEQ((name), "POSIX")))
-#if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO)
-# define HAS_SOME_LANGINFO
-#endif
-
#define my_langinfo_c(item, category, locale, retbufp, retbuf_sizep, utf8ness) \
my_langinfo_i(item, category##_INDEX_, locale, retbufp, \
retbuf_sizep, utf8ness)
return FALSE;
+ /* Definitively, can't be UTF-8 */
+# define HAS_DEFINITIVE_UTF8NESS_DETERMINATION
# else
/* If the input happens to be the same locale as we are currently setup
);
SV * sv = NULL;
if (retval) {
- STRLEN len = strlen(retval);
- sv = newSV(len);
- sv_usepvn_flags(sv, retval, len, SV_HAS_TRAILING_NUL);
+ sv = newSV_type(SVt_PV);
+ sv_usepvn_flags(sv, retval, strlen(retval), SV_HAS_TRAILING_NUL);
if (result_utf8ness == UTF8NESS_YES) {
SvUTF8_on(sv);
}
#ifdef USE_LOCALE
+# ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
+
+/* Forward declaration of a function deliberately kept out of embed.fnc, to
+ * make it easier to remove, as there may not be any extant platforms that
+ * need it. It is defined after S_my_langinfo_i() because it is easier to
+ * understand when read in the context of that code */
+STATIC const char * S_override_codeset_if_utf8_found(pTHX_
+ const char *codeset,
+ const char *locale);
+# endif
/* There are several implementations of my_langinfo, depending on the
* Configuration. They all share the same beginning of the function */
* value, as documented */
utf8ness_t * utf8ness)
{
- const char * retval = NULL;
-
PERL_ARGS_ASSERT_MY_LANGINFO_I;
assert(cat_index < LC_ALL_INDEX_);
* isn't, or vice versa). There is explicit code to bring the categories into
* sync. This doesn't seem to be a problem with nl_langinfo(), so that
* implementation doesn't currently worry about it. But it is a problem on
- * Windows boxes, which don't have nl_langinfo(). */
+ * Windows boxes, which don't have nl_langinfo().
+ *
+ * One might be tempted to avoid any toggling by instead using nl_langinfo_l()
+ * on platforms that have it. This would entail creating a locale object with
+ * newlocale() and freeing it afterwards. But doing so runs significantly
+ * slower than just doing the toggle ourselves. lib/locale_threads.t was
+ * slowed down by 25% on Ubuntu 22.04 */
/*--------------------------------------------------------------------------*/
# if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
gwLOCALE_LOCK;
- retval = save_to_buffer(nl_langinfo(item), retbufp, retbuf_sizep);
+ const char * retval = save_to_buffer(nl_langinfo(item), retbufp, retbuf_sizep);
gwLOCALE_UNLOCK;
if (utf8ness) {
* but they are not exposed. Also calling setlocale(), then calling
* GetThreadLocale() doesn't work, as the former doesn't change the
* latter's return. Therefore we are stuck using the mechanisms below. */
-
-# ifdef WE_MUST_DEAL_WITH_MISMATCHED_CTYPE
-
- const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
-
-# endif
-
- const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
-
- /* Here, we are in the locale we want information about */
-
/* Almost all the items will have ASCII return values. Set that here, and
* override if necessary */
utf8ness_t is_utf8 = UTF8NESS_IMMATERIAL;
+ const char * retval = NULL;
switch (item) {
default:
Newx(floatbuf, initial_size, char);
+# if defined(WE_MUST_DEAL_WITH_MISMATCHED_CTYPE)
+ const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
+# endif
+
+ const char * orig_NUMERIC_locale = toggle_locale_c(LC_NUMERIC,
+ locale);
/* 1.5 is exactly representable on binary computers */
Size_t needed_size = snprintf(floatbuf, initial_size, "%.1f", 1.5);
needed_size = new_needed;
}
+ restore_toggled_locale_c(LC_NUMERIC, orig_NUMERIC_locale);
+
+# if defined(WE_MUST_DEAL_WITH_MISMATCHED_CTYPE)
+ restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
+# endif
+
char * s = floatbuf;
char * e = floatbuf + needed_size;
/* The modification is to prefix the localeconv() return with a
* single byte, calculated as follows: */
- char prefix = (LIKELY(SvIV(precedes) != -1))
- ? ((precedes != 0) ? '-' : '+')
-
- /* khw couldn't find any documentation that
- * CHAR_MAX (which we modify to -1) is the signal,
- * but cygwin uses it thusly, and it makes sense
- * given that CHAR_MAX indicates the value isn't
- * used, so it neither precedes nor succeeds */
- : '.';
+ const char * prefix = (LIKELY(SvIV(precedes) != -1))
+ ? ((precedes != 0) ? "-" : "+")
+ : ".";
+ /* (khw couldn't find any documentation that the dot is signalled
+ * by CHAR_MAX (which we modify to -1), but cygwin uses it thusly,
+ * and it makes sense given that CHAR_MAX indicates the value isn't
+ * used, so it neither precedes nor succeeds) */
/* Now get CRNCYSTR */
(void) hv_iterinit(result_hv);
string = hv_iterval(result_hv, entry);
/* And perform the modification */
- Perl_sv_setpvf(aTHX_ string, "%c%s", prefix, SvPV_nolen(string));
+ sv_insert(string, 0, 0, prefix, 1);
}
/* Here, 'string' contains the value we want to return */
* first day of the week. Since we're only getting one thing at a
* time, it all works */
struct tm mytm;
+
+ const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
+
ints_to_tm(&mytm, 30, 30, hour, mday, mon, 2011, 0, 0, 0);
char * temp;
if (utf8ness) {
temp = strftime_tm(format, &mytm);
}
+ restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
+
retval = save_to_buffer(temp, retbufp, retbuf_sizep);
Safefree(temp);
}
# ifdef WIN32
+
+ const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
+
# ifndef WIN32_USE_FAKE_OLD_MINGW_LOCALES
/* This function retrieves the code page. It is subject to change, but
retbufp, retbuf_sizep);
# endif
+ restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
+
DEBUG_Lv(PerlIO_printf(Perl_debug_log, "locale='%s' cp=%s\n",
locale, retval));
break;
retval = save_to_buffer(retval, retbufp, retbuf_sizep);
}
-# ifdef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
+# ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
+
+ /* Here, 'retval' contains any codeset name derived from the locale
+ * name. That derived name may be empty or not necessarily indicative
+ * of the real codeset. But we can often determine if it should be
+ * UTF-8, regardless of what the name is. On most platforms, that
+ * determination is definitive, and was already done. But for this
+ * code to be compiled, this platform is not one of them. However,
+ * there are typically tools available to make a very good guess, and
+ * knowing the derived codeset name improves the quality of that guess.
+ * The following function overrides the derived codeset name when it
+ * guesses that it actually should be UTF-8. It could be inlined here,
+ * but was moved out of this switch() so as to make the switch()
+ * control flow easier to follow */
+ retval = S_override_codeset_if_utf8_found(aTHX_ retval, locale);
- break; /* All done */
+# endif
-# else /* Below, no definitive locale utf8ness calculation on this
- platform */
-# define NAME_INDICATES_UTF8 0x1
-# define MB_CUR_MAX_SUGGESTS_UTF8 0x2
-
- /* Here, 'retval' contains whatever code set name is in the locale
- * name. In this #else, it being a UTF-8 code set hasn't been
- * determined, because this platform is lacking the libc functions
- * which would definitely return that information. So, we try to infer
- * the UTF-8ness by other means, using the code set name just found as
- * a hint to help resolve ambiguities. So if that name indicates it is
- * UTF-8, we expect it to be so */
- unsigned int lean_towards_being_utf8 = 0;
- if (is_codeset_name_UTF8(retval)) {
- lean_towards_being_utf8 |= NAME_INDICATES_UTF8;
- }
+ break; /* All done */
- /* The code set is often UTF-8, even when the locale name doesn't so
- * indicate. If we discover this is so, we will override whatever the
- * locale name said. Conversely (but rarely), "UTF-8" in the locale
- * name might be wrong. We return "" as the code set name if we find
- * that to be the case.
- *
- * For this portion of the file to compile, some C99 functions aren't
- * available to us, even though we now require C99. So, something must
- * be wrong with them. The code here should be good enough to work
- * around this issue, but should the need arise, comments in
- * S_is_locale_utf8() list some alternative C99 functions that could
- * be tried.
- *
- * But MB_CUR_MAX is a C99 construct that helps a lot, is simple for a
- * vendor to implement, and our experience with it is that it works
- * well on a variety of platforms. We have found that it returns a
- * too-large number on some platforms for the C locale, but for no
- * others. That locale was already ruled out above. (If MB_CUR_MAX
- * returned too small a number, that would break a lot of things, and
- * likely would be quickly corrected by the vendor.) khw has some
- * confidence that it doesn't return >1 when 1 is meant, as that would
- * trigger a Perl warning, and we've had no reports of invalid
- * occurrences of such. */
-# ifdef MB_CUR_MAX
-
- /* If there are fewer bytes available in this locale than are required
- * to represent the largest legal UTF-8 code point, this definitely
- * isn't a UTF-8 locale, even if the locale name says it is. */
- const int mb_cur_max = MB_CUR_MAX;
- if (mb_cur_max < (int) UNISKIP(PERL_UNICODE_MAX)) {
- if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
- retval = ""; /* The name is wrong; override */
- }
+# endif /* ! WIN32 */
+# endif /* USE_LOCALE_CTYPE */
- break;
- }
+ } /* Giant switch() of nl_langinfo() items */
- /* But if the locale could be UTF-8, and also the name corroborates
- * this, assume it is so */
- if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
- break;
- }
+ if (utf8ness) {
+ *utf8ness = is_utf8;
+ }
- /* Here, the name doesn't indicate UTF-8, but MB_CUR_MAX indicates it
- * could be. khw knows of only two other locales in the world, EUC-TW
- * and GB 18030, that legitimately require this many bytes (4). In
- * both, the single byte characters are the same as ASCII. No
- * multi-byte character in EUC-TW is legal UTF-8 (since the first byte
- * of each is a continuation). GB 18030 has no three byte sequences,
- * and none of the four byte ones is legal UTF-8 (as the second byte
- * for these is a non-continuation). But every legal UTF-8 two byte
- * sequence is also legal in GB 18030, though none have the same
- * meaning, and no Han code point expressed in UTF-8 is two byte. So
- * the further tests below which look for native expressions of
- * currency and time will not return two byte sequences, hence they
- * will reliably rule out this locale as being UTF-8. So, if we get
- * this far, the result is almost certainly UTF-8. But to be really
- * sure, also check that there is no illegal UTF-8. */
- lean_towards_being_utf8 |= MB_CUR_MAX_SUGGESTS_UTF8;
-
-# endif /* has MB_CUR_MAX */
-
- /* Here, MB_CUR_MAX is not available, or was inconclusive. What we do
- * is to look at various strings associated with the locale:
- * 1) If any are illegal UTF-8, the locale can't be UTF-8.
- * 2) If all are legal UTF-8, and some non-ASCII characters are
- * present, it is likely to be UTF-8, because of the strictness of
- * UTF-8 syntax. So assume it is UTF-8
- * 3) If all are ASCII and the locale name and/or MB_CUR_MAX indicate
- * UTF-8, assume the locale is UTF-8.
- * 4) Otherwise, assume the locale isn't UTF-8
- *
- * To save cycles, if the locale name indicates it is a UTF-8 locale,
- * we stop looking at the first instance with legal non-ASCII UTF-8.
- * It is very unlikely this combination is coincidental. */
+ return retval;
- utf8ness_t strings_utf8ness = UTF8NESS_UNKNOWN;
- char * scratch_buf = NULL;
- Size_t scratch_buf_size = 0;
+# endif /* All the implementations of my_langinfo() */
- /* List of strings to look at */
- const int trials[] = {
+/*--------------------------------------------------------------------------*/
-# if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
+} /* my_langinfo() */
- /* The first string tried is the locale currency name. Often that
- * will be in the native script.
- *
- * But this is usable only if localeconv() is available, as that's
- * the way we find out the currency symbol. */
+# ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
- CRNCYSTR,
+STATIC const char *
+S_override_codeset_if_utf8_found(pTHX_ const char * codeset,
+ const char * locale)
+{
+# define NAME_INDICATES_UTF8 0x1
+# define MB_CUR_MAX_SUGGESTS_UTF8 0x2
-# endif
-# ifdef USE_LOCALE_TIME
+ /* Override 'codeset' with UTF-8 if this routine guesses that it should be.
+ * Conversely (but rarely), "UTF-8" in the locale name might be wrong. We
+ * return "" as the code set name if we find that to be the case. */
- /* We can also try various strings associated with LC_TIME, like the
- * names of months or days of the week */
+ unsigned int lean_towards_being_utf8 = 0;
+ if (is_codeset_name_UTF8(codeset)) {
+ lean_towards_being_utf8 |= NAME_INDICATES_UTF8;
+ }
- DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7,
- MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8,
- MON_9, MON_10, MON_11, MON_12,
- ALT_DIGITS, AM_STR, PM_STR,
- ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6, ABDAY_7,
- ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6,
- ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12
+ /* For this portion of the file to compile, some C99 functions aren't
+ * available to us, even though we now require C99. So, something must be
+ * wrong with them. The code here should be good enough to work around
+ * this issue, but should the need arise, comments in S_is_locale_utf8()
+ * list some alternative C99 functions that could be tried.
+ *
+ * But MB_CUR_MAX is a C89 construct that helps a lot, is simple for a
+ * vendor to implement, and our experience with it is that it works well on
+ * a variety of platforms. We have found that it returns a too-large
+ * number on some platforms for the C locale, but for no others. That
+ * locale was already ruled out in the code that called this function. (If
+ * MB_CUR_MAX returned too small a number, that would break a lot of
+ * things, and likely would be quickly corrected by the vendor.) khw has
+ * some confidence that it doesn't return >1 when 1 is meant, as that would
+ * trigger a Perl warning, and we've had no reports of invalid occurrences
+ * of such. */
+# ifdef MB_CUR_MAX
-# endif
- };
+ /* If there are fewer bytes available in this locale than are required to
+ * represent the largest legal UTF-8 code point, this definitely isn't a
+ * UTF-8 locale, even if the locale name says it is. */
+ const int mb_cur_max = MB_CUR_MAX;
+ if (mb_cur_max < (int) UNISKIP(PERL_UNICODE_MAX)) {
+ if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
+ return ""; /* The name is wrong; override */
+ }
-# ifdef USE_LOCALE_TIME
+ return codeset;
+ }
+
+ /* But if the locale could be UTF-8, and also the name corroborates this,
+ * assume it is so */
+ if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
+ return codeset;
+ }
+
+ /* Here, the name doesn't indicate UTF-8, but MB_CUR_MAX indicates it could
+ * be. khw knows of only two other locales in the world, EUC-TW and GB
+ * 18030, that legitimately require this many bytes (4). In both, the
+ * single byte characters are the same as ASCII. No multi-byte character
+ * in EUC-TW is legal UTF-8 (since the first byte of each is a
+ * continuation). GB 18030 has no three byte sequences, and none of the
+ * four byte ones is legal UTF-8 (as the second byte for these is a
+ * non-continuation). But every legal UTF-8 two byte sequence is also
+ * legal in GB 18030, though none have the same meaning, and no Han code
+ * point expressed in UTF-8 is two byte. So the further tests below which
+ * look for native expressions of currency and time will not return two
+ * byte sequences, hence they will reliably rule out this locale as being
+ * UTF-8. So, if we get this far, the result is almost certainly UTF-8.
+ * But to be really sure, also check that there is no illegal UTF-8. */
+ lean_towards_being_utf8 |= MB_CUR_MAX_SUGGESTS_UTF8;
+
+# endif /* has MB_CUR_MAX */
+
+ /* Here, MB_CUR_MAX is not available, or was inconclusive. What we do is
+ * to look at various strings associated with the locale:
+ * 1) If any are illegal UTF-8, the locale can't be UTF-8.
+ * 2) If all are legal UTF-8, and some non-ASCII characters are present,
+ * it is likely to be UTF-8, because of the strictness of UTF-8
+ * syntax. So assume it is UTF-8
+ * 3) If all are ASCII and the locale name and/or MB_CUR_MAX indicate
+ * UTF-8, assume the locale is UTF-8.
+ * 4) Otherwise, assume the locale isn't UTF-8
+ *
+ * To save cycles, if the locale name indicates it is a UTF-8 locale, we
+ * stop looking at the first instance with legal non-ASCII UTF-8. It is
+ * very unlikely this combination is coincidental. */
- /* The code in the recursive call below can handle switching the
- * locales, but by doing it now here, that code will check and discover
- * that there is no need to switch then restore, avoiding those each
- * loop iteration */
- const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
+ utf8ness_t strings_utf8ness = UTF8NESS_UNKNOWN;
+ char * scratch_buf = NULL;
+ Size_t scratch_buf_size = 0;
-# endif
+ /* List of strings to look at */
+ const int trials[] = {
- /* The trials array may consist of strings from two different locale
- * categories. The call to my_langinfo_i() below needs to pass the
- * proper category for each string. There is a max of 1 trial for
- * LC_MONETARY; the rest are LC_TIME. So the array is arranged so the
- * LC_MONETARY item (if any) is first, and all subsequent iterations
- * will use LC_TIME. These #ifdefs set up the values for all possible
- * combinations. */
-# if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
+# if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
- locale_category_index cat_index = LC_MONETARY_INDEX_;
+ /* The first string tried is the locale currency name. Often that will
+ * be in the native script.
+ *
+ * But this is usable only if localeconv() is available, as that's the
+ * way we find out the currency symbol. */
-# ifdef USE_LOCALE_TIME
+ CRNCYSTR,
- const locale_category_index follow_on_cat_index = LC_TIME_INDEX_;
- assert(trials[1] == DAY_1); /* Make sure only a single non-time entry */
+# endif
+# ifdef USE_LOCALE_TIME
-# else
+ /* We can also try various strings associated with LC_TIME, like the names
+ * of months or days of the week */
- /* Effectively out-of-bounds, as there is only the monetary entry */
- const locale_category_index follow_on_cat_index = LC_ALL_INDEX_;
+ DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7,
+ MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8,
+ MON_9, MON_10, MON_11, MON_12,
+ ALT_DIGITS, AM_STR, PM_STR,
+ ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6, ABDAY_7,
+ ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6,
+ ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12
-# endif
-# elif defined(USE_LOCALE_TIME)
+# endif
- locale_category_index cat_index = LC_TIME_INDEX_;
- const locale_category_index follow_on_cat_index = LC_TIME_INDEX_;
+ };
-# else
+# ifdef USE_LOCALE_TIME
- /* Effectively out-of-bounds, as here there are no trial entries at
- * all. This allows this code to compile, but there are no strings to
- * test, and so the answer will always be non-UTF-8. */
- locale_category_index cat_index = LC_ALL_INDEX_;
- const locale_category_index follow_on_cat_index = LC_ALL_INDEX_;
+ /* The code in the recursive call below can handle switching the locales,
+ * but by switching once here, that code will discover on each loop
+ * iteration that no switch is needed, avoiding a switch/restore each time */
+ const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
-# endif
+# endif
- /* Everything set up; look through all the strings */
- for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(trials); i++) {
- (void) my_langinfo_i(trials[i], cat_index, locale,
- &scratch_buf, &scratch_buf_size, NULL);
- cat_index = follow_on_cat_index;
+ /* The trials array may consist of strings from two different locale
+ * categories. The call to my_langinfo_i() below needs to pass the proper
+ * category for each string. There is a max of 1 trial for LC_MONETARY;
+ * the rest are LC_TIME. So the array is arranged so the LC_MONETARY item
+ * (if any) is first, and all subsequent iterations will use LC_TIME.
+ * These #ifdefs set up the values for all possible combinations. */
+# if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
- /* To prevent infinite recursive calls, we don't ask for the
- * UTF-8ness of the string (in 'trials[i]') above. Instead we
- * examine the returned string here */
- const Size_t len = strlen(scratch_buf);
- const U8 * first_variant;
+ locale_category_index cat_index = LC_MONETARY_INDEX_;
- /* If the string is identical whether or not it is encoded as
- * UTF-8, it isn't helpful in determining UTF8ness. */
- if (is_utf8_invariant_string_loc((U8 *) scratch_buf, len,
- &first_variant))
- {
- continue;
- }
+# ifdef USE_LOCALE_TIME
- /* Here, has non-ASCII. If not legal UTF-8, isn't a UTF-8
- * locale */
- if (! is_utf8_string(first_variant,
- len - (first_variant - (U8 *) scratch_buf)))
- {
- strings_utf8ness = UTF8NESS_NO;
- break;
- }
+ const locale_category_index follow_on_cat_index = LC_TIME_INDEX_;
+ assert(trials[1] == DAY_1); /* Make sure only a single non-time entry */
- /* Here, is a legal non-ASCII UTF-8 string; tentatively set the
- * return to YES; possibly overridden by later iterations */
- strings_utf8ness = UTF8NESS_YES;
+# else
- /* But if this corroborates our expectation, quit now */
- if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
- break;
- }
- }
+ /* Effectively out-of-bounds, as there is only the monetary entry */
+ const locale_category_index follow_on_cat_index = LC_ALL_INDEX_;
-# ifdef USE_LOCALE_TIME
+# endif
+# elif defined(USE_LOCALE_TIME)
- restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
+ locale_category_index cat_index = LC_TIME_INDEX_;
+ const locale_category_index follow_on_cat_index = LC_TIME_INDEX_;
-# endif
+# else
- Safefree(scratch_buf);
- scratch_buf = NULL;
+ /* Effectively out-of-bounds, as here there are no trial entries at all.
+ * This allows this code to compile, but there are no strings to test, and
+ * so the answer will always be non-UTF-8. */
+ locale_category_index cat_index = LC_ALL_INDEX_;
+ const locale_category_index follow_on_cat_index = LC_ALL_INDEX_;
- if (strings_utf8ness == UTF8NESS_NO) {
- /* 'retval' is already loaded with whatever code set we found. */
- break;
- }
+# endif
- /* Here all tested strings are legal UTF-8.
- *
- * Above we set UTF8NESS_YES if any string wasn't ASCII. But even if
- * they are all ascii, and the locale name indicates it is a UTF-8
- * locale, assume the locale is UTF-8. */
- if (lean_towards_being_utf8) {
- strings_utf8ness = UTF8NESS_YES;
+ /* Everything set up; look through all the strings */
+ for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(trials); i++) {
+ (void) my_langinfo_i(trials[i], cat_index, locale,
+ &scratch_buf, &scratch_buf_size, NULL);
+ cat_index = follow_on_cat_index;
+
+ /* To prevent infinite recursive calls, we don't ask for the UTF-8ness
+ * of the string (in 'trials[i]') above. Instead we examine the
+ * returned string here */
+ const Size_t len = strlen(scratch_buf);
+ const U8 * first_variant;
+
+ /* If the string is identical whether or not it is encoded as UTF-8, it
+ * isn't helpful in determining UTF8ness. */
+ if (is_utf8_invariant_string_loc((U8 *) scratch_buf, len,
+ &first_variant))
+ {
+ continue;
}
- if (strings_utf8ness == UTF8NESS_YES) {
- retval = "UTF-8";
+ /* Here, has non-ASCII. If not legal UTF-8, isn't a UTF-8 locale */
+ if (! is_utf8_string(first_variant,
+ len - (first_variant - (U8 *) scratch_buf)))
+ {
+ strings_utf8ness = UTF8NESS_NO;
break;
}
- /* Here, nothing examined indicates that the codeset is or isn't UTF-8.
- * But what is it? The other locale categories are not likely to be of
- * further help:
- *
- * LC_NUMERIC Only a few locales in the world have a non-ASCII radix
- * or group separator.
- * LC_CTYPE This code wouldn't be compiled if mbtowc() existed and
- * was reliable. This is unlikely in C99. There are
- * other functions that could be used instead, but are
- * they going to exist, and be able to distinguish between
- * UTF-8 and 8859-1? Deal with this only if it becomes
- * necessary.
- * LC_MESSAGES The strings returned from strerror() would seem likely
- * candidates, but experience has shown that many systems
- * don't actually have translations installed for them.
- * They are instead always in English, so everything in
- * them is ASCII, which is of no help to us. A Configure
- * probe could possibly be written to see if this platform
- * has non-ASCII error messages. But again, wait until it
- * turns out to be an actual problem.
- *
- * Things like YESSTR, NOSTR, might not be in ASCII, but
- * need nl_langinfo() to access, which we don't have.
- */
-
- /* Otherwise, assume the locale isn't UTF-8. This can be wrong if we
- * don't have MB_CUR_MAX, and the locale is English without UTF-8 in
- * its name, and with a dollar currency symbol. */
- break; /* 'retval' is already loaded with whatever code set we found. */
+ /* Here, is a legal non-ASCII UTF-8 string; tentatively set the return
+ * to YES; possibly overridden by later iterations */
+ strings_utf8ness = UTF8NESS_YES;
-# endif /* NEED_FURTHER_UTF8NESS_CHECKING */
-# endif /* ! WIN32 */
-# endif /* USE_LOCALE_CTYPE */
+ /* But if this corroborates our expectation, quit now */
+ if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
+ break;
+ }
+ }
- } /* Giant switch() of nl_langinfo() items */
+# ifdef USE_LOCALE_TIME
- restore_toggled_locale_i(cat_index, orig_switched_locale);
+ restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
-# ifdef WE_MUST_DEAL_WITH_MISMATCHED_CTYPE
- restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
# endif
- if (utf8ness) {
- *utf8ness = is_utf8;
+ Safefree(scratch_buf);
+ scratch_buf = NULL;
+
+ if (strings_utf8ness == UTF8NESS_NO) {
+ return codeset; /* No override */
}
- return retval;
+ /* Here all tested strings are legal UTF-8.
+ *
+ * Above we set UTF8NESS_YES if any string wasn't ASCII. But even if they
+ * are all ASCII, if the locale name and/or MB_CUR_MAX indicate UTF-8,
+ * assume the locale is UTF-8. */
+ if (lean_towards_being_utf8) {
+ strings_utf8ness = UTF8NESS_YES;
+ }
-# endif /* All the implementations of my_langinfo() */
+ if (strings_utf8ness == UTF8NESS_YES) {
+ return "UTF-8";
+ }
-/*--------------------------------------------------------------------------*/
+ /* Here, nothing examined indicates that the codeset is or isn't UTF-8.
+ * But what is it? The other locale categories are not likely to be of
+ * further help:
+ *
+ * LC_NUMERIC Only a few locales in the world have a non-ASCII radix or
+ * group separator.
+ * LC_CTYPE This code wouldn't be compiled if mbtowc() existed and was
+ * reliable. This is unlikely in C99. There are other
+ * functions that could be used instead, but are they going to
+ * exist, and be able to distinguish between UTF-8 and 8859-1?
+ * Deal with this only if it becomes necessary.
+ * LC_MESSAGES The strings returned from strerror() would seem likely
+ * candidates, but experience has shown that many systems
+ * don't actually have translations installed for them. They
+ * are instead always in English, so everything in them is
+ * ASCII, which is of no help to us. A Configure probe could
+ * possibly be written to see if this platform has non-ASCII
+ * error messages. But again, wait until it turns out to be
+ * an actual problem.
+ *
+ * Things like YESSTR, NOSTR, might not be in ASCII, but need
+ * nl_langinfo() to access, which we don't have.
+ */
-} /* my_langinfo() */
+ /* Otherwise, assume the locale isn't UTF-8. This can be wrong if we don't
+ * have MB_CUR_MAX, and the locale is English without UTF-8 in its name,
+ * and with a dollar currency symbol. */
+ return codeset; /* No override */
+}
+# endif /* ! HAS_DEFINITIVE_UTF8NESS_DETERMINATION */
#endif /* USE_LOCALE */
/*
S<C<struct tm>> parameter. C<sv_strftime_ints> takes a bunch of integer
parameters that together completely define a given time.
-C<my_strftime> is kept for backwards compatibility. Knowing if the result
+C<my_strftime> is kept for backwards compatibility. Knowing if its result
should be considered UTF-8 or not requires significant extra logic.
Note that C<yday> and C<wday> effectively are ignored by C<sv_strftime_ints>
const char * locale_to_restore_to = querylocale_i(cat_index);
DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "(%" LINE_Tf "): toggle_locale_i: index=%d(%s), wanted=%s,"
- " actual=%s\n",
- caller_line, cat_index, category_names[cat_index],
- new_locale, locale_to_restore_to));
+ "Entering toggle_locale_i: index=%d(%s)," \
+ " wanted=%s, actual=%s; called from %" LINE_Tf \
+ "\n", cat_index, category_names[cat_index],
+ new_locale, locale_to_restore_to, caller_line));
if (! locale_to_restore_to) {
locale_panic_via_(Perl_form(aTHX_
/* If the locales are the same, there's nothing to do */
if (strEQ(locale_to_restore_to, new_locale)) {
- DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "(%" LINE_Tf "): %s locale unchanged as %s\n",
- caller_line, category_names[cat_index],
- new_locale));
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n",
+ category_names[cat_index],
+ new_locale));
return NULL;
}
void_setlocale_i_with_caller(cat_index, new_locale, __FILE__, caller_line);
DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "(%" LINE_Tf "): %s locale switched to %s\n",
- caller_line, category_names[cat_index], new_locale));
+ "%s locale switched to %s\n",
+ category_names[cat_index], new_locale));
return locale_to_restore_to;
if (restore_locale == NULL) {
DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "(%" LINE_Tf "): No need to restore %s\n",
- caller_line, category_names[cat_index]));
+ "restore_toggled_locale_i: No need to" \
+ " restore %s; called from %" LINE_Tf "\n", \
+ category_names[cat_index], caller_line));
return;
}
DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "(%" LINE_Tf "): %s restoring locale to %s\n",
- caller_line, category_names[cat_index],
- restore_locale));
+ "restore_toggled_locale_i: restoring locale for" \
+ " %s to %s; called from %" LINE_Tf "\n", \
+ category_names[cat_index], restore_locale,
+ caller_line));
void_setlocale_i_with_caller(cat_index, restore_locale,
__FILE__, caller_line);