#ifdef USE_LOCALE
-/*
- * Standardize the locale name from a string returned by 'setlocale', possibly
- * modifying that string.
- *
- * The typical return value of setlocale() is either
- * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
- * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
- * (the space-separated values represent the various sublocales,
- * in some unspecified order). This is not handled by this function.
+/* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far
+ * looked up. This is in the form of a C string: */
+
+#define UTF8NESS_SEP "\v"
+#define UTF8NESS_PREFIX "\f"
+
+/* So, the string looks like:
*
- * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
- * which is harmful for further use of the string in setlocale(). This
- * function removes the trailing new line and everything up through the '='
+ * \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0
*
- */
+ * where the digit 0 after the \a indicates that the locale starting just
+ * after the preceding \v is not UTF-8, and the digit 1 mean it is. */
+
+STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1);
+STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1);
+
+#define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \
+ UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0"
+
+/* The cache is initialized to C_and_POSIX_utf8ness at start up. These are
+ * kept there always. The remining portion of the cache is LRU, with the
+ * oldest looked-up locale at the tail end */
+
STATIC char *
S_stdize_locale(pTHX_ char *locs)
{
+ /* Standardize the locale name from a string returned by 'setlocale',
+ * possibly modifying that string.
+ *
+ * The typical return value of setlocale() is either
+ * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
+ * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
+ * (the space-separated values represent the various sublocales,
+ * in some unspecified order). This is not handled by this function.
+ *
+ * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
+ * which is harmful for further use of the string in setlocale(). This
+ * function removes the trailing new line and everything up through the '='
+ * */
+
const char * const s = strchr(locs, '=');
bool okay = TRUE;
# ifdef USE_LOCALE_MONETARY
LC_MONETARY,
# endif
+# ifdef USE_LOCALE_ADDRESS
+ LC_ADDRESS,
+# endif
+# ifdef USE_LOCALE_IDENTIFICATION
+ LC_IDENTIFICATION,
+# endif
+# ifdef USE_LOCALE_MEASUREMENT
+ LC_MEASUREMENT,
+# endif
+# ifdef USE_LOCALE_PAPER
+ LC_PAPER,
+# endif
+# ifdef USE_LOCALE_TELEPHONE
+ LC_TELEPHONE,
+# endif
# ifdef LC_ALL
LC_ALL,
# endif
# ifdef USE_LOCALE_MONETARY
"LC_MONETARY",
# endif
+# ifdef USE_LOCALE_ADDRESS
+ "LC_ADDRESS",
+# endif
+# ifdef USE_LOCALE_IDENTIFICATION
+ "LC_IDENTIFICATION",
+# endif
+# ifdef USE_LOCALE_MEASUREMENT
+ "LC_MEASUREMENT",
+# endif
+# ifdef USE_LOCALE_PAPER
+ "LC_PAPER",
+# endif
+# ifdef USE_LOCALE_TELEPHONE
+ "LC_TELEPHONE",
+# endif
# ifdef LC_ALL
"LC_ALL",
# endif
# else
# define _DUMMY_MONETARY _DUMMY_MESSAGES
# endif
+# ifdef USE_LOCALE_ADDRESS
+# define LC_ADDRESS_INDEX _DUMMY_MONETARY + 1
+# define _DUMMY_ADDRESS LC_ADDRESS_INDEX
+# else
+# define _DUMMY_ADDRESS _DUMMY_MONETARY
+# endif
+# ifdef USE_LOCALE_IDENTIFICATION
+# define LC_IDENTIFICATION_INDEX _DUMMY_ADDRESS + 1
+# define _DUMMY_IDENTIFICATION LC_IDENTIFICATION_INDEX
+# else
+# define _DUMMY_IDENTIFICATION _DUMMY_ADDRESS
+# endif
+# ifdef USE_LOCALE_MEASUREMENT
+# define LC_MEASUREMENT_INDEX _DUMMY_IDENTIFICATION + 1
+# define _DUMMY_MEASUREMENT LC_MEASUREMENT_INDEX
+# else
+# define _DUMMY_MEASUREMENT _DUMMY_IDENTIFICATION
+# endif
+# ifdef USE_LOCALE_PAPER
+# define LC_PAPER_INDEX _DUMMY_MEASUREMENT + 1
+# define _DUMMY_PAPER LC_PAPER_INDEX
+# else
+# define _DUMMY_PAPER _DUMMY_MEASUREMENT
+# endif
+# ifdef USE_LOCALE_TELEPHONE
+# define LC_TELEPHONE_INDEX _DUMMY_PAPER + 1
+# define _DUMMY_TELEPHONE LC_TELEPHONE_INDEX
+# else
+# define _DUMMY_TELEPHONE _DUMMY_PAPER
+# endif
# ifdef LC_ALL
-# define LC_ALL_INDEX _DUMMY_MONETARY + 1
+# define LC_ALL_INDEX _DUMMY_TELEPHONE + 1
# endif
#endif /* ifdef USE_LOCALE */
#if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \
|| defined(HAS_NL_LANGINFO))
- /* We only set up the radix SV if we are to use a locale radix ... */
- if (use_locale) {
- const char * radix = my_nl_langinfo(PERL_RADIXCHAR, FALSE);
- /* FALSE => already in dest locale */
+ const char * radix = (use_locale)
+ ? my_nl_langinfo(PERL_RADIXCHAR, FALSE)
+ /* FALSE => already in dest locale */
+ : ".";
- /* ... and the character being used isn't a dot */
- if (strNE(radix, ".")) {
- if (PL_numeric_radix_sv) {
- sv_setpv(PL_numeric_radix_sv, radix);
- }
- else {
- PL_numeric_radix_sv = newSVpv(radix, 0);
- }
+ sv_setpv(PL_numeric_radix_sv, radix);
- if ( ! is_utf8_invariant_string(
- (U8 *) SvPVX(PL_numeric_radix_sv), SvCUR(PL_numeric_radix_sv))
- && is_utf8_string(
- (U8 *) SvPVX(PL_numeric_radix_sv), SvCUR(PL_numeric_radix_sv))
- && _is_cur_LC_category_utf8(LC_NUMERIC))
- {
- SvUTF8_on(PL_numeric_radix_sv);
- }
- goto done;
- }
+ /* If this is valid UTF-8 that isn't totally ASCII, and we are in
+ * a UTF-8 locale, then mark the radix as being in UTF-8 */
+ if (is_utf8_non_invariant_string((U8 *) SvPVX(PL_numeric_radix_sv),
+ SvCUR(PL_numeric_radix_sv))
+ && _is_cur_LC_category_utf8(LC_NUMERIC))
+ {
+ SvUTF8_on(PL_numeric_radix_sv);
}
- SvREFCNT_dec(PL_numeric_radix_sv);
- PL_numeric_radix_sv = NULL;
-
- done: ;
-
# ifdef DEBUGGING
if (DEBUG_L_TEST || debug_initialization) {
PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
- (PL_numeric_radix_sv)
- ? SvPVX(PL_numeric_radix_sv)
- : "NULL",
- (PL_numeric_radix_sv)
- ? cBOOL(SvUTF8(PL_numeric_radix_sv))
- : 0);
+ SvPVX(PL_numeric_radix_sv),
+ cBOOL(SvUTF8(PL_numeric_radix_sv)));
}
# endif
* that the current locale is the program's underlying
* locale
* PL_numeric_standard An int indicating if the toggled state is such
- * that the current locale is the C locale. If non-zero,
- * it is in C; if > 1, it means it may not be toggled away
+ * that the current locale is the C locale or
+ * indistinguishable from the C locale. If non-zero, it
+ * is in C; if > 1, it means it may not be toggled away
* from C.
- * Note that both of the last two variables can be true at the same time,
- * if the underlying locale is C. (Toggling is a no-op under these
- * circumstances.)
- *
+ * PL_numeric_underlying_is_standard A bool kept by this function
+ * indicating that the underlying locale and the standard
+ * C locale are indistinguishable for the purposes of
+ * LC_NUMERIC. This happens when both of the above two
+ * variables are true at the same time. (Toggling is a
+ * no-op under these circumstances.) This variable is
+ * used to avoid having to recalculate.
* Any code changing the locale (outside this file) should use
* POSIX::setlocale, which calls this function. Therefore this function
* should be called directly only from this file and from
PL_numeric_name = NULL;
PL_numeric_standard = TRUE;
PL_numeric_underlying = TRUE;
+ PL_numeric_underlying_is_standard = TRUE;
return;
}
save_newnum = stdize_locale(savepv(newnum));
-
- PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
PL_numeric_underlying = TRUE;
+ PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
+
+ /* If its name isn't C nor POSIX, it could still be indistinguishable from
+ * them */
+ if (! PL_numeric_standard) {
+ PL_numeric_standard = cBOOL(strEQ(".", my_nl_langinfo(PERL_RADIXCHAR,
+ FALSE /* Don't toggle locale */ ))
+ && strEQ("", my_nl_langinfo(PERL_THOUSEP,
+ FALSE)));
+ }
+ /* Save the new name if it isn't the same as the previous one, if any */
if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
Safefree(PL_numeric_name);
PL_numeric_name = save_newnum;
Safefree(save_newnum);
}
+ PL_numeric_underlying_is_standard = PL_numeric_standard;
+
+# if defined(HAS_NEWLOCALE) && ! defined(NO_POSIX_2008_LOCALE)
+
+ PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
+ PL_numeric_name,
+ PL_underlying_numeric_obj);
+
+#endif
+
+ if (DEBUG_L_TEST || debug_initialization) {
+ PerlIO_printf(Perl_debug_log, "Called new_numeric with %s, PL_numeric_name=%s\n", newnum, PL_numeric_name);
+ }
+
/* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't
* have to worry about the radix being a non-dot. (Core operations that
* need the underlying locale change to it temporarily). */
do_setlocale_c(LC_NUMERIC, "C");
PL_numeric_standard = TRUE;
- PL_numeric_underlying = isNAME_C_OR_POSIX(PL_numeric_name);
+ PL_numeric_underlying = PL_numeric_underlying_is_standard;
set_numeric_radix(0);
# ifdef DEBUGGING
* wrong if some XS code has changed the locale behind our back) */
do_setlocale_c(LC_NUMERIC, PL_numeric_name);
- PL_numeric_standard = isNAME_C_OR_POSIX(PL_numeric_name);
+ PL_numeric_standard = PL_numeric_underlying_is_standard;
PL_numeric_underlying = TRUE;
- set_numeric_radix(1);
+ set_numeric_radix(! PL_numeric_standard);
# ifdef DEBUGGING
unsigned int bad_count = 0; /* Count of bad characters */
for (i = 0; i < 256; i++) {
- if (isUPPER_LC((U8) i))
- PL_fold_locale[i] = (U8) toLOWER_LC((U8) i);
- else if (isLOWER_LC((U8) i))
- PL_fold_locale[i] = (U8) toUPPER_LC((U8) i);
+ if (isupper(i))
+ PL_fold_locale[i] = (U8) tolower(i);
+ else if (islower(i))
+ PL_fold_locale[i] = (U8) toupper(i);
else
PL_fold_locale[i] = (U8) i;
# ifdef LC_ALL
- else if (category == LC_ALL) {
+ else if (category == LC_ALL && ! PL_numeric_underlying) {
+
SET_NUMERIC_UNDERLYING();
}
{
dTHX;
+ /* We only need to toggle into the underlying LC_NUMERIC locale for these
+ * two items, and only if not already there */
+ if (toggle && (( item != PERL_RADIXCHAR && item != PERL_THOUSEP)
+ || PL_numeric_underlying))
+ {
+ toggle = FALSE;
+ }
+
#if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
#if ! defined(HAS_POSIX_2008_LOCALE)
* switching back, as some systems destroy the buffer when setlocale() is
* called */
- LOCALE_LOCK;
+ {
+ DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
- if (toggle) {
- if (item == PERL_RADIXCHAR || item == PERL_THOUSEP) {
- do_setlocale_c(LC_NUMERIC, PL_numeric_name);
+ if (toggle) {
+ STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
}
- else {
- toggle = FALSE;
- }
- }
- save_to_buffer(nl_langinfo(item), &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
+ LOCALE_LOCK; /* Prevent interference from another thread executing
+ this code section (the only call to nl_langinfo in
+ the core) */
- if (toggle) {
- do_setlocale_c(LC_NUMERIC, "C");
- }
+ save_to_buffer(nl_langinfo(item), &PL_langinfo_buf,
+ &PL_langinfo_bufsize, 0);
- LOCALE_UNLOCK;
+ LOCALE_UNLOCK;
+
+ if (toggle) {
+ RESTORE_LC_NUMERIC();
+ }
+ }
# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */
- bool do_free = FALSE;
- locale_t cur = uselocale((locale_t) 0);
+ {
+ bool do_free = FALSE;
+ locale_t cur = uselocale((locale_t) 0);
- if (cur == LC_GLOBAL_LOCALE) {
- cur = duplocale(LC_GLOBAL_LOCALE);
- do_free = TRUE;
- }
+ if (cur == LC_GLOBAL_LOCALE) {
+ cur = duplocale(LC_GLOBAL_LOCALE);
+ do_free = TRUE;
+ }
- if ( toggle
- && (item == PERL_RADIXCHAR || item == PERL_THOUSEP))
- {
- cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
- do_free = TRUE;
- }
+ if (toggle) {
+ if (PL_underlying_numeric_obj) {
+ cur = PL_underlying_numeric_obj;
+ }
+ else {
+ cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
+ do_free = TRUE;
+ }
+ }
- save_to_buffer(nl_langinfo_l(item, cur),
- &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
- if (do_free) {
- freelocale(cur);
+ save_to_buffer(nl_langinfo_l(item, cur),
+ &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
+ if (do_free) {
+ freelocale(cur);
+ }
}
# endif
return PL_langinfo_buf;
#else /* Below, emulate nl_langinfo as best we can */
+
+ {
+
# ifdef HAS_LOCALECONV
- const struct lconv* lc;
+ const struct lconv* lc;
+ DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
# endif
# ifdef HAS_STRFTIME
- struct tm tm;
- bool return_format = FALSE; /* Return the %format, not the value */
- const char * format;
+ struct tm tm;
+ bool return_format = FALSE; /* Return the %format, not the value */
+ const char * format;
# endif
- /* We copy the results to a per-thread buffer, even if not multi-threaded.
- * This is in part to simplify this code, and partly because we need a
- * buffer anyway for strftime(), and partly because a call of localeconv()
- * could otherwise wipe out the buffer, and the programmer would not be
- * expecting this, as this is a nl_langinfo() substitute after all, so s/he
- * might be thinking their localeconv() is safe until another localeconv()
- * call. */
+ /* We copy the results to a per-thread buffer, even if not
+ * multi-threaded. This is in part to simplify this code, and partly
+ * because we need a buffer anyway for strftime(), and partly because a
+ * call of localeconv() could otherwise wipe out the buffer, and the
+ * programmer would not be expecting this, as this is a nl_langinfo()
+ * substitute after all, so s/he might be thinking their localeconv()
+ * is safe until another localeconv() call. */
- switch (item) {
- Size_t len;
- const char * retval;
+ switch (item) {
+ Size_t len;
+ const char * retval;
- /* These 2 are unimplemented */
- case PERL_CODESET:
- case PERL_ERA: /* For use with strftime() %E modifier */
+ /* These 2 are unimplemented */
+ case PERL_CODESET:
+ case PERL_ERA: /* For use with strftime() %E modifier */
- default:
- return "";
+ default:
+ return "";
- /* We use only an English set, since we don't know any more */
- case PERL_YESEXPR: return "^[+1yY]";
- case PERL_YESSTR: return "yes";
- case PERL_NOEXPR: return "^[-0nN]";
- case PERL_NOSTR: return "no";
+ /* We use only an English set, since we don't know any more */
+ case PERL_YESEXPR: return "^[+1yY]";
+ case PERL_YESSTR: return "yes";
+ case PERL_NOEXPR: return "^[-0nN]";
+ case PERL_NOSTR: return "no";
# ifdef HAS_LOCALECONV
- case PERL_CRNCYSTR:
+ case PERL_CRNCYSTR:
- LOCALE_LOCK;
+ /* We don't bother with localeconv_l() because any system that
+ * has it is likely to also have nl_langinfo() */
- /* We don't bother with localeconv_l() because any system that has
- * it is likely to also have nl_langinfo() */
+ LOCALE_LOCK; /* Prevent interference with other threads
+ using localeconv() */
- lc = localeconv();
- if (! lc || ! lc->currency_symbol || strEQ("", lc->currency_symbol))
- {
- LOCALE_UNLOCK;
- return "";
- }
+ lc = localeconv();
+ if ( ! lc
+ || ! lc->currency_symbol
+ || strEQ("", lc->currency_symbol))
+ {
+ LOCALE_UNLOCK;
+ return "";
+ }
- /* Leave the first spot empty to be filled in below */
- save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
- &PL_langinfo_bufsize, 1);
- if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
- { /* khw couldn't figure out how the localedef specifications
- would show that the $ should replace the radix; this is
- just a guess as to how it might work.*/
- *PL_langinfo_buf = '.';
- }
- else if (lc->p_cs_precedes) {
- *PL_langinfo_buf = '-';
- }
- else {
- *PL_langinfo_buf = '+';
- }
+ /* Leave the first spot empty to be filled in below */
+ save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
+ &PL_langinfo_bufsize, 1);
+ if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
+ { /* khw couldn't figure out how the localedef specifications
+ would show that the $ should replace the radix; this is
+ just a guess as to how it might work.*/
+ *PL_langinfo_buf = '.';
+ }
+ else if (lc->p_cs_precedes) {
+ *PL_langinfo_buf = '-';
+ }
+ else {
+ *PL_langinfo_buf = '+';
+ }
- LOCALE_UNLOCK;
- break;
+ LOCALE_UNLOCK;
+ break;
- case PERL_RADIXCHAR:
- case PERL_THOUSEP:
+ case PERL_RADIXCHAR:
+ case PERL_THOUSEP:
- LOCALE_LOCK;
+ if (toggle) {
+ STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
+ }
- if (toggle) {
- do_setlocale_c(LC_NUMERIC, PL_numeric_name);
- }
+ LOCALE_LOCK; /* Prevent interference with other threads
+ using localeconv() */
- lc = localeconv();
- if (! lc) {
- retval = "";
- }
- else {
- retval = (item == PERL_RADIXCHAR)
- ? lc->decimal_point
- : lc->thousands_sep;
- if (! retval) {
+ lc = localeconv();
+ if (! lc) {
retval = "";
}
- }
+ else {
+ retval = (item == PERL_RADIXCHAR)
+ ? lc->decimal_point
+ : lc->thousands_sep;
+ if (! retval) {
+ retval = "";
+ }
+ }
- save_to_buffer(retval, &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
+ save_to_buffer(retval, &PL_langinfo_buf,
+ &PL_langinfo_bufsize, 0);
- if (toggle) {
- do_setlocale_c(LC_NUMERIC, "C");
- }
+ LOCALE_UNLOCK;
- LOCALE_UNLOCK;
+ if (toggle) {
+ RESTORE_LC_NUMERIC();
+ }
- break;
+ break;
# endif
# ifdef HAS_STRFTIME
- /* These are defined by C89, so we assume that strftime supports them,
- * and so are returned unconditionally; they may not be what the locale
- * actually says, but should give good enough results for someone using
- * them as formats (as opposed to trying to parse them to figure out
- * what the locale says). The other format items are actually tested to
- * verify they work on the platform */
- case PERL_D_FMT: return "%x";
- case PERL_T_FMT: return "%X";
- case PERL_D_T_FMT: return "%c";
-
- /* These formats are only available in later strfmtime's */
- case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT:
- case PERL_T_FMT_AMPM:
-
- /* The rest can be gotten from most versions of strftime(). */
- case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3:
- case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6:
- case PERL_ABDAY_7:
- case PERL_ALT_DIGITS:
- case PERL_AM_STR: case PERL_PM_STR:
- case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3:
- case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6:
- case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9:
- case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12:
- case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4:
- case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7:
- case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4:
- case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8:
- case PERL_MON_9: case PERL_MON_10: case PERL_MON_11: case PERL_MON_12:
-
- LOCALE_LOCK;
-
- init_tm(&tm); /* Precaution against core dumps */
- tm.tm_sec = 30;
- tm.tm_min = 30;
- tm.tm_hour = 6;
- tm.tm_year = 2017 - 1900;
- tm.tm_wday = 0;
- tm.tm_mon = 0;
- switch (item) {
- default:
- LOCALE_UNLOCK;
- Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
- __FILE__, __LINE__, item);
- NOT_REACHED; /* NOTREACHED */
-
- case PERL_PM_STR: tm.tm_hour = 18;
- case PERL_AM_STR:
- format = "%p";
- break;
-
- case PERL_ABDAY_7: tm.tm_wday++;
- case PERL_ABDAY_6: tm.tm_wday++;
- case PERL_ABDAY_5: tm.tm_wday++;
- case PERL_ABDAY_4: tm.tm_wday++;
- case PERL_ABDAY_3: tm.tm_wday++;
- case PERL_ABDAY_2: tm.tm_wday++;
- case PERL_ABDAY_1:
- format = "%a";
- break;
-
- case PERL_DAY_7: tm.tm_wday++;
- case PERL_DAY_6: tm.tm_wday++;
- case PERL_DAY_5: tm.tm_wday++;
- case PERL_DAY_4: tm.tm_wday++;
- case PERL_DAY_3: tm.tm_wday++;
- case PERL_DAY_2: tm.tm_wday++;
- case PERL_DAY_1:
- format = "%A";
- break;
-
- case PERL_ABMON_12: tm.tm_mon++;
- case PERL_ABMON_11: tm.tm_mon++;
- case PERL_ABMON_10: tm.tm_mon++;
- case PERL_ABMON_9: tm.tm_mon++;
- case PERL_ABMON_8: tm.tm_mon++;
- case PERL_ABMON_7: tm.tm_mon++;
- case PERL_ABMON_6: tm.tm_mon++;
- case PERL_ABMON_5: tm.tm_mon++;
- case PERL_ABMON_4: tm.tm_mon++;
- case PERL_ABMON_3: tm.tm_mon++;
- case PERL_ABMON_2: tm.tm_mon++;
- case PERL_ABMON_1:
- format = "%b";
- break;
-
- case PERL_MON_12: tm.tm_mon++;
- case PERL_MON_11: tm.tm_mon++;
- case PERL_MON_10: tm.tm_mon++;
- case PERL_MON_9: tm.tm_mon++;
- case PERL_MON_8: tm.tm_mon++;
- case PERL_MON_7: tm.tm_mon++;
- case PERL_MON_6: tm.tm_mon++;
- case PERL_MON_5: tm.tm_mon++;
- case PERL_MON_4: tm.tm_mon++;
- case PERL_MON_3: tm.tm_mon++;
- case PERL_MON_2: tm.tm_mon++;
- case PERL_MON_1:
- format = "%B";
- break;
+ /* These are defined by C89, so we assume that strftime supports
+ * them, and so are returned unconditionally; they may not be what
+ * the locale actually says, but should give good enough results
+ * for someone using them as formats (as opposed to trying to parse
+ * them to figure out what the locale says). The other format
+ * items are actually tested to verify they work on the platform */
+ case PERL_D_FMT: return "%x";
+ case PERL_T_FMT: return "%X";
+ case PERL_D_T_FMT: return "%c";
+
+ /* These formats are only available in later strfmtime's */
+ case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT:
+ case PERL_T_FMT_AMPM:
+
+ /* The rest can be gotten from most versions of strftime(). */
+ case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3:
+ case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6:
+ case PERL_ABDAY_7:
+ case PERL_ALT_DIGITS:
+ case PERL_AM_STR: case PERL_PM_STR:
+ case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3:
+ case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6:
+ case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9:
+ case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12:
+ case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4:
+ case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7:
+ case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4:
+ case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8:
+ case PERL_MON_9: case PERL_MON_10: case PERL_MON_11:
+ case PERL_MON_12:
+
+ LOCALE_LOCK;
+
+ init_tm(&tm); /* Precaution against core dumps */
+ tm.tm_sec = 30;
+ tm.tm_min = 30;
+ tm.tm_hour = 6;
+ tm.tm_year = 2017 - 1900;
+ tm.tm_wday = 0;
+ tm.tm_mon = 0;
+ switch (item) {
+ default:
+ LOCALE_UNLOCK;
+ Perl_croak(aTHX_
+ "panic: %s: %d: switch case: %d problem",
+ __FILE__, __LINE__, item);
+ NOT_REACHED; /* NOTREACHED */
+
+ case PERL_PM_STR: tm.tm_hour = 18;
+ case PERL_AM_STR:
+ format = "%p";
+ break;
+
+ case PERL_ABDAY_7: tm.tm_wday++;
+ case PERL_ABDAY_6: tm.tm_wday++;
+ case PERL_ABDAY_5: tm.tm_wday++;
+ case PERL_ABDAY_4: tm.tm_wday++;
+ case PERL_ABDAY_3: tm.tm_wday++;
+ case PERL_ABDAY_2: tm.tm_wday++;
+ case PERL_ABDAY_1:
+ format = "%a";
+ break;
+
+ case PERL_DAY_7: tm.tm_wday++;
+ case PERL_DAY_6: tm.tm_wday++;
+ case PERL_DAY_5: tm.tm_wday++;
+ case PERL_DAY_4: tm.tm_wday++;
+ case PERL_DAY_3: tm.tm_wday++;
+ case PERL_DAY_2: tm.tm_wday++;
+ case PERL_DAY_1:
+ format = "%A";
+ break;
+
+ case PERL_ABMON_12: tm.tm_mon++;
+ case PERL_ABMON_11: tm.tm_mon++;
+ case PERL_ABMON_10: tm.tm_mon++;
+ case PERL_ABMON_9: tm.tm_mon++;
+ case PERL_ABMON_8: tm.tm_mon++;
+ case PERL_ABMON_7: tm.tm_mon++;
+ case PERL_ABMON_6: tm.tm_mon++;
+ case PERL_ABMON_5: tm.tm_mon++;
+ case PERL_ABMON_4: tm.tm_mon++;
+ case PERL_ABMON_3: tm.tm_mon++;
+ case PERL_ABMON_2: tm.tm_mon++;
+ case PERL_ABMON_1:
+ format = "%b";
+ break;
+
+ case PERL_MON_12: tm.tm_mon++;
+ case PERL_MON_11: tm.tm_mon++;
+ case PERL_MON_10: tm.tm_mon++;
+ case PERL_MON_9: tm.tm_mon++;
+ case PERL_MON_8: tm.tm_mon++;
+ case PERL_MON_7: tm.tm_mon++;
+ case PERL_MON_6: tm.tm_mon++;
+ case PERL_MON_5: tm.tm_mon++;
+ case PERL_MON_4: tm.tm_mon++;
+ case PERL_MON_3: tm.tm_mon++;
+ case PERL_MON_2: tm.tm_mon++;
+ case PERL_MON_1:
+ format = "%B";
+ break;
+
+ case PERL_T_FMT_AMPM:
+ format = "%r";
+ return_format = TRUE;
+ break;
+
+ case PERL_ERA_D_FMT:
+ format = "%Ex";
+ return_format = TRUE;
+ break;
+
+ case PERL_ERA_T_FMT:
+ format = "%EX";
+ return_format = TRUE;
+ break;
+
+ case PERL_ERA_D_T_FMT:
+ format = "%Ec";
+ return_format = TRUE;
+ break;
+
+ case PERL_ALT_DIGITS:
+ tm.tm_wday = 0;
+ format = "%Ow"; /* Find the alternate digit for 0 */
+ break;
+ }
- case PERL_T_FMT_AMPM:
- format = "%r";
- return_format = TRUE;
- break;
+ /* We can't use my_strftime() because it doesn't look at
+ * tm_wday */
+ while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
+ format, &tm))
+ {
+ /* A zero return means one of:
+ * a) there wasn't enough space in PL_langinfo_buf
+ * b) the format, like a plain %p, returns empty
+ * c) it was an illegal format, though some
+ * implementations of strftime will just return the
+ * illegal format as a plain character sequence.
+ *
+ * To quickly test for case 'b)', try again but precede
+ * the format with a plain character. If that result is
+ * still empty, the problem is either 'a)' or 'c)' */
+
+ Size_t format_size = strlen(format) + 1;
+ Size_t mod_size = format_size + 1;
+ char * mod_format;
+ char * temp_result;
+
+ Newx(mod_format, mod_size, char);
+ Newx(temp_result, PL_langinfo_bufsize, char);
+ *mod_format = ' ';
+ my_strlcpy(mod_format + 1, format, mod_size);
+ len = strftime(temp_result,
+ PL_langinfo_bufsize,
+ mod_format, &tm);
+ Safefree(mod_format);
+ Safefree(temp_result);
+
+ /* If 'len' is non-zero, it means that we had a case like
+ * %p which means the current locale doesn't use a.m. or
+ * p.m., and that is valid */
+ if (len == 0) {
+
+ /* Here, still didn't work. If we get well beyond a
+ * reasonable size, bail out to prevent an infinite
+ * loop. */
+
+ if (PL_langinfo_bufsize > 100 * format_size) {
+ *PL_langinfo_buf = '\0';
+ }
+ else {
+ /* Double the buffer size to retry; Add 1 in case
+ * original was 0, so we aren't stuck at 0. */
+ PL_langinfo_bufsize *= 2;
+ PL_langinfo_bufsize++;
+ Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
+ continue;
+ }
+ }
- case PERL_ERA_D_FMT:
- format = "%Ex";
- return_format = TRUE;
break;
+ }
- case PERL_ERA_T_FMT:
- format = "%EX";
- return_format = TRUE;
- break;
+ /* Here, we got a result.
+ *
+ * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
+ * alternate format for wday 0. If the value is the same as
+ * the normal 0, there isn't an alternate, so clear the buffer.
+ * */
+ if ( item == PERL_ALT_DIGITS
+ && strEQ(PL_langinfo_buf, "0"))
+ {
+ *PL_langinfo_buf = '\0';
+ }
- case PERL_ERA_D_T_FMT:
- format = "%Ec";
- return_format = TRUE;
- break;
+ /* ALT_DIGITS is problematic. Experiments on it showed that
+ * strftime() did not always work properly when going from
+ * alt-9 to alt-10. Only a few locales have this item defined,
+ * and in all of them on Linux that khw was able to find,
+ * nl_langinfo() merely returned the alt-0 character, possibly
+ * doubled. Most Unicode digits are in blocks of 10
+ * consecutive code points, so that is sufficient information
+ * for those scripts, as we can infer alt-1, alt-2, .... But
+ * for a Japanese locale, a CJK ideographic 0 is returned, and
+ * the CJK digits are not in code point order, so you can't
+ * really infer anything. The localedef for this locale did
+ * specify the succeeding digits, so that strftime() works
+ * properly on them, without needing to infer anything. But
+ * the nl_langinfo() return did not give sufficient information
+ * for the caller to understand what's going on. So until
+ * there is evidence that it should work differently, this
+ * returns the alt-0 string for ALT_DIGITS.
+ *
+ * wday was chosen because its range is all a single digit.
+ * Things like tm_sec have two digits as the minimum: '00' */
- case PERL_ALT_DIGITS:
- tm.tm_wday = 0;
- format = "%Ow"; /* Find the alternate digit for 0 */
- break;
- }
+ LOCALE_UNLOCK;
- /* We can't use my_strftime() because it doesn't look at tm_wday */
- while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
- format, &tm))
- {
- /* A zero return means one of:
- * a) there wasn't enough space in PL_langinfo_buf
- * b) the format, like a plain %p, returns empty
- * c) it was an illegal format, though some implementations of
- * strftime will just return the illegal format as a plain
- * character sequence.
- *
- * To quickly test for case 'b)', try again but precede the
- * format with a plain character. If that result is still
- * empty, the problem is either 'a)' or 'c)' */
-
- Size_t format_size = strlen(format) + 1;
- Size_t mod_size = format_size + 1;
- char * mod_format;
- char * temp_result;
-
- Newx(mod_format, mod_size, char);
- Newx(temp_result, PL_langinfo_bufsize, char);
- *mod_format = '\a';
- my_strlcpy(mod_format + 1, format, mod_size);
- len = strftime(temp_result,
- PL_langinfo_bufsize,
- mod_format, &tm);
- Safefree(mod_format);
- Safefree(temp_result);
-
- /* If 'len' is non-zero, it means that we had a case like %p
- * which means the current locale doesn't use a.m. or p.m., and
- * that is valid */
- if (len == 0) {
-
- /* Here, still didn't work. If we get well beyond a
- * reasonable size, bail out to prevent an infinite loop. */
-
- if (PL_langinfo_bufsize > 100 * format_size) {
+ /* If to return the format, not the value, overwrite the buffer
+ * with it. But some strftime()s will keep the original format
+ * if illegal, so change those to "" */
+ if (return_format) {
+ if (strEQ(PL_langinfo_buf, format)) {
*PL_langinfo_buf = '\0';
}
- else { /* Double the buffer size to retry; Add 1 in case
- original was 0, so we aren't stuck at 0. */
- PL_langinfo_bufsize *= 2;
- PL_langinfo_bufsize++;
- Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
- continue;
+ else {
+ save_to_buffer(format, &PL_langinfo_buf,
+ &PL_langinfo_bufsize, 0);
}
}
break;
- }
-
- /* Here, we got a result.
- *
- * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
- * alternate format for wday 0. If the value is the same as the
- * normal 0, there isn't an alternate, so clear the buffer. */
- if ( item == PERL_ALT_DIGITS
- && strEQ(PL_langinfo_buf, "0"))
- {
- *PL_langinfo_buf = '\0';
- }
-
- /* ALT_DIGITS is problematic. Experiments on it showed that
- * strftime() did not always work properly when going from alt-9 to
- * alt-10. Only a few locales have this item defined, and in all
- * of them on Linux that khw was able to find, nl_langinfo() merely
- * returned the alt-0 character, possibly doubled. Most Unicode
- * digits are in blocks of 10 consecutive code points, so that is
- * sufficient information for those scripts, as we can infer alt-1,
- * alt-2, .... But for a Japanese locale, a CJK ideographic 0 is
- * returned, and the CJK digits are not in code point order, so you
- * can't really infer anything. The localedef for this locale did
- * specify the succeeding digits, so that strftime() works properly
- * on them, without needing to infer anything. But the
- * nl_langinfo() return did not give sufficient information for the
- * caller to understand what's going on. So until there is
- * evidence that it should work differently, this returns the alt-0
- * string for ALT_DIGITS.
- *
- * wday was chosen because its range is all a single digit. Things
- * like tm_sec have two digits as the minimum: '00' */
-
- LOCALE_UNLOCK;
-
- /* If to return the format, not the value, overwrite the buffer
- * with it. But some strftime()s will keep the original format if
- * illegal, so change those to "" */
- if (return_format) {
- if (strEQ(PL_langinfo_buf, format)) {
- *PL_langinfo_buf = '\0';
- }
- else {
- save_to_buffer(format, &PL_langinfo_buf,
- &PL_langinfo_bufsize, 0);
- }
- }
-
- break;
# endif
+ }
}
return PL_langinfo_buf;
/* disallow with "" or "0" */
*bad_lang_use_once
&& strNE("0", bad_lang_use_once)))));
- bool done = FALSE;
+
/* setlocale() return vals; not copied so must be looked at immediately */
- char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
+ const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
/* current locale for given category; should have been copied so aren't
* volatile */
- char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
-
- char * locale_param;
+ const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
# ifdef WIN32
assert(categories[LC_MONETARY_INDEX] == LC_MONETARY);
assert(strEQ(category_names[LC_MONETARY_INDEX], "LC_MONETARY"));
# endif
+# ifdef USE_LOCALE_ADDRESS
+ assert(categories[LC_ADDRESS_INDEX] == LC_ADDRESS);
+ assert(strEQ(category_names[LC_ADDRESS_INDEX], "LC_ADDRESS"));
+# endif
+# ifdef USE_LOCALE_IDENTIFICATION
+ assert(categories[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION);
+ assert(strEQ(category_names[LC_IDENTIFICATION_INDEX], "LC_IDENTIFICATION"));
+# endif
+# ifdef USE_LOCALE_MEASUREMENT
+ assert(categories[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT);
+ assert(strEQ(category_names[LC_MEASUREMENT_INDEX], "LC_MEASUREMENT"));
+# endif
+# ifdef USE_LOCALE_PAPER
+ assert(categories[LC_PAPER_INDEX] == LC_PAPER);
+ assert(strEQ(category_names[LC_PAPER_INDEX], "LC_PAPER"));
+# endif
+# ifdef USE_LOCALE_TELEPHONE
+ assert(categories[LC_TELEPHONE_INDEX] == LC_TELEPHONE);
+ assert(strEQ(category_names[LC_TELEPHONE_INDEX], "LC_TELEPHONE"));
+# endif
# ifdef LC_ALL
assert(categories[LC_ALL_INDEX] == LC_ALL);
assert(strEQ(category_names[LC_ALL_INDEX], "LC_ALL"));
assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX);
# endif
# endif /* DEBUGGING */
-# ifndef LOCALE_ENVIRON_REQUIRED
- PERL_UNUSED_VAR(done);
- PERL_UNUSED_VAR(locale_param);
+ /* Initialize the cache of the program's UTF-8ness for the always known
+ * locales C and POSIX */
+ my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness,
+ sizeof(PL_locale_utf8ness));
-# else
+ PL_numeric_radix_sv = newSVpvs(".");
+
+# ifdef LOCALE_ENVIRON_REQUIRED
/*
* Ultrix setlocale(..., "") fails if there are no environment
* variables from which to get a locale name.
*/
-# ifdef LC_ALL
+# ifndef LC_ALL
+# error Ultrix without LC_ALL not implemented
+# else
- if (lang) {
- sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, setlocale_init);
- DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result[LC_ALL_INDEX]);
- if (sl_result[LC_ALL_INDEX])
- done = TRUE;
- else
- setlocale_failure = TRUE;
- }
- if (! setlocale_failure) {
- for (i = 0; i < LC_ALL_INDEX; i++) {
- locale_param = (! done && (lang || PerlEnv_getenv(category_names[i])))
- ? setlocale_init
- : NULL;
- sl_result[i] = do_setlocale_r(categories[i], locale_param);
- if (! sl_result[i]) {
+ {
+ bool done = FALSE;
+ if (lang) {
+ sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, setlocale_init);
+ DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result[LC_ALL_INDEX]);
+ if (sl_result[LC_ALL_INDEX])
+ done = TRUE;
+ else
setlocale_failure = TRUE;
+ }
+ if (! setlocale_failure) {
+ const char * locale_param;
+ for (i = 0; i < LC_ALL_INDEX; i++) {
+ locale_param = (! done && (lang || PerlEnv_getenv(category_names[i])))
+ ? setlocale_init
+ : NULL;
+ sl_result[i] = do_setlocale_r(categories[i], locale_param);
+ if (! sl_result[i]) {
+ setlocale_failure = TRUE;
+ }
+ DEBUG_LOCALE_INIT(categories[i], locale_param, sl_result[i]);
}
- DEBUG_LOCALE_INIT(categories[i], locale_param, sl_result[i]);
}
}
trial_locale = system_default_locale;
}
-# endif /* WIN32 */
+# else
+# error SYSTEM_DEFAULT_LOCALE only implemented for Win32
+# endif
# endif /* SYSTEM_DEFAULT_LOCALE */
} /* For i > 0 */
}
}
- PerlIO_printf(Perl_error_log, "and possibly others\n");
-
# endif /* LC_ALL */
PerlIO_printf(Perl_error_log,
# endif
-
for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
+
+# if defined(USE_ITHREADS)
+
+ /* This caches whether each category's locale is UTF-8 or not. This
+ * may involve changing the locale. It is ok to do this at
+ * initialization time before any threads have started, but not later.
+ * Caching means that if the program heeds our dictate not to change
+ * locales in threaded applications, this data will remain valid, and
+ * it may get queried without changing locales. If the environment is
+ * such that all categories have the same locale, this isn't needed, as
+ * the code will not change the locale; but this handles the uncommon
+ * case where the environment has disparate locales for the categories
+ * */
+ (void) _is_cur_LC_category_utf8(categories[i]);
+
+# endif
+
Safefree(curlocales[i]);
}
# if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
/* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
- * locale is UTF-8. If PL_utf8locale and PL_unicode (set by -C or by
- * $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the
- * PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open
- * discipline. */
- PL_utf8locale = _is_cur_LC_category_utf8(LC_CTYPE);
+ * locale is UTF-8. The call to new_ctype() just above has already
+ * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
+ * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
+ * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
+ * STDIN, STDOUT, STDERR, _and_ the default open discipline. */
+ PL_utf8locale = PL_in_utf8_CTYPE_locale;
/* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
This is an alternative to using the -C command line switch
* English, it comes down to if the locale's name ends in something like
* "UTF-8". It errs on the side of not being a UTF-8 locale. */
- char *save_input_locale = NULL;
+ /* Name of current locale corresponding to the input category */
+ const char *save_input_locale = NULL;
+
+ bool is_utf8 = FALSE; /* The return value */
STRLEN final_pos;
+ /* The variables below are for the cache of previous lookups using this
+ * function. The cache is a C string, described at the definition for
+ * 'C_and_POSIX_utf8ness'.
+ *
+ * The first part of the cache is fixed, for the C and POSIX locales. The
+ * varying part starts just after them. */
+ char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness);
+
+ Size_t utf8ness_cache_size; /* Size of the varying portion */
+ Size_t input_name_len; /* Length in bytes of save_input_locale */
+ Size_t input_name_len_with_overhead; /* plus extra chars used to store
+ the name in the cache */
+ char * delimited; /* The name plus the delimiters used to store
+ it in the cache */
+ char * name_pos; /* position of 'delimited' in the cache, or 0
+ if not there */
+
+
# ifdef LC_ALL
assert(category != LC_ALL);
# endif
- /* First dispose of the trivial cases */
+ /* Get the desired category's locale */
save_input_locale = do_setlocale_r(category, NULL);
if (! save_input_locale) {
- DEBUG_L(PerlIO_printf(Perl_debug_log,
- "Could not find current locale for category %d\n",
- category));
- return FALSE; /* XXX maybe should croak */
+ Perl_croak(aTHX_
+ "panic: %s: %d: Could not find current locale for %s\n",
+ __FILE__, __LINE__, category_name(category));
}
+
save_input_locale = stdize_locale(savepv(save_input_locale));
- if (isNAME_C_OR_POSIX(save_input_locale)) {
- DEBUG_L(PerlIO_printf(Perl_debug_log,
- "Current locale for category %d is %s\n",
- category, save_input_locale));
+ DEBUG_L(PerlIO_printf(Perl_debug_log,
+ "Current locale for %s is %s\n",
+ category_name(category), save_input_locale));
+
+ input_name_len = strlen(save_input_locale);
+
+ /* In our cache, each name is accompanied by two delimiters and a single
+ * utf8ness digit */
+ input_name_len_with_overhead = input_name_len + 3;
+
+ /* Allocate and populate space for a copy of the name surrounded by the
+ * delimiters */
+ Newx(delimited, input_name_len_with_overhead, char);
+ delimited[0] = UTF8NESS_SEP[0];
+ Copy(save_input_locale, delimited + 1, input_name_len, char);
+ delimited[input_name_len+1] = UTF8NESS_PREFIX[0];
+ delimited[input_name_len+2] = '\0';
+
+ /* And see if that is in the cache */
+ name_pos = instr(PL_locale_utf8ness, delimited);
+ if (name_pos) {
+ is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0';
+
+# ifdef DEBUGGING
+
+ if (DEBUG_Lv_TEST || debug_initialization) {
+ PerlIO_printf(Perl_debug_log, "UTF8ness for locale %s=%d, \n",
+ save_input_locale, is_utf8);
+ }
+
+# endif
+
+ /* And, if not already in that position, move it to the beginning of
+ * the non-constant portion of the list, since it is the most recently
+ * used. (We don't have to worry about overflow, since just moving
+ * existing names around) */
+ if (name_pos > utf8ness_cache) {
+ Move(utf8ness_cache,
+ utf8ness_cache + input_name_len_with_overhead,
+ name_pos - utf8ness_cache, char);
+ Copy(delimited,
+ utf8ness_cache,
+ input_name_len_with_overhead - 1, char);
+ utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
+ }
+
+ Safefree(delimited);
Safefree(save_input_locale);
- return FALSE;
+ return is_utf8;
}
+ /* Here we don't have stored the utf8ness for the input locale. We have to
+ * calculate it */
+
# if defined(USE_LOCALE_CTYPE) \
&& (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET)))
{ /* Next try nl_langinfo or MB_CUR_MAX if available */
char *save_ctype_locale = NULL;
- bool is_utf8;
if (category != LC_CTYPE) { /* These work only on LC_CTYPE */
Safefree(save_ctype_locale);
}
- is_utf8 = ( ( strlen(codeset) == STRLENs("UTF-8")
- && foldEQ(codeset, STR_WITH_LEN("UTF-8")))
- || ( strlen(codeset) == STRLENs("UTF8")
- && foldEQ(codeset, STR_WITH_LEN("UTF8"))));
+ /* If the implementation of foldEQ() somehow were
+ * to change to not go byte-by-byte, this could
+ * read past end of string, as only one length is
+ * checked. But currently, a premature NUL will
+ * compare false, and it will stop there */
+ is_utf8 = cBOOL( foldEQ(codeset, STR_WITH_LEN("UTF-8"))
+ || foldEQ(codeset, STR_WITH_LEN("UTF8")));
DEBUG_L(PerlIO_printf(Perl_debug_log,
"\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
codeset, is_utf8));
- Safefree(save_input_locale);
- return is_utf8;
+ goto finish_and_return;
}
}
Safefree(save_ctype_locale);
}
- return is_utf8;
+ goto finish_and_return;
# endif
{
char *save_monetary_locale = NULL;
bool only_ascii = FALSE;
- bool is_utf8 = FALSE;
struct lconv* lc;
/* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of
* is non-ascii UTF-8. */
DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
save_input_locale, is_utf8));
- Safefree(save_input_locale);
- return is_utf8;
+ goto finish_and_return;
}
}
cant_use_monetary:
DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
save_input_locale,
is_utf8_string((U8 *) formatted_time, 0)));
- Safefree(save_input_locale);
- return is_utf8_string((U8 *) formatted_time, 0);
+ is_utf8 = is_utf8_string((U8 *) formatted_time, 0);
+ goto finish_and_return;
}
/* Falling off the end of the loop indicates all the names were just
* are much more likely to have been translated. */
{
int e;
- bool is_utf8 = FALSE;
bool non_ascii = FALSE;
char *save_messages_locale = NULL;
const char * errmsg = NULL;
DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
save_input_locale,
is_utf8));
- Safefree(save_input_locale);
- return is_utf8;
+ goto finish_and_return;
}
DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
final_pos = strlen(save_input_locale) - 1;
if (final_pos >= 3) {
- char *name = save_input_locale;
+ const char *name = save_input_locale;
/* Find next 'U' or 'u' and look from there */
while ((name += strcspn(name, "Uu") + 1)
DEBUG_L(PerlIO_printf(Perl_debug_log,
"Locale %s ends with UTF-8 in name\n",
save_input_locale));
- Safefree(save_input_locale);
- return TRUE;
+ is_utf8 = TRUE;
+ goto finish_and_return;
}
}
DEBUG_L(PerlIO_printf(Perl_debug_log,
save_input_locale));
}
-# endif
-# ifdef WIN32
+# ifdef WIN32
/* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
if (memENDs(save_input_locale, final_pos, "65001")) {
DEBUG_L(PerlIO_printf(Perl_debug_log,
"Locale %s ends with 65001 in name, is UTF-8 locale\n",
save_input_locale));
- Safefree(save_input_locale);
- return TRUE;
+ is_utf8 = TRUE;
+ goto finish_and_return;
}
-# endif
+# endif
+# endif
/* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
* since we are about to return FALSE anyway, there is no point in doing
DEBUG_L(PerlIO_printf(Perl_debug_log,
"Locale %s has 8859 in name, not UTF-8 locale\n",
save_input_locale));
- Safefree(save_input_locale);
- return FALSE;
+ is_utf8 = FALSE;
+ goto finish_and_return;
}
# endif
DEBUG_L(PerlIO_printf(Perl_debug_log,
"Assuming locale %s is not a UTF-8 locale\n",
save_input_locale));
+ is_utf8 = FALSE;
+
+ finish_and_return:
+
+ /* Cache this result so we don't have to go through all this next time. */
+ utf8ness_cache_size = sizeof(PL_locale_utf8ness)
+ - (utf8ness_cache - PL_locale_utf8ness);
+
+ /* But we can't save it if it is too large for the total space available */
+ if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) {
+ Size_t utf8ness_cache_len = strlen(utf8ness_cache);
+
+ /* Here it can fit, but we may need to clear out the oldest cached
+ * result(s) to do so. Check */
+ if (utf8ness_cache_len + input_name_len_with_overhead
+ >= utf8ness_cache_size)
+ {
+ /* Here we have to clear something out to make room for this.
+ * Start looking at the rightmost place where it could fit and find
+ * the beginning of the entry that extends past that. */
+ char * cutoff = (char *) my_memrchr(utf8ness_cache,
+ UTF8NESS_SEP[0],
+ utf8ness_cache_size
+ - input_name_len_with_overhead);
+
+ assert(cutoff);
+ assert(cutoff >= utf8ness_cache);
+
+ /* This and all subsequent entries must be removed */
+ *cutoff = '\0';
+ utf8ness_cache_len = strlen(utf8ness_cache);
+ }
+
+ /* Make space for the new entry */
+ Move(utf8ness_cache,
+ utf8ness_cache + input_name_len_with_overhead,
+ utf8ness_cache_len + 1 /* Incl. trailing NUL */, char);
+
+ /* And insert it */
+ Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char);
+ utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
+
+ if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1]
+ & (PERL_UINTMAX_T) ~1) != '0')
+ {
+ Perl_croak(aTHX_
+ "panic: %s: %d: Corrupt utf8ness_cache=%s\nlen=%u,"
+ " inserted_name=%s, its_len=%u\n",
+ __FILE__, __LINE__,
+ PL_locale_utf8ness, strlen(PL_locale_utf8ness),
+ delimited, input_name_len_with_overhead);
+ }
+ }
+
+# ifdef DEBUGGING
+
+ if (DEBUG_Lv_TEST || debug_initialization) {
+ PerlIO_printf(Perl_debug_log,
+ "PL_locale_utf8ness is now %s; returning %d\n",
+ PL_locale_utf8ness, is_utf8);
+ }
+
+# endif
+
+ Safefree(delimited);
Safefree(save_input_locale);
- return FALSE;
+ return is_utf8;
}
#endif
# else
- char * save_locale = NULL;
+ const char * save_locale = NULL;
bool locale_is_C = FALSE;
/* We have a critical section to prevent another thread from changing the