* setlocale() knows about, there is a layer to cope with that.
* b) stdized_setlocale() is a layer above a) that fixes some vagaries in
* the return value of the libc setlocale(). On most platforms this
- * layer is empty; it requires perl to be Configured with a parameter
- * indicating the platform's defect, in order to be activated. The
+ * layer is empty; in order to be activated, it requires perl to be
+ * Configured with a parameter indicating the platform's defect. The
* current ones are listed at the definition of the macro.
*
* 2) An implementation that adds a minimal layer above implementation 1),
* querylocale() is called only while the locale mutex is locked, and
* the result is copied to a per-thread place before unlocking.
*
- * -Accflags=-DNO_LOCALE_CTYE
+ * -Accflags=-DUSE_NL_LOCALE_NAME
+ * glibc has an undocumented equivalent function to querylocale(). It
+ * currently isn't used by default because it is undocumented. But
+ * testing hasn't found any problems with it. Using this Configure
+ * option enables it on systems that have it (with no effect on
+ * systems lacking it). Enabling this removes the need for perl
+ * to keep its own records, hence is more efficient and guaranteed to
+ * be accurate.
+ *
+ * -Accflags=-DNO_LOCALE_CTYPE
* -Accflags=-DNO_LOCALE_NUMERIC
* etc.
*
* If the named category(ies) does(do) not exist on this platform,
* these have no effect. Otherwise they cause perl to be compiled to
- * always keep the named category(ies) in the C locale XXX
+ * always keep the named category(ies) in the C locale.
*
* -Accflags=-DHAS_BROKEN_SETLOCALE_QUERY_LC_ALL
* This would be set in a hints file to tell perl that doing a libc
* to be stripped off. khw believes there aren't any such platforms
* still in existence.
*
+ * -Accflags=-DLIBC_HANDLES_MISMATCHED_CTYPE
+ * Consider the name of a month in some language, Chinese for example.
+ * If LC_TIME has been set to a Chinese locale, strftime() can be used
+ * to generate the Chinese month name for any given date, by using the
+ * %B format. But also suppose that LC_CTYPE is set to, say, "C".
+ * The return from strftime() on many platforms will be mojibake given
+ * that no Chinese month name is composed of just ASCII characters.
+ * Perl handles this for you by automatically toggling LC_CTYPE to
+ * whatever LC_TIME is during the execution of strftime(), and
+ * afterwards restoring it to its prior value. But the strftime()
+ * (and similar functions) in some libc implementations already do
+ * this toggle, meaning perl's action is redundant. You can tell perl
+ * that a libc does this by setting this Configure option, and it will
+ * skip its syncing LC_CTYPE and whatever the other locale is.
+ * Currently, perl ignores this Configuration option and syncs anyway
+ * for LC_COLLATE-related operations, due to perl's internal needs.
+ *
* -Accflags=USE_FAKE_LC_ALL_POSITIONAL_NOTATION
* This is used when developing Perl on a platform that uses
* 'name=value;' notation to represent LC_ALL when not all categories
#define PERL_IN_LOCALE_C
#include "perl.h"
+/* Some platforms require LC_CTYPE to be congruent with the category we are
+ * looking for. XXX This still presumes that we have to match COLLATE and
+ * CTYPE even on platforms that apparently handle this. */
+#if defined(USE_LOCALE_CTYPE) && ! defined(LIBC_HANDLES_MISMATCHED_CTYPE)
+# define WE_MUST_DEAL_WITH_MISMATCHED_CTYPE
+#endif
+
+#if PERL_VERSION_GT(5,39,9)
+# error Revert the commit that added this line
+#endif
+
#ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES
/* Use -Accflags=-DWIN32_USE_FAKE_OLD_MINGW_LOCALES on a POSIX or *nix box
* to get a semblance of pretending the locale handling is that of a MingW
* that doesn't use UCRT (hence 'OLD' in the name). This exercizes code
- * paths that are not compiled on non-Windows boxes, and allows for ASAN.
- * This is thus a way to see if locale.c on Windows is likely going to
- * compile, without having to use a real Win32 box. And running the test
- * suite will verify to a large extent our logic and memory allocation
- * handling for such boxes. And access to ASAN and PERL_MEMLOG Of course the underlying calls are to the POSIX
- * libc, so any differences in implementation between those and the Windows
- * versions will not be caught by this. */
+ * paths that are not compiled on non-Windows boxes, and allows for ASAN
+ * and PERL_MEMLOG. This is thus a way to see if locale.c on Windows is
+ * likely going to compile, without having to use a real Win32 box. And
+ * running the test suite will verify to a large extent our logic and memory
+ * allocation handling for such boxes. Of course the underlying calls are
+ * to the POSIX libc, so any differences in implementation between those and
+ * the Windows versions will not be caught by this. */
# define WIN32
# undef P_CS_PRECEDES
# define _configthreadlocale(arg) NOOP
# define MultiByteToWideChar(cp, flags, byte_string, m1, wstring, req_size) \
- (mbsrtowcs(wstring, &(byte_string), req_size, NULL) + 1)
+ (PERL_UNUSED_ARG(cp), \
+ mbsrtowcs(wstring, &(byte_string), req_size, NULL) + 1)
# define WideCharToMultiByte(cp, flags, wstring, m1, byte_string, \
req_size, default_char, found_default_char) \
- (wcsrtombs(byte_string, &(wstring), req_size, NULL) + 1)
+ (PERL_UNUSED_ARG(cp), \
+ wcsrtombs(byte_string, &(wstring), req_size, NULL) + 1)
# ifdef USE_LOCALE
static const wchar_t * wsetlocale_buf = NULL;
static Size_t wsetlocale_buf_size = 0;
+
+# ifdef MULTIPLICITY
+
static PerlInterpreter * wsetlocale_buf_aTHX = NULL;
+# endif
+
STATIC
const wchar_t *
S_wsetlocale(const int category, const wchar_t * wlocale)
# define SET_EINVAL
#endif
-/* If we have any of these library functions, we can reliably determine is a
- * locale is a UTF-8 one or not. And if we aren't using locales at all, we act
- * as if everything is the C locale, so the answer there is always "No, it
- * isn't UTF-8"; this too is reliably accurate */
-#if defined(HAS_SOME_LANGINFO) || defined(HAS_MBTOWC) \
- || defined(HAS_MBRTOWC) || ! defined(USE_LOCALE)
-# define HAS_RELIABLE_UTF8NESS_DETERMINATION
-#endif
-
/* This is a starting guess as to when this is true. It definititely isn't
* true on *BSD where positional LC_ALL notation is used. Likely this will end
* up being defined in hints files. */
a single instance, so is a #define */
static const char C_decimal_point[] = ".";
+#if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO)
+# define HAS_SOME_LANGINFO
+#endif
+
#if (defined(USE_LOCALE_NUMERIC) && ! defined(TS_W32_BROKEN_LOCALECONV)) \
|| ! ( defined(USE_LOCALE_NUMERIC) \
&& (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV)))
&& (( *(name) == 'C' && (*(name + 1)) == '\0') \
|| strEQ((name), "POSIX")))
-#if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO)
-# define HAS_SOME_LANGINFO
-#endif
-
#define my_langinfo_c(item, category, locale, retbufp, retbuf_sizep, utf8ness) \
my_langinfo_i(item, category##_INDEX_, locale, retbufp, \
retbuf_sizep, utf8ness)
# ifdef LC_ALL
true,
# else
- false
+ false,
# endif
false /* LC_UNKNOWN_AVAIL_ */
#define setlocale_failure_panic_c(cat, cur, fail, line, higher_line) \
setlocale_failure_panic_i(cat##_INDEX_, cur, fail, line, higher_line)
-#if defined(LC_ALL) && defined(USE_LOCALE)
+#if defined(USE_LOCALE)
/* Expands to the code to
* result = savepvn(s, len)
* get this value, which doesn't appear to be used in any of the Microsoft
* library routines anyway. */
- char * scratch_buffer = NULL;
if (PL_numeric_underlying_is_standard) {
+ char * scratch_buffer = NULL;
PL_numeric_underlying_is_standard = strEQ(C_thousands_sep,
my_langinfo_c(THOUSEP, LC_NUMERIC,
PL_numeric_name,
&scratch_buffer,
NULL, NULL));
+ Safefree(scratch_buffer);
}
- Safefree(scratch_buffer);
# endif
# endif /* USE_LOCALE_CTYPE */
STATIC void
-
-# ifdef LC_ALL
-
S_new_LC_ALL(pTHX_ const char *lc_all, bool force)
-
-# else
-
-S_new_LC_ALL(pTHX_ const char ** individ_locales, bool force)
-
-# endif
-
{
PERL_ARGS_ASSERT_NEW_LC_ALL;
* set, and not the nominal one (should they differ, as they may in
* LC_NUMERIC). */
-# ifdef LC_ALL
-
const char * individ_locales[LC_ALL_INDEX_] = { NULL };
switch (parse_LC_ALL_string(lc_all,
earlier had to have succeeded */
__LINE__))
{
- case invalid:
- case no_array:
- case only_element_0:
+ case invalid:
+ case no_array:
+ case only_element_0:
locale_panic_("Unexpected return from parse_LC_ALL_string");
- case full_array:
+ case full_array:
break;
}
-# endif
-
for_all_individual_category_indexes(i) {
if (update_functions[i]) {
const char * this_locale = individ_locales[i];
update_functions[i](aTHX_ this_locale, force);
}
-# ifdef LC_ALL
-
Safefree(individ_locales[i]);
-
-# endif
-
}
}
* If we already know the UTF-8ness of the locale, then we immediately know
* what the string is */
if (UNLIKELY(known_utf8 != LOCALE_UTF8NESS_UNKNOWN)) {
- if (known_utf8 == LOCALE_IS_UTF8) {
- return UTF8NESS_YES;
- }
- else {
- return UTF8NESS_NO;
- }
+ return (known_utf8 == LOCALE_IS_UTF8) ? UTF8NESS_YES : UTF8NESS_NO;
}
-# ifdef HAS_RELIABLE_UTF8NESS_DETERMINATION
+ if (locale == NULL) {
+ locale = querylocale_i(cat_index);
+ }
- /* Here, we have available the libc functions that can be used to
- * accurately determine the UTF8ness of the underlying locale. If it is a
- * UTF-8 locale, the string is UTF-8; otherwise it was coincidental that
- * the string is legal UTF-8
+ /* If the locale is UTF-8, the string is UTF-8; otherwise it was
+ * coincidental that the string is legal UTF-8
*
* However, if the perl is compiled to not pay attention to the category
* being passed in, you might think that that locale is essentially always
* messages really could be UTF-8, and given that the odds are rather small
* of something not being UTF-8 but being syntactically valid UTF-8, khw
* has decided to call such strings as UTF-8. */
+ return (is_locale_utf8(locale)) ? UTF8NESS_YES : UTF8NESS_NO;
- if (locale == NULL) {
- locale = querylocale_i(cat_index);
- }
-
- if (is_locale_utf8(locale)) {
- return UTF8NESS_YES;
- }
-
- return UTF8NESS_NO;
-
-# else
-
- /* Here, we have a valid UTF-8 string containing non-ASCII characters, and
- * don't have access to functions to check if the locale is UTF-8 or not.
- * Assume that it is. khw tried adding a check that the string is entirely
- * in a single Unicode script, but discovered the strftime() timezone is
- * user-settable through the environment, which may be in a different
- * script than the locale-expected value. */
- PERL_UNUSED_ARG(locale);
- PERL_UNUSED_ARG(cat_index);
-
- return UTF8NESS_YES;
-
-# endif
# endif
}
STATIC bool
S_is_locale_utf8(pTHX_ const char * locale)
{
- /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise. It uses
- * my_langinfo(), which employs various methods to get this information
- * if nl_langinfo() isn't available, using heuristics as a last resort, in
- * which case, the result will very likely be correct for locales for
- * languages that have commonly used non-ASCII characters, but for notably
- * English, it comes down to if the locale's name ends in something like
- * "UTF-8". It errs on the side of not being a UTF-8 locale.
- *
- * Systems conforming to C99 should have the needed libc calls to give us a
- * completely reliable result. */
+ PERL_ARGS_ASSERT_IS_LOCALE_UTF8;
+
+ /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise. */
# if ! defined(USE_LOCALE) \
|| ! defined(USE_LOCALE_CTYPE) \
return FALSE;
+ /* Definitively, can't be UTF-8 */
+# define HAS_DEFINITIVE_UTF8NESS_DETERMINATION
# else
- char * scratch_buffer = NULL;
- const char * codeset;
- bool retval;
-
- PERL_ARGS_ASSERT_IS_LOCALE_UTF8;
-
+ /* If the input happens to be the same locale as we are currently setup
+ * for, the answer has already been cached. */
if (strEQ(locale, PL_ctype_name)) {
return PL_in_utf8_CTYPE_locale;
}
- codeset = my_langinfo_c(CODESET, LC_CTYPE, locale,
- &scratch_buffer, NULL, NULL);
- retval = is_codeset_name_UTF8(codeset);
+ if (isNAME_C_OR_POSIX(locale)) {
+ return false;
+ }
+
+# if ! defined(HAS_SOME_LANGINFO) && ! defined(WIN32)
+
+ /* On non-Windows without nl_langinfo(), we have to do some digging to get
+ * the answer. First, toggle to the desired locale so can query its state
+ * */
+ const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
+
+# define TEARDOWN_FOR_IS_LOCALE_UTF8 \
+ restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale)
+
+# ifdef MB_CUR_MAX
+
+ /* If there are fewer bytes available in this locale than are required
+ * to represent the largest legal UTF-8 code point, this isn't a UTF-8
+ * locale. */
+ const int mb_cur_max = MB_CUR_MAX;
+ if (mb_cur_max < (int) UNISKIP(PERL_UNICODE_MAX)) {
+ TEARDOWN_FOR_IS_LOCALE_UTF8;
+ return false;
+ }
+
+# endif
+# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
+
+ /* With these functions, we can definitively determine a locale's
+ * UTF-8ness */
+# define HAS_DEFINITIVE_UTF8NESS_DETERMINATION
+
+ /* If libc mbtowc() evaluates the bytes that form the REPLACEMENT CHARACTER
+ * as that Unicode code point, this has to be a UTF-8 locale; otherwise it
+ * can't be */
+ wchar_t wc = 0;
+ (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0);/* Reset shift state */
+ int mbtowc_ret = Perl_mbtowc_(aTHX_ &wc,
+ STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
+ TEARDOWN_FOR_IS_LOCALE_UTF8;
+ return ( mbtowc_ret == STRLENs(REPLACEMENT_CHARACTER_UTF8)
+ && wc == UNICODE_REPLACEMENT);
+
+# else
+
+ /* If the above two C99 functions aren't working, you could try some
+ * different methods. It seems likely that the obvious choices,
+ * wctomb() and wcrtomb(), wouldn't be working either. But you could
+ * choose one of the dozen-ish Unicode titlecase triples and verify
+ * that towupper/towlower work as expected.
+ *
+ * But, our emulation of nl_langinfo() works quite well, so avoid the
+ * extra code until forced to by some weird non-conforming platform. */
+# define USE_LANGINFO_FOR_UTF8NESS
+# undef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
+# endif
+# else
+
+ /* On Windows or on platforms with nl_langinfo(), there is a direct way to
+ * get the locale's codeset, which will be some form of 'UTF-8' for a
+ * UTF-8 locale. my_langinfo_i() handles this, and we will call that
+ * below */
+# define HAS_DEFINITIVE_UTF8NESS_DETERMINATION
+# define USE_LANGINFO_FOR_UTF8NESS
+# define TEARDOWN_FOR_IS_LOCALE_UTF8
+# endif /* USE_LANGINFO_FOR_UTF8NESS */
+
+ /* If the above compiled into code, it found the locale's UTF-8ness,
+ * nothing more to do; if it didn't get compiled,
+ * USE_LANGINFO_FOR_UTF8NESS is defined. There are two possible reasons:
+ * 1) it is the preferred method because it knows directly for sure
+ * what the codeset is because the platform has libc functions that
+ * return this; or
+ * 2) the functions the above code section would compile to use don't
+ * exist or are unreliable on this platform; we are less sure of the
+ * my_langinfo() result, though it is very unlikely to be wrong
+ * about if it is UTF-8 or not */
+# ifdef USE_LANGINFO_FOR_UTF8NESS
+
+ char * scratch_buffer = NULL;
+ const char * codeset = my_langinfo_c(CODESET, LC_CTYPE, locale,
+ &scratch_buffer, NULL, NULL);
+ bool retval = is_codeset_name_UTF8(codeset);
DEBUG_Lv(PerlIO_printf(Perl_debug_log,
"found codeset=%s, is_utf8=%d\n", codeset, retval));
DEBUG_Lv(PerlIO_printf(Perl_debug_log, "is_locale_utf8(%s) returning %d\n",
locale, retval));
+ TEARDOWN_FOR_IS_LOCALE_UTF8;
return retval;
-# endif
+# endif
+# endif /* End of the #else clause, for the non-trivial case */
}
# ifdef USE_LOCALE_CTYPE
+ /* We don't know the locale utf8ness here, and not even the locale itself.
+ * Since Windows uses a different mechanism to specify message language
+ * output than the locale system, it is going to be problematic deciding
+ * if we are to store it as UTF-8 or not. By specifying LOCALE_IS_UTF8, we
+ * are telling the called function to return true iff the string has
+ * non-ASCII characters in it that are all syntactically UTF-8. We are
+ * thus relying on the fact that a string that is syntactically valid UTF-8
+ * is likely to be UTF-8. Should this ever cause problems, this function
+ * could be replaced by something more Windows-specific */
return get_locale_string_utf8ness_i(string, LOCALE_IS_UTF8,
NULL, LC_CTYPE_INDEX_);
# else
+ PERL_UNUSED_ARG(string);
return false;
# endif
}
-#if defined(HAS_LOCALECONV)
+#if defined(HAS_LOCALECONV)
HV *
S_my_localeconv(pTHX_ const int item)
PERL_ARGS_ASSERT_MY_LOCALECONV;
/* This returns a mortalized hash containing all or certain elements
- * returned by localeconv(). It is used by Perl_localeconv() and
- * POSIX::localeconv() and is thread-safe.
+ * returned by localeconv(). */
+ HV * hv = newHV(); /* The returned hash, initially empty */
+ sv_2mortal((SV*)hv);
+
+ /* The function is used by Perl_localeconv() and POSIX::localeconv(), or
+ * internally from this file, and is thread-safe.
*
- * There are two use cases:
- * 1) Called from POSIX::locale_conv(). This returns the lconv structure
- * copied to a hash, based on the current underlying locales for
- * LC_NUMERIC and LC_MONETARY. An input item==0 signifies this case, or
- * on many platforms it is the only use case compiled.
+ * localeconv() returns items from two different locale categories,
+ * LC_MONETARY and LC_NUMERIC. Various data structures in this function
+ * are arrays with two elements, one for each category, and these indexes
+ * indicate which array element applies to which category */
+#define NUMERIC_OFFSET 0
+#define MONETARY_OFFSET 1
+
+ /* Some operations apply to one or the other category, or both. A mask
+ * is used to specify all the possibilities. This macro converts from the
+ * category offset to its bit position in the mask. */
+#define OFFSET_TO_BIT(i) (1 << (i))
+
+ /* There are two use cases for this function:
+ * 1) Called as Perl_localeconv(), or from POSIX::locale_conv(). This
+ * returns the lconv structure copied to a hash, based on the current
+ * underlying locales for LC_NUMERIC and LC_MONETARY. An input item==0
+ * signifies this case, or on many platforms it is the only use case
+ * compiled.
* 2) Certain items that nl_langinfo() provides are also derivable from
* the return of localeconv(). Windows notably doesn't have
* nl_langinfo(), so on that, and actually any platform lacking it,
LCONV_NUMERIC_ENTRY(grouping),
# endif
LCONV_NUMERIC_ENTRY(thousands_sep),
+# define THOUSANDS_SEP_LITERAL "thousands_sep"
LCONV_NUMERIC_ENTRY(decimal_point),
+# define DECIMAL_POINT_LITERAL "decimal_point"
{NULL, 0}
};
LCONV_MONETARY_ENTRY(positive_sign),
LCONV_MONETARY_ENTRY(negative_sign),
LCONV_MONETARY_ENTRY(currency_symbol),
+# define CURRENCY_SYMBOL_LITERAL "currency_symbol"
{NULL, 0}
};
LCONV_ENTRY(int_p_sign_posn),
LCONV_ENTRY(int_n_sign_posn),
# endif
+# define P_CS_PRECEDES_LITERAL "p_cs_precedes"
LCONV_ENTRY(p_cs_precedes),
{NULL, 0}
};
# define P_CS_PRECEDES_ADDRESS \
&lconv_integers[(C_ARRAY_LENGTH(lconv_integers) - 2)]
- /* If we aren't paying attention to a given category, use LC_CTYPE instead;
- * If not paying attention to that either, the code below should end up not
- * using this. Make sure that things blow up if that avoidance gets lost,
- * by setting the category to an out-of-bounds value */
- locale_category_index numeric_index;
- locale_category_index monetary_index;
+ /* The actual populating of the hash is done by two sub functions that get
+ * passed an array of length two containing the data structure they are
+ * supposed to use to get the key names to fill the hash with. One element
+ * is always for the NUMERIC strings (or NULL if none to use), and the
+ * other element similarly for the MONETARY ones. */
+ const lconv_offset_t * strings[2] = { lconv_numeric_strings,
+ lconv_monetary_strings
+ };
+
+ /* The LC_MONETARY category also has some integer-valued fields, whose
+ * information is kept in a separate parallel array to 'strings' */
+ const lconv_offset_t * integers[2] = {
+ NULL,
+ lconv_integers
+ };
+
+# if ! defined(USE_LOCALE_NUMERIC) && ! defined(USE_LOCALE_MONETARY)
+
+ /* If both NUMERIC and MONETARY must be the "C" locale, simply populate the
+ * hash using the function that works on just that locale. */
+ populate_hash_from_C_localeconv(hv,
+ "C",
+ ( OFFSET_TO_BIT(NUMERIC_OFFSET)
+ | OFFSET_TO_BIT(MONETARY_OFFSET)),
+ strings, integers);
+
+ /* We shouldn't get to here for the case of an individual item, as
+ * preprocessor directives elsewhere in this file should have filled in the
+ * correct values at a higher level */
+ assert(item == 0);
+ PERL_UNUSED_ARG(item);
+
+ return hv;
-# ifdef USE_LOCALE_NUMERIC
- numeric_index = LC_NUMERIC_INDEX_;
-# elif defined(USE_LOCALE_CTYPE)
- numeric_index = LC_CTYPE_INDEX_;
-# else
- numeric_index = LC_ALL_INDEX_; /* Out-of-bounds */
-# endif
-# ifdef USE_LOCALE_MONETARY
- monetary_index = LC_MONETARY_INDEX_;
-# elif defined(USE_LOCALE_CTYPE)
- monetary_index = LC_CTYPE_INDEX_;
# else
- monetary_index = LC_ALL_INDEX_; /* Out-of-bounds */
-# endif
+
+ /* From here to the end of this function, at least one of NUMERIC or
+ * MONETARY can be non-C */
+
+ /* This is a mask, with one bit to tell the populate functions to populate
+ * the NUMERIC items; another bit for the MONETARY ones. This way they can
+ * choose which (or both) to populate from */
+ U32 index_bits = 0;
/* Some platforms, for correct non-mojibake results, require LC_CTYPE's
* locale to match LC_NUMERIC's for the numeric fields, and LC_MONETARY's
* a second call. Assume this is the case unless overridden below */
bool requires_2nd_localeconv = false;
- /* The actual hash populating is done by S_populate_hash_from_localeconv().
- * It gets passed an array of length two containing the data structure it
- * is supposed to use to get the key names to fill the hash with. One
- * element is always for the NUMERIC strings (or NULL if none to use), and
- * the other element similarly for the MONETARY ones. */
-# define NUMERIC_STRING_OFFSET 0
-# define MONETARY_STRING_OFFSET 1
- const lconv_offset_t * strings[2] = { NULL, NULL };
-
- /* This is a mask, with one bit to tell S_populate_hash_from_localeconv to
- * populate the NUMERIC items; another bit for the MONETARY ones. This way
- * it can choose which (or both) to populate from */
- U32 index_bits = 0;
-
- /* This converts from a locale index to its bit position in the above mask.
- * */
-# define INDEX_TO_BIT(i) (1 << (i))
+ /* The actual hash populating is done by one of the two populate functions.
+ * Which one is appropriate for either the MONETARY_OFFSET or the
+ * NUMERIC_OFFSET is calculated and then stored in this table */
+ void (*populate[2]) (pTHX_
+ HV * ,
+ const char *,
+ const U32,
+ const lconv_offset_t **,
+ const lconv_offset_t **);
- /* The two categories can have disparate locales. Initialize them to C and
- * override later whichever one(s) we pay attention to */
- const char * numeric_locale = "C";
- const char * monetary_locale = "C";
-
- /* This will be either 'numeric_locale' or 'monetary_locale' depending on
- * what we are working on at the moment */
- const char * locale;
-
- /* The LC_MONETARY category also has some integer-valued fields, whose
- * information is kept in a separate list */
- const lconv_offset_t * integers;
+ /* This gives the locale to use for the corresponding OFFSET, like the
+ * 'populate' array above */
+ const char * locales[2];
# ifdef HAS_SOME_LANGINFO
* parameter is ignored. */
PERL_UNUSED_ARG(item);
-# else
-
- /* This only gets compiled for the use-case of using localeconv() to
- * emulate an nl_langinfo() missing from the platform. */
+# else /* This only gets compiled for the use-case of using localeconv()
+ to emulate nl_langinfo() when missing from the platform. */
-# ifdef USE_LOCALE_NUMERIC
+# ifdef USE_LOCALE_NUMERIC
/* We need this substructure to only return this field for the THOUSEP
* item. The other items also need substructures, but they were handled
{NULL, 0}
};
-# endif
+# endif
/* End of all the initialization of data structures. Now for actual code.
*
- * Without nl_langinfo(), the call to my_localeconv() could be for just one
- * of the following 3 items to emulate nl_langinfo(). This is compiled
- * only when using perl_langinfo.h, which we control, and it has been
- * constructed so that no item is numbered 0.
+ * Without nl_langinfo(), the call to my_localeconv() could be for all of
+ * the localeconv() items or for just one of the following 3 items to
+ * emulate nl_langinfo().
*
- * For each, set up the appropriate parameters for the call below to
- * S_populate_hash_from_localeconv() */
- if (item != 0) switch (item) {
- default:
- locale_panic_(Perl_form(aTHX_
- "Unexpected item passed to my_localeconv: %d", item));
- break;
+ * This is compiled only when using perl_langinfo.h, which we control, and
+ * it has been constructed so that no item is numbered 0.
+ *
+ * For each individual item, either return the known value if the current
+ * locale is "C", or set up the appropriate parameters for the call below
+ * to the populate function */
+ if (item != 0) {
+ const char *locale;
+
+ switch (item) {
+ default:
+ locale_panic_(Perl_form(aTHX_
+ "Unexpected item passed to my_localeconv: %d", item));
+ break;
-# ifdef USE_LOCALE_NUMERIC
+# ifdef USE_LOCALE_NUMERIC
- case RADIXCHAR:
- locale = numeric_locale = PL_numeric_name;
- index_bits = INDEX_TO_BIT(LC_NUMERIC_INDEX_);
- strings[NUMERIC_STRING_OFFSET] = DECIMAL_POINT_ADDRESS;
- integers = NULL;
- break;
+ case RADIXCHAR:
+ if (isNAME_C_OR_POSIX(PL_numeric_name)) {
+ (void) hv_stores(hv, DECIMAL_POINT_LITERAL, newSVpvs("."));
+ return hv;
+ }
- case THOUSEP:
- index_bits = INDEX_TO_BIT(LC_NUMERIC_INDEX_);
- locale = numeric_locale = PL_numeric_name;
- strings[NUMERIC_STRING_OFFSET] = thousands_sep_string;
- integers = NULL;
- break;
+ strings[NUMERIC_OFFSET] = DECIMAL_POINT_ADDRESS;
+ goto numeric_common;
-# endif
-# ifdef USE_LOCALE_MONETARY
+ case THOUSEP:
+ if (isNAME_C_OR_POSIX(PL_numeric_name)) {
+ (void) hv_stores(hv, THOUSANDS_SEP_LITERAL, newSVpvs(""));
+ return hv;
+ }
- case CRNCYSTR:
- index_bits = INDEX_TO_BIT(LC_MONETARY_INDEX_);
- locale = monetary_locale = querylocale_i(LC_MONETARY_INDEX_);
+ strings[NUMERIC_OFFSET] = thousands_sep_string;
- /* This item needs the values for both the currency symbol, and another
- * one used to construct the nl_langino()-compatible return */
- strings[MONETARY_STRING_OFFSET] = CURRENCY_SYMBOL_ADDRESS;
- integers = P_CS_PRECEDES_ADDRESS;
- break;
+ numeric_common:
+ index_bits = OFFSET_TO_BIT(NUMERIC_OFFSET);
+ locale = PL_numeric_name;
+ break;
+
+# endif
+# ifdef USE_LOCALE_MONETARY
+
+ case CRNCYSTR: /* This item needs the values for both the currency
+ symbol, and another one used to construct the
+ nl_langino()-compatible return. */
+
+ locale = querylocale_c(LC_MONETARY);
+ if (isNAME_C_OR_POSIX(locale)) {
+ (void) hv_stores(hv, CURRENCY_SYMBOL_LITERAL, newSVpvs(""));
+ (void) hv_stores(hv, P_CS_PRECEDES_LITERAL, newSViv(-1));
+ return hv;
+ }
+
+ strings[MONETARY_OFFSET] = CURRENCY_SYMBOL_ADDRESS;
+ integers[MONETARY_OFFSET] = P_CS_PRECEDES_ADDRESS;
+
+ index_bits = OFFSET_TO_BIT(MONETARY_OFFSET);
+ break;
+
+# endif
+
+ } /* End of switch() */
+
+ /* There's only one item, so only one of each of these will get used,
+ * but cheap to initialize both */
+ populate[MONETARY_OFFSET] =
+ populate[NUMERIC_OFFSET] = S_populate_hash_from_localeconv;
+ locales[MONETARY_OFFSET] = locales[NUMERIC_OFFSET] = locale;
+ }
+ else /* End of for just one item to emulate nl_langinfo() */
# endif
- } /* End of switch() */
+ {
+ /* Here, the call is for all of localeconv(). It has a bunch of
+ * items. The first function call always gets the MONETARY values */
+ index_bits = OFFSET_TO_BIT(MONETARY_OFFSET);
- else /* End of for just one item to emulate nl_langinfo() */
+# ifdef USE_LOCALE_MONETARY
-# endif
+ locales[MONETARY_OFFSET] = querylocale_c(LC_MONETARY);
+ populate[MONETARY_OFFSET] =
+ (isNAME_C_OR_POSIX(locales[MONETARY_OFFSET]))
+ ? S_populate_hash_from_C_localeconv
+ : S_populate_hash_from_localeconv;
- { /* Here, the call is for all of localeconv(). It has a bunch of
- * items. As in the individual item case, set up the parameters for
- * S_populate_hash_from_localeconv(); */
+# else
-# ifdef USE_LOCALE_NUMERIC
- numeric_locale = PL_numeric_name;
-# elif defined(USE_LOCALE_CTYPE)
- numeric_locale = querylocale_i(numeric_index);
-# endif
-# if defined(USE_LOCALE_MONETARY) || defined(USE_LOCALE_CTYPE)
- monetary_locale = querylocale_i(monetary_index);
-# endif
+ locales[MONETARY_OFFSET] = "C";
+ populate[MONETARY_OFFSET] = S_populate_hash_from_C_localeconv;
- /* The first call to S_populate_hash_from_localeconv() will be for the
- * MONETARY values */
- index_bits = INDEX_TO_BIT(monetary_index);
- locale = monetary_locale;
+# endif
+# ifdef USE_LOCALE_NUMERIC
/* And if the locales for the two categories are the same, we can also
* do the NUMERIC values in the same call */
- if (strEQ(numeric_locale, monetary_locale)) {
- index_bits |= INDEX_TO_BIT(numeric_index);
+ if (strEQ(PL_numeric_name, locales[MONETARY_OFFSET])) {
+ index_bits |= OFFSET_TO_BIT(NUMERIC_OFFSET);
+ locales[NUMERIC_OFFSET] = locales[MONETARY_OFFSET];
+ populate[NUMERIC_OFFSET] = populate[MONETARY_OFFSET];
}
else {
requires_2nd_localeconv = true;
+ locales[NUMERIC_OFFSET] = PL_numeric_name;
+ populate[NUMERIC_OFFSET] = (isNAME_C_OR_POSIX(PL_numeric_name))
+ ? S_populate_hash_from_C_localeconv
+ : S_populate_hash_from_localeconv;
}
- /* We always pass both sets of strings. 'index_bits' tells
- * S_populate_hash_from_localeconv which to actually look at */
- strings[NUMERIC_STRING_OFFSET] = lconv_numeric_strings;
- strings[MONETARY_STRING_OFFSET] = lconv_monetary_strings;
+# else
+
+ /* When LC_NUMERIC is confined to "C", the two locales are the same
+ iff LC_MONETARY in this case is also "C". We set up the function
+ for that case above, so fastest to test just its address */
+ locales[NUMERIC_OFFSET] = "C";
+ if (populate[MONETARY_OFFSET] == S_populate_hash_from_C_localeconv) {
+ index_bits |= OFFSET_TO_BIT(NUMERIC_OFFSET);
+ populate[NUMERIC_OFFSET] = populate[MONETARY_OFFSET];
+ }
+ else {
+ requires_2nd_localeconv = true;
+ populate[NUMERIC_OFFSET] = S_populate_hash_from_C_localeconv;
+ }
- /* And pass the integer values to populate; again 'index_bits' will
- * say to use them or not */
- integers = lconv_integers;
+# endif
} /* End of call is for localeconv() */
- /* The code above has determined the parameters to
- S_populate_hash_from_localeconv() for both cases of an individual item
- and for the entire structure. Below is code common to both */
+ /* Call the proper populate function (which may call localeconv()) and copy
+ * its results into the hash. All the parameters have been initialized
+ * above */
+ (*populate[MONETARY_OFFSET])(aTHX_
+ hv, locales[MONETARY_OFFSET],
+ index_bits, strings, integers);
- HV * hv = newHV(); /* The returned hash, initially empty */
- sv_2mortal((SV*)hv);
+# ifndef HAS_SOME_LANGINFO /* Could be using this function to emulate
+ nl_langinfo() */
+
+ /* We are done when called with an individual item. There are no integer
+ * items to adjust, and it's best for the caller to determine if this
+ * string item is UTF-8 or not. This is because the locale's UTF-8ness is
+ * calculated below, and in some Configurations, that can lead to a
+ * recursive call to here, which could recurse infinitely. */
+ if (item != 0) {
+ return hv;
+ }
- /* Call localeconv() and copy its results into the hash. All the
- * parameters have been initialized above */
- populate_hash_from_localeconv(hv,
- locale,
- index_bits,
- strings,
- integers
- );
+# endif
/* The above call may have done all the hash fields, but not always, as
* already explained. If we need a second call it is always for the
* NUMERIC fields */
if (requires_2nd_localeconv) {
- populate_hash_from_localeconv(hv,
- numeric_locale,
- INDEX_TO_BIT(numeric_index),
- strings,
- NULL /* There are no NUMERIC integer
- fields */
- );
+ (*populate[NUMERIC_OFFSET])(aTHX_
+ hv,
+ locales[NUMERIC_OFFSET],
+ OFFSET_TO_BIT(NUMERIC_OFFSET),
+ strings, integers);
}
/* Here, the hash has been completely populated.
* corrections determined at hash population time, at an extra maintenance
* cost which khw doesn't think is worth it
*/
- for (unsigned int i = 0; i < 2; i++) { /* Try both types of strings */
- if (! strings[i]) { /* Skip if no strings of this type */
- continue;
- }
-
- locale = (i == NUMERIC_STRING_OFFSET)
- ? numeric_locale
- : monetary_locale;
-
- locale_utf8ness_t locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
-
-# ifdef HAS_RELIABLE_UTF8NESS_DETERMINATION
- /* It saves time in the loop below to have predetermined the UTF8ness
- * of the locale. But only do so if the platform reliably has this
- * information; otherwise it's better to do it only it should become
- * necessary, which happens on a per-element basis in the loop. */
-
- locale_is_utf8 = (is_locale_utf8(locale))
- ? LOCALE_IS_UTF8
- : LOCALE_NOT_UTF8;
+ for (unsigned int i = 0; i < 2; i++) { /* Try both types of strings */
- if (locale_is_utf8 == LOCALE_NOT_UTF8) {
- continue; /* No string can be UTF-8 if the locale isn't */
+ /* The return from this function is already adjusted */
+ if (populate[i] == S_populate_hash_from_C_localeconv) {
+ continue;
}
-# endif
-
/* Examine each string */
- while (1) {
- const char * name = strings[i]->name;
-
- if (! name) { /* Reached the end */
- break;
- }
+ for (const lconv_offset_t *strp = strings[i]; strp->name; strp++) {
+ const char * name = strp->name;
/* 'value' will contain the string that may need to be marked as
* UTF-8 */
SV ** value = hv_fetch(hv, name, strlen(name), true);
- if (! value) {
+ if (value == NULL) {
continue;
}
/* Determine if the string should be marked as UTF-8. */
if (UTF8NESS_YES == (get_locale_string_utf8ness_i(SvPVX(*value),
- locale_is_utf8,
- NULL,
- (locale_category_index) 0)))
+ LOCALE_UTF8NESS_UNKNOWN,
+ locales[i],
+ LC_ALL_INDEX_ /* OOB */)))
{
SvUTF8_on(*value);
}
-
- strings[i]++; /* Iterate */
}
- } /* End of fixing up UTF8ness */
+ if (integers[i] == NULL) {
+ continue;
+ }
- /* Examine each integer */
- if (integers) while (1) {
- const char * name = integers->name;
+ /* And each integer */
+ for (const lconv_offset_t *intp = integers[i]; intp->name; intp++) {
+ const char * name = intp->name;
if (! name) { /* Reached the end */
break;
if (SvIV(*value) == CHAR_MAX) {
sv_setiv(*value, -1);
}
-
- integers++; /* Iterate */
+ }
}
return hv;
+
+# endif /* End of must have one or both USE_MONETARY, USE_NUMERIC */
+
+}
+
+STATIC void
+S_populate_hash_from_C_localeconv(pTHX_ HV * hv,
+ const char * locale, /* Unused */
+
+ /* bit mask of which categories to
+ * populate */
+ const U32 which_mask,
+
+ /* The string type values to return;
+ * one element for numeric; the other
+ * for monetary */
+ const lconv_offset_t * strings[2],
+
+ /* And the integer fields */
+ const lconv_offset_t * integers[2])
+{
+ PERL_ARGS_ASSERT_POPULATE_HASH_FROM_C_LOCALECONV;
+ PERL_UNUSED_ARG(locale);
+ assert(isNAME_C_OR_POSIX(locale));
+
+ /* Fill hv with the values that localeconv() is supposed to return for
+ * the C locale */
+
+ U32 working_mask = which_mask;
+ while (working_mask) {
+
+ /* Get the bit position of the next lowest set bit. That is the
+ * index into the 'strings' array of the category we use in this loop
+ * iteration. Turn the bit off so we don't work on this category
+ * again in this function call. */
+ const PERL_UINT_FAST8_T i = lsbit_pos(working_mask);
+ working_mask &= ~ (1 << i);
+
+ /* This category's string fields */
+ const lconv_offset_t * category_strings = strings[i];
+
+# ifndef HAS_SOME_LANGINFO /* This doesn't work properly if called on a single
+ item, which could only happen when there isn't
+ nl_langinfo on the platform */
+ assert(category_strings[1].name != NULL);
+# endif
+
+ /* All string fields are empty except for one NUMERIC one. That one
+ * has been initialized to be the final one in the NUMERIC strings, so
+ * stop the loop early in that case. Otherwise, we would store an
+ * empty string to the hash, and immediately overwrite it with the
+ * correct value */
+ const unsigned int stop_early = (i == NUMERIC_OFFSET) ? 1 : 0;
+
+ /* A NULL element terminates the list */
+ while ((category_strings + stop_early)->name) {
+ (void) hv_store(hv,
+ category_strings->name,
+ strlen(category_strings->name),
+ newSVpvs(""),
+ 0);
+
+ category_strings++;
+ }
+
+ /* And fill in the NUMERIC exception */
+ if (i == NUMERIC_OFFSET) {
+ (void) hv_stores(hv, "decimal_point", newSVpvs("."));
+ category_strings++;
+ }
+
+ /* Add any int fields. In the C locale, all are -1 */
+ if (integers[i]) {
+ const lconv_offset_t * current = integers[i];
+ while (current->name) {
+ (void) hv_store(hv,
+ current->name, strlen(current->name),
+ newSViv(-1),
+ 0);
+ current++;
+ }
+ }
+ }
}
+# if defined(USE_LOCALE_NUMERIC) || defined(USE_LOCALE_MONETARY)
+
STATIC void
S_populate_hash_from_localeconv(pTHX_ HV * hv,
* populate */
const U32 which_mask,
- /* strings[0] points to the numeric
- * string fields; [1] to the monetary */
+ /* The string type values to return; one
+ * element for numeric; the other for
+ * monetary */
const lconv_offset_t * strings[2],
- /* And to the monetary integer fields */
- const lconv_offset_t * integers)
+ /* And similarly the integer fields */
+ const lconv_offset_t * integers[2])
{
PERL_ARGS_ASSERT_POPULATE_HASH_FROM_LOCALECONV;
PERL_UNUSED_ARG(which_mask); /* Some configurations don't use this;
complicated to figure out which */
-# ifndef USE_LOCALE
- PERL_UNUSED_ARG(locale);
-# endif
/* Run localeconv() and copy some or all of its results to the input 'hv'
* hash. Most localeconv() implementations return the values in a global
* global static buffer. Some locks might be no-ops on this platform, but
* not others. We need to lock if any one isn't a no-op. */
-# ifdef USE_LOCALE_CTYPE
+# ifdef WE_MUST_DEAL_WITH_MISMATCHED_CTYPE
- /* Some platforms require LC_CTYPE to be congruent with the category we are
- * looking for */
const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
-# endif
-# ifdef USE_LOCALE_NUMERIC
+# endif
+# ifdef USE_LOCALE_NUMERIC
/* We need to toggle to the underlying NUMERIC locale if we are getting
* NUMERIC strings */
const char * orig_NUMERIC_locale = NULL;
- if (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) {
+ if (which_mask & OFFSET_TO_BIT(NUMERIC_OFFSET)) {
LC_NUMERIC_LOCK(0);
-# if defined(WIN32)
+# if defined(WIN32)
/* There is a bug in Windows in which setting LC_CTYPE after the others
* doesn't actually take effect for localeconv(). See commit
* 418efacd1950763f74ed3cc22f8cf9206661b892 for details. Thus we have
* to make sure that the locale we want is set after LC_CTYPE. We
* unconditionally toggle away from and back to the current locale
- * prior to calling localeconv().
- *
- * This code will have no effect if we already are in C, but khw
- * hasn't seen any cases where this causes problems when we are in the
- * C locale. */
- orig_NUMERIC_locale = toggle_locale_i(LC_NUMERIC_INDEX_, "C");
- toggle_locale_i(LC_NUMERIC_INDEX_, locale);
+ * prior to calling localeconv(). */
+ orig_NUMERIC_locale = toggle_locale_c(LC_NUMERIC, "C");
+ toggle_locale_c(LC_NUMERIC, locale);
-# else
+# else
/* No need for the extra toggle when not on Windows */
- orig_NUMERIC_locale = toggle_locale_i(LC_NUMERIC_INDEX_, locale);
+ orig_NUMERIC_locale = toggle_locale_c(LC_NUMERIC, locale);
-# endif
+# endif
}
-# endif
-# if defined(USE_LOCALE_MONETARY) && defined(WIN32)
+# endif
+# if defined(USE_LOCALE_MONETARY) && defined(WIN32)
/* Same Windows bug as described just above for NUMERIC. Otherwise, no
* need to toggle LC_MONETARY, as it is kept in the underlying locale */
const char * orig_MONETARY_locale = NULL;
- if (which_mask & INDEX_TO_BIT(LC_MONETARY_INDEX_)) {
- orig_MONETARY_locale = toggle_locale_i(LC_MONETARY_INDEX_, "C");
- toggle_locale_i(LC_MONETARY_INDEX_, locale);
+ if (which_mask & OFFSET_TO_BIT(MONETARY_OFFSET)) {
+ orig_MONETARY_locale = toggle_locale_c(LC_MONETARY, "C");
+ toggle_locale_c(LC_MONETARY, locale);
}
-# endif
+# endif
/* Finally ready to do the actual localeconv(). Lock to prevent other
* accesses until we have made a copy of its returned static buffer */
gwLOCALE_LOCK;
-# if defined(TS_W32_BROKEN_LOCALECONV) && defined(USE_THREAD_SAFE_LOCALE)
+# if defined(TS_W32_BROKEN_LOCALECONV) && defined(USE_THREAD_SAFE_LOCALE)
/* This is a workaround for another bug in Windows. localeconv() was
* broken with thread-safe locales prior to VS 15. It looks at the global
const char * save_global = querylocale_c(LC_ALL);
void_setlocale_c(LC_ALL, save_thread);
-# endif /* TS_W32_BROKEN_LOCALECONV */
+# endif /* TS_W32_BROKEN_LOCALECONV */
/* Finally, do the actual localeconv */
const char *lcbuf_as_string = (const char *) localeconv();
- /* Fill in the string fields of the HV* */
- for (unsigned int i = 0; i < 2; i++) {
-
- /* One iteration is only for the numeric string fields. Skip these
- * unless we are compiled to care about those fields and the input
- * parameters indicate we want their values */
- if ( i == NUMERIC_STRING_OFFSET
+ /* Copy its results for each desired category as determined by
+ * 'which_mask' */
+ U32 working_mask = which_mask;
+ while (working_mask) {
-# ifdef USE_LOCALE_NUMERIC
-
- && (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) == 0
-
-# endif
-
- ) {
- continue;
- }
+ /* Get the bit position of the next lowest set bit. That is the
+ * index into the 'strings' array of the category we use in this loop
+ * iteration. Turn the bit off so we don't work on this category
+ * again in this function call. */
+ const PERL_UINT_FAST8_T i = lsbit_pos32(working_mask);
+ working_mask &= ~ (1 << i);
- /* The other iteration is only for the monetary string fields. Again
- * skip it unless we want those values */
- if ( i == MONETARY_STRING_OFFSET
-
-# ifdef USE_LOCALE_MONETARY
-
- && (which_mask & INDEX_TO_BIT(LC_MONETARY_INDEX_)) == 0
-
-# endif
- ) {
-
- continue;
- }
-
- /* For each field for the given category ... */
+ /* Point to the string field list for the given category ... */
const lconv_offset_t * category_strings = strings[i];
- while (1) {
- const char * name = category_strings->name;
- if (! name) { /* Quit at the end */
- break;
- }
+ while (category_strings->name) {
- /* we have set things up so that we know where in the returned
+ /* We have set things up so that we know where in the returned
* structure, when viewed as a string, the corresponding value is.
* */
const char *value = *((const char **)( lcbuf_as_string
+ category_strings->offset));
-
- /* Set to get next string on next iteration */
- category_strings++;
-
- /* Skip if this platform doesn't have this field. */
- if (! value) {
- continue;
+ if (value) { /* Copy to the hash */
+ (void) hv_store(hv,
+ category_strings->name,
+ strlen(category_strings->name),
+ newSVpv(value, strlen(value)),
+ 0);
}
- /* Copy to the hash */
- (void) hv_store(hv,
- name, strlen(name),
- newSVpv(value, strlen(value)),
- 0);
+ category_strings++;
}
- /* Add any int fields to the HV* */
- if (i == MONETARY_STRING_OFFSET && integers) {
- while (integers->name) {
+ /* Add any int fields to the HV*. */
+ if (integers[i]) {
+ const lconv_offset_t * current = integers[i];
+ while (current->name) {
const char value = *((const char *)( lcbuf_as_string
- + integers->offset));
- (void) hv_store(hv, integers->name,
- strlen(integers->name), newSViv(value), 0);
- integers++;
+ + current->offset));
+ (void) hv_store(hv,
+ current->name, strlen(current->name),
+ newSViv(value),
+ 0);
+ current++;
}
}
} /* End of loop through the fields */
/* Done with copying to the hash. Can unwind the critical section locks */
-# if defined(TS_W32_BROKEN_LOCALECONV) && defined(USE_THREAD_SAFE_LOCALE)
+# if defined(TS_W32_BROKEN_LOCALECONV) && defined(USE_THREAD_SAFE_LOCALE)
/* Restore the global locale's prior state */
void_setlocale_c(LC_ALL, save_global);
/* Restore the per-thread locale state */
void_setlocale_c(LC_ALL, save_thread);
-# endif /* TS_W32_BROKEN_LOCALECONV */
+# endif /* TS_W32_BROKEN_LOCALECONV */
gwLOCALE_UNLOCK; /* Finished with the critical section of a
globally-accessible buffer */
-# if defined(USE_LOCALE_MONETARY) && defined(WIN32)
+# if defined(USE_LOCALE_MONETARY) && defined(WIN32)
- restore_toggled_locale_i(LC_MONETARY_INDEX_, orig_MONETARY_locale);
+ restore_toggled_locale_c(LC_MONETARY, orig_MONETARY_locale);
-# endif
-# ifdef USE_LOCALE_NUMERIC
+# endif
+# ifdef USE_LOCALE_NUMERIC
- restore_toggled_locale_i(LC_NUMERIC_INDEX_, orig_NUMERIC_locale);
- if (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) {
+ restore_toggled_locale_c(LC_NUMERIC, orig_NUMERIC_locale);
+ if (which_mask & OFFSET_TO_BIT(NUMERIC_OFFSET)) {
LC_NUMERIC_UNLOCK;
}
-# endif
-# ifdef USE_LOCALE_CTYPE
+# endif
+# ifdef WE_MUST_DEAL_WITH_MISMATCHED_CTYPE
restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
-# endif
+# endif
}
+# endif /* defined(USE_LOCALE_NUMERIC) || defined(USE_LOCALE_MONETARY) */
#endif /* defined(HAS_LOCALECONV) */
#ifndef HAS_SOME_LANGINFO
);
SV * sv = NULL;
if (retval) {
- STRLEN len = strlen(retval);
- sv = newSV(len);
- sv_usepvn_flags(sv, retval, len, SV_HAS_TRAILING_NUL);
+ sv = newSV_type(SVt_PV);
+ sv_usepvn_flags(sv, retval, strlen(retval), SV_HAS_TRAILING_NUL);
if (result_utf8ness == UTF8NESS_YES) {
SvUTF8_on(sv);
}
#ifdef USE_LOCALE
+# ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
+
+/* Forward declaration of function that we don't put into embed.fnc so as to
+ * make its removal easier, as there may not be any extant platforms that need
+ * it; and the function is located after S_my_langinfo_i() because it's easier
+ * to understand when placed in the context of that code */
+STATIC const char * S_override_codeset_if_utf8_found(pTHX_
+ const char *codeset,
+ const char *locale);
+# endif
/* There are several implementations of my_langinfo, depending on the
* Configuration. They all share the same beginning of the function */
* value, as documented */
utf8ness_t * utf8ness)
{
- const char * retval = NULL;
-
PERL_ARGS_ASSERT_MY_LANGINFO_I;
assert(cat_index < LC_ALL_INDEX_);
* isn't, or vice versa). There is explicit code to bring the categories into
* sync. This doesn't seem to be a problem with nl_langinfo(), so that
* implementation doesn't currently worry about it. But it is a problem on
- * Windows boxes, which don't have nl_langinfo(). */
+ * Windows boxes, which don't have nl_langinfo().
+ *
+ * One might be tempted to avoid any toggling by instead using nl_langinfo_l()
+ * on platforms that have it. This would entail creating a locale object with
+ * newlocale() and freeing it afterwards. But doing so runs significantly
+ * slower than just doing the toggle ourselves. lib/locale_threads.t was
+ * slowed down by 25% on Ubuntu 22.04 */
/*--------------------------------------------------------------------------*/
# if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
-# ifdef USE_LOCALE_CTYPE
+# ifdef WE_MUST_DEAL_WITH_MISMATCHED_CTYPE
/* This function sorts out if things actually have to be switched or not,
* for both save and restore. */
const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
gwLOCALE_LOCK;
- retval = save_to_buffer(nl_langinfo(item), retbufp, retbuf_sizep);
+ const char * retval = save_to_buffer(nl_langinfo(item), retbufp, retbuf_sizep);
gwLOCALE_UNLOCK;
if (utf8ness) {
restore_toggled_locale_i(cat_index, orig_switched_locale);
-# ifdef USE_LOCALE_CTYPE
+# ifdef WE_MUST_DEAL_WITH_MISMATCHED_CTYPE
restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
# else /* Below, emulate nl_langinfo as best we can */
/* The other completion is where we have to emulate nl_langinfo(). There
- * are various possibilities depending on the Configuration */
-
-# ifdef USE_LOCALE_CTYPE
-
- const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
-
-# endif
-
- const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
-
- /* Here, we are in the locale we want information about */
-
+ * are various possibilities depending on the Configuration. The major
+ * platform lacking nl_langinfo is Windows. It does have GetLocaleInfoEx()
+ * that could be used to get most of the items, but it (and other similar
+ * Windows API functions) use what MS calls "locale names", whereas the C
+ * functions use what MS calls "locale strings". The locale string
+ * "English_United_States.1252" is equivalent to the locale name "en_US".
+ * There are tables inside Windows that translate between the two forms,
+ * but they are not exposed. Also calling setlocale(), then calling
+ * GetThreadLocale() doesn't work, as the former doesn't change the
+ * latter's return. Therefore we are stuck using the mechanisms below. */
/* Almost all the items will have ASCII return values. Set that here, and
* override if necessary */
utf8ness_t is_utf8 = UTF8NESS_IMMATERIAL;
+ const char * retval = NULL;
switch (item) {
default:
Newx(floatbuf, initial_size, char);
+# if defined(WE_MUST_DEAL_WITH_MISMATCHED_CTYPE)
+ const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
+# endif
+
+ const char * orig_NUMERIC_locale = toggle_locale_c(LC_NUMERIC,
+ locale);
/* 1.5 is exactly representable on binary computers */
Size_t needed_size = snprintf(floatbuf, initial_size, "%.1f", 1.5);
needed_size = new_needed;
}
+ restore_toggled_locale_c(LC_NUMERIC, orig_NUMERIC_locale);
+
+# if defined(WE_MUST_DEAL_WITH_MISMATCHED_CTYPE)
+ restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
+# endif
+
char * s = floatbuf;
char * e = floatbuf + needed_size;
# endif
# ifdef HAS_LOCALECONV
- /* These items are available from localeconv(). (To avoid using
- * TS_W32_BROKEN_LOCALECONV, one could use GetNumberFormat and
- * GetCurrencyFormat; patches welcome) */
-
-# define P_CS_PRECEDES "p_cs_precedes"
-# define CURRENCY_SYMBOL "currency_symbol"
+ /* These items are available from localeconv(). */
/* case RADIXCHAR: // May drop down to here in some configurations */
case THOUSEP:
* with exactly both fields. Delete this one, leaving just the
* CRNCYSTR one in the hash */
SV* precedes = hv_delete(result_hv,
- P_CS_PRECEDES, STRLENs(P_CS_PRECEDES),
+ P_CS_PRECEDES_LITERAL,
+ STRLENs(P_CS_PRECEDES_LITERAL),
0);
if (! precedes) {
locale_panic_("my_localeconv() unexpectedly didn't return"
- " a value for " P_CS_PRECEDES);
+ " a value for " P_CS_PRECEDES_LITERAL);
}
/* The modification is to prefix the localeconv() return with a
* single byte, calculated as follows: */
- char prefix = (LIKELY(SvIV(precedes) != -1))
- ? ((precedes != 0) ? '-' : '+')
-
- /* khw couldn't find any documentation that
- * CHAR_MAX (which we modify to -1) is the signal,
- * but cygwin uses it thusly, and it makes sense
- * given that CHAR_MAX indicates the value isn't
- * used, so it neither precedes nor succeeds */
- : '.';
+ const char * prefix = (LIKELY(SvIV(precedes) != -1))
+ ? ((precedes != 0) ? "-" : "+")
+ : ".";
+ /* (khw couldn't find any documentation that the dot is signalled
+ * by CHAR_MAX (which we modify to -1), but cygwin uses it thusly,
+ * and it makes sense given that CHAR_MAX indicates the value isn't
+ * used, so it neither precedes nor succeeds) */
/* Now get CRNCYSTR */
(void) hv_iterinit(result_hv);
string = hv_iterval(result_hv, entry);
/* And perform the modification */
- Perl_sv_setpvf(aTHX_ string, "%c%s", prefix, SvPV_nolen(string));
+ sv_insert(string, 0, 0, prefix, 1);
}
/* Here, 'string' contains the value we want to return */
retval = save_to_buffer(SvPV_nolen(string), retbufp, retbuf_sizep);
if (utf8ness) {
- is_utf8 = (SvUTF8(string))
- ? UTF8NESS_YES
- : (is_utf8_invariant_string( (U8 *) retval,
- strlen(retval)))
- ? UTF8NESS_IMMATERIAL
- : UTF8NESS_NO;
+ is_utf8 = get_locale_string_utf8ness_i(retval,
+ LOCALE_UTF8NESS_UNKNOWN,
+ locale,
+ cat_index);
}
break;
* first day of the week. Since we're only getting one thing at a
* time, it all works */
struct tm mytm;
+
+ const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
+
ints_to_tm(&mytm, 30, 30, hour, mday, mon, 2011, 0, 0, 0);
char * temp;
if (utf8ness) {
temp = strftime_tm(format, &mytm);
}
+ restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
+
retval = save_to_buffer(temp, retbufp, retbuf_sizep);
Safefree(temp);
break;
}
+ /* If this happens to match our cached value */
+ if (PL_in_utf8_CTYPE_locale && strEQ(locale, PL_ctype_name)) {
+ retval = "UTF-8";
+ break;
+ }
+
# ifdef WIN32
- /* This function retrieves the code page. It is subject to change, but
- * is documented and has been stable for many releases */
- UINT ___lc_codepage_func(void);
+ const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
# ifndef WIN32_USE_FAKE_OLD_MINGW_LOCALES
+ /* This function retrieves the code page. It is subject to change, but
+ * is documented and has been stable for many releases */
retval = save_to_buffer(Perl_form(aTHX_ "%d", ___lc_codepage_func()),
retbufp, retbuf_sizep);
# else
retbufp, retbuf_sizep);
# endif
+ restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
+
DEBUG_Lv(PerlIO_printf(Perl_debug_log, "locale='%s' cp=%s\n",
locale, retval));
break;
* UTF-8 locale or not. If it is UTF-8, we (correctly) use that for
* the code set. */
-# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
+# ifdef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
- /* If libc mbtowc() evaluates the bytes that form the REPLACEMENT
- * CHARACTER as that Unicode code point, this has to be a UTF-8 locale.
- * */
- wchar_t wc = 0;
- (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0);/* Reset shift state */
- int mbtowc_ret = Perl_mbtowc_(aTHX_ &wc,
- STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
- if (mbtowc_ret >= 0 && wc == UNICODE_REPLACEMENT) {
- DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "mbtowc returned REPLACEMENT\n"));
+ if (is_locale_utf8(locale)) {
retval = "UTF-8";
break;
}
- /* Here, it isn't a UTF-8 locale. */
-
-# else /* mbtowc() is not available. The chances of this code getting
- compiled are very small, as it is a C99 required function,
- and we are now requiring C99; perhaps if it is a defective
- implementation. But if so, there are other libc functions
- that could be used instead. */
+# endif
- /* Sling together several possibilities, depending on platform
- * capabilities and what we found.
+ /* Here, the code set has not been found. The only other option khw
+ * could think of is to see if the codeset is part of the locale name.
+ * This is very less than ideal; often there is no code set in the
+ * name; and at other times they even lie.
*
- * For non-English locales or non-dollar currency locales, we likely
- * will find out whether a locale is UTF-8 or not */
-
- utf8ness_t is_utf8 = UTF8NESS_UNKNOWN;
- const char * scratch_buf = NULL;
-
-# if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
-# define LANGINFO_RECURSED_MONETARY 0x1
-# define LANGINFO_RECURSED_TIME 0x2
-
- /* Can't use this method unless localeconv() is available, as that's
- * the way we find out the currency symbol.
+ * But there is an XPG standard syntax, which many locales follow:
*
- * First try looking at the currency symbol (via a recursive call) to
- * see if it disambiguates things. Often that will be in the native
- * script, and if the symbol isn't legal UTF-8, we know that the locale
- * isn't either.
+ * language[_territory[.codeset]][@modifier]
*
- * The recursion calls my_localeconv() to find CRNCYSTR, and that can
- * call is_locale_utf8() which will call my_langinfo(CODESET) which
- * will get to here again, ad infinitum. The guard prevents that.
- */
- if ((PL_langinfo_recursed & LANGINFO_RECURSED_MONETARY) == 0) {
- PL_langinfo_recursed |= LANGINFO_RECURSED_MONETARY;
- (void) my_langinfo_c(CRNCYSTR, LC_MONETARY, locale, &scratch_buf,
- NULL, &is_utf8);
- PL_langinfo_recursed &= ~LANGINFO_RECURSED_MONETARY;
+ * So we take the part between the dot and any '@' */
+ retval = strchr(locale, '.');
+ if (! retval) {
+ retval = ""; /* Alas, no dot */
}
+ else {
- Safefree(scratch_buf);
+ /* Don't include the dot */
+ retval++;
+
+ /* And stop before any '@' */
+ const char * modifier = strchr(retval, '@');
+ if (modifier) {
+ char * code_set_name;
+ const Size_t name_len = modifier - retval;
+ Newx(code_set_name, name_len + 1, char); /* +1 for NUL */
+ my_strlcpy(code_set_name, retval, name_len + 1);
+ SAVEFREEPV(code_set_name);
+ retval = code_set_name;
+ }
-# endif
-# ifdef USE_LOCALE_TIME
+ /* The code set name is considered to be everything between the dot
+ * and the '@' */
+ retval = save_to_buffer(retval, retbufp, retbuf_sizep);
+ }
- /* If we have ruled out being UTF-8, no point in checking further. */
- if ( is_utf8 != UTF8NESS_NO
- && (PL_langinfo_recursed & LANGINFO_RECURSED_TIME) == 0)
- {
- /* But otherwise do check more. This is done even if the currency
- * symbol looks to be UTF-8, just in case that's a false positive.
- *
- * Look at the LC_TIME entries, like the names of the months or
- * weekdays. We quit at the first one that is illegal UTF-8
- *
- * The recursion guard is because the recursed my_langinfo_c() will
- * call strftime8() to find the LC_TIME value passed to it, and
- * that will call my_langinfo(CODESET) for non-ASCII returns,
- * which will get here again, ad infinitum
- */
+# ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
+
+ /* Here, 'retval' contains any codeset name derived from the locale
+ * name. That derived name may be empty or not necessarily indicative
+ * of the real codeset. But we can often determine if it should be
+ * UTF-8, regardless of what the name is. On most platforms, that
+ * determination is definitive, and was already done. But for this
+ * code to be compiled, this platform is not one of them. However,
+ * there are typically tools available to make a very good guess, and
+ * knowing the derived codeset name improves the quality of that guess.
+ * The following function overrides the derived codeset name when it
+ * guesses that it actually should be UTF-8. It could be inlined here,
+ * but was moved out of this switch() so as to make the switch()
+ * control flow easier to follow */
+ retval = S_override_codeset_if_utf8_found(aTHX_ retval, locale);
- utf8ness_t this_is_utf8 = UTF8NESS_UNKNOWN;
- const int times[] = {
- DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7,
- MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8,
- MON_9, MON_10, MON_11, MON_12,
- ALT_DIGITS, AM_STR, PM_STR,
- ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6,
- ABDAY_7,
- ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6,
- ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12
- };
-
- /* The code in the recursive call can handle switching the locales,
- * but by doing it here, we avoid switching each iteration of the
- * loop */
- const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
+# endif
- PL_langinfo_recursed |= LANGINFO_RECURSED_TIME;
- for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(times); i++) {
- scratch_buf = NULL;
- (void) my_langinfo_c(times[i], LC_TIME, locale, &scratch_buf,
- NULL, &this_is_utf8);
- Safefree(scratch_buf);
- if (this_is_utf8 == UTF8NESS_NO) {
- is_utf8 = UTF8NESS_NO;
- break;
- }
+ break; /* All done */
- if (this_is_utf8 == UTF8NESS_YES) {
- is_utf8 = UTF8NESS_YES;
- }
- }
- PL_langinfo_recursed &= ~LANGINFO_RECURSED_TIME;
+# endif /* ! WIN32 */
+# endif /* USE_LOCALE_CTYPE */
- /* Here we have gone through all the LC_TIME elements. is_utf8 has
- * been set as follows:
- * UTF8NESS_NO If at least one isn't legal UTF-8
- * UTF8NESS_IMMMATERIAL If all are ASCII
- * UTF8NESS_YES If all are legal UTF-8 (including
- * ASCII), and at least one isn't
- * ASCII. */
+ } /* Giant switch() of nl_langinfo() items */
- restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
- }
+ if (utf8ness) {
+ *utf8ness = is_utf8;
+ }
-# endif /* LC_TIME */
+ return retval;
- /* If nothing examined above rules out it being UTF-8, and at least one
- * thing fits as UTF-8 (and not plain ASCII), assume the codeset is
- * UTF-8. */
- if (is_utf8 == UTF8NESS_YES) {
- retval = "UTF-8";
- break;
+# endif /* All the implementations of my_langinfo() */
+
+/*--------------------------------------------------------------------------*/
+
+} /* my_langinfo() */
+
+# ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
+
+STATIC const char *
+S_override_codeset_if_utf8_found(pTHX_ const char * codeset,
+ const char * locale)
+{
+# define NAME_INDICATES_UTF8 0x1
+# define MB_CUR_MAX_SUGGESTS_UTF8 0x2
+
+ /* Override 'codeset' with UTF-8 if this routine guesses that it should be.
+ * Conversely (but rarely), "UTF-8" in the locale name might be wrong. We
+ * return "" as the code set name if we find that to be the case. */
+
+ unsigned int lean_towards_being_utf8 = 0;
+ if (is_codeset_name_UTF8(codeset)) {
+ lean_towards_being_utf8 |= NAME_INDICATES_UTF8;
+ }
+
+ /* For this portion of the file to compile, some C99 functions aren't
+ * available to us, even though we now require C99. So, something must be
+ * wrong with them. The code here should be good enough to work around
+ * this issue, but should the need arise, comments in S_is_locale_utf8()
+ * list some alternative C99 functions that could be tried.
+ *
+ * But MB_CUR_MAX is a C89 construct that helps a lot, is simple for a
+ * vendor to implement, and our experience with it is that it works well on
+ * a variety of platforms. We have found that it returns a too-large
+ * number on some platforms for the C locale, but for no others. That
+ * locale was already ruled out in the code that called this function. (If
+ * MB_CUR_MAX returned too small a number, that would break a lot of
+ * things, and likely would be quickly corrected by the vendor.) khw has
+ * some confidence that it doesn't return >1 when 1 is meant, as that would
+ * trigger a Perl warning, and we've had no reports of invalid occurrences
+ * of such. */
+# ifdef MB_CUR_MAX
+
+ /* If there are fewer bytes available in this locale than are required to
+ * represent the largest legal UTF-8 code point, this definitely isn't a
+ * UTF-8 locale, even if the locale name says it is. */
+ const int mb_cur_max = MB_CUR_MAX;
+ if (mb_cur_max < (int) UNISKIP(PERL_UNICODE_MAX)) {
+ if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
+ return ""; /* The name is wrong; override */
}
- /* Here, nothing examined indicates that the codeset is UTF-8. But
- * what is it? The other locale categories are not likely to be of
- * further help:
- *
- * LC_NUMERIC Only a few locales in the world have a non-ASCII radix
- * or group separator.
- * LC_CTYPE This code wouldn't be compiled if mbtowc() existed and
- * was reliable. This is unlikely in C99. There are
- * other functions that could be used instead, but are
- * they going to exist, and be able to distinguish between
- * UTF-8 and 8859-1? Deal with this only if it becomes
- * necessary.
- * LC_MESSAGES The strings returned from strerror() would seem likely
- * candidates, but experience has shown that many systems
- * don't actually have translations installed for them.
- * They are instead always in English, so everything in
- * them is ASCII, which is of no help to us. A Configure
- * probe could possibly be written to see if this platform
- * has non-ASCII error messages. But again, wait until it
- * turns out to be an actual problem. */
-
-# endif /* ! mbtowc() */
-
- /* Rejoin the mbtowc available/not-available cases.
- *
- * We got here only because we haven't been able to find the codeset.
- * The only other option khw could think of is to see if the codeset is
- * part of the locale name. This is very less than ideal; often there
- * is no code set in the name; and at other times they even lie.
- *
- * But there is an XPG standard syntax, which many locales follow:
- *
- * language[_territory[.codeset]][@modifier]
+ return codeset;
+ }
+
+ /* But if the locale could be UTF-8, and also the name corroborates this,
+ * assume it is so */
+ if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
+ return codeset;
+ }
+
+ /* Here, the name doesn't indicate UTF-8, but MB_CUR_MAX indicates it could
+ * be. khw knows of only two other locales in the world, EUC-TW and GB
+ * 18030, that legitimately require this many bytes (4). In both, the
+ * single byte characters are the same as ASCII. No multi-byte character
+ * in EUC-TW is legal UTF-8 (since the first byte of each is a
+ * continuation). GB 18030 has no three byte sequences, and none of the
+ * four byte ones is legal UTF-8 (as the second byte for these is a
+ * non-continuation). But every legal UTF-8 two byte sequence is also
+ * legal in GB 18030, though none have the same meaning, and no Han code
+ * point expressed in UTF-8 is two byte. So the further tests below which
+ * look for native expressions of currency and time will not return two
+ * byte sequences, hence they will reliably rule out this locale as being
+ * UTF-8. So, if we get this far, the result is almost certainly UTF-8.
+ * But to be really sure, also check that there is no illegal UTF-8. */
+ lean_towards_being_utf8 |= MB_CUR_MAX_SUGGESTS_UTF8;
+
+# endif /* has MB_CUR_MAX */
+
+ /* Here, MB_CUR_MAX is not available, or was inconclusive. What we do is
+ * to look at various strings associated with the locale:
+ * 1) If any are illegal UTF-8, the locale can't be UTF-8.
+ * 2) If all are legal UTF-8, and some non-ASCII characters are present,
+ * it is likely to be UTF-8, because of the strictness of UTF-8
+ * syntax. So assume it is UTF-8
+ * 3) If all are ASCII and the locale name and/or MB_CUR_MAX indicate
+ * UTF-8, assume the locale is UTF-8.
+ * 4) Otherwise, assume the locale isn't UTF-8
+ *
+ * To save cycles, if the locale name indicates it is a UTF-8 locale, we
+ * stop looking at the first instance with legal non-ASCII UTF-8. It is
+ * very unlikely this combination is coincidental. */
+
+ utf8ness_t strings_utf8ness = UTF8NESS_UNKNOWN;
+ char * scratch_buf = NULL;
+ Size_t scratch_buf_size = 0;
+
+ /* List of strings to look at */
+ const int trials[] = {
+
+# if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
+
+ /* The first string tried is the locale currency name. Often that will
+ * be in the native script.
*
- * So we take the part between the dot and any '@' */
- retval = (const char *) strchr(locale, '.');
- if (! retval) {
- retval = ""; /* Alas, no dot */
- break;
- }
+ * But this is usable only if localeconv() is available, as that's the
+ * way we find out the currency symbol. */
- /* Don't include the dot */
- retval++;
-
- /* And stop before any '@' */
- const char * modifier = strchr(retval, '@');
- if (modifier) {
- char * code_set_name;
- const Size_t name_len = modifier - retval;
- Newx(code_set_name, name_len + 1, char); /* +1 for NUL */
- my_strlcpy(code_set_name, retval, name_len + 1);
- SAVEFREEPV(code_set_name);
- retval = code_set_name;
- }
+ CRNCYSTR,
-# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
+# endif
+# ifdef USE_LOCALE_TIME
- /* When these functions, are available, they were tried earlier and
- * indicated that the locale did not act like a proper UTF-8 one. So
- * if it claims to be UTF-8, it is a lie */
- if (is_codeset_name_UTF8(retval)) {
- retval = "";
- break;
- }
+ /* We can also try various strings associated with LC_TIME, like the names
+ * of months or days of the week */
-# endif
+ DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7,
+ MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8,
+ MON_9, MON_10, MON_11, MON_12,
+ ALT_DIGITS, AM_STR, PM_STR,
+ ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6, ABDAY_7,
+ ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6,
+ ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12
- /* Otherwise the code set name is considered to be everything between
- * the dot and the '@' */
- retval = save_to_buffer(retval, retbufp, retbuf_sizep);
+# endif
- break;
+ };
-# endif /* ! WIN32 */
-# endif /* USE_LOCALE_CTYPE */
+# ifdef USE_LOCALE_TIME
- } /* Giant switch() of nl_langinfo() items */
+ /* The code in the recursive call below can handle switching the locales,
+ * but by doing it now here, that code will check and discover that there
+ * is no need to switch then restore, avoiding those each loop iteration */
+ const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
- restore_toggled_locale_i(cat_index, orig_switched_locale);
+# endif
+
+ /* The trials array may consist of strings from two different locale
+ * categories. The call to my_langinfo_i() below needs to pass the proper
+ * category for each string. There is a max of 1 trial for LC_MONETARY;
+ * the rest are LC_TIME. So the array is arranged so the LC_MONETARY item
+ * (if any) is first, and all subsequent iterations will use LC_TIME.
+ * These #ifdefs set up the values for all possible combinations. */
+# if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
+
+ locale_category_index cat_index = LC_MONETARY_INDEX_;
+
+# ifdef USE_LOCALE_TIME
+
+ const locale_category_index follow_on_cat_index = LC_TIME_INDEX_;
+ assert(trials[1] == DAY_1); /* Make sure only a single non-time entry */
+
+# else
+
+ /* Effectively out-of-bounds, as there is only the monetary entry */
+ const locale_category_index follow_on_cat_index = LC_ALL_INDEX_;
+
+# endif
+# elif defined(USE_LOCALE_TIME)
+
+ locale_category_index cat_index = LC_TIME_INDEX_;
+ const locale_category_index follow_on_cat_index = LC_TIME_INDEX_;
+
+# else
+
+ /* Effectively out-of-bounds, as here there are no trial entries at all.
+ * This allows this code to compile, but there are no strings to test, and
+ * so the answer will always be non-UTF-8. */
+ locale_category_index cat_index = LC_ALL_INDEX_;
+ const locale_category_index follow_on_cat_index = LC_ALL_INDEX_;
-# ifdef USE_LOCALE_CTYPE
- restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
# endif
- if (utf8ness) {
- *utf8ness = is_utf8;
+ /* Everything set up; look through all the strings */
+ for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(trials); i++) {
+ (void) my_langinfo_i(trials[i], cat_index, locale,
+ &scratch_buf, &scratch_buf_size, NULL);
+ cat_index = follow_on_cat_index;
+
+ /* To prevent infinite recursive calls, we don't ask for the UTF-8ness
+ * of the string (in 'trials[i]') above. Instead we examine the
+ * returned string here */
+ const Size_t len = strlen(scratch_buf);
+ const U8 * first_variant;
+
+ /* If the string is identical whether or not it is encoded as UTF-8, it
+ * isn't helpful in determining UTF8ness. */
+ if (is_utf8_invariant_string_loc((U8 *) scratch_buf, len,
+ &first_variant))
+ {
+ continue;
+ }
+
+ /* Here, has non-ASCII. If not legal UTF-8, isn't a UTF-8 locale */
+ if (! is_utf8_string(first_variant,
+ len - (first_variant - (U8 *) scratch_buf)))
+ {
+ strings_utf8ness = UTF8NESS_NO;
+ break;
+ }
+
+ /* Here, is a legal non-ASCII UTF-8 string; tentatively set the return
+ * to YES; possibly overridden by later iterations */
+ strings_utf8ness = UTF8NESS_YES;
+
+ /* But if this corroborates our expectation, quit now */
+ if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
+ break;
+ }
}
- return retval;
+# ifdef USE_LOCALE_TIME
-# endif /* All the implementations of my_langinfo() */
+ restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
-/*--------------------------------------------------------------------------*/
+# endif
-} /* my_langinfo() */
+ Safefree(scratch_buf);
+ scratch_buf = NULL;
+
+ if (strings_utf8ness == UTF8NESS_NO) {
+ return codeset; /* No override */
+ }
+
+ /* Here all tested strings are legal UTF-8.
+ *
+ * Above we set UTF8NESS_YES if any string wasn't ASCII. But even if they
+ * are all ascii, and the locale name indicates it is a UTF-8 locale,
+ * assume the locale is UTF-8. */
+ if (lean_towards_being_utf8) {
+ strings_utf8ness = UTF8NESS_YES;
+ }
+
+ if (strings_utf8ness == UTF8NESS_YES) {
+ return "UTF-8";
+ }
+
+ /* Here, nothing examined indicates that the codeset is or isn't UTF-8.
+ * But what is it? The other locale categories are not likely to be of
+ * further help:
+ *
+ * LC_NUMERIC Only a few locales in the world have a non-ASCII radix or
+ * group separator.
+ * LC_CTYPE This code wouldn't be compiled if mbtowc() existed and was
+ * reliable. This is unlikely in C99. There are other
+ * functions that could be used instead, but are they going to
+ * exist, and be able to distinguish between UTF-8 and 8859-1?
+ * Deal with this only if it becomes necessary.
+ * LC_MESSAGES The strings returned from strerror() would seem likely
+ * candidates, but experience has shown that many systems
+ * don't actually have translations installed for them. They
+ * are instead always in English, so everything in them is
+ * ASCII, which is of no help to us. A Configure probe could
+ * possibly be written to see if this platform has non-ASCII
+ * error messages. But again, wait until it turns out to be
+ * an actual problem.
+ *
+ * Things like YESSTR, NOSTR, might not be in ASCII, but need
+ * nl_langinfo() to access, which we don't have.
+ */
+
+ /* Otherwise, assume the locale isn't UTF-8. This can be wrong if we don't
+ * have MB_CUR_MAX, and the locale is English without UTF-8 in its name,
+ * and with a dollar currency symbol. */
+ return codeset; /* No override */
+}
+# endif /* ! HAS_DEFINITIVE_UTF8NESS_DETERMINATION */
#endif /* USE_LOCALE */
/*
S<C<struct tm>> parameter. C<sv_strftime_ints> takes a bunch of integer
parameters that together completely define a given time.
-C<my_strftime> is kept for backwards compatibility. Knowing if the result
+C<my_strftime> is kept for backwards compatibility. Knowing if its result
should be considered UTF-8 or not requires significant extra logic.
Note that C<yday> and C<wday> effectively are ignored by C<sv_strftime_ints>
#ifndef HAS_STRFTIME
Perl_croak(aTHX_ "panic: no strftime");
#else
-# if defined(USE_LOCALE_CTYPE) && defined(USE_LOCALE_TIME)
+# if defined(WE_MUST_DEAL_WITH_MISMATCHED_CTYPE) && defined(USE_LOCALE_TIME)
const char * orig_CTYPE_LOCALE = toggle_locale_c(LC_CTYPE,
querylocale_c(LC_TIME));
strftime_return:
-# if defined(USE_LOCALE_CTYPE) && defined(USE_LOCALE_TIME)
+# if defined(WE_MUST_DEAL_WITH_MISMATCHED_CTYPE) && defined(USE_LOCALE_TIME)
restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_LOCALE);
break;
case UTF8NESS_YES: /* Known to be UTF-8; must be UTF-8 locale if can't
- downgrade. But downgrading assumes the locale
- is latin 1. Maybe just fail XXX */
+ downgrade. */
if (! is_locale_utf8(locale)) {
locale_utf8ness = LOCALE_NOT_UTF8;
# else
- new_LC_ALL(locales, true);
-
+ new_LC_ALL(calculate_LC_ALL_string(locales,
+ INTERNAL_FORMAT,
+ WANT_TEMP_PV,
+ caller_line),
+ true);
# endif
+
}
STATIC void
} trials;
trials trial;
- SSize_t already_checked = 0;
+ unsigned int already_checked = 0;
const char * checked[C_trial];
# ifdef LC_ALL
}
/* And, for future iterations, indicate we've tried this locale */
+ assert(already_checked < C_ARRAY_LENGTH(checked));
checked[already_checked] = savepv(locale);
SAVEFREEPV(checked[already_checked]);
already_checked++;
const char * locale_to_restore_to = querylocale_i(cat_index);
DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "(%" LINE_Tf "): toggle_locale_i: index=%d(%s), wanted=%s,"
- " actual=%s\n",
- caller_line, cat_index, category_names[cat_index],
- new_locale, locale_to_restore_to));
+ "Entering toggle_locale_i: index=%d(%s)," \
+ " wanted=%s, actual=%s; called from %" LINE_Tf \
+ "\n", cat_index, category_names[cat_index],
+ new_locale, locale_to_restore_to, caller_line));
if (! locale_to_restore_to) {
locale_panic_via_(Perl_form(aTHX_
/* If the locales are the same, there's nothing to do */
if (strEQ(locale_to_restore_to, new_locale)) {
- DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "(%" LINE_Tf "): %s locale unchanged as %s\n",
- caller_line, category_names[cat_index],
- new_locale));
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n",
+ category_names[cat_index],
+ new_locale));
return NULL;
}
void_setlocale_i_with_caller(cat_index, new_locale, __FILE__, caller_line);
DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "(%" LINE_Tf "): %s locale switched to %s\n",
- caller_line, category_names[cat_index], new_locale));
+ "%s locale switched to %s\n",
+ category_names[cat_index], new_locale));
return locale_to_restore_to;
if (restore_locale == NULL) {
DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "(%" LINE_Tf "): No need to restore %s\n",
- caller_line, category_names[cat_index]));
+ "restore_toggled_locale_i: No need to" \
+ " restore %s; called from %" LINE_Tf "\n", \
+ category_names[cat_index], caller_line));
return;
}
DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "(%" LINE_Tf "): %s restoring locale to %s\n",
- caller_line, category_names[cat_index],
- restore_locale));
+ "restore_toggled_locale_i: restoring locale for" \
+ " %s to %s; called from %" LINE_Tf "\n", \
+ category_names[cat_index], restore_locale,
+ caller_line));
void_setlocale_i_with_caller(cat_index, restore_locale,
__FILE__, caller_line);
* per-thread variables:
* PL_controls_locale indicates if this thread is using
* per-thread locales or not
- * PL_cur_LC_ALL indicates what the the locale
- * should be if it is a per-thread
- * locale.
+ * PL_cur_LC_ALL indicates what the locale should be
+ * if it is a per-thread locale.
*/
if (UNLIKELY( PL_veto_switch_non_tTHX_context