3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * A Elbereth Gilthoniel,
13 * silivren penna míriel
14 * o menel aglar elenath!
15 * Na-chaered palan-díriel
16 * o galadhremmin ennorath,
17 * Fanuilos, le linnathon
18 * nef aear, si nef aearon!
20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
23 /* utility functions for handling locale-specific stuff like what
24 * character represents the decimal point.
26 * All C programs have an underlying locale. Perl code generally doesn't pay
27 * any attention to it except within the scope of a 'use locale'. For most
28 * categories, it accomplishes this by just using different operations if it is
29 * in such scope than if not. However, various libc functions called by Perl
30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31 * are used to toggle between the current locale and the C locale depending on
32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is
33 * switched to the C locale for outputting the message unless within the scope
36 * This code now has multi-thread-safe locale handling on systems that support
37 * that. This is completely transparent to most XS code. On earlier systems,
38 * it would be possible to emulate thread-safe locales, but this likely would
39 * involve a lot of locale switching, and would require XS code changes.
40 * Macros could be written so that the code wouldn't have to know which type of
41 * system is being used.
43 * Table-driven code is used for simplicity and clarity, as many operations
44 * differ only in which category is being worked on. However the system
45 * categories need not be small contiguous integers, so do not lend themselves
46 * to table lookup. Instead we have created our own equivalent values which
47 * are all small contiguous non-negative integers, and translation functions
48 * between the two sets. For category 'LC_foo', the name of our index is
49 * LC_foo_INDEX_. Various parallel tables, indexed by these, are used.
51 * Many of the macros and functions in this file have one of the suffixes '_c',
52 * '_r', or '_i'. khw found these useful in remembering what type of locale
53 * category to use as their parameter. '_r' takes an int category number as
54 * passed to setlocale(), like LC_ALL, LC_CTYPE, etc. The 'r' indicates that
55 * the value isn't known until runtime. '_c' also indicates such a category
56 * number, but its value is known at compile time. These are both converted
57 * into unsigned indexes into various tables of category information, where the
58 * real work is generally done. The tables are generated at compile-time based
59 * on platform characteristics and Configure options. They hide from the code
60 * many of the vagaries of the different locale implementations out there. You
61 * may have already guessed that '_i' indicates the parameter is such an
62 * unsigned index. Converting from '_r' to '_i' requires run-time lookup.
63 * '_c' is used to get cpp to do this at compile time. To avoid the runtime
64 * expense, the code is structured to use '_r' at the API level, and once
65 * converted, everything possible is done using the table indexes.
67 * On unthreaded perls, most operations expand out to just the basic
68 * setlocale() calls. The same is true on threaded perls on modern Windows
69 * systems where the same API, after set up, is used for thread-safe locale
70 * handling. On other systems, there is a completely different API, specified
71 * in POSIX 2008, to do thread-safe locales. On these systems, our
72 * emulate_setlocale_i() function is used to hide the different API from the
73 * outside. This makes it completely transparent to most XS code.
75 * A huge complicating factor is that the LC_NUMERIC category is normally held
76 * in the C locale, except during those relatively rare times when it needs to
77 * be in the underlying locale. There is a bunch of code to accomplish this,
78 * and to allow easy switches from one state to the other.
80 * z/OS (os390) is an outlier. Locales really don't work under threads when
81 * either the radix character isn't a dot, or attempts are made to change
82 * locales after the first thread is created. The reason is that IBM has made
83 * it thread-safe by refusing to change locales (returning failure if
84 * attempted) any time after an application has called pthread_create() to
85 * create another thread. The expectation is that an application will set up
86 * its locale information before the first fork, and be stable thereafter. But
87 * perl toggles LC_NUMERIC if the locale's radix character isn't a dot, as do
88 * the other toggles, which are less common.
91 /* If the environment says to, we can output debugging information during
92 * initialization. This is done before option parsing, and before any thread
93 * creation, so can be a file-level static. (Must come before #including
96 static int debug_initialization = 0;
97 # define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
98 # define DEBUG_LOCALE_INITIALIZATION_ debug_initialization
100 # define debug_initialization 0
101 # define DEBUG_INITIALIZATION_set(v)
104 #define DEBUG_PRE_STMTS dSAVE_ERRNO; \
105 PerlIO_printf(Perl_debug_log, "%s: %" LINE_Tf ": ", __FILE__, __LINE__);
106 #define DEBUG_POST_STMTS RESTORE_ERRNO;
109 #define PERL_IN_LOCALE_C
110 #include "perl_langinfo.h"
122 PERL_STATIC_INLINE const char *
123 S_mortalized_pv_copy(pTHX_ const char * const pv)
125 PERL_ARGS_ASSERT_MORTALIZED_PV_COPY;
127 /* Copies the input pv, and arranges for it to be freed at an unspecified
134 const char * copy = savepv(pv);
140 /* Returns the Unix errno portion; ignoring any others. This is a macro here
141 * instead of putting it into perl.h, because unclear to khw what should be
143 #define GET_ERRNO saved_errno
145 /* Default values come from the C locale */
146 static const char C_codeset[] = "ANSI_X3.4-1968";
147 static const char C_decimal_point[] = ".";
148 static const char C_thousands_sep[] = "";
150 /* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
151 * return of setlocale(), then this is extremely likely to be the C or POSIX
152 * locale. However, the output of setlocale() is documented to be opaque, but
153 * the odds are extremely small that it would return these two strings for some
154 * other locale. Note that VMS in these two locales includes many non-ASCII
155 * characters as controls and punctuation (below are hex bytes):
157 * punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
158 * Oddly, none there are listed as alphas, though some represent alphabetics
159 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
160 #define isNAME_C_OR_POSIX(name) \
162 && (( *(name) == 'C' && (*(name + 1)) == '\0') \
163 || strEQ((name), "POSIX")))
165 #if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO)
166 # define HAS_SOME_LANGINFO
168 #if defined(HAS_LOCALECONV) || defined(HAS_LOCALECONV_L)
169 # define HAS_SOME_LOCALECONV
172 #define my_langinfo_c(item, category, locale, retbufp, retbuf_sizep, utf8ness) \
173 my_langinfo_i(item, category##_INDEX_, locale, retbufp, \
174 retbuf_sizep, utf8ness)
178 /* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far
179 * looked up. This is in the form of a C string: */
181 # define UTF8NESS_SEP "\v"
182 # define UTF8NESS_PREFIX "\f"
184 /* So, the string looks like:
186 * \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0
188 * where the digit 0 after the \a indicates that the locale starting just
189 * after the preceding \v is not UTF-8, and the digit 1 mean it is. */
191 STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1);
192 STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1);
194 # define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \
195 UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0"
197 /* The cache is initialized to C_and_POSIX_utf8ness at start up. These are
198 * kept there always. The remining portion of the cache is LRU, with the
199 * oldest looked-up locale at the tail end */
202 # define setlocale_debug_string_c(category, locale, result) \
203 setlocale_debug_string_i(category##_INDEX_, locale, result)
204 # define setlocale_debug_string_r(category, locale, result) \
205 setlocale_debug_string_i(get_category_index(category, locale), \
209 # define toggle_locale_i(index, locale) \
210 S_toggle_locale_i(aTHX_ index, locale, __LINE__)
211 # define toggle_locale_c(cat, locale) toggle_locale_i(cat##_INDEX_, locale)
212 # define restore_toggled_locale_i(index, locale) \
213 S_restore_toggled_locale_i(aTHX_ index, locale, __LINE__)
214 # define restore_toggled_locale_c(cat, locale) \
215 restore_toggled_locale_i(cat##_INDEX_, locale)
217 /* Two parallel arrays indexed by our mapping of category numbers into small
218 * non-negative indexes; first the locale categories Perl uses on this system,
219 * used to do the inverse mapping. The second array is their names. These
220 * arrays are in mostly arbitrary order. */
222 STATIC const int categories[] = {
224 # ifdef USE_LOCALE_CTYPE
227 # ifdef USE_LOCALE_NUMERIC
230 # ifdef USE_LOCALE_COLLATE
233 # ifdef USE_LOCALE_TIME
236 # ifdef USE_LOCALE_MESSAGES
239 # ifdef USE_LOCALE_MONETARY
242 # ifdef USE_LOCALE_ADDRESS
245 # ifdef USE_LOCALE_IDENTIFICATION
248 # ifdef USE_LOCALE_MEASUREMENT
251 # ifdef USE_LOCALE_PAPER
254 # ifdef USE_LOCALE_TELEPHONE
257 # ifdef USE_LOCALE_SYNTAX
260 # ifdef USE_LOCALE_TOD
267 /* Placeholder as a precaution if code fails to check the return of
268 * get_category_index(), which returns this element to indicate an error */
272 /* The top-most real element is LC_ALL */
274 STATIC const char * const category_names[] = {
276 # ifdef USE_LOCALE_CTYPE
279 # ifdef USE_LOCALE_NUMERIC
282 # ifdef USE_LOCALE_COLLATE
285 # ifdef USE_LOCALE_TIME
288 # ifdef USE_LOCALE_MESSAGES
291 # ifdef USE_LOCALE_MONETARY
294 # ifdef USE_LOCALE_ADDRESS
297 # ifdef USE_LOCALE_IDENTIFICATION
300 # ifdef USE_LOCALE_MEASUREMENT
303 # ifdef USE_LOCALE_PAPER
306 # ifdef USE_LOCALE_TELEPHONE
309 # ifdef USE_LOCALE_SYNTAX
312 # ifdef USE_LOCALE_TOD
319 /* Placeholder as a precaution if code fails to check the return of
320 * get_category_index(), which returns this element to indicate an error */
324 /* A few categories require additional setup when they are changed. This table
325 * points to the functions that do that setup */
326 STATIC void (*update_functions[]) (pTHX_ const char *) = {
327 # ifdef USE_LOCALE_CTYPE
330 # ifdef USE_LOCALE_NUMERIC
333 # ifdef USE_LOCALE_COLLATE
336 # ifdef USE_LOCALE_TIME
339 # ifdef USE_LOCALE_MESSAGES
342 # ifdef USE_LOCALE_MONETARY
345 # ifdef USE_LOCALE_ADDRESS
348 # ifdef USE_LOCALE_IDENTIFICATION
351 # ifdef USE_LOCALE_MEASUREMENT
354 # ifdef USE_LOCALE_PAPER
357 # ifdef USE_LOCALE_TELEPHONE
360 # ifdef USE_LOCALE_SYNTAX
363 # ifdef USE_LOCALE_TOD
366 /* No harm done to have this even without an LC_ALL */
369 /* Placeholder as a precaution if code fails to check the return of
370 * get_category_index(), which returns this element to indicate an error */
376 /* On systems with LC_ALL, it is kept in the highest index position. (-2
377 * to account for the final unused placeholder element.) */
378 # define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
381 /* On systems without LC_ALL, we pretend it is there, one beyond the real
382 * top element, hence in the unused placeholder element. */
383 # define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
386 /* Pretending there is an LC_ALL element just above allows us to avoid most
387 * special cases. Most loops through these arrays in the code below are
388 * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'. They will work
389 * on either type of system. But the code must be written to not access the
390 * element at 'LC_ALL_INDEX_' except on platforms that have it. This can be
391 * checked for at compile time by using the #define LC_ALL_INDEX_ which is only
392 * defined if we do have LC_ALL. */
395 S_get_category_index(const int category, const char * locale)
397 /* Given a category, return the equivalent internal index we generally use
400 * 'locale' is for use in any generated diagnostics, and may be NULL
402 * Some sort of hash could be used instead of this loop, but the number of
403 * elements is so far at most 12 */
406 const char * conditional_warn_text = "; can't set it to ";
408 PERL_ARGS_ASSERT_GET_CATEGORY_INDEX;
411 for (i = 0; i <= LC_ALL_INDEX_; i++)
413 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)
416 if (category == categories[i]) {
418 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
419 "index of category %d (%s) is %d\n",
420 category, category_names[i], i));
425 /* Here, we don't know about this category, so can't handle it. */
429 conditional_warn_text = "";
432 /* diag_listed_as: Unknown locale category %d; can't set it to %s */
433 Perl_warner_nocontext(packWARN(WARN_LOCALE),
434 "Unknown locale category %d%s%s",
435 category, conditional_warn_text, locale);
439 SETERRNO(EINVAL, LIB_INVARG);
443 /* Return an out-of-bounds value */
444 return NOMINAL_LC_ALL_INDEX + 1;
448 S_category_name(const int category)
452 index = get_category_index(category, NULL);
454 if (index <= NOMINAL_LC_ALL_INDEX) {
455 return category_names[index];
458 return Perl_form_nocontext("%d (unknown)", category);
461 #endif /* ifdef USE_LOCALE */
464 Perl_force_locale_unlock()
467 #if defined(USE_LOCALE_THREADS)
470 # ifdef LOCALE_UNLOCK_
478 #ifdef USE_POSIX_2008_LOCALE
481 S_use_curlocale_scratch(pTHX)
483 /* This function is used to hide from the caller the case where the current
484 * locale_t object in POSIX 2008 is the global one, which is illegal in
485 * many of the P2008 API calls. This checks for that and, if necessary
486 * creates a proper P2008 object. Any prior object is deleted, as is any
487 * remaining object during global destruction. */
489 locale_t cur = uselocale((locale_t) 0);
491 if (cur != LC_GLOBAL_LOCALE) {
495 if (PL_scratch_locale_obj) {
496 freelocale(PL_scratch_locale_obj);
499 PL_scratch_locale_obj = duplocale(LC_GLOBAL_LOCALE);
500 return PL_scratch_locale_obj;
506 Perl_locale_panic(const char * msg,
507 const char * file_name,
513 PERL_ARGS_ASSERT_LOCALE_PANIC;
515 force_locale_unlock();
517 #ifdef USE_C_BACKTRACE
518 dump_c_backtrace(Perl_debug_log, 20, 1);
521 /* diag_listed_as: panic: %s */
522 Perl_croak(aTHX_ "%s: %d: panic: %s; errno=%d\n",
523 file_name, line, msg, errnum);
526 #define setlocale_failure_panic_c( \
527 cat, current, failed, caller_0_line, caller_1_line) \
528 setlocale_failure_panic_i(cat##_INDEX_, current, failed, \
529 caller_0_line, caller_1_line)
531 /* porcelain_setlocale() presents a consistent POSIX-compliant interface to
532 * setlocale(). Windows requres a customized base-level setlocale() */
534 # define porcelain_setlocale(cat, locale) win32_setlocale(cat, locale)
536 # define porcelain_setlocale(cat, locale) \
537 ((const char *) setlocale(cat, locale))
540 /* The next layer up is to catch vagaries and bugs in the libc setlocale return
543 # define stdized_setlocale(cat, locale) \
544 stdize_locale(cat, porcelain_setlocale(cat, locale), \
545 &PL_stdize_locale_buf, &PL_stdize_locale_bufsize, __LINE__)
547 # define stdized_setlocale(cat, locale) porcelain_setlocale(cat, locale)
550 /* The next many lines form a layer above the close-to-the-metal 'porcelain'
551 * and 'stdized' macros. They are used to present a uniform API to the rest of
552 * the code in this file in spite of the disparate underlying implementations.
555 #ifndef USE_POSIX_2008_LOCALE
557 /* For non-threaded perls (which we are not to use the POSIX 2008 API on), or a
558 * thread-safe Windows one in which threading is invisible to us, the added
559 * layer just calls the base-level functions. See the introductory comments in
560 * this file for the meaning of the suffixes '_c', '_r', '_i'. */
562 # define setlocale_r(cat, locale) stdized_setlocale(cat, locale)
563 # define setlocale_i(i, locale) setlocale_r(categories[i], locale)
564 # define setlocale_c(cat, locale) setlocale_r(cat, locale)
566 # define void_setlocale_i(i, locale) \
568 if (! porcelain_setlocale(categories[i], locale)) { \
569 setlocale_failure_panic_i(i, NULL, locale, __LINE__, 0); \
570 NOT_REACHED; /* NOTREACHED */ \
573 # define void_setlocale_c(cat, locale) \
574 void_setlocale_i(cat##_INDEX_, locale)
575 # define void_setlocale_r(cat, locale) \
576 void_setlocale_i(get_category_index(cat, locale), locale)
578 # define bool_setlocale_r(cat, locale) \
579 cBOOL(porcelain_setlocale(cat, locale))
580 # define bool_setlocale_i(i, locale) \
581 bool_setlocale_c(categories[i], locale)
582 # define bool_setlocale_c(cat, locale) bool_setlocale_r(cat, locale)
584 /* All the querylocale...() forms return a mortalized copy. If you need
585 * something stable across calls, you need to savepv() the result yourself */
587 # define querylocale_r(cat) mortalized_pv_copy(setlocale_r(cat, NULL))
588 # define querylocale_c(cat) querylocale_r(cat)
589 # define querylocale_i(i) querylocale_c(categories[i])
591 #else /* Below is defined(POSIX 2008) */
593 /* Here, there is a completely different API to get thread-safe locales. We
594 * emulate the setlocale() API with our own function(s). setlocale categories,
595 * like LC_NUMERIC, are not valid here for the POSIX 2008 API. Instead, there
596 * are equivalents, like LC_NUMERIC_MASK, which we use instead, converting to
597 * by using get_category_index() followed by table lookup. */
599 # define emulate_setlocale_c(cat, locale, recalc_LC_ALL, line) \
600 emulate_setlocale_i(cat##_INDEX_, locale, recalc_LC_ALL, line)
602 /* A wrapper for the macros below. */
603 # define common_emulate_setlocale(i, locale) \
604 emulate_setlocale_i(i, locale, YES_RECALC_LC_ALL, __LINE__)
606 # define setlocale_i(i, locale) common_emulate_setlocale(i, locale)
607 # define setlocale_c(cat, locale) setlocale_i(cat##_INDEX_, locale)
608 # define setlocale_r(cat, locale) \
609 setlocale_i(get_category_index(cat, locale), locale)
611 # define void_setlocale_i(i, locale) ((void) setlocale_i(i, locale))
612 # define void_setlocale_c(cat, locale) \
613 void_setlocale_i(cat##_INDEX_, locale)
614 # define void_setlocale_r(cat, locale) ((void) setlocale_r(cat, locale))
616 # define bool_setlocale_i(i, locale) cBOOL(setlocale_i(i, locale))
617 # define bool_setlocale_c(cat, locale) \
618 bool_setlocale_i(cat##_INDEX_, locale)
619 # define bool_setlocale_r(cat, locale) cBOOL(setlocale_r(cat, locale))
621 # define querylocale_i(i) mortalized_pv_copy(my_querylocale_i(i))
622 # define querylocale_c(cat) querylocale_i(cat##_INDEX_)
623 # define querylocale_r(cat) querylocale_i(get_category_index(cat,NULL))
625 # ifndef USE_QUERYLOCALE
626 # define USE_PL_CURLOCALES
628 # define isSINGLE_BIT_SET(mask) isPOWER_OF_2(mask)
630 /* This code used to think querylocale() was valid on LC_ALL. Make sure
631 * all instances of that have been removed */
632 # define QUERYLOCALE_ASSERT(index) \
633 __ASSERT_(isSINGLE_BIT_SET(category_masks[index]))
634 # if ! defined(HAS_QUERYLOCALE) && defined(_NL_LOCALE_NAME)
635 # define querylocale_l(index, locale_obj) \
636 (QUERYLOCALE_ASSERT(index) \
637 mortalized_pv_copy(nl_langinfo_l( \
638 _NL_LOCALE_NAME(categories[index]), locale_obj)))
640 # define querylocale_l(index, locale_obj) \
641 (QUERYLOCALE_ASSERT(index) \
642 mortalized_pv_copy(querylocale(category_masks[index], locale_obj)))
645 # if defined(__GLIBC__) && defined(USE_LOCALE_MESSAGES)
646 # define HAS_GLIBC_LC_MESSAGES_BUG
647 # include <libintl.h>
650 /* A fourth array, parallel to the ones above to map from category to its
652 STATIC const int category_masks[] = {
653 # ifdef USE_LOCALE_CTYPE
656 # ifdef USE_LOCALE_NUMERIC
659 # ifdef USE_LOCALE_COLLATE
662 # ifdef USE_LOCALE_TIME
665 # ifdef USE_LOCALE_MESSAGES
668 # ifdef USE_LOCALE_MONETARY
671 # ifdef USE_LOCALE_ADDRESS
674 # ifdef USE_LOCALE_IDENTIFICATION
675 LC_IDENTIFICATION_MASK,
677 # ifdef USE_LOCALE_MEASUREMENT
680 # ifdef USE_LOCALE_PAPER
683 # ifdef USE_LOCALE_TELEPHONE
686 # ifdef USE_LOCALE_SYNTAX
689 # ifdef USE_LOCALE_TOD
692 /* LC_ALL can't be turned off by a Configure
693 * option, and in Posix 2008, should always be
694 * here, so compile it in unconditionally.
695 * This could catch some glitches at compile
699 /* Placeholder as a precaution if code fails to check the return of
700 * get_category_index(), which returns this element to indicate an error */
704 # define my_querylocale_c(cat) my_querylocale_i(cat##_INDEX_)
707 S_my_querylocale_i(pTHX_ const unsigned int index)
709 /* This function returns the name of the locale category given by the input
710 * index into our parallel tables of them.
712 * POSIX 2008, for some sick reason, chose not to provide a method to find
713 * the category name of a locale, discarding a basic linguistic tenet that
714 * for any object, people will create a name for it. Some vendors have
715 * created a querylocale() function to do just that. This function is a
716 * lot simpler to implement on systems that have this. Otherwise, we have
717 * to keep track of what the locale has been set to, so that we can return
718 * its name so as to emulate setlocale(). It's also possible for C code in
719 * some library to change the locale without us knowing it, though as of
720 * September 2017, there are no occurrences in CPAN of uselocale(). Some
721 * libraries do use setlocale(), but that changes the global locale, and
722 * threads using per-thread locales will just ignore those changes. */
725 const locale_t cur_obj = uselocale((locale_t) 0);
728 PERL_ARGS_ASSERT_MY_QUERYLOCALE_I;
729 assert(index <= NOMINAL_LC_ALL_INDEX);
731 category = categories[index];
733 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "my_querylocale_i(%s) on %p\n",
734 category_names[index], cur_obj));
735 if (cur_obj == LC_GLOBAL_LOCALE) {
736 retval = porcelain_setlocale(category, NULL);
740 # ifdef USE_QUERYLOCALE
742 /* We don't currently keep records when there is querylocale(), so have
743 * to get it anew each time */
744 retval = (index == LC_ALL_INDEX_)
745 ? calculate_LC_ALL(cur_obj)
746 : querylocale_l(index, cur_obj);
750 /* But we do have up-to-date values when we keep our own records */
751 retval = PL_curlocales[index];
757 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
758 "my_querylocale_i(%s) returning '%s'\n",
759 category_names[index], retval));
763 # ifdef USE_PL_CURLOCALES
766 S_update_PL_curlocales_i(pTHX_
767 const unsigned int index,
768 const char * new_locale,
769 recalc_lc_all_t recalc_LC_ALL)
771 /* This is a helper function for emulate_setlocale_i(), mostly used to
772 * make that function easier to read. */
774 PERL_ARGS_ASSERT_UPDATE_PL_CURLOCALES_I;
775 assert(index <= NOMINAL_LC_ALL_INDEX);
777 if (index == LC_ALL_INDEX_) {
780 /* For LC_ALL, we change all individual categories to correspond */
781 /* PL_curlocales is a parallel array, so has same
782 * length as 'categories' */
783 for (i = 0; i < LC_ALL_INDEX_; i++) {
784 Safefree(PL_curlocales[i]);
785 PL_curlocales[i] = savepv(new_locale);
788 recalc_LC_ALL = YES_RECALC_LC_ALL;
792 /* Update the single category's record */
793 Safefree(PL_curlocales[index]);
794 PL_curlocales[index] = savepv(new_locale);
796 if (recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION) {
797 recalc_LC_ALL = (index == NOMINAL_LC_ALL_INDEX - 1)
799 : DONT_RECALC_LC_ALL;
803 if (recalc_LC_ALL == YES_RECALC_LC_ALL) {
804 Safefree(PL_curlocales[LC_ALL_INDEX_]);
805 PL_curlocales[LC_ALL_INDEX_] =
806 savepv(calculate_LC_ALL(PL_curlocales));
809 return PL_curlocales[index];
812 # endif /* Need PL_curlocales[] */
815 S_setlocale_from_aggregate_LC_ALL(pTHX_ const char * locale, const line_t line)
817 /* This function parses the value of the LC_ALL locale, assuming glibc
818 * syntax, and sets each individual category on the system to the proper
821 * This is likely to only ever be called from one place, so exists to make
822 * the calling function easier to read by moving this ancillary code out of
825 * The locale for each category is independent of the other categories.
826 * Often, they are all the same, but certainly not always. Perl, in fact,
827 * usually keeps LC_NUMERIC in the C locale, regardless of the underlying
828 * locale. LC_ALL has to be able to represent the case of when there are
829 * varying locales. Platforms have differing ways of representing this.
830 * Because of this, the code in this file goes to lengths to avoid the
831 * issue, generally looping over the component categories instead of
832 * referring to them in the aggregate, wherever possible. However, there
833 * are cases where we have to parse our own constructed aggregates, which use
834 * the glibc syntax. */
836 const char * locale_on_entry = querylocale_c(LC_ALL);
838 PERL_ARGS_ASSERT_SETLOCALE_FROM_AGGREGATE_LC_ALL;
840 /* If the string that gives what to set doesn't include all categories,
841 * the omitted ones get set to "C". To get this behavior, first set
842 * all the individual categories to "C", and override the furnished
843 * ones below. FALSE => No need to recalculate LC_ALL, as this is a
845 if (! emulate_setlocale_c(LC_ALL, "C", DONT_RECALC_LC_ALL, line)) {
846 setlocale_failure_panic_c(LC_ALL, locale_on_entry,
847 "C", __LINE__, line);
848 NOT_REACHED; /* NOTREACHED */
851 const char * s = locale;
852 const char * e = locale + strlen(locale);
856 /* Parse through the category */
857 while (isWORDCHAR(*p)) {
861 const char * category_end = p;
864 locale_panic_(Perl_form(aTHX_
865 "Unexpected character in locale category name '%02X", *(p-1)));
868 /* Parse through the locale name */
869 const char * name_start = p;
870 while (p < e && *p != ';') {
872 locale_panic_(Perl_form(aTHX_
873 "Unexpected character in locale name '%02X", *p));
878 const char * name_end = p;
880 /* Space past the semi-colon */
885 /* Find the index of the category name in our lists */
886 for (PERL_UINT_FAST8_T i = 0; i < LC_ALL_INDEX_; i++) {
888 /* Keep going if this index doesn't point to the category being
889 * parsed. The strnNE() avoids a Perl_form(), but would fail if
890 * ever a category name could be a substring of another one, e.g.,
891 * if there were a "LC_TIME_DATE" */
892 if strnNE(s, category_names[i], category_end - s) {
896 /* Here i points to the category being parsed. Now isolate the
897 * locale it is being changed to */
898 const char * individ_locale = Perl_form(aTHX_ "%.*s",
899 (int) (name_end - name_start), name_start);
901 /* And do the change. FALSE => Don't recalculate LC_ALL; we'll do
902 * it ourselves after the loop */
903 if (! emulate_setlocale_i(i, individ_locale,
904 DONT_RECALC_LC_ALL, line))
907 /* But if we have to back out, do fix up LC_ALL */
908 if (! emulate_setlocale_c(LC_ALL, locale_on_entry,
909 YES_RECALC_LC_ALL, line))
911 setlocale_failure_panic_i(i, individ_locale,
912 locale, __LINE__, line);
913 NOT_REACHED; /* NOTREACHED */
916 /* Reverting to the entry value succeeded, but the operation
917 * failed to go to the requested locale. */
921 /* Found and handled the desired category. Quit the inner loop to
922 * try the next category */
926 /* Finished with this category; iterate to the next one in the input */
930 # ifdef USE_PL_CURLOCALES
932 /* Here we have set all the individual categories. Update the LC_ALL entry
933 * as well. We can't just use the input 'locale' as the value may omit
934 * categories whose locale is 'C'. khw thinks it's better to store a
935 * complete LC_ALL. So calculate it. */
936 const char * retval = savepv(calculate_LC_ALL(PL_curlocales));
937 Safefree(PL_curlocales[LC_ALL_INDEX_]);
938 PL_curlocales[LC_ALL_INDEX_] = retval;
942 const char * retval = querylocale_c(LC_ALL);
949 # ifndef USE_QUERYLOCALE
952 S_find_locale_from_environment(pTHX_ const unsigned int index)
954 /* On systems without querylocale(), it is problematic getting the results
955 * of the POSIX 2008 equivalent of setlocale(category, "") (which gets the
956 * locale from the environment).
958 * To ensure that we know exactly what those values are, we do the setting
959 * ourselves, using the documented algorithm (assuming the documentation is
960 * correct) rather than use "" as the locale. This will lead to results
961 * that differ from native behavior if the native behavior differs from the
962 * standard documented value, but khw believes it is better to know what's
963 * going on, even if different from native, than to just guess.
965 * Another option would be, in a critical section, to save the global
966 * locale's current value, and do a straight setlocale(LC_ALL, ""). That
967 * would return our desired values, destroying the global locale's, which
968 * we would then restore. But that could cause races with any other thread
969 * that is using the global locale and isn't using the mutex. And, the
970 * only reason someone would have done that is because they are calling a
971 * library function, like in gtk, that calls setlocale(), and which can't
972 * be changed to use the mutex. That wouldn't be a problem if this were to
973 * be done before any threads had switched, say during perl construction
974 * time. But this code would still be needed for the general case. */
976 const char * default_name;
978 const char * locale_names[LC_ALL_INDEX_];
980 /* We rely on PerlEnv_getenv() returning a mortalized copy */
981 const char * const lc_all = PerlEnv_getenv("LC_ALL");
983 /* Use any "LC_ALL" environment variable, as it overrides everything
985 if (lc_all && strNE(lc_all, "")) {
989 /* Otherwise, we need to dig deeper. Unless overridden, the default is
990 * the LANG environment variable; "C" if it doesn't exist. */
991 default_name = PerlEnv_getenv("LANG");
992 if (! default_name || strEQ(default_name, "")) {
996 /* If setting an individual category, use its corresponding value found in
997 * the environment, if any; otherwise use the default we already
999 if (index != LC_ALL_INDEX_) {
1000 const char * const new_value = PerlEnv_getenv(category_names[index]);
1002 return (new_value && strNE(new_value, ""))
1007 /* Here, we are getting LC_ALL. Any categories that don't have a
1008 * corresponding environment variable set should be set to 'default_name'
1010 * Simply find the values for all categories, and call the function to
1011 * compute LC_ALL. */
1012 for (i = 0; i < LC_ALL_INDEX_; i++) {
1013 const char * const env_override = PerlEnv_getenv(category_names[i]);
1015 locale_names[i] = (env_override && strNE(env_override, ""))
1019 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1020 "find_locale_from_environment i=%d, name=%s, locale=%s\n",
1021 i, category_names[i], locale_names[i]));
1024 return calculate_LC_ALL(locale_names);
1030 S_emulate_setlocale_i(pTHX_
1032 /* Our internal index of the 'category' setlocale is
1034 const unsigned int index,
1036 const char * new_locale, /* The locale to set the category to */
1037 const recalc_lc_all_t recalc_LC_ALL, /* Explained below */
1038 const line_t line /* Called from this line number */
1041 PERL_ARGS_ASSERT_EMULATE_SETLOCALE_I;
1042 assert(index <= NOMINAL_LC_ALL_INDEX);
1044 /* This function effectively performs a setlocale() on just the current
1045 * thread; thus it is thread-safe. It does this by using the POSIX 2008
1046 * locale functions to emulate the behavior of setlocale(). Similar to
1047 * regular setlocale(), the return from this function points to memory that
1048 * can be overwritten by other system calls, so needs to be copied
1049 * immediately if you need to retain it. The difference here is that
1050 * system calls besides another setlocale() can overwrite it.
1052 * By doing this, most locale-sensitive functions become thread-safe. The
1053 * exceptions are mostly those that return a pointer to static memory.
1055 * This function may be called in a tight loop that iterates over all
1056 * categories. Because LC_ALL is not a "real" category, but merely the sum
1057 * of all the other ones, such loops don't include LC_ALL. On systems that
1058 * have querylocale() or similar, the current LC_ALL value is immediately
1059 * retrievable; on systems lacking that feature, we have to keep track of
1060 * LC_ALL ourselves. We could do that on each iteration, only to throw it
1061 * away on the next, but the calculation is more than a trivial amount of
1062 * work. Instead, the 'recalc_LC_ALL' parameter is set to
1063 * RECALCULATE_LC_ALL_ON_FINAL_INTERATION to only do the calculation once.
1064 * This function calls itself recursively in such a loop.
1066 * When not in such a loop, the parameter is set to the other enum values
1067 * DONT_RECALC_LC_ALL or YES_RECALC_LC_ALL. */
1069 int mask = category_masks[index];
1070 const locale_t entry_obj = uselocale((locale_t) 0);
1071 const char * locale_on_entry = querylocale_i(index);
1073 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1074 "emulate_setlocale_i input=%d (%s), mask=0x%x,"
1075 " new locale=\"%s\", current locale=\"%s\","
1076 "index=%d, object=%p\n",
1077 categories[index], category_name(categories[index]), mask,
1078 ((new_locale == NULL) ? "(nil)" : new_locale),
1079 locale_on_entry, index, entry_obj));
1081 /* Return the already-calculated info if just querying what the existing
1083 if (new_locale == NULL) {
1084 return locale_on_entry;
1087 /* Here, trying to change the locale, but it is a no-op if the new boss is
1088 * the same as the old boss. Except this routine is called when converting
1089 * from the global locale, so in that case we will create a per-thread
1090 * locale below (with the current values). Bitter experience also
1091 * indicates that newlocale() can free up the basis locale memory if we
1092 * call it with the new and old being the same. */
1093 if ( entry_obj != LC_GLOBAL_LOCALE
1095 && strEQ(new_locale, locale_on_entry))
1097 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1098 "(%" LINE_Tf "): emulate_setlocale_i"
1099 " no-op to change to what it already was\n",
1102 # ifdef USE_PL_CURLOCALES
1104 /* On the final iteration of a loop that needs to recalculate LC_ALL, do
1105 * so. If no iteration changed anything, LC_ALL also doesn't change,
1106 * but khw believes the complexity needed to keep track of that isn't
1108 if (UNLIKELY( recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION
1109 && index == NOMINAL_LC_ALL_INDEX - 1))
1111 Safefree(PL_curlocales[LC_ALL_INDEX_]);
1112 PL_curlocales[LC_ALL_INDEX_] =
1113 savepv(calculate_LC_ALL(PL_curlocales));
1118 return locale_on_entry;
1121 # ifndef USE_QUERYLOCALE
1123 /* Without a querylocale() mechanism, we have to figure out ourselves what
1124 * happens with setting a locale to "" */
1125 if (strEQ(new_locale, "")) {
1126 new_locale = find_locale_from_environment(index);
1131 /* So far, it has worked that a semi-colon in the locale name means that
1132 * the category is LC_ALL and it subsumes categories which don't all have
1133 * the same locale. This is the glibc syntax. */
1134 if (strchr(new_locale, ';')) {
1135 assert(index == LC_ALL_INDEX_);
1136 return setlocale_from_aggregate_LC_ALL(new_locale, line);
1139 # ifdef HAS_GLIBC_LC_MESSAGES_BUG
1141 /* For this bug, if the LC_MESSAGES locale changes, we have to do an
1142 * expensive workaround. Save the current value so we can later determine
1144 const char * old_messages_locale = NULL;
1145 if ( (index == LC_MESSAGES_INDEX_ || index == LC_ALL_INDEX_)
1146 && LIKELY(PL_phase != PERL_PHASE_CONSTRUCT))
1148 old_messages_locale = querylocale_c(LC_MESSAGES);
1153 assert(PL_C_locale_obj);
1155 /* Now ready to switch to the input 'new_locale' */
1157 /* Switching locales generally entails freeing the current one's space (at
1158 * the C library's discretion), hence we can't be using that locale at the
1159 * time of the switch (this wasn't obvious to khw from the man pages). So
1160 * switch to a known locale object that we don't otherwise mess with. */
1161 if (! uselocale(PL_C_locale_obj)) {
1163 /* Not being able to change to the C locale is severe; don't keep
1165 setlocale_failure_panic_i(index, locale_on_entry, "C", __LINE__, line);
1166 NOT_REACHED; /* NOTREACHED */
1169 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1170 "(%" LINE_Tf "): emulate_setlocale_i now using C"
1171 " object=%p\n", line, PL_C_locale_obj));
1175 /* We created a (never changing) object at start-up for LC_ALL being in the
1176 * C locale. If this call is to switch to LC_ALL=>C, simply use that
1177 * object. But in fact, we already have switched to it just above, in
1178 * preparation for the general case. Since we're already there, no need to
1179 * do further switching. */
1180 if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(new_locale)) {
1181 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "(%" LINE_Tf "):"
1182 " emulate_setlocale_i will stay"
1183 " in C object\n", line));
1184 new_obj = PL_C_locale_obj;
1186 /* And free the old object if it isn't a special one */
1187 if (entry_obj != LC_GLOBAL_LOCALE && entry_obj != PL_C_locale_obj) {
1188 freelocale(entry_obj);
1191 else { /* Here is the general case, not to LC_ALL=>C */
1192 locale_t basis_obj = entry_obj;
1194 /* Specially handle two objects */
1195 if (basis_obj == LC_GLOBAL_LOCALE || basis_obj == PL_C_locale_obj) {
1197 /* For these two objects, we make duplicates to hand to newlocale()
1198 * below. For LC_GLOBAL_LOCALE, this is because newlocale()
1199 * doesn't necessarily accept it as input (the results are
1200 * undefined). For PL_C_locale_obj, it is so that it never gets
1201 * modified, as otherwise newlocale() is free to do so */
1202 basis_obj = duplocale(entry_obj);
1204 locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): duplocale failed",
1206 NOT_REACHED; /* NOTREACHED */
1209 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1210 "(%" LINE_Tf "): emulate_setlocale_i"
1211 " created %p by duping the input\n",
1215 /* Ready to create a new locale by modification of the exising one */
1216 new_obj = newlocale(mask, new_locale, basis_obj);
1219 DEBUG_L(PerlIO_printf(Perl_debug_log,
1220 " (%" LINE_Tf "): emulate_setlocale_i"
1221 " creating new object from %p failed:"
1223 line, basis_obj, GET_ERRNO));
1225 /* Failed. Likely this is because the proposed new locale isn't
1226 * valid on this system. But we earlier switched to the LC_ALL=>C
1227 * locale in anticipation of it succeeding, Now have to switch
1228 * back to the state upon entry */
1229 if (! uselocale(entry_obj)) {
1230 setlocale_failure_panic_i(index, "switching back to",
1231 locale_on_entry, __LINE__, line);
1232 NOT_REACHED; /* NOTREACHED */
1235 # ifdef USE_PL_CURLOCALES
1237 if (entry_obj == LC_GLOBAL_LOCALE) {
1238 /* Here, we are back in the global locale. We may never have
1239 * set PL_curlocales. If the locale change had succeeded, the
1240 * code would have then set them up, but since it didn't, do so
1241 * here. khw isn't sure if this prevents some issues or not,
1242 * but tis is defensive coding. The system setlocale() returns
1243 * the desired information. This will calculate LC_ALL's entry
1244 * only on the final iteration */
1245 for (PERL_UINT_FAST8_T i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1246 update_PL_curlocales_i(i,
1247 porcelain_setlocale(categories[i], NULL),
1248 RECALCULATE_LC_ALL_ON_FINAL_INTERATION);
1256 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1257 "(%" LINE_Tf "): emulate_setlocale_i created %p"
1258 " while freeing %p\n", line, new_obj, basis_obj));
1260 /* Here, successfully created an object representing the desired
1261 * locale; now switch into it */
1262 if (! uselocale(new_obj)) {
1263 freelocale(new_obj);
1264 locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): emulate_setlocale_i"
1265 " switching into new locale failed",
1270 /* Here, we are using 'new_obj' which matches the input 'new_locale'. */
1271 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1272 "(%" LINE_Tf "): emulate_setlocale_i now using %p\n", line, new_obj));
1274 /* We are done, except for updating our records (if the system doesn't keep
1275 * them) and in the case of locale "", we don't actually know what the
1276 * locale that got switched to is, as it came from the environment. So
1277 * have to find it */
1279 # ifdef USE_QUERYLOCALE
1281 if (strEQ(new_locale, "")) {
1282 new_locale = querylocale_i(index);
1285 PERL_UNUSED_ARG(recalc_LC_ALL);
1289 new_locale = update_PL_curlocales_i(index, new_locale, recalc_LC_ALL);
1292 # ifdef HAS_GLIBC_LC_MESSAGES_BUG
1294 /* Invalidate the glibc cache of loaded translations if the locale has changed,
1295 * see [perl #134264] */
1296 if (old_messages_locale) {
1297 if (strNE(old_messages_locale, my_querylocale_c(LC_MESSAGES))) {
1298 textdomain(textdomain(NULL));
1307 #endif /* End of the various implementations of the setlocale and
1308 querylocale macros used in the remainder of this program */
1312 /* So far, the locale strings returned by modern 2008-compliant systems have
1316 S_stdize_locale(pTHX_ const int category,
1317 const char *input_locale,
1320 const line_t caller_line)
1322 /* The return value of setlocale() is opaque, but is required to be usable
1323 * as input to a future setlocale() to create the same state.
1324 * Unfortunately not all systems are compliant. But most often they are of
1325 * a very restricted set of forms that this file has been coded to expect.
1327 * There are some outliers, though, that this function tries to tame:
1329 * 1) A new-line. This function chomps any \n characters
1330 * 2) foo=bar. 'bar' is what is generally meant, and the foo= part is
1331 * stripped. This form is legal for LC_ALL. When found in
1332 * that category group, the function calls itself
1333 * recursively on each possible component category to make
1334 * sure the individual categories are ok.
1336 * If no changes to the input were made, it is returned; otherwise the
1337 * changed version is stored into memory at *buf, with *buf_size set to its
1338 * new value, and *buf is returned.
1341 const char * first_bad;
1342 const char * retval;
1344 PERL_ARGS_ASSERT_STDIZE_LOCALE;
1346 if (input_locale == NULL) {
1350 first_bad = strpbrk(input_locale, "=\n");
1352 /* Most likely, there isn't a problem with the input */
1353 if (LIKELY(! first_bad)) {
1354 return input_locale;
1359 /* But if there is, and the category is LC_ALL, we have to look at each
1360 * component category */
1361 if (category == LC_ALL) {
1362 const char * individ_locales[LC_ALL_INDEX_];
1363 bool made_changes = FALSE;
1366 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1367 Size_t this_size = 0;
1368 individ_locales[i] = stdize_locale(categories[i],
1369 porcelain_setlocale(categories[i],
1371 &individ_locales[i],
1375 /* If the size didn't change, it means this category did not have
1376 * to be adjusted, and individ_locales[i] points to the buffer
1377 * returned by porcelain_setlocale(); we have to copy that before
1378 * it's called again in the next iteration */
1379 if (this_size == 0) {
1380 individ_locales[i] = savepv(individ_locales[i]);
1383 made_changes = TRUE;
1387 /* If all the individual categories were ok as-is, this was a false
1388 * alarm. We must have seen an '=' which was a legal occurrence in
1389 * this combination locale */
1390 if (! made_changes) {
1391 retval = input_locale; /* The input can be returned unchanged */
1394 retval = save_to_buffer(querylocale_c(LC_ALL), buf, buf_size);
1397 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1398 Safefree(individ_locales[i]);
1404 # else /* else no LC_ALL */
1406 PERL_UNUSED_ARG(category);
1407 PERL_UNUSED_ARG(caller_line);
1411 /* Here, there was a problem in an individual category. This means that at
1412 * least one adjustment will be necessary. Create a modifiable copy */
1413 retval = save_to_buffer(input_locale, buf, buf_size);
1415 if (*first_bad != '=') {
1417 /* Translate the found position into terms of the copy */
1418 first_bad = retval + (first_bad - input_locale);
1422 /* It is unlikely that the return is so screwed-up that it contains
1423 * multiple equals signs, but handle that case by stripping all of
1425 const char * final_equals = strrchr(retval, '=');
1427 /* The length passed here causes the move to include the terminating
1429 Move(final_equals + 1, retval, strlen(final_equals), char);
1431 /* See if there are additional problems; if not, we're good to return.
1433 first_bad = strpbrk(retval, "\n");
1440 /* Here, the problem must be a \n. Get rid of it and what follows.
1441 * (Originally, only a trailing \n was stripped. Unsure what to do if not
1443 *((char *) first_bad) = '\0';
1447 #if defined(USE_POSIX_2008_LOCALE)
1452 # ifdef USE_QUERYLOCALE
1453 S_calculate_LC_ALL(pTHX_ const locale_t cur_obj)
1455 S_calculate_LC_ALL(pTHX_ const char ** individ_locales)
1459 /* For POSIX 2008, we have to figure out LC_ALL ourselves when needed.
1460 * querylocale(), on systems that have it, doesn't tend to work for LC_ALL.
1461 * So we have to construct the answer ourselves based on the passed in
1462 * data, which is either a locale_t object, for systems with querylocale(),
1463 * or an array we keep updated to the proper values, otherwise.
1465 * This returns a mortalized string containing the locale name(s) of
1468 * If all individual categories are the same locale, we can just set LC_ALL
1469 * to that locale. But if not, we have to create an aggregation of all the
1470 * categories on the system. Platforms differ as to the syntax they use
1471 * for these non-uniform locales for LC_ALL. Some use a '/' or other
1472 * delimiter of the locales with a predetermined order of categories; a
1473 * Configure probe would be needed to tell us how to decipher those. glibc
1474 * uses a series of name=value pairs, like
1475 * LC_NUMERIC=C;LC_TIME=en_US.UTF-8;...
1476 * The syntax we use for our aggregation doesn't much matter, as we take
1477 * care not to use the native setlocale() function on whatever style is
1478 * chosen. But, it would be possible for someone to call Perl_setlocale()
1479 * using a native style we don't understand. So far no one has complained.
1481 * For systems that have categories we don't know about, the algorithm
1482 * below won't know about those missing categories, leading to potential
1483 * bugs for code that looks at them. If there is an environment variable
1484 * that sets that category, we won't know to look for it, and so our use of
1485 * LANG or "C" improperly overrides it. On the other hand, if we don't do
1486 * what is done here, and there is no environment variable, the category's
1487 * locale should be set to LANG or "C". So there is no good solution. khw
1488 * thinks the best is to make sure we have a complete list of possible
1489 * categories, adding new ones as they show up on obscure platforms.
1493 Size_t names_len = 0;
1494 bool are_all_categories_the_same_locale = TRUE;
1495 char * aggregate_locale;
1496 char * previous_start = NULL;
1497 char * this_start = NULL;
1498 Size_t entry_len = 0;
1500 PERL_ARGS_ASSERT_CALCULATE_LC_ALL;
1502 /* First calculate the needed size for the string listing the categories
1503 * and their locales. */
1504 for (i = 0; i < LC_ALL_INDEX_; i++) {
1506 # ifdef USE_QUERYLOCALE
1507 const char * entry = querylocale_l(i, cur_obj);
1509 const char * entry = individ_locales[i];
1512 names_len += strlen(category_names[i])
1518 names_len++; /* Trailing '\0' */
1520 /* Allocate enough space for the aggregated string */
1521 SAVEFREEPV(Newxz(aggregate_locale, names_len, char));
1523 /* Then fill it in */
1524 for (i = 0; i < LC_ALL_INDEX_; i++) {
1527 # ifdef USE_QUERYLOCALE
1528 const char * entry = querylocale_l(i, cur_obj);
1530 const char * entry = individ_locales[i];
1533 new_len = my_strlcat(aggregate_locale, category_names[i], names_len);
1534 assert(new_len <= names_len);
1535 new_len = my_strlcat(aggregate_locale, "=", names_len);
1536 assert(new_len <= names_len);
1538 this_start = aggregate_locale + strlen(aggregate_locale);
1539 entry_len = strlen(entry);
1541 new_len = my_strlcat(aggregate_locale, entry, names_len);
1542 assert(new_len <= names_len);
1543 new_len = my_strlcat(aggregate_locale, ";", names_len);
1544 assert(new_len <= names_len);
1545 PERL_UNUSED_VAR(new_len); /* Only used in DEBUGGING */
1548 && are_all_categories_the_same_locale
1549 && memNE(previous_start, this_start, entry_len + 1))
1551 are_all_categories_the_same_locale = FALSE;
1554 previous_start = this_start;
1558 /* If they are all the same, just return any one of them */
1559 if (are_all_categories_the_same_locale) {
1560 aggregate_locale = this_start;
1561 aggregate_locale[entry_len] = '\0';
1564 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1565 "calculate_LC_ALL returning '%s'\n",
1568 return aggregate_locale;
1570 #endif /*defined(USE_POSIX_2008_LOCALE)*/
1573 S_setlocale_failure_panic_i(pTHX_
1574 const unsigned int cat_index,
1575 const char * current,
1576 const char * failed,
1577 const line_t caller_0_line,
1578 const line_t caller_1_line)
1581 const int cat = categories[cat_index];
1582 const char * name = category_names[cat_index];
1584 PERL_ARGS_ASSERT_SETLOCALE_FAILURE_PANIC_I;
1586 if (current == NULL) {
1587 current = querylocale_i(cat_index);
1590 Perl_locale_panic(Perl_form(aTHX_ "(%" LINE_Tf
1591 "): Can't change locale for %s(%d)"
1592 " from '%s' to '%s'",
1593 caller_1_line, name, cat,
1595 __FILE__, caller_0_line, GET_ERRNO);
1596 NOT_REACHED; /* NOTREACHED */
1599 /* Any of these will allow us to find the RADIX */
1600 # if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_SOME_LANGINFO) \
1601 || defined(HAS_SOME_LOCALECONV) \
1602 || defined(HAS_SNPRINTF))
1603 # define CAN_CALCULATE_RADIX
1607 S_set_numeric_radix(pTHX_ const bool use_locale)
1609 /* If 'use_locale' is FALSE, set to use a dot for the radix character. If
1610 * TRUE, use the radix character derived from the current locale */
1612 # ifdef CAN_CALCULATE_RADIX
1614 utf8ness_t utf8ness = UTF8NESS_IMMATERIAL;
1616 const char * scratch_buffer = NULL;
1619 radix = C_decimal_point;
1622 radix = my_langinfo_c(RADIXCHAR, LC_NUMERIC,
1624 &scratch_buffer, NULL, &utf8ness);
1627 sv_setpv(PL_numeric_radix_sv, radix);
1628 Safefree(scratch_buffer);
1630 if (utf8ness == UTF8NESS_YES) {
1631 SvUTF8_on(PL_numeric_radix_sv);
1634 DEBUG_L(PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
1635 SvPVX(PL_numeric_radix_sv),
1636 cBOOL(SvUTF8(PL_numeric_radix_sv))));
1639 PERL_UNUSED_ARG(use_locale);
1641 # endif /* USE_LOCALE_NUMERIC and can find the radix char */
1646 S_new_numeric(pTHX_ const char *newnum)
1649 # ifndef USE_LOCALE_NUMERIC
1651 PERL_UNUSED_ARG(newnum);
1655 /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell
1656 * core Perl this and that 'newnum' is the name of the new locale, and we
1657 * are switched into it. It installs this locale as the current underlying
1658 * default, and then switches to the C locale, if necessary, so that the
1659 * code that has traditionally expected the radix character to be a dot may
1660 * continue to do so.
1662 * The default locale and the C locale can be toggled between by use of the
1663 * set_numeric_underlying() and set_numeric_standard() functions, which
1664 * should probably not be called directly, but only via macros like
1665 * SET_NUMERIC_STANDARD() in perl.h.
1667 * The toggling is necessary mainly so that a non-dot radix decimal point
1668 * character can be input and output, while allowing internal calculations
1671 * This sets several interpreter-level variables:
1672 * PL_numeric_name The underlying locale's name: a copy of 'newnum'
1673 * PL_numeric_underlying A boolean indicating if the toggled state is such
1674 * that the current locale is the program's underlying
1676 * PL_numeric_standard An int indicating if the toggled state is such
1677 * that the current locale is the C locale or
1678 * indistinguishable from the C locale. If non-zero, it
1679 * is in C; if > 1, it means it may not be toggled away
1681 * PL_numeric_underlying_is_standard A bool kept by this function
1682 * indicating that the underlying locale and the standard
1683 * C locale are indistinguishable for the purposes of
1684 * LC_NUMERIC. This happens when both of the above two
1685 * variables are true at the same time. (Toggling is a
1686 * no-op under these circumstances.) This variable is
1687 * used to avoid having to recalculate.
1688 * PL_numeric_radix_sv Contains the string that code should use for the
1689 * decimal point. It is set to either a dot or the
1690 * program's underlying locale's radix character string,
1691 * depending on the situation.
1692 * PL_underlying_numeric_obj = (only on POSIX 2008 platforms) An object
1693 * with everything set up properly so as to avoid work on
1700 Safefree(PL_numeric_name);
1701 PL_numeric_name = savepv("C");
1702 PL_numeric_standard = TRUE;
1703 PL_numeric_underlying = TRUE;
1704 PL_numeric_underlying_is_standard = TRUE;
1708 save_newnum = savepv(newnum);
1709 PL_numeric_underlying = TRUE;
1710 PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
1712 # ifndef TS_W32_BROKEN_LOCALECONV
1714 /* If its name isn't C nor POSIX, it could still be indistinguishable from
1715 * them. But on broken Windows systems calling my_langinfo() for
1716 * THOUSEP can currently (but rarely) cause a race, so avoid doing that,
1717 * and just always change the locale if not C nor POSIX on those systems */
1718 if (! PL_numeric_standard) {
1719 const char * scratch_buffer = NULL;
1720 PL_numeric_standard = strEQ(C_decimal_point,
1721 my_langinfo_c(RADIXCHAR, LC_NUMERIC,
1723 &scratch_buffer, NULL, NULL));
1724 Safefree(scratch_buffer);
1725 scratch_buffer = NULL;
1727 PL_numeric_standard &= strEQ(C_thousands_sep,
1728 my_langinfo_c(THOUSEP, LC_NUMERIC,
1730 &scratch_buffer, NULL, NULL));
1731 Safefree(scratch_buffer);
1736 /* Save the new name if it isn't the same as the previous one, if any */
1737 if (strNE(PL_numeric_name, save_newnum)) {
1738 /* Save the locale name for future use */
1739 Safefree(PL_numeric_name);
1740 PL_numeric_name = save_newnum;
1743 Safefree(save_newnum);
1746 PL_numeric_underlying_is_standard = PL_numeric_standard;
1748 # ifdef USE_POSIX_2008_LOCALE
1750 /* We keep a special object for easy switching to */
1751 PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
1753 PL_underlying_numeric_obj);
1757 DEBUG_L( PerlIO_printf(Perl_debug_log,
1758 "Called new_numeric with %s, PL_numeric_name=%s\n",
1759 newnum, PL_numeric_name));
1761 /* Keep LC_NUMERIC so that it has the C locale radix and thousands
1762 * separator. This is for XS modules, so they don't have to worry about
1763 * the radix being a non-dot. (Core operations that need the underlying
1764 * locale change to it temporarily). */
1765 if (PL_numeric_standard) {
1766 set_numeric_radix(0);
1769 set_numeric_standard();
1777 Perl_set_numeric_standard(pTHX)
1780 # ifdef USE_LOCALE_NUMERIC
1782 /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
1785 * Most code should use the macro SET_NUMERIC_STANDARD() in perl.h
1786 * instead of calling this directly. The macro avoids calling this routine
1787 * if toggling isn't necessary according to our records (which could be
1788 * wrong if some XS code has changed the locale behind our back) */
1790 DEBUG_L(PerlIO_printf(Perl_debug_log,
1791 "Setting LC_NUMERIC locale to standard C\n"));
1793 void_setlocale_c(LC_NUMERIC, "C");
1794 PL_numeric_standard = TRUE;
1795 PL_numeric_underlying = PL_numeric_underlying_is_standard;
1796 set_numeric_radix(0);
1798 # endif /* USE_LOCALE_NUMERIC */
1803 Perl_set_numeric_underlying(pTHX)
1806 # ifdef USE_LOCALE_NUMERIC
1808 /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
1811 * Most code should use the macro SET_NUMERIC_UNDERLYING() in perl.h
1812 * instead of calling this directly. The macro avoids calling this routine
1813 * if toggling isn't necessary according to our records (which could be
1814 * wrong if some XS code has changed the locale behind our back) */
1816 DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to %s\n",
1819 void_setlocale_c(LC_NUMERIC, PL_numeric_name);
1820 PL_numeric_standard = PL_numeric_underlying_is_standard;
1821 PL_numeric_underlying = TRUE;
1822 set_numeric_radix(! PL_numeric_standard);
1824 # endif /* USE_LOCALE_NUMERIC */
1829 * Set up for a new ctype locale.
1832 S_new_ctype(pTHX_ const char *newctype)
1835 # ifndef USE_LOCALE_CTYPE
1837 PERL_UNUSED_ARG(newctype);
1838 PERL_UNUSED_CONTEXT;
1842 /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
1843 * core Perl this and that 'newctype' is the name of the new locale.
1845 * This function sets up the folding arrays for all 256 bytes, assuming
1846 * that tofold() is tolc() since fold case is not a concept in POSIX,
1848 * Any code changing the locale (outside this file) should use
1849 * Perl_setlocale or POSIX::setlocale, which call this function. Therefore
1850 * this function should be called directly only from this file and from
1851 * POSIX::setlocale() */
1855 /* Don't check for problems if we are suppressing the warnings */
1856 bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
1857 bool maybe_utf8_turkic = FALSE;
1859 PERL_ARGS_ASSERT_NEW_CTYPE;
1861 DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering new_ctype(%s)\n", newctype));
1863 /* We will replace any bad locale warning with 1) nothing if the new one is
1864 * ok; or 2) a new warning for the bad new locale */
1865 if (PL_warn_locale) {
1866 SvREFCNT_dec_NN(PL_warn_locale);
1867 PL_warn_locale = NULL;
1870 PL_in_utf8_CTYPE_locale = is_locale_utf8(newctype);
1872 /* A UTF-8 locale gets standard rules. But note that code still has to
1873 * handle this specially because of the three problematic code points */
1874 if (PL_in_utf8_CTYPE_locale) {
1875 Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
1877 /* UTF-8 locales can have special handling for 'I' and 'i' if they are
1878 * Turkic. Make sure these two are the only anomalies. (We don't
1879 * require towupper and towlower because they aren't in C89.) */
1881 # if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER)
1883 if (towupper('i') == 0x130 && towlower('I') == 0x131)
1887 if (toU8_UPPER_LC('i') == 'i' && toU8_LOWER_LC('I') == 'I')
1892 /* This is how we determine it really is Turkic */
1893 check_for_problems = TRUE;
1894 maybe_utf8_turkic = TRUE;
1898 /* We don't populate the other lists if a UTF-8 locale, but do check that
1899 * everything works as expected, unless checking turned off */
1900 if (check_for_problems || ! PL_in_utf8_CTYPE_locale) {
1901 /* Assume enough space for every character being bad. 4 spaces each
1902 * for the 94 printable characters that are output like "'x' "; and 5
1903 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
1905 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
1906 bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
1908 unsigned int bad_count = 0; /* Count of bad characters */
1910 for (i = 0; i < 256; i++) {
1911 if (! PL_in_utf8_CTYPE_locale) {
1912 if (isU8_UPPER_LC(i))
1913 PL_fold_locale[i] = (U8) toU8_LOWER_LC(i);
1914 else if (isU8_LOWER_LC(i))
1915 PL_fold_locale[i] = (U8) toU8_UPPER_LC(i);
1917 PL_fold_locale[i] = (U8) i;
1920 /* If checking for locale problems, see if the native ASCII-range
1921 * printables plus \n and \t are in their expected categories in
1922 * the new locale. If not, this could mean big trouble, upending
1923 * Perl's and most programs' assumptions, like having a
1924 * metacharacter with special meaning become a \w. Fortunately,
1925 * it's very rare to find locales that aren't supersets of ASCII
1926 * nowadays. It isn't a problem for most controls to be changed
1927 * into something else; we check only \n and \t, though perhaps \r
1928 * could be an issue as well. */
1929 if ( check_for_problems
1930 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
1932 bool is_bad = FALSE;
1933 char name[4] = { '\0' };
1935 /* Convert the name into a string */
1940 else if (i == '\n') {
1941 my_strlcpy(name, "\\n", sizeof(name));
1943 else if (i == '\t') {
1944 my_strlcpy(name, "\\t", sizeof(name));
1948 my_strlcpy(name, "' '", sizeof(name));
1951 /* Check each possibe class */
1952 if (UNLIKELY(cBOOL(isU8_ALPHANUMERIC_LC(i)) != cBOOL(isALPHANUMERIC_A(i)))) {
1954 DEBUG_L(PerlIO_printf(Perl_debug_log,
1955 "isalnum('%s') unexpectedly is %x\n",
1956 name, cBOOL(isU8_ALPHANUMERIC_LC(i))));
1958 if (UNLIKELY(cBOOL(isU8_ALPHA_LC(i)) != cBOOL(isALPHA_A(i)))) {
1960 DEBUG_L(PerlIO_printf(Perl_debug_log,
1961 "isalpha('%s') unexpectedly is %x\n",
1962 name, cBOOL(isU8_ALPHA_LC(i))));
1964 if (UNLIKELY(cBOOL(isU8_DIGIT_LC(i)) != cBOOL(isDIGIT_A(i)))) {
1966 DEBUG_L(PerlIO_printf(Perl_debug_log,
1967 "isdigit('%s') unexpectedly is %x\n",
1968 name, cBOOL(isU8_DIGIT_LC(i))));
1970 if (UNLIKELY(cBOOL(isU8_GRAPH_LC(i)) != cBOOL(isGRAPH_A(i)))) {
1972 DEBUG_L(PerlIO_printf(Perl_debug_log,
1973 "isgraph('%s') unexpectedly is %x\n",
1974 name, cBOOL(isU8_GRAPH_LC(i))));
1976 if (UNLIKELY(cBOOL(isU8_LOWER_LC(i)) != cBOOL(isLOWER_A(i)))) {
1978 DEBUG_L(PerlIO_printf(Perl_debug_log,
1979 "islower('%s') unexpectedly is %x\n",
1980 name, cBOOL(isU8_LOWER_LC(i))));
1982 if (UNLIKELY(cBOOL(isU8_PRINT_LC(i)) != cBOOL(isPRINT_A(i)))) {
1984 DEBUG_L(PerlIO_printf(Perl_debug_log,
1985 "isprint('%s') unexpectedly is %x\n",
1986 name, cBOOL(isU8_PRINT_LC(i))));
1988 if (UNLIKELY(cBOOL(isU8_PUNCT_LC(i)) != cBOOL(isPUNCT_A(i)))) {
1990 DEBUG_L(PerlIO_printf(Perl_debug_log,
1991 "ispunct('%s') unexpectedly is %x\n",
1992 name, cBOOL(isU8_PUNCT_LC(i))));
1994 if (UNLIKELY(cBOOL(isU8_SPACE_LC(i)) != cBOOL(isSPACE_A(i)))) {
1996 DEBUG_L(PerlIO_printf(Perl_debug_log,
1997 "isspace('%s') unexpectedly is %x\n",
1998 name, cBOOL(isU8_SPACE_LC(i))));
2000 if (UNLIKELY(cBOOL(isU8_UPPER_LC(i)) != cBOOL(isUPPER_A(i)))) {
2002 DEBUG_L(PerlIO_printf(Perl_debug_log,
2003 "isupper('%s') unexpectedly is %x\n",
2004 name, cBOOL(isU8_UPPER_LC(i))));
2006 if (UNLIKELY(cBOOL(isU8_XDIGIT_LC(i))!= cBOOL(isXDIGIT_A(i)))) {
2008 DEBUG_L(PerlIO_printf(Perl_debug_log,
2009 "isxdigit('%s') unexpectedly is %x\n",
2010 name, cBOOL(isU8_XDIGIT_LC(i))));
2012 if (UNLIKELY(toU8_LOWER_LC(i) != (int) toLOWER_A(i))) {
2014 DEBUG_L(PerlIO_printf(Perl_debug_log,
2015 "tolower('%s')=0x%x instead of the expected 0x%x\n",
2016 name, toU8_LOWER_LC(i), (int) toLOWER_A(i)));
2018 if (UNLIKELY(toU8_UPPER_LC(i) != (int) toUPPER_A(i))) {
2020 DEBUG_L(PerlIO_printf(Perl_debug_log,
2021 "toupper('%s')=0x%x instead of the expected 0x%x\n",
2022 name, toU8_UPPER_LC(i), (int) toUPPER_A(i)));
2024 if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i)))) {
2026 DEBUG_L(PerlIO_printf(Perl_debug_log,
2027 "'\\n' (=%02X) is not a control\n", (int) i));
2030 /* Add to the list; Separate multiple entries with a blank */
2033 my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list));
2035 my_strlcat(bad_chars_list, name, sizeof(bad_chars_list));
2041 if (bad_count == 2 && maybe_utf8_turkic) {
2043 *bad_chars_list = '\0';
2044 PL_fold_locale['I'] = 'I';
2045 PL_fold_locale['i'] = 'i';
2046 PL_in_utf8_turkic_locale = TRUE;
2047 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s is turkic\n", newctype));
2050 PL_in_utf8_turkic_locale = FALSE;
2055 /* We only handle single-byte locales (outside of UTF-8 ones; so if
2056 * this locale requires more than one byte, there are going to be
2058 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2059 "check_for_problems=%d, MB_CUR_MAX=%d\n",
2060 check_for_problems, (int) MB_CUR_MAX));
2062 if ( check_for_problems && MB_CUR_MAX > 1
2063 && ! PL_in_utf8_CTYPE_locale
2065 /* Some platforms return MB_CUR_MAX > 1 for even the "C"
2066 * locale. Just assume that the implementation for them (plus
2067 * for POSIX) is correct and the > 1 value is spurious. (Since
2068 * these are specially handled to never be considered UTF-8
2069 * locales, as long as this is the only problem, everything
2070 * should work fine */
2071 && strNE(newctype, "C") && strNE(newctype, "POSIX"))
2073 multi_byte_locale = TRUE;
2078 /* If we found problems and we want them output, do so */
2079 if ( (UNLIKELY(bad_count) || UNLIKELY(multi_byte_locale))
2080 && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
2082 if (UNLIKELY(bad_count) && PL_in_utf8_CTYPE_locale) {
2083 PL_warn_locale = Perl_newSVpvf(aTHX_
2084 "Locale '%s' contains (at least) the following characters"
2085 " which have\nunexpected meanings: %s\nThe Perl program"
2086 " will use the expected meanings",
2087 newctype, bad_chars_list);
2090 PL_warn_locale = Perl_newSVpvf(aTHX_
2091 "Locale '%s' may not work well.%s%s%s\n",
2094 ? " Some characters in it are not recognized by"
2098 ? "\nThe following characters (and maybe others)"
2099 " may not have the same meaning as the Perl"
2100 " program expects:\n"
2108 # ifdef HAS_SOME_LANGINFO
2110 const char * scratch_buffer = NULL;
2111 Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s",
2112 my_langinfo_c(CODESET, LC_CTYPE,
2114 &scratch_buffer, NULL,
2116 Safefree(scratch_buffer);
2120 Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n");
2122 /* If we are actually in the scope of the locale or are debugging,
2123 * output the message now. If not in that scope, we save the
2124 * message to be output at the first operation using this locale,
2125 * if that actually happens. Most programs don't use locales, so
2126 * they are immune to bad ones. */
2127 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
2129 /* The '0' below suppresses a bogus gcc compiler warning */
2130 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale),
2133 if (IN_LC(LC_CTYPE)) {
2134 SvREFCNT_dec_NN(PL_warn_locale);
2135 PL_warn_locale = NULL;
2141 # endif /* USE_LOCALE_CTYPE */
2146 Perl__warn_problematic_locale()
2149 # ifdef USE_LOCALE_CTYPE
2153 /* Internal-to-core function that outputs the message in PL_warn_locale,
2154 * and then NULLS it. Should be called only through the macro
2155 * CHECK_AND_WARN_PROBLEMATIC_LOCALE_ */
2157 if (PL_warn_locale) {
2158 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2159 SvPVX(PL_warn_locale),
2160 0 /* dummy to avoid compiler warning */ );
2161 SvREFCNT_dec_NN(PL_warn_locale);
2162 PL_warn_locale = NULL;
2170 S_new_LC_ALL(pTHX_ const char *unused)
2174 /* LC_ALL updates all the things we care about. */
2176 PERL_UNUSED_ARG(unused);
2178 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2179 if (update_functions[i]) {
2180 const char * this_locale = querylocale_i(i);
2181 update_functions[i](aTHX_ this_locale);
2187 S_new_collate(pTHX_ const char *newcoll)
2190 # ifndef USE_LOCALE_COLLATE
2192 PERL_UNUSED_ARG(newcoll);
2193 PERL_UNUSED_CONTEXT;
2197 /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
2198 * core Perl this and that 'newcoll' is the name of the new locale.
2200 * The design of locale collation is that every locale change is given an
2201 * index 'PL_collation_ix'. The first time a string particpates in an
2202 * operation that requires collation while locale collation is active, it
2203 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
2204 * magic includes the collation index, and the transformation of the string
2205 * by strxfrm(), q.v. That transformation is used when doing comparisons,
2206 * instead of the string itself. If a string changes, the magic is
2207 * cleared. The next time the locale changes, the index is incremented,
2208 * and so we know during a comparison that the transformation is not
2209 * necessarily still valid, and so is recomputed. Note that if the locale
2210 * changes enough times, the index could wrap (a U32), and it is possible
2211 * that a transformation would improperly be considered valid, leading to
2212 * an unlikely bug */
2216 Safefree(PL_collation_name);
2217 PL_collation_name = NULL;
2218 PL_collation_standard = TRUE;
2219 is_standard_collation:
2220 PL_collxfrm_base = 0;
2221 PL_collxfrm_mult = 2;
2222 PL_in_utf8_COLLATE_locale = FALSE;
2223 PL_strxfrm_NUL_replacement = '\0';
2224 PL_strxfrm_max_cp = 0;
2228 /* If this is not the same locale as currently, set the new one up */
2229 if (strNE(PL_collation_name, newcoll)) {
2231 Safefree(PL_collation_name);
2232 PL_collation_name = savepv(newcoll);
2233 PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
2234 if (PL_collation_standard) {
2235 goto is_standard_collation;
2238 PL_in_utf8_COLLATE_locale = is_locale_utf8(newcoll);
2239 PL_strxfrm_NUL_replacement = '\0';
2240 PL_strxfrm_max_cp = 0;
2242 /* A locale collation definition includes primary, secondary, tertiary,
2243 * etc. weights for each character. To sort, the primary weights are
2244 * used, and only if they compare equal, then the secondary weights are
2245 * used, and only if they compare equal, then the tertiary, etc.
2247 * strxfrm() works by taking the input string, say ABC, and creating an
2248 * output transformed string consisting of first the primary weights,
2249 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
2250 * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
2251 * may not have weights at every level. In our example, let's say B
2252 * doesn't have a tertiary weight, and A doesn't have a secondary
2253 * weight. The constructed string is then going to be
2254 * A¹B¹C¹ B²C² A³C³ ....
2255 * This has the desired effect that strcmp() will look at the secondary
2256 * or tertiary weights only if the strings compare equal at all higher
2257 * priority weights. The spaces shown here, like in
2259 * are not just for readability. In the general case, these must
2260 * actually be bytes, which we will call here 'separator weights'; and
2261 * they must be smaller than any other weight value, but since these
2262 * are C strings, only the terminating one can be a NUL (some
2263 * implementations may include a non-NUL separator weight just before
2264 * the NUL). Implementations tend to reserve 01 for the separator
2265 * weights. They are needed so that a shorter string's secondary
2266 * weights won't be misconstrued as primary weights of a longer string,
2267 * etc. By making them smaller than any other weight, the shorter
2268 * string will sort first. (Actually, if all secondary weights are
2269 * smaller than all primary ones, there is no need for a separator
2270 * weight between those two levels, etc.)
2272 * The length of the transformed string is roughly a linear function of
2273 * the input string. It's not exactly linear because some characters
2274 * don't have weights at all levels. When we call strxfrm() we have to
2275 * allocate some memory to hold the transformed string. The
2276 * calculations below try to find coefficients 'm' and 'b' for this
2277 * locale so that m*x + b equals how much space we need, given the size
2278 * of the input string in 'x'. If we calculate too small, we increase
2279 * the size as needed, and call strxfrm() again, but it is better to
2280 * get it right the first time to avoid wasted expensive string
2281 * transformations. */
2284 /* We use the string below to find how long the tranformation of it
2285 * is. Almost all locales are supersets of ASCII, or at least the
2286 * ASCII letters. We use all of them, half upper half lower,
2287 * because if we used fewer, we might hit just the ones that are
2288 * outliers in a particular locale. Most of the strings being
2289 * collated will contain a preponderance of letters, and even if
2290 * they are above-ASCII, they are likely to have the same number of
2291 * weight levels as the ASCII ones. It turns out that digits tend
2292 * to have fewer levels, and some punctuation has more, but those
2293 * are relatively sparse in text, and khw believes this gives a
2294 * reasonable result, but it could be changed if experience so
2296 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
2297 char * x_longer; /* Transformed 'longer' */
2298 Size_t x_len_longer; /* Length of 'x_longer' */
2300 char * x_shorter; /* We also transform a substring of 'longer' */
2301 Size_t x_len_shorter;
2303 /* _mem_collxfrm() is used get the transformation (though here we
2304 * are interested only in its length). It is used because it has
2305 * the intelligence to handle all cases, but to work, it needs some
2306 * values of 'm' and 'b' to get it started. For the purposes of
2307 * this calculation we use a very conservative estimate of 'm' and
2308 * 'b'. This assumes a weight can be multiple bytes, enough to
2309 * hold any UV on the platform, and there are 5 levels, 4 weight
2310 * bytes, and a trailing NUL. */
2311 PL_collxfrm_base = 5;
2312 PL_collxfrm_mult = 5 * sizeof(UV);
2314 /* Find out how long the transformation really is */
2315 x_longer = _mem_collxfrm(longer,
2319 /* We avoid converting to UTF-8 in the
2320 * called function by telling it the
2321 * string is in UTF-8 if the locale is a
2322 * UTF-8 one. Since the string passed
2323 * here is invariant under UTF-8, we can
2324 * claim it's UTF-8 even though it isn't.
2326 PL_in_utf8_COLLATE_locale);
2329 /* Find out how long the transformation of a substring of 'longer'
2330 * is. Together the lengths of these transformations are
2331 * sufficient to calculate 'm' and 'b'. The substring is all of
2332 * 'longer' except the first character. This minimizes the chances
2333 * of being swayed by outliers */
2334 x_shorter = _mem_collxfrm(longer + 1,
2337 PL_in_utf8_COLLATE_locale);
2338 Safefree(x_shorter);
2340 /* If the results are nonsensical for this simple test, the whole
2341 * locale definition is suspect. Mark it so that locale collation
2342 * is not active at all for it. XXX Should we warn? */
2343 if ( x_len_shorter == 0
2344 || x_len_longer == 0
2345 || x_len_shorter >= x_len_longer)
2347 PL_collxfrm_mult = 0;
2348 PL_collxfrm_base = 0;
2351 SSize_t base; /* Temporary */
2353 /* We have both: m * strlen(longer) + b = x_len_longer
2354 * m * strlen(shorter) + b = x_len_shorter;
2355 * subtracting yields:
2356 * m * (strlen(longer) - strlen(shorter))
2357 * = x_len_longer - x_len_shorter
2358 * But we have set things up so that 'shorter' is 1 byte smaller
2359 * than 'longer'. Hence:
2360 * m = x_len_longer - x_len_shorter
2362 * But if something went wrong, make sure the multiplier is at
2365 if (x_len_longer > x_len_shorter) {
2366 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
2369 PL_collxfrm_mult = 1;
2374 * but in case something has gone wrong, make sure it is
2376 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
2381 /* Add 1 for the trailing NUL */
2382 PL_collxfrm_base = base + 1;
2385 DEBUG_L(PerlIO_printf(Perl_debug_log,
2386 "?UTF-8 locale=%d; x_len_shorter=%zu, "
2388 " collate multipler=%zu, collate base=%zu\n",
2389 PL_in_utf8_COLLATE_locale,
2390 x_len_shorter, x_len_longer,
2391 PL_collxfrm_mult, PL_collxfrm_base));
2395 # endif /* USE_LOCALE_COLLATE */
2399 #endif /* USE_LOCALE */
2404 Perl_Win_utf8_string_to_wstring(const char * utf8_string)
2408 int req_size = MultiByteToWideChar(CP_UTF8, 0, utf8_string, -1, NULL, 0);
2414 Newx(wstring, req_size, wchar_t);
2416 if (! MultiByteToWideChar(CP_UTF8, 0, utf8_string, -1, wstring, req_size))
2427 Perl_Win_wstring_to_utf8_string(const wchar_t * wstring)
2432 WideCharToMultiByte(CP_UTF8, 0, wstring, -1, NULL, 0, NULL, NULL);
2434 Newx(utf8_string, req_size, char);
2436 if (! WideCharToMultiByte(CP_UTF8, 0, wstring, -1, utf8_string,
2437 req_size, NULL, NULL))
2439 Safefree(utf8_string);
2447 #define USE_WSETLOCALE
2449 #ifdef USE_WSETLOCALE
2452 S_wrap_wsetlocale(pTHX_ int category, const char *locale) {
2453 wchar_t *wlocale = NULL;
2458 wlocale = Win_utf8_string_to_wstring(locale);
2467 wresult = _wsetlocale(category, wlocale);
2474 result = Win_wstring_to_utf8_string(wresult);
2475 SAVEFREEPV(result); /* is there something better we can do here? */
2483 S_win32_setlocale(pTHX_ int category, const char* locale)
2485 /* This, for Windows, emulates POSIX setlocale() behavior. There is no
2486 * difference between the two unless the input locale is "", which normally
2487 * means on Windows to get the machine default, which is set via the
2488 * computer's "Regional and Language Options" (or its current equivalent).
2489 * In POSIX, it instead means to find the locale from the user's
2490 * environment. This routine changes the Windows behavior to first look in
2491 * the environment, and, if anything is found, use that instead of going to
2492 * the machine default. If there is no environment override, the machine
2493 * default is used, by calling the real setlocale() with "".
2495 * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
2496 * use the particular category's variable if set; otherwise to use the LANG
2499 bool override_LC_ALL = FALSE;
2503 if (locale && strEQ(locale, "")) {
2507 locale = PerlEnv_getenv("LC_ALL");
2509 if (category == LC_ALL) {
2510 override_LC_ALL = TRUE;
2516 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2517 if (category == categories[i]) {
2518 locale = PerlEnv_getenv(category_names[i]);
2523 locale = PerlEnv_getenv("LANG");
2539 #ifdef USE_WSETLOCALE
2540 result = S_wrap_wsetlocale(aTHX_ category, locale);
2542 result = setlocale(category, locale);
2544 DEBUG_L(STMT_START {
2545 PerlIO_printf(Perl_debug_log, "%s\n",
2546 setlocale_debug_string_r(category, locale, result));
2549 if (! override_LC_ALL) {
2553 /* Here the input category was LC_ALL, and we have set it to what is in the
2554 * LANG variable or the system default if there is no LANG. But these have
2555 * lower priority than the other LC_foo variables, so override it for each
2556 * one that is set. (If they are set to "", it means to use the same thing
2557 * we just set LC_ALL to, so can skip) */
2559 for (i = 0; i < LC_ALL_INDEX_; i++) {
2560 result = PerlEnv_getenv(category_names[i]);
2561 if (result && strNE(result, "")) {
2562 #ifdef USE_WSETLOCALE
2563 S_wrap_wsetlocale(aTHX_ categories[i], result);
2565 setlocale(categories[i], result);
2567 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s\n",
2568 setlocale_debug_string_i(i, result, "not captured")));
2572 result = setlocale(LC_ALL, NULL);
2573 DEBUG_L(STMT_START {
2574 PerlIO_printf(Perl_debug_log, "%s\n",
2575 setlocale_debug_string_c(LC_ALL, NULL, result));
2584 =for apidoc Perl_setlocale
2586 This is an (almost) drop-in replacement for the system L<C<setlocale(3)>>,
2587 taking the same parameters, and returning the same information, except that it
2588 returns the correct underlying C<LC_NUMERIC> locale. Regular C<setlocale> will
2589 instead return C<C> if the underlying locale has a non-dot decimal point
2590 character, or a non-empty thousands separator for displaying floating point
2591 numbers. This is because perl keeps that locale category such that it has a
2592 dot and empty separator, changing the locale briefly during the operations
2593 where the underlying one is required. C<Perl_setlocale> knows about this, and
2594 compensates; regular C<setlocale> doesn't.
2596 Another reason it isn't completely a drop-in replacement is that it is
2597 declared to return S<C<const char *>>, whereas the system setlocale omits the
2598 C<const> (presumably because its API was specified long ago, and can't be
2599 updated; it is illegal to change the information C<setlocale> returns; doing
2600 so leads to segfaults.)
2602 Finally, C<Perl_setlocale> works under all circumstances, whereas plain
2603 C<setlocale> can be completely ineffective on some platforms under some
2606 C<Perl_setlocale> should not be used to change the locale except on systems
2607 where the predefined variable C<${^SAFE_LOCALES}> is 1. On some such systems,
2608 the system C<setlocale()> is ineffective, returning the wrong information, and
2609 failing to actually change the locale. C<Perl_setlocale>, however works
2610 properly in all circumstances.
2612 The return points to a per-thread static buffer, which is overwritten the next
2613 time C<Perl_setlocale> is called from the same thread.
2619 #ifndef USE_LOCALE_NUMERIC
2620 # define affects_LC_NUMERIC(cat) 0
2621 #elif defined(LC_ALL)
2622 # define affects_LC_NUMERIC(cat) (cat == LC_NUMERIC || cat == LC_ALL)
2624 # define affects_LC_NUMERIC(cat) (cat == LC_NUMERIC)
2628 Perl_setlocale(const int category, const char * locale)
2630 /* This wraps POSIX::setlocale() */
2634 PERL_UNUSED_ARG(category);
2635 PERL_UNUSED_ARG(locale);
2641 const char * retval;
2644 DEBUG_L(PerlIO_printf(Perl_debug_log,
2645 "Entering Perl_setlocale(%d, \"%s\")\n",
2648 /* A NULL locale means only query what the current one is. */
2649 if (locale == NULL) {
2651 # ifndef USE_LOCALE_NUMERIC
2653 /* Without LC_NUMERIC, it's trivial; we just return the value */
2654 return save_to_buffer(querylocale_r(category),
2655 &PL_setlocale_buf, &PL_setlocale_bufsize);
2658 /* We have the LC_NUMERIC name saved, because we are normally switched
2659 * into the C locale (or equivalent) for it. */
2660 if (category == LC_NUMERIC) {
2661 DEBUG_L(PerlIO_printf(Perl_debug_log,
2662 "Perl_setlocale(LC_NUMERIC, NULL) returning stashed '%s'\n",
2665 /* We don't have to copy this return value, as it is a per-thread
2666 * variable, and won't change until a future setlocale */
2667 return PL_numeric_name;
2672 /* Without LC_ALL, just return the value */
2673 return save_to_buffer(querylocale_r(category),
2674 &PL_setlocale_buf, &PL_setlocale_bufsize);
2678 /* Here, LC_ALL is available on this platform. It's the one
2679 * complicating category (because it can contain a toggled LC_NUMERIC
2680 * value), for all the remaining ones (we took care of LC_NUMERIC
2681 * above), just return the value */
2682 if (category != LC_ALL) {
2683 return save_to_buffer(querylocale_r(category),
2684 &PL_setlocale_buf, &PL_setlocale_bufsize);
2687 bool toggled = FALSE;
2689 /* For an LC_ALL query, switch back to the underlying numeric locale
2690 * (if we aren't there already) so as to get the correct results. Our
2691 * records for all the other categories are valid without switching */
2692 if (! PL_numeric_underlying) {
2693 set_numeric_underlying();
2697 retval = querylocale_c(LC_ALL);
2700 set_numeric_standard();
2703 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2704 setlocale_debug_string_r(category, locale, retval)));
2706 return save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
2708 # endif /* Has LC_ALL */
2709 # endif /* Has LC_NUMERIC */
2711 } /* End of querying the current locale */
2714 /* Here, the input has a locale to change to. First find the current
2716 unsigned int cat_index = get_category_index(category, NULL);
2717 retval = querylocale_i(cat_index);
2719 /* If the new locale is the same as the current one, nothing is actually
2720 * being changed, so do nothing. */
2721 if ( strEQ(retval, locale)
2722 && ( ! affects_LC_NUMERIC(category)
2724 # ifdef USE_LOCALE_NUMERIC
2726 || strEQ(locale, PL_numeric_name)
2731 DEBUG_L(PerlIO_printf(Perl_debug_log,
2732 "Already in requested locale: no action taken\n"));
2733 return save_to_buffer(setlocale_i(cat_index, locale),
2734 &PL_setlocale_buf, &PL_setlocale_bufsize);
2737 /* Here, an actual change is being requested. Do it */
2738 retval = setlocale_i(cat_index, locale);
2740 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2741 setlocale_debug_string_i(cat_index, locale, "NULL")));
2745 retval = save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
2747 /* Now that have changed locales, we have to update our records to
2748 * correspond. Only certain categories have extra work to update. */
2749 if (update_functions[cat_index]) {
2750 update_functions[cat_index](aTHX_ retval);
2753 DEBUG_L(PerlIO_printf(Perl_debug_log, "returning '%s'\n", retval));
2764 S_save_to_buffer(const char * string, const char **buf, Size_t *buf_size)
2766 /* Copy the NUL-terminated 'string' to a buffer whose address before this
2767 * call began at *buf, and whose available length before this call was
2770 * If the length of 'string' is greater than the space available, the
2771 * buffer is grown accordingly, which may mean that it gets relocated.
2772 * *buf and *buf_size will be updated to reflect this.
2774 * Regardless, the function returns a pointer to where 'string' is now
2777 * 'string' may be NULL, which means no action gets taken, and NULL is
2780 * If *buf or 'buf_size' are NULL or *buf_size is 0, the buffer is assumed
2781 * empty, and memory is malloc'd. 'buf-size' being NULL is to be used
2782 * when this is a single use buffer, which will shortly be freed by the
2788 PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
2794 string_size = strlen(string) + 1;
2796 if (buf_size == NULL) {
2797 Newx(*buf, string_size, char);
2799 else if (*buf_size == 0) {
2800 Newx(*buf, string_size, char);
2801 *buf_size = string_size;
2803 else if (string_size > *buf_size) {
2804 Renew(*buf, string_size, char);
2805 *buf_size = string_size;
2810 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2811 "Copying '%s' to %p\n",
2812 ((is_utf8_string((U8 *) string, 0))
2814 :_byte_dump_string((U8 *) string, strlen(string), 0)),
2818 Copy(string, *buf, string_size, char);
2823 S_get_locale_string_utf8ness_i(pTHX_ const char * locale,
2824 const unsigned cat_index,
2825 const char * string,
2826 const locale_utf8ness_t known_utf8)
2828 /* Return to indicate if 'string' in the locale given by the input
2829 * arguments should be considered UTF-8 or not.
2831 * If the input 'locale' is not NULL, use that for the locale; otherwise
2832 * use the current locale for the category specified by 'cat_index'.
2836 const U8 * first_variant = NULL;
2838 PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I;
2839 assert(cat_index <= NOMINAL_LC_ALL_INDEX);
2841 if (string == NULL) {
2845 if (IN_BYTES) { /* respect 'use bytes' */
2849 len = strlen(string);
2851 /* UTF8ness is immaterial if the representation doesn't vary */
2852 if (is_utf8_invariant_string_loc((U8 *) string, len, &first_variant)) {
2853 return UTF8NESS_IMMATERIAL;
2856 /* Can't be UTF-8 if invalid */
2857 if (! is_utf8_string((U8 *) first_variant,
2858 len - ((char *) first_variant - string)))
2863 /* Here and below, we know the string is legal UTF-8, containing at least
2864 * one character requiring a sequence of two or more bytes. It is quite
2865 * likely to be UTF-8. But it pays to be paranoid and do further checking.
2867 * If we already know the UTF-8ness of the locale, then we immediately know
2868 * what the string is */
2869 if (UNLIKELY(known_utf8 != LOCALE_UTF8NESS_UNKNOWN)) {
2870 if (known_utf8 == LOCALE_IS_UTF8) {
2871 return UTF8NESS_YES;
2878 # if defined(HAS_SOME_LANGINFO) || defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
2880 /* Here, we have available the libc functions that can be used to
2881 * accurately determine the UTF8ness of the underlying locale. If it is a
2882 * UTF-8 locale, the string is UTF-8; otherwise it was coincidental that
2883 * the string is legal UTF-8
2885 * However, if the perl is compiled to not pay attention to the category
2886 * being passed in, you might think that that locale is essentially always
2887 * the C locale, so it would make sense to say it isn't UTF-8. But to get
2888 * here, the string has to contain characters unknown in the C locale. And
2889 * in fact, Windows boxes are compiled without LC_MESSAGES, as their
2890 * message catalog isn't really a part of the locale system. But those
2891 * messages really could be UTF-8, and given that the odds are rather small
2892 * of something not being UTF-8 but being syntactically valid UTF-8, khw
2893 * has decided to call such strings as UTF-8. */
2895 if (locale == NULL) {
2896 locale = querylocale_i(cat_index);
2898 if (is_locale_utf8(locale)) {
2899 return UTF8NESS_YES;
2906 /* Here, we have a valid UTF-8 string containing non-ASCII characters, and
2907 * don't have access to functions to check if the locale is UTF-8 or not.
2908 * Assume that it is. khw tried adding a check that the string is entirely
2909 * in a single Unicode script, but discovered the strftime() timezone is
2910 * user-settable through the environment, which may be in a different
2911 * script than the locale-expected value. */
2912 PERL_UNUSED_ARG(locale);
2913 PERL_UNUSED_ARG(cat_index);
2915 return UTF8NESS_YES;
2921 #endif /* USE_LOCALE */
2924 Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len)
2927 #if ! defined(HAS_MBRTOWC) && ! defined(HAS_MBTOWC)
2929 PERL_UNUSED_ARG(pwc);
2931 PERL_UNUSED_ARG(len);
2934 #else /* Below we have some form of mbtowc() */
2935 # if defined(HAS_MBRTOWC) \
2936 && (defined(USE_LOCALE_THREADS) || ! defined(HAS_MBTOWC))
2937 # define USE_MBRTOWC
2944 if (s == NULL) { /* Initialize the shift state to all zeros in
2947 # if defined(USE_MBRTOWC)
2949 memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
2956 retval = mbtowc(NULL, NULL, 0);
2964 # if defined(USE_MBRTOWC)
2967 retval = (SSize_t) mbrtowc((wchar_t *) pwc, s, len, &PL_mbrtowc_ps);
2971 /* Locking prevents races, but locales can be switched out without locking,
2972 * so this isn't a cure all */
2975 retval = mbtowc((wchar_t *) pwc, s, len);
2987 =for apidoc Perl_localeconv
2989 This is a thread-safe version of the libc L<localeconv(3)>. It is the same as
2990 L<POSIX::localeconv|POSIX/localeconv> (returning a hash of the C<localeconv()>
2991 fields), but directly callable from XS code.
2997 Perl_localeconv(pTHX)
3000 #if ! defined(HAS_SOME_LOCALECONV) \
3001 || (! defined(USE_LOCALE_MONETARY) && ! defined(USE_LOCALE_NUMERIC))
3007 return my_localeconv(0, LOCALE_UTF8NESS_UNKNOWN);
3013 #if defined(HAS_SOME_LOCALECONV) \
3014 && (defined(USE_LOCALE_MONETARY) || defined(USE_LOCALE_NUMERIC))
3017 S_my_localeconv(pTHX_ const int item, const locale_utf8ness_t locale_is_utf8)
3020 locale_utf8ness_t numeric_locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
3021 locale_utf8ness_t monetary_locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
3022 HV * (*copy_localeconv)(pTHX_ const struct lconv *,
3024 const locale_utf8ness_t,
3025 const locale_utf8ness_t);
3027 /* A thread-safe locale_conv(). The locking mechanisms vary greatly
3028 * depending on platform capabilities. They all share this common set up
3029 * code for the function, and then conditional compilations choose one of
3030 * several terminations.
3032 * There are two use cases:
3033 * 1) Called from POSIX::locale_conv(). This returns lconv() copied to
3034 * a hash, based on the current underlying locale.
3035 * 2) Certain items that nl_langinfo() provides are also derivable from
3036 * the return of localeconv(). Windows notably doesn't have
3037 * nl_langinfo(), so on that, and actually any platform lacking it,
3038 * my_localeconv() is used to emulate it for those particular items.
3039 * The code to do this is compiled only on such platforms. Rather than
3040 * going to the expense of creating a full hash when only one item is
3041 * needed, just the desired item is returned, in an SV cast to an HV.
3043 * There is a helper function to accomplish each of the two tasks. The
3044 * function pointer just below is set to the appropriate one, and is called
3045 * from each of the various implementations, in the middle of whatever
3046 * necessary locking/locale swapping have been done. */
3048 # ifdef HAS_SOME_LANGINFO
3050 PERL_UNUSED_ARG(item);
3051 PERL_UNUSED_ARG(locale_is_utf8);
3053 # ifdef USE_LOCALE_NUMERIC
3055 /* When there is a nl_langinfo, we will only be called for localeconv
3056 * numeric purposes. */
3057 const bool is_localeconv_call = true;
3063 /* Note we use this sentinel; this works because this only gets compiled
3064 * when our perl_langinfo.h is used, and that uses negative numbers for all
3066 const bool is_localeconv_call = (item == 0);
3067 if (is_localeconv_call)
3072 copy_localeconv = S_populate_localeconv;
3074 # ifdef USE_LOCALE_NUMERIC
3076 /* Get the UTF8ness of the locales now to avoid repeating this for each
3077 * string returned by localeconv() */
3078 numeric_locale_is_utf8 = (is_locale_utf8(PL_numeric_name))
3083 # ifdef USE_LOCALE_MONETARY
3085 monetary_locale_is_utf8 = (is_locale_utf8(querylocale_c(LC_MONETARY)))
3093 # ifndef HAS_SOME_LANGINFO
3096 copy_localeconv = S_get_nl_item_from_localeconv;
3097 numeric_locale_is_utf8 = locale_is_utf8;
3102 PERL_ARGS_ASSERT_MY_LOCALECONV;
3103 /*--------------------------------------------------------------------------*/
3104 /* Here, we are done with the common beginning of all the implementations of
3105 * my_localeconv(). Below are the various terminations of the function (except
3106 * the closing '}'. They are separated out because the preprocessor directives
3107 * were making the simple logic hard to follow. Each implementation ends with
3108 * the same few lines. khw decided to keep those separate because he thought
3109 * it was clearer to the reader.
3111 * The first distinct termination (of the above common code) are the
3112 * implementations when we have locale_conv_l() and can use it. These are the
3113 * simplest cases, without any locking needed. */
3114 # if defined(USE_POSIX_2008_LOCALE) && defined(HAS_LOCALECONV_L)
3116 /* And there are two sub-cases: First (by far the most common) is where we
3117 * are compiled to pay attention to LC_NUMERIC */
3118 # ifdef USE_LOCALE_NUMERIC
3120 const locale_t cur = use_curlocale_scratch();
3121 locale_t with_numeric = duplocale(cur);
3123 /* Just create a new locale object with what we've got, but using the
3124 * underlying LC_NUMERIC locale */
3125 with_numeric = newlocale(LC_NUMERIC_MASK, PL_numeric_name, with_numeric);
3127 retval = copy_localeconv(aTHX_ localeconv_l(with_numeric),
3129 numeric_locale_is_utf8,
3130 monetary_locale_is_utf8);
3131 freelocale(with_numeric);
3135 /*--------------------------------------------------------------------------*/
3136 # else /* Below not paying attention to LC_NUMERIC */
3138 const locale_t cur = use_curlocale_scratch();
3140 retval = copy_localeconv(aTHX_ localeconv_l(cur),
3142 numeric_locale_is_utf8,
3143 monetary_locale_is_utf8);
3146 # endif /* Above, using lconv_l(); below plain lconv() */
3147 /*--------------------------------------------------------------------------*/
3148 # elif ! defined(TS_W32_BROKEN_LOCALECONV) /* Next is regular lconv() */
3150 /* There are so many locks because localeconv() deals with two
3151 * categories, and returns in a single global static buffer. Some
3152 * locks might be no-ops on this platform, but not others. We need to
3153 * lock if any one isn't a no-op. */
3155 # ifdef USE_LOCALE_NUMERIC
3158 const char * orig_switched_locale = NULL;
3160 /* When called internally, are already switched into the proper numeric
3161 * locale; otherwise must toggle to it */
3162 if (is_localeconv_call) {
3163 orig_switched_locale = toggle_locale_c(LC_NUMERIC, PL_numeric_name);
3169 retval = copy_localeconv(aTHX_ localeconv(),
3171 numeric_locale_is_utf8,
3172 monetary_locale_is_utf8);
3175 # ifdef USE_LOCALE_NUMERIC
3177 if (orig_switched_locale) {
3178 restore_toggled_locale_c(LC_NUMERIC, orig_switched_locale);
3186 /*--------------------------------------------------------------------------*/
3187 # else /* defined(TS_W32_BROKEN_LOCALECONV) */
3189 /* Last is a workaround for the broken localeconv() on Windows with
3190 * thread-safe locales prior to VS 15. It looks at the global locale
3191 * instead of the thread one. As a work-around, we toggle to the global
3192 * locale; populate the return; then toggle back. We have to use LC_ALL
3193 * instead of the individual categories because of another bug in Windows.
3195 * This introduces a potential race with any other thread that has also
3196 * converted to use the global locale, and doesn't protect its locale calls
3197 * with mutexes. khw can't think of any reason for a thread to do so on
3198 * Windows, as the locale API is the same regardless of thread-safety, except
3199 * if the code is ported from working on another platform where there might
3200 * be some reason to do this. But this is typically due to some
3201 * alien-to-Perl library that thinks it owns locale setting. Such a
3202 * library usn't likely to exist on Windows, so such an application is
3203 * unlikely to be run on Windows
3205 bool restore_per_thread = FALSE;
3207 # ifdef USE_LOCALE_NUMERIC
3209 const char * orig_switched_locale = NULL;
3213 /* When called internally, are already switched into the proper numeric
3214 * locale; otherwise must toggle to it */
3215 if (is_localeconv_call) {
3216 orig_switched_locale = toggle_locale_c(LC_NUMERIC, PL_numeric_name);
3221 /* Save the per-thread locale state */
3222 const char * save_thread = querylocale_c(LC_ALL);
3224 /* Change to the global locale, and note if we already were there */
3225 if (_configthreadlocale(_DISABLE_PER_THREAD_LOCALE)
3226 != _DISABLE_PER_THREAD_LOCALE)
3228 restore_per_thread = TRUE;
3231 /* Save the state of the global locale; then convert to our desired
3233 const char * save_global = querylocale_c(LC_ALL);
3234 void_setlocale_c(LC_ALL, save_thread);
3236 /* Safely stash the desired data */
3238 retval = copy_localeconv(aTHX_ localeconv(),
3240 numeric_locale_is_utf8,
3241 monetary_locale_is_utf8);
3244 /* Restore the global locale's prior state */
3245 void_setlocale_c(LC_ALL, save_global);
3247 /* And back to per-thread locales */
3248 if (restore_per_thread) {
3249 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
3252 /* Restore the per-thread locale state */
3253 void_setlocale_c(LC_ALL, save_thread);
3255 # ifdef USE_LOCALE_NUMERIC
3257 if (orig_switched_locale) {
3258 restore_toggled_locale_c(LC_NUMERIC, orig_switched_locale);
3267 /*--------------------------------------------------------------------------*/
3271 S_populate_localeconv(pTHX_ const struct lconv *lcbuf,
3273 const locale_utf8ness_t numeric_locale_is_utf8,
3274 const locale_utf8ness_t monetary_locale_is_utf8)
3276 /* This returns a mortalized hash containing all the elements returned by
3277 * localeconv(). It is used by Perl_localeconv() and POSIX::localeconv()
3279 PERL_UNUSED_ARG(unused);
3281 struct lconv_offset {
3287 {"thousands_sep", STRUCT_OFFSET(struct lconv, thousands_sep)},
3289 # define LCONV_ENTRY(name) \
3290 {STRINGIFY(name), STRUCT_OFFSET(struct lconv, name)}
3292 /* Set up structures containing the documented fields. One structure for
3293 * LC_NUMERIC-controlled strings; one for LC_MONETARY ones, and a final one
3294 * of just numerics. */
3295 # ifdef USE_LOCALE_NUMERIC
3297 static const struct lconv_offset lconv_numeric_strings[] = {
3298 LCONV_ENTRY(decimal_point),
3299 LCONV_ENTRY(thousands_sep),
3300 # ifndef NO_LOCALECONV_GROUPING
3301 LCONV_ENTRY(grouping),
3307 # ifdef USE_LOCALE_MONETARY
3309 static const struct lconv_offset lconv_monetary_strings[] = {
3310 LCONV_ENTRY(int_curr_symbol),
3311 LCONV_ENTRY(currency_symbol),
3312 LCONV_ENTRY(mon_decimal_point),
3313 # ifndef NO_LOCALECONV_MON_THOUSANDS_SEP
3314 LCONV_ENTRY(mon_thousands_sep),
3316 # ifndef NO_LOCALECONV_MON_GROUPING
3317 LCONV_ENTRY(mon_grouping),
3319 LCONV_ENTRY(positive_sign),
3320 LCONV_ENTRY(negative_sign),
3326 static const struct lconv_offset lconv_integers[] = {
3327 # ifdef USE_LOCALE_MONETARY
3328 LCONV_ENTRY(int_frac_digits),
3329 LCONV_ENTRY(frac_digits),
3330 LCONV_ENTRY(p_cs_precedes),
3331 LCONV_ENTRY(p_sep_by_space),
3332 LCONV_ENTRY(n_cs_precedes),
3333 LCONV_ENTRY(n_sep_by_space),
3334 LCONV_ENTRY(p_sign_posn),
3335 LCONV_ENTRY(n_sign_posn),
3336 # ifdef HAS_LC_MONETARY_2008
3337 LCONV_ENTRY(int_p_cs_precedes),
3338 LCONV_ENTRY(int_p_sep_by_space),
3339 LCONV_ENTRY(int_n_cs_precedes),
3340 LCONV_ENTRY(int_n_sep_by_space),
3341 LCONV_ENTRY(int_p_sign_posn),
3342 LCONV_ENTRY(int_n_sign_posn),
3348 static const unsigned category_indices[] = {
3349 # ifdef USE_LOCALE_NUMERIC
3352 # ifdef USE_LOCALE_MONETARY
3355 (unsigned) -1 /* Just so the previous element can always end with a
3356 comma => subtract 1 below for the max loop index */
3359 const char *ptr = (const char *) lcbuf;
3360 const struct lconv_offset *integers = lconv_integers;
3362 HV * retval = newHV();
3363 sv_2mortal((SV*)retval);
3365 PERL_ARGS_ASSERT_POPULATE_LOCALECONV;
3367 /* For each enabled category ... */
3368 for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(category_indices) - 1; i++) {
3369 const unsigned cat_index = category_indices[i];
3370 locale_utf8ness_t locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
3373 /* ( = NULL silences a compiler warning; would segfault if it could
3374 * actually happen.) */
3375 const struct lconv_offset *strings = NULL;
3377 # ifdef USE_LOCALE_NUMERIC
3378 if (cat_index == LC_NUMERIC_INDEX_) {
3379 locale_is_utf8 = numeric_locale_is_utf8;
3380 strings = lconv_numeric_strings;
3383 PERL_UNUSED_ARG(numeric_locale_is_utf8);
3385 # ifdef USE_LOCALE_MONETARY
3386 if (cat_index == LC_MONETARY_INDEX_) {
3387 locale_is_utf8 = monetary_locale_is_utf8;
3388 strings = lconv_monetary_strings;
3391 PERL_UNUSED_ARG(monetary_locale_is_utf8);
3394 assert(locale_is_utf8 != LOCALE_UTF8NESS_UNKNOWN);
3396 /* Iterate over the strings structure for this category */
3397 locale = querylocale_i(cat_index);
3399 while (strings->name) {
3400 const char *value = *((const char **)(ptr + strings->offset));
3401 if (value && *value) {
3402 bool is_utf8 = /* Only make UTF-8 if required to */
3403 (UTF8NESS_YES == (get_locale_string_utf8ness_i(locale,
3407 (void) hv_store(retval,
3409 strlen(strings->name),
3410 newSVpvn_utf8(value, strlen(value), is_utf8),
3418 while (integers->name) {
3419 const char value = *((const char *)(ptr + integers->offset));
3421 if (value != CHAR_MAX)
3422 (void) hv_store(retval, integers->name,
3423 strlen(integers->name), newSViv(value), 0);
3430 # ifndef HAS_SOME_LANGINFO
3433 S_get_nl_item_from_localeconv(pTHX_ const struct lconv *lcbuf,
3435 const locale_utf8ness_t unused1,
3436 const locale_utf8ness_t unused2)
3438 /* This is a helper function for my_localeconv(), which is called from
3439 * my_langinfo() to emulate the libc nl_langinfo() function on platforms
3440 * that don't have it available.
3442 * This function acts as an extension to my_langinfo(), the intermediate
3443 * my_localeconv() call is to set up the locks and switch into the proper
3444 * locale. That logic exists for other reasons, and by doing it this way,
3445 * it doesn't have to be duplicated.
3447 * This function extracts the current value of 'item' in the current locale
3448 * using the localconv() result also passed in, via 'lcbuf'. The other
3449 * parameter is unused, a placeholder so the signature of this function
3450 * matches another that does need it, and so the two functions can be
3451 * referred to by a single function pointer, to simplify the code below */
3453 const char * prefix = "";
3454 const char * temp = NULL;
3456 PERL_ARGS_ASSERT_GET_NL_ITEM_FROM_LOCALECONV;
3457 PERL_UNUSED_ARG(unused1);
3458 PERL_UNUSED_ARG(unused2);
3462 temp = lcbuf->currency_symbol;
3464 if (lcbuf->p_cs_precedes) {
3466 /* khw couldn't find any documentation that CHAR_MAX is the signal,
3467 * but cygwin uses it thusly */
3468 if (lcbuf->p_cs_precedes == CHAR_MAX) {
3482 temp = lcbuf->decimal_point;
3486 temp = lcbuf->thousands_sep;
3490 locale_panic_(Perl_form(aTHX_
3491 "Unexpected item passed to populate_localeconv: %d", item));
3494 return (HV *) Perl_newSVpvf(aTHX_ "%s%s", prefix, temp);
3497 # endif /* ! Has some form of langinfo() */
3498 #endif /* Has some form of localeconv() and paying attn to a category it
3503 =for apidoc Perl_langinfo
3505 This is an (almost) drop-in replacement for the system C<L<nl_langinfo(3)>>,
3506 taking the same C<item> parameter values, and returning the same information.
3507 But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
3508 of Perl's locale handling from your code, and can be used on systems that lack
3509 a native C<nl_langinfo>.
3517 The reason it isn't quite a drop-in replacement is actually an advantage. The
3518 only difference is that it returns S<C<const char *>>, whereas plain
3519 C<nl_langinfo()> returns S<C<char *>>, but you are (only by documentation)
3520 forbidden to write into the buffer. By declaring this C<const>, the compiler
3521 enforces this restriction, so if it is violated, you know at compilation time,
3522 rather than getting segfaults at runtime.
3526 It delivers the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
3527 without you having to write extra code. The reason for the extra code would be
3528 because these are from the C<LC_NUMERIC> locale category, which is normally
3529 kept set by Perl so that the radix is a dot, and the separator is the empty
3530 string, no matter what the underlying locale is supposed to be, and so to get
3531 the expected results, you have to temporarily toggle into the underlying
3532 locale, and later toggle back. (You could use plain C<nl_langinfo> and
3533 C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this but then you wouldn't get
3534 the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
3535 (or equivalent) locale would break a lot of CPAN, which is expecting the radix
3536 (decimal point) character to be a dot.)
3540 The system function it replaces can have its static return buffer trashed,
3541 not only by a subsequent call to that function, but by a C<freelocale>,
3542 C<setlocale>, or other locale change. The returned buffer of this function is
3543 not changed until the next call to it, so the buffer is never in a trashed
3548 Its return buffer is per-thread, so it also is never overwritten by a call to
3549 this function from another thread; unlike the function it replaces.
3553 But most importantly, it works on systems that don't have C<nl_langinfo>, such
3554 as Windows, hence makes your code more portable. Of the fifty-some possible
3555 items specified by the POSIX 2008 standard,
3556 L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
3557 only one is completely unimplemented, though on non-Windows platforms, another
3558 significant one is also not implemented). It uses various techniques to
3559 recover the other items, including calling C<L<localeconv(3)>>, and
3560 C<L<strftime(3)>>, both of which are specified in C89, so should be always be
3561 available. Later C<strftime()> versions have additional capabilities; C<""> is
3562 returned for those not available on your system.
3564 It is important to note that when called with an item that is recovered by
3565 using C<localeconv>, the buffer from any previous explicit call to
3566 C<localeconv> will be overwritten. This means you must save that buffer's
3567 contents if you need to access them after a call to this function. (But note
3568 that you might not want to be using C<localeconv()> directly anyway, because of
3569 issues like the ones listed in the second item of this list (above) for
3570 C<RADIXCHAR> and C<THOUSEP>. You can use the methods given in L<perlcall> to
3571 call L<POSIX/localeconv> and avoid all the issues, but then you have a hash to
3574 The details for those items which may deviate from what this emulation returns
3575 and what a native C<nl_langinfo()> would return are specified in
3580 When using C<Perl_langinfo> on systems that don't have a native
3581 C<nl_langinfo()>, you must
3583 #include "perl_langinfo.h"
3585 before the C<perl.h> C<#include>. You can replace your C<langinfo.h>
3586 C<#include> with this one. (Doing it this way keeps out the symbols that plain
3587 C<langinfo.h> would try to import into the namespace for code that doesn't need
3590 The original impetus for C<Perl_langinfo()> was so that code that needs to
3591 find out the current currency symbol, floating point radix character, or digit
3592 grouping separator can use, on all systems, the simpler and more
3593 thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
3594 pain to make thread-friendly. For other fields returned by C<localeconv>, it
3595 is better to use the methods given in L<perlcall> to call
3596 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
3602 #ifndef HAS_SOME_LANGINFO
3604 typedef int nl_item; /* Substitute 'int' for emulated nl_langinfo() */
3609 Perl_langinfo(const nl_item item)
3614 /* Find the locale category that controls the input 'item'. If we are not
3615 * paying attention to that category, instead return a default value. Also
3616 * return the default value if there is no way for us to figure out the
3617 * correct value. If we have some form of nl_langinfo(), we can always
3618 * figure it out, but lacking that, there may be alternative methods that
3619 * can be used to recover most of the possible items. Some of those
3620 * methods need libc functions, which may or may not be available. If
3621 * unavailable, we can't compute the correct value, so must here return the
3627 #ifdef USE_LOCALE_CTYPE
3629 cat_index = LC_CTYPE_INDEX_;
3635 #if defined(USE_LOCALE_MESSAGES) && defined(HAS_SOME_LANGINFO)
3637 case YESEXPR: case YESSTR: case NOEXPR: case NOSTR:
3638 cat_index = LC_MESSAGES_INDEX_;
3641 case YESEXPR: return "^[+1yY]";
3642 case YESSTR: return "yes";
3643 case NOEXPR: return "^[-0nN]";
3644 case NOSTR: return "no";
3649 #if defined(USE_LOCALE_MONETARY) \
3650 && (defined(HAS_SOME_LANGINFO) || defined(HAS_SOME_LOCALECONV))
3652 cat_index = LC_MONETARY_INDEX_;
3660 #ifdef CAN_CALCULATE_RADIX
3662 cat_index = LC_NUMERIC_INDEX_;
3665 return C_decimal_point;
3670 #if defined(USE_LOCALE_NUMERIC) \
3671 && (defined(HAS_SOME_LANGINFO) || defined(HAS_SOME_LOCALECONV))
3673 cat_index = LC_NUMERIC_INDEX_;
3676 return C_thousands_sep;
3679 /* The other possible items are all in LC_TIME. */
3680 #ifdef USE_LOCALE_TIME
3683 cat_index = LC_TIME_INDEX_;
3687 #if ! defined(USE_LOCALE_TIME) || ! defined(HAS_SOME_LANGINFO)
3689 /* If not using LC_TIME, hard code the rest. Or, if there is no
3690 * nl_langinfo(), we use strftime() as an alternative, and it is missing
3691 * functionality to get every single one, so hard-code those */
3693 case ERA: return ""; /* Unimplemented; for use with strftime() %E
3696 /* These formats are defined by C89, so we assume that strftime supports
3697 * them, and so are returned unconditionally; they may not be what the
3698 * locale actually says, but should give good enough results for someone
3699 * using them as formats (as opposed to trying to parse them to figure
3700 * out what the locale says). The other format items are actually tested
3701 * to verify they work on the platform */
3702 case D_FMT: return "%x";
3703 case T_FMT: return "%X";
3704 case D_T_FMT: return "%c";
3706 # if defined(WIN32) || ! defined(USE_LOCALE_TIME)
3708 /* strftime() on Windows doesn't have the POSIX (beyond C89) extensions
3709 * that would allow it to recover these */
3710 case ERA_D_FMT: return "%x";
3711 case ERA_T_FMT: return "%X";
3712 case ERA_D_T_FMT: return "%c";
3713 case ALT_DIGITS: return "0";
3716 # ifndef USE_LOCALE_TIME
3718 case T_FMT_AMPM: return "%r";
3719 case ABDAY_1: return "Sun";
3720 case ABDAY_2: return "Mon";
3721 case ABDAY_3: return "Tue";
3722 case ABDAY_4: return "Wed";
3723 case ABDAY_5: return "Thu";
3724 case ABDAY_6: return "Fri";
3725 case ABDAY_7: return "Sat";
3726 case AM_STR: return "AM";
3727 case PM_STR: return "PM";
3728 case ABMON_1: return "Jan";
3729 case ABMON_2: return "Feb";
3730 case ABMON_3: return "Mar";
3731 case ABMON_4: return "Apr";
3732 case ABMON_5: return "May";
3733 case ABMON_6: return "Jun";
3734 case ABMON_7: return "Jul";
3735 case ABMON_8: return "Aug";
3736 case ABMON_9: return "Sep";
3737 case ABMON_10: return "Oct";
3738 case ABMON_11: return "Nov";
3739 case ABMON_12: return "Dec";
3740 case DAY_1: return "Sunday";
3741 case DAY_2: return "Monday";
3742 case DAY_3: return "Tuesday";
3743 case DAY_4: return "Wednesday";
3744 case DAY_5: return "Thursday";
3745 case DAY_6: return "Friday";
3746 case DAY_7: return "Saturday";
3747 case MON_1: return "January";
3748 case MON_2: return "February";
3749 case MON_3: return "March";
3750 case MON_4: return "April";
3751 case MON_5: return "May";
3752 case MON_6: return "June";
3753 case MON_7: return "July";
3754 case MON_8: return "August";
3755 case MON_9: return "September";
3756 case MON_10: return "October";
3757 case MON_11: return "November";
3758 case MON_12: return "December";
3763 } /* End of switch on item */
3767 Perl_croak_nocontext("panic: Unexpected nl_langinfo() item %d", item);
3768 NOT_REACHED; /* NOTREACHED */
3769 PERL_UNUSED_VAR(cat_index);
3772 # ifdef USE_LOCALE_NUMERIC
3774 /* Use either the underlying numeric, or the other underlying categories */
3775 if (cat_index == LC_NUMERIC_INDEX_) {
3776 return my_langinfo_c(item, LC_NUMERIC, PL_numeric_name,
3777 &PL_langinfo_buf, &PL_langinfo_bufsize, NULL);
3784 return my_langinfo_i(item, cat_index, querylocale_i(cat_index),
3785 &PL_langinfo_buf, &PL_langinfo_bufsize, NULL);
3794 /* There are several implementations of my_langinfo, depending on the
3795 * Configuration. They all share the same beginning of the function */
3797 S_my_langinfo_i(pTHX_
3798 const nl_item item, /* The item to look up */
3799 const unsigned int cat_index, /* The locale category that
3801 /* The locale to look up 'item' in. */
3802 const char * locale,
3804 /* Where to store the result, and where the size of that buffer
3805 * is stored, updated on exit. retbuf_sizep may be NULL for an
3806 * empty-on-entry, single use buffer whose size we don't need
3807 * to keep track of */
3808 const char ** retbufp,
3809 Size_t * retbuf_sizep,
3811 /* If not NULL, the location to store the UTF8-ness of 'item's
3812 * value, as documented */
3813 utf8ness_t * utf8ness)
3815 const char * retval = NULL;
3817 PERL_ARGS_ASSERT_MY_LANGINFO_I;
3818 assert(cat_index <= NOMINAL_LC_ALL_INDEX);
3820 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3821 "Entering my_langinfo item=%d, using locale %s\n",
3823 /*--------------------------------------------------------------------------*/
3824 /* Above is the common beginning to all the implementations of my_langinfo().
3825 * Below are the various completions.
3827 * Some platforms don't deal well with non-ASCII strings in locale X when
3828 * LC_CTYPE is not in X. (Actually it is probably when X is UTF-8 and LC_CTYPE
3829 * isn't, or vice versa). There is explicit code to bring the categories into
3830 * sync. This doesn't seem to be a problem with nl_langinfo(), so that
3831 * implementation doesn't currently worry about it. But it is a problem on
3832 * Windows boxes, which don't have nl_langinfo(). */
3834 # if defined(HAS_THREAD_SAFE_NL_LANGINFO_L) && defined(USE_POSIX_2008_LOCALE)
3836 /* Simplest is if we can use nl_langinfo_l()
3838 * With it, we can change LC_CTYPE in the same call as the other category */
3839 # ifdef USE_LOCALE_CTYPE
3840 # define CTYPE_SAFETY_MASK LC_CTYPE_MASK
3842 # define CTYPE_SAFETY_MASK 0
3845 locale_t cur = newlocale((category_masks[cat_index] | CTYPE_SAFETY_MASK),
3846 locale, (locale_t) 0);
3848 retval = save_to_buffer(nl_langinfo_l(item, cur), retbufp, retbuf_sizep);
3850 *utf8ness = get_locale_string_utf8ness_i(locale, cat_index, retval,
3851 LOCALE_UTF8NESS_UNKNOWN);
3857 /*--------------------------------------------------------------------------*/
3858 # elif defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
3860 /* The second version of my_langinfo() is if we have plain nl_langinfo() */
3862 # ifdef USE_LOCALE_CTYPE
3864 /* Ths function sorts out if things actually have to be switched or not,
3865 * for both calls. */
3866 const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
3870 const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
3873 retval = save_to_buffer(nl_langinfo(item), retbufp, retbuf_sizep);
3877 *utf8ness = get_locale_string_utf8ness_i(locale, cat_index,
3878 retval, LOCALE_UTF8NESS_UNKNOWN);
3881 restore_toggled_locale_i(cat_index, orig_switched_locale);
3883 # ifdef USE_LOCALE_CTYPE
3884 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
3888 /*--------------------------------------------------------------------------*/
3889 # else /* Below, emulate nl_langinfo as best we can */
3891 /* And the third and final completion is where we have to emulate
3892 * nl_langinfo(). There are various possibilities depending on the
3895 # ifdef USE_LOCALE_CTYPE
3897 const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
3901 const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
3903 /* Here, we are in the locale we want information about */
3905 /* Almost all the items will have ASCII return values. Set that here, and
3906 * override if necessary */
3907 utf8ness_t is_utf8 = UTF8NESS_IMMATERIAL;
3916 # if defined(HAS_SNPRINTF) \
3917 && (! defined(HAS_SOME_LOCALECONV) || defined(TS_W32_BROKEN_LOCALECONV))
3920 /* snprintf() can be used to find the radix character by outputting
3921 * a known simple floating point number to a buffer, and parsing
3922 * it, inferring the radix as the bytes separating the integer and
3923 * fractional parts. But localeconv() is more direct, not
3924 * requiring inference, so use it instead of the code just below,
3925 * if (likely) it is available and works ok */
3927 char * floatbuf = NULL;
3928 const Size_t initial_size = 10;
3930 Newx(floatbuf, initial_size, char);
3932 /* 1.5 is exactly representable on binary computers */
3933 Size_t needed_size = snprintf(floatbuf, initial_size, "%.1f", 1.5);
3935 /* If our guess wasn't big enough, increase and try again, based on
3936 * the real number that strnprintf() is supposed to return */
3937 if (UNLIKELY(needed_size >= initial_size)) {
3938 needed_size++; /* insurance */
3939 Renew(floatbuf, needed_size, char);
3940 Size_t new_needed = snprintf(floatbuf, needed_size, "%.1f", 1.5);
3941 assert(new_needed <= needed_size);
3942 needed_size = new_needed;
3945 char * s = floatbuf;
3946 char * e = floatbuf + needed_size;
3949 while (s < e && *s != '1') {
3953 if (LIKELY(s < e)) {
3958 char * item_start = s;
3959 while (s < e && *s != '5') {
3963 /* Everything in between is the radix string */
3964 if (LIKELY(s < e)) {
3966 retval = save_to_buffer(item_start,
3967 (const char **) &PL_langinfo_buf,
3968 &PL_langinfo_bufsize);
3972 is_utf8 = get_locale_string_utf8ness_i(locale, cat_index,
3974 LOCALE_UTF8NESS_UNKNOWN);
3984 # ifdef HAS_SOME_LOCALECONV /* snprintf() failed; drop down to use
3989 # else /* snprintf() failed and no localeconv() */
3991 retval = C_decimal_point;
3996 # ifdef HAS_SOME_LOCALECONV
3998 /* These items are available from localeconv(). (To avoid using
3999 * TS_W32_BROKEN_LOCALECONV, one could use GetNumberFormat and
4000 * GetCurrencyFormat; patches welcome) */
4005 SV * string = (SV *) my_localeconv(item, LOCALE_UTF8NESS_UNKNOWN);
4007 retval = save_to_buffer(SvPV_nolen(string), retbufp, retbuf_sizep);
4010 is_utf8 = get_locale_string_utf8ness_i(locale, cat_index, retval,
4011 LOCALE_UTF8NESS_UNKNOWN);
4014 SvREFCNT_dec_NN(string);
4018 # endif /* Some form of localeconv */
4019 # ifdef HAS_STRFTIME
4021 /* These formats are only available in later strfmtime's */
4022 case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM:
4024 /* The rest can be gotten from most versions of strftime(). */
4025 case ABDAY_1: case ABDAY_2: case ABDAY_3:
4026 case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7:
4028 case AM_STR: case PM_STR:
4029 case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4:
4030 case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8:
4031 case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12:
4032 case DAY_1: case DAY_2: case DAY_3: case DAY_4:
4033 case DAY_5: case DAY_6: case DAY_7:
4034 case MON_1: case MON_2: case MON_3: case MON_4:
4035 case MON_5: case MON_6: case MON_7: case MON_8:
4036 case MON_9: case MON_10: case MON_11: case MON_12:
4038 const char * format;
4039 bool return_format = FALSE;
4044 GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
4048 locale_panic_(Perl_form(aTHX_ "switch case: %d problem", item));
4049 NOT_REACHED; /* NOTREACHED */
4051 case PM_STR: hour = 18;
4055 case ABDAY_7: mday++;
4056 case ABDAY_6: mday++;
4057 case ABDAY_5: mday++;
4058 case ABDAY_4: mday++;
4059 case ABDAY_3: mday++;
4060 case ABDAY_2: mday++;
4073 case ABMON_12: mon++;
4074 case ABMON_11: mon++;
4075 case ABMON_10: mon++;
4076 case ABMON_9: mon++;
4077 case ABMON_8: mon++;
4078 case ABMON_7: mon++;
4079 case ABMON_6: mon++;
4080 case ABMON_5: mon++;
4081 case ABMON_4: mon++;
4082 case ABMON_3: mon++;
4083 case ABMON_2: mon++;
4103 return_format = TRUE;
4107 return_format = TRUE;
4111 return_format = TRUE;
4115 return_format = TRUE;
4118 format = "%Ow"; /* Find the alternate digit for 0 */
4122 GCC_DIAG_RESTORE_STMT;
4124 /* The year was deliberately chosen so that January 1 is on the
4125 * first day of the week. Since we're only getting one thing at a
4126 * time, it all works */
4127 const char * temp = my_strftime8(format, 30, 30, hour, mday, mon,
4128 2011, 0, 0, 0, &is_utf8);
4129 retval = save_to_buffer(temp, retbufp, retbuf_sizep);
4132 /* If the item is 'ALT_DIGITS', '*retbuf' contains the alternate
4133 * format for wday 0. If the value is the same as the normal 0,
4134 * there isn't an alternate, so clear the buffer.
4136 * (wday was chosen because its range is all a single digit.
4137 * Things like tm_sec have two digits as the minimum: '00'.) */
4138 if (item == ALT_DIGITS && strEQ(*retbufp, "0")) {
4143 /* ALT_DIGITS is problematic. Experiments on it showed that
4144 * strftime() did not always work properly when going from alt-9 to
4145 * alt-10. Only a few locales have this item defined, and in all
4146 * of them on Linux that khw was able to find, nl_langinfo() merely
4147 * returned the alt-0 character, possibly doubled. Most Unicode
4148 * digits are in blocks of 10 consecutive code points, so that is
4149 * sufficient information for such scripts, as we can infer alt-1,
4150 * alt-2, .... But for a Japanese locale, a CJK ideographic 0 is
4151 * returned, and the CJK digits are not in code point order, so you
4152 * can't really infer anything. The localedef for this locale did
4153 * specify the succeeding digits, so that strftime() works properly
4154 * on them, without needing to infer anything. But the
4155 * nl_langinfo() return did not give sufficient information for the
4156 * caller to understand what's going on. So until there is
4157 * evidence that it should work differently, this returns the alt-0
4158 * string for ALT_DIGITS. */
4160 if (return_format) {
4162 /* If to return the format, not the value, overwrite the buffer
4163 * with it. But some strftime()s will keep the original format
4164 * if illegal, so change those to "" */
4165 if (strEQ(*retbufp, format)) {
4172 /* A format is always in ASCII */
4173 is_utf8 = UTF8NESS_IMMATERIAL;
4183 /* The trivial case */
4184 if (isNAME_C_OR_POSIX(locale)) {
4191 /* This function retrieves the code page. It is subject to change, but
4192 * is documented and has been stable for many releases */
4193 UINT ___lc_codepage_func(void);
4195 retval = save_to_buffer(Perl_form(aTHX_ "%d", ___lc_codepage_func()),
4196 retbufp, retbuf_sizep);
4197 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "locale='%s' cp=%s\n",
4203 /* The codeset is important, but khw did not figure out a way for it to
4204 * be retrieved on non-Windows boxes without nl_langinfo(). But even
4205 * if we can't get it directly, we can usually determine if it is a
4206 * UTF-8 locale or not. If it is UTF-8, we (correctly) use that for
4209 # if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4211 /* If libc mbtowc() evaluates the bytes that form the REPLACEMENT
4212 * CHARACTER as that Unicode code point, this has to be a UTF-8 locale.
4216 (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0);/* Reset shift state */
4217 int mbtowc_ret = Perl_mbtowc_(aTHX_ &wc,
4218 STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
4219 if (mbtowc_ret >= 0 && wc == UNICODE_REPLACEMENT) {
4224 /* Here, it isn't a UTF-8 locale. */
4228 /* Here we know it isn't a UTF-8 locale (if mbtowc() was available on
4229 * the platform). All that is left us is looking at the locale name.
4231 * Find any dot in the locale name */
4232 retval = (const char *) strchr(locale, '.');
4234 retval = ""; /* Alas, no dot */
4238 /* Use everything past the dot */
4241 # if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4243 /* Here, we know that the locale did not act like a proper UTF-8 one.
4244 * So if it claims to be UTF-8, it is a lie */
4245 if (is_codeset_name_UTF8(retval)) {
4249 retval = save_to_buffer(retval, retbufp, retbuf_sizep);
4257 } /* Giant switch() of nl_langinfo() items */
4259 restore_toggled_locale_i(cat_index, orig_switched_locale);
4261 # ifdef USE_LOCALE_CTYPE
4262 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
4266 *utf8ness = is_utf8;
4271 # endif /* All the implementations of my_langinfo() */
4272 /*--------------------------------------------------------------------------*/
4274 } /* my_langinfo() */
4276 #endif /* USE_LOCALE */
4279 Perl_my_strftime8(pTHX_ const char *fmt, int sec, int min, int hour, int mday,
4280 int mon, int year, int wday, int yday, int isdst,
4281 utf8ness_t * utf8ness)
4282 { /* Documented in util.c */
4283 char * retval = my_strftime(fmt, sec, min, hour, mday, mon, year, wday,
4286 PERL_ARGS_ASSERT_MY_STRFTIME8;
4290 #ifdef USE_LOCALE_TIME
4291 *utf8ness = get_locale_string_utf8ness_i(NULL, LC_TIME_INDEX_,
4292 retval, LOCALE_UTF8NESS_UNKNOWN);
4294 *utf8ness = UTF8NESS_IMMATERIAL;
4299 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "fmt=%s, retval=%s", fmt,
4300 ((is_utf8_string((U8 *) retval, 0))
4302 :_byte_dump_string((U8 *) retval, strlen(retval), 0)));
4303 if (utf8ness) PerlIO_printf(Perl_debug_log, "; utf8ness=%d",
4305 PerlIO_printf(Perl_debug_log, "\n");
4312 * Initialize locale awareness.
4315 Perl_init_i18nl10n(pTHX_ int printwarn)
4319 * 0 if not to output warning when setup locale is bad
4320 * 1 if to output warning based on value of PERL_BADLANG
4321 * >1 if to output regardless of PERL_BADLANG
4324 * 1 = set ok or not applicable,
4325 * 0 = fallback to a locale of lower priority
4326 * -1 = fallback to all locales failed, not even to the C locale
4328 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
4329 * set, debugging information is output.
4331 * This looks more complicated than it is, mainly due to the #ifdefs and
4334 * Besides some asserts, data structure initialization, and specific
4335 * platform complications, this routine is effectively represented by this
4338 * setlocale(LC_ALL, ""); x
4339 * foreach (subcategory) { x
4340 * curlocales[f(subcategory)] = setlocale(subcategory, NULL); x
4342 * if (platform_so_requires) {
4343 * foreach (subcategory) {
4344 * PL_curlocales[f(subcategory)] = curlocales[f(subcategory)]
4347 * foreach (subcategory) {
4348 * if (needs_special_handling[f(subcategory)] &this_subcat_handler
4351 * This sets all the categories to the values in the current environment,
4352 * saves them temporarily in curlocales[] until they can be handled and/or
4353 * on some platforms saved in a per-thread array PL_curlocales[].
4355 * f(foo) is a mapping from the opaque system category numbers to small
4356 * non-negative integers used most everywhere in this file as indices into
4357 * arrays (such as curlocales[]) so the program doesn't have to otherwise
4358 * deal with the opaqueness.
4360 * If the platform doesn't have LC_ALL, the lines marked 'x' above are
4361 * effectively replaced by:
4362 * foreach (subcategory) { y
4363 * curlocales[f(subcategory)] = setlocale(subcategory, ""); y
4366 * The only differences being the lack of an LC_ALL call, and using ""
4367 * instead of NULL in the setlocale calls.
4369 * But there are, of course, complications.
4371 * it has to deal with if this is an embedded perl, whose locale doesn't
4372 * come from the environment, but has been set up by the caller. This is
4373 * pretty simply handled: the "" in the setlocale calls is not a string
4374 * constant, but a variable which is set to NULL in the embedded case.
4376 * But the major complication is handling failure and doing fallback. All
4377 * the code marked 'x' or 'y' above is actually enclosed in an outer loop,
4378 * using the array trial_locales[]. On entry, trial_locales[] is
4379 * initialized to just one entry, containing the NULL or "" locale argument
4380 * shown above. If, as is almost always the case, everything works, it
4381 * exits after just the one iteration, going on to the next step.
4383 * But if there is a failure, the code tries its best to honor the
4384 * environment as much as possible. It self-modifies trial_locales[] to
4385 * have more elements, one for each of the POSIX-specified settings from
4386 * the environment, such as LANG, ending in the ultimate fallback, the C
4387 * locale. Thus if there is something bogus with a higher priority
4388 * environment variable, it will try with the next highest, until something
4389 * works. If everything fails, it limps along with whatever state it got
4392 * A further complication is that Windows has an additional fallback, the
4393 * user-default ANSI code page obtained from the operating system. This is
4394 * added as yet another loop iteration, just before the final "C"
4396 * A slight complication is that in embedded Perls, the locale may already
4397 * be set-up, and we don't want to get it from the normal environment
4398 * variables. This is handled by having a special environment variable
4399 * indicate we're in this situation. We simply set setlocale's 2nd
4400 * parameter to be a NULL instead of "". That indicates to setlocale that
4401 * it is not to change anything, but to return the current value,
4402 * effectively initializing perl's db to what the locale already is.
4404 * We play the same trick with NULL if a LC_ALL succeeds. We call
4405 * setlocale() on the individual categores with NULL to get their existing
4406 * values for our db, instead of trying to change them.
4413 PERL_UNUSED_ARG(printwarn);
4415 #else /* USE_LOCALE */
4418 const char * const language = PerlEnv_getenv("LANGUAGE");
4422 /* NULL uses the existing already set up locale */
4423 const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
4426 typedef struct trial_locales_struct_s {
4427 const char* trial_locale;
4428 const char* fallback_desc;
4429 const char* fallback_name;
4430 } trial_locales_struct;
4431 /* 5 = 1 each for "", LC_ALL, LANG, (Win32) system default locale, C */
4432 trial_locales_struct trial_locales[5];
4433 unsigned int trial_locales_count;
4434 const char * const lc_all = PerlEnv_getenv("LC_ALL");
4435 const char * const lang = PerlEnv_getenv("LANG");
4436 bool setlocale_failure = FALSE;
4439 /* A later getenv() could zap this, so only use here */
4440 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
4442 const bool locwarn = (printwarn > 1
4444 && ( ! bad_lang_use_once
4446 /* disallow with "" or "0" */
4448 && strNE("0", bad_lang_use_once)))));
4450 /* current locale for given category; should have been copied so aren't
4452 const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
4455 # define DEBUG_LOCALE_INIT(a,b,c)
4458 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
4460 # define DEBUG_LOCALE_INIT(cat_index, locale, result) \
4461 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n", \
4462 setlocale_debug_string_i(cat_index, locale, result)));
4464 /* Make sure the parallel arrays are properly set up */
4465 # ifdef USE_LOCALE_NUMERIC
4466 assert(categories[LC_NUMERIC_INDEX_] == LC_NUMERIC);
4467 assert(strEQ(category_names[LC_NUMERIC_INDEX_], "LC_NUMERIC"));
4468 # ifdef USE_POSIX_2008_LOCALE
4469 assert(category_masks[LC_NUMERIC_INDEX_] == LC_NUMERIC_MASK);
4472 # ifdef USE_LOCALE_CTYPE
4473 assert(categories[LC_CTYPE_INDEX_] == LC_CTYPE);
4474 assert(strEQ(category_names[LC_CTYPE_INDEX_], "LC_CTYPE"));
4475 # ifdef USE_POSIX_2008_LOCALE
4476 assert(category_masks[LC_CTYPE_INDEX_] == LC_CTYPE_MASK);
4479 # ifdef USE_LOCALE_COLLATE
4480 assert(categories[LC_COLLATE_INDEX_] == LC_COLLATE);
4481 assert(strEQ(category_names[LC_COLLATE_INDEX_], "LC_COLLATE"));
4482 # ifdef USE_POSIX_2008_LOCALE
4483 assert(category_masks[LC_COLLATE_INDEX_] == LC_COLLATE_MASK);
4486 # ifdef USE_LOCALE_TIME
4487 assert(categories[LC_TIME_INDEX_] == LC_TIME);
4488 assert(strEQ(category_names[LC_TIME_INDEX_], "LC_TIME"));
4489 # ifdef USE_POSIX_2008_LOCALE
4490 assert(category_masks[LC_TIME_INDEX_] == LC_TIME_MASK);
4493 # ifdef USE_LOCALE_MESSAGES
4494 assert(categories[LC_MESSAGES_INDEX_] == LC_MESSAGES);
4495 assert(strEQ(category_names[LC_MESSAGES_INDEX_], "LC_MESSAGES"));
4496 # ifdef USE_POSIX_2008_LOCALE
4497 assert(category_masks[LC_MESSAGES_INDEX_] == LC_MESSAGES_MASK);
4500 # ifdef USE_LOCALE_MONETARY
4501 assert(categories[LC_MONETARY_INDEX_] == LC_MONETARY);
4502 assert(strEQ(category_names[LC_MONETARY_INDEX_], "LC_MONETARY"));
4503 # ifdef USE_POSIX_2008_LOCALE
4504 assert(category_masks[LC_MONETARY_INDEX_] == LC_MONETARY_MASK);
4507 # ifdef USE_LOCALE_ADDRESS
4508 assert(categories[LC_ADDRESS_INDEX_] == LC_ADDRESS);
4509 assert(strEQ(category_names[LC_ADDRESS_INDEX_], "LC_ADDRESS"));
4510 # ifdef USE_POSIX_2008_LOCALE
4511 assert(category_masks[LC_ADDRESS_INDEX_] == LC_ADDRESS_MASK);
4514 # ifdef USE_LOCALE_IDENTIFICATION
4515 assert(categories[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION);
4516 assert(strEQ(category_names[LC_IDENTIFICATION_INDEX_], "LC_IDENTIFICATION"));
4517 # ifdef USE_POSIX_2008_LOCALE
4518 assert(category_masks[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION_MASK);
4521 # ifdef USE_LOCALE_MEASUREMENT
4522 assert(categories[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT);
4523 assert(strEQ(category_names[LC_MEASUREMENT_INDEX_], "LC_MEASUREMENT"));
4524 # ifdef USE_POSIX_2008_LOCALE
4525 assert(category_masks[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT_MASK);
4528 # ifdef USE_LOCALE_PAPER
4529 assert(categories[LC_PAPER_INDEX_] == LC_PAPER);
4530 assert(strEQ(category_names[LC_PAPER_INDEX_], "LC_PAPER"));
4531 # ifdef USE_POSIX_2008_LOCALE
4532 assert(category_masks[LC_PAPER_INDEX_] == LC_PAPER_MASK);
4535 # ifdef USE_LOCALE_TELEPHONE
4536 assert(categories[LC_TELEPHONE_INDEX_] == LC_TELEPHONE);
4537 assert(strEQ(category_names[LC_TELEPHONE_INDEX_], "LC_TELEPHONE"));
4538 # ifdef USE_POSIX_2008_LOCALE
4539 assert(category_masks[LC_TELEPHONE_INDEX_] == LC_TELEPHONE_MASK);
4542 # ifdef USE_LOCALE_SYNTAX
4543 assert(categories[LC_SYNTAX_INDEX_] == LC_SYNTAX);
4544 assert(strEQ(category_names[LC_SYNTAX_INDEX_], "LC_SYNTAX"));
4545 # ifdef USE_POSIX_2008_LOCALE
4546 assert(category_masks[LC_SYNTAX_INDEX_] == LC_SYNTAX_MASK);
4549 # ifdef USE_LOCALE_TOD
4550 assert(categories[LC_TOD_INDEX_] == LC_TOD);
4551 assert(strEQ(category_names[LC_TOD_INDEX_], "LC_TOD"));
4552 # ifdef USE_POSIX_2008_LOCALE
4553 assert(category_masks[LC_TOD_INDEX_] == LC_TOD_MASK);
4557 assert(categories[LC_ALL_INDEX_] == LC_ALL);
4558 assert(strEQ(category_names[LC_ALL_INDEX_], "LC_ALL"));
4559 STATIC_ASSERT_STMT(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX_);
4560 # ifdef USE_POSIX_2008_LOCALE
4561 assert(category_masks[LC_ALL_INDEX_] == LC_ALL_MASK);
4564 # endif /* DEBUGGING */
4566 /* Initialize the per-thread mbrFOO() state variables. See POSIX.xs for
4567 * why these particular incantations are used. */
4569 memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
4572 memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
4575 wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
4578 /* Initialize the cache of the program's UTF-8ness for the always known
4579 * locales C and POSIX */
4580 my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness,
4581 sizeof(PL_locale_utf8ness));
4583 /* See https://github.com/Perl/perl5/issues/17824 */
4584 Zero(curlocales, NOMINAL_LC_ALL_INDEX, char *);
4586 # ifdef USE_THREAD_SAFE_LOCALE
4589 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
4593 # ifdef USE_POSIX_2008_LOCALE
4595 PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0);
4596 if (! PL_C_locale_obj) {
4597 locale_panic_(Perl_form(aTHX_
4598 "Cannot create POSIX 2008 C locale object"));
4601 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "created C object %p\n",
4604 # ifdef USE_LOCALE_NUMERIC
4606 PL_numeric_radix_sv = newSVpvn(C_decimal_point, strlen(C_decimal_point));
4607 Newx(PL_numeric_name, 2, char);
4608 Copy("C", PL_numeric_name, 2, char);
4611 # ifdef USE_LOCALE_COLLATE
4613 Newx(PL_collation_name, 2, char);
4614 Copy("C", PL_collation_name, 2, char);
4617 # ifdef USE_PL_CURLOCALES
4619 /* Initialize our records. If we have POSIX 2008, we have LC_ALL */
4620 void_setlocale_c(LC_ALL, porcelain_setlocale(LC_ALL, NULL));
4624 /* We try each locale in the list until we get one that works, or exhaust
4625 * the list. Normally the loop is executed just once. But if setting the
4626 * locale fails, inside the loop we add fallback trials to the array and so
4627 * will execute the loop multiple times */
4628 trial_locales[0] = (trial_locales_struct) {
4629 .trial_locale = setlocale_init,
4630 .fallback_desc = NULL,
4631 .fallback_name = NULL,
4633 trial_locales_count = 1;
4635 for (i= 0; i < trial_locales_count; i++) {
4636 const char * trial_locale = trial_locales[i].trial_locale;
4637 setlocale_failure = FALSE;
4641 /* setlocale() return vals; not copied so must be looked at
4643 const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
4644 sl_result[LC_ALL_INDEX_] = stdized_setlocale(LC_ALL, trial_locale);
4645 DEBUG_LOCALE_INIT(LC_ALL_INDEX_, trial_locale, sl_result[LC_ALL_INDEX_]);
4646 if (! sl_result[LC_ALL_INDEX_]) {
4647 setlocale_failure = TRUE;
4650 /* Since LC_ALL succeeded, it should have changed all the other
4651 * categories it can to its value; so we massage things so that the
4652 * setlocales below just return their category's current values.
4653 * This adequately handles the case in NetBSD where LC_COLLATE may
4654 * not be defined for a locale, and setting it individually will
4655 * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
4656 * the POSIX locale. */
4657 trial_locale = NULL;
4660 # endif /* LC_ALL */
4662 if (! setlocale_failure) {
4664 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
4665 Safefree(curlocales[j]);
4666 curlocales[j] = stdized_setlocale(categories[j], trial_locale);
4667 if (! curlocales[j]) {
4668 setlocale_failure = TRUE;
4670 curlocales[j] = savepv(curlocales[j]);
4671 DEBUG_LOCALE_INIT(j, trial_locale, curlocales[j]);
4674 if (LIKELY(! setlocale_failure)) { /* All succeeded */
4675 break; /* Exit trial_locales loop */
4679 /* Here, something failed; will need to try a fallback. */
4685 if (locwarn) { /* Output failure info only on the first one */
4689 PerlIO_printf(Perl_error_log,
4690 "perl: warning: Setting locale failed.\n");
4692 # else /* !LC_ALL */
4694 PerlIO_printf(Perl_error_log,
4695 "perl: warning: Setting locale failed for the categories:\n");
4697 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
4698 if (! curlocales[j]) {
4699 PerlIO_printf(Perl_error_log, "\t%s\n", category_names[j]);
4703 # endif /* LC_ALL */
4705 PerlIO_printf(Perl_error_log,
4706 "perl: warning: Please check that your locale settings:\n");
4710 PerlIO_printf(Perl_error_log,
4711 "\tLANGUAGE = %c%s%c,\n",
4712 language ? '"' : '(',
4713 language ? language : "unset",
4714 language ? '"' : ')');
4717 PerlIO_printf(Perl_error_log,
4718 "\tLC_ALL = %c%s%c,\n",
4720 lc_all ? lc_all : "unset",
4721 lc_all ? '"' : ')');
4723 # if defined(USE_ENVIRON_ARRAY)
4728 /* Look through the environment for any variables of the
4729 * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
4730 * already handled above. These are assumed to be locale
4731 * settings. Output them and their values. */
4732 for (e = environ; *e; e++) {
4733 const STRLEN prefix_len = sizeof("LC_") - 1;
4736 if ( strBEGINs(*e, "LC_")
4737 && ! strBEGINs(*e, "LC_ALL=")
4738 && (uppers_len = strspn(*e + prefix_len,
4739 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
4740 && ((*e)[prefix_len + uppers_len] == '='))
4742 PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
4743 (int) (prefix_len + uppers_len), *e,
4744 *e + prefix_len + uppers_len + 1);
4751 PerlIO_printf(Perl_error_log,
4752 "\t(possibly more locale environment variables)\n");
4756 PerlIO_printf(Perl_error_log,
4757 "\tLANG = %c%s%c\n",
4759 lang ? lang : "unset",
4762 PerlIO_printf(Perl_error_log,
4763 " are supported and installed on your system.\n");
4766 /* Calculate what fallback locales to try. We have avoided this
4767 * until we have to, because failure is quite unlikely. This will
4768 * usually change the upper bound of the loop we are in.
4770 * Since the system's default way of setting the locale has not
4771 * found one that works, We use Perl's defined ordering: LC_ALL,
4772 * LANG, and the C locale. We don't try the same locale twice, so
4773 * don't add to the list if already there. (On POSIX systems, the
4774 * LC_ALL element will likely be a repeat of the 0th element "",
4775 * but there's no harm done by doing it explicitly.
4777 * Note that this tries the LC_ALL environment variable even on
4778 * systems which have no LC_ALL locale setting. This may or may
4779 * not have been originally intentional, but there's no real need
4780 * to change the behavior. */
4782 for (j = 0; j < trial_locales_count; j++) {
4783 if (strEQ(lc_all, trial_locales[j].trial_locale)) {
4787 trial_locales[trial_locales_count++] = (trial_locales_struct) {
4788 .trial_locale = lc_all,
4789 .fallback_desc = (strEQ(lc_all, "C")
4790 ? "the standard locale"
4791 : "a fallback locale"),
4792 .fallback_name = lc_all,
4798 for (j = 0; j < trial_locales_count; j++) {
4799 if (strEQ(lang, trial_locales[j].trial_locale)) {
4803 trial_locales[trial_locales_count++] = (trial_locales_struct) {
4804 .trial_locale = lang,
4805 .fallback_desc = (strEQ(lang, "C")
4806 ? "the standard locale"
4807 : "a fallback locale"),
4808 .fallback_name = lang,
4813 # if defined(WIN32) && defined(LC_ALL)
4815 /* For Windows, we also try the system default locale before "C".
4816 * (If there exists a Windows without LC_ALL we skip this because
4817 * it gets too complicated. For those, the "C" is the next
4818 * fallback possibility). */
4820 /* Note that this may change the locale, but we are going to do
4822 const char *system_default_locale = stdized_setlocale(LC_ALL, "");
4823 DEBUG_LOCALE_INIT(LC_ALL_INDEX_, "", system_default_locale);
4825 /* Skip if invalid or if it's already on the list of locales to
4827 if (! system_default_locale) {
4828 goto done_system_default;
4830 for (j = 0; j < trial_locales_count; j++) {
4831 if (strEQ(system_default_locale, trial_locales[j].trial_locale)) {
4832 goto done_system_default;
4836 trial_locales[trial_locales_count++] = (trial_locales_struct) {
4837 .trial_locale = system_default_locale,
4838 .fallback_desc = (strEQ(system_default_locale, "C")
4839 ? "the standard locale"
4840 : "the system default locale"),
4841 .fallback_name = system_default_locale,
4844 done_system_default:
4848 for (j = 0; j < trial_locales_count; j++) {
4849 if (strEQ("C", trial_locales[j].trial_locale)) {
4853 trial_locales[trial_locales_count++] = (trial_locales_struct) {
4854 .trial_locale = "C",
4855 .fallback_desc = "the standard locale",
4856 .fallback_name = "C",
4860 } /* end of first time through the loop */
4868 } /* end of looping through the trial locales */
4870 if (ok < 1) { /* If we tried to fallback */
4872 if (! setlocale_failure) { /* fallback succeeded */
4873 msg = "Falling back to";
4875 else { /* fallback failed */
4878 /* We dropped off the end of the loop, so have to decrement i to
4879 * get back to the value the last time through */
4883 msg = "Failed to fall back to";
4885 /* To continue, we should use whatever values we've got */
4887 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
4888 Safefree(curlocales[j]);
4889 curlocales[j] = savepv(stdized_setlocale(categories[j], NULL));
4890 DEBUG_LOCALE_INIT(j, NULL, curlocales[j]);
4895 const char * description = trial_locales[i].fallback_desc;
4896 const char * name = trial_locales[i].fallback_name;
4898 if (name && strNE(name, "")) {
4899 PerlIO_printf(Perl_error_log,
4900 "perl: warning: %s %s (\"%s\").\n", msg, description, name);
4903 PerlIO_printf(Perl_error_log,
4904 "perl: warning: %s %s.\n", msg, description);
4907 } /* End of tried to fallback */
4909 # ifdef USE_POSIX_2008_LOCALE
4911 /* The stdized setlocales haven't affected the P2008 locales. Initialize
4912 * them now, calculating LC_ALL only on the final go round, when all have
4914 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
4915 (void) emulate_setlocale_i(i, curlocales[i],
4916 RECALCULATE_LC_ALL_ON_FINAL_INTERATION,
4922 /* Done with finding the locales; update the auxiliary records */
4925 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
4927 # if defined(USE_LOCALE_THREADS) && ! defined(USE_THREAD_SAFE_LOCALE)
4929 /* This caches whether each category's locale is UTF-8 or not. This
4930 * may involve changing the locale. It is ok to do this at
4931 * initialization time before any threads have started, but not later
4932 * unless thread-safe operations are used.
4933 * Caching means that if the program heeds our dictate not to change
4934 * locales in threaded applications, this data will remain valid, and
4935 * it may get queried without having to change locales. If the
4936 * environment is such that all categories have the same locale, this
4937 * isn't needed, as the code will not change the locale; but this
4938 * handles the uncommon case where the environment has disparate
4939 * locales for the categories */
4940 (void) _is_cur_LC_category_utf8(categories[i]);
4944 Safefree(curlocales[i]);
4947 # if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
4949 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
4950 * locale is UTF-8. The call to new_ctype() just above has already
4951 * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
4952 * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
4953 * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
4954 * STDIN, STDOUT, STDERR, _and_ the default open discipline. */
4955 PL_utf8locale = PL_in_utf8_CTYPE_locale;
4957 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
4958 This is an alternative to using the -C command line switch
4959 (the -C if present will override this). */
4961 const char *p = PerlEnv_getenv("PERL_UNICODE");
4962 PL_unicode = p ? parse_unicode_opts(&p) : 0;
4963 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
4968 #endif /* USE_LOCALE */
4970 /* So won't continue to output stuff */
4971 DEBUG_INITIALIZATION_set(FALSE);
4976 #ifdef USE_LOCALE_COLLATE
4979 Perl__mem_collxfrm(pTHX_ const char *input_string,
4980 STRLEN len, /* Length of 'input_string' */
4981 STRLEN *xlen, /* Set to length of returned string
4982 (not including the collation index
4984 bool utf8 /* Is the input in UTF-8? */
4987 /* _mem_collxfrm() is like strxfrm() but with two important differences.
4988 * First, it handles embedded NULs. Second, it allocates a bit more memory
4989 * than needed for the transformed data itself. The real transformed data
4990 * begins at offset COLLXFRM_HDR_LEN. *xlen is set to the length of that,
4991 * and doesn't include the collation index size.
4993 * It is the caller's responsibility to eventually free the memory returned
4996 * Please see sv_collxfrm() to see how this is used. */
4998 # define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
5000 char * s = (char *) input_string;
5001 STRLEN s_strlen = strlen(input_string);
5003 STRLEN xAlloc; /* xalloc is a reserved word in VC */
5004 STRLEN length_in_chars;
5005 bool first_time = TRUE; /* Cleared after first loop iteration */
5007 PERL_ARGS_ASSERT__MEM_COLLXFRM;
5009 /* Must be NUL-terminated */
5010 assert(*(input_string + len) == '\0');
5012 /* If this locale has defective collation, skip */
5013 if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
5014 DEBUG_L(PerlIO_printf(Perl_debug_log,
5015 "_mem_collxfrm: locale's collation is defective\n"));
5019 /* Replace any embedded NULs with the control that sorts before any others.
5020 * This will give as good as possible results on strings that don't
5021 * otherwise contain that character, but otherwise there may be
5022 * less-than-perfect results with that character and NUL. This is
5023 * unavoidable unless we replace strxfrm with our own implementation. */
5024 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
5028 STRLEN sans_nuls_len;
5029 int try_non_controls;
5030 char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
5031 making sure 2nd byte is NUL.
5033 STRLEN this_replacement_len;
5035 /* If we don't know what non-NUL control character sorts lowest for
5036 * this locale, find it */
5037 if (PL_strxfrm_NUL_replacement == '\0') {
5039 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
5040 includes the collation index
5043 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
5045 /* Unlikely, but it may be that no control will work to replace
5046 * NUL, in which case we instead look for any character. Controls
5047 * are preferred because collation order is, in general, context
5048 * sensitive, with adjoining characters affecting the order, and
5049 * controls are less likely to have such interactions, allowing the
5050 * NUL-replacement to stand on its own. (Another way to look at it
5051 * is to imagine what would happen if the NUL were replaced by a
5052 * combining character; it wouldn't work out all that well.) */
5053 for (try_non_controls = 0;
5054 try_non_controls < 2;
5057 /* Look through all legal code points (NUL isn't) */
5058 for (j = 1; j < 256; j++) {
5059 char * x; /* j's xfrm plus collation index */
5060 STRLEN x_len; /* length of 'x' */
5061 STRLEN trial_len = 1;
5062 char cur_source[] = { '\0', '\0' };
5064 /* Skip non-controls the first time through the loop. The
5065 * controls in a UTF-8 locale are the L1 ones */
5066 if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
5073 /* Create a 1-char string of the current code point */
5074 cur_source[0] = (char) j;
5076 /* Then transform it */
5077 x = _mem_collxfrm(cur_source, trial_len, &x_len,
5078 0 /* The string is not in UTF-8 */);
5080 /* Ignore any character that didn't successfully transform.
5086 /* If this character's transformation is lower than
5087 * the current lowest, this one becomes the lowest */
5088 if ( cur_min_x == NULL
5089 || strLT(x + COLLXFRM_HDR_LEN,
5090 cur_min_x + COLLXFRM_HDR_LEN))
5092 PL_strxfrm_NUL_replacement = j;
5093 Safefree(cur_min_x);
5099 } /* end of loop through all 255 characters */
5101 /* Stop looking if found */
5106 /* Unlikely, but possible, if there aren't any controls that
5107 * work in the locale, repeat the loop, looking for any
5108 * character that works */
5109 DEBUG_L(PerlIO_printf(Perl_debug_log,
5110 "_mem_collxfrm: No control worked. Trying non-controls\n"));
5111 } /* End of loop to try first the controls, then any char */
5114 DEBUG_L(PerlIO_printf(Perl_debug_log,
5115 "_mem_collxfrm: Couldn't find any character to replace"
5116 " embedded NULs in locale %s with", PL_collation_name));
5120 DEBUG_L(PerlIO_printf(Perl_debug_log,
5121 "_mem_collxfrm: Replacing embedded NULs in locale %s with "
5122 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
5124 Safefree(cur_min_x);
5125 } /* End of determining the character that is to replace NULs */
5127 /* If the replacement is variant under UTF-8, it must match the
5128 * UTF8-ness of the original */
5129 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
5130 this_replacement_char[0] =
5131 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
5132 this_replacement_char[1] =
5133 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
5134 this_replacement_len = 2;
5137 this_replacement_char[0] = PL_strxfrm_NUL_replacement;
5138 /* this_replacement_char[1] = '\0' was done at initialization */
5139 this_replacement_len = 1;
5142 /* The worst case length for the replaced string would be if every
5143 * character in it is NUL. Multiply that by the length of each
5144 * replacement, and allow for a trailing NUL */
5145 sans_nuls_len = (len * this_replacement_len) + 1;
5146 Newx(sans_nuls, sans_nuls_len, char);
5149 /* Replace each NUL with the lowest collating control. Loop until have
5150 * exhausted all the NULs */
5151 while (s + s_strlen < e) {
5152 my_strlcat(sans_nuls, s, sans_nuls_len);
5154 /* Do the actual replacement */
5155 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
5157 /* Move past the input NUL */
5159 s_strlen = strlen(s);
5162 /* And add anything that trails the final NUL */
5163 my_strlcat(sans_nuls, s, sans_nuls_len);
5165 /* Switch so below we transform this modified string */
5168 } /* End of replacing NULs */
5170 /* Make sure the UTF8ness of the string and locale match */
5171 if (utf8 != PL_in_utf8_COLLATE_locale) {
5172 /* XXX convert above Unicode to 10FFFF? */
5173 const char * const t = s; /* Temporary so we can later find where the
5176 /* Here they don't match. Change the string's to be what the locale is
5179 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
5180 s = (char *) bytes_to_utf8((const U8 *) s, &len);
5183 else { /* locale is not UTF-8; but input is; downgrade the input */
5185 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
5187 /* If the downgrade was successful we are done, but if the input
5188 * contains things that require UTF-8 to represent, have to do
5189 * damage control ... */
5190 if (UNLIKELY(utf8)) {
5192 /* What we do is construct a non-UTF-8 string with
5193 * 1) the characters representable by a single byte converted
5194 * to be so (if necessary);
5195 * 2) and the rest converted to collate the same as the
5196 * highest collating representable character. That makes
5197 * them collate at the end. This is similar to how we
5198 * handle embedded NULs, but we use the highest collating
5199 * code point instead of the smallest. Like the NUL case,
5200 * this isn't perfect, but is the best we can reasonably
5201 * do. Every above-255 code point will sort the same as
5202 * the highest-sorting 0-255 code point. If that code
5203 * point can combine in a sequence with some other code
5204 * points for weight calculations, us changing something to
5205 * be it can adversely affect the results. But in most
5206 * cases, it should work reasonably. And note that this is
5207 * really an illegal situation: using code points above 255
5208 * on a locale where only 0-255 are valid. If two strings
5209 * sort entirely equal, then the sort order for the
5210 * above-255 code points will be in code point order. */
5214 /* If we haven't calculated the code point with the maximum
5215 * collating order for this locale, do so now */
5216 if (! PL_strxfrm_max_cp) {
5219 /* The current transformed string that collates the
5220 * highest (except it also includes the prefixed collation
5222 char * cur_max_x = NULL;
5224 /* Look through all legal code points (NUL isn't) */
5225 for (j = 1; j < 256; j++) {
5228 char cur_source[] = { '\0', '\0' };
5230 /* Create a 1-char string of the current code point */
5231 cur_source[0] = (char) j;
5233 /* Then transform it */
5234 x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
5236 /* If something went wrong (which it shouldn't), just
5237 * ignore this code point */
5242 /* If this character's transformation is higher than
5243 * the current highest, this one becomes the highest */
5244 if ( cur_max_x == NULL
5245 || strGT(x + COLLXFRM_HDR_LEN,
5246 cur_max_x + COLLXFRM_HDR_LEN))
5248 PL_strxfrm_max_cp = j;
5249 Safefree(cur_max_x);
5258 DEBUG_L(PerlIO_printf(Perl_debug_log,
5259 "_mem_collxfrm: Couldn't find any character to"
5260 " replace above-Latin1 chars in locale %s with",
5261 PL_collation_name));
5265 DEBUG_L(PerlIO_printf(Perl_debug_log,
5266 "_mem_collxfrm: highest 1-byte collating character"
5267 " in locale %s is 0x%02X\n",
5269 PL_strxfrm_max_cp));
5271 Safefree(cur_max_x);
5274 /* Here we know which legal code point collates the highest.
5275 * We are ready to construct the non-UTF-8 string. The length
5276 * will be at least 1 byte smaller than the input string
5277 * (because we changed at least one 2-byte character into a
5278 * single byte), but that is eaten up by the trailing NUL */
5284 char * e = (char *) t + len;
5286 for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
5288 if (UTF8_IS_INVARIANT(cur_char)) {
5291 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
5292 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
5294 else { /* Replace illegal cp with highest collating
5296 s[d++] = PL_strxfrm_max_cp;
5300 Renew(s, d, char); /* Free up unused space */
5305 /* Here, we have constructed a modified version of the input. It could
5306 * be that we already had a modified copy before we did this version.
5307 * If so, that copy is no longer needed */
5308 if (t != input_string) {
5313 length_in_chars = (utf8)
5314 ? utf8_length((U8 *) s, (U8 *) s + len)
5317 /* The first element in the output is the collation id, used by
5318 * sv_collxfrm(); then comes the space for the transformed string. The
5319 * equation should give us a good estimate as to how much is needed */
5320 xAlloc = COLLXFRM_HDR_LEN
5322 + (PL_collxfrm_mult * length_in_chars);
5323 Newx(xbuf, xAlloc, char);
5324 if (UNLIKELY(! xbuf)) {
5325 DEBUG_L(PerlIO_printf(Perl_debug_log,
5326 "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
5330 /* Store the collation id */
5331 *(U32*)xbuf = PL_collation_ix;
5333 /* Then the transformation of the input. We loop until successful, or we
5338 *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
5340 /* If the transformed string occupies less space than we told strxfrm()
5341 * was available, it means it transformed the whole string. */
5342 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
5344 /* But there still could have been a problem */
5346 DEBUG_L(PerlIO_printf(Perl_debug_log,
5347 "strxfrm failed for LC_COLLATE=%s; errno=%d, input=%s\n",
5348 PL_collation_name, errno,
5349 _byte_dump_string((U8 *) s, len, 0)));
5353 /* Here, the transformation was successful. Some systems include a
5354 * trailing NUL in the returned length. Ignore it, using a loop in
5355 * case multiple trailing NULs are returned. */
5357 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
5362 /* If the first try didn't get it, it means our prediction was low.
5363 * Modify the coefficients so that we predict a larger value in any
5364 * future transformations */
5366 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */
5367 STRLEN computed_guess = PL_collxfrm_base
5368 + (PL_collxfrm_mult * length_in_chars);
5370 /* On zero-length input, just keep current slope instead of
5372 const STRLEN new_m = (length_in_chars != 0)
5373 ? needed / length_in_chars
5376 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5377 "initial size of %zu bytes for a length "
5378 "%zu string was insufficient, %zu needed\n",
5379 computed_guess, length_in_chars, needed));
5381 /* If slope increased, use it, but discard this result for
5382 * length 1 strings, as we can't be sure that it's a real slope
5384 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
5388 STRLEN old_m = PL_collxfrm_mult;
5389 STRLEN old_b = PL_collxfrm_base;
5393 PL_collxfrm_mult = new_m;
5394 PL_collxfrm_base = 1; /* +1 For trailing NUL */
5395 computed_guess = PL_collxfrm_base
5396 + (PL_collxfrm_mult * length_in_chars);
5397 if (computed_guess < needed) {
5398 PL_collxfrm_base += needed - computed_guess;
5401 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5402 "slope is now %zu; was %zu, base "
5403 "is now %zu; was %zu\n",
5404 PL_collxfrm_mult, old_m,
5405 PL_collxfrm_base, old_b));
5407 else { /* Slope didn't change, but 'b' did */
5408 const STRLEN new_b = needed
5411 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5412 "base is now %zu; was %zu\n", new_b, PL_collxfrm_base));
5413 PL_collxfrm_base = new_b;
5420 if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
5421 DEBUG_L(PerlIO_printf(Perl_debug_log,
5422 "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
5423 *xlen, PERL_INT_MAX));
5427 /* A well-behaved strxfrm() returns exactly how much space it needs
5428 * (usually not including the trailing NUL) when it fails due to not
5429 * enough space being provided. Assume that this is the case unless
5430 * it's been proven otherwise */
5431 if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
5432 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
5434 else { /* Here, either:
5435 * 1) The strxfrm() has previously shown bad behavior; or
5436 * 2) It isn't the first time through the loop, which means
5437 * that the strxfrm() is now showing bad behavior, because
5438 * we gave it what it said was needed in the previous
5439 * iteration, and it came back saying it needed still more.
5440 * (Many versions of cygwin fit this. When the buffer size
5441 * isn't sufficient, they return the input size instead of
5442 * how much is needed.)
5443 * Increase the buffer size by a fixed percentage and try again.
5445 xAlloc += (xAlloc / 4) + 1;
5446 PL_strxfrm_is_behaved = FALSE;
5448 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5449 "_mem_collxfrm required more space than previously"
5450 " calculated for locale %s, trying again with new"
5452 PL_collation_name, COLLXFRM_HDR_LEN,
5453 xAlloc - COLLXFRM_HDR_LEN));
5456 Renew(xbuf, xAlloc, char);
5457 if (UNLIKELY(! xbuf)) {
5458 DEBUG_L(PerlIO_printf(Perl_debug_log,
5459 "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
5466 DEBUG_Lv((print_collxfrm_input_and_return(s, s + len, xlen, utf8),
5467 PerlIO_printf(Perl_debug_log, "Its xfrm is:"),
5468 PerlIO_printf(Perl_debug_log, "%s\n",
5469 _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
5472 /* Free up unneeded space; retain enough for trailing NUL */
5473 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
5475 if (s != input_string) {
5483 DEBUG_Lv(print_collxfrm_input_and_return(s, s + len, NULL, utf8));
5486 if (s != input_string) {
5497 S_print_collxfrm_input_and_return(pTHX_
5498 const char * const s,
5499 const char * const e,
5500 const STRLEN * const xlen,
5504 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
5506 PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
5507 (UV)PL_collation_ix);
5509 PerlIO_printf(Perl_debug_log, "%zu", *xlen);
5512 PerlIO_printf(Perl_debug_log, "NULL");
5514 PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
5516 print_bytes_for_locale(s, e, is_utf8);
5518 PerlIO_printf(Perl_debug_log, "'\n");
5521 # endif /* DEBUGGING */
5522 #endif /* USE_LOCALE_COLLATE */
5528 S_print_bytes_for_locale(pTHX_
5529 const char * const s,
5530 const char * const e,
5534 bool prev_was_printable = TRUE;
5535 bool first_time = TRUE;
5537 PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
5541 ? utf8_to_uvchr_buf((U8 *) t, e, NULL)
5544 if (! prev_was_printable) {
5545 PerlIO_printf(Perl_debug_log, " ");
5547 PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
5548 prev_was_printable = TRUE;
5552 PerlIO_printf(Perl_debug_log, " ");
5554 PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
5555 prev_was_printable = FALSE;
5557 t += (is_utf8) ? UTF8SKIP(t) : 1;
5562 # endif /* #ifdef DEBUGGING */
5565 S_toggle_locale_i(pTHX_ const unsigned cat_index,
5566 const char * new_locale,
5567 const line_t caller_line)
5569 /* Changes the locale for the category specified by 'index' to 'new_locale,
5570 * if they aren't already the same.
5572 * Returns a copy of the name of the original locale for 'cat_index'
5573 * so can be switched back to with the companion function
5574 * restore_toggled_locale_i(), (NULL if no restoral is necessary.) */
5576 const char * locale_to_restore_to = NULL;
5578 PERL_ARGS_ASSERT_TOGGLE_LOCALE_I;
5579 assert(cat_index <= NOMINAL_LC_ALL_INDEX);
5581 /* Find the original locale of the category we may need to change, so that
5582 * it can be restored to later */
5584 locale_to_restore_to = querylocale_i(cat_index);
5586 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5587 "(%" LINE_Tf "): toggle_locale_i: index=%d(%s), wanted=%s,"
5589 caller_line, cat_index, category_names[cat_index],
5590 new_locale, locale_to_restore_to));
5592 if (! locale_to_restore_to) {
5593 locale_panic_(Perl_form(aTHX_ "Could not find current %s locale, errno=%d",
5594 category_names[cat_index], errno));
5597 /* If the locales are the same, there's nothing to do */
5598 if (strEQ(locale_to_restore_to, new_locale)) {
5599 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5600 "(%d): %s locale unchanged as %s\n",
5601 caller_line, category_names[cat_index],
5607 /* Finally, change the locale to the new one */
5608 void_setlocale_i(cat_index, new_locale);
5610 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "(%" LINE_Tf "): %s locale switched to %s\n",
5611 caller_line, category_names[cat_index], new_locale));
5613 return locale_to_restore_to;
5616 PERL_UNUSED_ARG(caller_line);
5622 S_restore_toggled_locale_i(pTHX_ const unsigned int cat_index,
5623 const char * restore_locale,
5624 const line_t caller_line)
5626 /* Restores the locale for LC_category corresponding to cat_indes to
5627 * 'restore_locale' (which is a copy that will be freed by this function),
5628 * or do nothing if the latter parameter is NULL */
5630 PERL_ARGS_ASSERT_RESTORE_TOGGLED_LOCALE_I;
5631 assert(cat_index <= NOMINAL_LC_ALL_INDEX);
5633 if (restore_locale == NULL) {
5634 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5635 "(%" LINE_Tf "): No need to restore %s\n",
5636 caller_line, category_names[cat_index]));
5640 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5641 "(%" LINE_Tf "): %s restoring locale to %s\n",
5642 caller_line, category_names[cat_index],
5645 void_setlocale_i(cat_index, restore_locale);
5648 PERL_UNUSED_ARG(caller_line);
5654 S_switch_category_locale_to_template(pTHX_ const int switch_category,
5655 const int template_category,
5656 const char * template_locale)
5658 /* Changes the locale for LC_'switch_category" to that of
5659 * LC_'template_category', if they aren't already the same. If not NULL,
5660 * 'template_locale' is the locale that 'template_category' is in.
5662 * Returns a copy of the name of the original locale for 'switch_category'
5663 * so can be switched back to with the companion function
5664 * restore_switched_locale(), (NULL if no restoral is necessary.) */
5666 const char * restore_to_locale = NULL;
5668 if (switch_category == template_category) { /* No changes needed */
5672 /* Find the original locale of the category we may need to change, so that
5673 * it can be restored to later */
5674 restore_to_locale = querylocale_r(switch_category);
5675 if (! restore_to_locale) {
5676 locale_panic_(Perl_form(aTHX_ "Could not find current %s locale",
5677 category_name(switch_category)));
5679 restore_to_locale = savepv(restore_to_locale);
5681 /* If the locale of the template category wasn't passed in, find it now */
5682 if (template_locale == NULL) {
5683 template_locale = querylocale_r(template_category);
5684 if (! template_locale) {
5685 locale_panic_(Perl_form(aTHX_ "Could not find current %s locale\n",
5686 category_name(template_category)));
5690 /* It the locales are the same, there's nothing to do */
5691 if (strEQ(restore_to_locale, template_locale)) {
5692 Safefree(restore_to_locale);
5694 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n",
5695 category_name(switch_category), template_locale));
5700 /* Finally, change the locale to the template one */
5701 if (! bool_setlocale_r(switch_category, template_locale)) {
5702 setlocale_failure_panic_i(get_category_index(switch_category,
5704 category_name(switch_category),
5710 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale switched to %s\n",
5711 category_name(switch_category), template_locale));
5713 return restore_to_locale;
5717 S_restore_switched_locale(pTHX_ const int category,
5718 const char * const original_locale)
5720 /* Restores the locale for LC_'category' to 'original_locale' (which is a
5721 * copy that will be freed by this function), or do nothing if the latter
5722 * parameter is NULL */
5724 if (original_locale == NULL) {
5728 if (! bool_setlocale_r(category, original_locale)) {
5729 locale_panic_(Perl_form(aTHX_ "s restoring %s to %s failed",
5730 category_name(category), original_locale));
5733 Safefree(original_locale);
5736 /* is_cur_LC_category_utf8 uses a small char buffer to avoid malloc/free */
5737 # define CUR_LC_BUFFER_SIZE 64
5740 Perl__is_cur_LC_category_utf8(pTHX_ int category)
5742 /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
5743 * otherwise. 'category' may not be LC_ALL. If the platform doesn't have
5744 * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
5745 * could give the wrong result. The result will very likely be correct for
5746 * languages that have commonly used non-ASCII characters, but for notably
5747 * English, it comes down to if the locale's name ends in something like
5748 * "UTF-8". It errs on the side of not being a UTF-8 locale.
5750 * If the platform is early C89, not containing mbtowc(), or we are
5751 * compiled to not pay attention to LC_CTYPE, this employs heuristics.
5752 * These work very well for non-Latin locales or those whose currency
5753 * symbol isn't a '$' nor plain ASCII text. But without LC_CTYPE and at
5754 * least MB_CUR_MAX, English locales with an ASCII currency symbol depend
5755 * on the name containing UTF-8 or not. */
5757 /* Name of current locale corresponding to the input category */
5758 const char *save_input_locale = NULL;
5760 bool is_utf8 = FALSE; /* The return value */
5762 /* The variables below are for the cache of previous lookups using this
5763 * function. The cache is a C string, described at the definition for
5764 * 'C_and_POSIX_utf8ness'.
5766 * The first part of the cache is fixed, for the C and POSIX locales. The
5767 * varying part starts just after them. */
5768 char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness);
5770 Size_t utf8ness_cache_size; /* Size of the varying portion */
5771 Size_t input_name_len; /* Length in bytes of save_input_locale */
5772 Size_t input_name_len_with_overhead; /* plus extra chars used to store
5773 the name in the cache */
5774 char * delimited; /* The name plus the delimiters used to store
5776 char buffer[CUR_LC_BUFFER_SIZE]; /* small buffer */
5777 char * name_pos; /* position of 'delimited' in the cache, or 0
5783 assert(category != LC_ALL);
5787 /* Get the desired category's locale */
5788 save_input_locale = querylocale_r(category);
5790 DEBUG_L(PerlIO_printf(Perl_debug_log,
5791 "Current locale for %s is %s\n",
5792 category_name(category), save_input_locale));
5794 input_name_len = strlen(save_input_locale);
5796 /* In our cache, each name is accompanied by two delimiters and a single
5798 input_name_len_with_overhead = input_name_len + 3;
5800 if ( input_name_len_with_overhead <= CUR_LC_BUFFER_SIZE ) {
5801 /* we can use the buffer, avoid a malloc */
5803 } else { /* need a malloc */
5804 /* Allocate and populate space for a copy of the name surrounded by the
5806 Newx(delimited, input_name_len_with_overhead, char);
5809 delimited[0] = UTF8NESS_SEP[0];
5810 Copy(save_input_locale, delimited + 1, input_name_len, char);
5811 delimited[input_name_len+1] = UTF8NESS_PREFIX[0];
5812 delimited[input_name_len+2] = '\0';
5814 /* And see if that is in the cache */
5815 name_pos = instr(PL_locale_utf8ness, delimited);
5817 is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0';
5819 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5820 "UTF8ness for locale %s=%d, \n",
5821 save_input_locale, is_utf8));
5823 /* And, if not already in that position, move it to the beginning of
5824 * the non-constant portion of the list, since it is the most recently
5825 * used. (We don't have to worry about overflow, since just moving
5826 * existing names around) */
5827 if (name_pos > utf8ness_cache) {
5828 Move(utf8ness_cache,
5829 utf8ness_cache + input_name_len_with_overhead,
5830 name_pos - utf8ness_cache, char);
5833 input_name_len_with_overhead - 1, char);
5834 utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
5837 /* free only when not using the buffer */
5838 if ( delimited != buffer ) Safefree(delimited);
5842 /* Here we don't have stored the utf8ness for the input locale. We have to
5845 # if defined(USE_LOCALE_CTYPE) \
5846 && ( defined(HAS_SOME_LANGINFO) \
5847 || (defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)))
5850 const char *original_ctype_locale
5851 = switch_category_locale_to_template(LC_CTYPE,
5855 /* Here the current LC_CTYPE is set to the locale of the category whose
5856 * information is desired. This means that nl_langinfo() and mbtowc()
5857 * should give the correct results */
5859 # ifdef MB_CUR_MAX /* But we can potentially rule out UTF-8ness, avoiding
5860 calling the functions if we have this */
5862 /* Standard UTF-8 needs at least 4 bytes to represent the maximum
5863 * Unicode code point. */
5865 DEBUG_L(PerlIO_printf(Perl_debug_log, "MB_CUR_MAX=%d\n",
5867 if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) {
5869 restore_switched_locale(LC_CTYPE, original_ctype_locale);
5870 goto finish_and_return;
5874 # if defined(HAS_SOME_LANGINFO)
5876 { /* The task is easiest if the platform has this POSIX 2001 function.
5877 Except on some platforms it can wrongly return "", so have to have
5878 a fallback. And it can return that it's UTF-8, even if there are
5879 variances from that. For example, Turkish locales may use the
5880 alternate dotted I rules, and sometimes it appears to be a
5881 defective locale definition. XXX We should probably check for
5882 these in the Latin1 range and warn (but on glibc, requires
5883 iswalnum() etc. due to their not handling 80-FF correctly */
5884 const char * scratch_buffer = NULL;
5885 const char *codeset = my_langinfo_c(CODESET, LC_CTYPE,
5887 &scratch_buffer, NULL, NULL);
5889 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5890 "\tnllanginfo returned CODESET '%s'\n", codeset));
5892 if (codeset && strNE(codeset, "")) {
5894 /* If the implementation of foldEQ() somehow were
5895 * to change to not go byte-by-byte, this could
5896 * read past end of string, as only one length is
5897 * checked. But currently, a premature NUL will
5898 * compare false, and it will stop there */
5899 is_utf8 = cBOOL( foldEQ(codeset, "UTF-8", STRLENs("UTF-8"))
5900 || foldEQ(codeset, "UTF8", STRLENs("UTF8")));
5902 DEBUG_L(PerlIO_printf(Perl_debug_log,
5903 "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
5905 restore_switched_locale(LC_CTYPE, original_ctype_locale);
5906 Safefree(scratch_buffer);
5907 goto finish_and_return;
5912 # if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
5913 /* We can see if this is a UTF-8-like locale if have mbtowc(). It was a
5914 * late adder to C89, so very likely to have it. However, testing has
5915 * shown that, like nl_langinfo() above, there are locales that are not
5916 * strictly UTF-8 that this will return that they are */
5921 PERL_UNUSED_RESULT(mbtowc_(NULL, NULL, 0));
5922 len = mbtowc_(&wc, REPLACEMENT_CHARACTER_UTF8,
5923 STRLENs(REPLACEMENT_CHARACTER_UTF8));
5925 is_utf8 = cBOOL( len == STRLENs(REPLACEMENT_CHARACTER_UTF8)
5926 && wc == (wchar_t) UNICODE_REPLACEMENT);
5931 restore_switched_locale(LC_CTYPE, original_ctype_locale);
5932 goto finish_and_return;
5937 /* Here, we must have a C89 compiler that doesn't have mbtowc(). Next
5938 * try looking at the currency symbol to see if it disambiguates
5939 * things. Often that will be in the native script, and if the symbol
5940 * isn't in UTF-8, we know that the locale isn't. If it is non-ASCII
5941 * UTF-8, we infer that the locale is too, as the odds of a non-UTF8
5942 * string being valid UTF-8 are quite small */
5944 # ifdef USE_LOCALE_MONETARY
5946 /* If have LC_MONETARY, we can look at the currency symbol. Often that
5947 * will be in the native script. We do this one first because there is
5948 * just one string to examine, so potentially avoids work */
5951 const char *original_monetary_locale
5952 = switch_category_locale_to_template(LC_MONETARY,
5955 bool only_ascii = FALSE;
5956 const char * scratch_buffer = NULL;
5957 const U8 * currency_string
5958 = (const U8 *) my_langinfo_c(CRNCYSTR, LC_MONETARY,
5960 &scratch_buffer, NULL, NULL);
5961 /* 2nd param not relevant for this item */
5962 const U8 * first_variant;
5964 assert( *currency_string == '-'
5965 || *currency_string == '+'
5966 || *currency_string == '.');
5970 if (is_utf8_invariant_string_loc(currency_string, 0, &first_variant))
5972 DEBUG_L(PerlIO_printf(Perl_debug_log,
5973 "Couldn't get currency symbol for %s, or contains"
5974 " only ASCII; can't use for determining if UTF-8"
5975 " locale\n", save_input_locale));
5979 is_utf8 = is_strict_utf8_string(first_variant, 0);
5981 Safefree(scratch_buffer);
5983 restore_switched_locale(LC_MONETARY, original_monetary_locale);
5987 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
5988 * otherwise assume the locale is UTF-8 if and only if the symbol
5989 * is non-ascii UTF-8. */
5990 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5991 "\t?Currency symbol for %s is UTF-8=%d\n",
5992 save_input_locale, is_utf8));
5993 goto finish_and_return;
5997 # endif /* USE_LOCALE_MONETARY */
5998 # if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
6000 /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
6001 * the names of the months and weekdays, timezone, and am/pm indicator */
6003 const char *original_time_locale
6004 = switch_category_locale_to_template(LC_TIME,
6008 bool is_dst = FALSE;
6012 char * formatted_time;
6014 /* Here the current LC_TIME is set to the locale of the category
6015 * whose information is desired. Look at all the days of the week
6016 * and month names, and the timezone and am/pm indicator for UTF-8
6017 * variant characters. The first such a one found will tell us if
6018 * the locale is UTF-8 or not */
6020 for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
6021 formatted_time = my_strftime("%A %B %Z %p",
6022 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
6023 if ( ! formatted_time
6024 || is_utf8_invariant_string((U8 *) formatted_time, 0))
6027 /* Here, we didn't find a non-ASCII. Try the next time
6028 * through with the complemented dst and am/pm, and try
6029 * with the next weekday. After we have gotten all
6030 * weekdays, try the next month */
6032 hour = (hour + 12) % 24;
6037 Safefree(formatted_time);
6041 /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
6042 * false otherwise. But first, restore LC_TIME to its original
6043 * locale if we changed it */
6044 restore_switched_locale(LC_TIME, original_time_locale);
6046 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6047 "\t?time-related strings for %s are UTF-8=%d\n",
6049 is_utf8_string((U8 *) formatted_time, 0)));
6050 is_utf8 = is_utf8_string((U8 *) formatted_time, 0);
6051 Safefree(formatted_time);
6052 goto finish_and_return;
6055 /* Falling off the end of the loop indicates all the names were just
6056 * ASCII. Go on to the next test. If we changed it, restore LC_TIME
6057 * to its original locale */
6058 restore_switched_locale(LC_TIME, original_time_locale);
6059 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6060 "All time-related words for %s contain only ASCII;"
6061 " can't use for determining if UTF-8 locale\n",
6062 save_input_locale));
6067 # if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
6069 /* This code is ifdefd out because it was found to not be necessary in
6070 * testing on our dromedary test machine, which has over 700 locales.
6071 * There, this added no value to looking at the currency symbol and the
6072 * time strings. I left it in so as to avoid rewriting it if real-world
6073 * experience indicates that dromedary is an outlier. Essentially, instead
6074 * of returning abpve if we haven't found illegal utf8, we continue on and
6075 * examine all the strerror() messages on the platform for utf8ness. If
6076 * all are ASCII, we still don't know the answer; but otherwise we have a
6077 * pretty good indication of the utf8ness. The reason this doesn't help
6078 * much is that the messages may not have been translated into the locale.
6079 * The currency symbol and time strings are much more likely to have been
6083 bool non_ascii = FALSE;
6084 const char *original_messages_locale
6085 = switch_category_locale_to_template(LC_MESSAGES,
6088 const char * errmsg = NULL;
6090 /* Here the current LC_MESSAGES is set to the locale of the category
6091 * whose information is desired. Look through all the messages. We
6092 * can't use Strerror() here because it may expand to code that
6093 * segfaults in miniperl */
6095 for (e = 0; e <= sys_nerr; e++) {
6097 errmsg = sys_errlist[e];
6098 if (errno || !errmsg) {
6101 errmsg = savepv(errmsg);
6102 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
6104 is_utf8 = is_utf8_string((U8 *) errmsg, 0);
6110 restore_switched_locale(LC_MESSAGES, original_messages_locale);
6114 /* Any non-UTF-8 message means not a UTF-8 locale; if all are
6115 * valid, any non-ascii means it is one; otherwise we assume it
6117 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6118 "\t?error messages for %s are UTF-8=%d\n",
6121 goto finish_and_return;
6124 DEBUG_L(PerlIO_printf(Perl_debug_log,
6125 "All error messages for %s contain only ASCII;"
6126 " can't use for determining if UTF-8 locale\n",
6127 save_input_locale));
6131 # ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
6134 /* As a last resort, look at the locale name to see if it matches
6135 * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
6136 * return of setlocale(), is actually defined to be opaque, so we can't
6137 * really rely on the absence of various substrings in the name to indicate
6138 * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
6139 * be a UTF-8 locale. Similarly for the other common names */
6142 const Size_t final_pos = strlen(save_input_locale) - 1;
6144 if (final_pos >= 3) {
6145 const char *name = save_input_locale;
6147 /* Find next 'U' or 'u' and look from there */
6148 while ((name += strcspn(name, "Uu") + 1)
6149 <= save_input_locale + final_pos - 2)
6151 if ( isALPHA_FOLD_NE(*name, 't')
6152 || isALPHA_FOLD_NE(*(name + 1), 'f'))
6157 if (*(name) == '-') {
6158 if ((name > save_input_locale + final_pos - 1)) {
6163 if (*(name) == '8') {
6164 DEBUG_L(PerlIO_printf(Perl_debug_log,
6165 "Locale %s ends with UTF-8 in name\n",
6166 save_input_locale));
6168 goto finish_and_return;
6171 DEBUG_L(PerlIO_printf(Perl_debug_log,
6172 "Locale %s doesn't end with UTF-8 in name\n",
6173 save_input_locale));
6178 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
6179 if (memENDs(save_input_locale, final_pos, "65001")) {
6180 DEBUG_L(PerlIO_printf(Perl_debug_log,
6181 "Locale %s ends with 65001 in name, is UTF-8 locale\n",
6182 save_input_locale));
6184 goto finish_and_return;
6191 /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
6192 * since we are about to return FALSE anyway, there is no point in doing
6193 * this extra work */
6196 if (instr(save_input_locale, "8859")) {
6197 DEBUG_L(PerlIO_printf(Perl_debug_log,
6198 "Locale %s has 8859 in name, not UTF-8 locale\n",
6199 save_input_locale));
6201 goto finish_and_return;
6205 DEBUG_L(PerlIO_printf(Perl_debug_log,
6206 "Assuming locale %s is not a UTF-8 locale\n",
6207 save_input_locale));
6210 # endif /* the code that is compiled when no modern LC_CTYPE */
6214 /* Cache this result so we don't have to go through all this next time. */
6215 utf8ness_cache_size = sizeof(PL_locale_utf8ness)
6216 - (utf8ness_cache - PL_locale_utf8ness);
6218 /* But we can't save it if it is too large for the total space available */
6219 if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) {
6220 Size_t utf8ness_cache_len = strlen(utf8ness_cache);
6222 /* Here it can fit, but we may need to clear out the oldest cached
6223 * result(s) to do so. Check */
6224 if (utf8ness_cache_len + input_name_len_with_overhead
6225 >= utf8ness_cache_size)
6227 /* Here we have to clear something out to make room for this.
6228 * Start looking at the rightmost place where it could fit and find
6229 * the beginning of the entry that extends past that. */
6230 char * cutoff = (char *) my_memrchr(utf8ness_cache,
6233 - input_name_len_with_overhead);
6236 assert(cutoff >= utf8ness_cache);
6238 /* This and all subsequent entries must be removed */
6240 utf8ness_cache_len = strlen(utf8ness_cache);
6243 /* Make space for the new entry */
6244 Move(utf8ness_cache,
6245 utf8ness_cache + input_name_len_with_overhead,
6246 utf8ness_cache_len + 1 /* Incl. trailing NUL */, char);
6249 Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char);
6250 utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
6252 if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1] & ~1) != '0') {
6253 locale_panic_(Perl_form(aTHX_
6254 "Corrupt utf8ness_cache=%s\nlen=%zu,"
6255 " inserted_name=%s, its_len=%zu",
6256 PL_locale_utf8ness, strlen(PL_locale_utf8ness),
6257 delimited, input_name_len_with_overhead));
6263 if (DEBUG_Lv_TEST) {
6264 const char * s = PL_locale_utf8ness;
6266 /* Audit the structure */
6267 while (s < PL_locale_utf8ness + strlen(PL_locale_utf8ness)) {
6270 if (*s != UTF8NESS_SEP[0]) {
6271 locale_panic_(Perl_form(aTHX_
6272 "Corrupt utf8ness_cache: missing"
6273 " separator %.*s<-- HERE %s",
6274 (int) (s - PL_locale_utf8ness),
6279 e = strchr(s, UTF8NESS_PREFIX[0]);
6281 e = PL_locale_utf8ness + strlen(PL_locale_utf8ness);
6282 locale_panic_(Perl_form(aTHX_
6283 "Corrupt utf8ness_cache: missing"
6284 " separator %.*s<-- HERE %s",
6285 (int) (e - PL_locale_utf8ness),
6290 if (*e != '0' && *e != '1') {
6291 locale_panic_(Perl_form(aTHX_
6292 "Corrupt utf8ness_cache: utf8ness"
6293 " must be [01] %.*s<-- HERE %s",
6294 (int) (e + 1 - PL_locale_utf8ness),
6298 if (ninstr(PL_locale_utf8ness, s, s-1, e)) {
6299 locale_panic_(Perl_form(aTHX_
6300 "Corrupt utf8ness_cache: entry"
6301 " has duplicate %.*s<-- HERE %s",
6302 (int) (e - PL_locale_utf8ness),
6310 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6311 "PL_locale_utf8ness is now %s; returning %d\n",
6312 PL_locale_utf8ness, is_utf8));
6316 /* free only when not using the buffer */
6317 if ( delimited != buffer ) Safefree(delimited);
6322 S_is_codeset_name_UTF8(const char * name)
6324 /* Return a boolean as to if the passed-in name indicates it is a UTF-8
6325 * code set. Several variants are possible */
6326 const Size_t len = strlen(name);
6328 PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8;
6332 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
6333 if (memENDs(name, len, "65001")) {
6338 /* 'UTF8' or 'UTF-8' */
6339 return ( inRANGE(len, 4, 5)
6340 && name[len-1] == '8'
6341 && ( memBEGINs(name, len, "UTF")
6342 || memBEGINs(name, len, "utf"))
6343 && (len == 4 || name[3] == '-'));
6347 S_is_locale_utf8(pTHX_ const char * locale)
6349 /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise. It uses
6352 # if ! defined(USE_LOCALE_CTYPE) \
6353 || defined(EBCDIC) /* There aren't any real UTF-8 locales at this time */
6355 PERL_UNUSED_ARG(locale);
6361 const char * scratch_buffer = NULL;
6362 const char * codeset = my_langinfo_c(CODESET, LC_CTYPE, locale,
6363 &scratch_buffer, NULL, NULL);
6364 bool retval = is_codeset_name_UTF8(codeset);
6366 PERL_ARGS_ASSERT_IS_LOCALE_UTF8;
6368 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6369 "found codeset=%s, is_utf8=%d\n", codeset, retval));
6371 Safefree(scratch_buffer);
6378 #endif /* USE_LOCALE */
6381 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
6383 /* Internal function which returns if we are in the scope of a pragma that
6384 * enables the locale category 'category'. 'compiling' should indicate if
6385 * this is during the compilation phase (TRUE) or not (FALSE). */
6387 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
6389 SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0);
6390 if (! these_categories || these_categories == &PL_sv_placeholder) {
6394 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
6395 * a valid unsigned */
6396 assert(category >= -1);
6397 return cBOOL(SvUV(these_categories) & (1U << (category + 1)));
6401 Perl_my_strerror(pTHX_ const int errnum)
6403 /* Returns a mortalized copy of the text of the error message associated
6404 * with 'errnum'. It uses the current locale's text unless the platform
6405 * doesn't have the LC_MESSAGES category or we are not being called from
6406 * within the scope of 'use locale'. In the former case, it uses whatever
6407 * strerror returns; in the latter case it uses the text from the C locale.
6409 * The function just calls strerror(), but temporarily switches, if needed,
6410 * to the C locale */
6414 #ifndef USE_LOCALE_MESSAGES
6416 /* If platform doesn't have messages category, we don't do any switching to
6417 * the C locale; we just use whatever strerror() returns */
6419 errstr = savepv(Strerror(errnum));
6421 #else /* Has locale messages */
6423 const bool within_locale_scope = IN_LC(LC_MESSAGES);
6425 # ifndef USE_LOCALE_THREADS
6427 /* This function is trivial without threads. */
6428 if (within_locale_scope) {
6429 errstr = savepv(Strerror(errnum));
6432 const char * save_locale = querylocale_c(LC_MESSAGES);
6434 void_setlocale_c(LC_MESSAGES, "C");
6435 errstr = savepv(Strerror(errnum));
6436 void_setlocale_c(LC_MESSAGES, save_locale);
6439 # elif defined(USE_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
6441 /* This function is also trivial if we don't have to worry about thread
6442 * safety and have strerror_l(), as it handles the switch of locales so we
6443 * don't have to deal with that. We don't have to worry about thread
6444 * safety if strerror_r() is also available. Both it and strerror_l() are
6445 * thread-safe. Plain strerror() isn't thread safe. But on threaded
6446 * builds when strerror_r() is available, the apparent call to strerror()
6447 * below is actually a macro that behind-the-scenes calls strerror_r(). */
6449 # ifdef HAS_STRERROR_R
6451 if (within_locale_scope) {
6452 errstr = savepv(Strerror(errnum));
6455 errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
6460 /* Here we have strerror_l(), but not strerror_r() and we are on a
6461 * threaded-build. We use strerror_l() for everything, constructing a
6462 * locale to pass to it if necessary */
6464 locale_t locale_to_use;
6466 if (within_locale_scope) {
6467 locale_to_use = use_curlocale_scratch();
6469 else { /* Use C locale if not within 'use locale' scope */
6470 locale_to_use = PL_C_locale_obj;
6473 errstr = savepv(strerror_l(errnum, locale_to_use));
6476 # else /* Doesn't have strerror_l() */
6478 const char * save_locale = NULL;
6479 bool locale_is_C = FALSE;
6481 /* We have a critical section to prevent another thread from executing this
6482 * same code at the same time which could cause LC_MESSAGES to be changed
6483 * to something else while we need it to be constant. (On thread-safe
6484 * perls, the LOCK is a no-op.) Since this is the only place in core that
6485 * changes LC_MESSAGES (unless the user has called setlocale()), this works
6486 * to prevent races. */
6489 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6490 "my_strerror called with errnum %d\n", errnum));
6492 /* If not within locale scope, need to return messages in the C locale */
6493 if (within_locale_scope) {
6494 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "WITHIN locale scope\n"));
6497 save_locale = querylocale_c(LC_MESSAGES);
6498 if (! save_locale) {
6500 locale_panic_("Could not find current LC_MESSAGES locale");
6501 NOT_REACHED; /* NOTREACHED */ \
6504 locale_is_C = isNAME_C_OR_POSIX(save_locale);
6506 /* Switch to the C locale if not already in it */
6507 if (! locale_is_C && ! bool_setlocale_c(LC_MESSAGES, "C")) {
6509 /* If, for some reason, the locale change failed, we soldier on as
6510 * best as possible under the circumstances, using the current
6511 * locale, and clear save_locale, so we don't try to change back.
6512 * On z/0S, all setlocale() calls fail after you've created a
6513 * thread. This is their way of making sure the entire process is
6514 * always a single locale. This means that 'use locale' is always
6515 * in place for messages under these circumstances. */
6518 } /* end of ! within_locale_scope */
6520 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6521 "Any locale change has been done; about to call Strerror\n"));
6522 errstr = savepv(Strerror(errnum));
6524 /* Switch back if we successully switched */
6527 && ! bool_setlocale_c(LC_MESSAGES, save_locale))
6530 locale_panic_(Perl_form(aTHX_
6531 "setlocale restore to '%s' failed",
6533 NOT_REACHED; /* NOTREACHED */ \
6538 # endif /* End of doesn't have strerror_l */
6540 DEBUG_Lv((PerlIO_printf(Perl_debug_log,
6541 "Strerror returned; saving a copy: '"),
6542 print_bytes_for_locale(errstr, errstr + strlen(errstr), 0),
6543 PerlIO_printf(Perl_debug_log, "'\n")));
6545 #endif /* End of does have locale messages */
6553 =for apidoc switch_to_global_locale
6555 On systems without locale support, or on typical single-threaded builds, or on
6556 platforms that do not support per-thread locale operations, this function does
6557 nothing. On such systems that do have locale support, only a locale global to
6558 the whole program is available.
6560 On multi-threaded builds on systems that do have per-thread locale operations,
6561 this function converts the thread it is running in to use the global locale.
6562 This is for code that has not yet or cannot be updated to handle multi-threaded
6563 locale operation. As long as only a single thread is so-converted, everything
6564 works fine, as all the other threads continue to ignore the global one, so only
6565 this thread looks at it.
6567 However, on Windows systems this isn't quite true prior to Visual Studio 15,
6568 at which point Microsoft fixed a bug. A race can occur if you use the
6569 following operations on earlier Windows platforms:
6573 =item L<POSIX::localeconv|POSIX/localeconv>
6575 =item L<I18N::Langinfo>, items C<CRNCYSTR> and C<THOUSEP>
6577 =item L<perlapi/Perl_langinfo>, items C<CRNCYSTR> and C<THOUSEP>
6581 The first item is not fixable (except by upgrading to a later Visual Studio
6582 release), but it would be possible to work around the latter two items by using
6583 the Windows API functions C<GetNumberFormat> and C<GetCurrencyFormat>; patches
6586 Without this function call, threads that use the L<C<setlocale(3)>> system
6587 function will not work properly, as all the locale-sensitive functions will
6588 look at the per-thread locale, and C<setlocale> will have no effect on this
6591 Perl code should convert to either call
6592 L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in for the system
6593 C<setlocale>) or use the methods given in L<perlcall> to call
6594 L<C<POSIX::setlocale>|POSIX/setlocale>. Either one will transparently properly
6595 handle all cases of single- vs multi-thread, POSIX 2008-supported or not.
6597 Non-Perl libraries, such as C<gtk>, that call the system C<setlocale> can
6598 continue to work if this function is called before transferring control to the
6601 Upon return from the code that needs to use the global locale,
6602 L<C<sync_locale()>|perlapi/sync_locale> should be called to restore the safe
6603 multi-thread operation.
6609 Perl_switch_to_global_locale()
6613 #ifdef USE_THREAD_SAFE_LOCALE
6616 _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
6623 for (i = 0; i < LC_ALL_INDEX_; i++) {
6624 setlocale(categories[i], querylocale_i(i));
6628 uselocale(LC_GLOBAL_LOCALE);
6637 =for apidoc sync_locale
6639 L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or
6640 change the locale (though changing the locale is antisocial and dangerous on
6641 multi-threaded systems that don't have multi-thread safe locale operations.
6642 (See L<perllocale/Multi-threaded operation>). Using the system
6643 L<C<setlocale(3)>> should be avoided. Nevertheless, certain non-Perl libraries
6644 called from XS, such as C<Gtk> do so, and this can't be changed. When the
6645 locale is changed by XS code that didn't use
6646 L<C<Perl_setlocale>|perlapi/Perl_setlocale>, Perl needs to be told that the
6647 locale has changed. Use this function to do so, before returning to Perl.
6649 The return value is a boolean: TRUE if the global locale at the time of call
6650 was in effect; and FALSE if a per-thread locale was in effect. This can be
6651 used by the caller that needs to restore things as-they-were to decide whether
6653 L<C<Perl_switch_to_global_locale>|perlapi/switch_to_global_locale>.
6668 const char * newlocale;
6671 # ifdef USE_POSIX_2008_LOCALE
6673 bool was_in_global_locale = FALSE;
6674 locale_t cur_obj = uselocale((locale_t) 0);
6676 /* On Windows, unless the foreign code has turned off the thread-safe
6677 * locale setting, any plain setlocale() will have affected what we see, so
6678 * no need to worry. Otherwise, If the foreign code has done a plain
6679 * setlocale(), it will only affect the global locale on POSIX systems, but
6680 * will affect the */
6681 if (cur_obj == LC_GLOBAL_LOCALE) {
6683 # ifdef HAS_QUERY_LOCALE
6685 void_setlocale_c(LC_ALL, querylocale_c(LC_ALL));
6691 /* We can't trust that we can read the LC_ALL format on the
6692 * platform, so do them individually */
6693 for (i = 0; i < LC_ALL_INDEX_; i++) {
6694 void_setlocale_i(i, querylocale_i(i));
6699 was_in_global_locale = TRUE;
6704 bool was_in_global_locale = TRUE;
6707 # ifdef USE_LOCALE_CTYPE
6709 newlocale = querylocale_c(LC_CTYPE);
6710 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6711 "%s\n", setlocale_debug_string_c(LC_CTYPE, NULL, newlocale)));
6712 new_ctype(newlocale);
6714 # endif /* USE_LOCALE_CTYPE */
6715 # ifdef USE_LOCALE_COLLATE
6717 newlocale = querylocale_c(LC_COLLATE);
6718 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6719 "%s\n", setlocale_debug_string_c(LC_COLLATE, NULL, newlocale)));
6720 new_collate(newlocale);
6723 # ifdef USE_LOCALE_NUMERIC
6725 newlocale = querylocale_c(LC_NUMERIC);
6726 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6727 "%s\n", setlocale_debug_string_c(LC_NUMERIC, NULL, newlocale)));
6728 new_numeric(newlocale);
6730 # endif /* USE_LOCALE_NUMERIC */
6732 return was_in_global_locale;
6738 #if defined(DEBUGGING) && defined(USE_LOCALE)
6741 S_setlocale_debug_string_i(const unsigned cat_index,
6742 const char* const locale, /* Optional locale name */
6744 /* return value from setlocale() when attempting to
6745 * set 'category' to 'locale' */
6746 const char* const retval)
6748 /* Returns a pointer to a NUL-terminated string in static storage with
6749 * added text about the info passed in. This is not thread safe and will
6750 * be overwritten by the next call, so this should be used just to
6751 * formulate a string to immediately print or savepv() on. */
6753 static char ret[1024];
6754 assert(cat_index <= NOMINAL_LC_ALL_INDEX);
6756 my_strlcpy(ret, "setlocale(", sizeof(ret));
6757 my_strlcat(ret, category_names[cat_index], sizeof(ret));
6758 my_strlcat(ret, ", ", sizeof(ret));
6761 my_strlcat(ret, "\"", sizeof(ret));
6762 my_strlcat(ret, locale, sizeof(ret));
6763 my_strlcat(ret, "\"", sizeof(ret));
6766 my_strlcat(ret, "NULL", sizeof(ret));
6769 my_strlcat(ret, ") returned ", sizeof(ret));
6772 my_strlcat(ret, "\"", sizeof(ret));
6773 my_strlcat(ret, retval, sizeof(ret));
6774 my_strlcat(ret, "\"", sizeof(ret));
6777 my_strlcat(ret, "NULL", sizeof(ret));
6780 assert(strlen(ret) < sizeof(ret));
6788 Perl_thread_locale_init()
6790 /* Called from a thread on startup*/
6792 #ifdef USE_THREAD_SAFE_LOCALE
6797 DEBUG_L(PerlIO_printf(Perl_debug_log,
6798 "new thread, initial locale is %s; calling setlocale\n",
6799 setlocale(LC_ALL, NULL)));
6803 /* On Windows, make sure new thread has per-thread locales enabled */
6804 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
6808 /* This thread starts off in the C locale */
6809 Perl_setlocale(LC_ALL, "C");
6817 Perl_thread_locale_term()
6819 /* Called from a thread as it gets ready to terminate */
6821 #ifdef USE_POSIX_2008_LOCALE
6823 /* C starts the new thread in the global C locale. If we are thread-safe,
6824 * we want to not be in the global locale */
6827 locale_t cur_obj = uselocale(LC_GLOBAL_LOCALE);
6828 if (cur_obj != LC_GLOBAL_LOCALE && cur_obj != PL_C_locale_obj) {
6829 freelocale(cur_obj);
6838 * ex: set ts=8 sts=4 sw=4 et: