3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * A Elbereth Gilthoniel,
13 * silivren penna míriel
14 * o menel aglar elenath!
15 * Na-chaered palan-díriel
16 * o galadhremmin ennorath,
17 * Fanuilos, le linnathon
18 * nef aear, si nef aearon!
20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
23 /* utility functions for handling locale-specific stuff like what
24 * character represents the decimal point.
26 * All C programs have an underlying locale. Perl code generally doesn't pay
27 * any attention to it except within the scope of a 'use locale'. For most
28 * categories, it accomplishes this by just using different operations if it is
29 * in such scope than if not. However, various libc functions called by Perl
30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31 * are used to toggle between the current locale and the C locale depending on
32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is
33 * switched to the C locale for outputting the message unless within the scope
38 #define PERL_IN_LOCALE_C
39 #include "perl_langinfo.h"
44 /* If the environment says to, we can output debugging information during
45 * initialization. This is done before option parsing, and before any thread
46 * creation, so can be a file-level static */
48 # ifdef PERL_GLOBAL_STRUCT
49 /* no global syms allowed */
50 # define debug_initialization 0
51 # define DEBUG_INITIALIZATION_set(v)
53 static bool debug_initialization = FALSE;
54 # define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
58 /* strlen() of a literal string constant. XXX We might want this more general,
59 * but using it in just this file for now */
60 #define STRLENs(s) (sizeof("" s "") - 1)
65 * Standardize the locale name from a string returned by 'setlocale', possibly
66 * modifying that string.
68 * The typical return value of setlocale() is either
69 * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
70 * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
71 * (the space-separated values represent the various sublocales,
72 * in some unspecified order). This is not handled by this function.
74 * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
75 * which is harmful for further use of the string in setlocale(). This
76 * function removes the trailing new line and everything up through the '='
80 S_stdize_locale(pTHX_ char *locs)
82 const char * const s = strchr(locs, '=');
85 PERL_ARGS_ASSERT_STDIZE_LOCALE;
88 const char * const t = strchr(s, '.');
91 const char * const u = strchr(t, '\n');
92 if (u && (u[1] == 0)) {
93 const STRLEN len = u - s;
94 Move(s + 1, locs, len, char);
102 Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
109 /* Windows requres a customized base-level setlocale() */
111 # define my_setlocale(cat, locale) win32_setlocale(cat, locale)
113 # define my_setlocale(cat, locale) setlocale(cat, locale)
116 /* Just placeholders for now. "_c" is intended to be called when the category
117 * is a constant known at compile time; "_r", not known until run time */
118 # define do_setlocale_c(category, locale) my_setlocale(category, locale)
119 # define do_setlocale_r(category, locale) my_setlocale(category, locale)
122 S_set_numeric_radix(pTHX_ const bool use_locale)
124 /* If 'use_locale' is FALSE, set to use a dot for the radix character. If
125 * TRUE, use the radix character derived from the current locale */
127 #if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \
128 || defined(HAS_NL_LANGINFO))
130 /* We only set up the radix SV if we are to use a locale radix ... */
132 const char * radix = my_nl_langinfo(PERL_RADIXCHAR, FALSE);
133 /* FALSE => already in dest locale */
135 /* ... and the character being used isn't a dot */
136 if (strNE(radix, ".")) {
137 if (PL_numeric_radix_sv) {
138 sv_setpv(PL_numeric_radix_sv, radix);
141 PL_numeric_radix_sv = newSVpv(radix, 0);
144 if ( ! is_utf8_invariant_string(
145 (U8 *) SvPVX(PL_numeric_radix_sv), SvCUR(PL_numeric_radix_sv))
147 (U8 *) SvPVX(PL_numeric_radix_sv), SvCUR(PL_numeric_radix_sv))
148 && _is_cur_LC_category_utf8(LC_NUMERIC))
150 SvUTF8_on(PL_numeric_radix_sv);
156 SvREFCNT_dec(PL_numeric_radix_sv);
157 PL_numeric_radix_sv = NULL;
163 if (DEBUG_L_TEST || debug_initialization) {
164 PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
165 (PL_numeric_radix_sv)
166 ? SvPVX(PL_numeric_radix_sv)
168 (PL_numeric_radix_sv)
169 ? cBOOL(SvUTF8(PL_numeric_radix_sv))
174 #endif /* USE_LOCALE_NUMERIC and can find the radix char */
178 /* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
179 * return of setlocale(), then this is extremely likely to be the C or POSIX
180 * locale. However, the output of setlocale() is documented to be opaque, but
181 * the odds are extremely small that it would return these two strings for some
182 * other locale. Note that VMS in these two locales includes many non-ASCII
183 * characters as controls and punctuation (below are hex bytes):
185 * punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
186 * Oddly, none there are listed as alphas, though some represent alphabetics
187 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
188 #define isNAME_C_OR_POSIX(name) \
190 && (( *(name) == 'C' && (*(name + 1)) == '\0') \
191 || strEQ((name), "POSIX")))
194 Perl_new_numeric(pTHX_ const char *newnum)
197 #ifndef USE_LOCALE_NUMERIC
199 PERL_UNUSED_ARG(newnum);
203 /* Called after all libc setlocale() calls affecting LC_NUMERIC, to tell
204 * core Perl this and that 'newnum' is the name of the new locale.
205 * It installs this locale as the current underlying default.
207 * The default locale and the C locale can be toggled between by use of the
208 * set_numeric_underlying() and set_numeric_standard() functions, which
209 * should probably not be called directly, but only via macros like
210 * SET_NUMERIC_STANDARD() in perl.h.
212 * The toggling is necessary mainly so that a non-dot radix decimal point
213 * character can be output, while allowing internal calculations to use a
216 * This sets several interpreter-level variables:
217 * PL_numeric_name The underlying locale's name: a copy of 'newnum'
218 * PL_numeric_underlying A boolean indicating if the toggled state is such
219 * that the current locale is the program's underlying
221 * PL_numeric_standard An int indicating if the toggled state is such
222 * that the current locale is the C locale. If non-zero,
223 * it is in C; if > 1, it means it may not be toggled away
225 * Note that both of the last two variables can be true at the same time,
226 * if the underlying locale is C. (Toggling is a no-op under these
229 * Any code changing the locale (outside this file) should use
230 * POSIX::setlocale, which calls this function. Therefore this function
231 * should be called directly only from this file and from
232 * POSIX::setlocale() */
237 Safefree(PL_numeric_name);
238 PL_numeric_name = NULL;
239 PL_numeric_standard = TRUE;
240 PL_numeric_underlying = TRUE;
244 save_newnum = stdize_locale(savepv(newnum));
246 PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
247 PL_numeric_underlying = TRUE;
249 if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
250 Safefree(PL_numeric_name);
251 PL_numeric_name = save_newnum;
254 Safefree(save_newnum);
257 /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't
258 * have to worry about the radix being a non-dot. (Core operations that
259 * need the underlying locale change to it temporarily). */
260 set_numeric_standard();
262 #endif /* USE_LOCALE_NUMERIC */
267 Perl_set_numeric_standard(pTHX)
270 #ifdef USE_LOCALE_NUMERIC
272 /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like
273 * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The
274 * macro avoids calling this routine if toggling isn't necessary according
275 * to our records (which could be wrong if some XS code has changed the
276 * locale behind our back) */
278 do_setlocale_c(LC_NUMERIC, "C");
279 PL_numeric_standard = TRUE;
280 PL_numeric_underlying = isNAME_C_OR_POSIX(PL_numeric_name);
281 set_numeric_radix(0);
285 if (DEBUG_L_TEST || debug_initialization) {
286 PerlIO_printf(Perl_debug_log,
287 "Underlying LC_NUMERIC locale now is C\n");
291 #endif /* USE_LOCALE_NUMERIC */
296 Perl_set_numeric_underlying(pTHX)
299 #ifdef USE_LOCALE_NUMERIC
301 /* Toggle the LC_NUMERIC locale to the current underlying default. Most
302 * code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h
303 * instead of calling this directly. The macro avoids calling this routine
304 * if toggling isn't necessary according to our records (which could be
305 * wrong if some XS code has changed the locale behind our back) */
307 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
308 PL_numeric_standard = isNAME_C_OR_POSIX(PL_numeric_name);
309 PL_numeric_underlying = TRUE;
310 set_numeric_radix(1);
314 if (DEBUG_L_TEST || debug_initialization) {
315 PerlIO_printf(Perl_debug_log,
316 "Underlying LC_NUMERIC locale now is %s\n",
321 #endif /* USE_LOCALE_NUMERIC */
326 * Set up for a new ctype locale.
329 S_new_ctype(pTHX_ const char *newctype)
332 #ifndef USE_LOCALE_CTYPE
334 PERL_ARGS_ASSERT_NEW_CTYPE;
335 PERL_UNUSED_ARG(newctype);
340 /* Called after all libc setlocale() calls affecting LC_CTYPE, to tell
341 * core Perl this and that 'newctype' is the name of the new locale.
343 * This function sets up the folding arrays for all 256 bytes, assuming
344 * that tofold() is tolc() since fold case is not a concept in POSIX,
346 * Any code changing the locale (outside this file) should use
347 * POSIX::setlocale, which calls this function. Therefore this function
348 * should be called directly only from this file and from
349 * POSIX::setlocale() */
354 PERL_ARGS_ASSERT_NEW_CTYPE;
356 /* We will replace any bad locale warning with 1) nothing if the new one is
357 * ok; or 2) a new warning for the bad new locale */
358 if (PL_warn_locale) {
359 SvREFCNT_dec_NN(PL_warn_locale);
360 PL_warn_locale = NULL;
363 PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
365 /* A UTF-8 locale gets standard rules. But note that code still has to
366 * handle this specially because of the three problematic code points */
367 if (PL_in_utf8_CTYPE_locale) {
368 Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
371 /* Assume enough space for every character being bad. 4 spaces each
372 * for the 94 printable characters that are output like "'x' "; and 5
373 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
375 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ];
377 /* Don't check for problems if we are suppressing the warnings */
378 bool check_for_problems = ckWARN_d(WARN_LOCALE)
379 || UNLIKELY(DEBUG_L_TEST);
380 bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
382 unsigned int bad_count = 0; /* Count of bad characters */
384 for (i = 0; i < 256; i++) {
385 if (isUPPER_LC((U8) i))
386 PL_fold_locale[i] = (U8) toLOWER_LC((U8) i);
387 else if (isLOWER_LC((U8) i))
388 PL_fold_locale[i] = (U8) toUPPER_LC((U8) i);
390 PL_fold_locale[i] = (U8) i;
392 /* If checking for locale problems, see if the native ASCII-range
393 * printables plus \n and \t are in their expected categories in
394 * the new locale. If not, this could mean big trouble, upending
395 * Perl's and most programs' assumptions, like having a
396 * metacharacter with special meaning become a \w. Fortunately,
397 * it's very rare to find locales that aren't supersets of ASCII
398 * nowadays. It isn't a problem for most controls to be changed
399 * into something else; we check only \n and \t, though perhaps \r
400 * could be an issue as well. */
401 if ( check_for_problems
402 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
404 if (( isALPHANUMERIC_A(i) && ! isALPHANUMERIC_LC(i))
405 || (isPUNCT_A(i) && ! isPUNCT_LC(i))
406 || (isBLANK_A(i) && ! isBLANK_LC(i))
407 || (i == '\n' && ! isCNTRL_LC(i)))
409 if (bad_count) { /* Separate multiple entries with a
411 bad_chars_list[bad_count++] = ' ';
413 bad_chars_list[bad_count++] = '\'';
415 bad_chars_list[bad_count++] = (char) i;
418 bad_chars_list[bad_count++] = '\\';
420 bad_chars_list[bad_count++] = 'n';
424 bad_chars_list[bad_count++] = 't';
427 bad_chars_list[bad_count++] = '\'';
428 bad_chars_list[bad_count] = '\0';
435 /* We only handle single-byte locales (outside of UTF-8 ones; so if
436 * this locale requires more than one byte, there are going to be
438 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
439 "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
440 __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
442 if (check_for_problems && MB_CUR_MAX > 1
444 /* Some platforms return MB_CUR_MAX > 1 for even the "C"
445 * locale. Just assume that the implementation for them (plus
446 * for POSIX) is correct and the > 1 value is spurious. (Since
447 * these are specially handled to never be considered UTF-8
448 * locales, as long as this is the only problem, everything
449 * should work fine */
450 && strNE(newctype, "C") && strNE(newctype, "POSIX"))
452 multi_byte_locale = TRUE;
457 if (bad_count || multi_byte_locale) {
458 PL_warn_locale = Perl_newSVpvf(aTHX_
459 "Locale '%s' may not work well.%s%s%s\n",
462 ? " Some characters in it are not recognized by"
466 ? "\nThe following characters (and maybe others)"
467 " may not have the same meaning as the Perl"
468 " program expects:\n"
474 /* If we are actually in the scope of the locale or are debugging,
475 * output the message now. If not in that scope, we save the
476 * message to be output at the first operation using this locale,
477 * if that actually happens. Most programs don't use locales, so
478 * they are immune to bad ones. */
479 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
481 /* We have to save 'newctype' because the setlocale() just
482 * below may destroy it. The next setlocale() further down
483 * should restore it properly so that the intermediate change
484 * here is transparent to this function's caller */
485 const char * const badlocale = savepv(newctype);
487 do_setlocale_c(LC_CTYPE, "C");
489 /* The '0' below suppresses a bogus gcc compiler warning */
490 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
492 do_setlocale_c(LC_CTYPE, badlocale);
495 if (IN_LC(LC_CTYPE)) {
496 SvREFCNT_dec_NN(PL_warn_locale);
497 PL_warn_locale = NULL;
503 #endif /* USE_LOCALE_CTYPE */
508 Perl__warn_problematic_locale()
511 #ifdef USE_LOCALE_CTYPE
515 /* Internal-to-core function that outputs the message in PL_warn_locale,
516 * and then NULLS it. Should be called only through the macro
517 * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */
519 if (PL_warn_locale) {
520 /*GCC_DIAG_IGNORE(-Wformat-security); Didn't work */
521 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
522 SvPVX(PL_warn_locale),
523 0 /* dummy to avoid compiler warning */ );
524 /* GCC_DIAG_RESTORE; */
525 SvREFCNT_dec_NN(PL_warn_locale);
526 PL_warn_locale = NULL;
534 S_new_collate(pTHX_ const char *newcoll)
537 #ifndef USE_LOCALE_COLLATE
539 PERL_UNUSED_ARG(newcoll);
544 /* Called after all libc setlocale() calls affecting LC_COLLATE, to tell
545 * core Perl this and that 'newcoll' is the name of the new locale.
547 * The design of locale collation is that every locale change is given an
548 * index 'PL_collation_ix'. The first time a string particpates in an
549 * operation that requires collation while locale collation is active, it
550 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
551 * magic includes the collation index, and the transformation of the string
552 * by strxfrm(), q.v. That transformation is used when doing comparisons,
553 * instead of the string itself. If a string changes, the magic is
554 * cleared. The next time the locale changes, the index is incremented,
555 * and so we know during a comparison that the transformation is not
556 * necessarily still valid, and so is recomputed. Note that if the locale
557 * changes enough times, the index could wrap (a U32), and it is possible
558 * that a transformation would improperly be considered valid, leading to
562 if (PL_collation_name) {
564 Safefree(PL_collation_name);
565 PL_collation_name = NULL;
567 PL_collation_standard = TRUE;
568 is_standard_collation:
569 PL_collxfrm_base = 0;
570 PL_collxfrm_mult = 2;
571 PL_in_utf8_COLLATE_locale = FALSE;
572 PL_strxfrm_NUL_replacement = '\0';
573 PL_strxfrm_max_cp = 0;
577 /* If this is not the same locale as currently, set the new one up */
578 if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
580 Safefree(PL_collation_name);
581 PL_collation_name = stdize_locale(savepv(newcoll));
582 PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
583 if (PL_collation_standard) {
584 goto is_standard_collation;
587 PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
588 PL_strxfrm_NUL_replacement = '\0';
589 PL_strxfrm_max_cp = 0;
591 /* A locale collation definition includes primary, secondary, tertiary,
592 * etc. weights for each character. To sort, the primary weights are
593 * used, and only if they compare equal, then the secondary weights are
594 * used, and only if they compare equal, then the tertiary, etc.
596 * strxfrm() works by taking the input string, say ABC, and creating an
597 * output transformed string consisting of first the primary weights,
598 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
599 * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
600 * may not have weights at every level. In our example, let's say B
601 * doesn't have a tertiary weight, and A doesn't have a secondary
602 * weight. The constructed string is then going to be
603 * A¹B¹C¹ B²C² A³C³ ....
604 * This has the desired effect that strcmp() will look at the secondary
605 * or tertiary weights only if the strings compare equal at all higher
606 * priority weights. The spaces shown here, like in
608 * are not just for readability. In the general case, these must
609 * actually be bytes, which we will call here 'separator weights'; and
610 * they must be smaller than any other weight value, but since these
611 * are C strings, only the terminating one can be a NUL (some
612 * implementations may include a non-NUL separator weight just before
613 * the NUL). Implementations tend to reserve 01 for the separator
614 * weights. They are needed so that a shorter string's secondary
615 * weights won't be misconstrued as primary weights of a longer string,
616 * etc. By making them smaller than any other weight, the shorter
617 * string will sort first. (Actually, if all secondary weights are
618 * smaller than all primary ones, there is no need for a separator
619 * weight between those two levels, etc.)
621 * The length of the transformed string is roughly a linear function of
622 * the input string. It's not exactly linear because some characters
623 * don't have weights at all levels. When we call strxfrm() we have to
624 * allocate some memory to hold the transformed string. The
625 * calculations below try to find coefficients 'm' and 'b' for this
626 * locale so that m*x + b equals how much space we need, given the size
627 * of the input string in 'x'. If we calculate too small, we increase
628 * the size as needed, and call strxfrm() again, but it is better to
629 * get it right the first time to avoid wasted expensive string
630 * transformations. */
633 /* We use the string below to find how long the tranformation of it
634 * is. Almost all locales are supersets of ASCII, or at least the
635 * ASCII letters. We use all of them, half upper half lower,
636 * because if we used fewer, we might hit just the ones that are
637 * outliers in a particular locale. Most of the strings being
638 * collated will contain a preponderance of letters, and even if
639 * they are above-ASCII, they are likely to have the same number of
640 * weight levels as the ASCII ones. It turns out that digits tend
641 * to have fewer levels, and some punctuation has more, but those
642 * are relatively sparse in text, and khw believes this gives a
643 * reasonable result, but it could be changed if experience so
645 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
646 char * x_longer; /* Transformed 'longer' */
647 Size_t x_len_longer; /* Length of 'x_longer' */
649 char * x_shorter; /* We also transform a substring of 'longer' */
650 Size_t x_len_shorter;
652 /* _mem_collxfrm() is used get the transformation (though here we
653 * are interested only in its length). It is used because it has
654 * the intelligence to handle all cases, but to work, it needs some
655 * values of 'm' and 'b' to get it started. For the purposes of
656 * this calculation we use a very conservative estimate of 'm' and
657 * 'b'. This assumes a weight can be multiple bytes, enough to
658 * hold any UV on the platform, and there are 5 levels, 4 weight
659 * bytes, and a trailing NUL. */
660 PL_collxfrm_base = 5;
661 PL_collxfrm_mult = 5 * sizeof(UV);
663 /* Find out how long the transformation really is */
664 x_longer = _mem_collxfrm(longer,
668 /* We avoid converting to UTF-8 in the
669 * called function by telling it the
670 * string is in UTF-8 if the locale is a
671 * UTF-8 one. Since the string passed
672 * here is invariant under UTF-8, we can
673 * claim it's UTF-8 even though it isn't.
675 PL_in_utf8_COLLATE_locale);
678 /* Find out how long the transformation of a substring of 'longer'
679 * is. Together the lengths of these transformations are
680 * sufficient to calculate 'm' and 'b'. The substring is all of
681 * 'longer' except the first character. This minimizes the chances
682 * of being swayed by outliers */
683 x_shorter = _mem_collxfrm(longer + 1,
686 PL_in_utf8_COLLATE_locale);
689 /* If the results are nonsensical for this simple test, the whole
690 * locale definition is suspect. Mark it so that locale collation
691 * is not active at all for it. XXX Should we warn? */
692 if ( x_len_shorter == 0
694 || x_len_shorter >= x_len_longer)
696 PL_collxfrm_mult = 0;
697 PL_collxfrm_base = 0;
700 SSize_t base; /* Temporary */
702 /* We have both: m * strlen(longer) + b = x_len_longer
703 * m * strlen(shorter) + b = x_len_shorter;
704 * subtracting yields:
705 * m * (strlen(longer) - strlen(shorter))
706 * = x_len_longer - x_len_shorter
707 * But we have set things up so that 'shorter' is 1 byte smaller
708 * than 'longer'. Hence:
709 * m = x_len_longer - x_len_shorter
711 * But if something went wrong, make sure the multiplier is at
714 if (x_len_longer > x_len_shorter) {
715 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
718 PL_collxfrm_mult = 1;
723 * but in case something has gone wrong, make sure it is
725 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
730 /* Add 1 for the trailing NUL */
731 PL_collxfrm_base = base + 1;
736 if (DEBUG_L_TEST || debug_initialization) {
737 PerlIO_printf(Perl_debug_log,
738 "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
740 " collate multipler=%zu, collate base=%zu\n",
742 PL_in_utf8_COLLATE_locale,
743 x_len_shorter, x_len_longer,
744 PL_collxfrm_mult, PL_collxfrm_base);
751 #endif /* USE_LOCALE_COLLATE */
758 S_win32_setlocale(pTHX_ int category, const char* locale)
760 /* This, for Windows, emulates POSIX setlocale() behavior. There is no
761 * difference between the two unless the input locale is "", which normally
762 * means on Windows to get the machine default, which is set via the
763 * computer's "Regional and Language Options" (or its current equivalent).
764 * In POSIX, it instead means to find the locale from the user's
765 * environment. This routine changes the Windows behavior to first look in
766 * the environment, and, if anything is found, use that instead of going to
767 * the machine default. If there is no environment override, the machine
768 * default is used, by calling the real setlocale() with "".
770 * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
771 * use the particular category's variable if set; otherwise to use the LANG
774 bool override_LC_ALL = FALSE;
777 if (locale && strEQ(locale, "")) {
781 locale = PerlEnv_getenv("LC_ALL");
789 override_LC_ALL = TRUE;
790 break; /* We already know its variable isn't set */
793 # ifdef USE_LOCALE_TIME
796 locale = PerlEnv_getenv("LC_TIME");
800 # ifdef USE_LOCALE_CTYPE
803 locale = PerlEnv_getenv("LC_CTYPE");
807 # ifdef USE_LOCALE_COLLATE
810 locale = PerlEnv_getenv("LC_COLLATE");
814 # ifdef USE_LOCALE_MONETARY
817 locale = PerlEnv_getenv("LC_MONETARY");
821 # ifdef USE_LOCALE_NUMERIC
824 locale = PerlEnv_getenv("LC_NUMERIC");
828 # ifdef USE_LOCALE_MESSAGES
831 locale = PerlEnv_getenv("LC_MESSAGES");
837 /* This is a category, like PAPER_SIZE that we don't
838 * know about; and so can't provide a wrapper. */
842 locale = PerlEnv_getenv("LANG");
856 result = setlocale(category, locale);
857 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
858 setlocale_debug_string(category, locale, result)));
860 if (! override_LC_ALL) {
864 /* Here the input category was LC_ALL, and we have set it to what is in the
865 * LANG variable or the system default if there is no LANG. But these have
866 * lower priority than the other LC_foo variables, so override it for each
867 * one that is set. (If they are set to "", it means to use the same thing
868 * we just set LC_ALL to, so can skip) */
870 # ifdef USE_LOCALE_TIME
872 result = PerlEnv_getenv("LC_TIME");
873 if (result && strNE(result, "")) {
874 setlocale(LC_TIME, result);
875 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
877 setlocale_debug_string(LC_TIME, result, "not captured")));
881 # ifdef USE_LOCALE_CTYPE
883 result = PerlEnv_getenv("LC_CTYPE");
884 if (result && strNE(result, "")) {
885 setlocale(LC_CTYPE, result);
886 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
888 setlocale_debug_string(LC_CTYPE, result, "not captured")));
892 # ifdef USE_LOCALE_COLLATE
894 result = PerlEnv_getenv("LC_COLLATE");
895 if (result && strNE(result, "")) {
896 setlocale(LC_COLLATE, result);
897 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
899 setlocale_debug_string(LC_COLLATE, result, "not captured")));
903 # ifdef USE_LOCALE_MONETARY
905 result = PerlEnv_getenv("LC_MONETARY");
906 if (result && strNE(result, "")) {
907 setlocale(LC_MONETARY, result);
908 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
910 setlocale_debug_string(LC_MONETARY, result, "not captured")));
914 # ifdef USE_LOCALE_NUMERIC
916 result = PerlEnv_getenv("LC_NUMERIC");
917 if (result && strNE(result, "")) {
918 setlocale(LC_NUMERIC, result);
919 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
921 setlocale_debug_string(LC_NUMERIC, result, "not captured")));
925 # ifdef USE_LOCALE_MESSAGES
927 result = PerlEnv_getenv("LC_MESSAGES");
928 if (result && strNE(result, "")) {
929 setlocale(LC_MESSAGES, result);
930 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
932 setlocale_debug_string(LC_MESSAGES, result, "not captured")));
937 result = setlocale(LC_ALL, NULL);
938 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
940 setlocale_debug_string(LC_ALL, NULL, result)));
948 Perl_setlocale(int category, const char * locale)
950 /* This wraps POSIX::setlocale() */
956 #ifdef USE_LOCALE_NUMERIC
958 /* A NULL locale means only query what the current one is. We
959 * have the LC_NUMERIC name saved, because we are normally switched
960 * into the C locale for it. Switch back so an LC_ALL query will yield
961 * the correct results; all other categories don't require special
963 if (locale == NULL) {
964 if (category == LC_NUMERIC) {
965 return savepv(PL_numeric_name);
970 else if (category == LC_ALL) {
971 SET_NUMERIC_UNDERLYING();
980 retval = do_setlocale_r(category, locale);
982 DEBUG_L(PerlIO_printf(Perl_debug_log,
983 "%s:%d: %s\n", __FILE__, __LINE__,
984 setlocale_debug_string(category, locale, retval)));
986 /* Should never happen that a query would return an error, but be
987 * sure and reset to C locale */
989 SET_NUMERIC_STANDARD();
995 /* Save retval since subsequent setlocale() calls may overwrite it. */
996 retval = savepv(retval);
998 /* If locale == NULL, we are just querying the state, but may have switched
999 * to NUMERIC_UNDERLYING. Switch back before returning. */
1000 if (locale == NULL) {
1001 SET_NUMERIC_STANDARD();
1005 /* Now that have switched locales, we have to update our records to
1010 #ifdef USE_LOCALE_CTYPE
1017 #ifdef USE_LOCALE_COLLATE
1020 new_collate(retval);
1024 #ifdef USE_LOCALE_NUMERIC
1027 new_numeric(retval);
1035 /* LC_ALL updates all the things we care about. The values may not
1036 * be the same as 'retval', as the locale "" may have set things
1039 # ifdef USE_LOCALE_CTYPE
1041 newlocale = do_setlocale_c(LC_CTYPE, NULL);
1042 new_ctype(newlocale);
1044 # endif /* USE_LOCALE_CTYPE */
1045 # ifdef USE_LOCALE_COLLATE
1047 newlocale = do_setlocale_c(LC_COLLATE, NULL);
1048 new_collate(newlocale);
1051 # ifdef USE_LOCALE_NUMERIC
1053 newlocale = do_setlocale_c(LC_NUMERIC, NULL);
1054 new_numeric(newlocale);
1056 # endif /* USE_LOCALE_NUMERIC */
1068 PERL_STATIC_INLINE const char *
1069 S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset)
1071 /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size',
1072 * growing it if necessary */
1074 const Size_t string_size = strlen(string) + offset + 1;
1076 PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
1078 if (*buf_size == 0) {
1079 Newx(*buf, string_size, char);
1080 *buf_size = string_size;
1082 else if (string_size > *buf_size) {
1083 Renew(*buf, string_size, char);
1084 *buf_size = string_size;
1087 Copy(string, *buf + offset, string_size - offset, char);
1093 =head1 Locale-related functions and macros
1095 =for apidoc Perl_langinfo
1097 This is an (almost ª) drop-in replacement for the system C<L<nl_langinfo(3)>>,
1098 taking the same C<item> parameter values, and returning the same information.
1099 But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
1100 of Perl's locale handling from your code, and can be used on systems that lack
1101 a native C<nl_langinfo>.
1109 It delivers the correct results for the C<RADIXCHAR> and C<THOUSESEP> items,
1110 without you having to write extra code. The reason for the extra code would be
1111 because these are from the C<LC_NUMERIC> locale category, which is normally
1112 kept set to the C locale by Perl, no matter what the underlying locale is
1113 supposed to be, and so to get the expected results, you have to temporarily
1114 toggle into the underlying locale, and later toggle back. (You could use
1115 plain C<nl_langinfo> and C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this
1116 but then you wouldn't get the other advantages of C<Perl_langinfo()>; not
1117 keeping C<LC_NUMERIC> in the C locale would break a lot of CPAN, which is
1118 expecting the radix (decimal point) character to be a dot.)
1122 Depending on C<item>, it works on systems that don't have C<nl_langinfo>, hence
1123 makes your code more portable. Of the fifty-some possible items specified by
1124 the POSIX 2008 standard,
1125 L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
1126 only two are completely unimplemented. It uses various techniques to recover
1127 the other items, including calling C<L<localeconv(3)>>, and C<L<strftime(3)>>,
1128 both of which are specified in C89, so should be always be available. Later
1129 C<strftime()> versions have additional capabilities; C<""> is returned for
1130 those not available on your system.
1132 The details for those items which may differ from what this emulation returns
1133 and what a native C<nl_langinfo()> would return are:
1141 Unimplemented, so returns C<"">.
1147 Only the values for English are returned. Earlier POSIX standards also
1148 specified C<YESSTR> and C<NOSTR>, but these have been removed from POSIX 2008,
1149 and aren't supported by C<Perl_langinfo>.
1153 Always evaluates to C<%x>, the locale's appropriate date representation.
1157 Always evaluates to C<%X>, the locale's appropriate time representation.
1161 Always evaluates to C<%c>, the locale's appropriate date and time
1166 The return may be incorrect for those rare locales where the currency symbol
1167 replaces the radix character.
1168 Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1169 to work differently.
1173 Currently this gives the same results as Linux does.
1174 Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1175 to work differently.
1181 =item C<ERA_D_T_FMT>
1185 These are derived by using C<strftime()>, and not all versions of that function
1186 know about them. C<""> is returned for these on such systems.
1190 When using C<Perl_langinfo> on systems that don't have a native
1191 C<nl_langinfo()>, you must
1193 #include "perl_langinfo.h"
1195 before the C<perl.h> C<#include>. You can replace your C<langinfo.h>
1196 C<#include> with this one. (Doing it this way keeps out the symbols that plain
1197 C<langinfo.h> imports into the namespace for code that doesn't need it.)
1199 You also should not use the bare C<langinfo.h> item names, but should preface
1200 them with C<PERL_>, so use C<PERL_RADIXCHAR> instead of plain C<RADIXCHAR>.
1201 The C<PERL_I<foo>> versions will also work for this function on systems that do
1202 have a native C<nl_langinfo>.
1206 It is thread-friendly, returning its result in a buffer that won't be
1207 overwritten by another thread, so you don't have to code for that possibility.
1208 The buffer can be overwritten by the next call to C<nl_langinfo> or
1209 C<Perl_langinfo> in the same thread.
1213 ª It returns S<C<const char *>>, whereas plain C<nl_langinfo()> returns S<C<char
1214 *>>, but you are (only by documentation) forbidden to write into the buffer.
1215 By declaring this C<const>, the compiler enforces this restriction. The extra
1216 C<const> is why this isn't an unequivocal drop-in replacement for
1221 The original impetus for C<Perl_langinfo()> was so that code that needs to
1222 find out the current currency symbol, floating point radix character, or digit
1223 grouping separator can use, on all systems, the simpler and more
1224 thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
1225 pain to make thread-friendly. For other fields returned by C<localeconv>, it
1226 is better to use the methods given in L<perlcall> to call
1227 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
1234 #ifdef HAS_NL_LANGINFO
1235 Perl_langinfo(const nl_item item)
1237 Perl_langinfo(const int item)
1240 return my_nl_langinfo(item, TRUE);
1244 #ifdef HAS_NL_LANGINFO
1245 S_my_nl_langinfo(const nl_item item, bool toggle)
1247 S_my_nl_langinfo(const int item, bool toggle)
1252 #if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
1253 #if ! defined(HAS_POSIX_2008_LOCALE)
1255 /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
1256 * for those items dependent on it. This must be copied to a buffer before
1257 * switching back, as some systems destroy the buffer when setlocale() is
1263 if (item == PERL_RADIXCHAR || item == PERL_THOUSEP) {
1264 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
1271 save_to_buffer(nl_langinfo(item), &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1274 do_setlocale_c(LC_NUMERIC, "C");
1279 return PL_langinfo_buf;
1281 # else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */
1283 bool do_free = FALSE;
1284 locale_t cur = uselocale((locale_t) 0);
1286 if (cur == LC_GLOBAL_LOCALE) {
1287 cur = duplocale(LC_GLOBAL_LOCALE);
1292 && (item == PERL_RADIXCHAR || item == PERL_THOUSEP))
1294 cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
1298 save_to_buffer(nl_langinfo_l(item, cur),
1299 &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1304 return PL_langinfo_buf;
1307 #else /* Below, emulate nl_langinfo as best we can */
1308 # ifdef HAS_LOCALECONV
1310 const struct lconv* lc;
1313 # ifdef HAS_STRFTIME
1316 bool return_format = FALSE; /* Return the %format, not the value */
1317 const char * format;
1321 /* We copy the results to a per-thread buffer, even if not multi-threaded.
1322 * This is in part to simplify this code, and partly because we need a
1323 * buffer anyway for strftime(), and partly because a call of localeconv()
1324 * could otherwise wipe out the buffer, and the programmer would not be
1325 * expecting this, as this is a nl_langinfo() substitute after all, so s/he
1326 * might be thinking their localeconv() is safe until another localeconv()
1331 const char * retval;
1333 /* These 2 are unimplemented */
1335 case PERL_ERA: /* For use with strftime() %E modifier */
1340 /* We use only an English set, since we don't know any more */
1341 case PERL_YESEXPR: return "^[+1yY]";
1342 case PERL_NOEXPR: return "^[-0nN]";
1344 # ifdef HAS_LOCALECONV
1351 if (! lc || ! lc->currency_symbol || strEQ("", lc->currency_symbol))
1357 /* Leave the first spot empty to be filled in below */
1358 save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
1359 &PL_langinfo_bufsize, 1);
1360 if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
1361 { /* khw couldn't figure out how the localedef specifications
1362 would show that the $ should replace the radix; this is
1363 just a guess as to how it might work.*/
1364 *PL_langinfo_buf = '.';
1366 else if (lc->p_cs_precedes) {
1367 *PL_langinfo_buf = '-';
1370 *PL_langinfo_buf = '+';
1376 case PERL_RADIXCHAR:
1382 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
1389 else switch (item) {
1390 case PERL_RADIXCHAR:
1391 if (! lc->decimal_point) {
1395 retval = lc->decimal_point;
1400 if (! lc->thousands_sep || strEQ("", lc->thousands_sep)) {
1404 retval = lc->thousands_sep;
1410 Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
1411 __FILE__, __LINE__, item);
1414 save_to_buffer(retval, &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1417 do_setlocale_c(LC_NUMERIC, "C");
1425 # ifdef HAS_STRFTIME
1427 /* These are defined by C89, so we assume that strftime supports them,
1428 * and so are returned unconditionally; they may not be what the locale
1429 * actually says, but should give good enough results for someone using
1430 * them as formats (as opposed to trying to parse them to figure out
1431 * what the locale says). The other format items are actually tested to
1432 * verify they work on the platform */
1433 case PERL_D_FMT: return "%x";
1434 case PERL_T_FMT: return "%X";
1435 case PERL_D_T_FMT: return "%c";
1437 /* These formats are only available in later strfmtime's */
1438 case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT:
1439 case PERL_T_FMT_AMPM:
1441 /* The rest can be gotten from most versions of strftime(). */
1442 case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3:
1443 case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6:
1445 case PERL_ALT_DIGITS:
1446 case PERL_AM_STR: case PERL_PM_STR:
1447 case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3:
1448 case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6:
1449 case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9:
1450 case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12:
1451 case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4:
1452 case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7:
1453 case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4:
1454 case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8:
1455 case PERL_MON_9: case PERL_MON_10: case PERL_MON_11: case PERL_MON_12:
1459 init_tm(&tm); /* Precaution against core dumps */
1463 tm.tm_year = 2017 - 1900;
1469 Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
1470 __FILE__, __LINE__, item);
1471 NOT_REACHED; /* NOTREACHED */
1473 case PERL_PM_STR: tm.tm_hour = 18;
1478 case PERL_ABDAY_7: tm.tm_wday++;
1479 case PERL_ABDAY_6: tm.tm_wday++;
1480 case PERL_ABDAY_5: tm.tm_wday++;
1481 case PERL_ABDAY_4: tm.tm_wday++;
1482 case PERL_ABDAY_3: tm.tm_wday++;
1483 case PERL_ABDAY_2: tm.tm_wday++;
1488 case PERL_DAY_7: tm.tm_wday++;
1489 case PERL_DAY_6: tm.tm_wday++;
1490 case PERL_DAY_5: tm.tm_wday++;
1491 case PERL_DAY_4: tm.tm_wday++;
1492 case PERL_DAY_3: tm.tm_wday++;
1493 case PERL_DAY_2: tm.tm_wday++;
1498 case PERL_ABMON_12: tm.tm_mon++;
1499 case PERL_ABMON_11: tm.tm_mon++;
1500 case PERL_ABMON_10: tm.tm_mon++;
1501 case PERL_ABMON_9: tm.tm_mon++;
1502 case PERL_ABMON_8: tm.tm_mon++;
1503 case PERL_ABMON_7: tm.tm_mon++;
1504 case PERL_ABMON_6: tm.tm_mon++;
1505 case PERL_ABMON_5: tm.tm_mon++;
1506 case PERL_ABMON_4: tm.tm_mon++;
1507 case PERL_ABMON_3: tm.tm_mon++;
1508 case PERL_ABMON_2: tm.tm_mon++;
1513 case PERL_MON_12: tm.tm_mon++;
1514 case PERL_MON_11: tm.tm_mon++;
1515 case PERL_MON_10: tm.tm_mon++;
1516 case PERL_MON_9: tm.tm_mon++;
1517 case PERL_MON_8: tm.tm_mon++;
1518 case PERL_MON_7: tm.tm_mon++;
1519 case PERL_MON_6: tm.tm_mon++;
1520 case PERL_MON_5: tm.tm_mon++;
1521 case PERL_MON_4: tm.tm_mon++;
1522 case PERL_MON_3: tm.tm_mon++;
1523 case PERL_MON_2: tm.tm_mon++;
1528 case PERL_T_FMT_AMPM:
1530 return_format = TRUE;
1533 case PERL_ERA_D_FMT:
1535 return_format = TRUE;
1538 case PERL_ERA_T_FMT:
1540 return_format = TRUE;
1543 case PERL_ERA_D_T_FMT:
1545 return_format = TRUE;
1548 case PERL_ALT_DIGITS:
1550 format = "%Ow"; /* Find the alternate digit for 0 */
1554 /* We can't use my_strftime() because it doesn't look at tm_wday */
1555 while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
1558 /* A zero return means one of:
1559 * a) there wasn't enough space in PL_langinfo_buf
1560 * b) the format, like a plain %p, returns empty
1561 * c) it was an illegal format, though some implementations of
1562 * strftime will just return the illegal format as a plain
1563 * character sequence.
1565 * To quickly test for case 'b)', try again but precede the
1566 * format with a plain character. If that result is still
1567 * empty, the problem is either 'a)' or 'c)' */
1569 Size_t format_size = strlen(format) + 1;
1570 Size_t mod_size = format_size + 1;
1574 Newx(mod_format, mod_size, char);
1575 Newx(temp_result, PL_langinfo_bufsize, char);
1577 my_strlcpy(mod_format + 1, format, mod_size);
1578 len = strftime(temp_result,
1579 PL_langinfo_bufsize,
1581 Safefree(mod_format);
1582 Safefree(temp_result);
1584 /* If 'len' is non-zero, it means that we had a case like %p
1585 * which means the current locale doesn't use a.m. or p.m., and
1589 /* Here, still didn't work. If we get well beyond a
1590 * reasonable size, bail out to prevent an infinite loop. */
1592 if (PL_langinfo_bufsize > 100 * format_size) {
1593 *PL_langinfo_buf = '\0';
1595 else { /* Double the buffer size to retry; Add 1 in case
1596 original was 0, so we aren't stuck at 0. */
1597 PL_langinfo_bufsize *= 2;
1598 PL_langinfo_bufsize++;
1599 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
1607 /* Here, we got a result.
1609 * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
1610 * alternate format for wday 0. If the value is the same as the
1611 * normal 0, there isn't an alternate, so clear the buffer. */
1612 if ( item == PERL_ALT_DIGITS
1613 && strEQ(PL_langinfo_buf, "0"))
1615 *PL_langinfo_buf = '\0';
1618 /* ALT_DIGITS is problematic. Experiments on it showed that
1619 * strftime() did not always work properly when going from alt-9 to
1620 * alt-10. Only a few locales have this item defined, and in all
1621 * of them on Linux that khw was able to find, nl_langinfo() merely
1622 * returned the alt-0 character, possibly doubled. Most Unicode
1623 * digits are in blocks of 10 consecutive code points, so that is
1624 * sufficient information for those scripts, as we can infer alt-1,
1625 * alt-2, .... But for a Japanese locale, a CJK ideographic 0 is
1626 * returned, and the CJK digits are not in code point order, so you
1627 * can't really infer anything. The localedef for this locale did
1628 * specify the succeeding digits, so that strftime() works properly
1629 * on them, without needing to infer anything. But the
1630 * nl_langinfo() return did not give sufficient information for the
1631 * caller to understand what's going on. So until there is
1632 * evidence that it should work differently, this returns the alt-0
1633 * string for ALT_DIGITS.
1635 * wday was chosen because its range is all a single digit. Things
1636 * like tm_sec have two digits as the minimum: '00' */
1640 /* If to return the format, not the value, overwrite the buffer
1641 * with it. But some strftime()s will keep the original format if
1642 * illegal, so change those to "" */
1643 if (return_format) {
1644 if (strEQ(PL_langinfo_buf, format)) {
1645 *PL_langinfo_buf = '\0';
1648 save_to_buffer(format, &PL_langinfo_buf,
1649 &PL_langinfo_bufsize, 0);
1659 return PL_langinfo_buf;
1666 * Initialize locale awareness.
1669 Perl_init_i18nl10n(pTHX_ int printwarn)
1673 * 0 if not to output warning when setup locale is bad
1674 * 1 if to output warning based on value of PERL_BADLANG
1675 * >1 if to output regardless of PERL_BADLANG
1678 * 1 = set ok or not applicable,
1679 * 0 = fallback to a locale of lower priority
1680 * -1 = fallback to all locales failed, not even to the C locale
1682 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
1683 * set, debugging information is output.
1685 * This looks more complicated than it is, mainly due to the #ifdefs.
1687 * We try to set LC_ALL to the value determined by the environment. If
1688 * there is no LC_ALL on this platform, we try the individual categories we
1689 * know about. If this works, we are done.
1691 * But if it doesn't work, we have to do something else. We search the
1692 * environment variables ourselves instead of relying on the system to do
1693 * it. We look at, in order, LC_ALL, LANG, a system default locale (if we
1694 * think there is one), and the ultimate fallback "C". This is all done in
1695 * the same loop as above to avoid duplicating code, but it makes things
1696 * more complex. The 'trial_locales' array is initialized with just one
1697 * element; it causes the behavior described in the paragraph above this to
1698 * happen. If that fails, we add elements to 'trial_locales', and do extra
1699 * loop iterations to cause the behavior described in this paragraph.
1701 * On Ultrix, the locale MUST come from the environment, so there is
1702 * preliminary code to set it. I (khw) am not sure that it is necessary,
1703 * and that this couldn't be folded into the loop, but barring any real
1704 * platforms to test on, it's staying as-is
1706 * A slight complication is that in embedded Perls, the locale may already
1707 * be set-up, and we don't want to get it from the normal environment
1708 * variables. This is handled by having a special environment variable
1709 * indicate we're in this situation. We simply set setlocale's 2nd
1710 * parameter to be a NULL instead of "". That indicates to setlocale that
1711 * it is not to change anything, but to return the current value,
1712 * effectively initializing perl's db to what the locale already is.
1714 * We play the same trick with NULL if a LC_ALL succeeds. We call
1715 * setlocale() on the individual categores with NULL to get their existing
1716 * values for our db, instead of trying to change them.
1723 PERL_UNUSED_ARG(printwarn);
1725 #else /* USE_LOCALE */
1726 # ifdef USE_LOCALE_CTYPE
1728 char *curctype = NULL;
1730 # endif /* USE_LOCALE_CTYPE */
1731 # ifdef USE_LOCALE_COLLATE
1733 char *curcoll = NULL;
1735 # endif /* USE_LOCALE_COLLATE */
1736 # ifdef USE_LOCALE_NUMERIC
1738 char *curnum = NULL;
1740 # endif /* USE_LOCALE_NUMERIC */
1743 const char * const language = savepv(PerlEnv_getenv("LANGUAGE"));
1747 /* NULL uses the existing already set up locale */
1748 const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
1751 const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */
1752 unsigned int trial_locales_count;
1753 const char * const lc_all = savepv(PerlEnv_getenv("LC_ALL"));
1754 const char * const lang = savepv(PerlEnv_getenv("LANG"));
1755 bool setlocale_failure = FALSE;
1758 /* A later getenv() could zap this, so only use here */
1759 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
1761 const bool locwarn = (printwarn > 1
1763 && (! bad_lang_use_once
1765 /* disallow with "" or "0" */
1767 && strNE("0", bad_lang_use_once)))));
1769 char * sl_result; /* return from setlocale() */
1770 char * locale_param;
1774 /* In some systems you can find out the system default locale
1775 * and use that as the fallback locale. */
1776 # define SYSTEM_DEFAULT_LOCALE
1778 # ifdef SYSTEM_DEFAULT_LOCALE
1780 const char *system_default_locale = NULL;
1785 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
1787 # define DEBUG_LOCALE_INIT(category, locale, result) \
1789 if (debug_initialization) { \
1790 PerlIO_printf(Perl_debug_log, \
1792 __FILE__, __LINE__, \
1793 setlocale_debug_string(category, \
1800 # define DEBUG_LOCALE_INIT(a,b,c)
1803 # ifndef LOCALE_ENVIRON_REQUIRED
1805 PERL_UNUSED_VAR(done);
1806 PERL_UNUSED_VAR(locale_param);
1811 * Ultrix setlocale(..., "") fails if there are no environment
1812 * variables from which to get a locale name.
1818 sl_result = do_setlocale_c(LC_ALL, setlocale_init);
1819 DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result);
1823 setlocale_failure = TRUE;
1825 if (! setlocale_failure) {
1827 # ifdef USE_LOCALE_CTYPE
1829 locale_param = (! done && (lang || PerlEnv_getenv("LC_CTYPE")))
1832 curctype = do_setlocale_c(LC_CTYPE, locale_param);
1833 DEBUG_LOCALE_INIT(LC_CTYPE, locale_param, sl_result);
1835 setlocale_failure = TRUE;
1837 curctype = savepv(curctype);
1839 # endif /* USE_LOCALE_CTYPE */
1840 # ifdef USE_LOCALE_COLLATE
1842 locale_param = (! done && (lang || PerlEnv_getenv("LC_COLLATE")))
1845 curcoll = do_setlocale_c(LC_COLLATE, locale_param);
1846 DEBUG_LOCALE_INIT(LC_COLLATE, locale_param, sl_result);
1848 setlocale_failure = TRUE;
1850 curcoll = savepv(curcoll);
1852 # endif /* USE_LOCALE_COLLATE */
1853 # ifdef USE_LOCALE_NUMERIC
1855 locale_param = (! done && (lang || PerlEnv_getenv("LC_NUMERIC")))
1858 curnum = do_setlocale_c(LC_NUMERIC, locale_param);
1859 DEBUG_LOCALE_INIT(LC_NUMERIC, locale_param, sl_result);
1861 setlocale_failure = TRUE;
1863 curnum = savepv(curnum);
1865 # endif /* USE_LOCALE_NUMERIC */
1866 # ifdef USE_LOCALE_MESSAGES
1868 locale_param = (! done && (lang || PerlEnv_getenv("LC_MESSAGES")))
1871 sl_result = do_setlocale_c(LC_MESSAGES, locale_param);
1872 DEBUG_LOCALE_INIT(LC_MESSAGES, locale_param, sl_result);
1874 setlocale_failure = TRUE;
1877 # endif /* USE_LOCALE_MESSAGES */
1878 # ifdef USE_LOCALE_MONETARY
1880 locale_param = (! done && (lang || PerlEnv_getenv("LC_MONETARY")))
1883 sl_result = do_setlocale_c(LC_MONETARY, locale_param);
1884 DEBUG_LOCALE_INIT(LC_MONETARY, locale_param, sl_result);
1886 setlocale_failure = TRUE;
1889 # endif /* USE_LOCALE_MONETARY */
1893 # endif /* LC_ALL */
1894 # endif /* !LOCALE_ENVIRON_REQUIRED */
1896 /* We try each locale in the list until we get one that works, or exhaust
1897 * the list. Normally the loop is executed just once. But if setting the
1898 * locale fails, inside the loop we add fallback trials to the array and so
1899 * will execute the loop multiple times */
1900 trial_locales[0] = setlocale_init;
1901 trial_locales_count = 1;
1903 for (i= 0; i < trial_locales_count; i++) {
1904 const char * trial_locale = trial_locales[i];
1908 /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
1909 * when i==0, but I (khw) don't think that behavior makes much
1911 setlocale_failure = FALSE;
1913 # ifdef SYSTEM_DEFAULT_LOCALE
1916 /* On Windows machines, an entry of "" after the 0th means to use
1917 * the system default locale, which we now proceed to get. */
1918 if (strEQ(trial_locale, "")) {
1921 /* Note that this may change the locale, but we are going to do
1922 * that anyway just below */
1923 system_default_locale = do_setlocale_c(LC_ALL, "");
1924 DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
1926 /* Skip if invalid or if it's already on the list of locales to
1928 if (! system_default_locale) {
1929 goto next_iteration;
1931 for (j = 0; j < trial_locales_count; j++) {
1932 if (strEQ(system_default_locale, trial_locales[j])) {
1933 goto next_iteration;
1937 trial_locale = system_default_locale;
1940 # endif /* SYSTEM_DEFAULT_LOCALE */
1945 sl_result = do_setlocale_c(LC_ALL, trial_locale);
1946 DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result);
1948 setlocale_failure = TRUE;
1951 /* Since LC_ALL succeeded, it should have changed all the other
1952 * categories it can to its value; so we massage things so that the
1953 * setlocales below just return their category's current values.
1954 * This adequately handles the case in NetBSD where LC_COLLATE may
1955 * not be defined for a locale, and setting it individually will
1956 * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
1957 * the POSIX locale. */
1958 trial_locale = NULL;
1961 # endif /* LC_ALL */
1963 if (!setlocale_failure) {
1965 # ifdef USE_LOCALE_CTYPE
1968 curctype = do_setlocale_c(LC_CTYPE, trial_locale);
1969 DEBUG_LOCALE_INIT(LC_CTYPE, trial_locale, curctype);
1971 setlocale_failure = TRUE;
1973 curctype = savepv(curctype);
1975 # endif /* USE_LOCALE_CTYPE */
1976 # ifdef USE_LOCALE_COLLATE
1979 curcoll = do_setlocale_c(LC_COLLATE, trial_locale);
1980 DEBUG_LOCALE_INIT(LC_COLLATE, trial_locale, curcoll);
1982 setlocale_failure = TRUE;
1984 curcoll = savepv(curcoll);
1986 # endif /* USE_LOCALE_COLLATE */
1987 # ifdef USE_LOCALE_NUMERIC
1990 curnum = do_setlocale_c(LC_NUMERIC, trial_locale);
1991 DEBUG_LOCALE_INIT(LC_NUMERIC, trial_locale, curnum);
1993 setlocale_failure = TRUE;
1995 curnum = savepv(curnum);
1997 # endif /* USE_LOCALE_NUMERIC */
1998 # ifdef USE_LOCALE_MESSAGES
2000 sl_result = do_setlocale_c(LC_MESSAGES, trial_locale);
2001 DEBUG_LOCALE_INIT(LC_MESSAGES, trial_locale, sl_result);
2003 setlocale_failure = TRUE;
2005 # endif /* USE_LOCALE_MESSAGES */
2006 # ifdef USE_LOCALE_MONETARY
2008 sl_result = do_setlocale_c(LC_MONETARY, trial_locale);
2009 DEBUG_LOCALE_INIT(LC_MONETARY, trial_locale, sl_result);
2011 setlocale_failure = TRUE;
2013 # endif /* USE_LOCALE_MONETARY */
2015 if (! setlocale_failure) { /* Success */
2020 /* Here, something failed; will need to try a fallback. */
2026 if (locwarn) { /* Output failure info only on the first one */
2030 PerlIO_printf(Perl_error_log,
2031 "perl: warning: Setting locale failed.\n");
2033 # else /* !LC_ALL */
2035 PerlIO_printf(Perl_error_log,
2036 "perl: warning: Setting locale failed for the categories:\n\t");
2038 # ifdef USE_LOCALE_CTYPE
2041 PerlIO_printf(Perl_error_log, "LC_CTYPE ");
2043 # endif /* USE_LOCALE_CTYPE */
2044 # ifdef USE_LOCALE_COLLATE
2046 PerlIO_printf(Perl_error_log, "LC_COLLATE ");
2048 # endif /* USE_LOCALE_COLLATE */
2049 # ifdef USE_LOCALE_NUMERIC
2052 PerlIO_printf(Perl_error_log, "LC_NUMERIC ");
2054 # endif /* USE_LOCALE_NUMERIC */
2056 PerlIO_printf(Perl_error_log, "and possibly others\n");
2058 # endif /* LC_ALL */
2060 PerlIO_printf(Perl_error_log,
2061 "perl: warning: Please check that your locale settings:\n");
2065 PerlIO_printf(Perl_error_log,
2066 "\tLANGUAGE = %c%s%c,\n",
2067 language ? '"' : '(',
2068 language ? language : "unset",
2069 language ? '"' : ')');
2072 PerlIO_printf(Perl_error_log,
2073 "\tLC_ALL = %c%s%c,\n",
2075 lc_all ? lc_all : "unset",
2076 lc_all ? '"' : ')');
2078 # if defined(USE_ENVIRON_ARRAY)
2083 /* Look through the environment for any variables of the
2084 * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
2085 * already handled above. These are assumed to be locale
2086 * settings. Output them and their values. */
2087 for (e = environ; *e; e++) {
2088 const STRLEN prefix_len = sizeof("LC_") - 1;
2091 if ( strBEGINs(*e, "LC_")
2092 && ! strBEGINs(*e, "LC_ALL=")
2093 && (uppers_len = strspn(*e + prefix_len,
2094 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
2095 && ((*e)[prefix_len + uppers_len] == '='))
2097 PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
2098 (int) (prefix_len + uppers_len), *e,
2099 *e + prefix_len + uppers_len + 1);
2106 PerlIO_printf(Perl_error_log,
2107 "\t(possibly more locale environment variables)\n");
2111 PerlIO_printf(Perl_error_log,
2112 "\tLANG = %c%s%c\n",
2114 lang ? lang : "unset",
2117 PerlIO_printf(Perl_error_log,
2118 " are supported and installed on your system.\n");
2121 /* Calculate what fallback locales to try. We have avoided this
2122 * until we have to, because failure is quite unlikely. This will
2123 * usually change the upper bound of the loop we are in.
2125 * Since the system's default way of setting the locale has not
2126 * found one that works, We use Perl's defined ordering: LC_ALL,
2127 * LANG, and the C locale. We don't try the same locale twice, so
2128 * don't add to the list if already there. (On POSIX systems, the
2129 * LC_ALL element will likely be a repeat of the 0th element "",
2130 * but there's no harm done by doing it explicitly.
2132 * Note that this tries the LC_ALL environment variable even on
2133 * systems which have no LC_ALL locale setting. This may or may
2134 * not have been originally intentional, but there's no real need
2135 * to change the behavior. */
2137 for (j = 0; j < trial_locales_count; j++) {
2138 if (strEQ(lc_all, trial_locales[j])) {
2142 trial_locales[trial_locales_count++] = lc_all;
2147 for (j = 0; j < trial_locales_count; j++) {
2148 if (strEQ(lang, trial_locales[j])) {
2152 trial_locales[trial_locales_count++] = lang;
2156 # if defined(WIN32) && defined(LC_ALL)
2158 /* For Windows, we also try the system default locale before "C".
2159 * (If there exists a Windows without LC_ALL we skip this because
2160 * it gets too complicated. For those, the "C" is the next
2161 * fallback possibility). The "" is the same as the 0th element of
2162 * the array, but the code at the loop above knows to treat it
2163 * differently when not the 0th */
2164 trial_locales[trial_locales_count++] = "";
2168 for (j = 0; j < trial_locales_count; j++) {
2169 if (strEQ("C", trial_locales[j])) {
2173 trial_locales[trial_locales_count++] = "C";
2176 } /* end of first time through the loop */
2184 } /* end of looping through the trial locales */
2186 if (ok < 1) { /* If we tried to fallback */
2188 if (! setlocale_failure) { /* fallback succeeded */
2189 msg = "Falling back to";
2191 else { /* fallback failed */
2193 /* We dropped off the end of the loop, so have to decrement i to
2194 * get back to the value the last time through */
2198 msg = "Failed to fall back to";
2200 /* To continue, we should use whatever values we've got */
2202 # ifdef USE_LOCALE_CTYPE
2205 curctype = savepv(do_setlocale_c(LC_CTYPE, NULL));
2206 DEBUG_LOCALE_INIT(LC_CTYPE, NULL, curctype);
2208 # endif /* USE_LOCALE_CTYPE */
2209 # ifdef USE_LOCALE_COLLATE
2212 curcoll = savepv(do_setlocale_c(LC_COLLATE, NULL));
2213 DEBUG_LOCALE_INIT(LC_COLLATE, NULL, curcoll);
2215 # endif /* USE_LOCALE_COLLATE */
2216 # ifdef USE_LOCALE_NUMERIC
2219 curnum = savepv(do_setlocale_c(LC_NUMERIC, NULL));
2220 DEBUG_LOCALE_INIT(LC_NUMERIC, NULL, curnum);
2222 # endif /* USE_LOCALE_NUMERIC */
2227 const char * description;
2228 const char * name = "";
2229 if (strEQ(trial_locales[i], "C")) {
2230 description = "the standard locale";
2234 # ifdef SYSTEM_DEFAULT_LOCALE
2236 else if (strEQ(trial_locales[i], "")) {
2237 description = "the system default locale";
2238 if (system_default_locale) {
2239 name = system_default_locale;
2243 # endif /* SYSTEM_DEFAULT_LOCALE */
2246 description = "a fallback locale";
2247 name = trial_locales[i];
2249 if (name && strNE(name, "")) {
2250 PerlIO_printf(Perl_error_log,
2251 "perl: warning: %s %s (\"%s\").\n", msg, description, name);
2254 PerlIO_printf(Perl_error_log,
2255 "perl: warning: %s %s.\n", msg, description);
2258 } /* End of tried to fallback */
2260 # ifdef USE_LOCALE_CTYPE
2262 new_ctype(curctype);
2264 # endif /* USE_LOCALE_CTYPE */
2265 # ifdef USE_LOCALE_COLLATE
2267 new_collate(curcoll);
2269 # endif /* USE_LOCALE_COLLATE */
2270 # ifdef USE_LOCALE_NUMERIC
2272 new_numeric(curnum);
2274 # endif /* USE_LOCALE_NUMERIC */
2275 # if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
2277 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
2278 * locale is UTF-8. If PL_utf8locale and PL_unicode (set by -C or by
2279 * $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the
2280 * PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open
2282 PL_utf8locale = _is_cur_LC_category_utf8(LC_CTYPE);
2284 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
2285 This is an alternative to using the -C command line switch
2286 (the -C if present will override this). */
2288 const char *p = PerlEnv_getenv("PERL_UNICODE");
2289 PL_unicode = p ? parse_unicode_opts(&p) : 0;
2290 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
2295 # ifdef USE_LOCALE_CTYPE
2299 # endif /* USE_LOCALE_CTYPE */
2300 # ifdef USE_LOCALE_COLLATE
2304 # endif /* USE_LOCALE_COLLATE */
2305 # ifdef USE_LOCALE_NUMERIC
2309 # endif /* USE_LOCALE_NUMERIC */
2320 #endif /* USE_LOCALE */
2323 /* So won't continue to output stuff */
2324 DEBUG_INITIALIZATION_set(FALSE);
2331 #ifdef USE_LOCALE_COLLATE
2334 Perl__mem_collxfrm(pTHX_ const char *input_string,
2335 STRLEN len, /* Length of 'input_string' */
2336 STRLEN *xlen, /* Set to length of returned string
2337 (not including the collation index
2339 bool utf8 /* Is the input in UTF-8? */
2343 /* _mem_collxfrm() is a bit like strxfrm() but with two important
2344 * differences. First, it handles embedded NULs. Second, it allocates a bit
2345 * more memory than needed for the transformed data itself. The real
2346 * transformed data begins at offset COLLXFRM_HDR_LEN. *xlen is set to
2347 * the length of that, and doesn't include the collation index size.
2348 * Please see sv_collxfrm() to see how this is used. */
2350 #define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
2352 char * s = (char *) input_string;
2353 STRLEN s_strlen = strlen(input_string);
2355 STRLEN xAlloc; /* xalloc is a reserved word in VC */
2356 STRLEN length_in_chars;
2357 bool first_time = TRUE; /* Cleared after first loop iteration */
2359 PERL_ARGS_ASSERT__MEM_COLLXFRM;
2361 /* Must be NUL-terminated */
2362 assert(*(input_string + len) == '\0');
2364 /* If this locale has defective collation, skip */
2365 if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
2366 DEBUG_L(PerlIO_printf(Perl_debug_log,
2367 "_mem_collxfrm: locale's collation is defective\n"));
2371 /* Replace any embedded NULs with the control that sorts before any others.
2372 * This will give as good as possible results on strings that don't
2373 * otherwise contain that character, but otherwise there may be
2374 * less-than-perfect results with that character and NUL. This is
2375 * unavoidable unless we replace strxfrm with our own implementation. */
2376 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
2380 STRLEN sans_nuls_len;
2381 int try_non_controls;
2382 char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
2383 making sure 2nd byte is NUL.
2385 STRLEN this_replacement_len;
2387 /* If we don't know what non-NUL control character sorts lowest for
2388 * this locale, find it */
2389 if (PL_strxfrm_NUL_replacement == '\0') {
2391 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
2392 includes the collation index
2395 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
2397 /* Unlikely, but it may be that no control will work to replace
2398 * NUL, in which case we instead look for any character. Controls
2399 * are preferred because collation order is, in general, context
2400 * sensitive, with adjoining characters affecting the order, and
2401 * controls are less likely to have such interactions, allowing the
2402 * NUL-replacement to stand on its own. (Another way to look at it
2403 * is to imagine what would happen if the NUL were replaced by a
2404 * combining character; it wouldn't work out all that well.) */
2405 for (try_non_controls = 0;
2406 try_non_controls < 2;
2409 /* Look through all legal code points (NUL isn't) */
2410 for (j = 1; j < 256; j++) {
2411 char * x; /* j's xfrm plus collation index */
2412 STRLEN x_len; /* length of 'x' */
2413 STRLEN trial_len = 1;
2414 char cur_source[] = { '\0', '\0' };
2416 /* Skip non-controls the first time through the loop. The
2417 * controls in a UTF-8 locale are the L1 ones */
2418 if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
2425 /* Create a 1-char string of the current code point */
2426 cur_source[0] = (char) j;
2428 /* Then transform it */
2429 x = _mem_collxfrm(cur_source, trial_len, &x_len,
2430 0 /* The string is not in UTF-8 */);
2432 /* Ignore any character that didn't successfully transform.
2438 /* If this character's transformation is lower than
2439 * the current lowest, this one becomes the lowest */
2440 if ( cur_min_x == NULL
2441 || strLT(x + COLLXFRM_HDR_LEN,
2442 cur_min_x + COLLXFRM_HDR_LEN))
2444 PL_strxfrm_NUL_replacement = j;
2450 } /* end of loop through all 255 characters */
2452 /* Stop looking if found */
2457 /* Unlikely, but possible, if there aren't any controls that
2458 * work in the locale, repeat the loop, looking for any
2459 * character that works */
2460 DEBUG_L(PerlIO_printf(Perl_debug_log,
2461 "_mem_collxfrm: No control worked. Trying non-controls\n"));
2462 } /* End of loop to try first the controls, then any char */
2465 DEBUG_L(PerlIO_printf(Perl_debug_log,
2466 "_mem_collxfrm: Couldn't find any character to replace"
2467 " embedded NULs in locale %s with", PL_collation_name));
2471 DEBUG_L(PerlIO_printf(Perl_debug_log,
2472 "_mem_collxfrm: Replacing embedded NULs in locale %s with "
2473 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
2475 Safefree(cur_min_x);
2476 } /* End of determining the character that is to replace NULs */
2478 /* If the replacement is variant under UTF-8, it must match the
2479 * UTF8-ness as the original */
2480 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
2481 this_replacement_char[0] =
2482 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
2483 this_replacement_char[1] =
2484 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
2485 this_replacement_len = 2;
2488 this_replacement_char[0] = PL_strxfrm_NUL_replacement;
2489 /* this_replacement_char[1] = '\0' was done at initialization */
2490 this_replacement_len = 1;
2493 /* The worst case length for the replaced string would be if every
2494 * character in it is NUL. Multiply that by the length of each
2495 * replacement, and allow for a trailing NUL */
2496 sans_nuls_len = (len * this_replacement_len) + 1;
2497 Newx(sans_nuls, sans_nuls_len, char);
2500 /* Replace each NUL with the lowest collating control. Loop until have
2501 * exhausted all the NULs */
2502 while (s + s_strlen < e) {
2503 my_strlcat(sans_nuls, s, sans_nuls_len);
2505 /* Do the actual replacement */
2506 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
2508 /* Move past the input NUL */
2510 s_strlen = strlen(s);
2513 /* And add anything that trails the final NUL */
2514 my_strlcat(sans_nuls, s, sans_nuls_len);
2516 /* Switch so below we transform this modified string */
2519 } /* End of replacing NULs */
2521 /* Make sure the UTF8ness of the string and locale match */
2522 if (utf8 != PL_in_utf8_COLLATE_locale) {
2523 const char * const t = s; /* Temporary so we can later find where the
2526 /* Here they don't match. Change the string's to be what the locale is
2529 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
2530 s = (char *) bytes_to_utf8((const U8 *) s, &len);
2533 else { /* locale is not UTF-8; but input is; downgrade the input */
2535 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
2537 /* If the downgrade was successful we are done, but if the input
2538 * contains things that require UTF-8 to represent, have to do
2539 * damage control ... */
2540 if (UNLIKELY(utf8)) {
2542 /* What we do is construct a non-UTF-8 string with
2543 * 1) the characters representable by a single byte converted
2544 * to be so (if necessary);
2545 * 2) and the rest converted to collate the same as the
2546 * highest collating representable character. That makes
2547 * them collate at the end. This is similar to how we
2548 * handle embedded NULs, but we use the highest collating
2549 * code point instead of the smallest. Like the NUL case,
2550 * this isn't perfect, but is the best we can reasonably
2551 * do. Every above-255 code point will sort the same as
2552 * the highest-sorting 0-255 code point. If that code
2553 * point can combine in a sequence with some other code
2554 * points for weight calculations, us changing something to
2555 * be it can adversely affect the results. But in most
2556 * cases, it should work reasonably. And note that this is
2557 * really an illegal situation: using code points above 255
2558 * on a locale where only 0-255 are valid. If two strings
2559 * sort entirely equal, then the sort order for the
2560 * above-255 code points will be in code point order. */
2564 /* If we haven't calculated the code point with the maximum
2565 * collating order for this locale, do so now */
2566 if (! PL_strxfrm_max_cp) {
2569 /* The current transformed string that collates the
2570 * highest (except it also includes the prefixed collation
2572 char * cur_max_x = NULL;
2574 /* Look through all legal code points (NUL isn't) */
2575 for (j = 1; j < 256; j++) {
2578 char cur_source[] = { '\0', '\0' };
2580 /* Create a 1-char string of the current code point */
2581 cur_source[0] = (char) j;
2583 /* Then transform it */
2584 x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
2586 /* If something went wrong (which it shouldn't), just
2587 * ignore this code point */
2592 /* If this character's transformation is higher than
2593 * the current highest, this one becomes the highest */
2594 if ( cur_max_x == NULL
2595 || strGT(x + COLLXFRM_HDR_LEN,
2596 cur_max_x + COLLXFRM_HDR_LEN))
2598 PL_strxfrm_max_cp = j;
2607 DEBUG_L(PerlIO_printf(Perl_debug_log,
2608 "_mem_collxfrm: Couldn't find any character to"
2609 " replace above-Latin1 chars in locale %s with",
2610 PL_collation_name));
2614 DEBUG_L(PerlIO_printf(Perl_debug_log,
2615 "_mem_collxfrm: highest 1-byte collating character"
2616 " in locale %s is 0x%02X\n",
2618 PL_strxfrm_max_cp));
2620 Safefree(cur_max_x);
2623 /* Here we know which legal code point collates the highest.
2624 * We are ready to construct the non-UTF-8 string. The length
2625 * will be at least 1 byte smaller than the input string
2626 * (because we changed at least one 2-byte character into a
2627 * single byte), but that is eaten up by the trailing NUL */
2633 char * e = (char *) t + len;
2635 for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
2637 if (UTF8_IS_INVARIANT(cur_char)) {
2640 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
2641 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
2643 else { /* Replace illegal cp with highest collating
2645 s[d++] = PL_strxfrm_max_cp;
2649 Renew(s, d, char); /* Free up unused space */
2654 /* Here, we have constructed a modified version of the input. It could
2655 * be that we already had a modified copy before we did this version.
2656 * If so, that copy is no longer needed */
2657 if (t != input_string) {
2662 length_in_chars = (utf8)
2663 ? utf8_length((U8 *) s, (U8 *) s + len)
2666 /* The first element in the output is the collation id, used by
2667 * sv_collxfrm(); then comes the space for the transformed string. The
2668 * equation should give us a good estimate as to how much is needed */
2669 xAlloc = COLLXFRM_HDR_LEN
2671 + (PL_collxfrm_mult * length_in_chars);
2672 Newx(xbuf, xAlloc, char);
2673 if (UNLIKELY(! xbuf)) {
2674 DEBUG_L(PerlIO_printf(Perl_debug_log,
2675 "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
2679 /* Store the collation id */
2680 *(U32*)xbuf = PL_collation_ix;
2682 /* Then the transformation of the input. We loop until successful, or we
2686 *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
2688 /* If the transformed string occupies less space than we told strxfrm()
2689 * was available, it means it successfully transformed the whole
2691 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
2693 /* Some systems include a trailing NUL in the returned length.
2694 * Ignore it, using a loop in case multiple trailing NULs are
2697 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
2702 /* If the first try didn't get it, it means our prediction was low.
2703 * Modify the coefficients so that we predict a larger value in any
2704 * future transformations */
2706 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */
2707 STRLEN computed_guess = PL_collxfrm_base
2708 + (PL_collxfrm_mult * length_in_chars);
2710 /* On zero-length input, just keep current slope instead of
2712 const STRLEN new_m = (length_in_chars != 0)
2713 ? needed / length_in_chars
2716 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2717 "%s: %d: initial size of %zu bytes for a length "
2718 "%zu string was insufficient, %zu needed\n",
2720 computed_guess, length_in_chars, needed));
2722 /* If slope increased, use it, but discard this result for
2723 * length 1 strings, as we can't be sure that it's a real slope
2725 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
2729 STRLEN old_m = PL_collxfrm_mult;
2730 STRLEN old_b = PL_collxfrm_base;
2734 PL_collxfrm_mult = new_m;
2735 PL_collxfrm_base = 1; /* +1 For trailing NUL */
2736 computed_guess = PL_collxfrm_base
2737 + (PL_collxfrm_mult * length_in_chars);
2738 if (computed_guess < needed) {
2739 PL_collxfrm_base += needed - computed_guess;
2742 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2743 "%s: %d: slope is now %zu; was %zu, base "
2744 "is now %zu; was %zu\n",
2746 PL_collxfrm_mult, old_m,
2747 PL_collxfrm_base, old_b));
2749 else { /* Slope didn't change, but 'b' did */
2750 const STRLEN new_b = needed
2753 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2754 "%s: %d: base is now %zu; was %zu\n",
2756 new_b, PL_collxfrm_base));
2757 PL_collxfrm_base = new_b;
2764 if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
2765 DEBUG_L(PerlIO_printf(Perl_debug_log,
2766 "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
2767 *xlen, PERL_INT_MAX));
2771 /* A well-behaved strxfrm() returns exactly how much space it needs
2772 * (usually not including the trailing NUL) when it fails due to not
2773 * enough space being provided. Assume that this is the case unless
2774 * it's been proven otherwise */
2775 if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
2776 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
2778 else { /* Here, either:
2779 * 1) The strxfrm() has previously shown bad behavior; or
2780 * 2) It isn't the first time through the loop, which means
2781 * that the strxfrm() is now showing bad behavior, because
2782 * we gave it what it said was needed in the previous
2783 * iteration, and it came back saying it needed still more.
2784 * (Many versions of cygwin fit this. When the buffer size
2785 * isn't sufficient, they return the input size instead of
2786 * how much is needed.)
2787 * Increase the buffer size by a fixed percentage and try again.
2789 xAlloc += (xAlloc / 4) + 1;
2790 PL_strxfrm_is_behaved = FALSE;
2794 if (DEBUG_Lv_TEST || debug_initialization) {
2795 PerlIO_printf(Perl_debug_log,
2796 "_mem_collxfrm required more space than previously calculated"
2797 " for locale %s, trying again with new guess=%d+%zu\n",
2798 PL_collation_name, (int) COLLXFRM_HDR_LEN,
2799 xAlloc - COLLXFRM_HDR_LEN);
2806 Renew(xbuf, xAlloc, char);
2807 if (UNLIKELY(! xbuf)) {
2808 DEBUG_L(PerlIO_printf(Perl_debug_log,
2809 "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
2819 if (DEBUG_Lv_TEST || debug_initialization) {
2821 print_collxfrm_input_and_return(s, s + len, xlen, utf8);
2822 PerlIO_printf(Perl_debug_log, "Its xfrm is:");
2823 PerlIO_printf(Perl_debug_log, "%s\n",
2824 _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
2830 /* Free up unneeded space; retain ehough for trailing NUL */
2831 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
2833 if (s != input_string) {
2841 if (s != input_string) {
2848 if (DEBUG_Lv_TEST || debug_initialization) {
2849 print_collxfrm_input_and_return(s, s + len, NULL, utf8);
2860 S_print_collxfrm_input_and_return(pTHX_
2861 const char * const s,
2862 const char * const e,
2863 const STRLEN * const xlen,
2867 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
2869 PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
2870 (UV)PL_collation_ix);
2872 PerlIO_printf(Perl_debug_log, "%zu", *xlen);
2875 PerlIO_printf(Perl_debug_log, "NULL");
2877 PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
2879 print_bytes_for_locale(s, e, is_utf8);
2881 PerlIO_printf(Perl_debug_log, "'\n");
2885 S_print_bytes_for_locale(pTHX_
2886 const char * const s,
2887 const char * const e,
2891 bool prev_was_printable = TRUE;
2892 bool first_time = TRUE;
2894 PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
2898 ? utf8_to_uvchr_buf((U8 *) t, e, NULL)
2901 if (! prev_was_printable) {
2902 PerlIO_printf(Perl_debug_log, " ");
2904 PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
2905 prev_was_printable = TRUE;
2909 PerlIO_printf(Perl_debug_log, " ");
2911 PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
2912 prev_was_printable = FALSE;
2914 t += (is_utf8) ? UTF8SKIP(t) : 1;
2919 # endif /* #ifdef DEBUGGING */
2920 #endif /* USE_LOCALE_COLLATE */
2925 Perl__is_cur_LC_category_utf8(pTHX_ int category)
2927 /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
2928 * otherwise. 'category' may not be LC_ALL. If the platform doesn't have
2929 * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
2930 * could give the wrong result. The result will very likely be correct for
2931 * languages that have commonly used non-ASCII characters, but for notably
2932 * English, it comes down to if the locale's name ends in something like
2933 * "UTF-8". It errs on the side of not being a UTF-8 locale. */
2935 char *save_input_locale = NULL;
2940 assert(category != LC_ALL);
2944 /* First dispose of the trivial cases */
2945 save_input_locale = do_setlocale_r(category, NULL);
2946 if (! save_input_locale) {
2947 DEBUG_L(PerlIO_printf(Perl_debug_log,
2948 "Could not find current locale for category %d\n",
2950 return FALSE; /* XXX maybe should croak */
2952 save_input_locale = stdize_locale(savepv(save_input_locale));
2953 if (isNAME_C_OR_POSIX(save_input_locale)) {
2954 DEBUG_L(PerlIO_printf(Perl_debug_log,
2955 "Current locale for category %d is %s\n",
2956 category, save_input_locale));
2957 Safefree(save_input_locale);
2961 # if defined(USE_LOCALE_CTYPE) \
2962 && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET)))
2964 { /* Next try nl_langinfo or MB_CUR_MAX if available */
2966 char *save_ctype_locale = NULL;
2969 if (category != LC_CTYPE) { /* These work only on LC_CTYPE */
2971 /* Get the current LC_CTYPE locale */
2972 save_ctype_locale = do_setlocale_c(LC_CTYPE, NULL);
2973 if (! save_ctype_locale) {
2974 DEBUG_L(PerlIO_printf(Perl_debug_log,
2975 "Could not find current locale for LC_CTYPE\n"));
2976 goto cant_use_nllanginfo;
2978 save_ctype_locale = stdize_locale(savepv(save_ctype_locale));
2980 /* If LC_CTYPE and the desired category use the same locale, this
2981 * means that finding the value for LC_CTYPE is the same as finding
2982 * the value for the desired category. Otherwise, switch LC_CTYPE
2983 * to the desired category's locale */
2984 if (strEQ(save_ctype_locale, save_input_locale)) {
2985 Safefree(save_ctype_locale);
2986 save_ctype_locale = NULL;
2988 else if (! do_setlocale_c(LC_CTYPE, save_input_locale)) {
2989 DEBUG_L(PerlIO_printf(Perl_debug_log,
2990 "Could not change LC_CTYPE locale to %s\n",
2991 save_input_locale));
2992 Safefree(save_ctype_locale);
2993 goto cant_use_nllanginfo;
2997 DEBUG_L(PerlIO_printf(Perl_debug_log, "Current LC_CTYPE locale=%s\n",
2998 save_input_locale));
3000 /* Here the current LC_CTYPE is set to the locale of the category whose
3001 * information is desired. This means that nl_langinfo() and MB_CUR_MAX
3002 * should give the correct results */
3004 # if defined(HAS_NL_LANGINFO) && defined(CODESET)
3005 /* The task is easiest if has this POSIX 2001 function */
3008 const char *codeset = my_nl_langinfo(PERL_CODESET, FALSE);
3009 /* FALSE => already in dest locale */
3011 DEBUG_L(PerlIO_printf(Perl_debug_log,
3012 "\tnllanginfo returned CODESET '%s'\n", codeset));
3014 if (codeset && strNE(codeset, "")) {
3015 /* If we switched LC_CTYPE, switch back */
3016 if (save_ctype_locale) {
3017 do_setlocale_c(LC_CTYPE, save_ctype_locale);
3018 Safefree(save_ctype_locale);
3021 is_utf8 = foldEQ(codeset, STR_WITH_LEN("UTF-8"))
3022 || foldEQ(codeset, STR_WITH_LEN("UTF8"));
3024 DEBUG_L(PerlIO_printf(Perl_debug_log,
3025 "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
3027 Safefree(save_input_locale);
3035 /* Here, either we don't have nl_langinfo, or it didn't return a
3036 * codeset. Try MB_CUR_MAX */
3038 /* Standard UTF-8 needs at least 4 bytes to represent the maximum
3039 * Unicode code point. Since UTF-8 is the only non-single byte
3040 * encoding we handle, we just say any such encoding is UTF-8, and if
3041 * turns out to be wrong, other things will fail */
3042 is_utf8 = MB_CUR_MAX >= 4;
3044 DEBUG_L(PerlIO_printf(Perl_debug_log,
3045 "\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n",
3046 (int) MB_CUR_MAX, is_utf8));
3048 Safefree(save_input_locale);
3052 /* ... But, most system that have MB_CUR_MAX will also have mbtowc(),
3053 * since they are both in the C99 standard. We can feed a known byte
3054 * string to the latter function, and check that it gives the expected
3060 PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
3062 len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
3065 if ( len != STRLENs(REPLACEMENT_CHARACTER_UTF8)
3066 || wc != (wchar_t) 0xFFFD)
3069 DEBUG_L(PerlIO_printf(Perl_debug_log, "\replacement=U+%x\n",
3071 DEBUG_L(PerlIO_printf(Perl_debug_log,
3072 "\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n",
3079 /* If we switched LC_CTYPE, switch back */
3080 if (save_ctype_locale) {
3081 do_setlocale_c(LC_CTYPE, save_ctype_locale);
3082 Safefree(save_ctype_locale);
3091 cant_use_nllanginfo:
3093 # else /* nl_langinfo should work if available, so don't bother compiling this
3094 fallback code. The final fallback of looking at the name is
3095 compiled, and will be executed if nl_langinfo fails */
3097 /* nl_langinfo not available or failed somehow. Next try looking at the
3098 * currency symbol to see if it disambiguates things. Often that will be
3099 * in the native script, and if the symbol isn't in UTF-8, we know that the
3100 * locale isn't. If it is non-ASCII UTF-8, we infer that the locale is
3101 * too, as the odds of a non-UTF8 string being valid UTF-8 are quite small
3104 # ifdef HAS_LOCALECONV
3105 # ifdef USE_LOCALE_MONETARY
3108 char *save_monetary_locale = NULL;
3109 bool only_ascii = FALSE;
3110 bool is_utf8 = FALSE;
3113 /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of
3114 * the desired category, if it isn't that locale already */
3116 if (category != LC_MONETARY) {
3118 save_monetary_locale = do_setlocale_c(LC_MONETARY, NULL);
3119 if (! save_monetary_locale) {
3120 DEBUG_L(PerlIO_printf(Perl_debug_log,
3121 "Could not find current locale for LC_MONETARY\n"));
3122 goto cant_use_monetary;
3124 save_monetary_locale = stdize_locale(savepv(save_monetary_locale));
3126 if (strEQ(save_monetary_locale, save_input_locale)) {
3127 Safefree(save_monetary_locale);
3128 save_monetary_locale = NULL;
3130 else if (! do_setlocale_c(LC_MONETARY, save_input_locale)) {
3131 DEBUG_L(PerlIO_printf(Perl_debug_log,
3132 "Could not change LC_MONETARY locale to %s\n",
3133 save_input_locale));
3134 Safefree(save_monetary_locale);
3135 goto cant_use_monetary;
3139 /* Here the current LC_MONETARY is set to the locale of the category
3140 * whose information is desired. */
3144 || ! lc->currency_symbol
3145 || is_utf8_invariant_string((U8 *) lc->currency_symbol, 0))
3147 DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3151 is_utf8 = is_utf8_string((U8 *) lc->currency_symbol, 0);
3154 /* If we changed it, restore LC_MONETARY to its original locale */
3155 if (save_monetary_locale) {
3156 do_setlocale_c(LC_MONETARY, save_monetary_locale);
3157 Safefree(save_monetary_locale);
3162 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
3163 * otherwise assume the locale is UTF-8 if and only if the symbol
3164 * is non-ascii UTF-8. */
3165 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
3166 save_input_locale, is_utf8));
3167 Safefree(save_input_locale);
3173 # endif /* USE_LOCALE_MONETARY */
3174 # endif /* HAS_LOCALECONV */
3176 # if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
3178 /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
3179 * the names of the months and weekdays, timezone, and am/pm indicator */
3181 char *save_time_locale = NULL;
3183 bool is_dst = FALSE;
3187 char * formatted_time;
3190 /* Like above for LC_MONETARY, we set LC_TIME to the locale of the
3191 * desired category, if it isn't that locale already */
3193 if (category != LC_TIME) {
3195 save_time_locale = do_setlocale_c(LC_TIME, NULL);
3196 if (! save_time_locale) {
3197 DEBUG_L(PerlIO_printf(Perl_debug_log,
3198 "Could not find current locale for LC_TIME\n"));
3201 save_time_locale = stdize_locale(savepv(save_time_locale));
3203 if (strEQ(save_time_locale, save_input_locale)) {
3204 Safefree(save_time_locale);
3205 save_time_locale = NULL;
3207 else if (! do_setlocale_c(LC_TIME, save_input_locale)) {
3208 DEBUG_L(PerlIO_printf(Perl_debug_log,
3209 "Could not change LC_TIME locale to %s\n",
3210 save_input_locale));
3211 Safefree(save_time_locale);
3216 /* Here the current LC_TIME is set to the locale of the category
3217 * whose information is desired. Look at all the days of the week and
3218 * month names, and the timezone and am/pm indicator for UTF-8 variant
3219 * characters. The first such a one found will tell us if the locale
3220 * is UTF-8 or not */
3222 for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
3223 formatted_time = my_strftime("%A %B %Z %p",
3224 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
3225 if ( ! formatted_time
3226 || is_utf8_invariant_string((U8 *) formatted_time, 0))
3229 /* Here, we didn't find a non-ASCII. Try the next time through
3230 * with the complemented dst and am/pm, and try with the next
3231 * weekday. After we have gotten all weekdays, try the next
3234 hour = (hour + 12) % 24;
3242 /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
3243 * false otherwise. But first, restore LC_TIME to its original
3244 * locale if we changed it */
3245 if (save_time_locale) {
3246 do_setlocale_c(LC_TIME, save_time_locale);
3247 Safefree(save_time_locale);
3250 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
3252 is_utf8_string((U8 *) formatted_time, 0)));
3253 Safefree(save_input_locale);
3254 return is_utf8_string((U8 *) formatted_time, 0);
3257 /* Falling off the end of the loop indicates all the names were just
3258 * ASCII. Go on to the next test. If we changed it, restore LC_TIME
3259 * to its original locale */
3260 if (save_time_locale) {
3261 do_setlocale_c(LC_TIME, save_time_locale);
3262 Safefree(save_time_locale);
3264 DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3270 # if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
3272 /* This code is ifdefd out because it was found to not be necessary in testing
3273 * on our dromedary test machine, which has over 700 locales. There, this
3274 * added no value to looking at the currency symbol and the time strings. I
3275 * left it in so as to avoid rewriting it if real-world experience indicates
3276 * that dromedary is an outlier. Essentially, instead of returning abpve if we
3277 * haven't found illegal utf8, we continue on and examine all the strerror()
3278 * messages on the platform for utf8ness. If all are ASCII, we still don't
3279 * know the answer; but otherwise we have a pretty good indication of the
3280 * utf8ness. The reason this doesn't help much is that the messages may not
3281 * have been translated into the locale. The currency symbol and time strings
3282 * are much more likely to have been translated. */
3285 bool is_utf8 = FALSE;
3286 bool non_ascii = FALSE;
3287 char *save_messages_locale = NULL;
3288 const char * errmsg = NULL;
3290 /* Like above, we set LC_MESSAGES to the locale of the desired
3291 * category, if it isn't that locale already */
3293 if (category != LC_MESSAGES) {
3295 save_messages_locale = do_setlocale_c(LC_MESSAGES, NULL);
3296 if (! save_messages_locale) {
3297 DEBUG_L(PerlIO_printf(Perl_debug_log,
3298 "Could not find current locale for LC_MESSAGES\n"));
3299 goto cant_use_messages;
3301 save_messages_locale = stdize_locale(savepv(save_messages_locale));
3303 if (strEQ(save_messages_locale, save_input_locale)) {
3304 Safefree(save_messages_locale);
3305 save_messages_locale = NULL;
3307 else if (! do_setlocale_c(LC_MESSAGES, save_input_locale)) {
3308 DEBUG_L(PerlIO_printf(Perl_debug_log,
3309 "Could not change LC_MESSAGES locale to %s\n",
3310 save_input_locale));
3311 Safefree(save_messages_locale);
3312 goto cant_use_messages;
3316 /* Here the current LC_MESSAGES is set to the locale of the category
3317 * whose information is desired. Look through all the messages. We
3318 * can't use Strerror() here because it may expand to code that
3319 * segfaults in miniperl */
3321 for (e = 0; e <= sys_nerr; e++) {
3323 errmsg = sys_errlist[e];
3324 if (errno || !errmsg) {
3327 errmsg = savepv(errmsg);
3328 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
3330 is_utf8 = is_utf8_string((U8 *) errmsg, 0);
3336 /* And, if we changed it, restore LC_MESSAGES to its original locale */
3337 if (save_messages_locale) {
3338 do_setlocale_c(LC_MESSAGES, save_messages_locale);
3339 Safefree(save_messages_locale);
3344 /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
3345 * any non-ascii means it is one; otherwise we assume it isn't */
3346 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
3349 Safefree(save_input_locale);
3353 DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3358 # endif /* the code that is compiled when no nl_langinfo */
3360 # ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
3363 /* As a last resort, look at the locale name to see if it matches
3364 * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
3365 * return of setlocale(), is actually defined to be opaque, so we can't
3366 * really rely on the absence of various substrings in the name to indicate
3367 * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
3368 * be a UTF-8 locale. Similarly for the other common names */
3370 final_pos = strlen(save_input_locale) - 1;
3371 if (final_pos >= 3) {
3372 char *name = save_input_locale;
3374 /* Find next 'U' or 'u' and look from there */
3375 while ((name += strcspn(name, "Uu") + 1)
3376 <= save_input_locale + final_pos - 2)
3378 if ( isALPHA_FOLD_NE(*name, 't')
3379 || isALPHA_FOLD_NE(*(name + 1), 'f'))
3384 if (*(name) == '-') {
3385 if ((name > save_input_locale + final_pos - 1)) {
3390 if (*(name) == '8') {
3391 DEBUG_L(PerlIO_printf(Perl_debug_log,
3392 "Locale %s ends with UTF-8 in name\n",
3393 save_input_locale));
3394 Safefree(save_input_locale);
3398 DEBUG_L(PerlIO_printf(Perl_debug_log,
3399 "Locale %s doesn't end with UTF-8 in name\n",
3400 save_input_locale));
3406 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
3407 if (memENDs(save_input_locale, final_pos, "65001")) {
3408 DEBUG_L(PerlIO_printf(Perl_debug_log,
3409 "Locale %s ends with 65001 in name, is UTF-8 locale\n",
3410 save_input_locale));
3411 Safefree(save_input_locale);
3417 /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
3418 * since we are about to return FALSE anyway, there is no point in doing
3419 * this extra work */
3422 if (instr(save_input_locale, "8859")) {
3423 DEBUG_L(PerlIO_printf(Perl_debug_log,
3424 "Locale %s has 8859 in name, not UTF-8 locale\n",
3425 save_input_locale));
3426 Safefree(save_input_locale);
3431 DEBUG_L(PerlIO_printf(Perl_debug_log,
3432 "Assuming locale %s is not a UTF-8 locale\n",
3433 save_input_locale));
3434 Safefree(save_input_locale);
3442 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
3445 /* Internal function which returns if we are in the scope of a pragma that
3446 * enables the locale category 'category'. 'compiling' should indicate if
3447 * this is during the compilation phase (TRUE) or not (FALSE). */
3449 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
3451 SV *categories = cop_hints_fetch_pvs(cop, "locale", 0);
3452 if (! categories || categories == &PL_sv_placeholder) {
3456 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
3457 * a valid unsigned */
3458 assert(category >= -1);
3459 return cBOOL(SvUV(categories) & (1U << (category + 1)));
3463 Perl_my_strerror(pTHX_ const int errnum)
3465 /* Returns a mortalized copy of the text of the error message associated
3466 * with 'errnum'. It uses the current locale's text unless the platform
3467 * doesn't have the LC_MESSAGES category or we are not being called from
3468 * within the scope of 'use locale'. In the former case, it uses whatever
3469 * strerror returns; in the latter case it uses the text from the C locale.
3471 * The function just calls strerror(), but temporarily switches, if needed,
3472 * to the C locale */
3477 #ifndef USE_LOCALE_MESSAGES
3479 /* If platform doesn't have messages category, we don't do any switching to
3480 * the C locale; we just use whatever strerror() returns */
3482 errstr = savepv(Strerror(errnum));
3484 #else /* Has locale messages */
3486 const bool within_locale_scope = IN_LC(LC_MESSAGES);
3488 # if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
3490 /* This function is trivial if we have strerror_l() */
3492 if (within_locale_scope) {
3493 errstr = savepv(strerror(errnum));
3496 errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
3499 # else /* Doesn't have strerror_l(). */
3501 # ifdef USE_POSIX_2008_LOCALE
3503 locale_t save_locale = NULL;
3507 char * save_locale = NULL;
3508 bool locale_is_C = FALSE;
3510 /* We have a critical section to prevent another thread from changing the
3511 * locale out from under us (or zapping the buffer returned from
3517 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3518 "my_strerror called with errnum %d\n", errnum));
3519 if (! within_locale_scope) {
3522 # ifdef USE_POSIX_2008_LOCALE /* Use the thread-safe locale functions */
3524 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3525 "Not within locale scope, about to call"
3526 " uselocale(0x%p)\n", PL_C_locale_obj));
3527 save_locale = uselocale(PL_C_locale_obj);
3528 if (! save_locale) {
3529 DEBUG_L(PerlIO_printf(Perl_debug_log,
3530 "uselocale failed, errno=%d\n", errno));
3533 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3534 "uselocale returned 0x%p\n", save_locale));
3537 # else /* Not thread-safe build */
3539 save_locale = do_setlocale_c(LC_MESSAGES, NULL);
3540 if (! save_locale) {
3541 DEBUG_L(PerlIO_printf(Perl_debug_log,
3542 "setlocale failed, errno=%d\n", errno));
3545 locale_is_C = isNAME_C_OR_POSIX(save_locale);
3547 /* Switch to the C locale if not already in it */
3548 if (! locale_is_C) {
3550 /* The setlocale() just below likely will zap 'save_locale', so
3552 save_locale = savepv(save_locale);
3553 do_setlocale_c(LC_MESSAGES, "C");
3559 } /* end of ! within_locale_scope */
3561 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n",
3562 __FILE__, __LINE__));
3565 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3566 "Any locale change has been done; about to call Strerror\n"));
3567 errstr = savepv(Strerror(errnum));
3569 if (! within_locale_scope) {
3572 # ifdef USE_POSIX_2008_LOCALE
3574 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3575 "%s: %d: not within locale scope, restoring the locale\n",
3576 __FILE__, __LINE__));
3577 if (save_locale && ! uselocale(save_locale)) {
3578 DEBUG_L(PerlIO_printf(Perl_debug_log,
3579 "uselocale restore failed, errno=%d\n", errno));
3585 if (save_locale && ! locale_is_C) {
3586 if (! do_setlocale_c(LC_MESSAGES, save_locale)) {
3587 DEBUG_L(PerlIO_printf(Perl_debug_log,
3588 "setlocale restore failed, errno=%d\n", errno));
3590 Safefree(save_locale);
3597 # endif /* End of doesn't have strerror_l */
3598 #endif /* End of does have locale messages */
3602 if (DEBUG_Lv_TEST) {
3603 PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '");
3604 print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);
3605 PerlIO_printf(Perl_debug_log, "'\n");
3616 =for apidoc sync_locale
3618 Changing the program's locale should be avoided by XS code. Nevertheless,
3619 certain non-Perl libraries called from XS, such as C<Gtk> do so. When this
3620 happens, Perl needs to be told that the locale has changed. Use this function
3621 to do so, before returning to Perl.
3627 Perl_sync_locale(pTHX)
3631 #ifdef USE_LOCALE_CTYPE
3633 newlocale = do_setlocale_c(LC_CTYPE, NULL);
3634 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3635 "%s:%d: %s\n", __FILE__, __LINE__,
3636 setlocale_debug_string(LC_CTYPE, NULL, newlocale)));
3637 new_ctype(newlocale);
3639 #endif /* USE_LOCALE_CTYPE */
3640 #ifdef USE_LOCALE_COLLATE
3642 newlocale = do_setlocale_c(LC_COLLATE, NULL);
3643 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3644 "%s:%d: %s\n", __FILE__, __LINE__,
3645 setlocale_debug_string(LC_COLLATE, NULL, newlocale)));
3646 new_collate(newlocale);
3649 #ifdef USE_LOCALE_NUMERIC
3651 newlocale = do_setlocale_c(LC_NUMERIC, NULL);
3652 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3653 "%s:%d: %s\n", __FILE__, __LINE__,
3654 setlocale_debug_string(LC_NUMERIC, NULL, newlocale)));
3655 new_numeric(newlocale);
3657 #endif /* USE_LOCALE_NUMERIC */
3661 #if defined(DEBUGGING) && defined(USE_LOCALE)
3664 S_setlocale_debug_string(const int category, /* category number,
3666 const char* const locale, /* locale name */
3668 /* return value from setlocale() when attempting to
3669 * set 'category' to 'locale' */
3670 const char* const retval)
3672 /* Returns a pointer to a NUL-terminated string in static storage with
3673 * added text about the info passed in. This is not thread safe and will
3674 * be overwritten by the next call, so this should be used just to
3675 * formulate a string to immediately print or savepv() on. */
3677 /* initialise to a non-null value to keep it out of BSS and so keep
3678 * -DPERL_GLOBAL_STRUCT_PRIVATE happy */
3679 static char ret[128] = "If you can read this, thank your buggy C"
3680 " library strlcpy(), and change your hints file"
3682 my_strlcpy(ret, "setlocale(", sizeof(ret));
3686 my_snprintf(ret, sizeof(ret), "%s? %d", ret, category);
3692 my_strlcat(ret, "LC_ALL", sizeof(ret));
3699 my_strlcat(ret, "LC_CTYPE", sizeof(ret));
3706 my_strlcat(ret, "LC_NUMERIC", sizeof(ret));
3713 my_strlcat(ret, "LC_COLLATE", sizeof(ret));
3720 my_strlcat(ret, "LC_TIME", sizeof(ret));
3727 my_strlcat(ret, "LC_MONETARY", sizeof(ret));
3734 my_strlcat(ret, "LC_MESSAGES", sizeof(ret));
3741 my_strlcat(ret, ", ", sizeof(ret));
3744 my_strlcat(ret, "\"", sizeof(ret));
3745 my_strlcat(ret, locale, sizeof(ret));
3746 my_strlcat(ret, "\"", sizeof(ret));
3749 my_strlcat(ret, "NULL", sizeof(ret));
3752 my_strlcat(ret, ") returned ", sizeof(ret));
3755 my_strlcat(ret, "\"", sizeof(ret));
3756 my_strlcat(ret, retval, sizeof(ret));
3757 my_strlcat(ret, "\"", sizeof(ret));
3760 my_strlcat(ret, "NULL", sizeof(ret));
3763 assert(strlen(ret) < sizeof(ret));
3772 * ex: set ts=8 sts=4 sw=4 et: