3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * A Elbereth Gilthoniel,
13 * silivren penna míriel
14 * o menel aglar elenath!
15 * Na-chaered palan-díriel
16 * o galadhremmin ennorath,
17 * Fanuilos, le linnathon
18 * nef aear, si nef aearon!
20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
23 /* utility functions for handling locale-specific stuff like what
24 * character represents the decimal point.
26 * All C programs have an underlying locale. Perl generally doesn't pay any
27 * attention to it except within the scope of a 'use locale'. For most
28 * categories, it accomplishes this by just using different operations if it is
29 * in such scope than if not. However, various libc functions called by Perl
30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31 * are used to toggle between the current locale and the C locale depending on
32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is
33 * switched to the C locale for outputting the message unless within the scope
38 #define PERL_IN_LOCALE_C
39 #include "perl_langinfo.h"
44 /* If the environment says to, we can output debugging information during
45 * initialization. This is done before option parsing, and before any thread
46 * creation, so can be a file-level static */
48 # ifdef PERL_GLOBAL_STRUCT
49 /* no global syms allowed */
50 # define debug_initialization 0
51 # define DEBUG_INITIALIZATION_set(v)
53 static bool debug_initialization = FALSE;
54 # define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
61 * Standardize the locale name from a string returned by 'setlocale', possibly
62 * modifying that string.
64 * The typical return value of setlocale() is either
65 * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
66 * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
67 * (the space-separated values represent the various sublocales,
68 * in some unspecified order). This is not handled by this function.
70 * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
71 * which is harmful for further use of the string in setlocale(). This
72 * function removes the trailing new line and everything up through the '='
76 S_stdize_locale(pTHX_ char *locs)
78 const char * const s = strchr(locs, '=');
81 PERL_ARGS_ASSERT_STDIZE_LOCALE;
84 const char * const t = strchr(s, '.');
87 const char * const u = strchr(t, '\n');
88 if (u && (u[1] == 0)) {
89 const STRLEN len = u - s;
90 Move(s + 1, locs, len, char);
98 Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
106 S_set_numeric_radix(pTHX)
108 #ifdef USE_LOCALE_NUMERIC
109 # ifdef HAS_LOCALECONV
110 const struct lconv* const lc = localeconv();
112 if (lc && lc->decimal_point) {
113 if (lc->decimal_point[0] == '.' && lc->decimal_point[1] == 0) {
114 SvREFCNT_dec(PL_numeric_radix_sv);
115 PL_numeric_radix_sv = NULL;
118 if (PL_numeric_radix_sv)
119 sv_setpv(PL_numeric_radix_sv, lc->decimal_point);
121 PL_numeric_radix_sv = newSVpv(lc->decimal_point, 0);
122 if (! is_utf8_invariant_string((U8 *) lc->decimal_point, 0)
123 && is_utf8_string((U8 *) lc->decimal_point, 0)
124 && _is_cur_LC_category_utf8(LC_NUMERIC))
126 SvUTF8_on(PL_numeric_radix_sv);
131 PL_numeric_radix_sv = NULL;
134 if (DEBUG_L_TEST || debug_initialization) {
135 PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
136 (PL_numeric_radix_sv)
137 ? SvPVX(PL_numeric_radix_sv)
139 (PL_numeric_radix_sv)
140 ? cBOOL(SvUTF8(PL_numeric_radix_sv))
145 # endif /* HAS_LOCALECONV */
146 #endif /* USE_LOCALE_NUMERIC */
149 /* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
150 * return of setlocale(), then this is extremely likely to be the C or POSIX
151 * locale. However, the output of setlocale() is documented to be opaque, but
152 * the odds are extremely small that it would return these two strings for some
153 * other locale. Note that VMS in these two locales includes many non-ASCII
154 * characters as controls and punctuation (below are hex bytes):
155 * cntrl: 00-1F 7F 84-97 9B-9F
156 * punct: 21-2F 3A-40 5B-60 7B-7E A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
157 * Oddly, none there are listed as alphas, though some represent alphabetics
158 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
159 #define isNAME_C_OR_POSIX(name) ((name) != NULL \
160 && ((*(name) == 'C' && (*(name + 1)) == '\0') \
161 || strEQ((name), "POSIX")))
164 Perl_new_numeric(pTHX_ const char *newnum)
166 #ifdef USE_LOCALE_NUMERIC
168 /* Called after all libc setlocale() calls affecting LC_NUMERIC, to tell
169 * core Perl this and that 'newnum' is the name of the new locale.
170 * It installs this locale as the current underlying default.
172 * The default locale and the C locale can be toggled between by use of the
173 * set_numeric_local() and set_numeric_standard() functions, which should
174 * probably not be called directly, but only via macros like
175 * SET_NUMERIC_STANDARD() in perl.h.
177 * The toggling is necessary mainly so that a non-dot radix decimal point
178 * character can be output, while allowing internal calculations to use a
181 * This sets several interpreter-level variables:
182 * PL_numeric_name The underlying locale's name: a copy of 'newnum'
183 * PL_numeric_local A boolean indicating if the toggled state is such
184 * that the current locale is the program's underlying
186 * PL_numeric_standard An int indicating if the toggled state is such
187 * that the current locale is the C locale. If non-zero,
188 * it is in C; if > 1, it means it may not be toggled away
190 * Note that both of the last two variables can be true at the same time,
191 * if the underlying locale is C. (Toggling is a no-op under these
194 * Any code changing the locale (outside this file) should use
195 * POSIX::setlocale, which calls this function. Therefore this function
196 * should be called directly only from this file and from
197 * POSIX::setlocale() */
202 Safefree(PL_numeric_name);
203 PL_numeric_name = NULL;
204 PL_numeric_standard = TRUE;
205 PL_numeric_local = TRUE;
209 save_newnum = stdize_locale(savepv(newnum));
211 PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
212 PL_numeric_local = TRUE;
214 if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
215 Safefree(PL_numeric_name);
216 PL_numeric_name = save_newnum;
219 Safefree(save_newnum);
222 /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't
223 * have to worry about the radix being a non-dot. (Core operations that
224 * need the underlying locale change to it temporarily). */
225 set_numeric_standard();
230 PERL_UNUSED_ARG(newnum);
231 #endif /* USE_LOCALE_NUMERIC */
235 Perl_set_numeric_standard(pTHX)
237 #ifdef USE_LOCALE_NUMERIC
238 /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like
239 * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The
240 * macro avoids calling this routine if toggling isn't necessary according
241 * to our records (which could be wrong if some XS code has changed the
242 * locale behind our back) */
244 setlocale(LC_NUMERIC, "C");
245 PL_numeric_standard = TRUE;
246 PL_numeric_local = isNAME_C_OR_POSIX(PL_numeric_name);
249 if (DEBUG_L_TEST || debug_initialization) {
250 PerlIO_printf(Perl_debug_log,
251 "Underlying LC_NUMERIC locale now is C\n");
255 #endif /* USE_LOCALE_NUMERIC */
259 Perl_set_numeric_local(pTHX)
261 #ifdef USE_LOCALE_NUMERIC
262 /* Toggle the LC_NUMERIC locale to the current underlying default. Most
263 * code should use the macros like SET_NUMERIC_LOCAL() in perl.h instead of
264 * calling this directly. The macro avoids calling this routine if
265 * toggling isn't necessary according to our records (which could be wrong
266 * if some XS code has changed the locale behind our back) */
268 setlocale(LC_NUMERIC, PL_numeric_name);
269 PL_numeric_standard = isNAME_C_OR_POSIX(PL_numeric_name);
270 PL_numeric_local = TRUE;
273 if (DEBUG_L_TEST || debug_initialization) {
274 PerlIO_printf(Perl_debug_log,
275 "Underlying LC_NUMERIC locale now is %s\n",
280 #endif /* USE_LOCALE_NUMERIC */
284 * Set up for a new ctype locale.
287 S_new_ctype(pTHX_ const char *newctype)
289 #ifdef USE_LOCALE_CTYPE
291 /* Called after all libc setlocale() calls affecting LC_CTYPE, to tell
292 * core Perl this and that 'newctype' is the name of the new locale.
294 * This function sets up the folding arrays for all 256 bytes, assuming
295 * that tofold() is tolc() since fold case is not a concept in POSIX,
297 * Any code changing the locale (outside this file) should use
298 * POSIX::setlocale, which calls this function. Therefore this function
299 * should be called directly only from this file and from
300 * POSIX::setlocale() */
305 PERL_ARGS_ASSERT_NEW_CTYPE;
307 /* We will replace any bad locale warning with 1) nothing if the new one is
308 * ok; or 2) a new warning for the bad new locale */
309 if (PL_warn_locale) {
310 SvREFCNT_dec_NN(PL_warn_locale);
311 PL_warn_locale = NULL;
314 PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
316 /* A UTF-8 locale gets standard rules. But note that code still has to
317 * handle this specially because of the three problematic code points */
318 if (PL_in_utf8_CTYPE_locale) {
319 Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
322 /* Assume enough space for every character being bad. 4 spaces each
323 * for the 94 printable characters that are output like "'x' "; and 5
324 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
326 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ];
328 /* Don't check for problems if we are suppressing the warnings */
329 bool check_for_problems = ckWARN_d(WARN_LOCALE)
330 || UNLIKELY(DEBUG_L_TEST);
331 bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
333 unsigned int bad_count = 0; /* Count of bad characters */
335 for (i = 0; i < 256; i++) {
336 if (isUPPER_LC((U8) i))
337 PL_fold_locale[i] = (U8) toLOWER_LC((U8) i);
338 else if (isLOWER_LC((U8) i))
339 PL_fold_locale[i] = (U8) toUPPER_LC((U8) i);
341 PL_fold_locale[i] = (U8) i;
343 /* If checking for locale problems, see if the native ASCII-range
344 * printables plus \n and \t are in their expected categories in
345 * the new locale. If not, this could mean big trouble, upending
346 * Perl's and most programs' assumptions, like having a
347 * metacharacter with special meaning become a \w. Fortunately,
348 * it's very rare to find locales that aren't supersets of ASCII
349 * nowadays. It isn't a problem for most controls to be changed
350 * into something else; we check only \n and \t, though perhaps \r
351 * could be an issue as well. */
352 if (check_for_problems
353 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
355 if ((isALPHANUMERIC_A(i) && ! isALPHANUMERIC_LC(i))
356 || (isPUNCT_A(i) && ! isPUNCT_LC(i))
357 || (isBLANK_A(i) && ! isBLANK_LC(i))
358 || (i == '\n' && ! isCNTRL_LC(i)))
360 if (bad_count) { /* Separate multiple entries with a
362 bad_chars_list[bad_count++] = ' ';
364 bad_chars_list[bad_count++] = '\'';
366 bad_chars_list[bad_count++] = (char) i;
369 bad_chars_list[bad_count++] = '\\';
371 bad_chars_list[bad_count++] = 'n';
375 bad_chars_list[bad_count++] = 't';
378 bad_chars_list[bad_count++] = '\'';
379 bad_chars_list[bad_count] = '\0';
385 /* We only handle single-byte locales (outside of UTF-8 ones; so if
386 * this locale requires more than one byte, there are going to be
388 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
389 "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
390 __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
392 if (check_for_problems && MB_CUR_MAX > 1
394 /* Some platforms return MB_CUR_MAX > 1 for even the "C"
395 * locale. Just assume that the implementation for them (plus
396 * for POSIX) is correct and the > 1 value is spurious. (Since
397 * these are specially handled to never be considered UTF-8
398 * locales, as long as this is the only problem, everything
399 * should work fine */
400 && strNE(newctype, "C") && strNE(newctype, "POSIX"))
402 multi_byte_locale = TRUE;
406 if (bad_count || multi_byte_locale) {
407 PL_warn_locale = Perl_newSVpvf(aTHX_
408 "Locale '%s' may not work well.%s%s%s\n",
411 ? " Some characters in it are not recognized by"
415 ? "\nThe following characters (and maybe others)"
416 " may not have the same meaning as the Perl"
417 " program expects:\n"
423 /* If we are actually in the scope of the locale or are debugging,
424 * output the message now. If not in that scope, we save the
425 * message to be output at the first operation using this locale,
426 * if that actually happens. Most programs don't use locales, so
427 * they are immune to bad ones. */
428 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
430 /* We have to save 'newctype' because the setlocale() just
431 * below may destroy it. The next setlocale() further down
432 * should restore it properly so that the intermediate change
433 * here is transparent to this function's caller */
434 const char * const badlocale = savepv(newctype);
436 setlocale(LC_CTYPE, "C");
438 /* The '0' below suppresses a bogus gcc compiler warning */
439 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
441 setlocale(LC_CTYPE, badlocale);
444 if (IN_LC(LC_CTYPE)) {
445 SvREFCNT_dec_NN(PL_warn_locale);
446 PL_warn_locale = NULL;
452 #endif /* USE_LOCALE_CTYPE */
453 PERL_ARGS_ASSERT_NEW_CTYPE;
454 PERL_UNUSED_ARG(newctype);
459 Perl__warn_problematic_locale()
462 #ifdef USE_LOCALE_CTYPE
466 /* Internal-to-core function that outputs the message in PL_warn_locale,
467 * and then NULLS it. Should be called only through the macro
468 * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */
470 if (PL_warn_locale) {
471 /*GCC_DIAG_IGNORE(-Wformat-security); Didn't work */
472 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
473 SvPVX(PL_warn_locale),
474 0 /* dummy to avoid compiler warning */ );
475 /* GCC_DIAG_RESTORE; */
476 SvREFCNT_dec_NN(PL_warn_locale);
477 PL_warn_locale = NULL;
485 S_new_collate(pTHX_ const char *newcoll)
487 #ifdef USE_LOCALE_COLLATE
489 /* Called after all libc setlocale() calls affecting LC_COLLATE, to tell
490 * core Perl this and that 'newcoll' is the name of the new locale.
492 * Any code changing the locale (outside this file) should use
493 * POSIX::setlocale, which calls this function. Therefore this function
494 * should be called directly only from this file and from
495 * POSIX::setlocale().
497 * The design of locale collation is that every locale change is given an
498 * index 'PL_collation_ix'. The first time a string particpates in an
499 * operation that requires collation while locale collation is active, it
500 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
501 * magic includes the collation index, and the transformation of the string
502 * by strxfrm(), q.v. That transformation is used when doing comparisons,
503 * instead of the string itself. If a string changes, the magic is
504 * cleared. The next time the locale changes, the index is incremented,
505 * and so we know during a comparison that the transformation is not
506 * necessarily still valid, and so is recomputed. Note that if the locale
507 * changes enough times, the index could wrap (a U32), and it is possible
508 * that a transformation would improperly be considered valid, leading to
512 if (PL_collation_name) {
514 Safefree(PL_collation_name);
515 PL_collation_name = NULL;
517 PL_collation_standard = TRUE;
518 is_standard_collation:
519 PL_collxfrm_base = 0;
520 PL_collxfrm_mult = 2;
521 PL_in_utf8_COLLATE_locale = FALSE;
522 PL_strxfrm_NUL_replacement = '\0';
523 PL_strxfrm_max_cp = 0;
527 /* If this is not the same locale as currently, set the new one up */
528 if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
530 Safefree(PL_collation_name);
531 PL_collation_name = stdize_locale(savepv(newcoll));
532 PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
533 if (PL_collation_standard) {
534 goto is_standard_collation;
537 PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
538 PL_strxfrm_NUL_replacement = '\0';
539 PL_strxfrm_max_cp = 0;
541 /* A locale collation definition includes primary, secondary, tertiary,
542 * etc. weights for each character. To sort, the primary weights are
543 * used, and only if they compare equal, then the secondary weights are
544 * used, and only if they compare equal, then the tertiary, etc.
546 * strxfrm() works by taking the input string, say ABC, and creating an
547 * output transformed string consisting of first the primary weights,
548 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
549 * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
550 * may not have weights at every level. In our example, let's say B
551 * doesn't have a tertiary weight, and A doesn't have a secondary
552 * weight. The constructed string is then going to be
553 * A¹B¹C¹ B²C² A³C³ ....
554 * This has the desired effect that strcmp() will look at the secondary
555 * or tertiary weights only if the strings compare equal at all higher
556 * priority weights. The spaces shown here, like in
558 * are not just for readability. In the general case, these must
559 * actually be bytes, which we will call here 'separator weights'; and
560 * they must be smaller than any other weight value, but since these
561 * are C strings, only the terminating one can be a NUL (some
562 * implementations may include a non-NUL separator weight just before
563 * the NUL). Implementations tend to reserve 01 for the separator
564 * weights. They are needed so that a shorter string's secondary
565 * weights won't be misconstrued as primary weights of a longer string,
566 * etc. By making them smaller than any other weight, the shorter
567 * string will sort first. (Actually, if all secondary weights are
568 * smaller than all primary ones, there is no need for a separator
569 * weight between those two levels, etc.)
571 * The length of the transformed string is roughly a linear function of
572 * the input string. It's not exactly linear because some characters
573 * don't have weights at all levels. When we call strxfrm() we have to
574 * allocate some memory to hold the transformed string. The
575 * calculations below try to find coefficients 'm' and 'b' for this
576 * locale so that m*x + b equals how much space we need, given the size
577 * of the input string in 'x'. If we calculate too small, we increase
578 * the size as needed, and call strxfrm() again, but it is better to
579 * get it right the first time to avoid wasted expensive string
580 * transformations. */
583 /* We use the string below to find how long the tranformation of it
584 * is. Almost all locales are supersets of ASCII, or at least the
585 * ASCII letters. We use all of them, half upper half lower,
586 * because if we used fewer, we might hit just the ones that are
587 * outliers in a particular locale. Most of the strings being
588 * collated will contain a preponderance of letters, and even if
589 * they are above-ASCII, they are likely to have the same number of
590 * weight levels as the ASCII ones. It turns out that digits tend
591 * to have fewer levels, and some punctuation has more, but those
592 * are relatively sparse in text, and khw believes this gives a
593 * reasonable result, but it could be changed if experience so
595 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
596 char * x_longer; /* Transformed 'longer' */
597 Size_t x_len_longer; /* Length of 'x_longer' */
599 char * x_shorter; /* We also transform a substring of 'longer' */
600 Size_t x_len_shorter;
602 /* _mem_collxfrm() is used get the transformation (though here we
603 * are interested only in its length). It is used because it has
604 * the intelligence to handle all cases, but to work, it needs some
605 * values of 'm' and 'b' to get it started. For the purposes of
606 * this calculation we use a very conservative estimate of 'm' and
607 * 'b'. This assumes a weight can be multiple bytes, enough to
608 * hold any UV on the platform, and there are 5 levels, 4 weight
609 * bytes, and a trailing NUL. */
610 PL_collxfrm_base = 5;
611 PL_collxfrm_mult = 5 * sizeof(UV);
613 /* Find out how long the transformation really is */
614 x_longer = _mem_collxfrm(longer,
618 /* We avoid converting to UTF-8 in the
619 * called function by telling it the
620 * string is in UTF-8 if the locale is a
621 * UTF-8 one. Since the string passed
622 * here is invariant under UTF-8, we can
623 * claim it's UTF-8 even though it isn't.
625 PL_in_utf8_COLLATE_locale);
628 /* Find out how long the transformation of a substring of 'longer'
629 * is. Together the lengths of these transformations are
630 * sufficient to calculate 'm' and 'b'. The substring is all of
631 * 'longer' except the first character. This minimizes the chances
632 * of being swayed by outliers */
633 x_shorter = _mem_collxfrm(longer + 1,
636 PL_in_utf8_COLLATE_locale);
639 /* If the results are nonsensical for this simple test, the whole
640 * locale definition is suspect. Mark it so that locale collation
641 * is not active at all for it. XXX Should we warn? */
642 if ( x_len_shorter == 0
644 || x_len_shorter >= x_len_longer)
646 PL_collxfrm_mult = 0;
647 PL_collxfrm_base = 0;
650 SSize_t base; /* Temporary */
652 /* We have both: m * strlen(longer) + b = x_len_longer
653 * m * strlen(shorter) + b = x_len_shorter;
654 * subtracting yields:
655 * m * (strlen(longer) - strlen(shorter))
656 * = x_len_longer - x_len_shorter
657 * But we have set things up so that 'shorter' is 1 byte smaller
658 * than 'longer'. Hence:
659 * m = x_len_longer - x_len_shorter
661 * But if something went wrong, make sure the multiplier is at
664 if (x_len_longer > x_len_shorter) {
665 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
668 PL_collxfrm_mult = 1;
673 * but in case something has gone wrong, make sure it is
675 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
680 /* Add 1 for the trailing NUL */
681 PL_collxfrm_base = base + 1;
685 if (DEBUG_L_TEST || debug_initialization) {
686 PerlIO_printf(Perl_debug_log,
687 "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
689 " collate multipler=%zu, collate base=%zu\n",
691 PL_in_utf8_COLLATE_locale,
692 x_len_shorter, x_len_longer,
693 PL_collxfrm_mult, PL_collxfrm_base);
700 PERL_UNUSED_ARG(newcoll);
701 #endif /* USE_LOCALE_COLLATE */
704 #ifndef WIN32 /* No wrapper except on Windows */
706 #define my_setlocale(a,b) setlocale(a,b)
711 S_my_setlocale(pTHX_ int category, const char* locale)
713 /* This, for Windows, emulates POSIX setlocale() behavior. There is no
714 * difference unless the input locale is "", which means on Windows to get
715 * the machine default, which is set via the computer's "Regional and
716 * Language Options" (or its current equivalent). In POSIX, it instead
717 * means to find the locale from the user's environment. This routine
718 * looks in the environment, and, if anything is found, uses that instead
719 * of going to the machine default. If there is no environment override,
720 * the machine default is used, as normal, by calling the real setlocale()
721 * with "". The POSIX behavior is to use the LC_ALL variable if set;
722 * otherwise to use the particular category's variable if set; otherwise to
723 * use the LANG variable. */
725 bool override_LC_ALL = FALSE;
728 if (locale && strEQ(locale, "")) {
730 locale = PerlEnv_getenv("LC_ALL");
736 override_LC_ALL = TRUE;
737 break; /* We already know its variable isn't set */
739 # ifdef USE_LOCALE_TIME
741 locale = PerlEnv_getenv("LC_TIME");
744 # ifdef USE_LOCALE_CTYPE
746 locale = PerlEnv_getenv("LC_CTYPE");
749 # ifdef USE_LOCALE_COLLATE
751 locale = PerlEnv_getenv("LC_COLLATE");
754 # ifdef USE_LOCALE_MONETARY
756 locale = PerlEnv_getenv("LC_MONETARY");
759 # ifdef USE_LOCALE_NUMERIC
761 locale = PerlEnv_getenv("LC_NUMERIC");
764 # ifdef USE_LOCALE_MESSAGES
766 locale = PerlEnv_getenv("LC_MESSAGES");
770 /* This is a category, like PAPER_SIZE that we don't
771 * know about; and so can't provide a wrapper. */
775 locale = PerlEnv_getenv("LANG");
785 result = setlocale(category, locale);
786 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
787 setlocale_debug_string(category, locale, result)));
789 if (! override_LC_ALL) {
793 /* Here the input category was LC_ALL, and we have set it to what is in the
794 * LANG variable or the system default if there is no LANG. But these have
795 * lower priority than the other LC_foo variables, so override it for each
796 * one that is set. (If they are set to "", it means to use the same thing
797 * we just set LC_ALL to, so can skip) */
798 # ifdef USE_LOCALE_TIME
799 result = PerlEnv_getenv("LC_TIME");
800 if (result && strNE(result, "")) {
801 setlocale(LC_TIME, result);
802 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
804 setlocale_debug_string(LC_TIME, result, "not captured")));
807 # ifdef USE_LOCALE_CTYPE
808 result = PerlEnv_getenv("LC_CTYPE");
809 if (result && strNE(result, "")) {
810 setlocale(LC_CTYPE, result);
811 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
813 setlocale_debug_string(LC_CTYPE, result, "not captured")));
816 # ifdef USE_LOCALE_COLLATE
817 result = PerlEnv_getenv("LC_COLLATE");
818 if (result && strNE(result, "")) {
819 setlocale(LC_COLLATE, result);
820 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
822 setlocale_debug_string(LC_COLLATE, result, "not captured")));
825 # ifdef USE_LOCALE_MONETARY
826 result = PerlEnv_getenv("LC_MONETARY");
827 if (result && strNE(result, "")) {
828 setlocale(LC_MONETARY, result);
829 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
831 setlocale_debug_string(LC_MONETARY, result, "not captured")));
834 # ifdef USE_LOCALE_NUMERIC
835 result = PerlEnv_getenv("LC_NUMERIC");
836 if (result && strNE(result, "")) {
837 setlocale(LC_NUMERIC, result);
838 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
840 setlocale_debug_string(LC_NUMERIC, result, "not captured")));
843 # ifdef USE_LOCALE_MESSAGES
844 result = PerlEnv_getenv("LC_MESSAGES");
845 if (result && strNE(result, "")) {
846 setlocale(LC_MESSAGES, result);
847 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
849 setlocale_debug_string(LC_MESSAGES, result, "not captured")));
853 result = setlocale(LC_ALL, NULL);
854 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
856 setlocale_debug_string(LC_ALL, NULL, result)));
864 Perl_setlocale(int category, const char * locale)
866 /* This wraps POSIX::setlocale() */
872 #ifdef USE_LOCALE_NUMERIC
874 /* A NULL locale means only query what the current one is. We
875 * have the LC_NUMERIC name saved, because we are normally switched
876 * into the C locale for it. Switch back so an LC_ALL query will yield
877 * the correct results; all other categories don't require special
879 if (locale == NULL) {
880 if (category == LC_NUMERIC) {
881 return savepv(PL_numeric_name);
886 else if (category == LC_ALL) {
887 SET_NUMERIC_UNDERLYING();
896 retval = my_setlocale(category, locale);
898 DEBUG_L(PerlIO_printf(Perl_debug_log,
899 "%s:%d: %s\n", __FILE__, __LINE__,
900 setlocale_debug_string(category, locale, retval)));
902 /* Should never happen that a query would return an error, but be
903 * sure and reset to C locale */
905 SET_NUMERIC_STANDARD();
910 /* Save retval since subsequent setlocale() calls may overwrite it. */
911 retval = savepv(retval);
913 /* If locale == NULL, we are just querying the state, but may have switched
914 * to NUMERIC_UNDERLYING. Switch back before returning. */
915 if (locale == NULL) {
916 SET_NUMERIC_STANDARD();
919 else { /* Now that have switched locales, we have to update our records to
922 #ifdef USE_LOCALE_CTYPE
924 if ( category == LC_CTYPE
928 || category == LC_ALL
938 if (category == LC_ALL) {
939 newctype = setlocale(LC_CTYPE, NULL);
940 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
941 "%s:%d: %s\n", __FILE__, __LINE__,
942 setlocale_debug_string(LC_CTYPE, NULL, newctype)));
952 #endif /* USE_LOCALE_CTYPE */
954 #ifdef USE_LOCALE_COLLATE
956 if ( category == LC_COLLATE
960 || category == LC_ALL
970 if (category == LC_ALL) {
971 newcoll = setlocale(LC_COLLATE, NULL);
972 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
973 "%s:%d: %s\n", __FILE__, __LINE__,
974 setlocale_debug_string(LC_COLLATE, NULL, newcoll)));
981 new_collate(newcoll);
984 #endif /* USE_LOCALE_COLLATE */
986 #ifdef USE_LOCALE_NUMERIC
988 if ( category == LC_NUMERIC
992 || category == LC_ALL
1002 if (category == LC_ALL) {
1003 newnum = setlocale(LC_NUMERIC, NULL);
1004 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1005 "%s:%d: %s\n", __FILE__, __LINE__,
1006 setlocale_debug_string(LC_NUMERIC, NULL, newnum)));
1013 new_numeric(newnum);
1016 #endif /* USE_LOCALE_NUMERIC */
1025 PERL_STATIC_INLINE const char *
1026 S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset)
1028 /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size',
1029 * growing it if necessary */
1031 const Size_t string_size = strlen(string) + offset + 1;
1033 PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
1035 if (*buf_size == 0) {
1036 Newx(*buf, string_size, char);
1037 *buf_size = string_size;
1039 else if (string_size > *buf_size) {
1040 Renew(*buf, string_size, char);
1041 *buf_size = string_size;
1044 Copy(string, *buf + offset, string_size - offset, char);
1050 =head1 Locale-related functions and macros
1052 =for apidoc Perl_langinfo
1054 This is an (almostª) drop-in replacement for the system C<L<nl_langinfo(3)>>,
1055 taking the same C<item> parameter values, and returning the same information.
1056 But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
1057 of Perl's locale handling from your code, and can be used on systems that lack
1058 a native C<nl_langinfo>.
1066 It delivers the correct results for the C<RADIXCHAR> and C<THOUSESEP> items,
1067 without you having to write extra code. The reason for the extra code would be
1068 because these are from the C<LC_NUMERIC> locale category, which is normally
1069 kept set to the C locale by Perl, no matter what the underlying locale is
1070 supposed to be, and so to get the expected results, you have to temporarily
1071 toggle into the underlying locale, and later toggle back. (You could use
1072 plain C<nl_langinfo> and C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this
1073 but then you wouldn't get the other advantages of C<Perl_langinfo()>; not
1074 keeping C<LC_NUMERIC> in the C locale would break a lot of CPAN, which is
1075 expecting the radix (decimal point) character to be a dot.)
1079 Depending on C<item>, it works on systems that don't have C<nl_langinfo>, hence
1080 makes your code more portable. Of the fifty-some possible items specified by
1081 the POSIX 2008 standard,
1082 L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
1083 only two are completely unimplemented. It uses various techniques to recover
1084 the other items, including calling C<L<localeconv(3)>>, and C<L<strftime(3)>>,
1085 both of which are specified in C89, so should be always be available. Later
1086 C<strftime()> versions have additional capabilities; C<""> is returned for
1087 those not available on your system.
1089 The details for those items which may differ from what this emulation returns
1090 and what a native C<nl_langinfo()> would return are:
1098 Unimplemented, so returns C<"">.
1104 Only the values for English are returned. Earlier POSIX standards also
1105 specified C<YESSTR> and C<NOSTR>, but these have been removed from POSIX 2008,
1106 and aren't supported by C<Perl_langinfo>.
1110 Always evaluates to C<%x>, the locale's appropriate date representation.
1114 Always evaluates to C<%X>, the locale's appropriate time representation.
1118 Always evaluates to C<%c>, the locale's appropriate date and time
1123 The return may be incorrect for those rare locales where the currency symbol
1124 replaces the radix character.
1125 Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1126 to work differently.
1130 Currently this gives the same results as Linux does.
1131 Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1132 to work differently.
1138 =item C<ERA_D_T_FMT>
1142 These are derived by using C<strftime()>, and not all versions of that function
1143 know about them. C<""> is returned for these on such systems.
1147 When using C<Perl_langinfo> on systems that don't have a native
1148 C<nl_langinfo()>, you must
1150 #include "perl_langinfo.h"
1152 before the C<perl.h> C<#include>. You can replace your C<langinfo.h>
1153 C<#include> with this one. (Doing it this way keeps out the symbols that plain
1154 C<langinfo.h> imports into the namespace for code that doesn't need it.)
1156 You also should not use the bare C<langinfo.h> item names, but should preface
1157 them with C<PERL_>, so use C<PERL_RADIXCHAR> instead of plain C<RADIXCHAR>.
1158 The C<PERL_I<foo>> versions will also work for this function on systems that do
1159 have a native C<nl_langinfo>.
1163 It is thread-friendly, returning its result in a buffer that won't be
1164 overwritten by another thread, so you don't have to code for that possibility.
1165 The buffer can be overwritten by the next call to C<nl_langinfo> or
1166 C<Perl_langinfo> in the same thread.
1170 ªIt returns S<C<const char *>>, whereas plain C<nl_langinfo()> returns S<C<char
1171 *>>, but you are (only by documentation) forbidden to write into the buffer.
1172 By declaring this C<const>, the compiler enforces this restriction. The extra
1173 C<const> is why this isn't an unequivocal drop-in replacement for
1178 The original impetus for C<Perl_langinfo()> was so that code that needs to
1179 find out the current currency symbol, floating point radix character, or digit
1180 grouping separator can use, on all systems, the simpler and more
1181 thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
1182 pain to make thread-friendly. For other fields returned by C<localeconv>, it
1183 is better to use the methods given in L<perlcall> to call
1184 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
1191 #ifdef HAS_NL_LANGINFO
1192 Perl_langinfo(const nl_item item)
1194 Perl_langinfo(const int item)
1200 #if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
1201 #if ! defined(HAS_POSIX_2008_LOCALE)
1203 /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
1204 * for those items dependent on it. This must be copied to a buffer before
1205 * switching back, as some systems destroy the buffer when setlocale() is
1211 if (item == PERL_RADIXCHAR || item == PERL_THOUSEP) {
1212 setlocale(LC_NUMERIC, PL_numeric_name);
1219 save_to_buffer(nl_langinfo(item), &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1222 setlocale(LC_NUMERIC, "C");
1227 return PL_langinfo_buf;
1229 # else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */
1231 bool do_free = FALSE;
1232 locale_t cur = uselocale((locale_t) 0);
1234 if (cur == LC_GLOBAL_LOCALE) {
1235 cur = duplocale(LC_GLOBAL_LOCALE);
1240 && (item == PERL_RADIXCHAR || item == PERL_THOUSEP))
1242 cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
1246 save_to_buffer(nl_langinfo_l(item, cur),
1247 &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1252 return PL_langinfo_buf;
1255 #else /* Below, emulate nl_langinfo as best we can */
1256 # ifdef HAS_LOCALECONV
1258 const struct lconv* lc;
1261 # ifdef HAS_STRFTIME
1264 bool return_format = FALSE; /* Return the %format, not the value */
1265 const char * format;
1269 /* We copy the results to a per-thread buffer, even if not multi-threaded.
1270 * This is in part to simplify this code, and partly because we need a
1271 * buffer anyway for strftime(), and partly because a call of localeconv()
1272 * could otherwise wipe out the buffer, and the programmer would not be
1273 * expecting this, as this is a nl_langinfo() substitute after all, so s/he
1274 * might be thinking their localeconv() is safe until another localeconv()
1279 const char * retval;
1281 /* These 2 are unimplemented */
1283 case PERL_ERA: /* For use with strftime() %E modifier */
1288 /* We use only an English set, since we don't know any more */
1289 case PERL_YESEXPR: return "^[+1yY]";
1290 case PERL_NOEXPR: return "^[-0nN]";
1292 # ifdef HAS_LOCALECONV
1299 if (! lc || ! lc->currency_symbol || strEQ("", lc->currency_symbol))
1305 /* Leave the first spot empty to be filled in below */
1306 save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
1307 &PL_langinfo_bufsize, 1);
1308 if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
1309 { /* khw couldn't figure out how the localedef specifications
1310 would show that the $ should replace the radix; this is
1311 just a guess as to how it might work.*/
1312 *PL_langinfo_buf = '.';
1314 else if (lc->p_cs_precedes) {
1315 *PL_langinfo_buf = '-';
1318 *PL_langinfo_buf = '+';
1324 case PERL_RADIXCHAR:
1330 setlocale(LC_NUMERIC, PL_numeric_name);
1337 else switch (item) {
1338 case PERL_RADIXCHAR:
1339 if (! lc->decimal_point) {
1343 retval = lc->decimal_point;
1348 if (! lc->thousands_sep || strEQ("", lc->thousands_sep)) {
1352 retval = lc->thousands_sep;
1358 Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
1359 __FILE__, __LINE__, item);
1362 save_to_buffer(retval, &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1365 setlocale(LC_NUMERIC, "C");
1373 # ifdef HAS_STRFTIME
1375 /* These are defined by C89, so we assume that strftime supports them,
1376 * and so are returned unconditionally; they may not be what the locale
1377 * actually says, but should give good enough results for someone using
1378 * them as formats (as opposed to trying to parse them to figure out
1379 * what the locale says). The other format ones are actually tested to
1380 * verify they work on the platform */
1381 case PERL_D_FMT: return "%x";
1382 case PERL_T_FMT: return "%X";
1383 case PERL_D_T_FMT: return "%c";
1385 /* These formats are only available in later strfmtime's */
1386 case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT:
1387 case PERL_T_FMT_AMPM:
1389 /* The rest can be gotten from most versions of strftime(). */
1390 case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3:
1391 case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6:
1393 case PERL_ALT_DIGITS:
1394 case PERL_AM_STR: case PERL_PM_STR:
1395 case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3:
1396 case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6:
1397 case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9:
1398 case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12:
1399 case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4:
1400 case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7:
1401 case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4:
1402 case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8:
1403 case PERL_MON_9: case PERL_MON_10: case PERL_MON_11: case PERL_MON_12:
1407 init_tm(&tm); /* Precaution against core dumps */
1411 tm.tm_year = 2017 - 1900;
1417 Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
1418 __FILE__, __LINE__, item);
1419 NOT_REACHED; /* NOTREACHED */
1421 case PERL_PM_STR: tm.tm_hour = 18;
1426 case PERL_ABDAY_7: tm.tm_wday++;
1427 case PERL_ABDAY_6: tm.tm_wday++;
1428 case PERL_ABDAY_5: tm.tm_wday++;
1429 case PERL_ABDAY_4: tm.tm_wday++;
1430 case PERL_ABDAY_3: tm.tm_wday++;
1431 case PERL_ABDAY_2: tm.tm_wday++;
1436 case PERL_DAY_7: tm.tm_wday++;
1437 case PERL_DAY_6: tm.tm_wday++;
1438 case PERL_DAY_5: tm.tm_wday++;
1439 case PERL_DAY_4: tm.tm_wday++;
1440 case PERL_DAY_3: tm.tm_wday++;
1441 case PERL_DAY_2: tm.tm_wday++;
1446 case PERL_ABMON_12: tm.tm_mon++;
1447 case PERL_ABMON_11: tm.tm_mon++;
1448 case PERL_ABMON_10: tm.tm_mon++;
1449 case PERL_ABMON_9: tm.tm_mon++;
1450 case PERL_ABMON_8: tm.tm_mon++;
1451 case PERL_ABMON_7: tm.tm_mon++;
1452 case PERL_ABMON_6: tm.tm_mon++;
1453 case PERL_ABMON_5: tm.tm_mon++;
1454 case PERL_ABMON_4: tm.tm_mon++;
1455 case PERL_ABMON_3: tm.tm_mon++;
1456 case PERL_ABMON_2: tm.tm_mon++;
1461 case PERL_MON_12: tm.tm_mon++;
1462 case PERL_MON_11: tm.tm_mon++;
1463 case PERL_MON_10: tm.tm_mon++;
1464 case PERL_MON_9: tm.tm_mon++;
1465 case PERL_MON_8: tm.tm_mon++;
1466 case PERL_MON_7: tm.tm_mon++;
1467 case PERL_MON_6: tm.tm_mon++;
1468 case PERL_MON_5: tm.tm_mon++;
1469 case PERL_MON_4: tm.tm_mon++;
1470 case PERL_MON_3: tm.tm_mon++;
1471 case PERL_MON_2: tm.tm_mon++;
1476 case PERL_T_FMT_AMPM:
1478 return_format = TRUE;
1481 case PERL_ERA_D_FMT:
1483 return_format = TRUE;
1486 case PERL_ERA_T_FMT:
1488 return_format = TRUE;
1491 case PERL_ERA_D_T_FMT:
1493 return_format = TRUE;
1496 case PERL_ALT_DIGITS:
1498 format = "%Ow"; /* Find the alternate digit for 0 */
1502 /* We can't use my_strftime() because it doesn't look at tm_wday */
1503 while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
1506 /* A zero return means one of:
1507 * a) there wasn't enough space in PL_langinfo_buf
1508 * b) the format, like a plain %p, returns empty
1509 * c) it was an illegal format, though some implementations of
1510 * strftime will just return the illegal format as a plain
1511 * character sequence.
1513 * To quickly test for case 'b)', try again but precede the
1514 * format with a plain character. If that result is still
1515 * empty, the problem is either 'a)' or 'c)' */
1517 Size_t format_size = strlen(format) + 1;
1518 Size_t mod_size = format_size + 1;
1522 Newx(mod_format, mod_size, char);
1523 Newx(temp_result, PL_langinfo_bufsize, char);
1525 my_strlcpy(mod_format + 1, format, mod_size);
1526 len = strftime(temp_result,
1527 PL_langinfo_bufsize,
1529 Safefree(mod_format);
1530 Safefree(temp_result);
1532 /* If 'len' is non-zero, it means that we had a case like %p
1533 * which means the current locale doesn't use a.m. or p.m., and
1537 /* Here, still didn't work. If we get well beyond a
1538 * reasonable size, bail out to prevent an infinite loop. */
1540 if (PL_langinfo_bufsize > 100 * format_size) {
1541 *PL_langinfo_buf = '\0';
1543 else { /* Double the buffer size to retry; Add 1 in case
1544 original was 0, so we aren't stuck at 0. */
1545 PL_langinfo_bufsize *= 2;
1546 PL_langinfo_bufsize++;
1547 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
1555 /* Here, we got a result.
1557 * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
1558 * alternate format for wday 0. If the value is the same as the
1559 * normal 0, there isn't an alternate, so clear the buffer. */
1560 if ( item == PERL_ALT_DIGITS
1561 && strEQ(PL_langinfo_buf, "0"))
1563 *PL_langinfo_buf = '\0';
1566 /* ALT_DIGITS is problematic. Experiments on it showed that
1567 * strftime() did not always work properly when going from alt-9 to
1568 * alt-10. Only a few locales have this item defined, and in all
1569 * of them on Linux that khw was able to find, nl_langinfo() merely
1570 * returned the alt-0 character, possibly doubled. Most Unicode
1571 * digits are in blocks of 10 consecutive code points, so that is
1572 * sufficient information for those scripts, as we can infer alt-1,
1573 * alt-2, .... But for a Japanese locale, a CJK ideographic 0 is
1574 * returned, and the CJK digits are not in code point order, so you
1575 * can't really infer anything. The localedef for this locale did
1576 * specify the succeeding digits, so that strftime() works properly
1577 * on them, without needing to infer anything. But the
1578 * nl_langinfo() return did not give sufficient information for the
1579 * caller to understand what's going on. So until there is
1580 * evidence that it should work differently, this returns the alt-0
1581 * string for ALT_DIGITS.
1583 * wday was chosen because its range is all a single digit. Things
1584 * like tm_sec have two digits as the minimum: '00' */
1588 /* If to return the format, not the value, overwrite the buffer
1589 * with it. But some strftime()s will keep the original format if
1590 * illegal, so change those to "" */
1591 if (return_format) {
1592 if (strEQ(PL_langinfo_buf, format)) {
1593 *PL_langinfo_buf = '\0';
1596 save_to_buffer(format, &PL_langinfo_buf,
1597 &PL_langinfo_bufsize, 0);
1607 return PL_langinfo_buf;
1614 * Initialize locale awareness.
1617 Perl_init_i18nl10n(pTHX_ int printwarn)
1621 * 0 if not to output warning when setup locale is bad
1622 * 1 if to output warning based on value of PERL_BADLANG
1623 * >1 if to output regardless of PERL_BADLANG
1626 * 1 = set ok or not applicable,
1627 * 0 = fallback to a locale of lower priority
1628 * -1 = fallback to all locales failed, not even to the C locale
1630 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
1631 * set, debugging information is output.
1633 * This looks more complicated than it is, mainly due to the #ifdefs.
1635 * We try to set LC_ALL to the value determined by the environment. If
1636 * there is no LC_ALL on this platform, we try the individual categories we
1637 * know about. If this works, we are done.
1639 * But if it doesn't work, we have to do something else. We search the
1640 * environment variables ourselves instead of relying on the system to do
1641 * it. We look at, in order, LC_ALL, LANG, a system default locale (if we
1642 * think there is one), and the ultimate fallback "C". This is all done in
1643 * the same loop as above to avoid duplicating code, but it makes things
1644 * more complex. After the original failure, we add the fallback
1645 * possibilities to the list of locales to try, and iterate the loop
1646 * through them all until one succeeds.
1648 * On Ultrix, the locale MUST come from the environment, so there is
1649 * preliminary code to set it. I (khw) am not sure that it is necessary,
1650 * and that this couldn't be folded into the loop, but barring any real
1651 * platforms to test on, it's staying as-is
1653 * A slight complication is that in embedded Perls, the locale may already
1654 * be set-up, and we don't want to get it from the normal environment
1655 * variables. This is handled by having a special environment variable
1656 * indicate we're in this situation. We simply set setlocale's 2nd
1657 * parameter to be a NULL instead of "". That indicates to setlocale that
1658 * it is not to change anything, but to return the current value,
1659 * effectively initializing perl's db to what the locale already is.
1661 * We play the same trick with NULL if a LC_ALL succeeds. We call
1662 * setlocale() on the individual categores with NULL to get their existing
1663 * values for our db, instead of trying to change them.
1668 #if defined(USE_LOCALE)
1669 #ifdef USE_LOCALE_CTYPE
1670 char *curctype = NULL;
1671 #endif /* USE_LOCALE_CTYPE */
1672 #ifdef USE_LOCALE_COLLATE
1673 char *curcoll = NULL;
1674 #endif /* USE_LOCALE_COLLATE */
1675 #ifdef USE_LOCALE_NUMERIC
1676 char *curnum = NULL;
1677 #endif /* USE_LOCALE_NUMERIC */
1679 const char * const language = savepv(PerlEnv_getenv("LANGUAGE"));
1682 /* NULL uses the existing already set up locale */
1683 const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
1686 const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */
1687 unsigned int trial_locales_count;
1688 const char * const lc_all = savepv(PerlEnv_getenv("LC_ALL"));
1689 const char * const lang = savepv(PerlEnv_getenv("LANG"));
1690 bool setlocale_failure = FALSE;
1694 /* A later getenv() could zap this, so only use here */
1695 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
1697 const bool locwarn = (printwarn > 1
1699 && (! bad_lang_use_once
1701 /* disallow with "" or "0" */
1703 && strNE("0", bad_lang_use_once)))));
1705 char * sl_result; /* return from setlocale() */
1706 char * locale_param;
1708 /* In some systems you can find out the system default locale
1709 * and use that as the fallback locale. */
1710 # define SYSTEM_DEFAULT_LOCALE
1712 #ifdef SYSTEM_DEFAULT_LOCALE
1713 const char *system_default_locale = NULL;
1717 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
1718 # define DEBUG_LOCALE_INIT(category, locale, result) \
1720 if (debug_initialization) { \
1721 PerlIO_printf(Perl_debug_log, \
1723 __FILE__, __LINE__, \
1724 setlocale_debug_string(category, \
1730 # define DEBUG_LOCALE_INIT(a,b,c)
1733 #ifndef LOCALE_ENVIRON_REQUIRED
1734 PERL_UNUSED_VAR(done);
1735 PERL_UNUSED_VAR(locale_param);
1739 * Ultrix setlocale(..., "") fails if there are no environment
1740 * variables from which to get a locale name.
1745 sl_result = my_setlocale(LC_ALL, setlocale_init);
1746 DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result);
1750 setlocale_failure = TRUE;
1752 if (! setlocale_failure) {
1753 # ifdef USE_LOCALE_CTYPE
1754 locale_param = (! done && (lang || PerlEnv_getenv("LC_CTYPE")))
1757 curctype = my_setlocale(LC_CTYPE, locale_param);
1758 DEBUG_LOCALE_INIT(LC_CTYPE, locale_param, sl_result);
1760 setlocale_failure = TRUE;
1762 curctype = savepv(curctype);
1763 # endif /* USE_LOCALE_CTYPE */
1764 # ifdef USE_LOCALE_COLLATE
1765 locale_param = (! done && (lang || PerlEnv_getenv("LC_COLLATE")))
1768 curcoll = my_setlocale(LC_COLLATE, locale_param);
1769 DEBUG_LOCALE_INIT(LC_COLLATE, locale_param, sl_result);
1771 setlocale_failure = TRUE;
1773 curcoll = savepv(curcoll);
1774 # endif /* USE_LOCALE_COLLATE */
1775 # ifdef USE_LOCALE_NUMERIC
1776 locale_param = (! done && (lang || PerlEnv_getenv("LC_NUMERIC")))
1779 curnum = my_setlocale(LC_NUMERIC, locale_param);
1780 DEBUG_LOCALE_INIT(LC_NUMERIC, locale_param, sl_result);
1782 setlocale_failure = TRUE;
1784 curnum = savepv(curnum);
1785 # endif /* USE_LOCALE_NUMERIC */
1786 # ifdef USE_LOCALE_MESSAGES
1787 locale_param = (! done && (lang || PerlEnv_getenv("LC_MESSAGES")))
1790 sl_result = my_setlocale(LC_MESSAGES, locale_param);
1791 DEBUG_LOCALE_INIT(LC_MESSAGES, locale_param, sl_result);
1793 setlocale_failure = TRUE;
1795 # endif /* USE_LOCALE_MESSAGES */
1796 # ifdef USE_LOCALE_MONETARY
1797 locale_param = (! done && (lang || PerlEnv_getenv("LC_MONETARY")))
1800 sl_result = my_setlocale(LC_MONETARY, locale_param);
1801 DEBUG_LOCALE_INIT(LC_MONETARY, locale_param, sl_result);
1803 setlocale_failure = TRUE;
1805 # endif /* USE_LOCALE_MONETARY */
1808 # endif /* LC_ALL */
1810 #endif /* !LOCALE_ENVIRON_REQUIRED */
1812 /* We try each locale in the list until we get one that works, or exhaust
1813 * the list. Normally the loop is executed just once. But if setting the
1814 * locale fails, inside the loop we add fallback trials to the array and so
1815 * will execute the loop multiple times */
1816 trial_locales[0] = setlocale_init;
1817 trial_locales_count = 1;
1818 for (i= 0; i < trial_locales_count; i++) {
1819 const char * trial_locale = trial_locales[i];
1823 /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
1824 * when i==0, but I (khw) don't think that behavior makes much
1826 setlocale_failure = FALSE;
1828 #ifdef SYSTEM_DEFAULT_LOCALE
1830 /* On Windows machines, an entry of "" after the 0th means to use
1831 * the system default locale, which we now proceed to get. */
1832 if (strEQ(trial_locale, "")) {
1835 /* Note that this may change the locale, but we are going to do
1836 * that anyway just below */
1837 system_default_locale = setlocale(LC_ALL, "");
1838 DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
1840 /* Skip if invalid or it's already on the list of locales to
1842 if (! system_default_locale) {
1843 goto next_iteration;
1845 for (j = 0; j < trial_locales_count; j++) {
1846 if (strEQ(system_default_locale, trial_locales[j])) {
1847 goto next_iteration;
1851 trial_locale = system_default_locale;
1854 #endif /* SYSTEM_DEFAULT_LOCALE */
1858 sl_result = my_setlocale(LC_ALL, trial_locale);
1859 DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result);
1861 setlocale_failure = TRUE;
1864 /* Since LC_ALL succeeded, it should have changed all the other
1865 * categories it can to its value; so we massage things so that the
1866 * setlocales below just return their category's current values.
1867 * This adequately handles the case in NetBSD where LC_COLLATE may
1868 * not be defined for a locale, and setting it individually will
1869 * fail, whereas setting LC_ALL suceeds, leaving LC_COLLATE set to
1870 * the POSIX locale. */
1871 trial_locale = NULL;
1875 if (!setlocale_failure) {
1876 #ifdef USE_LOCALE_CTYPE
1878 curctype = my_setlocale(LC_CTYPE, trial_locale);
1879 DEBUG_LOCALE_INIT(LC_CTYPE, trial_locale, curctype);
1881 setlocale_failure = TRUE;
1883 curctype = savepv(curctype);
1884 #endif /* USE_LOCALE_CTYPE */
1885 #ifdef USE_LOCALE_COLLATE
1887 curcoll = my_setlocale(LC_COLLATE, trial_locale);
1888 DEBUG_LOCALE_INIT(LC_COLLATE, trial_locale, curcoll);
1890 setlocale_failure = TRUE;
1892 curcoll = savepv(curcoll);
1893 #endif /* USE_LOCALE_COLLATE */
1894 #ifdef USE_LOCALE_NUMERIC
1896 curnum = my_setlocale(LC_NUMERIC, trial_locale);
1897 DEBUG_LOCALE_INIT(LC_NUMERIC, trial_locale, curnum);
1899 setlocale_failure = TRUE;
1901 curnum = savepv(curnum);
1902 #endif /* USE_LOCALE_NUMERIC */
1903 #ifdef USE_LOCALE_MESSAGES
1904 sl_result = my_setlocale(LC_MESSAGES, trial_locale);
1905 DEBUG_LOCALE_INIT(LC_MESSAGES, trial_locale, sl_result);
1907 setlocale_failure = TRUE;
1908 #endif /* USE_LOCALE_MESSAGES */
1909 #ifdef USE_LOCALE_MONETARY
1910 sl_result = my_setlocale(LC_MONETARY, trial_locale);
1911 DEBUG_LOCALE_INIT(LC_MONETARY, trial_locale, sl_result);
1913 setlocale_failure = TRUE;
1914 #endif /* USE_LOCALE_MONETARY */
1916 if (! setlocale_failure) { /* Success */
1921 /* Here, something failed; will need to try a fallback. */
1927 if (locwarn) { /* Output failure info only on the first one */
1930 PerlIO_printf(Perl_error_log,
1931 "perl: warning: Setting locale failed.\n");
1935 PerlIO_printf(Perl_error_log,
1936 "perl: warning: Setting locale failed for the categories:\n\t");
1937 # ifdef USE_LOCALE_CTYPE
1939 PerlIO_printf(Perl_error_log, "LC_CTYPE ");
1940 # endif /* USE_LOCALE_CTYPE */
1941 # ifdef USE_LOCALE_COLLATE
1943 PerlIO_printf(Perl_error_log, "LC_COLLATE ");
1944 # endif /* USE_LOCALE_COLLATE */
1945 # ifdef USE_LOCALE_NUMERIC
1947 PerlIO_printf(Perl_error_log, "LC_NUMERIC ");
1948 # endif /* USE_LOCALE_NUMERIC */
1949 PerlIO_printf(Perl_error_log, "and possibly others\n");
1953 PerlIO_printf(Perl_error_log,
1954 "perl: warning: Please check that your locale settings:\n");
1957 PerlIO_printf(Perl_error_log,
1958 "\tLANGUAGE = %c%s%c,\n",
1959 language ? '"' : '(',
1960 language ? language : "unset",
1961 language ? '"' : ')');
1964 PerlIO_printf(Perl_error_log,
1965 "\tLC_ALL = %c%s%c,\n",
1967 lc_all ? lc_all : "unset",
1968 lc_all ? '"' : ')');
1970 #if defined(USE_ENVIRON_ARRAY)
1973 for (e = environ; *e; e++) {
1974 if (strEQs(*e, "LC_")
1975 && strNEs(*e, "LC_ALL=")
1976 && (p = strchr(*e, '=')))
1977 PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
1978 (int)(p - *e), *e, p + 1);
1982 PerlIO_printf(Perl_error_log,
1983 "\t(possibly more locale environment variables)\n");
1986 PerlIO_printf(Perl_error_log,
1987 "\tLANG = %c%s%c\n",
1989 lang ? lang : "unset",
1992 PerlIO_printf(Perl_error_log,
1993 " are supported and installed on your system.\n");
1996 /* Calculate what fallback locales to try. We have avoided this
1997 * until we have to, because failure is quite unlikely. This will
1998 * usually change the upper bound of the loop we are in.
2000 * Since the system's default way of setting the locale has not
2001 * found one that works, We use Perl's defined ordering: LC_ALL,
2002 * LANG, and the C locale. We don't try the same locale twice, so
2003 * don't add to the list if already there. (On POSIX systems, the
2004 * LC_ALL element will likely be a repeat of the 0th element "",
2005 * but there's no harm done by doing it explicitly.
2007 * Note that this tries the LC_ALL environment variable even on
2008 * systems which have no LC_ALL locale setting. This may or may
2009 * not have been originally intentional, but there's no real need
2010 * to change the behavior. */
2012 for (j = 0; j < trial_locales_count; j++) {
2013 if (strEQ(lc_all, trial_locales[j])) {
2017 trial_locales[trial_locales_count++] = lc_all;
2022 for (j = 0; j < trial_locales_count; j++) {
2023 if (strEQ(lang, trial_locales[j])) {
2027 trial_locales[trial_locales_count++] = lang;
2031 #if defined(WIN32) && defined(LC_ALL)
2032 /* For Windows, we also try the system default locale before "C".
2033 * (If there exists a Windows without LC_ALL we skip this because
2034 * it gets too complicated. For those, the "C" is the next
2035 * fallback possibility). The "" is the same as the 0th element of
2036 * the array, but the code at the loop above knows to treat it
2037 * differently when not the 0th */
2038 trial_locales[trial_locales_count++] = "";
2041 for (j = 0; j < trial_locales_count; j++) {
2042 if (strEQ("C", trial_locales[j])) {
2046 trial_locales[trial_locales_count++] = "C";
2049 } /* end of first time through the loop */
2055 } /* end of looping through the trial locales */
2057 if (ok < 1) { /* If we tried to fallback */
2059 if (! setlocale_failure) { /* fallback succeeded */
2060 msg = "Falling back to";
2062 else { /* fallback failed */
2064 /* We dropped off the end of the loop, so have to decrement i to
2065 * get back to the value the last time through */
2069 msg = "Failed to fall back to";
2071 /* To continue, we should use whatever values we've got */
2072 #ifdef USE_LOCALE_CTYPE
2074 curctype = savepv(setlocale(LC_CTYPE, NULL));
2075 DEBUG_LOCALE_INIT(LC_CTYPE, NULL, curctype);
2076 #endif /* USE_LOCALE_CTYPE */
2077 #ifdef USE_LOCALE_COLLATE
2079 curcoll = savepv(setlocale(LC_COLLATE, NULL));
2080 DEBUG_LOCALE_INIT(LC_COLLATE, NULL, curcoll);
2081 #endif /* USE_LOCALE_COLLATE */
2082 #ifdef USE_LOCALE_NUMERIC
2084 curnum = savepv(setlocale(LC_NUMERIC, NULL));
2085 DEBUG_LOCALE_INIT(LC_NUMERIC, NULL, curnum);
2086 #endif /* USE_LOCALE_NUMERIC */
2090 const char * description;
2091 const char * name = "";
2092 if (strEQ(trial_locales[i], "C")) {
2093 description = "the standard locale";
2096 #ifdef SYSTEM_DEFAULT_LOCALE
2097 else if (strEQ(trial_locales[i], "")) {
2098 description = "the system default locale";
2099 if (system_default_locale) {
2100 name = system_default_locale;
2103 #endif /* SYSTEM_DEFAULT_LOCALE */
2105 description = "a fallback locale";
2106 name = trial_locales[i];
2108 if (name && strNE(name, "")) {
2109 PerlIO_printf(Perl_error_log,
2110 "perl: warning: %s %s (\"%s\").\n", msg, description, name);
2113 PerlIO_printf(Perl_error_log,
2114 "perl: warning: %s %s.\n", msg, description);
2117 } /* End of tried to fallback */
2119 #ifdef USE_LOCALE_CTYPE
2120 new_ctype(curctype);
2121 #endif /* USE_LOCALE_CTYPE */
2123 #ifdef USE_LOCALE_COLLATE
2124 new_collate(curcoll);
2125 #endif /* USE_LOCALE_COLLATE */
2127 #ifdef USE_LOCALE_NUMERIC
2128 new_numeric(curnum);
2129 #endif /* USE_LOCALE_NUMERIC */
2131 #if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
2132 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
2133 * locale is UTF-8. If PL_utf8locale and PL_unicode (set by -C or by
2134 * $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the
2135 * PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open
2137 PL_utf8locale = _is_cur_LC_category_utf8(LC_CTYPE);
2139 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
2140 This is an alternative to using the -C command line switch
2141 (the -C if present will override this). */
2143 const char *p = PerlEnv_getenv("PERL_UNICODE");
2144 PL_unicode = p ? parse_unicode_opts(&p) : 0;
2145 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
2150 #ifdef USE_LOCALE_CTYPE
2152 #endif /* USE_LOCALE_CTYPE */
2153 #ifdef USE_LOCALE_COLLATE
2155 #endif /* USE_LOCALE_COLLATE */
2156 #ifdef USE_LOCALE_NUMERIC
2158 #endif /* USE_LOCALE_NUMERIC */
2167 #else /* !USE_LOCALE */
2168 PERL_UNUSED_ARG(printwarn);
2169 #endif /* USE_LOCALE */
2172 /* So won't continue to output stuff */
2173 DEBUG_INITIALIZATION_set(FALSE);
2179 #ifdef USE_LOCALE_COLLATE
2182 Perl__mem_collxfrm(pTHX_ const char *input_string,
2183 STRLEN len, /* Length of 'input_string' */
2184 STRLEN *xlen, /* Set to length of returned string
2185 (not including the collation index
2187 bool utf8 /* Is the input in UTF-8? */
2191 /* _mem_collxfrm() is a bit like strxfrm() but with two important
2192 * differences. First, it handles embedded NULs. Second, it allocates a bit
2193 * more memory than needed for the transformed data itself. The real
2194 * transformed data begins at offset COLLXFRM_HDR_LEN. *xlen is set to
2195 * the length of that, and doesn't include the collation index size.
2196 * Please see sv_collxfrm() to see how this is used. */
2198 #define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
2200 char * s = (char *) input_string;
2201 STRLEN s_strlen = strlen(input_string);
2203 STRLEN xAlloc; /* xalloc is a reserved word in VC */
2204 STRLEN length_in_chars;
2205 bool first_time = TRUE; /* Cleared after first loop iteration */
2207 PERL_ARGS_ASSERT__MEM_COLLXFRM;
2209 /* Must be NUL-terminated */
2210 assert(*(input_string + len) == '\0');
2212 /* If this locale has defective collation, skip */
2213 if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
2214 DEBUG_L(PerlIO_printf(Perl_debug_log,
2215 "_mem_collxfrm: locale's collation is defective\n"));
2219 /* Replace any embedded NULs with the control that sorts before any others.
2220 * This will give as good as possible results on strings that don't
2221 * otherwise contain that character, but otherwise there may be
2222 * less-than-perfect results with that character and NUL. This is
2223 * unavoidable unless we replace strxfrm with our own implementation. */
2224 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
2228 STRLEN sans_nuls_len;
2229 int try_non_controls;
2230 char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
2231 making sure 2nd byte is NUL.
2233 STRLEN this_replacement_len;
2235 /* If we don't know what non-NUL control character sorts lowest for
2236 * this locale, find it */
2237 if (PL_strxfrm_NUL_replacement == '\0') {
2239 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
2240 includes the collation index
2243 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
2245 /* Unlikely, but it may be that no control will work to replace
2246 * NUL, in which case we instead look for any character. Controls
2247 * are preferred because collation order is, in general, context
2248 * sensitive, with adjoining characters affecting the order, and
2249 * controls are less likely to have such interactions, allowing the
2250 * NUL-replacement to stand on its own. (Another way to look at it
2251 * is to imagine what would happen if the NUL were replaced by a
2252 * combining character; it wouldn't work out all that well.) */
2253 for (try_non_controls = 0;
2254 try_non_controls < 2;
2257 /* Look through all legal code points (NUL isn't) */
2258 for (j = 1; j < 256; j++) {
2259 char * x; /* j's xfrm plus collation index */
2260 STRLEN x_len; /* length of 'x' */
2261 STRLEN trial_len = 1;
2262 char cur_source[] = { '\0', '\0' };
2264 /* Skip non-controls the first time through the loop. The
2265 * controls in a UTF-8 locale are the L1 ones */
2266 if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
2273 /* Create a 1-char string of the current code point */
2274 cur_source[0] = (char) j;
2276 /* Then transform it */
2277 x = _mem_collxfrm(cur_source, trial_len, &x_len,
2278 0 /* The string is not in UTF-8 */);
2280 /* Ignore any character that didn't successfully transform.
2286 /* If this character's transformation is lower than
2287 * the current lowest, this one becomes the lowest */
2288 if ( cur_min_x == NULL
2289 || strLT(x + COLLXFRM_HDR_LEN,
2290 cur_min_x + COLLXFRM_HDR_LEN))
2292 PL_strxfrm_NUL_replacement = j;
2298 } /* end of loop through all 255 characters */
2300 /* Stop looking if found */
2305 /* Unlikely, but possible, if there aren't any controls that
2306 * work in the locale, repeat the loop, looking for any
2307 * character that works */
2308 DEBUG_L(PerlIO_printf(Perl_debug_log,
2309 "_mem_collxfrm: No control worked. Trying non-controls\n"));
2310 } /* End of loop to try first the controls, then any char */
2313 DEBUG_L(PerlIO_printf(Perl_debug_log,
2314 "_mem_collxfrm: Couldn't find any character to replace"
2315 " embedded NULs in locale %s with", PL_collation_name));
2319 DEBUG_L(PerlIO_printf(Perl_debug_log,
2320 "_mem_collxfrm: Replacing embedded NULs in locale %s with "
2321 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
2323 Safefree(cur_min_x);
2324 } /* End of determining the character that is to replace NULs */
2326 /* If the replacement is variant under UTF-8, it must match the
2327 * UTF8-ness as the original */
2328 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
2329 this_replacement_char[0] =
2330 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
2331 this_replacement_char[1] =
2332 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
2333 this_replacement_len = 2;
2336 this_replacement_char[0] = PL_strxfrm_NUL_replacement;
2337 /* this_replacement_char[1] = '\0' was done at initialization */
2338 this_replacement_len = 1;
2341 /* The worst case length for the replaced string would be if every
2342 * character in it is NUL. Multiply that by the length of each
2343 * replacement, and allow for a trailing NUL */
2344 sans_nuls_len = (len * this_replacement_len) + 1;
2345 Newx(sans_nuls, sans_nuls_len, char);
2348 /* Replace each NUL with the lowest collating control. Loop until have
2349 * exhausted all the NULs */
2350 while (s + s_strlen < e) {
2351 my_strlcat(sans_nuls, s, sans_nuls_len);
2353 /* Do the actual replacement */
2354 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
2356 /* Move past the input NUL */
2358 s_strlen = strlen(s);
2361 /* And add anything that trails the final NUL */
2362 my_strlcat(sans_nuls, s, sans_nuls_len);
2364 /* Switch so below we transform this modified string */
2367 } /* End of replacing NULs */
2369 /* Make sure the UTF8ness of the string and locale match */
2370 if (utf8 != PL_in_utf8_COLLATE_locale) {
2371 const char * const t = s; /* Temporary so we can later find where the
2374 /* Here they don't match. Change the string's to be what the locale is
2377 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
2378 s = (char *) bytes_to_utf8((const U8 *) s, &len);
2381 else { /* locale is not UTF-8; but input is; downgrade the input */
2383 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
2385 /* If the downgrade was successful we are done, but if the input
2386 * contains things that require UTF-8 to represent, have to do
2387 * damage control ... */
2388 if (UNLIKELY(utf8)) {
2390 /* What we do is construct a non-UTF-8 string with
2391 * 1) the characters representable by a single byte converted
2392 * to be so (if necessary);
2393 * 2) and the rest converted to collate the same as the
2394 * highest collating representable character. That makes
2395 * them collate at the end. This is similar to how we
2396 * handle embedded NULs, but we use the highest collating
2397 * code point instead of the smallest. Like the NUL case,
2398 * this isn't perfect, but is the best we can reasonably
2399 * do. Every above-255 code point will sort the same as
2400 * the highest-sorting 0-255 code point. If that code
2401 * point can combine in a sequence with some other code
2402 * points for weight calculations, us changing something to
2403 * be it can adversely affect the results. But in most
2404 * cases, it should work reasonably. And note that this is
2405 * really an illegal situation: using code points above 255
2406 * on a locale where only 0-255 are valid. If two strings
2407 * sort entirely equal, then the sort order for the
2408 * above-255 code points will be in code point order. */
2412 /* If we haven't calculated the code point with the maximum
2413 * collating order for this locale, do so now */
2414 if (! PL_strxfrm_max_cp) {
2417 /* The current transformed string that collates the
2418 * highest (except it also includes the prefixed collation
2420 char * cur_max_x = NULL;
2422 /* Look through all legal code points (NUL isn't) */
2423 for (j = 1; j < 256; j++) {
2426 char cur_source[] = { '\0', '\0' };
2428 /* Create a 1-char string of the current code point */
2429 cur_source[0] = (char) j;
2431 /* Then transform it */
2432 x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
2434 /* If something went wrong (which it shouldn't), just
2435 * ignore this code point */
2440 /* If this character's transformation is higher than
2441 * the current highest, this one becomes the highest */
2442 if ( cur_max_x == NULL
2443 || strGT(x + COLLXFRM_HDR_LEN,
2444 cur_max_x + COLLXFRM_HDR_LEN))
2446 PL_strxfrm_max_cp = j;
2455 DEBUG_L(PerlIO_printf(Perl_debug_log,
2456 "_mem_collxfrm: Couldn't find any character to"
2457 " replace above-Latin1 chars in locale %s with",
2458 PL_collation_name));
2462 DEBUG_L(PerlIO_printf(Perl_debug_log,
2463 "_mem_collxfrm: highest 1-byte collating character"
2464 " in locale %s is 0x%02X\n",
2466 PL_strxfrm_max_cp));
2468 Safefree(cur_max_x);
2471 /* Here we know which legal code point collates the highest.
2472 * We are ready to construct the non-UTF-8 string. The length
2473 * will be at least 1 byte smaller than the input string
2474 * (because we changed at least one 2-byte character into a
2475 * single byte), but that is eaten up by the trailing NUL */
2481 char * e = (char *) t + len;
2483 for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
2485 if (UTF8_IS_INVARIANT(cur_char)) {
2488 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
2489 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
2491 else { /* Replace illegal cp with highest collating
2493 s[d++] = PL_strxfrm_max_cp;
2497 Renew(s, d, char); /* Free up unused space */
2502 /* Here, we have constructed a modified version of the input. It could
2503 * be that we already had a modified copy before we did this version.
2504 * If so, that copy is no longer needed */
2505 if (t != input_string) {
2510 length_in_chars = (utf8)
2511 ? utf8_length((U8 *) s, (U8 *) s + len)
2514 /* The first element in the output is the collation id, used by
2515 * sv_collxfrm(); then comes the space for the transformed string. The
2516 * equation should give us a good estimate as to how much is needed */
2517 xAlloc = COLLXFRM_HDR_LEN
2519 + (PL_collxfrm_mult * length_in_chars);
2520 Newx(xbuf, xAlloc, char);
2521 if (UNLIKELY(! xbuf)) {
2522 DEBUG_L(PerlIO_printf(Perl_debug_log,
2523 "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
2527 /* Store the collation id */
2528 *(U32*)xbuf = PL_collation_ix;
2530 /* Then the transformation of the input. We loop until successful, or we
2534 *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
2536 /* If the transformed string occupies less space than we told strxfrm()
2537 * was available, it means it successfully transformed the whole
2539 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
2541 /* Some systems include a trailing NUL in the returned length.
2542 * Ignore it, using a loop in case multiple trailing NULs are
2545 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
2550 /* If the first try didn't get it, it means our prediction was low.
2551 * Modify the coefficients so that we predict a larger value in any
2552 * future transformations */
2554 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */
2555 STRLEN computed_guess = PL_collxfrm_base
2556 + (PL_collxfrm_mult * length_in_chars);
2558 /* On zero-length input, just keep current slope instead of
2560 const STRLEN new_m = (length_in_chars != 0)
2561 ? needed / length_in_chars
2564 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2565 "%s: %d: initial size of %zu bytes for a length "
2566 "%zu string was insufficient, %zu needed\n",
2568 computed_guess, length_in_chars, needed));
2570 /* If slope increased, use it, but discard this result for
2571 * length 1 strings, as we can't be sure that it's a real slope
2573 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
2575 STRLEN old_m = PL_collxfrm_mult;
2576 STRLEN old_b = PL_collxfrm_base;
2578 PL_collxfrm_mult = new_m;
2579 PL_collxfrm_base = 1; /* +1 For trailing NUL */
2580 computed_guess = PL_collxfrm_base
2581 + (PL_collxfrm_mult * length_in_chars);
2582 if (computed_guess < needed) {
2583 PL_collxfrm_base += needed - computed_guess;
2586 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2587 "%s: %d: slope is now %zu; was %zu, base "
2588 "is now %zu; was %zu\n",
2590 PL_collxfrm_mult, old_m,
2591 PL_collxfrm_base, old_b));
2593 else { /* Slope didn't change, but 'b' did */
2594 const STRLEN new_b = needed
2597 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2598 "%s: %d: base is now %zu; was %zu\n",
2600 new_b, PL_collxfrm_base));
2601 PL_collxfrm_base = new_b;
2608 if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
2609 DEBUG_L(PerlIO_printf(Perl_debug_log,
2610 "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
2611 *xlen, PERL_INT_MAX));
2615 /* A well-behaved strxfrm() returns exactly how much space it needs
2616 * (usually not including the trailing NUL) when it fails due to not
2617 * enough space being provided. Assume that this is the case unless
2618 * it's been proven otherwise */
2619 if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
2620 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
2622 else { /* Here, either:
2623 * 1) The strxfrm() has previously shown bad behavior; or
2624 * 2) It isn't the first time through the loop, which means
2625 * that the strxfrm() is now showing bad behavior, because
2626 * we gave it what it said was needed in the previous
2627 * iteration, and it came back saying it needed still more.
2628 * (Many versions of cygwin fit this. When the buffer size
2629 * isn't sufficient, they return the input size instead of
2630 * how much is needed.)
2631 * Increase the buffer size by a fixed percentage and try again.
2633 xAlloc += (xAlloc / 4) + 1;
2634 PL_strxfrm_is_behaved = FALSE;
2637 if (DEBUG_Lv_TEST || debug_initialization) {
2638 PerlIO_printf(Perl_debug_log,
2639 "_mem_collxfrm required more space than previously calculated"
2640 " for locale %s, trying again with new guess=%d+%zu\n",
2641 PL_collation_name, (int) COLLXFRM_HDR_LEN,
2642 xAlloc - COLLXFRM_HDR_LEN);
2647 Renew(xbuf, xAlloc, char);
2648 if (UNLIKELY(! xbuf)) {
2649 DEBUG_L(PerlIO_printf(Perl_debug_log,
2650 "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
2659 if (DEBUG_Lv_TEST || debug_initialization) {
2661 print_collxfrm_input_and_return(s, s + len, xlen, utf8);
2662 PerlIO_printf(Perl_debug_log, "Its xfrm is:");
2663 PerlIO_printf(Perl_debug_log, "%s\n",
2664 _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
2669 /* Free up unneeded space; retain ehough for trailing NUL */
2670 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
2672 if (s != input_string) {
2680 if (s != input_string) {
2685 if (DEBUG_Lv_TEST || debug_initialization) {
2686 print_collxfrm_input_and_return(s, s + len, NULL, utf8);
2695 S_print_collxfrm_input_and_return(pTHX_
2696 const char * const s,
2697 const char * const e,
2698 const STRLEN * const xlen,
2702 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
2704 PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
2705 (UV)PL_collation_ix);
2707 PerlIO_printf(Perl_debug_log, "%zu", *xlen);
2710 PerlIO_printf(Perl_debug_log, "NULL");
2712 PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
2714 print_bytes_for_locale(s, e, is_utf8);
2716 PerlIO_printf(Perl_debug_log, "'\n");
2720 S_print_bytes_for_locale(pTHX_
2721 const char * const s,
2722 const char * const e,
2726 bool prev_was_printable = TRUE;
2727 bool first_time = TRUE;
2729 PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
2733 ? utf8_to_uvchr_buf((U8 *) t, e, NULL)
2736 if (! prev_was_printable) {
2737 PerlIO_printf(Perl_debug_log, " ");
2739 PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
2740 prev_was_printable = TRUE;
2744 PerlIO_printf(Perl_debug_log, " ");
2746 PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
2747 prev_was_printable = FALSE;
2749 t += (is_utf8) ? UTF8SKIP(t) : 1;
2754 #endif /* #ifdef DEBUGGING */
2756 #endif /* USE_LOCALE_COLLATE */
2761 Perl__is_cur_LC_category_utf8(pTHX_ int category)
2763 /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
2764 * otherwise. 'category' may not be LC_ALL. If the platform doesn't have
2765 * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
2766 * could give the wrong result. The result will very likely be correct for
2767 * languages that have commonly used non-ASCII characters, but for notably
2768 * English, it comes down to if the locale's name ends in something like
2769 * "UTF-8". It errs on the side of not being a UTF-8 locale. */
2771 char *save_input_locale = NULL;
2775 assert(category != LC_ALL);
2778 /* First dispose of the trivial cases */
2779 save_input_locale = setlocale(category, NULL);
2780 if (! save_input_locale) {
2781 DEBUG_L(PerlIO_printf(Perl_debug_log,
2782 "Could not find current locale for category %d\n",
2784 return FALSE; /* XXX maybe should croak */
2786 save_input_locale = stdize_locale(savepv(save_input_locale));
2787 if (isNAME_C_OR_POSIX(save_input_locale)) {
2788 DEBUG_L(PerlIO_printf(Perl_debug_log,
2789 "Current locale for category %d is %s\n",
2790 category, save_input_locale));
2791 Safefree(save_input_locale);
2795 #if defined(USE_LOCALE_CTYPE) \
2796 && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET)))
2798 { /* Next try nl_langinfo or MB_CUR_MAX if available */
2800 char *save_ctype_locale = NULL;
2803 if (category != LC_CTYPE) { /* These work only on LC_CTYPE */
2805 /* Get the current LC_CTYPE locale */
2806 save_ctype_locale = setlocale(LC_CTYPE, NULL);
2807 if (! save_ctype_locale) {
2808 DEBUG_L(PerlIO_printf(Perl_debug_log,
2809 "Could not find current locale for LC_CTYPE\n"));
2810 goto cant_use_nllanginfo;
2812 save_ctype_locale = stdize_locale(savepv(save_ctype_locale));
2814 /* If LC_CTYPE and the desired category use the same locale, this
2815 * means that finding the value for LC_CTYPE is the same as finding
2816 * the value for the desired category. Otherwise, switch LC_CTYPE
2817 * to the desired category's locale */
2818 if (strEQ(save_ctype_locale, save_input_locale)) {
2819 Safefree(save_ctype_locale);
2820 save_ctype_locale = NULL;
2822 else if (! setlocale(LC_CTYPE, save_input_locale)) {
2823 DEBUG_L(PerlIO_printf(Perl_debug_log,
2824 "Could not change LC_CTYPE locale to %s\n",
2825 save_input_locale));
2826 Safefree(save_ctype_locale);
2827 goto cant_use_nllanginfo;
2831 DEBUG_L(PerlIO_printf(Perl_debug_log, "Current LC_CTYPE locale=%s\n",
2832 save_input_locale));
2834 /* Here the current LC_CTYPE is set to the locale of the category whose
2835 * information is desired. This means that nl_langinfo() and MB_CUR_MAX
2836 * should give the correct results */
2838 # if defined(HAS_NL_LANGINFO) && defined(CODESET)
2840 char *codeset = nl_langinfo(CODESET);
2841 if (codeset && strNE(codeset, "")) {
2842 codeset = savepv(codeset);
2844 /* If we switched LC_CTYPE, switch back */
2845 if (save_ctype_locale) {
2846 setlocale(LC_CTYPE, save_ctype_locale);
2847 Safefree(save_ctype_locale);
2850 is_utf8 = foldEQ(codeset, STR_WITH_LEN("UTF-8"))
2851 || foldEQ(codeset, STR_WITH_LEN("UTF8"));
2853 DEBUG_L(PerlIO_printf(Perl_debug_log,
2854 "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
2857 Safefree(save_input_locale);
2865 /* Here, either we don't have nl_langinfo, or it didn't return a
2866 * codeset. Try MB_CUR_MAX */
2868 /* Standard UTF-8 needs at least 4 bytes to represent the maximum
2869 * Unicode code point. Since UTF-8 is the only non-single byte
2870 * encoding we handle, we just say any such encoding is UTF-8, and if
2871 * turns out to be wrong, other things will fail */
2872 is_utf8 = MB_CUR_MAX >= 4;
2874 DEBUG_L(PerlIO_printf(Perl_debug_log,
2875 "\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n",
2876 (int) MB_CUR_MAX, is_utf8));
2878 Safefree(save_input_locale);
2882 /* ... But, most system that have MB_CUR_MAX will also have mbtowc(),
2883 * since they are both in the C99 standard. We can feed a known byte
2884 * string to the latter function, and check that it gives the expected
2888 PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
2890 if ((size_t)mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8))
2891 != strlen(HYPHEN_UTF8)
2892 || wc != (wchar_t) 0x2010)
2895 DEBUG_L(PerlIO_printf(Perl_debug_log, "\thyphen=U+%x\n", (unsigned int)wc));
2896 DEBUG_L(PerlIO_printf(Perl_debug_log,
2897 "\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n",
2898 mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8)), errno));
2903 /* If we switched LC_CTYPE, switch back */
2904 if (save_ctype_locale) {
2905 setlocale(LC_CTYPE, save_ctype_locale);
2906 Safefree(save_ctype_locale);
2913 cant_use_nllanginfo:
2915 #else /* nl_langinfo should work if available, so don't bother compiling this
2916 fallback code. The final fallback of looking at the name is
2917 compiled, and will be executed if nl_langinfo fails */
2919 /* nl_langinfo not available or failed somehow. Next try looking at the
2920 * currency symbol to see if it disambiguates things. Often that will be
2921 * in the native script, and if the symbol isn't in UTF-8, we know that the
2922 * locale isn't. If it is non-ASCII UTF-8, we infer that the locale is
2923 * too, as the odds of a non-UTF8 string being valid UTF-8 are quite small
2926 #ifdef HAS_LOCALECONV
2927 # ifdef USE_LOCALE_MONETARY
2929 char *save_monetary_locale = NULL;
2930 bool only_ascii = FALSE;
2931 bool is_utf8 = FALSE;
2934 /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of
2935 * the desired category, if it isn't that locale already */
2937 if (category != LC_MONETARY) {
2939 save_monetary_locale = setlocale(LC_MONETARY, NULL);
2940 if (! save_monetary_locale) {
2941 DEBUG_L(PerlIO_printf(Perl_debug_log,
2942 "Could not find current locale for LC_MONETARY\n"));
2943 goto cant_use_monetary;
2945 save_monetary_locale = stdize_locale(savepv(save_monetary_locale));
2947 if (strEQ(save_monetary_locale, save_input_locale)) {
2948 Safefree(save_monetary_locale);
2949 save_monetary_locale = NULL;
2951 else if (! setlocale(LC_MONETARY, save_input_locale)) {
2952 DEBUG_L(PerlIO_printf(Perl_debug_log,
2953 "Could not change LC_MONETARY locale to %s\n",
2954 save_input_locale));
2955 Safefree(save_monetary_locale);
2956 goto cant_use_monetary;
2960 /* Here the current LC_MONETARY is set to the locale of the category
2961 * whose information is desired. */
2965 || ! lc->currency_symbol
2966 || is_utf8_invariant_string((U8 *) lc->currency_symbol, 0))
2968 DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
2972 is_utf8 = is_utf8_string((U8 *) lc->currency_symbol, 0);
2975 /* If we changed it, restore LC_MONETARY to its original locale */
2976 if (save_monetary_locale) {
2977 setlocale(LC_MONETARY, save_monetary_locale);
2978 Safefree(save_monetary_locale);
2983 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
2984 * otherwise assume the locale is UTF-8 if and only if the symbol
2985 * is non-ascii UTF-8. */
2986 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
2987 save_input_locale, is_utf8));
2988 Safefree(save_input_locale);
2994 # endif /* USE_LOCALE_MONETARY */
2995 #endif /* HAS_LOCALECONV */
2997 #if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
2999 /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
3000 * the names of the months and weekdays, timezone, and am/pm indicator */
3002 char *save_time_locale = NULL;
3004 bool is_dst = FALSE;
3008 char * formatted_time;
3011 /* Like above for LC_MONETARY, we set LC_TIME to the locale of the
3012 * desired category, if it isn't that locale already */
3014 if (category != LC_TIME) {
3016 save_time_locale = setlocale(LC_TIME, NULL);
3017 if (! save_time_locale) {
3018 DEBUG_L(PerlIO_printf(Perl_debug_log,
3019 "Could not find current locale for LC_TIME\n"));
3022 save_time_locale = stdize_locale(savepv(save_time_locale));
3024 if (strEQ(save_time_locale, save_input_locale)) {
3025 Safefree(save_time_locale);
3026 save_time_locale = NULL;
3028 else if (! setlocale(LC_TIME, save_input_locale)) {
3029 DEBUG_L(PerlIO_printf(Perl_debug_log,
3030 "Could not change LC_TIME locale to %s\n",
3031 save_input_locale));
3032 Safefree(save_time_locale);
3037 /* Here the current LC_TIME is set to the locale of the category
3038 * whose information is desired. Look at all the days of the week and
3039 * month names, and the timezone and am/pm indicator for UTF-8 variant
3040 * characters. The first such a one found will tell us if the locale
3041 * is UTF-8 or not */
3043 for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
3044 formatted_time = my_strftime("%A %B %Z %p",
3045 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
3046 if ( ! formatted_time
3047 || is_utf8_invariant_string((U8 *) formatted_time, 0))
3050 /* Here, we didn't find a non-ASCII. Try the next time through
3051 * with the complemented dst and am/pm, and try with the next
3052 * weekday. After we have gotten all weekdays, try the next
3055 hour = (hour + 12) % 24;
3063 /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
3064 * false otherwise. But first, restore LC_TIME to its original
3065 * locale if we changed it */
3066 if (save_time_locale) {
3067 setlocale(LC_TIME, save_time_locale);
3068 Safefree(save_time_locale);
3071 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
3073 is_utf8_string((U8 *) formatted_time, 0)));
3074 Safefree(save_input_locale);
3075 return is_utf8_string((U8 *) formatted_time, 0);
3078 /* Falling off the end of the loop indicates all the names were just
3079 * ASCII. Go on to the next test. If we changed it, restore LC_TIME
3080 * to its original locale */
3081 if (save_time_locale) {
3082 setlocale(LC_TIME, save_time_locale);
3083 Safefree(save_time_locale);
3085 DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3091 #if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
3093 /* This code is ifdefd out because it was found to not be necessary in testing
3094 * on our dromedary test machine, which has over 700 locales. There, this
3095 * added no value to looking at the currency symbol and the time strings. I
3096 * left it in so as to avoid rewriting it if real-world experience indicates
3097 * that dromedary is an outlier. Essentially, instead of returning abpve if we
3098 * haven't found illegal utf8, we continue on and examine all the strerror()
3099 * messages on the platform for utf8ness. If all are ASCII, we still don't
3100 * know the answer; but otherwise we have a pretty good indication of the
3101 * utf8ness. The reason this doesn't help much is that the messages may not
3102 * have been translated into the locale. The currency symbol and time strings
3103 * are much more likely to have been translated. */
3106 bool is_utf8 = FALSE;
3107 bool non_ascii = FALSE;
3108 char *save_messages_locale = NULL;
3109 const char * errmsg = NULL;
3111 /* Like above, we set LC_MESSAGES to the locale of the desired
3112 * category, if it isn't that locale already */
3114 if (category != LC_MESSAGES) {
3116 save_messages_locale = setlocale(LC_MESSAGES, NULL);
3117 if (! save_messages_locale) {
3118 DEBUG_L(PerlIO_printf(Perl_debug_log,
3119 "Could not find current locale for LC_MESSAGES\n"));
3120 goto cant_use_messages;
3122 save_messages_locale = stdize_locale(savepv(save_messages_locale));
3124 if (strEQ(save_messages_locale, save_input_locale)) {
3125 Safefree(save_messages_locale);
3126 save_messages_locale = NULL;
3128 else if (! setlocale(LC_MESSAGES, save_input_locale)) {
3129 DEBUG_L(PerlIO_printf(Perl_debug_log,
3130 "Could not change LC_MESSAGES locale to %s\n",
3131 save_input_locale));
3132 Safefree(save_messages_locale);
3133 goto cant_use_messages;
3137 /* Here the current LC_MESSAGES is set to the locale of the category
3138 * whose information is desired. Look through all the messages. We
3139 * can't use Strerror() here because it may expand to code that
3140 * segfaults in miniperl */
3142 for (e = 0; e <= sys_nerr; e++) {
3144 errmsg = sys_errlist[e];
3145 if (errno || !errmsg) {
3148 errmsg = savepv(errmsg);
3149 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
3151 is_utf8 = is_utf8_string((U8 *) errmsg, 0);
3157 /* And, if we changed it, restore LC_MESSAGES to its original locale */
3158 if (save_messages_locale) {
3159 setlocale(LC_MESSAGES, save_messages_locale);
3160 Safefree(save_messages_locale);
3165 /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
3166 * any non-ascii means it is one; otherwise we assume it isn't */
3167 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
3170 Safefree(save_input_locale);
3174 DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3180 #endif /* the code that is compiled when no nl_langinfo */
3182 #ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
3184 /* As a last resort, look at the locale name to see if it matches
3185 * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
3186 * return of setlocale(), is actually defined to be opaque, so we can't
3187 * really rely on the absence of various substrings in the name to indicate
3188 * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
3189 * be a UTF-8 locale. Similarly for the other common names */
3191 final_pos = strlen(save_input_locale) - 1;
3192 if (final_pos >= 3) {
3193 char *name = save_input_locale;
3195 /* Find next 'U' or 'u' and look from there */
3196 while ((name += strcspn(name, "Uu") + 1)
3197 <= save_input_locale + final_pos - 2)
3199 if (!isALPHA_FOLD_NE(*name, 't')
3200 || isALPHA_FOLD_NE(*(name + 1), 'f'))
3205 if (*(name) == '-') {
3206 if ((name > save_input_locale + final_pos - 1)) {
3211 if (*(name) == '8') {
3212 DEBUG_L(PerlIO_printf(Perl_debug_log,
3213 "Locale %s ends with UTF-8 in name\n",
3214 save_input_locale));
3215 Safefree(save_input_locale);
3219 DEBUG_L(PerlIO_printf(Perl_debug_log,
3220 "Locale %s doesn't end with UTF-8 in name\n",
3221 save_input_locale));
3226 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
3228 && *(save_input_locale + final_pos - 0) == '1'
3229 && *(save_input_locale + final_pos - 1) == '0'
3230 && *(save_input_locale + final_pos - 2) == '0'
3231 && *(save_input_locale + final_pos - 3) == '5'
3232 && *(save_input_locale + final_pos - 4) == '6')
3234 DEBUG_L(PerlIO_printf(Perl_debug_log,
3235 "Locale %s ends with 10056 in name, is UTF-8 locale\n",
3236 save_input_locale));
3237 Safefree(save_input_locale);
3242 /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
3243 * since we are about to return FALSE anyway, there is no point in doing
3244 * this extra work */
3246 if (instr(save_input_locale, "8859")) {
3247 DEBUG_L(PerlIO_printf(Perl_debug_log,
3248 "Locale %s has 8859 in name, not UTF-8 locale\n",
3249 save_input_locale));
3250 Safefree(save_input_locale);
3255 DEBUG_L(PerlIO_printf(Perl_debug_log,
3256 "Assuming locale %s is not a UTF-8 locale\n",
3257 save_input_locale));
3258 Safefree(save_input_locale);
3266 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
3269 /* Internal function which returns if we are in the scope of a pragma that
3270 * enables the locale category 'category'. 'compiling' should indicate if
3271 * this is during the compilation phase (TRUE) or not (FALSE). */
3273 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
3275 SV *categories = cop_hints_fetch_pvs(cop, "locale", 0);
3276 if (! categories || categories == &PL_sv_placeholder) {
3280 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
3281 * a valid unsigned */
3282 assert(category >= -1);
3283 return cBOOL(SvUV(categories) & (1U << (category + 1)));
3287 Perl_my_strerror(pTHX_ const int errnum)
3289 /* Returns a mortalized copy of the text of the error message associated
3290 * with 'errnum'. It uses the current locale's text unless the platform
3291 * doesn't have the LC_MESSAGES category or we are not being called from
3292 * within the scope of 'use locale'. In the former case, it uses whatever
3293 * strerror returns; in the latter case it uses the text from the C locale.
3295 * The function just calls strerror(), but temporarily switches, if needed,
3296 * to the C locale */
3301 #ifndef USE_LOCALE_MESSAGES
3303 /* If platform doesn't have messages category, we don't do any switching to
3304 * the C locale; we just use whatever strerror() returns */
3306 errstr = savepv(Strerror(errnum));
3308 #else /* Has locale messages */
3310 const bool within_locale_scope = IN_LC(LC_MESSAGES);
3312 # if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
3314 /* This function is trivial if we have strerror_l() */
3316 if (within_locale_scope) {
3317 errstr = strerror(errnum);
3320 errstr = strerror_l(errnum, PL_C_locale_obj);
3323 errstr = savepv(errstr);
3325 # else /* Doesn't have strerror_l(). */
3327 # ifdef USE_POSIX_2008_LOCALE
3329 locale_t save_locale = NULL;
3333 char * save_locale = NULL;
3334 bool locale_is_C = FALSE;
3336 /* We have a critical section to prevent another thread from changing the
3337 * locale out from under us (or zapping the buffer returned from
3343 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3344 "my_strerror called with errnum %d\n", errnum));
3345 if (! within_locale_scope) {
3348 # ifdef USE_POSIX_2008_LOCALE /* Use the thread-safe locale functions */
3350 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3351 "Not within locale scope, about to call"
3352 " uselocale(0x%p)\n", PL_C_locale_obj));
3353 save_locale = uselocale(PL_C_locale_obj);
3354 if (! save_locale) {
3355 DEBUG_L(PerlIO_printf(Perl_debug_log,
3356 "uselocale failed, errno=%d\n", errno));
3359 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3360 "uselocale returned 0x%p\n", save_locale));
3363 # else /* Not thread-safe build */
3365 save_locale = setlocale(LC_MESSAGES, NULL);
3366 if (! save_locale) {
3367 DEBUG_L(PerlIO_printf(Perl_debug_log,
3368 "setlocale failed, errno=%d\n", errno));
3371 locale_is_C = isNAME_C_OR_POSIX(save_locale);
3373 /* Switch to the C locale if not already in it */
3374 if (! locale_is_C) {
3376 /* The setlocale() just below likely will zap 'save_locale', so
3378 save_locale = savepv(save_locale);
3379 setlocale(LC_MESSAGES, "C");
3385 } /* end of ! within_locale_scope */
3387 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n",
3388 __FILE__, __LINE__));
3391 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3392 "Any locale change has been done; about to call Strerror\n"));
3393 errstr = savepv(Strerror(errnum));
3395 if (! within_locale_scope) {
3398 # ifdef USE_POSIX_2008_LOCALE
3400 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3401 "%s: %d: not within locale scope, restoring the locale\n",
3402 __FILE__, __LINE__));
3403 if (save_locale && ! uselocale(save_locale)) {
3404 DEBUG_L(PerlIO_printf(Perl_debug_log,
3405 "uselocale restore failed, errno=%d\n", errno));
3411 if (save_locale && ! locale_is_C) {
3412 if (! setlocale(LC_MESSAGES, save_locale)) {
3413 DEBUG_L(PerlIO_printf(Perl_debug_log,
3414 "setlocale restore failed, errno=%d\n", errno));
3416 Safefree(save_locale);
3423 # endif /* End of doesn't have strerror_l */
3424 #endif /* End of does have locale messages */
3428 if (DEBUG_Lv_TEST) {
3429 PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '");
3430 print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);
3431 PerlIO_printf(Perl_debug_log, "'\n");
3442 =for apidoc sync_locale
3444 Changing the program's locale should be avoided by XS code. Nevertheless,
3445 certain non-Perl libraries called from XS, such as C<Gtk> do so. When this
3446 happens, Perl needs to be told that the locale has changed. Use this function
3447 to do so, before returning to Perl.
3453 Perl_sync_locale(pTHX)
3456 #ifdef USE_LOCALE_CTYPE
3457 new_ctype(setlocale(LC_CTYPE, NULL));
3458 #endif /* USE_LOCALE_CTYPE */
3460 #ifdef USE_LOCALE_COLLATE
3461 new_collate(setlocale(LC_COLLATE, NULL));
3464 #ifdef USE_LOCALE_NUMERIC
3465 set_numeric_local(); /* Switch from "C" to underlying LC_NUMERIC */
3466 new_numeric(setlocale(LC_NUMERIC, NULL));
3467 #endif /* USE_LOCALE_NUMERIC */
3471 #if defined(DEBUGGING) && defined(USE_LOCALE)
3474 S_setlocale_debug_string(const int category, /* category number,
3476 const char* const locale, /* locale name */
3478 /* return value from setlocale() when attempting to
3479 * set 'category' to 'locale' */
3480 const char* const retval)
3482 /* Returns a pointer to a NUL-terminated string in static storage with
3483 * added text about the info passed in. This is not thread safe and will
3484 * be overwritten by the next call, so this should be used just to
3485 * formulate a string to immediately print or savepv() on. */
3487 /* initialise to a non-null value to keep it out of BSS and so keep
3488 * -DPERL_GLOBAL_STRUCT_PRIVATE happy */
3489 static char ret[128] = "If you can read this, thank your buggy C"
3490 " library strlcpy(), and change your hints file"
3492 my_strlcpy(ret, "setlocale(", sizeof(ret));
3496 my_snprintf(ret, sizeof(ret), "%s? %d", ret, category);
3500 my_strlcat(ret, "LC_ALL", sizeof(ret));
3505 my_strlcat(ret, "LC_CTYPE", sizeof(ret));
3510 my_strlcat(ret, "LC_NUMERIC", sizeof(ret));
3515 my_strlcat(ret, "LC_COLLATE", sizeof(ret));
3520 my_strlcat(ret, "LC_TIME", sizeof(ret));
3525 my_strlcat(ret, "LC_MONETARY", sizeof(ret));
3530 my_strlcat(ret, "LC_MESSAGES", sizeof(ret));
3535 my_strlcat(ret, ", ", sizeof(ret));
3538 my_strlcat(ret, "\"", sizeof(ret));
3539 my_strlcat(ret, locale, sizeof(ret));
3540 my_strlcat(ret, "\"", sizeof(ret));
3543 my_strlcat(ret, "NULL", sizeof(ret));
3546 my_strlcat(ret, ") returned ", sizeof(ret));
3549 my_strlcat(ret, "\"", sizeof(ret));
3550 my_strlcat(ret, retval, sizeof(ret));
3551 my_strlcat(ret, "\"", sizeof(ret));
3554 my_strlcat(ret, "NULL", sizeof(ret));
3557 assert(strlen(ret) < sizeof(ret));
3566 * ex: set ts=8 sts=4 sw=4 et: