This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
locale.c: Rmv unnecessary locale toggle
[perl5.git] / locale.c
... / ...
CommitLineData
1/* locale.c
2 *
3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * A Elbereth Gilthoniel,
13 * silivren penna míriel
14 * o menel aglar elenath!
15 * Na-chaered palan-díriel
16 * o galadhremmin ennorath,
17 * Fanuilos, le linnathon
18 * nef aear, si nef aearon!
19 *
20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
21 */
22
23/* utility functions for handling locale-specific stuff like what
24 * character represents the decimal point.
25 *
26 * All C programs have an underlying locale. Perl code generally doesn't pay
27 * any attention to it except within the scope of a 'use locale'. For most
28 * categories, it accomplishes this by just using different operations if it is
29 * in such scope than if not. However, various libc functions called by Perl
30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31 * are used to toggle between the current locale and the C locale depending on
32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is
33 * switched to the C locale for outputting the message unless within the scope
34 * of 'use locale'.
35 */
36
37#include "EXTERN.h"
38#define PERL_IN_LOCALE_C
39#include "perl_langinfo.h"
40#include "perl.h"
41
42#include "reentr.h"
43
44#ifdef I_WCHAR
45# include <wchar.h>
46#endif
47
48/* If the environment says to, we can output debugging information during
49 * initialization. This is done before option parsing, and before any thread
50 * creation, so can be a file-level static */
51#if ! defined(DEBUGGING) || defined(PERL_GLOBAL_STRUCT)
52# define debug_initialization 0
53# define DEBUG_INITIALIZATION_set(v)
54#else
55static bool debug_initialization = FALSE;
56# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
57#endif
58
59/* strlen() of a literal string constant. We might want this more general,
60 * but using it in just this file for now. A problem with more generality is
61 * the compiler warnings about comparing unlike signs */
62#define STRLENs(s) (sizeof("" s "") - 1)
63
64/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
65 * return of setlocale(), then this is extremely likely to be the C or POSIX
66 * locale. However, the output of setlocale() is documented to be opaque, but
67 * the odds are extremely small that it would return these two strings for some
68 * other locale. Note that VMS in these two locales includes many non-ASCII
69 * characters as controls and punctuation (below are hex bytes):
70 * cntrl: 84-97 9B-9F
71 * punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
72 * Oddly, none there are listed as alphas, though some represent alphabetics
73 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
74#define isNAME_C_OR_POSIX(name) \
75 ( (name) != NULL \
76 && (( *(name) == 'C' && (*(name + 1)) == '\0') \
77 || strEQ((name), "POSIX")))
78
79#ifdef USE_LOCALE
80
81/* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far
82 * looked up. This is in the form of a C string: */
83
84#define UTF8NESS_SEP "\v"
85#define UTF8NESS_PREFIX "\f"
86
87/* So, the string looks like:
88 *
89 * \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0
90 *
91 * where the digit 0 after the \a indicates that the locale starting just
92 * after the preceding \v is not UTF-8, and the digit 1 mean it is. */
93
94STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1);
95STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1);
96
97#define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \
98 UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0"
99
100/* The cache is initialized to C_and_POSIX_utf8ness at start up. These are
101 * kept there always. The remining portion of the cache is LRU, with the
102 * oldest looked-up locale at the tail end */
103
104STATIC char *
105S_stdize_locale(pTHX_ char *locs)
106{
107 /* Standardize the locale name from a string returned by 'setlocale',
108 * possibly modifying that string.
109 *
110 * The typical return value of setlocale() is either
111 * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
112 * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
113 * (the space-separated values represent the various sublocales,
114 * in some unspecified order). This is not handled by this function.
115 *
116 * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
117 * which is harmful for further use of the string in setlocale(). This
118 * function removes the trailing new line and everything up through the '='
119 * */
120
121 const char * const s = strchr(locs, '=');
122 bool okay = TRUE;
123
124 PERL_ARGS_ASSERT_STDIZE_LOCALE;
125
126 if (s) {
127 const char * const t = strchr(s, '.');
128 okay = FALSE;
129 if (t) {
130 const char * const u = strchr(t, '\n');
131 if (u && (u[1] == 0)) {
132 const STRLEN len = u - s;
133 Move(s + 1, locs, len, char);
134 locs[len] = 0;
135 okay = TRUE;
136 }
137 }
138 }
139
140 if (!okay)
141 Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
142
143 return locs;
144}
145
146/* Two parallel arrays; first the locale categories Perl uses on this system;
147 * the second array is their names. These arrays are in mostly arbitrary
148 * order. */
149
150const int categories[] = {
151
152# ifdef USE_LOCALE_NUMERIC
153 LC_NUMERIC,
154# endif
155# ifdef USE_LOCALE_CTYPE
156 LC_CTYPE,
157# endif
158# ifdef USE_LOCALE_COLLATE
159 LC_COLLATE,
160# endif
161# ifdef USE_LOCALE_TIME
162 LC_TIME,
163# endif
164# ifdef USE_LOCALE_MESSAGES
165 LC_MESSAGES,
166# endif
167# ifdef USE_LOCALE_MONETARY
168 LC_MONETARY,
169# endif
170# ifdef USE_LOCALE_ADDRESS
171 LC_ADDRESS,
172# endif
173# ifdef USE_LOCALE_IDENTIFICATION
174 LC_IDENTIFICATION,
175# endif
176# ifdef USE_LOCALE_MEASUREMENT
177 LC_MEASUREMENT,
178# endif
179# ifdef USE_LOCALE_PAPER
180 LC_PAPER,
181# endif
182# ifdef USE_LOCALE_TELEPHONE
183 LC_TELEPHONE,
184# endif
185# ifdef LC_ALL
186 LC_ALL,
187# endif
188 -1 /* Placeholder because C doesn't allow a
189 trailing comma, and it would get complicated
190 with all the #ifdef's */
191};
192
193/* The top-most real element is LC_ALL */
194
195const char * category_names[] = {
196
197# ifdef USE_LOCALE_NUMERIC
198 "LC_NUMERIC",
199# endif
200# ifdef USE_LOCALE_CTYPE
201 "LC_CTYPE",
202# endif
203# ifdef USE_LOCALE_COLLATE
204 "LC_COLLATE",
205# endif
206# ifdef USE_LOCALE_TIME
207 "LC_TIME",
208# endif
209# ifdef USE_LOCALE_MESSAGES
210 "LC_MESSAGES",
211# endif
212# ifdef USE_LOCALE_MONETARY
213 "LC_MONETARY",
214# endif
215# ifdef USE_LOCALE_ADDRESS
216 "LC_ADDRESS",
217# endif
218# ifdef USE_LOCALE_IDENTIFICATION
219 "LC_IDENTIFICATION",
220# endif
221# ifdef USE_LOCALE_MEASUREMENT
222 "LC_MEASUREMENT",
223# endif
224# ifdef USE_LOCALE_PAPER
225 "LC_PAPER",
226# endif
227# ifdef USE_LOCALE_TELEPHONE
228 "LC_TELEPHONE",
229# endif
230# ifdef LC_ALL
231 "LC_ALL",
232# endif
233 NULL /* Placeholder */
234 };
235
236# ifdef LC_ALL
237
238 /* On systems with LC_ALL, it is kept in the highest index position. (-2
239 * to account for the final unused placeholder element.) */
240# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
241
242# else
243
244 /* On systems without LC_ALL, we pretend it is there, one beyond the real
245 * top element, hence in the unused placeholder element. */
246# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
247
248# endif
249
250/* Pretending there is an LC_ALL element just above allows us to avoid most
251 * special cases. Most loops through these arrays in the code below are
252 * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'. They will work
253 * on either type of system. But the code must be written to not access the
254 * element at 'LC_ALL_INDEX' except on platforms that have it. This can be
255 * checked for at compile time by using the #define LC_ALL_INDEX which is only
256 * defined if we do have LC_ALL. */
257
258STATIC const char *
259S_category_name(const int category)
260{
261 unsigned int i;
262
263#ifdef LC_ALL
264
265 if (category == LC_ALL) {
266 return "LC_ALL";
267 }
268
269#endif
270
271 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
272 if (category == categories[i]) {
273 return category_names[i];
274 }
275 }
276
277 {
278 const char suffix[] = " (unknown)";
279 int temp = category;
280 Size_t length = sizeof(suffix) + 1;
281 char * unknown;
282 dTHX;
283
284 if (temp < 0) {
285 length++;
286 temp = - temp;
287 }
288
289 /* Calculate the number of digits */
290 while (temp >= 10) {
291 temp /= 10;
292 length++;
293 }
294
295 Newx(unknown, length, char);
296 my_snprintf(unknown, length, "%d%s", category, suffix);
297 SAVEFREEPV(unknown);
298 return unknown;
299 }
300}
301
302/* Now create LC_foo_INDEX #defines for just those categories on this system */
303# ifdef USE_LOCALE_NUMERIC
304# define LC_NUMERIC_INDEX 0
305# define _DUMMY_NUMERIC LC_NUMERIC_INDEX
306# else
307# define _DUMMY_NUMERIC -1
308# endif
309# ifdef USE_LOCALE_CTYPE
310# define LC_CTYPE_INDEX _DUMMY_NUMERIC + 1
311# define _DUMMY_CTYPE LC_CTYPE_INDEX
312# else
313# define _DUMMY_CTYPE _DUMMY_NUMERIC
314# endif
315# ifdef USE_LOCALE_COLLATE
316# define LC_COLLATE_INDEX _DUMMY_CTYPE + 1
317# define _DUMMY_COLLATE LC_COLLATE_INDEX
318# else
319# define _DUMMY_COLLATE _DUMMY_COLLATE
320# endif
321# ifdef USE_LOCALE_TIME
322# define LC_TIME_INDEX _DUMMY_COLLATE + 1
323# define _DUMMY_TIME LC_TIME_INDEX
324# else
325# define _DUMMY_TIME _DUMMY_COLLATE
326# endif
327# ifdef USE_LOCALE_MESSAGES
328# define LC_MESSAGES_INDEX _DUMMY_TIME + 1
329# define _DUMMY_MESSAGES LC_MESSAGES_INDEX
330# else
331# define _DUMMY_MESSAGES _DUMMY_TIME
332# endif
333# ifdef USE_LOCALE_MONETARY
334# define LC_MONETARY_INDEX _DUMMY_MESSAGES + 1
335# define _DUMMY_MONETARY LC_MONETARY_INDEX
336# else
337# define _DUMMY_MONETARY _DUMMY_MESSAGES
338# endif
339# ifdef USE_LOCALE_ADDRESS
340# define LC_ADDRESS_INDEX _DUMMY_MONETARY + 1
341# define _DUMMY_ADDRESS LC_ADDRESS_INDEX
342# else
343# define _DUMMY_ADDRESS _DUMMY_MONETARY
344# endif
345# ifdef USE_LOCALE_IDENTIFICATION
346# define LC_IDENTIFICATION_INDEX _DUMMY_ADDRESS + 1
347# define _DUMMY_IDENTIFICATION LC_IDENTIFICATION_INDEX
348# else
349# define _DUMMY_IDENTIFICATION _DUMMY_ADDRESS
350# endif
351# ifdef USE_LOCALE_MEASUREMENT
352# define LC_MEASUREMENT_INDEX _DUMMY_IDENTIFICATION + 1
353# define _DUMMY_MEASUREMENT LC_MEASUREMENT_INDEX
354# else
355# define _DUMMY_MEASUREMENT _DUMMY_IDENTIFICATION
356# endif
357# ifdef USE_LOCALE_PAPER
358# define LC_PAPER_INDEX _DUMMY_MEASUREMENT + 1
359# define _DUMMY_PAPER LC_PAPER_INDEX
360# else
361# define _DUMMY_PAPER _DUMMY_MEASUREMENT
362# endif
363# ifdef USE_LOCALE_TELEPHONE
364# define LC_TELEPHONE_INDEX _DUMMY_PAPER + 1
365# define _DUMMY_TELEPHONE LC_TELEPHONE_INDEX
366# else
367# define _DUMMY_TELEPHONE _DUMMY_PAPER
368# endif
369# ifdef LC_ALL
370# define LC_ALL_INDEX _DUMMY_TELEPHONE + 1
371# endif
372#endif /* ifdef USE_LOCALE */
373
374/* Windows requres a customized base-level setlocale() */
375# ifdef WIN32
376# define my_setlocale(cat, locale) win32_setlocale(cat, locale)
377# else
378# define my_setlocale(cat, locale) setlocale(cat, locale)
379# endif
380
381/* Just placeholders for now. "_c" is intended to be called when the category
382 * is a constant known at compile time; "_r", not known until run time */
383# define do_setlocale_c(category, locale) my_setlocale(category, locale)
384# define do_setlocale_r(category, locale) my_setlocale(category, locale)
385
386STATIC void
387S_set_numeric_radix(pTHX_ const bool use_locale)
388{
389 /* If 'use_locale' is FALSE, set to use a dot for the radix character. If
390 * TRUE, use the radix character derived from the current locale */
391
392#if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \
393 || defined(HAS_NL_LANGINFO))
394
395 const char * radix = (use_locale)
396 ? my_nl_langinfo(PERL_RADIXCHAR, FALSE)
397 /* FALSE => already in dest locale */
398 : ".";
399
400 sv_setpv(PL_numeric_radix_sv, radix);
401
402 /* If this is valid UTF-8 that isn't totally ASCII, and we are in
403 * a UTF-8 locale, then mark the radix as being in UTF-8 */
404 if (is_utf8_non_invariant_string((U8 *) SvPVX(PL_numeric_radix_sv),
405 SvCUR(PL_numeric_radix_sv))
406 && _is_cur_LC_category_utf8(LC_NUMERIC))
407 {
408 SvUTF8_on(PL_numeric_radix_sv);
409 }
410
411# ifdef DEBUGGING
412
413 if (DEBUG_L_TEST || debug_initialization) {
414 PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
415 SvPVX(PL_numeric_radix_sv),
416 cBOOL(SvUTF8(PL_numeric_radix_sv)));
417 }
418
419# endif
420#endif /* USE_LOCALE_NUMERIC and can find the radix char */
421
422}
423
424
425void
426Perl_new_numeric(pTHX_ const char *newnum)
427{
428
429#ifndef USE_LOCALE_NUMERIC
430
431 PERL_UNUSED_ARG(newnum);
432
433#else
434
435 /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell
436 * core Perl this and that 'newnum' is the name of the new locale.
437 * It installs this locale as the current underlying default.
438 *
439 * The default locale and the C locale can be toggled between by use of the
440 * set_numeric_underlying() and set_numeric_standard() functions, which
441 * should probably not be called directly, but only via macros like
442 * SET_NUMERIC_STANDARD() in perl.h.
443 *
444 * The toggling is necessary mainly so that a non-dot radix decimal point
445 * character can be output, while allowing internal calculations to use a
446 * dot.
447 *
448 * This sets several interpreter-level variables:
449 * PL_numeric_name The underlying locale's name: a copy of 'newnum'
450 * PL_numeric_underlying A boolean indicating if the toggled state is such
451 * that the current locale is the program's underlying
452 * locale
453 * PL_numeric_standard An int indicating if the toggled state is such
454 * that the current locale is the C locale or
455 * indistinguishable from the C locale. If non-zero, it
456 * is in C; if > 1, it means it may not be toggled away
457 * from C.
458 * PL_numeric_underlying_is_standard A bool kept by this function
459 * indicating that the underlying locale and the standard
460 * C locale are indistinguishable for the purposes of
461 * LC_NUMERIC. This happens when both of the above two
462 * variables are true at the same time. (Toggling is a
463 * no-op under these circumstances.) This variable is
464 * used to avoid having to recalculate.
465 * Any code changing the locale (outside this file) should use
466 * POSIX::setlocale, which calls this function. Therefore this function
467 * should be called directly only from this file and from
468 * POSIX::setlocale() */
469
470 char *save_newnum;
471
472 if (! newnum) {
473 Safefree(PL_numeric_name);
474 PL_numeric_name = NULL;
475 PL_numeric_standard = TRUE;
476 PL_numeric_underlying = TRUE;
477 PL_numeric_underlying_is_standard = TRUE;
478 return;
479 }
480
481 save_newnum = stdize_locale(savepv(newnum));
482 PL_numeric_underlying = TRUE;
483 PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
484
485 /* If its name isn't C nor POSIX, it could still be indistinguishable from
486 * them */
487 if (! PL_numeric_standard) {
488 PL_numeric_standard = cBOOL(strEQ(".", my_nl_langinfo(PERL_RADIXCHAR,
489 FALSE /* Don't toggle locale */ ))
490 && strEQ("", my_nl_langinfo(PERL_THOUSEP,
491 FALSE)));
492 }
493
494 /* Save the new name if it isn't the same as the previous one, if any */
495 if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
496 Safefree(PL_numeric_name);
497 PL_numeric_name = save_newnum;
498 }
499 else {
500 Safefree(save_newnum);
501 }
502
503 PL_numeric_underlying_is_standard = PL_numeric_standard;
504
505# ifdef HAS_POSIX_2008_LOCALE
506
507 PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
508 PL_numeric_name,
509 PL_underlying_numeric_obj);
510
511#endif
512
513 if (DEBUG_L_TEST || debug_initialization) {
514 PerlIO_printf(Perl_debug_log, "Called new_numeric with %s, PL_numeric_name=%s\n", newnum, PL_numeric_name);
515 }
516
517 /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't
518 * have to worry about the radix being a non-dot. (Core operations that
519 * need the underlying locale change to it temporarily). */
520 set_numeric_standard();
521
522#endif /* USE_LOCALE_NUMERIC */
523
524}
525
526void
527Perl_set_numeric_standard(pTHX)
528{
529
530#ifdef USE_LOCALE_NUMERIC
531
532 /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like
533 * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The
534 * macro avoids calling this routine if toggling isn't necessary according
535 * to our records (which could be wrong if some XS code has changed the
536 * locale behind our back) */
537
538 do_setlocale_c(LC_NUMERIC, "C");
539 PL_numeric_standard = TRUE;
540 PL_numeric_underlying = PL_numeric_underlying_is_standard;
541 set_numeric_radix(0);
542
543# ifdef DEBUGGING
544
545 if (DEBUG_L_TEST || debug_initialization) {
546 PerlIO_printf(Perl_debug_log,
547 "LC_NUMERIC locale now is standard C\n");
548 }
549
550# endif
551#endif /* USE_LOCALE_NUMERIC */
552
553}
554
555void
556Perl_set_numeric_underlying(pTHX)
557{
558
559#ifdef USE_LOCALE_NUMERIC
560
561 /* Toggle the LC_NUMERIC locale to the current underlying default. Most
562 * code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h
563 * instead of calling this directly. The macro avoids calling this routine
564 * if toggling isn't necessary according to our records (which could be
565 * wrong if some XS code has changed the locale behind our back) */
566
567 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
568 PL_numeric_standard = PL_numeric_underlying_is_standard;
569 PL_numeric_underlying = TRUE;
570 set_numeric_radix(! PL_numeric_standard);
571
572# ifdef DEBUGGING
573
574 if (DEBUG_L_TEST || debug_initialization) {
575 PerlIO_printf(Perl_debug_log,
576 "LC_NUMERIC locale now is %s\n",
577 PL_numeric_name);
578 }
579
580# endif
581#endif /* USE_LOCALE_NUMERIC */
582
583}
584
585/*
586 * Set up for a new ctype locale.
587 */
588STATIC void
589S_new_ctype(pTHX_ const char *newctype)
590{
591
592#ifndef USE_LOCALE_CTYPE
593
594 PERL_ARGS_ASSERT_NEW_CTYPE;
595 PERL_UNUSED_ARG(newctype);
596 PERL_UNUSED_CONTEXT;
597
598#else
599
600 /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
601 * core Perl this and that 'newctype' is the name of the new locale.
602 *
603 * This function sets up the folding arrays for all 256 bytes, assuming
604 * that tofold() is tolc() since fold case is not a concept in POSIX,
605 *
606 * Any code changing the locale (outside this file) should use
607 * POSIX::setlocale, which calls this function. Therefore this function
608 * should be called directly only from this file and from
609 * POSIX::setlocale() */
610
611 dVAR;
612 UV i;
613
614 PERL_ARGS_ASSERT_NEW_CTYPE;
615
616 /* We will replace any bad locale warning with 1) nothing if the new one is
617 * ok; or 2) a new warning for the bad new locale */
618 if (PL_warn_locale) {
619 SvREFCNT_dec_NN(PL_warn_locale);
620 PL_warn_locale = NULL;
621 }
622
623 PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
624
625 /* A UTF-8 locale gets standard rules. But note that code still has to
626 * handle this specially because of the three problematic code points */
627 if (PL_in_utf8_CTYPE_locale) {
628 Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
629 }
630 else {
631 /* Assume enough space for every character being bad. 4 spaces each
632 * for the 94 printable characters that are output like "'x' "; and 5
633 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
634 * NUL */
635 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ];
636
637 /* Don't check for problems if we are suppressing the warnings */
638 bool check_for_problems = ckWARN_d(WARN_LOCALE)
639 || UNLIKELY(DEBUG_L_TEST);
640 bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
641 to start */
642 unsigned int bad_count = 0; /* Count of bad characters */
643
644 for (i = 0; i < 256; i++) {
645 if (isupper(i))
646 PL_fold_locale[i] = (U8) tolower(i);
647 else if (islower(i))
648 PL_fold_locale[i] = (U8) toupper(i);
649 else
650 PL_fold_locale[i] = (U8) i;
651
652 /* If checking for locale problems, see if the native ASCII-range
653 * printables plus \n and \t are in their expected categories in
654 * the new locale. If not, this could mean big trouble, upending
655 * Perl's and most programs' assumptions, like having a
656 * metacharacter with special meaning become a \w. Fortunately,
657 * it's very rare to find locales that aren't supersets of ASCII
658 * nowadays. It isn't a problem for most controls to be changed
659 * into something else; we check only \n and \t, though perhaps \r
660 * could be an issue as well. */
661 if ( check_for_problems
662 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
663 {
664 if ( cBOOL(isalnum(i)) != cBOOL(isALPHANUMERIC_A(i))
665 || cBOOL(isalpha(i)) != cBOOL(isALPHA_A(i))
666 || cBOOL(isdigit(i)) != cBOOL(isDIGIT_A(i))
667 || cBOOL(isgraph(i)) != cBOOL(isGRAPH_A(i))
668 || cBOOL(islower(i)) != cBOOL(isLOWER_A(i))
669 || cBOOL(isprint(i)) != cBOOL(isPRINT_A(i))
670 || cBOOL(ispunct(i)) != cBOOL(isPUNCT_A(i))
671 || cBOOL(isspace(i)) != cBOOL(isSPACE_A(i))
672 || cBOOL(isupper(i)) != cBOOL(isUPPER_A(i))
673 || cBOOL(isxdigit(i))!= cBOOL(isXDIGIT_A(i))
674 || tolower(i) != (int) toLOWER_A(i)
675 || toupper(i) != (int) toUPPER_A(i)
676 || (i == '\n' && ! isCNTRL_LC(i)))
677 {
678 if (bad_count) { /* Separate multiple entries with a
679 blank */
680 bad_chars_list[bad_count++] = ' ';
681 }
682 bad_chars_list[bad_count++] = '\'';
683 if (isPRINT_A(i)) {
684 bad_chars_list[bad_count++] = (char) i;
685 }
686 else {
687 bad_chars_list[bad_count++] = '\\';
688 if (i == '\n') {
689 bad_chars_list[bad_count++] = 'n';
690 }
691 else {
692 assert(i == '\t');
693 bad_chars_list[bad_count++] = 't';
694 }
695 }
696 bad_chars_list[bad_count++] = '\'';
697 bad_chars_list[bad_count] = '\0';
698 }
699 }
700 }
701
702# ifdef MB_CUR_MAX
703
704 /* We only handle single-byte locales (outside of UTF-8 ones; so if
705 * this locale requires more than one byte, there are going to be
706 * problems. */
707 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
708 "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
709 __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
710
711 if (check_for_problems && MB_CUR_MAX > 1
712
713 /* Some platforms return MB_CUR_MAX > 1 for even the "C"
714 * locale. Just assume that the implementation for them (plus
715 * for POSIX) is correct and the > 1 value is spurious. (Since
716 * these are specially handled to never be considered UTF-8
717 * locales, as long as this is the only problem, everything
718 * should work fine */
719 && strNE(newctype, "C") && strNE(newctype, "POSIX"))
720 {
721 multi_byte_locale = TRUE;
722 }
723
724# endif
725
726 if (bad_count || multi_byte_locale) {
727 PL_warn_locale = Perl_newSVpvf(aTHX_
728 "Locale '%s' may not work well.%s%s%s\n",
729 newctype,
730 (multi_byte_locale)
731 ? " Some characters in it are not recognized by"
732 " Perl."
733 : "",
734 (bad_count)
735 ? "\nThe following characters (and maybe others)"
736 " may not have the same meaning as the Perl"
737 " program expects:\n"
738 : "",
739 (bad_count)
740 ? bad_chars_list
741 : ""
742 );
743 /* If we are actually in the scope of the locale or are debugging,
744 * output the message now. If not in that scope, we save the
745 * message to be output at the first operation using this locale,
746 * if that actually happens. Most programs don't use locales, so
747 * they are immune to bad ones. */
748 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
749
750 /* The '0' below suppresses a bogus gcc compiler warning */
751 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
752
753 if (IN_LC(LC_CTYPE)) {
754 SvREFCNT_dec_NN(PL_warn_locale);
755 PL_warn_locale = NULL;
756 }
757 }
758 }
759 }
760
761#endif /* USE_LOCALE_CTYPE */
762
763}
764
765void
766Perl__warn_problematic_locale()
767{
768
769#ifdef USE_LOCALE_CTYPE
770
771 dTHX;
772
773 /* Internal-to-core function that outputs the message in PL_warn_locale,
774 * and then NULLS it. Should be called only through the macro
775 * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */
776
777 if (PL_warn_locale) {
778 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
779 SvPVX(PL_warn_locale),
780 0 /* dummy to avoid compiler warning */ );
781 SvREFCNT_dec_NN(PL_warn_locale);
782 PL_warn_locale = NULL;
783 }
784
785#endif
786
787}
788
789STATIC void
790S_new_collate(pTHX_ const char *newcoll)
791{
792
793#ifndef USE_LOCALE_COLLATE
794
795 PERL_UNUSED_ARG(newcoll);
796 PERL_UNUSED_CONTEXT;
797
798#else
799
800 /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
801 * core Perl this and that 'newcoll' is the name of the new locale.
802 *
803 * The design of locale collation is that every locale change is given an
804 * index 'PL_collation_ix'. The first time a string particpates in an
805 * operation that requires collation while locale collation is active, it
806 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
807 * magic includes the collation index, and the transformation of the string
808 * by strxfrm(), q.v. That transformation is used when doing comparisons,
809 * instead of the string itself. If a string changes, the magic is
810 * cleared. The next time the locale changes, the index is incremented,
811 * and so we know during a comparison that the transformation is not
812 * necessarily still valid, and so is recomputed. Note that if the locale
813 * changes enough times, the index could wrap (a U32), and it is possible
814 * that a transformation would improperly be considered valid, leading to
815 * an unlikely bug */
816
817 if (! newcoll) {
818 if (PL_collation_name) {
819 ++PL_collation_ix;
820 Safefree(PL_collation_name);
821 PL_collation_name = NULL;
822 }
823 PL_collation_standard = TRUE;
824 is_standard_collation:
825 PL_collxfrm_base = 0;
826 PL_collxfrm_mult = 2;
827 PL_in_utf8_COLLATE_locale = FALSE;
828 PL_strxfrm_NUL_replacement = '\0';
829 PL_strxfrm_max_cp = 0;
830 return;
831 }
832
833 /* If this is not the same locale as currently, set the new one up */
834 if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
835 ++PL_collation_ix;
836 Safefree(PL_collation_name);
837 PL_collation_name = stdize_locale(savepv(newcoll));
838 PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
839 if (PL_collation_standard) {
840 goto is_standard_collation;
841 }
842
843 PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
844 PL_strxfrm_NUL_replacement = '\0';
845 PL_strxfrm_max_cp = 0;
846
847 /* A locale collation definition includes primary, secondary, tertiary,
848 * etc. weights for each character. To sort, the primary weights are
849 * used, and only if they compare equal, then the secondary weights are
850 * used, and only if they compare equal, then the tertiary, etc.
851 *
852 * strxfrm() works by taking the input string, say ABC, and creating an
853 * output transformed string consisting of first the primary weights,
854 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
855 * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
856 * may not have weights at every level. In our example, let's say B
857 * doesn't have a tertiary weight, and A doesn't have a secondary
858 * weight. The constructed string is then going to be
859 * A¹B¹C¹ B²C² A³C³ ....
860 * This has the desired effect that strcmp() will look at the secondary
861 * or tertiary weights only if the strings compare equal at all higher
862 * priority weights. The spaces shown here, like in
863 * "A¹B¹C¹ A²B²C² "
864 * are not just for readability. In the general case, these must
865 * actually be bytes, which we will call here 'separator weights'; and
866 * they must be smaller than any other weight value, but since these
867 * are C strings, only the terminating one can be a NUL (some
868 * implementations may include a non-NUL separator weight just before
869 * the NUL). Implementations tend to reserve 01 for the separator
870 * weights. They are needed so that a shorter string's secondary
871 * weights won't be misconstrued as primary weights of a longer string,
872 * etc. By making them smaller than any other weight, the shorter
873 * string will sort first. (Actually, if all secondary weights are
874 * smaller than all primary ones, there is no need for a separator
875 * weight between those two levels, etc.)
876 *
877 * The length of the transformed string is roughly a linear function of
878 * the input string. It's not exactly linear because some characters
879 * don't have weights at all levels. When we call strxfrm() we have to
880 * allocate some memory to hold the transformed string. The
881 * calculations below try to find coefficients 'm' and 'b' for this
882 * locale so that m*x + b equals how much space we need, given the size
883 * of the input string in 'x'. If we calculate too small, we increase
884 * the size as needed, and call strxfrm() again, but it is better to
885 * get it right the first time to avoid wasted expensive string
886 * transformations. */
887
888 {
889 /* We use the string below to find how long the tranformation of it
890 * is. Almost all locales are supersets of ASCII, or at least the
891 * ASCII letters. We use all of them, half upper half lower,
892 * because if we used fewer, we might hit just the ones that are
893 * outliers in a particular locale. Most of the strings being
894 * collated will contain a preponderance of letters, and even if
895 * they are above-ASCII, they are likely to have the same number of
896 * weight levels as the ASCII ones. It turns out that digits tend
897 * to have fewer levels, and some punctuation has more, but those
898 * are relatively sparse in text, and khw believes this gives a
899 * reasonable result, but it could be changed if experience so
900 * dictates. */
901 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
902 char * x_longer; /* Transformed 'longer' */
903 Size_t x_len_longer; /* Length of 'x_longer' */
904
905 char * x_shorter; /* We also transform a substring of 'longer' */
906 Size_t x_len_shorter;
907
908 /* _mem_collxfrm() is used get the transformation (though here we
909 * are interested only in its length). It is used because it has
910 * the intelligence to handle all cases, but to work, it needs some
911 * values of 'm' and 'b' to get it started. For the purposes of
912 * this calculation we use a very conservative estimate of 'm' and
913 * 'b'. This assumes a weight can be multiple bytes, enough to
914 * hold any UV on the platform, and there are 5 levels, 4 weight
915 * bytes, and a trailing NUL. */
916 PL_collxfrm_base = 5;
917 PL_collxfrm_mult = 5 * sizeof(UV);
918
919 /* Find out how long the transformation really is */
920 x_longer = _mem_collxfrm(longer,
921 sizeof(longer) - 1,
922 &x_len_longer,
923
924 /* We avoid converting to UTF-8 in the
925 * called function by telling it the
926 * string is in UTF-8 if the locale is a
927 * UTF-8 one. Since the string passed
928 * here is invariant under UTF-8, we can
929 * claim it's UTF-8 even though it isn't.
930 * */
931 PL_in_utf8_COLLATE_locale);
932 Safefree(x_longer);
933
934 /* Find out how long the transformation of a substring of 'longer'
935 * is. Together the lengths of these transformations are
936 * sufficient to calculate 'm' and 'b'. The substring is all of
937 * 'longer' except the first character. This minimizes the chances
938 * of being swayed by outliers */
939 x_shorter = _mem_collxfrm(longer + 1,
940 sizeof(longer) - 2,
941 &x_len_shorter,
942 PL_in_utf8_COLLATE_locale);
943 Safefree(x_shorter);
944
945 /* If the results are nonsensical for this simple test, the whole
946 * locale definition is suspect. Mark it so that locale collation
947 * is not active at all for it. XXX Should we warn? */
948 if ( x_len_shorter == 0
949 || x_len_longer == 0
950 || x_len_shorter >= x_len_longer)
951 {
952 PL_collxfrm_mult = 0;
953 PL_collxfrm_base = 0;
954 }
955 else {
956 SSize_t base; /* Temporary */
957
958 /* We have both: m * strlen(longer) + b = x_len_longer
959 * m * strlen(shorter) + b = x_len_shorter;
960 * subtracting yields:
961 * m * (strlen(longer) - strlen(shorter))
962 * = x_len_longer - x_len_shorter
963 * But we have set things up so that 'shorter' is 1 byte smaller
964 * than 'longer'. Hence:
965 * m = x_len_longer - x_len_shorter
966 *
967 * But if something went wrong, make sure the multiplier is at
968 * least 1.
969 */
970 if (x_len_longer > x_len_shorter) {
971 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
972 }
973 else {
974 PL_collxfrm_mult = 1;
975 }
976
977 /* mx + b = len
978 * so: b = len - mx
979 * but in case something has gone wrong, make sure it is
980 * non-negative */
981 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
982 if (base < 0) {
983 base = 0;
984 }
985
986 /* Add 1 for the trailing NUL */
987 PL_collxfrm_base = base + 1;
988 }
989
990# ifdef DEBUGGING
991
992 if (DEBUG_L_TEST || debug_initialization) {
993 PerlIO_printf(Perl_debug_log,
994 "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
995 "x_len_longer=%zu,"
996 " collate multipler=%zu, collate base=%zu\n",
997 __FILE__, __LINE__,
998 PL_in_utf8_COLLATE_locale,
999 x_len_shorter, x_len_longer,
1000 PL_collxfrm_mult, PL_collxfrm_base);
1001 }
1002# endif
1003
1004 }
1005 }
1006
1007#endif /* USE_LOCALE_COLLATE */
1008
1009}
1010
1011#ifdef WIN32
1012
1013STATIC char *
1014S_win32_setlocale(pTHX_ int category, const char* locale)
1015{
1016 /* This, for Windows, emulates POSIX setlocale() behavior. There is no
1017 * difference between the two unless the input locale is "", which normally
1018 * means on Windows to get the machine default, which is set via the
1019 * computer's "Regional and Language Options" (or its current equivalent).
1020 * In POSIX, it instead means to find the locale from the user's
1021 * environment. This routine changes the Windows behavior to first look in
1022 * the environment, and, if anything is found, use that instead of going to
1023 * the machine default. If there is no environment override, the machine
1024 * default is used, by calling the real setlocale() with "".
1025 *
1026 * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
1027 * use the particular category's variable if set; otherwise to use the LANG
1028 * variable. */
1029
1030 bool override_LC_ALL = FALSE;
1031 char * result;
1032 unsigned int i;
1033
1034 if (locale && strEQ(locale, "")) {
1035
1036# ifdef LC_ALL
1037
1038 locale = PerlEnv_getenv("LC_ALL");
1039 if (! locale) {
1040 if (category == LC_ALL) {
1041 override_LC_ALL = TRUE;
1042 }
1043 else {
1044
1045# endif
1046
1047 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1048 if (category == categories[i]) {
1049 locale = PerlEnv_getenv(category_names[i]);
1050 goto found_locale;
1051 }
1052 }
1053
1054 locale = PerlEnv_getenv("LANG");
1055 if (! locale) {
1056 locale = "";
1057 }
1058
1059 found_locale: ;
1060
1061# ifdef LC_ALL
1062
1063 }
1064 }
1065
1066# endif
1067
1068 }
1069
1070 result = setlocale(category, locale);
1071 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
1072 setlocale_debug_string(category, locale, result)));
1073
1074 if (! override_LC_ALL) {
1075 return result;
1076 }
1077
1078 /* Here the input category was LC_ALL, and we have set it to what is in the
1079 * LANG variable or the system default if there is no LANG. But these have
1080 * lower priority than the other LC_foo variables, so override it for each
1081 * one that is set. (If they are set to "", it means to use the same thing
1082 * we just set LC_ALL to, so can skip) */
1083
1084 for (i = 0; i < LC_ALL_INDEX; i++) {
1085 result = PerlEnv_getenv(category_names[i]);
1086 if (result && strNE(result, "")) {
1087 setlocale(categories[i], result);
1088 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
1089 __FILE__, __LINE__,
1090 setlocale_debug_string(categories[i], result, "not captured")));
1091 }
1092 }
1093
1094 result = setlocale(LC_ALL, NULL);
1095 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
1096 __FILE__, __LINE__,
1097 setlocale_debug_string(LC_ALL, NULL, result)));
1098
1099 return result;
1100}
1101
1102#endif
1103
1104char *
1105Perl_setlocale(int category, const char * locale)
1106{
1107 /* This wraps POSIX::setlocale() */
1108
1109 char * retval;
1110 char * newlocale;
1111 dTHX;
1112
1113#ifdef USE_LOCALE_NUMERIC
1114
1115 /* A NULL locale means only query what the current one is. We have the
1116 * LC_NUMERIC name saved, because we are normally switched into the C
1117 * locale for it. For an LC_ALL query, switch back to get the correct
1118 * results. All other categories don't require special handling */
1119 if (locale == NULL) {
1120 if (category == LC_NUMERIC) {
1121 return savepv(PL_numeric_name);
1122 }
1123
1124# ifdef LC_ALL
1125
1126 else if (category == LC_ALL && ! PL_numeric_underlying) {
1127
1128 SET_NUMERIC_UNDERLYING();
1129 }
1130
1131# endif
1132
1133 }
1134
1135#endif
1136
1137 /* Save retval since subsequent setlocale() calls may overwrite it. */
1138 retval = savepv(do_setlocale_r(category, locale));
1139
1140 DEBUG_L(PerlIO_printf(Perl_debug_log,
1141 "%s:%d: %s\n", __FILE__, __LINE__,
1142 setlocale_debug_string(category, locale, retval)));
1143 if (! retval) {
1144 /* Should never happen that a query would return an error, but be
1145 * sure and reset to C locale */
1146 if (locale == 0) {
1147 SET_NUMERIC_STANDARD();
1148 }
1149
1150 return NULL;
1151 }
1152
1153 /* If locale == NULL, we are just querying the state, but may have switched
1154 * to NUMERIC_UNDERLYING. Switch back before returning. */
1155 if (locale == NULL) {
1156 SET_NUMERIC_STANDARD();
1157 return retval;
1158 }
1159
1160 /* Now that have switched locales, we have to update our records to
1161 * correspond. */
1162
1163 switch (category) {
1164
1165#ifdef USE_LOCALE_CTYPE
1166
1167 case LC_CTYPE:
1168 new_ctype(retval);
1169 break;
1170
1171#endif
1172#ifdef USE_LOCALE_COLLATE
1173
1174 case LC_COLLATE:
1175 new_collate(retval);
1176 break;
1177
1178#endif
1179#ifdef USE_LOCALE_NUMERIC
1180
1181 case LC_NUMERIC:
1182 new_numeric(retval);
1183 break;
1184
1185#endif
1186#ifdef LC_ALL
1187
1188 case LC_ALL:
1189
1190 /* LC_ALL updates all the things we care about. The values may not
1191 * be the same as 'retval', as the locale "" may have set things
1192 * individually */
1193
1194# ifdef USE_LOCALE_CTYPE
1195
1196 newlocale = do_setlocale_c(LC_CTYPE, NULL);
1197 new_ctype(newlocale);
1198
1199# endif /* USE_LOCALE_CTYPE */
1200# ifdef USE_LOCALE_COLLATE
1201
1202 newlocale = do_setlocale_c(LC_COLLATE, NULL);
1203 new_collate(newlocale);
1204
1205# endif
1206# ifdef USE_LOCALE_NUMERIC
1207
1208 newlocale = do_setlocale_c(LC_NUMERIC, NULL);
1209 new_numeric(newlocale);
1210
1211# endif /* USE_LOCALE_NUMERIC */
1212#endif /* LC_ALL */
1213
1214 default:
1215 break;
1216 }
1217
1218 return retval;
1219
1220
1221}
1222
1223PERL_STATIC_INLINE const char *
1224S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset)
1225{
1226 /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size',
1227 * growing it if necessary */
1228
1229 const Size_t string_size = strlen(string) + offset + 1;
1230
1231 PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
1232
1233 if (*buf_size == 0) {
1234 Newx(*buf, string_size, char);
1235 *buf_size = string_size;
1236 }
1237 else if (string_size > *buf_size) {
1238 Renew(*buf, string_size, char);
1239 *buf_size = string_size;
1240 }
1241
1242 Copy(string, *buf + offset, string_size - offset, char);
1243 return *buf;
1244}
1245
1246/*
1247
1248=head1 Locale-related functions and macros
1249
1250=for apidoc Perl_langinfo
1251
1252This is an (almost ª) drop-in replacement for the system C<L<nl_langinfo(3)>>,
1253taking the same C<item> parameter values, and returning the same information.
1254But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
1255of Perl's locale handling from your code, and can be used on systems that lack
1256a native C<nl_langinfo>.
1257
1258Expanding on these:
1259
1260=over
1261
1262=item *
1263
1264It delivers the correct results for the C<RADIXCHAR> and C<THOUSESEP> items,
1265without you having to write extra code. The reason for the extra code would be
1266because these are from the C<LC_NUMERIC> locale category, which is normally
1267kept set to the C locale by Perl, no matter what the underlying locale is
1268supposed to be, and so to get the expected results, you have to temporarily
1269toggle into the underlying locale, and later toggle back. (You could use
1270plain C<nl_langinfo> and C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this
1271but then you wouldn't get the other advantages of C<Perl_langinfo()>; not
1272keeping C<LC_NUMERIC> in the C locale would break a lot of CPAN, which is
1273expecting the radix (decimal point) character to be a dot.)
1274
1275=item *
1276
1277Depending on C<item>, it works on systems that don't have C<nl_langinfo>, hence
1278makes your code more portable. Of the fifty-some possible items specified by
1279the POSIX 2008 standard,
1280L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
1281only two are completely unimplemented. It uses various techniques to recover
1282the other items, including calling C<L<localeconv(3)>>, and C<L<strftime(3)>>,
1283both of which are specified in C89, so should be always be available. Later
1284C<strftime()> versions have additional capabilities; C<""> is returned for
1285those not available on your system.
1286
1287The details for those items which may differ from what this emulation returns
1288and what a native C<nl_langinfo()> would return are:
1289
1290=over
1291
1292=item C<CODESET>
1293
1294=item C<ERA>
1295
1296Unimplemented, so returns C<"">.
1297
1298=item C<YESEXPR>
1299
1300=item C<YESSTR>
1301
1302=item C<NOEXPR>
1303
1304=item C<NOSTR>
1305
1306Only the values for English are returned. C<YESSTR> and C<NOSTR> have been
1307removed from POSIX 2008, and are retained for backwards compatibility. Your
1308platform's C<nl_langinfo> may not support them.
1309
1310=item C<D_FMT>
1311
1312Always evaluates to C<%x>, the locale's appropriate date representation.
1313
1314=item C<T_FMT>
1315
1316Always evaluates to C<%X>, the locale's appropriate time representation.
1317
1318=item C<D_T_FMT>
1319
1320Always evaluates to C<%c>, the locale's appropriate date and time
1321representation.
1322
1323=item C<CRNCYSTR>
1324
1325The return may be incorrect for those rare locales where the currency symbol
1326replaces the radix character.
1327Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1328to work differently.
1329
1330=item C<ALT_DIGITS>
1331
1332Currently this gives the same results as Linux does.
1333Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1334to work differently.
1335
1336=item C<ERA_D_FMT>
1337
1338=item C<ERA_T_FMT>
1339
1340=item C<ERA_D_T_FMT>
1341
1342=item C<T_FMT_AMPM>
1343
1344These are derived by using C<strftime()>, and not all versions of that function
1345know about them. C<""> is returned for these on such systems.
1346
1347=back
1348
1349When using C<Perl_langinfo> on systems that don't have a native
1350C<nl_langinfo()>, you must
1351
1352 #include "perl_langinfo.h"
1353
1354before the C<perl.h> C<#include>. You can replace your C<langinfo.h>
1355C<#include> with this one. (Doing it this way keeps out the symbols that plain
1356C<langinfo.h> imports into the namespace for code that doesn't need it.)
1357
1358You also should not use the bare C<langinfo.h> item names, but should preface
1359them with C<PERL_>, so use C<PERL_RADIXCHAR> instead of plain C<RADIXCHAR>.
1360The C<PERL_I<foo>> versions will also work for this function on systems that do
1361have a native C<nl_langinfo>.
1362
1363=item *
1364
1365It is thread-friendly, returning its result in a buffer that won't be
1366overwritten by another thread, so you don't have to code for that possibility.
1367The buffer can be overwritten by the next call to C<nl_langinfo> or
1368C<Perl_langinfo> in the same thread.
1369
1370=item *
1371
1372ª It returns S<C<const char *>>, whereas plain C<nl_langinfo()> returns S<C<char
1373*>>, but you are (only by documentation) forbidden to write into the buffer.
1374By declaring this C<const>, the compiler enforces this restriction. The extra
1375C<const> is why this isn't an unequivocal drop-in replacement for
1376C<nl_langinfo>.
1377
1378=back
1379
1380The original impetus for C<Perl_langinfo()> was so that code that needs to
1381find out the current currency symbol, floating point radix character, or digit
1382grouping separator can use, on all systems, the simpler and more
1383thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
1384pain to make thread-friendly. For other fields returned by C<localeconv>, it
1385is better to use the methods given in L<perlcall> to call
1386L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
1387
1388=cut
1389
1390*/
1391
1392const char *
1393#ifdef HAS_NL_LANGINFO
1394Perl_langinfo(const nl_item item)
1395#else
1396Perl_langinfo(const int item)
1397#endif
1398{
1399 return my_nl_langinfo(item, TRUE);
1400}
1401
1402const char *
1403#ifdef HAS_NL_LANGINFO
1404S_my_nl_langinfo(const nl_item item, bool toggle)
1405#else
1406S_my_nl_langinfo(const int item, bool toggle)
1407#endif
1408{
1409 dTHX;
1410
1411 /* We only need to toggle into the underlying LC_NUMERIC locale for these
1412 * two items, and only if not already there */
1413 if (toggle && (( item != PERL_RADIXCHAR && item != PERL_THOUSEP)
1414 || PL_numeric_underlying))
1415 {
1416 toggle = FALSE;
1417 }
1418
1419#if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
1420#if ! defined(HAS_POSIX_2008_LOCALE)
1421
1422 /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
1423 * for those items dependent on it. This must be copied to a buffer before
1424 * switching back, as some systems destroy the buffer when setlocale() is
1425 * called */
1426
1427 {
1428 DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
1429
1430 if (toggle) {
1431 STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
1432 }
1433
1434 LOCALE_LOCK; /* Prevent interference from another thread executing
1435 this code section (the only call to nl_langinfo in
1436 the core) */
1437
1438 save_to_buffer(nl_langinfo(item), &PL_langinfo_buf,
1439 &PL_langinfo_bufsize, 0);
1440
1441 LOCALE_UNLOCK;
1442
1443 if (toggle) {
1444 RESTORE_LC_NUMERIC();
1445 }
1446 }
1447
1448# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */
1449
1450 {
1451 bool do_free = FALSE;
1452 locale_t cur = uselocale((locale_t) 0);
1453
1454 if (cur == LC_GLOBAL_LOCALE) {
1455 cur = duplocale(LC_GLOBAL_LOCALE);
1456 do_free = TRUE;
1457 }
1458
1459 if (toggle) {
1460 if (PL_underlying_numeric_obj) {
1461 cur = PL_underlying_numeric_obj;
1462 }
1463 else {
1464 cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
1465 do_free = TRUE;
1466 }
1467 }
1468
1469 save_to_buffer(nl_langinfo_l(item, cur),
1470 &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1471 if (do_free) {
1472 freelocale(cur);
1473 }
1474 }
1475
1476# endif
1477
1478 if (strEQ(PL_langinfo_buf, "")) {
1479 if (item == PERL_YESSTR) {
1480 return "yes";
1481 }
1482 if (item == PERL_NOSTR) {
1483 return "no";
1484 }
1485 }
1486
1487 return PL_langinfo_buf;
1488
1489#else /* Below, emulate nl_langinfo as best we can */
1490
1491 {
1492
1493# ifdef HAS_LOCALECONV
1494
1495 const struct lconv* lc;
1496 DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
1497
1498# endif
1499# ifdef HAS_STRFTIME
1500
1501 struct tm tm;
1502 bool return_format = FALSE; /* Return the %format, not the value */
1503 const char * format;
1504
1505# endif
1506
1507 /* We copy the results to a per-thread buffer, even if not
1508 * multi-threaded. This is in part to simplify this code, and partly
1509 * because we need a buffer anyway for strftime(), and partly because a
1510 * call of localeconv() could otherwise wipe out the buffer, and the
1511 * programmer would not be expecting this, as this is a nl_langinfo()
1512 * substitute after all, so s/he might be thinking their localeconv()
1513 * is safe until another localeconv() call. */
1514
1515 switch (item) {
1516 Size_t len;
1517 const char * retval;
1518
1519 /* These 2 are unimplemented */
1520 case PERL_CODESET:
1521 case PERL_ERA: /* For use with strftime() %E modifier */
1522
1523 default:
1524 return "";
1525
1526 /* We use only an English set, since we don't know any more */
1527 case PERL_YESEXPR: return "^[+1yY]";
1528 case PERL_YESSTR: return "yes";
1529 case PERL_NOEXPR: return "^[-0nN]";
1530 case PERL_NOSTR: return "no";
1531
1532# ifdef HAS_LOCALECONV
1533
1534 case PERL_CRNCYSTR:
1535
1536 /* We don't bother with localeconv_l() because any system that
1537 * has it is likely to also have nl_langinfo() */
1538
1539 LOCALE_LOCK; /* Prevent interference with other threads
1540 using localeconv() */
1541
1542 lc = localeconv();
1543 if ( ! lc
1544 || ! lc->currency_symbol
1545 || strEQ("", lc->currency_symbol))
1546 {
1547 LOCALE_UNLOCK;
1548 return "";
1549 }
1550
1551 /* Leave the first spot empty to be filled in below */
1552 save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
1553 &PL_langinfo_bufsize, 1);
1554 if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
1555 { /* khw couldn't figure out how the localedef specifications
1556 would show that the $ should replace the radix; this is
1557 just a guess as to how it might work.*/
1558 *PL_langinfo_buf = '.';
1559 }
1560 else if (lc->p_cs_precedes) {
1561 *PL_langinfo_buf = '-';
1562 }
1563 else {
1564 *PL_langinfo_buf = '+';
1565 }
1566
1567 LOCALE_UNLOCK;
1568 break;
1569
1570 case PERL_RADIXCHAR:
1571 case PERL_THOUSEP:
1572
1573 if (toggle) {
1574 STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
1575 }
1576
1577 LOCALE_LOCK; /* Prevent interference with other threads
1578 using localeconv() */
1579
1580 lc = localeconv();
1581 if (! lc) {
1582 retval = "";
1583 }
1584 else {
1585 retval = (item == PERL_RADIXCHAR)
1586 ? lc->decimal_point
1587 : lc->thousands_sep;
1588 if (! retval) {
1589 retval = "";
1590 }
1591 }
1592
1593 save_to_buffer(retval, &PL_langinfo_buf,
1594 &PL_langinfo_bufsize, 0);
1595
1596 LOCALE_UNLOCK;
1597
1598 if (toggle) {
1599 RESTORE_LC_NUMERIC();
1600 }
1601
1602 break;
1603
1604# endif
1605# ifdef HAS_STRFTIME
1606
1607 /* These are defined by C89, so we assume that strftime supports
1608 * them, and so are returned unconditionally; they may not be what
1609 * the locale actually says, but should give good enough results
1610 * for someone using them as formats (as opposed to trying to parse
1611 * them to figure out what the locale says). The other format
1612 * items are actually tested to verify they work on the platform */
1613 case PERL_D_FMT: return "%x";
1614 case PERL_T_FMT: return "%X";
1615 case PERL_D_T_FMT: return "%c";
1616
1617 /* These formats are only available in later strfmtime's */
1618 case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT:
1619 case PERL_T_FMT_AMPM:
1620
1621 /* The rest can be gotten from most versions of strftime(). */
1622 case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3:
1623 case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6:
1624 case PERL_ABDAY_7:
1625 case PERL_ALT_DIGITS:
1626 case PERL_AM_STR: case PERL_PM_STR:
1627 case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3:
1628 case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6:
1629 case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9:
1630 case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12:
1631 case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4:
1632 case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7:
1633 case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4:
1634 case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8:
1635 case PERL_MON_9: case PERL_MON_10: case PERL_MON_11:
1636 case PERL_MON_12:
1637
1638 LOCALE_LOCK;
1639
1640 init_tm(&tm); /* Precaution against core dumps */
1641 tm.tm_sec = 30;
1642 tm.tm_min = 30;
1643 tm.tm_hour = 6;
1644 tm.tm_year = 2017 - 1900;
1645 tm.tm_wday = 0;
1646 tm.tm_mon = 0;
1647 switch (item) {
1648 default:
1649 LOCALE_UNLOCK;
1650 Perl_croak(aTHX_
1651 "panic: %s: %d: switch case: %d problem",
1652 __FILE__, __LINE__, item);
1653 NOT_REACHED; /* NOTREACHED */
1654
1655 case PERL_PM_STR: tm.tm_hour = 18;
1656 case PERL_AM_STR:
1657 format = "%p";
1658 break;
1659
1660 case PERL_ABDAY_7: tm.tm_wday++;
1661 case PERL_ABDAY_6: tm.tm_wday++;
1662 case PERL_ABDAY_5: tm.tm_wday++;
1663 case PERL_ABDAY_4: tm.tm_wday++;
1664 case PERL_ABDAY_3: tm.tm_wday++;
1665 case PERL_ABDAY_2: tm.tm_wday++;
1666 case PERL_ABDAY_1:
1667 format = "%a";
1668 break;
1669
1670 case PERL_DAY_7: tm.tm_wday++;
1671 case PERL_DAY_6: tm.tm_wday++;
1672 case PERL_DAY_5: tm.tm_wday++;
1673 case PERL_DAY_4: tm.tm_wday++;
1674 case PERL_DAY_3: tm.tm_wday++;
1675 case PERL_DAY_2: tm.tm_wday++;
1676 case PERL_DAY_1:
1677 format = "%A";
1678 break;
1679
1680 case PERL_ABMON_12: tm.tm_mon++;
1681 case PERL_ABMON_11: tm.tm_mon++;
1682 case PERL_ABMON_10: tm.tm_mon++;
1683 case PERL_ABMON_9: tm.tm_mon++;
1684 case PERL_ABMON_8: tm.tm_mon++;
1685 case PERL_ABMON_7: tm.tm_mon++;
1686 case PERL_ABMON_6: tm.tm_mon++;
1687 case PERL_ABMON_5: tm.tm_mon++;
1688 case PERL_ABMON_4: tm.tm_mon++;
1689 case PERL_ABMON_3: tm.tm_mon++;
1690 case PERL_ABMON_2: tm.tm_mon++;
1691 case PERL_ABMON_1:
1692 format = "%b";
1693 break;
1694
1695 case PERL_MON_12: tm.tm_mon++;
1696 case PERL_MON_11: tm.tm_mon++;
1697 case PERL_MON_10: tm.tm_mon++;
1698 case PERL_MON_9: tm.tm_mon++;
1699 case PERL_MON_8: tm.tm_mon++;
1700 case PERL_MON_7: tm.tm_mon++;
1701 case PERL_MON_6: tm.tm_mon++;
1702 case PERL_MON_5: tm.tm_mon++;
1703 case PERL_MON_4: tm.tm_mon++;
1704 case PERL_MON_3: tm.tm_mon++;
1705 case PERL_MON_2: tm.tm_mon++;
1706 case PERL_MON_1:
1707 format = "%B";
1708 break;
1709
1710 case PERL_T_FMT_AMPM:
1711 format = "%r";
1712 return_format = TRUE;
1713 break;
1714
1715 case PERL_ERA_D_FMT:
1716 format = "%Ex";
1717 return_format = TRUE;
1718 break;
1719
1720 case PERL_ERA_T_FMT:
1721 format = "%EX";
1722 return_format = TRUE;
1723 break;
1724
1725 case PERL_ERA_D_T_FMT:
1726 format = "%Ec";
1727 return_format = TRUE;
1728 break;
1729
1730 case PERL_ALT_DIGITS:
1731 tm.tm_wday = 0;
1732 format = "%Ow"; /* Find the alternate digit for 0 */
1733 break;
1734 }
1735
1736 /* We can't use my_strftime() because it doesn't look at
1737 * tm_wday */
1738 while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
1739 format, &tm))
1740 {
1741 /* A zero return means one of:
1742 * a) there wasn't enough space in PL_langinfo_buf
1743 * b) the format, like a plain %p, returns empty
1744 * c) it was an illegal format, though some
1745 * implementations of strftime will just return the
1746 * illegal format as a plain character sequence.
1747 *
1748 * To quickly test for case 'b)', try again but precede
1749 * the format with a plain character. If that result is
1750 * still empty, the problem is either 'a)' or 'c)' */
1751
1752 Size_t format_size = strlen(format) + 1;
1753 Size_t mod_size = format_size + 1;
1754 char * mod_format;
1755 char * temp_result;
1756
1757 Newx(mod_format, mod_size, char);
1758 Newx(temp_result, PL_langinfo_bufsize, char);
1759 *mod_format = ' ';
1760 my_strlcpy(mod_format + 1, format, mod_size);
1761 len = strftime(temp_result,
1762 PL_langinfo_bufsize,
1763 mod_format, &tm);
1764 Safefree(mod_format);
1765 Safefree(temp_result);
1766
1767 /* If 'len' is non-zero, it means that we had a case like
1768 * %p which means the current locale doesn't use a.m. or
1769 * p.m., and that is valid */
1770 if (len == 0) {
1771
1772 /* Here, still didn't work. If we get well beyond a
1773 * reasonable size, bail out to prevent an infinite
1774 * loop. */
1775
1776 if (PL_langinfo_bufsize > 100 * format_size) {
1777 *PL_langinfo_buf = '\0';
1778 }
1779 else {
1780 /* Double the buffer size to retry; Add 1 in case
1781 * original was 0, so we aren't stuck at 0. */
1782 PL_langinfo_bufsize *= 2;
1783 PL_langinfo_bufsize++;
1784 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
1785 continue;
1786 }
1787 }
1788
1789 break;
1790 }
1791
1792 /* Here, we got a result.
1793 *
1794 * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
1795 * alternate format for wday 0. If the value is the same as
1796 * the normal 0, there isn't an alternate, so clear the buffer.
1797 * */
1798 if ( item == PERL_ALT_DIGITS
1799 && strEQ(PL_langinfo_buf, "0"))
1800 {
1801 *PL_langinfo_buf = '\0';
1802 }
1803
1804 /* ALT_DIGITS is problematic. Experiments on it showed that
1805 * strftime() did not always work properly when going from
1806 * alt-9 to alt-10. Only a few locales have this item defined,
1807 * and in all of them on Linux that khw was able to find,
1808 * nl_langinfo() merely returned the alt-0 character, possibly
1809 * doubled. Most Unicode digits are in blocks of 10
1810 * consecutive code points, so that is sufficient information
1811 * for those scripts, as we can infer alt-1, alt-2, .... But
1812 * for a Japanese locale, a CJK ideographic 0 is returned, and
1813 * the CJK digits are not in code point order, so you can't
1814 * really infer anything. The localedef for this locale did
1815 * specify the succeeding digits, so that strftime() works
1816 * properly on them, without needing to infer anything. But
1817 * the nl_langinfo() return did not give sufficient information
1818 * for the caller to understand what's going on. So until
1819 * there is evidence that it should work differently, this
1820 * returns the alt-0 string for ALT_DIGITS.
1821 *
1822 * wday was chosen because its range is all a single digit.
1823 * Things like tm_sec have two digits as the minimum: '00' */
1824
1825 LOCALE_UNLOCK;
1826
1827 /* If to return the format, not the value, overwrite the buffer
1828 * with it. But some strftime()s will keep the original format
1829 * if illegal, so change those to "" */
1830 if (return_format) {
1831 if (strEQ(PL_langinfo_buf, format)) {
1832 *PL_langinfo_buf = '\0';
1833 }
1834 else {
1835 save_to_buffer(format, &PL_langinfo_buf,
1836 &PL_langinfo_bufsize, 0);
1837 }
1838 }
1839
1840 break;
1841
1842# endif
1843
1844 }
1845 }
1846
1847 return PL_langinfo_buf;
1848
1849#endif
1850
1851}
1852
1853/*
1854 * Initialize locale awareness.
1855 */
1856int
1857Perl_init_i18nl10n(pTHX_ int printwarn)
1858{
1859 /* printwarn is
1860 *
1861 * 0 if not to output warning when setup locale is bad
1862 * 1 if to output warning based on value of PERL_BADLANG
1863 * >1 if to output regardless of PERL_BADLANG
1864 *
1865 * returns
1866 * 1 = set ok or not applicable,
1867 * 0 = fallback to a locale of lower priority
1868 * -1 = fallback to all locales failed, not even to the C locale
1869 *
1870 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
1871 * set, debugging information is output.
1872 *
1873 * This looks more complicated than it is, mainly due to the #ifdefs.
1874 *
1875 * We try to set LC_ALL to the value determined by the environment. If
1876 * there is no LC_ALL on this platform, we try the individual categories we
1877 * know about. If this works, we are done.
1878 *
1879 * But if it doesn't work, we have to do something else. We search the
1880 * environment variables ourselves instead of relying on the system to do
1881 * it. We look at, in order, LC_ALL, LANG, a system default locale (if we
1882 * think there is one), and the ultimate fallback "C". This is all done in
1883 * the same loop as above to avoid duplicating code, but it makes things
1884 * more complex. The 'trial_locales' array is initialized with just one
1885 * element; it causes the behavior described in the paragraph above this to
1886 * happen. If that fails, we add elements to 'trial_locales', and do extra
1887 * loop iterations to cause the behavior described in this paragraph.
1888 *
1889 * On Ultrix, the locale MUST come from the environment, so there is
1890 * preliminary code to set it. I (khw) am not sure that it is necessary,
1891 * and that this couldn't be folded into the loop, but barring any real
1892 * platforms to test on, it's staying as-is
1893 *
1894 * A slight complication is that in embedded Perls, the locale may already
1895 * be set-up, and we don't want to get it from the normal environment
1896 * variables. This is handled by having a special environment variable
1897 * indicate we're in this situation. We simply set setlocale's 2nd
1898 * parameter to be a NULL instead of "". That indicates to setlocale that
1899 * it is not to change anything, but to return the current value,
1900 * effectively initializing perl's db to what the locale already is.
1901 *
1902 * We play the same trick with NULL if a LC_ALL succeeds. We call
1903 * setlocale() on the individual categores with NULL to get their existing
1904 * values for our db, instead of trying to change them.
1905 * */
1906
1907 int ok = 1;
1908
1909#ifndef USE_LOCALE
1910
1911 PERL_UNUSED_ARG(printwarn);
1912
1913#else /* USE_LOCALE */
1914# ifdef __GLIBC__
1915
1916 const char * const language = savepv(PerlEnv_getenv("LANGUAGE"));
1917
1918# endif
1919
1920 /* NULL uses the existing already set up locale */
1921 const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
1922 ? NULL
1923 : "";
1924 const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */
1925 unsigned int trial_locales_count;
1926 const char * const lc_all = savepv(PerlEnv_getenv("LC_ALL"));
1927 const char * const lang = savepv(PerlEnv_getenv("LANG"));
1928 bool setlocale_failure = FALSE;
1929 unsigned int i;
1930
1931 /* A later getenv() could zap this, so only use here */
1932 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
1933
1934 const bool locwarn = (printwarn > 1
1935 || ( printwarn
1936 && ( ! bad_lang_use_once
1937 || (
1938 /* disallow with "" or "0" */
1939 *bad_lang_use_once
1940 && strNE("0", bad_lang_use_once)))));
1941
1942 /* setlocale() return vals; not copied so must be looked at immediately */
1943 const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
1944
1945 /* current locale for given category; should have been copied so aren't
1946 * volatile */
1947 const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
1948
1949# ifdef WIN32
1950
1951 /* In some systems you can find out the system default locale
1952 * and use that as the fallback locale. */
1953# define SYSTEM_DEFAULT_LOCALE
1954# endif
1955# ifdef SYSTEM_DEFAULT_LOCALE
1956
1957 const char *system_default_locale = NULL;
1958
1959# endif
1960
1961# ifndef DEBUGGING
1962# define DEBUG_LOCALE_INIT(a,b,c)
1963# else
1964
1965 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
1966
1967# define DEBUG_LOCALE_INIT(category, locale, result) \
1968 STMT_START { \
1969 if (debug_initialization) { \
1970 PerlIO_printf(Perl_debug_log, \
1971 "%s:%d: %s\n", \
1972 __FILE__, __LINE__, \
1973 setlocale_debug_string(category, \
1974 locale, \
1975 result)); \
1976 } \
1977 } STMT_END
1978
1979/* Make sure the parallel arrays are properly set up */
1980# ifdef USE_LOCALE_NUMERIC
1981 assert(categories[LC_NUMERIC_INDEX] == LC_NUMERIC);
1982 assert(strEQ(category_names[LC_NUMERIC_INDEX], "LC_NUMERIC"));
1983# endif
1984# ifdef USE_LOCALE_CTYPE
1985 assert(categories[LC_CTYPE_INDEX] == LC_CTYPE);
1986 assert(strEQ(category_names[LC_CTYPE_INDEX], "LC_CTYPE"));
1987# endif
1988# ifdef USE_LOCALE_COLLATE
1989 assert(categories[LC_COLLATE_INDEX] == LC_COLLATE);
1990 assert(strEQ(category_names[LC_COLLATE_INDEX], "LC_COLLATE"));
1991# endif
1992# ifdef USE_LOCALE_TIME
1993 assert(categories[LC_TIME_INDEX] == LC_TIME);
1994 assert(strEQ(category_names[LC_TIME_INDEX], "LC_TIME"));
1995# endif
1996# ifdef USE_LOCALE_MESSAGES
1997 assert(categories[LC_MESSAGES_INDEX] == LC_MESSAGES);
1998 assert(strEQ(category_names[LC_MESSAGES_INDEX], "LC_MESSAGES"));
1999# endif
2000# ifdef USE_LOCALE_MONETARY
2001 assert(categories[LC_MONETARY_INDEX] == LC_MONETARY);
2002 assert(strEQ(category_names[LC_MONETARY_INDEX], "LC_MONETARY"));
2003# endif
2004# ifdef USE_LOCALE_ADDRESS
2005 assert(categories[LC_ADDRESS_INDEX] == LC_ADDRESS);
2006 assert(strEQ(category_names[LC_ADDRESS_INDEX], "LC_ADDRESS"));
2007# endif
2008# ifdef USE_LOCALE_IDENTIFICATION
2009 assert(categories[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION);
2010 assert(strEQ(category_names[LC_IDENTIFICATION_INDEX], "LC_IDENTIFICATION"));
2011# endif
2012# ifdef USE_LOCALE_MEASUREMENT
2013 assert(categories[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT);
2014 assert(strEQ(category_names[LC_MEASUREMENT_INDEX], "LC_MEASUREMENT"));
2015# endif
2016# ifdef USE_LOCALE_PAPER
2017 assert(categories[LC_PAPER_INDEX] == LC_PAPER);
2018 assert(strEQ(category_names[LC_PAPER_INDEX], "LC_PAPER"));
2019# endif
2020# ifdef USE_LOCALE_TELEPHONE
2021 assert(categories[LC_TELEPHONE_INDEX] == LC_TELEPHONE);
2022 assert(strEQ(category_names[LC_TELEPHONE_INDEX], "LC_TELEPHONE"));
2023# endif
2024# ifdef LC_ALL
2025 assert(categories[LC_ALL_INDEX] == LC_ALL);
2026 assert(strEQ(category_names[LC_ALL_INDEX], "LC_ALL"));
2027 assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX);
2028# endif
2029# endif /* DEBUGGING */
2030
2031 /* Initialize the cache of the program's UTF-8ness for the always known
2032 * locales C and POSIX */
2033 my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness,
2034 sizeof(PL_locale_utf8ness));
2035
2036 PL_numeric_radix_sv = newSVpvs(".");
2037
2038# ifdef LOCALE_ENVIRON_REQUIRED
2039
2040 /*
2041 * Ultrix setlocale(..., "") fails if there are no environment
2042 * variables from which to get a locale name.
2043 */
2044
2045# ifndef LC_ALL
2046# error Ultrix without LC_ALL not implemented
2047# else
2048
2049 {
2050 bool done = FALSE;
2051 if (lang) {
2052 sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, setlocale_init);
2053 DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result[LC_ALL_INDEX]);
2054 if (sl_result[LC_ALL_INDEX])
2055 done = TRUE;
2056 else
2057 setlocale_failure = TRUE;
2058 }
2059 if (! setlocale_failure) {
2060 const char * locale_param;
2061 for (i = 0; i < LC_ALL_INDEX; i++) {
2062 locale_param = (! done && (lang || PerlEnv_getenv(category_names[i])))
2063 ? setlocale_init
2064 : NULL;
2065 sl_result[i] = do_setlocale_r(categories[i], locale_param);
2066 if (! sl_result[i]) {
2067 setlocale_failure = TRUE;
2068 }
2069 DEBUG_LOCALE_INIT(categories[i], locale_param, sl_result[i]);
2070 }
2071 }
2072 }
2073
2074# endif /* LC_ALL */
2075# endif /* LOCALE_ENVIRON_REQUIRED */
2076
2077 /* We try each locale in the list until we get one that works, or exhaust
2078 * the list. Normally the loop is executed just once. But if setting the
2079 * locale fails, inside the loop we add fallback trials to the array and so
2080 * will execute the loop multiple times */
2081 trial_locales[0] = setlocale_init;
2082 trial_locales_count = 1;
2083
2084 for (i= 0; i < trial_locales_count; i++) {
2085 const char * trial_locale = trial_locales[i];
2086
2087 if (i > 0) {
2088
2089 /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
2090 * when i==0, but I (khw) don't think that behavior makes much
2091 * sense */
2092 setlocale_failure = FALSE;
2093
2094# ifdef SYSTEM_DEFAULT_LOCALE
2095# ifdef WIN32 /* Note that assumes Win32 has LC_ALL */
2096
2097 /* On Windows machines, an entry of "" after the 0th means to use
2098 * the system default locale, which we now proceed to get. */
2099 if (strEQ(trial_locale, "")) {
2100 unsigned int j;
2101
2102 /* Note that this may change the locale, but we are going to do
2103 * that anyway just below */
2104 system_default_locale = do_setlocale_c(LC_ALL, "");
2105 DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
2106
2107 /* Skip if invalid or if it's already on the list of locales to
2108 * try */
2109 if (! system_default_locale) {
2110 goto next_iteration;
2111 }
2112 for (j = 0; j < trial_locales_count; j++) {
2113 if (strEQ(system_default_locale, trial_locales[j])) {
2114 goto next_iteration;
2115 }
2116 }
2117
2118 trial_locale = system_default_locale;
2119 }
2120# else
2121# error SYSTEM_DEFAULT_LOCALE only implemented for Win32
2122# endif
2123# endif /* SYSTEM_DEFAULT_LOCALE */
2124
2125 } /* For i > 0 */
2126
2127# ifdef LC_ALL
2128
2129 sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, trial_locale);
2130 DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result[LC_ALL_INDEX]);
2131 if (! sl_result[LC_ALL_INDEX]) {
2132 setlocale_failure = TRUE;
2133 }
2134 else {
2135 /* Since LC_ALL succeeded, it should have changed all the other
2136 * categories it can to its value; so we massage things so that the
2137 * setlocales below just return their category's current values.
2138 * This adequately handles the case in NetBSD where LC_COLLATE may
2139 * not be defined for a locale, and setting it individually will
2140 * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
2141 * the POSIX locale. */
2142 trial_locale = NULL;
2143 }
2144
2145# endif /* LC_ALL */
2146
2147 if (! setlocale_failure) {
2148 unsigned int j;
2149 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
2150 curlocales[j]
2151 = savepv(do_setlocale_r(categories[j], trial_locale));
2152 if (! curlocales[j]) {
2153 setlocale_failure = TRUE;
2154 }
2155 DEBUG_LOCALE_INIT(categories[j], trial_locale, curlocales[j]);
2156 }
2157
2158 if (! setlocale_failure) { /* All succeeded */
2159 break; /* Exit trial_locales loop */
2160 }
2161 }
2162
2163 /* Here, something failed; will need to try a fallback. */
2164 ok = 0;
2165
2166 if (i == 0) {
2167 unsigned int j;
2168
2169 if (locwarn) { /* Output failure info only on the first one */
2170
2171# ifdef LC_ALL
2172
2173 PerlIO_printf(Perl_error_log,
2174 "perl: warning: Setting locale failed.\n");
2175
2176# else /* !LC_ALL */
2177
2178 PerlIO_printf(Perl_error_log,
2179 "perl: warning: Setting locale failed for the categories:\n\t");
2180
2181 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
2182 if (! curlocales[j]) {
2183 PerlIO_printf(Perl_error_log, category_names[j]);
2184 }
2185 else {
2186 Safefree(curlocales[j]);
2187 }
2188 }
2189
2190# endif /* LC_ALL */
2191
2192 PerlIO_printf(Perl_error_log,
2193 "perl: warning: Please check that your locale settings:\n");
2194
2195# ifdef __GLIBC__
2196
2197 PerlIO_printf(Perl_error_log,
2198 "\tLANGUAGE = %c%s%c,\n",
2199 language ? '"' : '(',
2200 language ? language : "unset",
2201 language ? '"' : ')');
2202# endif
2203
2204 PerlIO_printf(Perl_error_log,
2205 "\tLC_ALL = %c%s%c,\n",
2206 lc_all ? '"' : '(',
2207 lc_all ? lc_all : "unset",
2208 lc_all ? '"' : ')');
2209
2210# if defined(USE_ENVIRON_ARRAY)
2211
2212 {
2213 char **e;
2214
2215 /* Look through the environment for any variables of the
2216 * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
2217 * already handled above. These are assumed to be locale
2218 * settings. Output them and their values. */
2219 for (e = environ; *e; e++) {
2220 const STRLEN prefix_len = sizeof("LC_") - 1;
2221 STRLEN uppers_len;
2222
2223 if ( strBEGINs(*e, "LC_")
2224 && ! strBEGINs(*e, "LC_ALL=")
2225 && (uppers_len = strspn(*e + prefix_len,
2226 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
2227 && ((*e)[prefix_len + uppers_len] == '='))
2228 {
2229 PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
2230 (int) (prefix_len + uppers_len), *e,
2231 *e + prefix_len + uppers_len + 1);
2232 }
2233 }
2234 }
2235
2236# else
2237
2238 PerlIO_printf(Perl_error_log,
2239 "\t(possibly more locale environment variables)\n");
2240
2241# endif
2242
2243 PerlIO_printf(Perl_error_log,
2244 "\tLANG = %c%s%c\n",
2245 lang ? '"' : '(',
2246 lang ? lang : "unset",
2247 lang ? '"' : ')');
2248
2249 PerlIO_printf(Perl_error_log,
2250 " are supported and installed on your system.\n");
2251 }
2252
2253 /* Calculate what fallback locales to try. We have avoided this
2254 * until we have to, because failure is quite unlikely. This will
2255 * usually change the upper bound of the loop we are in.
2256 *
2257 * Since the system's default way of setting the locale has not
2258 * found one that works, We use Perl's defined ordering: LC_ALL,
2259 * LANG, and the C locale. We don't try the same locale twice, so
2260 * don't add to the list if already there. (On POSIX systems, the
2261 * LC_ALL element will likely be a repeat of the 0th element "",
2262 * but there's no harm done by doing it explicitly.
2263 *
2264 * Note that this tries the LC_ALL environment variable even on
2265 * systems which have no LC_ALL locale setting. This may or may
2266 * not have been originally intentional, but there's no real need
2267 * to change the behavior. */
2268 if (lc_all) {
2269 for (j = 0; j < trial_locales_count; j++) {
2270 if (strEQ(lc_all, trial_locales[j])) {
2271 goto done_lc_all;
2272 }
2273 }
2274 trial_locales[trial_locales_count++] = lc_all;
2275 }
2276 done_lc_all:
2277
2278 if (lang) {
2279 for (j = 0; j < trial_locales_count; j++) {
2280 if (strEQ(lang, trial_locales[j])) {
2281 goto done_lang;
2282 }
2283 }
2284 trial_locales[trial_locales_count++] = lang;
2285 }
2286 done_lang:
2287
2288# if defined(WIN32) && defined(LC_ALL)
2289
2290 /* For Windows, we also try the system default locale before "C".
2291 * (If there exists a Windows without LC_ALL we skip this because
2292 * it gets too complicated. For those, the "C" is the next
2293 * fallback possibility). The "" is the same as the 0th element of
2294 * the array, but the code at the loop above knows to treat it
2295 * differently when not the 0th */
2296 trial_locales[trial_locales_count++] = "";
2297
2298# endif
2299
2300 for (j = 0; j < trial_locales_count; j++) {
2301 if (strEQ("C", trial_locales[j])) {
2302 goto done_C;
2303 }
2304 }
2305 trial_locales[trial_locales_count++] = "C";
2306
2307 done_C: ;
2308 } /* end of first time through the loop */
2309
2310# ifdef WIN32
2311
2312 next_iteration: ;
2313
2314# endif
2315
2316 } /* end of looping through the trial locales */
2317
2318 if (ok < 1) { /* If we tried to fallback */
2319 const char* msg;
2320 if (! setlocale_failure) { /* fallback succeeded */
2321 msg = "Falling back to";
2322 }
2323 else { /* fallback failed */
2324 unsigned int j;
2325
2326 /* We dropped off the end of the loop, so have to decrement i to
2327 * get back to the value the last time through */
2328 i--;
2329
2330 ok = -1;
2331 msg = "Failed to fall back to";
2332
2333 /* To continue, we should use whatever values we've got */
2334
2335 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
2336 Safefree(curlocales[j]);
2337 curlocales[j] = savepv(do_setlocale_r(categories[j], NULL));
2338 DEBUG_LOCALE_INIT(categories[j], NULL, curlocales[j]);
2339 }
2340 }
2341
2342 if (locwarn) {
2343 const char * description;
2344 const char * name = "";
2345 if (strEQ(trial_locales[i], "C")) {
2346 description = "the standard locale";
2347 name = "C";
2348 }
2349
2350# ifdef SYSTEM_DEFAULT_LOCALE
2351
2352 else if (strEQ(trial_locales[i], "")) {
2353 description = "the system default locale";
2354 if (system_default_locale) {
2355 name = system_default_locale;
2356 }
2357 }
2358
2359# endif /* SYSTEM_DEFAULT_LOCALE */
2360
2361 else {
2362 description = "a fallback locale";
2363 name = trial_locales[i];
2364 }
2365 if (name && strNE(name, "")) {
2366 PerlIO_printf(Perl_error_log,
2367 "perl: warning: %s %s (\"%s\").\n", msg, description, name);
2368 }
2369 else {
2370 PerlIO_printf(Perl_error_log,
2371 "perl: warning: %s %s.\n", msg, description);
2372 }
2373 }
2374 } /* End of tried to fallback */
2375
2376 /* Done with finding the locales; update our records */
2377
2378# ifdef USE_LOCALE_CTYPE
2379
2380 new_ctype(curlocales[LC_CTYPE_INDEX]);
2381
2382# endif
2383# ifdef USE_LOCALE_COLLATE
2384
2385 new_collate(curlocales[LC_COLLATE_INDEX]);
2386
2387# endif
2388# ifdef USE_LOCALE_NUMERIC
2389
2390 new_numeric(curlocales[LC_NUMERIC_INDEX]);
2391
2392# endif
2393
2394 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2395
2396# if defined(USE_ITHREADS)
2397
2398 /* This caches whether each category's locale is UTF-8 or not. This
2399 * may involve changing the locale. It is ok to do this at
2400 * initialization time before any threads have started, but not later.
2401 * Caching means that if the program heeds our dictate not to change
2402 * locales in threaded applications, this data will remain valid, and
2403 * it may get queried without changing locales. If the environment is
2404 * such that all categories have the same locale, this isn't needed, as
2405 * the code will not change the locale; but this handles the uncommon
2406 * case where the environment has disparate locales for the categories
2407 * */
2408 (void) _is_cur_LC_category_utf8(categories[i]);
2409
2410# endif
2411
2412 Safefree(curlocales[i]);
2413 }
2414
2415# if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
2416
2417 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
2418 * locale is UTF-8. The call to new_ctype() just above has already
2419 * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
2420 * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
2421 * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
2422 * STDIN, STDOUT, STDERR, _and_ the default open discipline. */
2423 PL_utf8locale = PL_in_utf8_CTYPE_locale;
2424
2425 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
2426 This is an alternative to using the -C command line switch
2427 (the -C if present will override this). */
2428 {
2429 const char *p = PerlEnv_getenv("PERL_UNICODE");
2430 PL_unicode = p ? parse_unicode_opts(&p) : 0;
2431 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
2432 PL_utf8cache = -1;
2433 }
2434
2435# endif
2436# ifdef __GLIBC__
2437
2438 Safefree(language);
2439
2440# endif
2441
2442 Safefree(lc_all);
2443 Safefree(lang);
2444
2445#endif /* USE_LOCALE */
2446#ifdef DEBUGGING
2447
2448 /* So won't continue to output stuff */
2449 DEBUG_INITIALIZATION_set(FALSE);
2450
2451#endif
2452
2453 return ok;
2454}
2455
2456#ifdef USE_LOCALE_COLLATE
2457
2458char *
2459Perl__mem_collxfrm(pTHX_ const char *input_string,
2460 STRLEN len, /* Length of 'input_string' */
2461 STRLEN *xlen, /* Set to length of returned string
2462 (not including the collation index
2463 prefix) */
2464 bool utf8 /* Is the input in UTF-8? */
2465 )
2466{
2467
2468 /* _mem_collxfrm() is a bit like strxfrm() but with two important
2469 * differences. First, it handles embedded NULs. Second, it allocates a bit
2470 * more memory than needed for the transformed data itself. The real
2471 * transformed data begins at offset COLLXFRM_HDR_LEN. *xlen is set to
2472 * the length of that, and doesn't include the collation index size.
2473 * Please see sv_collxfrm() to see how this is used. */
2474
2475#define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
2476
2477 char * s = (char *) input_string;
2478 STRLEN s_strlen = strlen(input_string);
2479 char *xbuf = NULL;
2480 STRLEN xAlloc; /* xalloc is a reserved word in VC */
2481 STRLEN length_in_chars;
2482 bool first_time = TRUE; /* Cleared after first loop iteration */
2483
2484 PERL_ARGS_ASSERT__MEM_COLLXFRM;
2485
2486 /* Must be NUL-terminated */
2487 assert(*(input_string + len) == '\0');
2488
2489 /* If this locale has defective collation, skip */
2490 if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
2491 DEBUG_L(PerlIO_printf(Perl_debug_log,
2492 "_mem_collxfrm: locale's collation is defective\n"));
2493 goto bad;
2494 }
2495
2496 /* Replace any embedded NULs with the control that sorts before any others.
2497 * This will give as good as possible results on strings that don't
2498 * otherwise contain that character, but otherwise there may be
2499 * less-than-perfect results with that character and NUL. This is
2500 * unavoidable unless we replace strxfrm with our own implementation. */
2501 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
2502 NUL */
2503 char * e = s + len;
2504 char * sans_nuls;
2505 STRLEN sans_nuls_len;
2506 int try_non_controls;
2507 char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
2508 making sure 2nd byte is NUL.
2509 */
2510 STRLEN this_replacement_len;
2511
2512 /* If we don't know what non-NUL control character sorts lowest for
2513 * this locale, find it */
2514 if (PL_strxfrm_NUL_replacement == '\0') {
2515 int j;
2516 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
2517 includes the collation index
2518 prefixed. */
2519
2520 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
2521
2522 /* Unlikely, but it may be that no control will work to replace
2523 * NUL, in which case we instead look for any character. Controls
2524 * are preferred because collation order is, in general, context
2525 * sensitive, with adjoining characters affecting the order, and
2526 * controls are less likely to have such interactions, allowing the
2527 * NUL-replacement to stand on its own. (Another way to look at it
2528 * is to imagine what would happen if the NUL were replaced by a
2529 * combining character; it wouldn't work out all that well.) */
2530 for (try_non_controls = 0;
2531 try_non_controls < 2;
2532 try_non_controls++)
2533 {
2534 /* Look through all legal code points (NUL isn't) */
2535 for (j = 1; j < 256; j++) {
2536 char * x; /* j's xfrm plus collation index */
2537 STRLEN x_len; /* length of 'x' */
2538 STRLEN trial_len = 1;
2539 char cur_source[] = { '\0', '\0' };
2540
2541 /* Skip non-controls the first time through the loop. The
2542 * controls in a UTF-8 locale are the L1 ones */
2543 if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
2544 ? ! isCNTRL_L1(j)
2545 : ! isCNTRL_LC(j))
2546 {
2547 continue;
2548 }
2549
2550 /* Create a 1-char string of the current code point */
2551 cur_source[0] = (char) j;
2552
2553 /* Then transform it */
2554 x = _mem_collxfrm(cur_source, trial_len, &x_len,
2555 0 /* The string is not in UTF-8 */);
2556
2557 /* Ignore any character that didn't successfully transform.
2558 * */
2559 if (! x) {
2560 continue;
2561 }
2562
2563 /* If this character's transformation is lower than
2564 * the current lowest, this one becomes the lowest */
2565 if ( cur_min_x == NULL
2566 || strLT(x + COLLXFRM_HDR_LEN,
2567 cur_min_x + COLLXFRM_HDR_LEN))
2568 {
2569 PL_strxfrm_NUL_replacement = j;
2570 cur_min_x = x;
2571 }
2572 else {
2573 Safefree(x);
2574 }
2575 } /* end of loop through all 255 characters */
2576
2577 /* Stop looking if found */
2578 if (cur_min_x) {
2579 break;
2580 }
2581
2582 /* Unlikely, but possible, if there aren't any controls that
2583 * work in the locale, repeat the loop, looking for any
2584 * character that works */
2585 DEBUG_L(PerlIO_printf(Perl_debug_log,
2586 "_mem_collxfrm: No control worked. Trying non-controls\n"));
2587 } /* End of loop to try first the controls, then any char */
2588
2589 if (! cur_min_x) {
2590 DEBUG_L(PerlIO_printf(Perl_debug_log,
2591 "_mem_collxfrm: Couldn't find any character to replace"
2592 " embedded NULs in locale %s with", PL_collation_name));
2593 goto bad;
2594 }
2595
2596 DEBUG_L(PerlIO_printf(Perl_debug_log,
2597 "_mem_collxfrm: Replacing embedded NULs in locale %s with "
2598 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
2599
2600 Safefree(cur_min_x);
2601 } /* End of determining the character that is to replace NULs */
2602
2603 /* If the replacement is variant under UTF-8, it must match the
2604 * UTF8-ness of the original */
2605 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
2606 this_replacement_char[0] =
2607 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
2608 this_replacement_char[1] =
2609 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
2610 this_replacement_len = 2;
2611 }
2612 else {
2613 this_replacement_char[0] = PL_strxfrm_NUL_replacement;
2614 /* this_replacement_char[1] = '\0' was done at initialization */
2615 this_replacement_len = 1;
2616 }
2617
2618 /* The worst case length for the replaced string would be if every
2619 * character in it is NUL. Multiply that by the length of each
2620 * replacement, and allow for a trailing NUL */
2621 sans_nuls_len = (len * this_replacement_len) + 1;
2622 Newx(sans_nuls, sans_nuls_len, char);
2623 *sans_nuls = '\0';
2624
2625 /* Replace each NUL with the lowest collating control. Loop until have
2626 * exhausted all the NULs */
2627 while (s + s_strlen < e) {
2628 my_strlcat(sans_nuls, s, sans_nuls_len);
2629
2630 /* Do the actual replacement */
2631 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
2632
2633 /* Move past the input NUL */
2634 s += s_strlen + 1;
2635 s_strlen = strlen(s);
2636 }
2637
2638 /* And add anything that trails the final NUL */
2639 my_strlcat(sans_nuls, s, sans_nuls_len);
2640
2641 /* Switch so below we transform this modified string */
2642 s = sans_nuls;
2643 len = strlen(s);
2644 } /* End of replacing NULs */
2645
2646 /* Make sure the UTF8ness of the string and locale match */
2647 if (utf8 != PL_in_utf8_COLLATE_locale) {
2648 const char * const t = s; /* Temporary so we can later find where the
2649 input was */
2650
2651 /* Here they don't match. Change the string's to be what the locale is
2652 * expecting */
2653
2654 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
2655 s = (char *) bytes_to_utf8((const U8 *) s, &len);
2656 utf8 = TRUE;
2657 }
2658 else { /* locale is not UTF-8; but input is; downgrade the input */
2659
2660 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
2661
2662 /* If the downgrade was successful we are done, but if the input
2663 * contains things that require UTF-8 to represent, have to do
2664 * damage control ... */
2665 if (UNLIKELY(utf8)) {
2666
2667 /* What we do is construct a non-UTF-8 string with
2668 * 1) the characters representable by a single byte converted
2669 * to be so (if necessary);
2670 * 2) and the rest converted to collate the same as the
2671 * highest collating representable character. That makes
2672 * them collate at the end. This is similar to how we
2673 * handle embedded NULs, but we use the highest collating
2674 * code point instead of the smallest. Like the NUL case,
2675 * this isn't perfect, but is the best we can reasonably
2676 * do. Every above-255 code point will sort the same as
2677 * the highest-sorting 0-255 code point. If that code
2678 * point can combine in a sequence with some other code
2679 * points for weight calculations, us changing something to
2680 * be it can adversely affect the results. But in most
2681 * cases, it should work reasonably. And note that this is
2682 * really an illegal situation: using code points above 255
2683 * on a locale where only 0-255 are valid. If two strings
2684 * sort entirely equal, then the sort order for the
2685 * above-255 code points will be in code point order. */
2686
2687 utf8 = FALSE;
2688
2689 /* If we haven't calculated the code point with the maximum
2690 * collating order for this locale, do so now */
2691 if (! PL_strxfrm_max_cp) {
2692 int j;
2693
2694 /* The current transformed string that collates the
2695 * highest (except it also includes the prefixed collation
2696 * index. */
2697 char * cur_max_x = NULL;
2698
2699 /* Look through all legal code points (NUL isn't) */
2700 for (j = 1; j < 256; j++) {
2701 char * x;
2702 STRLEN x_len;
2703 char cur_source[] = { '\0', '\0' };
2704
2705 /* Create a 1-char string of the current code point */
2706 cur_source[0] = (char) j;
2707
2708 /* Then transform it */
2709 x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
2710
2711 /* If something went wrong (which it shouldn't), just
2712 * ignore this code point */
2713 if (! x) {
2714 continue;
2715 }
2716
2717 /* If this character's transformation is higher than
2718 * the current highest, this one becomes the highest */
2719 if ( cur_max_x == NULL
2720 || strGT(x + COLLXFRM_HDR_LEN,
2721 cur_max_x + COLLXFRM_HDR_LEN))
2722 {
2723 PL_strxfrm_max_cp = j;
2724 cur_max_x = x;
2725 }
2726 else {
2727 Safefree(x);
2728 }
2729 }
2730
2731 if (! cur_max_x) {
2732 DEBUG_L(PerlIO_printf(Perl_debug_log,
2733 "_mem_collxfrm: Couldn't find any character to"
2734 " replace above-Latin1 chars in locale %s with",
2735 PL_collation_name));
2736 goto bad;
2737 }
2738
2739 DEBUG_L(PerlIO_printf(Perl_debug_log,
2740 "_mem_collxfrm: highest 1-byte collating character"
2741 " in locale %s is 0x%02X\n",
2742 PL_collation_name,
2743 PL_strxfrm_max_cp));
2744
2745 Safefree(cur_max_x);
2746 }
2747
2748 /* Here we know which legal code point collates the highest.
2749 * We are ready to construct the non-UTF-8 string. The length
2750 * will be at least 1 byte smaller than the input string
2751 * (because we changed at least one 2-byte character into a
2752 * single byte), but that is eaten up by the trailing NUL */
2753 Newx(s, len, char);
2754
2755 {
2756 STRLEN i;
2757 STRLEN d= 0;
2758 char * e = (char *) t + len;
2759
2760 for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
2761 U8 cur_char = t[i];
2762 if (UTF8_IS_INVARIANT(cur_char)) {
2763 s[d++] = cur_char;
2764 }
2765 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
2766 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
2767 }
2768 else { /* Replace illegal cp with highest collating
2769 one */
2770 s[d++] = PL_strxfrm_max_cp;
2771 }
2772 }
2773 s[d++] = '\0';
2774 Renew(s, d, char); /* Free up unused space */
2775 }
2776 }
2777 }
2778
2779 /* Here, we have constructed a modified version of the input. It could
2780 * be that we already had a modified copy before we did this version.
2781 * If so, that copy is no longer needed */
2782 if (t != input_string) {
2783 Safefree(t);
2784 }
2785 }
2786
2787 length_in_chars = (utf8)
2788 ? utf8_length((U8 *) s, (U8 *) s + len)
2789 : len;
2790
2791 /* The first element in the output is the collation id, used by
2792 * sv_collxfrm(); then comes the space for the transformed string. The
2793 * equation should give us a good estimate as to how much is needed */
2794 xAlloc = COLLXFRM_HDR_LEN
2795 + PL_collxfrm_base
2796 + (PL_collxfrm_mult * length_in_chars);
2797 Newx(xbuf, xAlloc, char);
2798 if (UNLIKELY(! xbuf)) {
2799 DEBUG_L(PerlIO_printf(Perl_debug_log,
2800 "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
2801 goto bad;
2802 }
2803
2804 /* Store the collation id */
2805 *(U32*)xbuf = PL_collation_ix;
2806
2807 /* Then the transformation of the input. We loop until successful, or we
2808 * give up */
2809 for (;;) {
2810
2811 *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
2812
2813 /* If the transformed string occupies less space than we told strxfrm()
2814 * was available, it means it successfully transformed the whole
2815 * string. */
2816 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
2817
2818 /* Some systems include a trailing NUL in the returned length.
2819 * Ignore it, using a loop in case multiple trailing NULs are
2820 * returned. */
2821 while ( (*xlen) > 0
2822 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
2823 {
2824 (*xlen)--;
2825 }
2826
2827 /* If the first try didn't get it, it means our prediction was low.
2828 * Modify the coefficients so that we predict a larger value in any
2829 * future transformations */
2830 if (! first_time) {
2831 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */
2832 STRLEN computed_guess = PL_collxfrm_base
2833 + (PL_collxfrm_mult * length_in_chars);
2834
2835 /* On zero-length input, just keep current slope instead of
2836 * dividing by 0 */
2837 const STRLEN new_m = (length_in_chars != 0)
2838 ? needed / length_in_chars
2839 : PL_collxfrm_mult;
2840
2841 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2842 "%s: %d: initial size of %zu bytes for a length "
2843 "%zu string was insufficient, %zu needed\n",
2844 __FILE__, __LINE__,
2845 computed_guess, length_in_chars, needed));
2846
2847 /* If slope increased, use it, but discard this result for
2848 * length 1 strings, as we can't be sure that it's a real slope
2849 * change */
2850 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
2851
2852# ifdef DEBUGGING
2853
2854 STRLEN old_m = PL_collxfrm_mult;
2855 STRLEN old_b = PL_collxfrm_base;
2856
2857# endif
2858
2859 PL_collxfrm_mult = new_m;
2860 PL_collxfrm_base = 1; /* +1 For trailing NUL */
2861 computed_guess = PL_collxfrm_base
2862 + (PL_collxfrm_mult * length_in_chars);
2863 if (computed_guess < needed) {
2864 PL_collxfrm_base += needed - computed_guess;
2865 }
2866
2867 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2868 "%s: %d: slope is now %zu; was %zu, base "
2869 "is now %zu; was %zu\n",
2870 __FILE__, __LINE__,
2871 PL_collxfrm_mult, old_m,
2872 PL_collxfrm_base, old_b));
2873 }
2874 else { /* Slope didn't change, but 'b' did */
2875 const STRLEN new_b = needed
2876 - computed_guess
2877 + PL_collxfrm_base;
2878 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2879 "%s: %d: base is now %zu; was %zu\n",
2880 __FILE__, __LINE__,
2881 new_b, PL_collxfrm_base));
2882 PL_collxfrm_base = new_b;
2883 }
2884 }
2885
2886 break;
2887 }
2888
2889 if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
2890 DEBUG_L(PerlIO_printf(Perl_debug_log,
2891 "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
2892 *xlen, PERL_INT_MAX));
2893 goto bad;
2894 }
2895
2896 /* A well-behaved strxfrm() returns exactly how much space it needs
2897 * (usually not including the trailing NUL) when it fails due to not
2898 * enough space being provided. Assume that this is the case unless
2899 * it's been proven otherwise */
2900 if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
2901 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
2902 }
2903 else { /* Here, either:
2904 * 1) The strxfrm() has previously shown bad behavior; or
2905 * 2) It isn't the first time through the loop, which means
2906 * that the strxfrm() is now showing bad behavior, because
2907 * we gave it what it said was needed in the previous
2908 * iteration, and it came back saying it needed still more.
2909 * (Many versions of cygwin fit this. When the buffer size
2910 * isn't sufficient, they return the input size instead of
2911 * how much is needed.)
2912 * Increase the buffer size by a fixed percentage and try again.
2913 * */
2914 xAlloc += (xAlloc / 4) + 1;
2915 PL_strxfrm_is_behaved = FALSE;
2916
2917# ifdef DEBUGGING
2918
2919 if (DEBUG_Lv_TEST || debug_initialization) {
2920 PerlIO_printf(Perl_debug_log,
2921 "_mem_collxfrm required more space than previously calculated"
2922 " for locale %s, trying again with new guess=%d+%zu\n",
2923 PL_collation_name, (int) COLLXFRM_HDR_LEN,
2924 xAlloc - COLLXFRM_HDR_LEN);
2925 }
2926
2927# endif
2928
2929 }
2930
2931 Renew(xbuf, xAlloc, char);
2932 if (UNLIKELY(! xbuf)) {
2933 DEBUG_L(PerlIO_printf(Perl_debug_log,
2934 "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
2935 goto bad;
2936 }
2937
2938 first_time = FALSE;
2939 }
2940
2941
2942# ifdef DEBUGGING
2943
2944 if (DEBUG_Lv_TEST || debug_initialization) {
2945
2946 print_collxfrm_input_and_return(s, s + len, xlen, utf8);
2947 PerlIO_printf(Perl_debug_log, "Its xfrm is:");
2948 PerlIO_printf(Perl_debug_log, "%s\n",
2949 _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
2950 *xlen, 1));
2951 }
2952
2953# endif
2954
2955 /* Free up unneeded space; retain ehough for trailing NUL */
2956 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
2957
2958 if (s != input_string) {
2959 Safefree(s);
2960 }
2961
2962 return xbuf;
2963
2964 bad:
2965 Safefree(xbuf);
2966 if (s != input_string) {
2967 Safefree(s);
2968 }
2969 *xlen = 0;
2970
2971# ifdef DEBUGGING
2972
2973 if (DEBUG_Lv_TEST || debug_initialization) {
2974 print_collxfrm_input_and_return(s, s + len, NULL, utf8);
2975 }
2976
2977# endif
2978
2979 return NULL;
2980}
2981
2982# ifdef DEBUGGING
2983
2984STATIC void
2985S_print_collxfrm_input_and_return(pTHX_
2986 const char * const s,
2987 const char * const e,
2988 const STRLEN * const xlen,
2989 const bool is_utf8)
2990{
2991
2992 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
2993
2994 PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
2995 (UV)PL_collation_ix);
2996 if (xlen) {
2997 PerlIO_printf(Perl_debug_log, "%zu", *xlen);
2998 }
2999 else {
3000 PerlIO_printf(Perl_debug_log, "NULL");
3001 }
3002 PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
3003 PL_collation_name);
3004 print_bytes_for_locale(s, e, is_utf8);
3005
3006 PerlIO_printf(Perl_debug_log, "'\n");
3007}
3008
3009STATIC void
3010S_print_bytes_for_locale(pTHX_
3011 const char * const s,
3012 const char * const e,
3013 const bool is_utf8)
3014{
3015 const char * t = s;
3016 bool prev_was_printable = TRUE;
3017 bool first_time = TRUE;
3018
3019 PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
3020
3021 while (t < e) {
3022 UV cp = (is_utf8)
3023 ? utf8_to_uvchr_buf((U8 *) t, e, NULL)
3024 : * (U8 *) t;
3025 if (isPRINT(cp)) {
3026 if (! prev_was_printable) {
3027 PerlIO_printf(Perl_debug_log, " ");
3028 }
3029 PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
3030 prev_was_printable = TRUE;
3031 }
3032 else {
3033 if (! first_time) {
3034 PerlIO_printf(Perl_debug_log, " ");
3035 }
3036 PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
3037 prev_was_printable = FALSE;
3038 }
3039 t += (is_utf8) ? UTF8SKIP(t) : 1;
3040 first_time = FALSE;
3041 }
3042}
3043
3044# endif /* #ifdef DEBUGGING */
3045#endif /* USE_LOCALE_COLLATE */
3046
3047#ifdef USE_LOCALE
3048
3049STATIC const char *
3050S_switch_category_locale_to_template(pTHX_ const int switch_category, const int template_category, const char * template_locale)
3051{
3052 /* Changes the locale for LC_'switch_category" to that of
3053 * LC_'template_category', if they aren't already the same. If not NULL,
3054 * 'template_locale' is the locale that 'template_category' is in.
3055 *
3056 * Returns the original locale for 'switch_category' so can be switched
3057 * back to with the companion function restore_switched_locale(), (NULL if
3058 * no restoral is necessary.) */
3059
3060 char * restore_to_locale = NULL;
3061
3062 if (switch_category == template_category) { /* No changes needed */
3063 return NULL;
3064 }
3065
3066 /* Find the original locale of the category we may need to change, so that
3067 * it can be restored to later */
3068 restore_to_locale = stdize_locale(savepv(do_setlocale_r(switch_category,
3069 NULL)));
3070 if (! restore_to_locale) {
3071 Perl_croak(aTHX_
3072 "panic: %s: %d: Could not find current %s locale, errno=%d\n",
3073 __FILE__, __LINE__, category_name(switch_category), errno);
3074 }
3075
3076 /* If the locale of the template category wasn't passed in, find it now */
3077 if (template_locale == NULL) {
3078 template_locale = do_setlocale_r(template_category, NULL);
3079 if (! template_locale) {
3080 Perl_croak(aTHX_
3081 "panic: %s: %d: Could not find current %s locale, errno=%d\n",
3082 __FILE__, __LINE__, category_name(template_category), errno);
3083 }
3084 }
3085
3086 /* It the locales are the same, there's nothing to do */
3087 if (strEQ(restore_to_locale, template_locale)) {
3088 Safefree(restore_to_locale);
3089
3090 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n",
3091 category_name(switch_category), restore_to_locale));
3092
3093 return NULL;
3094 }
3095
3096 /* Finally, change the locale to the template one */
3097 if (! do_setlocale_r(switch_category, template_locale)) {
3098 Perl_croak(aTHX_
3099 "panic: %s: %d: Could not change %s locale to %s, errno=%d\n",
3100 __FILE__, __LINE__, category_name(switch_category),
3101 template_locale, errno);
3102 }
3103
3104 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale switched to %s\n",
3105 category_name(switch_category), template_locale));
3106
3107 return restore_to_locale;
3108}
3109
3110STATIC void
3111S_restore_switched_locale(pTHX_ const int category, const char * const original_locale)
3112{
3113 /* Restores the locale for LC_'category' to 'original_locale', or do
3114 * nothing if the latter parameter is NULL */
3115
3116 if (original_locale == NULL) {
3117 return;
3118 }
3119
3120 if (! do_setlocale_r(category, original_locale)) {
3121 Perl_croak(aTHX_
3122 "panic: %s: %d: setlocale %s restore to %s failed, errno=%d\n",
3123 __FILE__, __LINE__,
3124 category_name(category), original_locale, errno);
3125 }
3126
3127 Safefree(original_locale);
3128}
3129
3130bool
3131Perl__is_cur_LC_category_utf8(pTHX_ int category)
3132{
3133 /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
3134 * otherwise. 'category' may not be LC_ALL. If the platform doesn't have
3135 * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
3136 * could give the wrong result. The result will very likely be correct for
3137 * languages that have commonly used non-ASCII characters, but for notably
3138 * English, it comes down to if the locale's name ends in something like
3139 * "UTF-8". It errs on the side of not being a UTF-8 locale. */
3140
3141 /* Name of current locale corresponding to the input category */
3142 const char *save_input_locale = NULL;
3143
3144 bool is_utf8 = FALSE; /* The return value */
3145
3146 /* The variables below are for the cache of previous lookups using this
3147 * function. The cache is a C string, described at the definition for
3148 * 'C_and_POSIX_utf8ness'.
3149 *
3150 * The first part of the cache is fixed, for the C and POSIX locales. The
3151 * varying part starts just after them. */
3152 char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness);
3153
3154 Size_t utf8ness_cache_size; /* Size of the varying portion */
3155 Size_t input_name_len; /* Length in bytes of save_input_locale */
3156 Size_t input_name_len_with_overhead; /* plus extra chars used to store
3157 the name in the cache */
3158 char * delimited; /* The name plus the delimiters used to store
3159 it in the cache */
3160 char * name_pos; /* position of 'delimited' in the cache, or 0
3161 if not there */
3162
3163
3164# ifdef LC_ALL
3165
3166 assert(category != LC_ALL);
3167
3168# endif
3169
3170 /* Get the desired category's locale */
3171 save_input_locale = stdize_locale(savepv(do_setlocale_r(category, NULL)));
3172 if (! save_input_locale) {
3173 Perl_croak(aTHX_
3174 "panic: %s: %d: Could not find current %s locale, errno=%d\n",
3175 __FILE__, __LINE__, category_name(category), errno);
3176 }
3177
3178 DEBUG_L(PerlIO_printf(Perl_debug_log,
3179 "Current locale for %s is %s\n",
3180 category_name(category), save_input_locale));
3181
3182 input_name_len = strlen(save_input_locale);
3183
3184 /* In our cache, each name is accompanied by two delimiters and a single
3185 * utf8ness digit */
3186 input_name_len_with_overhead = input_name_len + 3;
3187
3188 /* Allocate and populate space for a copy of the name surrounded by the
3189 * delimiters */
3190 Newx(delimited, input_name_len_with_overhead, char);
3191 delimited[0] = UTF8NESS_SEP[0];
3192 Copy(save_input_locale, delimited + 1, input_name_len, char);
3193 delimited[input_name_len+1] = UTF8NESS_PREFIX[0];
3194 delimited[input_name_len+2] = '\0';
3195
3196 /* And see if that is in the cache */
3197 name_pos = instr(PL_locale_utf8ness, delimited);
3198 if (name_pos) {
3199 is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0';
3200
3201# ifdef DEBUGGING
3202
3203 if (DEBUG_Lv_TEST || debug_initialization) {
3204 PerlIO_printf(Perl_debug_log, "UTF8ness for locale %s=%d, \n",
3205 save_input_locale, is_utf8);
3206 }
3207
3208# endif
3209
3210 /* And, if not already in that position, move it to the beginning of
3211 * the non-constant portion of the list, since it is the most recently
3212 * used. (We don't have to worry about overflow, since just moving
3213 * existing names around) */
3214 if (name_pos > utf8ness_cache) {
3215 Move(utf8ness_cache,
3216 utf8ness_cache + input_name_len_with_overhead,
3217 name_pos - utf8ness_cache, char);
3218 Copy(delimited,
3219 utf8ness_cache,
3220 input_name_len_with_overhead - 1, char);
3221 utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
3222 }
3223
3224 Safefree(delimited);
3225 Safefree(save_input_locale);
3226 return is_utf8;
3227 }
3228
3229 /* Here we don't have stored the utf8ness for the input locale. We have to
3230 * calculate it */
3231
3232# if defined(USE_LOCALE_CTYPE) \
3233 && ( (defined(HAS_NL_LANGINFO) && defined(CODESET)) \
3234 || (defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)))
3235
3236 {
3237 const char *original_ctype_locale
3238 = switch_category_locale_to_template(LC_CTYPE,
3239 category,
3240 save_input_locale);
3241
3242 /* Here the current LC_CTYPE is set to the locale of the category whose
3243 * information is desired. This means that nl_langinfo() and mbtowc()
3244 * should give the correct results */
3245
3246# ifdef MB_CUR_MAX /* But we can potentially rule out UTF-8ness, avoiding
3247 calling the functions if we have this */
3248
3249 /* Standard UTF-8 needs at least 4 bytes to represent the maximum
3250 * Unicode code point. */
3251
3252 DEBUG_L(PerlIO_printf(Perl_debug_log, "\tMB_CUR_MAX=%d\n",
3253 (int) MB_CUR_MAX));
3254 if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) {
3255 is_utf8 = FALSE;
3256 restore_switched_locale(LC_CTYPE, original_ctype_locale);
3257 goto finish_and_return;
3258 }
3259
3260# endif
3261# if defined(HAS_NL_LANGINFO) && defined(CODESET)
3262
3263 { /* The task is easiest if the platform has this POSIX 2001 function.
3264 Except on some platforms it can wrongly return "", so have to have
3265 a fallback. And it can return that it's UTF-8, even if there are
3266 variances from that. For example, Turkish locales may use the
3267 alternate dotted I rules, and sometimes it appears to be a
3268 defective locale definition. XXX We should probably check for
3269 these in the Latin1 range and warn */
3270 const char *codeset = my_nl_langinfo(PERL_CODESET, FALSE);
3271 /* FALSE => already in dest locale */
3272
3273 DEBUG_L(PerlIO_printf(Perl_debug_log,
3274 "\tnllanginfo returned CODESET '%s'\n", codeset));
3275
3276 if (codeset && strNE(codeset, "")) {
3277
3278 /* If the implementation of foldEQ() somehow were
3279 * to change to not go byte-by-byte, this could
3280 * read past end of string, as only one length is
3281 * checked. But currently, a premature NUL will
3282 * compare false, and it will stop there */
3283 is_utf8 = cBOOL( foldEQ(codeset, STR_WITH_LEN("UTF-8"))
3284 || foldEQ(codeset, STR_WITH_LEN("UTF8")));
3285
3286 DEBUG_L(PerlIO_printf(Perl_debug_log,
3287 "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
3288 codeset, is_utf8));
3289 restore_switched_locale(LC_CTYPE, original_ctype_locale);
3290 goto finish_and_return;
3291 }
3292 }
3293
3294# endif
3295# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
3296 /* We can see if this is a UTF-8-like locale if have mbtowc(). It was a
3297 * late adder to C89, so very likely to have it. However, testing has
3298 * shown that, like nl_langinfo() above, there are locales that are not
3299 * strictly UTF-8 that this will return that they are */
3300
3301 {
3302 wchar_t wc;
3303 int len;
3304
3305# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS)
3306
3307 mbstate_t ps;
3308
3309# endif
3310
3311 /* mbrtowc() and mbtowc() convert a byte string to a wide
3312 * character. Feed a byte string to one of them and check that the
3313 * result is the expected Unicode code point */
3314
3315# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS)
3316 /* Prefer this function if available, as it's reentrant */
3317
3318 memset(&ps, 0, sizeof(ps));;
3319 PERL_UNUSED_RESULT(mbrtowc(&wc, NULL, 0, &ps)); /* Reset any shift
3320 state */
3321 errno = 0;
3322 len = mbrtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8), &ps);
3323
3324# else
3325
3326 PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
3327 errno = 0;
3328 len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
3329
3330# endif
3331
3332 DEBUG_L(PerlIO_printf(Perl_debug_log,
3333 "\treturn from mbtowc; len=%d; code_point=%x; errno=%d\n",
3334 len, (unsigned int) wc, errno));
3335
3336 is_utf8 = cBOOL( len == STRLENs(REPLACEMENT_CHARACTER_UTF8)
3337 && wc == (wchar_t) UNICODE_REPLACEMENT);
3338 }
3339
3340 restore_switched_locale(LC_CTYPE, original_ctype_locale);
3341 goto finish_and_return;
3342 }
3343
3344# endif
3345# else
3346
3347 /* Here, we must have a C89 compiler that doesn't have mbtowc(). Next
3348 * try looking at the currency symbol to see if it disambiguates
3349 * things. Often that will be in the native script, and if the symbol
3350 * isn't in UTF-8, we know that the locale isn't. If it is non-ASCII
3351 * UTF-8, we infer that the locale is too, as the odds of a non-UTF8
3352 * string being valid UTF-8 are quite small */
3353
3354# ifdef USE_LOCALE_MONETARY
3355
3356 /* If have LC_MONETARY, we can look at the currency symbol. Often that
3357 * will be in the native script. We do this one first because there is
3358 * just one string to examine, so potentially avoids work */
3359
3360 {
3361 const char *original_monetary_locale
3362 = switch_category_locale_to_template(LC_MONETARY,
3363 category,
3364 save_input_locale);
3365 bool only_ascii = FALSE;
3366 const U8 * currency_string
3367 = (const U8 *) my_nl_langinfo(PERL_CRNCYSTR, FALSE);
3368 /* 2nd param not relevant for this item */
3369 const U8 * first_variant;
3370
3371 assert( *currency_string == '-'
3372 || *currency_string == '+'
3373 || *currency_string == '.');
3374
3375 currency_string++;
3376
3377 if (is_utf8_invariant_string_loc(currency_string, 0, &first_variant))
3378 {
3379 DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3380 only_ascii = TRUE;
3381 }
3382 else {
3383 is_utf8 = is_strict_utf8_string(first_variant, 0);
3384 }
3385
3386 restore_switched_locale(LC_MONETARY, original_monetary_locale);
3387
3388 if (! only_ascii) {
3389
3390 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
3391 * otherwise assume the locale is UTF-8 if and only if the symbol
3392 * is non-ascii UTF-8. */
3393 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
3394 save_input_locale, is_utf8));
3395 goto finish_and_return;
3396 }
3397 }
3398
3399# endif /* USE_LOCALE_MONETARY */
3400# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
3401
3402 /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
3403 * the names of the months and weekdays, timezone, and am/pm indicator */
3404 {
3405 const char *original_time_locale
3406 = switch_category_locale_to_template(LC_TIME,
3407 category,
3408 save_input_locale);
3409 int hour = 10;
3410 bool is_dst = FALSE;
3411 int dom = 1;
3412 int month = 0;
3413 int i;
3414 char * formatted_time;
3415
3416 /* Here the current LC_TIME is set to the locale of the category
3417 * whose information is desired. Look at all the days of the week and
3418 * month names, and the timezone and am/pm indicator for UTF-8 variant
3419 * characters. The first such a one found will tell us if the locale
3420 * is UTF-8 or not */
3421
3422 for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
3423 formatted_time = my_strftime("%A %B %Z %p",
3424 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
3425 if ( ! formatted_time
3426 || is_utf8_invariant_string((U8 *) formatted_time, 0))
3427 {
3428
3429 /* Here, we didn't find a non-ASCII. Try the next time through
3430 * with the complemented dst and am/pm, and try with the next
3431 * weekday. After we have gotten all weekdays, try the next
3432 * month */
3433 is_dst = ! is_dst;
3434 hour = (hour + 12) % 24;
3435 dom++;
3436 if (i > 6) {
3437 month++;
3438 }
3439 continue;
3440 }
3441
3442 /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
3443 * false otherwise. But first, restore LC_TIME to its original
3444 * locale if we changed it */
3445 restore_switched_locale(LC_TIME, original_time_locale);
3446
3447 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
3448 save_input_locale,
3449 is_utf8_string((U8 *) formatted_time, 0)));
3450 is_utf8 = is_utf8_string((U8 *) formatted_time, 0);
3451 goto finish_and_return;
3452 }
3453
3454 /* Falling off the end of the loop indicates all the names were just
3455 * ASCII. Go on to the next test. If we changed it, restore LC_TIME
3456 * to its original locale */
3457 restore_switched_locale(LC_TIME, original_time_locale);
3458 DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3459 }
3460
3461# endif
3462
3463# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
3464
3465 /* This code is ifdefd out because it was found to not be necessary in testing
3466 * on our dromedary test machine, which has over 700 locales. There, this
3467 * added no value to looking at the currency symbol and the time strings. I
3468 * left it in so as to avoid rewriting it if real-world experience indicates
3469 * that dromedary is an outlier. Essentially, instead of returning abpve if we
3470 * haven't found illegal utf8, we continue on and examine all the strerror()
3471 * messages on the platform for utf8ness. If all are ASCII, we still don't
3472 * know the answer; but otherwise we have a pretty good indication of the
3473 * utf8ness. The reason this doesn't help much is that the messages may not
3474 * have been translated into the locale. The currency symbol and time strings
3475 * are much more likely to have been translated. */
3476 {
3477 int e;
3478 bool non_ascii = FALSE;
3479 const char *original_messages_locale
3480 = switch_category_locale_to_template(LC_MESSAGES,
3481 category,
3482 save_input_locale);
3483 const char * errmsg = NULL;
3484
3485 /* Here the current LC_MESSAGES is set to the locale of the category
3486 * whose information is desired. Look through all the messages. We
3487 * can't use Strerror() here because it may expand to code that
3488 * segfaults in miniperl */
3489
3490 for (e = 0; e <= sys_nerr; e++) {
3491 errno = 0;
3492 errmsg = sys_errlist[e];
3493 if (errno || !errmsg) {
3494 break;
3495 }
3496 errmsg = savepv(errmsg);
3497 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
3498 non_ascii = TRUE;
3499 is_utf8 = is_utf8_string((U8 *) errmsg, 0);
3500 break;
3501 }
3502 }
3503 Safefree(errmsg);
3504
3505 restore_switched_locale(LC_MESSAGES, original_messages_locale);
3506
3507 if (non_ascii) {
3508
3509 /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
3510 * any non-ascii means it is one; otherwise we assume it isn't */
3511 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
3512 save_input_locale,
3513 is_utf8));
3514 goto finish_and_return;
3515 }
3516
3517 DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3518 }
3519
3520# endif
3521# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
3522 UTF-8 locale */
3523
3524 /* As a last resort, look at the locale name to see if it matches
3525 * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
3526 * return of setlocale(), is actually defined to be opaque, so we can't
3527 * really rely on the absence of various substrings in the name to indicate
3528 * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
3529 * be a UTF-8 locale. Similarly for the other common names */
3530
3531 {
3532 const Size_t final_pos = strlen(save_input_locale) - 1;
3533
3534 if (final_pos >= 3) {
3535 const char *name = save_input_locale;
3536
3537 /* Find next 'U' or 'u' and look from there */
3538 while ((name += strcspn(name, "Uu") + 1)
3539 <= save_input_locale + final_pos - 2)
3540 {
3541 if ( isALPHA_FOLD_NE(*name, 't')
3542 || isALPHA_FOLD_NE(*(name + 1), 'f'))
3543 {
3544 continue;
3545 }
3546 name += 2;
3547 if (*(name) == '-') {
3548 if ((name > save_input_locale + final_pos - 1)) {
3549 break;
3550 }
3551 name++;
3552 }
3553 if (*(name) == '8') {
3554 DEBUG_L(PerlIO_printf(Perl_debug_log,
3555 "Locale %s ends with UTF-8 in name\n",
3556 save_input_locale));
3557 is_utf8 = TRUE;
3558 goto finish_and_return;
3559 }
3560 }
3561 DEBUG_L(PerlIO_printf(Perl_debug_log,
3562 "Locale %s doesn't end with UTF-8 in name\n",
3563 save_input_locale));
3564 }
3565
3566# ifdef WIN32
3567
3568 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
3569 if (memENDs(save_input_locale, final_pos, "65001")) {
3570 DEBUG_L(PerlIO_printf(Perl_debug_log,
3571 "Locale %s ends with 65001 in name, is UTF-8 locale\n",
3572 save_input_locale));
3573 is_utf8 = TRUE;
3574 goto finish_and_return;
3575 }
3576 }
3577
3578# endif
3579# endif
3580
3581 /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
3582 * since we are about to return FALSE anyway, there is no point in doing
3583 * this extra work */
3584
3585# if 0
3586 if (instr(save_input_locale, "8859")) {
3587 DEBUG_L(PerlIO_printf(Perl_debug_log,
3588 "Locale %s has 8859 in name, not UTF-8 locale\n",
3589 save_input_locale));
3590 is_utf8 = FALSE;
3591 goto finish_and_return;
3592 }
3593# endif
3594
3595 DEBUG_L(PerlIO_printf(Perl_debug_log,
3596 "Assuming locale %s is not a UTF-8 locale\n",
3597 save_input_locale));
3598 is_utf8 = FALSE;
3599
3600# endif /* the code that is compiled when no modern LC_CTYPE */
3601
3602 finish_and_return:
3603
3604 /* Cache this result so we don't have to go through all this next time. */
3605 utf8ness_cache_size = sizeof(PL_locale_utf8ness)
3606 - (utf8ness_cache - PL_locale_utf8ness);
3607
3608 /* But we can't save it if it is too large for the total space available */
3609 if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) {
3610 Size_t utf8ness_cache_len = strlen(utf8ness_cache);
3611
3612 /* Here it can fit, but we may need to clear out the oldest cached
3613 * result(s) to do so. Check */
3614 if (utf8ness_cache_len + input_name_len_with_overhead
3615 >= utf8ness_cache_size)
3616 {
3617 /* Here we have to clear something out to make room for this.
3618 * Start looking at the rightmost place where it could fit and find
3619 * the beginning of the entry that extends past that. */
3620 char * cutoff = (char *) my_memrchr(utf8ness_cache,
3621 UTF8NESS_SEP[0],
3622 utf8ness_cache_size
3623 - input_name_len_with_overhead);
3624
3625 assert(cutoff);
3626 assert(cutoff >= utf8ness_cache);
3627
3628 /* This and all subsequent entries must be removed */
3629 *cutoff = '\0';
3630 utf8ness_cache_len = strlen(utf8ness_cache);
3631 }
3632
3633 /* Make space for the new entry */
3634 Move(utf8ness_cache,
3635 utf8ness_cache + input_name_len_with_overhead,
3636 utf8ness_cache_len + 1 /* Incl. trailing NUL */, char);
3637
3638 /* And insert it */
3639 Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char);
3640 utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
3641
3642 if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1]
3643 & (PERL_UINTMAX_T) ~1) != '0')
3644 {
3645 Perl_croak(aTHX_
3646 "panic: %s: %d: Corrupt utf8ness_cache=%s\nlen=%zu,"
3647 " inserted_name=%s, its_len=%zu\n",
3648 __FILE__, __LINE__,
3649 PL_locale_utf8ness, strlen(PL_locale_utf8ness),
3650 delimited, input_name_len_with_overhead);
3651 }
3652 }
3653
3654# ifdef DEBUGGING
3655
3656 if (DEBUG_Lv_TEST || debug_initialization) {
3657 PerlIO_printf(Perl_debug_log,
3658 "PL_locale_utf8ness is now %s; returning %d\n",
3659 PL_locale_utf8ness, is_utf8);
3660 }
3661
3662# endif
3663
3664 Safefree(delimited);
3665 Safefree(save_input_locale);
3666 return is_utf8;
3667}
3668
3669#endif
3670
3671bool
3672Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
3673{
3674 dVAR;
3675 /* Internal function which returns if we are in the scope of a pragma that
3676 * enables the locale category 'category'. 'compiling' should indicate if
3677 * this is during the compilation phase (TRUE) or not (FALSE). */
3678
3679 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
3680
3681 SV *categories = cop_hints_fetch_pvs(cop, "locale", 0);
3682 if (! categories || categories == &PL_sv_placeholder) {
3683 return FALSE;
3684 }
3685
3686 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
3687 * a valid unsigned */
3688 assert(category >= -1);
3689 return cBOOL(SvUV(categories) & (1U << (category + 1)));
3690}
3691
3692char *
3693Perl_my_strerror(pTHX_ const int errnum)
3694{
3695 /* Returns a mortalized copy of the text of the error message associated
3696 * with 'errnum'. It uses the current locale's text unless the platform
3697 * doesn't have the LC_MESSAGES category or we are not being called from
3698 * within the scope of 'use locale'. In the former case, it uses whatever
3699 * strerror returns; in the latter case it uses the text from the C locale.
3700 *
3701 * The function just calls strerror(), but temporarily switches, if needed,
3702 * to the C locale */
3703
3704 char *errstr;
3705 dVAR;
3706
3707#ifndef USE_LOCALE_MESSAGES
3708
3709 /* If platform doesn't have messages category, we don't do any switching to
3710 * the C locale; we just use whatever strerror() returns */
3711
3712 errstr = savepv(Strerror(errnum));
3713
3714#else /* Has locale messages */
3715
3716 const bool within_locale_scope = IN_LC(LC_MESSAGES);
3717
3718# if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
3719
3720 /* This function is trivial if we don't have to worry about thread safety
3721 * and have strerror_l(), as it handles the switch of locales so we don't
3722 * have to deal with that. We don't have to worry about thread safety if
3723 * this is an unthreaded build, or if strerror_r() is also available. Both
3724 * it and strerror_l() are thread-safe. Plain strerror() isn't thread
3725 * safe. But on threaded builds when strerror_r() is available, the
3726 * apparent call to strerror() below is actually a macro that
3727 * behind-the-scenes calls strerror_r().
3728 */
3729
3730# if ! defined(USE_ITHREADS) || defined(HAS_STRERROR_R)
3731
3732 if (within_locale_scope) {
3733 errstr = savepv(strerror(errnum));
3734 }
3735 else {
3736 errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
3737 }
3738
3739# else
3740
3741 /* Here we have strerror_l(), but not strerror_r() and we are on a
3742 * threaded-build. We use strerror_l() for everything, constructing a
3743 * locale to pass to it if necessary */
3744
3745 bool do_free = FALSE;
3746 locale_t locale_to_use;
3747
3748 if (within_locale_scope) {
3749 locale_to_use = uselocale((locale_t) 0);
3750 if (locale_to_use == LC_GLOBAL_LOCALE) {
3751 locale_to_use = duplocale(LC_GLOBAL_LOCALE);
3752 do_free = TRUE;
3753 }
3754 }
3755 else { /* Use C locale if not within 'use locale' scope */
3756 locale_to_use = PL_C_locale_obj;
3757 }
3758
3759 errstr = savepv(strerror_l(errnum, locale_to_use));
3760
3761 if (do_free) {
3762 freelocale(locale_to_use);
3763 }
3764
3765# endif
3766# else /* Doesn't have strerror_l() */
3767
3768# ifdef USE_POSIX_2008_LOCALE
3769
3770 locale_t save_locale = NULL;
3771
3772# else
3773
3774 const char * save_locale = NULL;
3775 bool locale_is_C = FALSE;
3776
3777 /* We have a critical section to prevent another thread from changing the
3778 * locale out from under us (or zapping the buffer returned from
3779 * setlocale() ) */
3780 LOCALE_LOCK;
3781
3782# endif
3783
3784 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3785 "my_strerror called with errnum %d\n", errnum));
3786 if (! within_locale_scope) {
3787 errno = 0;
3788
3789# ifdef USE_POSIX_2008_LOCALE /* Use the thread-safe locale functions */
3790
3791 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3792 "Not within locale scope, about to call"
3793 " uselocale(0x%p)\n", PL_C_locale_obj));
3794 save_locale = uselocale(PL_C_locale_obj);
3795 if (! save_locale) {
3796 DEBUG_L(PerlIO_printf(Perl_debug_log,
3797 "uselocale failed, errno=%d\n", errno));
3798 }
3799 else {
3800 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3801 "uselocale returned 0x%p\n", save_locale));
3802 }
3803
3804# else /* Not thread-safe build */
3805
3806 save_locale = do_setlocale_c(LC_MESSAGES, NULL);
3807 if (! save_locale) {
3808 Perl_croak(aTHX_
3809 "panic: %s: %d: Could not find current LC_MESSAGES locale,"
3810 " errno=%d\n", __FILE__, __LINE__, errno);
3811 }
3812 else {
3813 locale_is_C = isNAME_C_OR_POSIX(save_locale);
3814
3815 /* Switch to the C locale if not already in it */
3816 if (! locale_is_C) {
3817
3818 /* The setlocale() just below likely will zap 'save_locale', so
3819 * create a copy. */
3820 save_locale = savepv(save_locale);
3821 do_setlocale_c(LC_MESSAGES, "C");
3822 }
3823 }
3824
3825# endif
3826
3827 } /* end of ! within_locale_scope */
3828 else {
3829 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n",
3830 __FILE__, __LINE__));
3831 }
3832
3833 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3834 "Any locale change has been done; about to call Strerror\n"));
3835 errstr = savepv(Strerror(errnum));
3836
3837 if (! within_locale_scope) {
3838 errno = 0;
3839
3840# ifdef USE_POSIX_2008_LOCALE
3841
3842 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3843 "%s: %d: not within locale scope, restoring the locale\n",
3844 __FILE__, __LINE__));
3845 if (save_locale && ! uselocale(save_locale)) {
3846 DEBUG_L(PerlIO_printf(Perl_debug_log,
3847 "uselocale restore failed, errno=%d\n", errno));
3848 }
3849 }
3850
3851# else
3852
3853 if (save_locale && ! locale_is_C) {
3854 if (! do_setlocale_c(LC_MESSAGES, save_locale)) {
3855 Perl_croak(aTHX_
3856 "panic: %s: %d: setlocale restore failed, errno=%d\n",
3857 __FILE__, __LINE__, errno);
3858 }
3859 Safefree(save_locale);
3860 }
3861 }
3862
3863 LOCALE_UNLOCK;
3864
3865# endif
3866# endif /* End of doesn't have strerror_l */
3867#endif /* End of does have locale messages */
3868
3869#ifdef DEBUGGING
3870
3871 if (DEBUG_Lv_TEST) {
3872 PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '");
3873 print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);
3874 PerlIO_printf(Perl_debug_log, "'\n");
3875 }
3876
3877#endif
3878
3879 SAVEFREEPV(errstr);
3880 return errstr;
3881}
3882
3883/*
3884
3885=for apidoc sync_locale
3886
3887Changing the program's locale should be avoided by XS code. Nevertheless,
3888certain non-Perl libraries called from XS, such as C<Gtk> do so. When this
3889happens, Perl needs to be told that the locale has changed. Use this function
3890to do so, before returning to Perl.
3891
3892=cut
3893*/
3894
3895void
3896Perl_sync_locale(pTHX)
3897{
3898 char * newlocale;
3899
3900#ifdef USE_LOCALE_CTYPE
3901
3902 newlocale = do_setlocale_c(LC_CTYPE, NULL);
3903 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3904 "%s:%d: %s\n", __FILE__, __LINE__,
3905 setlocale_debug_string(LC_CTYPE, NULL, newlocale)));
3906 new_ctype(newlocale);
3907
3908#endif /* USE_LOCALE_CTYPE */
3909#ifdef USE_LOCALE_COLLATE
3910
3911 newlocale = do_setlocale_c(LC_COLLATE, NULL);
3912 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3913 "%s:%d: %s\n", __FILE__, __LINE__,
3914 setlocale_debug_string(LC_COLLATE, NULL, newlocale)));
3915 new_collate(newlocale);
3916
3917#endif
3918#ifdef USE_LOCALE_NUMERIC
3919
3920 newlocale = do_setlocale_c(LC_NUMERIC, NULL);
3921 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3922 "%s:%d: %s\n", __FILE__, __LINE__,
3923 setlocale_debug_string(LC_NUMERIC, NULL, newlocale)));
3924 new_numeric(newlocale);
3925
3926#endif /* USE_LOCALE_NUMERIC */
3927
3928}
3929
3930#if defined(DEBUGGING) && defined(USE_LOCALE)
3931
3932STATIC char *
3933S_setlocale_debug_string(const int category, /* category number,
3934 like LC_ALL */
3935 const char* const locale, /* locale name */
3936
3937 /* return value from setlocale() when attempting to
3938 * set 'category' to 'locale' */
3939 const char* const retval)
3940{
3941 /* Returns a pointer to a NUL-terminated string in static storage with
3942 * added text about the info passed in. This is not thread safe and will
3943 * be overwritten by the next call, so this should be used just to
3944 * formulate a string to immediately print or savepv() on. */
3945
3946 /* initialise to a non-null value to keep it out of BSS and so keep
3947 * -DPERL_GLOBAL_STRUCT_PRIVATE happy */
3948 static char ret[128] = "If you can read this, thank your buggy C"
3949 " library strlcpy(), and change your hints file"
3950 " to undef it";
3951
3952 my_strlcpy(ret, "setlocale(", sizeof(ret));
3953 my_strlcat(ret, category_name(category), sizeof(ret));
3954 my_strlcat(ret, ", ", sizeof(ret));
3955
3956 if (locale) {
3957 my_strlcat(ret, "\"", sizeof(ret));
3958 my_strlcat(ret, locale, sizeof(ret));
3959 my_strlcat(ret, "\"", sizeof(ret));
3960 }
3961 else {
3962 my_strlcat(ret, "NULL", sizeof(ret));
3963 }
3964
3965 my_strlcat(ret, ") returned ", sizeof(ret));
3966
3967 if (retval) {
3968 my_strlcat(ret, "\"", sizeof(ret));
3969 my_strlcat(ret, retval, sizeof(ret));
3970 my_strlcat(ret, "\"", sizeof(ret));
3971 }
3972 else {
3973 my_strlcat(ret, "NULL", sizeof(ret));
3974 }
3975
3976 assert(strlen(ret) < sizeof(ret));
3977
3978 return ret;
3979}
3980
3981#endif
3982
3983
3984/*
3985 * ex: set ts=8 sts=4 sw=4 et:
3986 */