This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
document the new flags behaviour and why
[perl5.git] / locale.c
CommitLineData
98994639
HS
1/* locale.c
2 *
1129b882
NC
3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
98994639
HS
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550 12 * A Elbereth Gilthoniel,
cdad3b53 13 * silivren penna míriel
4ac71550 14 * o menel aglar elenath!
cdad3b53 15 * Na-chaered palan-díriel
4ac71550
TC
16 * o galadhremmin ennorath,
17 * Fanuilos, le linnathon
18 * nef aear, si nef aearon!
19 *
20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
98994639
HS
21 */
22
166f8a29
DM
23/* utility functions for handling locale-specific stuff like what
24 * character represents the decimal point.
0d071d52 25 *
7d4bcc4a
KW
26 * All C programs have an underlying locale. Perl code generally doesn't pay
27 * any attention to it except within the scope of a 'use locale'. For most
0d071d52
KW
28 * categories, it accomplishes this by just using different operations if it is
29 * in such scope than if not. However, various libc functions called by Perl
30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31 * are used to toggle between the current locale and the C locale depending on
a9ad02a8
KW
32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is
33 * switched to the C locale for outputting the message unless within the scope
34 * of 'use locale'.
e9bc6d6b
KW
35 *
36 * This code now has multi-thread-safe locale handling on systems that support
37 * that. This is completely transparent to most XS code. On earlier systems,
38 * it would be possible to emulate thread-safe locales, but this likely would
39 * involve a lot of locale switching, and would require XS code changes.
40 * Macros could be written so that the code wouldn't have to know which type of
41 * system is being used. It's unlikely that we would ever do that, since most
42 * modern systems support thread-safe locales, but there was code written to
43 * this end, and is retained, #ifdef'd out.
166f8a29
DM
44 */
45
98994639
HS
46#include "EXTERN.h"
47#define PERL_IN_LOCALE_C
f7416781 48#include "perl_langinfo.h"
98994639
HS
49#include "perl.h"
50
a4af207c
JH
51#include "reentr.h"
52
0dec74cd
KW
53#ifdef I_WCHAR
54# include <wchar.h>
55#endif
5b64f24c
KW
56#ifdef I_WCTYPE
57# include <wctype.h>
58#endif
0dec74cd 59
2fcc0ca9
KW
60/* If the environment says to, we can output debugging information during
61 * initialization. This is done before option parsing, and before any thread
62 * creation, so can be a file-level static */
8c3a0f6c 63#if ! defined(DEBUGGING)
5a4b0634
KW
64# define debug_initialization 0
65# define DEBUG_INITIALIZATION_set(v)
66#else
2fcc0ca9 67static bool debug_initialization = FALSE;
5a4b0634 68# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
2fcc0ca9
KW
69#endif
70
48015184
KW
71
72/* Returns the Unix errno portion; ignoring any others. This is a macro here
73 * instead of putting it into perl.h, because unclear to khw what should be
74 * done generally. */
75#define GET_ERRNO saved_errno
76
291a84fb
KW
77/* strlen() of a literal string constant. We might want this more general,
78 * but using it in just this file for now. A problem with more generality is
79 * the compiler warnings about comparing unlike signs */
ff1b739b
KW
80#define STRLENs(s) (sizeof("" s "") - 1)
81
63e5b0d7
KW
82/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
83 * return of setlocale(), then this is extremely likely to be the C or POSIX
84 * locale. However, the output of setlocale() is documented to be opaque, but
85 * the odds are extremely small that it would return these two strings for some
86 * other locale. Note that VMS in these two locales includes many non-ASCII
87 * characters as controls and punctuation (below are hex bytes):
88 * cntrl: 84-97 9B-9F
89 * punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
90 * Oddly, none there are listed as alphas, though some represent alphabetics
91 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
92#define isNAME_C_OR_POSIX(name) \
93 ( (name) != NULL \
94 && (( *(name) == 'C' && (*(name + 1)) == '\0') \
95 || strEQ((name), "POSIX")))
96
8ef6e574
KW
97#ifdef USE_LOCALE
98
47280b20
KW
99/* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far
100 * looked up. This is in the form of a C string: */
101
102#define UTF8NESS_SEP "\v"
103#define UTF8NESS_PREFIX "\f"
104
105/* So, the string looks like:
98994639 106 *
47280b20 107 * \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0
98994639 108 *
47280b20
KW
109 * where the digit 0 after the \a indicates that the locale starting just
110 * after the preceding \v is not UTF-8, and the digit 1 mean it is. */
111
112STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1);
113STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1);
114
115#define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \
116 UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0"
117
118/* The cache is initialized to C_and_POSIX_utf8ness at start up. These are
119 * kept there always. The remining portion of the cache is LRU, with the
120 * oldest looked-up locale at the tail end */
121
98994639
HS
122STATIC char *
123S_stdize_locale(pTHX_ char *locs)
124{
47280b20
KW
125 /* Standardize the locale name from a string returned by 'setlocale',
126 * possibly modifying that string.
127 *
128 * The typical return value of setlocale() is either
129 * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
130 * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
131 * (the space-separated values represent the various sublocales,
132 * in some unspecified order). This is not handled by this function.
133 *
134 * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
135 * which is harmful for further use of the string in setlocale(). This
136 * function removes the trailing new line and everything up through the '='
137 * */
138
7452cf6a 139 const char * const s = strchr(locs, '=');
98994639
HS
140 bool okay = TRUE;
141
7918f24d
NC
142 PERL_ARGS_ASSERT_STDIZE_LOCALE;
143
8772537c 144 if (s) {
1604cfb0
MS
145 const char * const t = strchr(s, '.');
146 okay = FALSE;
147 if (t) {
148 const char * const u = strchr(t, '\n');
149 if (u && (u[1] == 0)) {
150 const STRLEN len = u - s;
151 Move(s + 1, locs, len, char);
152 locs[len] = 0;
153 okay = TRUE;
154 }
155 }
98994639
HS
156 }
157
158 if (!okay)
1604cfb0 159 Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
98994639
HS
160
161 return locs;
162}
163
e5f10d49
KW
164/* Two parallel arrays; first the locale categories Perl uses on this system;
165 * the second array is their names. These arrays are in mostly arbitrary
166 * order. */
167
168const int categories[] = {
169
170# ifdef USE_LOCALE_NUMERIC
171 LC_NUMERIC,
172# endif
173# ifdef USE_LOCALE_CTYPE
174 LC_CTYPE,
175# endif
176# ifdef USE_LOCALE_COLLATE
177 LC_COLLATE,
178# endif
179# ifdef USE_LOCALE_TIME
180 LC_TIME,
181# endif
182# ifdef USE_LOCALE_MESSAGES
183 LC_MESSAGES,
184# endif
185# ifdef USE_LOCALE_MONETARY
186 LC_MONETARY,
187# endif
9821811f
KW
188# ifdef USE_LOCALE_ADDRESS
189 LC_ADDRESS,
190# endif
191# ifdef USE_LOCALE_IDENTIFICATION
192 LC_IDENTIFICATION,
193# endif
194# ifdef USE_LOCALE_MEASUREMENT
195 LC_MEASUREMENT,
196# endif
197# ifdef USE_LOCALE_PAPER
198 LC_PAPER,
199# endif
200# ifdef USE_LOCALE_TELEPHONE
201 LC_TELEPHONE,
202# endif
36dbc955
KW
203# ifdef USE_LOCALE_SYNTAX
204 LC_SYNTAX,
205# endif
206# ifdef USE_LOCALE_TOD
207 LC_TOD,
208# endif
e5f10d49
KW
209# ifdef LC_ALL
210 LC_ALL,
211# endif
212 -1 /* Placeholder because C doesn't allow a
213 trailing comma, and it would get complicated
214 with all the #ifdef's */
215};
216
217/* The top-most real element is LC_ALL */
218
4ef8bdf9 219const char * const category_names[] = {
e5f10d49
KW
220
221# ifdef USE_LOCALE_NUMERIC
222 "LC_NUMERIC",
223# endif
224# ifdef USE_LOCALE_CTYPE
225 "LC_CTYPE",
226# endif
227# ifdef USE_LOCALE_COLLATE
228 "LC_COLLATE",
229# endif
230# ifdef USE_LOCALE_TIME
231 "LC_TIME",
232# endif
233# ifdef USE_LOCALE_MESSAGES
234 "LC_MESSAGES",
235# endif
236# ifdef USE_LOCALE_MONETARY
237 "LC_MONETARY",
238# endif
9821811f
KW
239# ifdef USE_LOCALE_ADDRESS
240 "LC_ADDRESS",
241# endif
242# ifdef USE_LOCALE_IDENTIFICATION
243 "LC_IDENTIFICATION",
244# endif
245# ifdef USE_LOCALE_MEASUREMENT
246 "LC_MEASUREMENT",
247# endif
248# ifdef USE_LOCALE_PAPER
249 "LC_PAPER",
250# endif
251# ifdef USE_LOCALE_TELEPHONE
252 "LC_TELEPHONE",
253# endif
36dbc955
KW
254# ifdef USE_LOCALE_SYNTAX
255 "LC_SYNTAX",
256# endif
257# ifdef USE_LOCALE_TOD
258 "LC_TOD",
259# endif
e5f10d49
KW
260# ifdef LC_ALL
261 "LC_ALL",
262# endif
263 NULL /* Placeholder */
264 };
265
266# ifdef LC_ALL
267
268 /* On systems with LC_ALL, it is kept in the highest index position. (-2
269 * to account for the final unused placeholder element.) */
270# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
271
272# else
273
274 /* On systems without LC_ALL, we pretend it is there, one beyond the real
275 * top element, hence in the unused placeholder element. */
276# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
277
278# endif
279
280/* Pretending there is an LC_ALL element just above allows us to avoid most
281 * special cases. Most loops through these arrays in the code below are
282 * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'. They will work
283 * on either type of system. But the code must be written to not access the
948523db
KW
284 * element at 'LC_ALL_INDEX' except on platforms that have it. This can be
285 * checked for at compile time by using the #define LC_ALL_INDEX which is only
286 * defined if we do have LC_ALL. */
e5f10d49 287
b09aaf40
KW
288STATIC const char *
289S_category_name(const int category)
290{
291 unsigned int i;
292
293#ifdef LC_ALL
294
295 if (category == LC_ALL) {
296 return "LC_ALL";
297 }
298
299#endif
300
301 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
302 if (category == categories[i]) {
303 return category_names[i];
304 }
305 }
306
307 {
308 const char suffix[] = " (unknown)";
309 int temp = category;
310 Size_t length = sizeof(suffix) + 1;
311 char * unknown;
312 dTHX;
313
314 if (temp < 0) {
315 length++;
316 temp = - temp;
317 }
318
319 /* Calculate the number of digits */
320 while (temp >= 10) {
321 temp /= 10;
322 length++;
323 }
324
325 Newx(unknown, length, char);
326 my_snprintf(unknown, length, "%d%s", category, suffix);
327 SAVEFREEPV(unknown);
328 return unknown;
329 }
330}
331
948523db
KW
332/* Now create LC_foo_INDEX #defines for just those categories on this system */
333# ifdef USE_LOCALE_NUMERIC
334# define LC_NUMERIC_INDEX 0
335# define _DUMMY_NUMERIC LC_NUMERIC_INDEX
336# else
337# define _DUMMY_NUMERIC -1
338# endif
339# ifdef USE_LOCALE_CTYPE
340# define LC_CTYPE_INDEX _DUMMY_NUMERIC + 1
341# define _DUMMY_CTYPE LC_CTYPE_INDEX
342# else
343# define _DUMMY_CTYPE _DUMMY_NUMERIC
344# endif
345# ifdef USE_LOCALE_COLLATE
346# define LC_COLLATE_INDEX _DUMMY_CTYPE + 1
347# define _DUMMY_COLLATE LC_COLLATE_INDEX
348# else
8203b3c3 349# define _DUMMY_COLLATE _DUMMY_CTYPE
948523db
KW
350# endif
351# ifdef USE_LOCALE_TIME
352# define LC_TIME_INDEX _DUMMY_COLLATE + 1
353# define _DUMMY_TIME LC_TIME_INDEX
354# else
355# define _DUMMY_TIME _DUMMY_COLLATE
356# endif
357# ifdef USE_LOCALE_MESSAGES
358# define LC_MESSAGES_INDEX _DUMMY_TIME + 1
359# define _DUMMY_MESSAGES LC_MESSAGES_INDEX
360# else
361# define _DUMMY_MESSAGES _DUMMY_TIME
362# endif
363# ifdef USE_LOCALE_MONETARY
364# define LC_MONETARY_INDEX _DUMMY_MESSAGES + 1
365# define _DUMMY_MONETARY LC_MONETARY_INDEX
366# else
367# define _DUMMY_MONETARY _DUMMY_MESSAGES
368# endif
9821811f
KW
369# ifdef USE_LOCALE_ADDRESS
370# define LC_ADDRESS_INDEX _DUMMY_MONETARY + 1
371# define _DUMMY_ADDRESS LC_ADDRESS_INDEX
372# else
373# define _DUMMY_ADDRESS _DUMMY_MONETARY
374# endif
375# ifdef USE_LOCALE_IDENTIFICATION
376# define LC_IDENTIFICATION_INDEX _DUMMY_ADDRESS + 1
377# define _DUMMY_IDENTIFICATION LC_IDENTIFICATION_INDEX
378# else
379# define _DUMMY_IDENTIFICATION _DUMMY_ADDRESS
380# endif
381# ifdef USE_LOCALE_MEASUREMENT
382# define LC_MEASUREMENT_INDEX _DUMMY_IDENTIFICATION + 1
383# define _DUMMY_MEASUREMENT LC_MEASUREMENT_INDEX
384# else
385# define _DUMMY_MEASUREMENT _DUMMY_IDENTIFICATION
386# endif
387# ifdef USE_LOCALE_PAPER
388# define LC_PAPER_INDEX _DUMMY_MEASUREMENT + 1
389# define _DUMMY_PAPER LC_PAPER_INDEX
390# else
391# define _DUMMY_PAPER _DUMMY_MEASUREMENT
392# endif
393# ifdef USE_LOCALE_TELEPHONE
394# define LC_TELEPHONE_INDEX _DUMMY_PAPER + 1
395# define _DUMMY_TELEPHONE LC_TELEPHONE_INDEX
396# else
397# define _DUMMY_TELEPHONE _DUMMY_PAPER
398# endif
36dbc955
KW
399# ifdef USE_LOCALE_SYNTAX
400# define LC_SYNTAX_INDEX _DUMMY_TELEPHONE + 1
401# define _DUMMY_SYNTAX LC_SYNTAX_INDEX
402# else
403# define _DUMMY_SYNTAX _DUMMY_TELEPHONE
404# endif
405# ifdef USE_LOCALE_TOD
406# define LC_TOD_INDEX _DUMMY_SYNTAX + 1
407# define _DUMMY_TOD LC_TOD_INDEX
408# else
409# define _DUMMY_TOD _DUMMY_SYNTAX
410# endif
948523db 411# ifdef LC_ALL
36dbc955 412# define LC_ALL_INDEX _DUMMY_TOD + 1
948523db
KW
413# endif
414#endif /* ifdef USE_LOCALE */
8ef6e574 415
d2b24094 416/* Windows requres a customized base-level setlocale() */
e9bc6d6b
KW
417#ifdef WIN32
418# define my_setlocale(cat, locale) win32_setlocale(cat, locale)
419#else
420# define my_setlocale(cat, locale) setlocale(cat, locale)
421#endif
422
423#ifndef USE_POSIX_2008_LOCALE
424
425/* "do_setlocale_c" is intended to be called when the category is a constant
426 * known at compile time; "do_setlocale_r", not known until run time */
427# define do_setlocale_c(cat, locale) my_setlocale(cat, locale)
428# define do_setlocale_r(cat, locale) my_setlocale(cat, locale)
45cef8fb 429# define FIX_GLIBC_LC_MESSAGES_BUG(i)
e9bc6d6b
KW
430
431#else /* Below uses POSIX 2008 */
432
433/* We emulate setlocale with our own function. LC_foo is not valid for the
434 * POSIX 2008 functions. Instead LC_foo_MASK is used, which we use an array
435 * lookup to convert to. At compile time we have defined LC_foo_INDEX as the
436 * proper offset into the array 'category_masks[]'. At runtime, we have to
437 * search through the array (as the actual numbers may not be small contiguous
438 * positive integers which would lend themselves to array lookup). */
439# define do_setlocale_c(cat, locale) \
440 emulate_setlocale(cat, locale, cat ## _INDEX, TRUE)
441# define do_setlocale_r(cat, locale) emulate_setlocale(cat, locale, 0, FALSE)
442
45cef8fb
KW
443# if ! defined(__GLIBC__) || ! defined(USE_LOCALE_MESSAGES)
444
445# define FIX_GLIBC_LC_MESSAGES_BUG(i)
446
447# else /* Invalidate glibc cache of loaded translations, see [perl #134264] */
448
449# include <libintl.h>
450# define FIX_GLIBC_LC_MESSAGES_BUG(i) \
451 STMT_START { \
452 if ((i) == LC_MESSAGES_INDEX) { \
453 textdomain(textdomain(NULL)); \
454 } \
455 } STMT_END
456
457# endif
458
e9bc6d6b
KW
459/* A third array, parallel to the ones above to map from category to its
460 * equivalent mask */
461const int category_masks[] = {
462# ifdef USE_LOCALE_NUMERIC
463 LC_NUMERIC_MASK,
464# endif
465# ifdef USE_LOCALE_CTYPE
466 LC_CTYPE_MASK,
467# endif
468# ifdef USE_LOCALE_COLLATE
469 LC_COLLATE_MASK,
470# endif
471# ifdef USE_LOCALE_TIME
472 LC_TIME_MASK,
473# endif
474# ifdef USE_LOCALE_MESSAGES
475 LC_MESSAGES_MASK,
476# endif
477# ifdef USE_LOCALE_MONETARY
478 LC_MONETARY_MASK,
479# endif
480# ifdef USE_LOCALE_ADDRESS
481 LC_ADDRESS_MASK,
482# endif
483# ifdef USE_LOCALE_IDENTIFICATION
484 LC_IDENTIFICATION_MASK,
485# endif
486# ifdef USE_LOCALE_MEASUREMENT
487 LC_MEASUREMENT_MASK,
488# endif
489# ifdef USE_LOCALE_PAPER
490 LC_PAPER_MASK,
491# endif
492# ifdef USE_LOCALE_TELEPHONE
493 LC_TELEPHONE_MASK,
494# endif
36dbc955
KW
495# ifdef USE_LOCALE_SYNTAX
496 LC_SYNTAX_MASK,
497# endif
498# ifdef USE_LOCALE_TOD
499 LC_TOD_MASK,
500# endif
e9bc6d6b
KW
501 /* LC_ALL can't be turned off by a Configure
502 * option, and in Posix 2008, should always be
503 * here, so compile it in unconditionally.
504 * This could catch some glitches at compile
505 * time */
506 LC_ALL_MASK
507 };
508
509STATIC const char *
510S_emulate_setlocale(const int category,
511 const char * locale,
512 unsigned int index,
513 const bool is_index_valid
514 )
515{
516 /* This function effectively performs a setlocale() on just the current
517 * thread; thus it is thread-safe. It does this by using the POSIX 2008
518 * locale functions to emulate the behavior of setlocale(). Similar to
519 * regular setlocale(), the return from this function points to memory that
520 * can be overwritten by other system calls, so needs to be copied
521 * immediately if you need to retain it. The difference here is that
522 * system calls besides another setlocale() can overwrite it.
523 *
524 * By doing this, most locale-sensitive functions become thread-safe. The
525 * exceptions are mostly those that return a pointer to static memory.
526 *
527 * This function takes the same parameters, 'category' and 'locale', that
528 * the regular setlocale() function does, but it also takes two additional
529 * ones. This is because the 2008 functions don't use a category; instead
530 * they use a corresponding mask. Because this function operates in both
531 * worlds, it may need one or the other or both. This function can
532 * calculate the mask from the input category, but to avoid this
533 * calculation, if the caller knows at compile time what the mask is, it
534 * can pass it, setting 'is_index_valid' to TRUE; otherwise the mask
535 * parameter is ignored.
536 *
537 * POSIX 2008, for some sick reason, chose not to provide a method to find
538 * the category name of a locale. Some vendors have created a
539 * querylocale() function to do just that. This function is a lot simpler
540 * to implement on systems that have this. Otherwise, we have to keep
541 * track of what the locale has been set to, so that we can return its
542 * name to emulate setlocale(). It's also possible for C code in some
543 * library to change the locale without us knowing it, though as of
544 * September 2017, there are no occurrences in CPAN of uselocale(). Some
545 * libraries do use setlocale(), but that changes the global locale, and
546 * threads using per-thread locales will just ignore those changes.
547 * Another problem is that without querylocale(), we have to guess at what
548 * was meant by setting a locale of "". We handle this by not actually
549 * ever setting to "" (unless querylocale exists), but to emulate what we
550 * think should happen for "".
551 */
552
553 int mask;
554 locale_t old_obj;
555 locale_t new_obj;
e0f95237 556 const char * safelocale = locale ? locale : "(null)";
e9bc6d6b
KW
557 dTHX;
558
559# ifdef DEBUGGING
560
561 if (DEBUG_Lv_TEST || debug_initialization) {
e0f95237 562 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale input=%d (%s), \"%s\", %d, %d\n", __FILE__, __LINE__, category, category_name(category), safelocale, index, is_index_valid);
e9bc6d6b
KW
563 }
564
565# endif
566
567 /* If the input mask might be incorrect, calculate the correct one */
568 if (! is_index_valid) {
569 unsigned int i;
570
571# ifdef DEBUGGING
572
573 if (DEBUG_Lv_TEST || debug_initialization) {
574 PerlIO_printf(Perl_debug_log, "%s:%d: finding index of category %d (%s)\n", __FILE__, __LINE__, category, category_name(category));
575 }
576
577# endif
578
579 for (i = 0; i <= LC_ALL_INDEX; i++) {
580 if (category == categories[i]) {
581 index = i;
582 goto found_index;
583 }
584 }
585
586 /* Here, we don't know about this category, so can't handle it.
587 * Fallback to the early POSIX usages */
588 Perl_warner(aTHX_ packWARN(WARN_LOCALE),
589 "Unknown locale category %d; can't set it to %s\n",
e0f95237 590 category, safelocale);
e9bc6d6b
KW
591 return NULL;
592
593 found_index: ;
594
595# ifdef DEBUGGING
596
597 if (DEBUG_Lv_TEST || debug_initialization) {
598 PerlIO_printf(Perl_debug_log, "%s:%d: index is %d for %s\n", __FILE__, __LINE__, index, category_name(category));
599 }
600
601# endif
602
603 }
604
605 mask = category_masks[index];
606
607# ifdef DEBUGGING
608
609 if (DEBUG_Lv_TEST || debug_initialization) {
610 PerlIO_printf(Perl_debug_log, "%s:%d: category name is %s; mask is 0x%x\n", __FILE__, __LINE__, category_names[index], mask);
611 }
612
613# endif
614
615 /* If just querying what the existing locale is ... */
616 if (locale == NULL) {
617 locale_t cur_obj = uselocale((locale_t) 0);
618
619# ifdef DEBUGGING
620
621 if (DEBUG_Lv_TEST || debug_initialization) {
622 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale querying %p\n", __FILE__, __LINE__, cur_obj);
623 }
624
625# endif
626
627 if (cur_obj == LC_GLOBAL_LOCALE) {
628 return my_setlocale(category, NULL);
629 }
630
631# ifdef HAS_QUERYLOCALE
632
633 return (char *) querylocale(mask, cur_obj);
634
d2b24094 635# else
ecdda939
KW
636
637 /* If this assert fails, adjust the size of curlocales in intrpvar.h */
638 STATIC_ASSERT_STMT(C_ARRAY_LENGTH(PL_curlocales) > LC_ALL_INDEX);
639
f33fd0eb
KW
640# if defined(_NL_LOCALE_NAME) \
641 && defined(DEBUGGING) \
642 /* On systems that accept any locale name, the real underlying \
643 * locale is often returned by this internal function, so we \
644 * can't use it */ \
8e13243a 645 && ! defined(SETLOCALE_ACCEPTS_ANY_LOCALE_NAME)
e9bc6d6b
KW
646 {
647 /* Internal glibc for querylocale(), but doesn't handle
648 * empty-string ("") locale properly; who knows what other
3a732259 649 * glitches. Check for it now, under debug. */
e9bc6d6b
KW
650
651 char * temp_name = nl_langinfo_l(_NL_LOCALE_NAME(category),
652 uselocale((locale_t) 0));
653 /*
654 PerlIO_printf(Perl_debug_log, "%s:%d: temp_name=%s\n", __FILE__, __LINE__, temp_name ? temp_name : "NULL");
655 PerlIO_printf(Perl_debug_log, "%s:%d: index=%d\n", __FILE__, __LINE__, index);
656 PerlIO_printf(Perl_debug_log, "%s:%d: PL_curlocales[index]=%s\n", __FILE__, __LINE__, PL_curlocales[index]);
657 */
658 if (temp_name && PL_curlocales[index] && strNE(temp_name, "")) {
659 if ( strNE(PL_curlocales[index], temp_name)
660 && ! ( isNAME_C_OR_POSIX(temp_name)
661 && isNAME_C_OR_POSIX(PL_curlocales[index]))) {
662
663# ifdef USE_C_BACKTRACE
664
665 dump_c_backtrace(Perl_debug_log, 20, 1);
666
667# endif
668
669 Perl_croak(aTHX_ "panic: Mismatch between what Perl thinks %s is"
670 " (%s) and what internal glibc thinks"
671 " (%s)\n", category_names[index],
672 PL_curlocales[index], temp_name);
673 }
674
675 return temp_name;
676 }
677 }
678
679# endif
680
681 /* Without querylocale(), we have to use our record-keeping we've
682 * done. */
683
684 if (category != LC_ALL) {
685
686# ifdef DEBUGGING
687
688 if (DEBUG_Lv_TEST || debug_initialization) {
689 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, PL_curlocales[index]);
690 }
691
692# endif
693
694 return PL_curlocales[index];
695 }
696 else { /* For LC_ALL */
697 unsigned int i;
698 Size_t names_len = 0;
699 char * all_string;
7c93a471 700 bool are_all_categories_the_same_locale = TRUE;
e9bc6d6b
KW
701
702 /* If we have a valid LC_ALL value, just return it */
703 if (PL_curlocales[LC_ALL_INDEX]) {
704
705# ifdef DEBUGGING
706
707 if (DEBUG_Lv_TEST || debug_initialization) {
708 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, PL_curlocales[LC_ALL_INDEX]);
709 }
710
711# endif
712
713 return PL_curlocales[LC_ALL_INDEX];
714 }
715
716 /* Otherwise, we need to construct a string of name=value pairs.
717 * We use the glibc syntax, like
718 * LC_NUMERIC=C;LC_TIME=en_US.UTF-8;...
7c93a471
KW
719 * First calculate the needed size. Along the way, check if all
720 * the locale names are the same */
e9bc6d6b
KW
721 for (i = 0; i < LC_ALL_INDEX; i++) {
722
723# ifdef DEBUGGING
724
725 if (DEBUG_Lv_TEST || debug_initialization) {
726 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale i=%d, name=%s, locale=%s\n", __FILE__, __LINE__, i, category_names[i], PL_curlocales[i]);
727 }
728
729# endif
730
731 names_len += strlen(category_names[i])
732 + 1 /* '=' */
733 + strlen(PL_curlocales[i])
734 + 1; /* ';' */
7c93a471
KW
735
736 if (i > 0 && strNE(PL_curlocales[i], PL_curlocales[i-1])) {
737 are_all_categories_the_same_locale = FALSE;
738 }
739 }
740
741 /* If they are the same, we don't actually have to construct the
742 * string; we just make the entry in LC_ALL_INDEX valid, and be
743 * that single name */
744 if (are_all_categories_the_same_locale) {
745 PL_curlocales[LC_ALL_INDEX] = savepv(PL_curlocales[0]);
746 return PL_curlocales[LC_ALL_INDEX];
e9bc6d6b 747 }
7c93a471 748
e9bc6d6b
KW
749 names_len++; /* Trailing '\0' */
750 SAVEFREEPV(Newx(all_string, names_len, char));
751 *all_string = '\0';
752
753 /* Then fill in the string */
754 for (i = 0; i < LC_ALL_INDEX; i++) {
755
756# ifdef DEBUGGING
757
758 if (DEBUG_Lv_TEST || debug_initialization) {
759 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale i=%d, name=%s, locale=%s\n", __FILE__, __LINE__, i, category_names[i], PL_curlocales[i]);
760 }
761
762# endif
763
764 my_strlcat(all_string, category_names[i], names_len);
765 my_strlcat(all_string, "=", names_len);
766 my_strlcat(all_string, PL_curlocales[i], names_len);
767 my_strlcat(all_string, ";", names_len);
768 }
769
770# ifdef DEBUGGING
771
772 if (DEBUG_L_TEST || debug_initialization) {
773 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, all_string);
774 }
775
776 #endif
777
778 return all_string;
779 }
780
781# ifdef EINVAL
782
783 SETERRNO(EINVAL, LIB_INVARG);
784
785# endif
786
787 return NULL;
788
d2b24094
KW
789# endif
790
6aba5c5e 791 } /* End of this being setlocale(LC_foo, NULL) */
e9bc6d6b 792
f8115d60 793 /* Here, we are switching locales. */
e9bc6d6b
KW
794
795# ifndef HAS_QUERYLOCALE
796
797 if (strEQ(locale, "")) {
798
799 /* For non-querylocale() systems, we do the setting of "" ourselves to
800 * be sure that we really know what's going on. We follow the Linux
801 * documented behavior (but if that differs from the actual behavior,
802 * this won't work exactly as the OS implements). We go out and
803 * examine the environment based on our understanding of how the system
804 * works, and use that to figure things out */
805
806 const char * const lc_all = PerlEnv_getenv("LC_ALL");
807
808 /* Use any "LC_ALL" environment variable, as it overrides everything
809 * else. */
810 if (lc_all && strNE(lc_all, "")) {
811 locale = lc_all;
812 }
813 else {
814
815 /* Otherwise, we need to dig deeper. Unless overridden, the
816 * default is the LANG environment variable; if it doesn't exist,
817 * then "C" */
818
819 const char * default_name;
820
207cc8df 821 default_name = PerlEnv_getenv("LANG");
e9bc6d6b
KW
822
823 if (! default_name || strEQ(default_name, "")) {
929a7f8c 824 default_name = "C";
e9bc6d6b 825 }
e9bc6d6b
KW
826
827 if (category != LC_ALL) {
828 const char * const name = PerlEnv_getenv(category_names[index]);
829
830 /* Here we are setting a single category. Assume will have the
831 * default name */
832 locale = default_name;
833
834 /* But then look for an overriding environment variable */
835 if (name && strNE(name, "")) {
836 locale = name;
837 }
838 }
839 else {
840 bool did_override = FALSE;
841 unsigned int i;
842
843 /* Here, we are getting LC_ALL. Any categories that don't have
844 * a corresponding environment variable set should be set to
845 * LANG, or to "C" if there is no LANG. If no individual
846 * categories differ from this, we can just set LC_ALL. This
847 * is buggy on systems that have extra categories that we don't
848 * know about. If there is an environment variable that sets
849 * that category, we won't know to look for it, and so our use
850 * of LANG or "C" improperly overrides it. On the other hand,
851 * if we don't do what is done here, and there is no
852 * environment variable, the category's locale should be set to
853 * LANG or "C". So there is no good solution. khw thinks the
854 * best is to look at systems to see what categories they have,
855 * and include them, and then to assume that we know the
856 * complete set */
857
858 for (i = 0; i < LC_ALL_INDEX; i++) {
859 const char * const env_override
24f3e849 860 = PerlEnv_getenv(category_names[i]);
e9bc6d6b
KW
861 const char * this_locale = ( env_override
862 && strNE(env_override, ""))
863 ? env_override
864 : default_name;
6435f98d
KW
865 if (! emulate_setlocale(categories[i], this_locale, i, TRUE))
866 {
6435f98d
KW
867 return NULL;
868 }
e9bc6d6b
KW
869
870 if (strNE(this_locale, default_name)) {
871 did_override = TRUE;
872 }
e9bc6d6b
KW
873 }
874
875 /* If all the categories are the same, we can set LC_ALL to
876 * that */
877 if (! did_override) {
878 locale = default_name;
879 }
880 else {
881
882 /* Here, LC_ALL is no longer valid, as some individual
883 * categories don't match it. We call ourselves
884 * recursively, as that will execute the code that
885 * generates the proper locale string for this situation.
886 * We don't do the remainder of this function, as that is
887 * to update our records, and we've just done that for the
888 * individual categories in the loop above, and doing so
889 * would cause LC_ALL to be done as well */
890 return emulate_setlocale(LC_ALL, NULL, LC_ALL_INDEX, TRUE);
891 }
892 }
893 }
6aba5c5e 894 } /* End of this being setlocale(LC_foo, "") */
e9bc6d6b
KW
895 else if (strchr(locale, ';')) {
896
897 /* LC_ALL may actually incude a conglomeration of various categories.
898 * Without querylocale, this code uses the glibc (as of this writing)
899 * syntax for representing that, but that is not a stable API, and
900 * other platforms do it differently, so we have to handle all cases
901 * ourselves */
902
f0023d47 903 unsigned int i;
e9bc6d6b
KW
904 const char * s = locale;
905 const char * e = locale + strlen(locale);
906 const char * p = s;
907 const char * category_end;
908 const char * name_start;
909 const char * name_end;
910
f0023d47
KW
911 /* If the string that gives what to set doesn't include all categories,
912 * the omitted ones get set to "C". To get this behavior, first set
913 * all the individual categories to "C", and override the furnished
914 * ones below */
915 for (i = 0; i < LC_ALL_INDEX; i++) {
916 if (! emulate_setlocale(categories[i], "C", i, TRUE)) {
917 return NULL;
918 }
919 }
920
e9bc6d6b 921 while (s < e) {
e9bc6d6b
KW
922
923 /* Parse through the category */
924 while (isWORDCHAR(*p)) {
925 p++;
926 }
927 category_end = p;
928
929 if (*p++ != '=') {
930 Perl_croak(aTHX_
931 "panic: %s: %d: Unexpected character in locale name '%02X",
932 __FILE__, __LINE__, *(p-1));
933 }
934
935 /* Parse through the locale name */
936 name_start = p;
bee74f4b
KW
937 while (p < e && *p != ';') {
938 if (! isGRAPH(*p)) {
939 Perl_croak(aTHX_
940 "panic: %s: %d: Unexpected character in locale name '%02X",
941 __FILE__, __LINE__, *(p-1));
942 }
e9bc6d6b
KW
943 p++;
944 }
945 name_end = p;
946
bee74f4b
KW
947 /* Space past the semi-colon */
948 if (p < e) {
949 p++;
e9bc6d6b
KW
950 }
951
952 /* Find the index of the category name in our lists */
953 for (i = 0; i < LC_ALL_INDEX; i++) {
3c76a412 954 char * individ_locale;
e9bc6d6b
KW
955
956 /* Keep going if this isn't the index. The strnNE() avoids a
957 * Perl_form(), but would fail if ever a category name could be
958 * a substring of another one, like if there were a
959 * "LC_TIME_DATE" */
960 if strnNE(s, category_names[i], category_end - s) {
961 continue;
962 }
963
964 /* If this index is for the single category we're changing, we
965 * have found the locale to set it to. */
966 if (category == categories[i]) {
967 locale = Perl_form(aTHX_ "%.*s",
968 (int) (name_end - name_start),
969 name_start);
970 goto ready_to_set;
971 }
972
3c76a412 973 assert(category == LC_ALL);
bee74f4b
KW
974 individ_locale = Perl_form(aTHX_ "%.*s",
975 (int) (name_end - name_start), name_start);
6435f98d
KW
976 if (! emulate_setlocale(categories[i], individ_locale, i, TRUE))
977 {
978 return NULL;
979 }
e9bc6d6b
KW
980 }
981
982 s = p;
983 }
984
985 /* Here we have set all the individual categories by recursive calls.
986 * These collectively should have fixed up LC_ALL, so can just query
987 * what that now is */
988 assert(category == LC_ALL);
989
990 return do_setlocale_c(LC_ALL, NULL);
6aba5c5e
KW
991 } /* End of this being setlocale(LC_ALL,
992 "LC_CTYPE=foo;LC_NUMERIC=bar;...") */
e9bc6d6b
KW
993
994 ready_to_set: ;
995
f8115d60
KW
996 /* Here at the end of having to deal with the absence of querylocale().
997 * Some cases have already been fully handled by recursive calls to this
998 * function. But at this point, we haven't dealt with those, but are now
999 * prepared to, knowing what the locale name to set this category to is.
1000 * This would have come for free if this system had had querylocale() */
1001
e9bc6d6b
KW
1002# endif /* end of ! querylocale */
1003
f8115d60
KW
1004 assert(PL_C_locale_obj);
1005
1006 /* Switching locales generally entails freeing the current one's space (at
1007 * the C library's discretion). We need to stop using that locale before
1008 * the switch. So switch to a known locale object that we don't otherwise
1009 * mess with. This returns the locale object in effect at the time of the
1010 * switch. */
1011 old_obj = uselocale(PL_C_locale_obj);
1012
1013# ifdef DEBUGGING
1014
1015 if (DEBUG_Lv_TEST || debug_initialization) {
1016 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale was using %p\n", __FILE__, __LINE__, old_obj);
1017 }
1018
1019# endif
1020
1021 if (! old_obj) {
1022
1023# ifdef DEBUGGING
1024
1025 if (DEBUG_L_TEST || debug_initialization) {
1026 dSAVE_ERRNO;
1027 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale switching to C failed: %d\n", __FILE__, __LINE__, GET_ERRNO);
1028 RESTORE_ERRNO;
1029 }
1030
1031# endif
1032
1033 return NULL;
1034 }
1035
1036# ifdef DEBUGGING
1037
1038 if (DEBUG_Lv_TEST || debug_initialization) {
b22e9937
KW
1039 PerlIO_printf(Perl_debug_log,
1040 "%s:%d: emulate_setlocale now using %p\n",
1041 __FILE__, __LINE__, PL_C_locale_obj);
f8115d60
KW
1042 }
1043
1044# endif
1045
6aba5c5e
KW
1046 /* If this call is to switch to the LC_ALL C locale, it already exists, and
1047 * in fact, we already have switched to it (in preparation for what
1048 * normally is to come). But since we're already there, continue to use
70bd6bc8
KW
1049 * it instead of trying to create a new locale */
1050 if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(locale)) {
1051
1052# ifdef DEBUGGING
1053
1054 if (DEBUG_Lv_TEST || debug_initialization) {
1055 PerlIO_printf(Perl_debug_log,
1056 "%s:%d: will stay in C object\n", __FILE__, __LINE__);
1057 }
1058
1059# endif
1060
1061 new_obj = PL_C_locale_obj;
1062
1063 /* We already had switched to the C locale in preparation for freeing
b22e9937 1064 * 'old_obj' */
70bd6bc8
KW
1065 if (old_obj != LC_GLOBAL_LOCALE && old_obj != PL_C_locale_obj) {
1066 freelocale(old_obj);
1067 }
1068 }
1069 else {
b22e9937
KW
1070 /* If we weren't in a thread safe locale, set so that newlocale() below
1071 * which uses 'old_obj', uses an empty one. Same for our reserved C
1072 * object. The latter is defensive coding, so that, even if there is
1073 * some bug, we will never end up trying to modify either of these, as
1074 * if passed to newlocale(), they can be. */
1075 if (old_obj == LC_GLOBAL_LOCALE || old_obj == PL_C_locale_obj) {
1076 old_obj = (locale_t) 0;
1077 }
f8115d60 1078
b22e9937
KW
1079 /* Ready to create a new locale by modification of the exising one */
1080 new_obj = newlocale(mask, locale, old_obj);
e9bc6d6b 1081
b22e9937
KW
1082 if (! new_obj) {
1083 dSAVE_ERRNO;
e9bc6d6b
KW
1084
1085# ifdef DEBUGGING
1086
b22e9937
KW
1087 if (DEBUG_L_TEST || debug_initialization) {
1088 PerlIO_printf(Perl_debug_log,
1089 "%s:%d: emulate_setlocale creating new object"
1090 " failed: %d\n", __FILE__, __LINE__, GET_ERRNO);
1091 }
e9bc6d6b
KW
1092
1093# endif
1094
b22e9937 1095 if (! uselocale(old_obj)) {
e9bc6d6b
KW
1096
1097# ifdef DEBUGGING
1098
b22e9937
KW
1099 if (DEBUG_L_TEST || debug_initialization) {
1100 PerlIO_printf(Perl_debug_log,
1101 "%s:%d: switching back failed: %d\n",
1102 __FILE__, __LINE__, GET_ERRNO);
1103 }
e9bc6d6b
KW
1104
1105# endif
1106
b22e9937
KW
1107 }
1108 RESTORE_ERRNO;
1109 return NULL;
e9bc6d6b 1110 }
e9bc6d6b
KW
1111
1112# ifdef DEBUGGING
1113
b22e9937
KW
1114 if (DEBUG_Lv_TEST || debug_initialization) {
1115 PerlIO_printf(Perl_debug_log,
1116 "%s:%d: emulate_setlocale created %p",
1117 __FILE__, __LINE__, new_obj);
1118 if (old_obj) {
1119 PerlIO_printf(Perl_debug_log,
1120 "; should have freed %p", old_obj);
1121 }
1122 PerlIO_printf(Perl_debug_log, "\n");
19ee3daf 1123 }
e9bc6d6b
KW
1124
1125# endif
1126
b22e9937
KW
1127 /* And switch into it */
1128 if (! uselocale(new_obj)) {
1129 dSAVE_ERRNO;
e9bc6d6b
KW
1130
1131# ifdef DEBUGGING
1132
b22e9937
KW
1133 if (DEBUG_L_TEST || debug_initialization) {
1134 PerlIO_printf(Perl_debug_log,
1135 "%s:%d: emulate_setlocale switching to new object"
1136 " failed\n", __FILE__, __LINE__);
1137 }
e9bc6d6b
KW
1138
1139# endif
1140
b22e9937 1141 if (! uselocale(old_obj)) {
e9bc6d6b
KW
1142
1143# ifdef DEBUGGING
1144
b22e9937
KW
1145 if (DEBUG_L_TEST || debug_initialization) {
1146 PerlIO_printf(Perl_debug_log,
1147 "%s:%d: switching back failed: %d\n",
1148 __FILE__, __LINE__, GET_ERRNO);
1149 }
e9bc6d6b
KW
1150
1151# endif
1152
b22e9937
KW
1153 }
1154 freelocale(new_obj);
1155 RESTORE_ERRNO;
1156 return NULL;
e9bc6d6b 1157 }
70bd6bc8 1158 }
e9bc6d6b
KW
1159
1160# ifdef DEBUGGING
1161
1162 if (DEBUG_Lv_TEST || debug_initialization) {
b22e9937
KW
1163 PerlIO_printf(Perl_debug_log,
1164 "%s:%d: emulate_setlocale now using %p\n",
1165 __FILE__, __LINE__, new_obj);
e9bc6d6b
KW
1166 }
1167
1168# endif
1169
1170 /* We are done, except for updating our records (if the system doesn't keep
1171 * them) and in the case of locale "", we don't actually know what the
1172 * locale that got switched to is, as it came from the environment. So
1173 * have to find it */
1174
1175# ifdef HAS_QUERYLOCALE
1176
1177 if (strEQ(locale, "")) {
1178 locale = querylocale(mask, new_obj);
1179 }
1180
1181# else
1182
1183 /* Here, 'locale' is the return value */
1184
1185 /* Without querylocale(), we have to update our records */
1186
1187 if (category == LC_ALL) {
1188 unsigned int i;
1189
1190 /* For LC_ALL, we change all individual categories to correspond */
1191 /* PL_curlocales is a parallel array, so has same
1192 * length as 'categories' */
1193 for (i = 0; i <= LC_ALL_INDEX; i++) {
1194 Safefree(PL_curlocales[i]);
1195 PL_curlocales[i] = savepv(locale);
1196 }
45cef8fb
KW
1197
1198 FIX_GLIBC_LC_MESSAGES_BUG(LC_MESSAGES_INDEX);
e9bc6d6b
KW
1199 }
1200 else {
1201
1202 /* For a single category, if it's not the same as the one in LC_ALL, we
1203 * nullify LC_ALL */
1204
1205 if (PL_curlocales[LC_ALL_INDEX] && strNE(PL_curlocales[LC_ALL_INDEX], locale)) {
1206 Safefree(PL_curlocales[LC_ALL_INDEX]);
1207 PL_curlocales[LC_ALL_INDEX] = NULL;
1208 }
1209
1210 /* Then update the category's record */
1211 Safefree(PL_curlocales[index]);
1212 PL_curlocales[index] = savepv(locale);
45cef8fb
KW
1213
1214 FIX_GLIBC_LC_MESSAGES_BUG(index);
e9bc6d6b
KW
1215 }
1216
1217# endif
1218
1219 return locale;
1220}
1221
1222#endif /* USE_POSIX_2008_LOCALE */
1223
d36adde0 1224#ifdef USE_LOCALE
837ce802 1225
a4f00dcc 1226STATIC void
86799d2d 1227S_set_numeric_radix(pTHX_ const bool use_locale)
98994639 1228{
86799d2d
KW
1229 /* If 'use_locale' is FALSE, set to use a dot for the radix character. If
1230 * TRUE, use the radix character derived from the current locale */
7d4bcc4a 1231
86799d2d
KW
1232#if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \
1233 || defined(HAS_NL_LANGINFO))
98994639 1234
87f8e8e7 1235 const char * radix = (use_locale)
4e6826bf 1236 ? my_nl_langinfo(RADIXCHAR, FALSE)
87f8e8e7
KW
1237 /* FALSE => already in dest locale */
1238 : ".";
2213a3be 1239
87f8e8e7 1240 sv_setpv(PL_numeric_radix_sv, radix);
86799d2d 1241
87f8e8e7
KW
1242 /* If this is valid UTF-8 that isn't totally ASCII, and we are in
1243 * a UTF-8 locale, then mark the radix as being in UTF-8 */
1244 if (is_utf8_non_invariant_string((U8 *) SvPVX(PL_numeric_radix_sv),
7a393424 1245 SvCUR(PL_numeric_radix_sv))
87f8e8e7
KW
1246 && _is_cur_LC_category_utf8(LC_NUMERIC))
1247 {
1248 SvUTF8_on(PL_numeric_radix_sv);
1249 }
86799d2d
KW
1250
1251# ifdef DEBUGGING
7d4bcc4a 1252
2fcc0ca9
KW
1253 if (DEBUG_L_TEST || debug_initialization) {
1254 PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
3ca88433
KW
1255 SvPVX(PL_numeric_radix_sv),
1256 cBOOL(SvUTF8(PL_numeric_radix_sv)));
2fcc0ca9 1257 }
69014004 1258
86799d2d 1259# endif
d36adde0
KW
1260#else
1261
1262 PERL_UNUSED_ARG(use_locale);
1263
86799d2d 1264#endif /* USE_LOCALE_NUMERIC and can find the radix char */
7d4bcc4a 1265
98994639
HS
1266}
1267
398eeea9
KW
1268STATIC void
1269S_new_numeric(pTHX_ const char *newnum)
98994639 1270{
7d4bcc4a
KW
1271
1272#ifndef USE_LOCALE_NUMERIC
1273
1274 PERL_UNUSED_ARG(newnum);
1275
1276#else
0d071d52 1277
291a84fb 1278 /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell
0d071d52
KW
1279 * core Perl this and that 'newnum' is the name of the new locale.
1280 * It installs this locale as the current underlying default.
1281 *
1282 * The default locale and the C locale can be toggled between by use of the
5792c642
KW
1283 * set_numeric_underlying() and set_numeric_standard() functions, which
1284 * should probably not be called directly, but only via macros like
0d071d52
KW
1285 * SET_NUMERIC_STANDARD() in perl.h.
1286 *
1287 * The toggling is necessary mainly so that a non-dot radix decimal point
1288 * character can be output, while allowing internal calculations to use a
1289 * dot.
1290 *
1291 * This sets several interpreter-level variables:
bb304765 1292 * PL_numeric_name The underlying locale's name: a copy of 'newnum'
892e6465 1293 * PL_numeric_underlying A boolean indicating if the toggled state is such
7738054c
KW
1294 * that the current locale is the program's underlying
1295 * locale
1296 * PL_numeric_standard An int indicating if the toggled state is such
4c68b815
KW
1297 * that the current locale is the C locale or
1298 * indistinguishable from the C locale. If non-zero, it
1299 * is in C; if > 1, it means it may not be toggled away
7738054c 1300 * from C.
4c68b815
KW
1301 * PL_numeric_underlying_is_standard A bool kept by this function
1302 * indicating that the underlying locale and the standard
1303 * C locale are indistinguishable for the purposes of
1304 * LC_NUMERIC. This happens when both of the above two
1305 * variables are true at the same time. (Toggling is a
1306 * no-op under these circumstances.) This variable is
1307 * used to avoid having to recalculate.
398eeea9 1308 */
0d071d52 1309
b03f34cf 1310 char *save_newnum;
98994639
HS
1311
1312 if (! newnum) {
1604cfb0
MS
1313 Safefree(PL_numeric_name);
1314 PL_numeric_name = NULL;
1315 PL_numeric_standard = TRUE;
1316 PL_numeric_underlying = TRUE;
1317 PL_numeric_underlying_is_standard = TRUE;
1318 return;
98994639
HS
1319 }
1320
b03f34cf 1321 save_newnum = stdize_locale(savepv(newnum));
892e6465 1322 PL_numeric_underlying = TRUE;
4c68b815
KW
1323 PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
1324
69c5e0db
KW
1325#ifndef TS_W32_BROKEN_LOCALECONV
1326
4c68b815 1327 /* If its name isn't C nor POSIX, it could still be indistinguishable from
69c5e0db
KW
1328 * them. But on broken Windows systems calling my_nl_langinfo() for
1329 * THOUSEP can currently (but rarely) cause a race, so avoid doing that,
1330 * and just always change the locale if not C nor POSIX on those systems */
4c68b815 1331 if (! PL_numeric_standard) {
4e6826bf 1332 PL_numeric_standard = cBOOL(strEQ(".", my_nl_langinfo(RADIXCHAR,
4c68b815 1333 FALSE /* Don't toggle locale */ ))
4e6826bf 1334 && strEQ("", my_nl_langinfo(THOUSEP, FALSE)));
4c68b815 1335 }
abe1abcf 1336
69c5e0db
KW
1337#endif
1338
4c68b815 1339 /* Save the new name if it isn't the same as the previous one, if any */
b03f34cf 1340 if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
1604cfb0
MS
1341 Safefree(PL_numeric_name);
1342 PL_numeric_name = save_newnum;
b03f34cf 1343 }
abe1abcf 1344 else {
1604cfb0 1345 Safefree(save_newnum);
abe1abcf 1346 }
4c28b29c 1347
4c68b815
KW
1348 PL_numeric_underlying_is_standard = PL_numeric_standard;
1349
7e5377f7 1350# ifdef HAS_POSIX_2008_LOCALE
e1aa2579
KW
1351
1352 PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
1353 PL_numeric_name,
1354 PL_underlying_numeric_obj);
1355
1356#endif
1357
4c68b815
KW
1358 if (DEBUG_L_TEST || debug_initialization) {
1359 PerlIO_printf(Perl_debug_log, "Called new_numeric with %s, PL_numeric_name=%s\n", newnum, PL_numeric_name);
1360 }
1361
4c28b29c
KW
1362 /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't
1363 * have to worry about the radix being a non-dot. (Core operations that
1364 * need the underlying locale change to it temporarily). */
84f1d29f
KW
1365 if (PL_numeric_standard) {
1366 set_numeric_radix(0);
1367 }
1368 else {
1369 set_numeric_standard();
1370 }
4c28b29c 1371
98994639 1372#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 1373
98994639
HS
1374}
1375
1376void
1377Perl_set_numeric_standard(pTHX)
1378{
7d4bcc4a 1379
98994639 1380#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 1381
28c1bf33
KW
1382 /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like
1383 * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The
1384 * macro avoids calling this routine if toggling isn't necessary according
1385 * to our records (which could be wrong if some XS code has changed the
1386 * locale behind our back) */
0d071d52 1387
7d4bcc4a
KW
1388# ifdef DEBUGGING
1389
2fcc0ca9
KW
1390 if (DEBUG_L_TEST || debug_initialization) {
1391 PerlIO_printf(Perl_debug_log,
7bb8f702 1392 "Setting LC_NUMERIC locale to standard C\n");
2fcc0ca9 1393 }
98994639 1394
7d4bcc4a 1395# endif
7bb8f702
KW
1396
1397 do_setlocale_c(LC_NUMERIC, "C");
1398 PL_numeric_standard = TRUE;
1399 PL_numeric_underlying = PL_numeric_underlying_is_standard;
1400 set_numeric_radix(0);
1401
98994639 1402#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 1403
98994639
HS
1404}
1405
1406void
5792c642 1407Perl_set_numeric_underlying(pTHX)
98994639 1408{
7d4bcc4a 1409
98994639 1410#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 1411
28c1bf33 1412 /* Toggle the LC_NUMERIC locale to the current underlying default. Most
7d4bcc4a
KW
1413 * code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h
1414 * instead of calling this directly. The macro avoids calling this routine
1415 * if toggling isn't necessary according to our records (which could be
1416 * wrong if some XS code has changed the locale behind our back) */
a9b8c0d8 1417
7d4bcc4a
KW
1418# ifdef DEBUGGING
1419
2fcc0ca9
KW
1420 if (DEBUG_L_TEST || debug_initialization) {
1421 PerlIO_printf(Perl_debug_log,
7bb8f702 1422 "Setting LC_NUMERIC locale to %s\n",
2fcc0ca9
KW
1423 PL_numeric_name);
1424 }
98994639 1425
7d4bcc4a 1426# endif
7bb8f702
KW
1427
1428 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
1429 PL_numeric_standard = PL_numeric_underlying_is_standard;
1430 PL_numeric_underlying = TRUE;
1431 set_numeric_radix(! PL_numeric_standard);
1432
98994639 1433#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 1434
98994639
HS
1435}
1436
1437/*
1438 * Set up for a new ctype locale.
1439 */
a4f00dcc
KW
1440STATIC void
1441S_new_ctype(pTHX_ const char *newctype)
98994639 1442{
7d4bcc4a
KW
1443
1444#ifndef USE_LOCALE_CTYPE
1445
7d4bcc4a
KW
1446 PERL_UNUSED_ARG(newctype);
1447 PERL_UNUSED_CONTEXT;
1448
1449#else
0d071d52 1450
291a84fb 1451 /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
0d071d52
KW
1452 * core Perl this and that 'newctype' is the name of the new locale.
1453 *
1454 * This function sets up the folding arrays for all 256 bytes, assuming
1455 * that tofold() is tolc() since fold case is not a concept in POSIX,
1456 *
1457 * Any code changing the locale (outside this file) should use
9aac5db8
KW
1458 * Perl_setlocale or POSIX::setlocale, which call this function. Therefore
1459 * this function should be called directly only from this file and from
0d071d52
KW
1460 * POSIX::setlocale() */
1461
013f4e03 1462 unsigned int i;
98994639 1463
8b7358b9
KW
1464 /* Don't check for problems if we are suppressing the warnings */
1465 bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
30d8090d 1466 bool maybe_utf8_turkic = FALSE;
8b7358b9 1467
7918f24d
NC
1468 PERL_ARGS_ASSERT_NEW_CTYPE;
1469
215c5139
KW
1470 /* We will replace any bad locale warning with 1) nothing if the new one is
1471 * ok; or 2) a new warning for the bad new locale */
1472 if (PL_warn_locale) {
1473 SvREFCNT_dec_NN(PL_warn_locale);
1474 PL_warn_locale = NULL;
1475 }
1476
c1284011 1477 PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
31f05a37
KW
1478
1479 /* A UTF-8 locale gets standard rules. But note that code still has to
1480 * handle this specially because of the three problematic code points */
1481 if (PL_in_utf8_CTYPE_locale) {
1482 Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
30d8090d
KW
1483
1484 /* UTF-8 locales can have special handling for 'I' and 'i' if they are
1485 * Turkic. Make sure these two are the only anomalies. (We don't use
1486 * towupper and towlower because they aren't in C89.) */
5b64f24c
KW
1487
1488#if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER)
1489
1490 if (towupper('i') == 0x130 && towlower('I') == 0x131) {
1491
1492#else
1493
30d8090d 1494 if (toupper('i') == 'i' && tolower('I') == 'I') {
5b64f24c
KW
1495
1496#endif
30d8090d
KW
1497 check_for_problems = TRUE;
1498 maybe_utf8_turkic = TRUE;
1499 }
31f05a37 1500 }
8b7358b9
KW
1501
1502 /* We don't populate the other lists if a UTF-8 locale, but do check that
1503 * everything works as expected, unless checking turned off */
1504 if (check_for_problems || ! PL_in_utf8_CTYPE_locale) {
8c6180a9
KW
1505 /* Assume enough space for every character being bad. 4 spaces each
1506 * for the 94 printable characters that are output like "'x' "; and 5
1507 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
1508 * NUL */
8b7358b9 1509 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
8c6180a9
KW
1510 bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
1511 to start */
1512 unsigned int bad_count = 0; /* Count of bad characters */
1513
baa60164 1514 for (i = 0; i < 256; i++) {
8b7358b9 1515 if (! PL_in_utf8_CTYPE_locale) {
bd6d0898
KW
1516 if (isupper(i))
1517 PL_fold_locale[i] = (U8) tolower(i);
1518 else if (islower(i))
1519 PL_fold_locale[i] = (U8) toupper(i);
1520 else
1521 PL_fold_locale[i] = (U8) i;
8b7358b9 1522 }
8c6180a9
KW
1523
1524 /* If checking for locale problems, see if the native ASCII-range
1525 * printables plus \n and \t are in their expected categories in
1526 * the new locale. If not, this could mean big trouble, upending
1527 * Perl's and most programs' assumptions, like having a
1528 * metacharacter with special meaning become a \w. Fortunately,
1529 * it's very rare to find locales that aren't supersets of ASCII
1530 * nowadays. It isn't a problem for most controls to be changed
1531 * into something else; we check only \n and \t, though perhaps \r
1532 * could be an issue as well. */
7d4bcc4a 1533 if ( check_for_problems
8c6180a9
KW
1534 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
1535 {
8b7358b9 1536 bool is_bad = FALSE;
6726b4f4 1537 char name[4] = { '\0' };
8b7358b9
KW
1538
1539 /* Convert the name into a string */
6726b4f4 1540 if (isGRAPH_A(i)) {
8b7358b9
KW
1541 name[0] = i;
1542 name[1] = '\0';
1543 }
1544 else if (i == '\n') {
6726b4f4
KW
1545 my_strlcpy(name, "\\n", sizeof(name));
1546 }
1547 else if (i == '\t') {
1548 my_strlcpy(name, "\\t", sizeof(name));
8b7358b9
KW
1549 }
1550 else {
6726b4f4
KW
1551 assert(i == ' ');
1552 my_strlcpy(name, "' '", sizeof(name));
8b7358b9
KW
1553 }
1554
1555 /* Check each possibe class */
1556 if (UNLIKELY(cBOOL(isalnum(i)) != cBOOL(isALPHANUMERIC_A(i)))) {
1557 is_bad = TRUE;
1558 DEBUG_L(PerlIO_printf(Perl_debug_log,
1559 "isalnum('%s') unexpectedly is %d\n",
1560 name, cBOOL(isalnum(i))));
1561 }
1562 if (UNLIKELY(cBOOL(isalpha(i)) != cBOOL(isALPHA_A(i)))) {
1563 is_bad = TRUE;
1564 DEBUG_L(PerlIO_printf(Perl_debug_log,
1565 "isalpha('%s') unexpectedly is %d\n",
1566 name, cBOOL(isalpha(i))));
1567 }
1568 if (UNLIKELY(cBOOL(isdigit(i)) != cBOOL(isDIGIT_A(i)))) {
1569 is_bad = TRUE;
1570 DEBUG_L(PerlIO_printf(Perl_debug_log,
1571 "isdigit('%s') unexpectedly is %d\n",
1572 name, cBOOL(isdigit(i))));
1573 }
1574 if (UNLIKELY(cBOOL(isgraph(i)) != cBOOL(isGRAPH_A(i)))) {
1575 is_bad = TRUE;
1576 DEBUG_L(PerlIO_printf(Perl_debug_log,
1577 "isgraph('%s') unexpectedly is %d\n",
1578 name, cBOOL(isgraph(i))));
1579 }
1580 if (UNLIKELY(cBOOL(islower(i)) != cBOOL(isLOWER_A(i)))) {
1581 is_bad = TRUE;
1582 DEBUG_L(PerlIO_printf(Perl_debug_log,
1583 "islower('%s') unexpectedly is %d\n",
1584 name, cBOOL(islower(i))));
1585 }
1586 if (UNLIKELY(cBOOL(isprint(i)) != cBOOL(isPRINT_A(i)))) {
1587 is_bad = TRUE;
1588 DEBUG_L(PerlIO_printf(Perl_debug_log,
1589 "isprint('%s') unexpectedly is %d\n",
1590 name, cBOOL(isprint(i))));
1591 }
1592 if (UNLIKELY(cBOOL(ispunct(i)) != cBOOL(isPUNCT_A(i)))) {
1593 is_bad = TRUE;
1594 DEBUG_L(PerlIO_printf(Perl_debug_log,
1595 "ispunct('%s') unexpectedly is %d\n",
1596 name, cBOOL(ispunct(i))));
1597 }
1598 if (UNLIKELY(cBOOL(isspace(i)) != cBOOL(isSPACE_A(i)))) {
1599 is_bad = TRUE;
1600 DEBUG_L(PerlIO_printf(Perl_debug_log,
1601 "isspace('%s') unexpectedly is %d\n",
1602 name, cBOOL(isspace(i))));
1603 }
1604 if (UNLIKELY(cBOOL(isupper(i)) != cBOOL(isUPPER_A(i)))) {
1605 is_bad = TRUE;
1606 DEBUG_L(PerlIO_printf(Perl_debug_log,
1607 "isupper('%s') unexpectedly is %d\n",
1608 name, cBOOL(isupper(i))));
1609 }
1610 if (UNLIKELY(cBOOL(isxdigit(i))!= cBOOL(isXDIGIT_A(i)))) {
1611 is_bad = TRUE;
1612 DEBUG_L(PerlIO_printf(Perl_debug_log,
1613 "isxdigit('%s') unexpectedly is %d\n",
1614 name, cBOOL(isxdigit(i))));
1615 }
65c15174 1616 if (UNLIKELY(tolower(i) != (int) toLOWER_A(i))) {
8b7358b9
KW
1617 is_bad = TRUE;
1618 DEBUG_L(PerlIO_printf(Perl_debug_log,
1619 "tolower('%s')=0x%x instead of the expected 0x%x\n",
1620 name, tolower(i), (int) toLOWER_A(i)));
1621 }
65c15174 1622 if (UNLIKELY(toupper(i) != (int) toUPPER_A(i))) {
8b7358b9
KW
1623 is_bad = TRUE;
1624 DEBUG_L(PerlIO_printf(Perl_debug_log,
1625 "toupper('%s')=0x%x instead of the expected 0x%x\n",
1626 name, toupper(i), (int) toUPPER_A(i)));
1627 }
1628 if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i)))) {
1629 is_bad = TRUE;
1630 DEBUG_L(PerlIO_printf(Perl_debug_log,
1631 "'\\n' (=%02X) is not a control\n", (int) i));
1632 }
1633
1634 /* Add to the list; Separate multiple entries with a blank */
1635 if (is_bad) {
1636 if (bad_count) {
1637 my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list));
8c6180a9 1638 }
8b7358b9
KW
1639 my_strlcat(bad_chars_list, name, sizeof(bad_chars_list));
1640 bad_count++;
8c6180a9
KW
1641 }
1642 }
1643 }
1644
30d8090d
KW
1645 if (bad_count == 2 && maybe_utf8_turkic) {
1646 bad_count = 0;
1647 *bad_chars_list = '\0';
1648 PL_fold_locale['I'] = 'I';
1649 PL_fold_locale['i'] = 'i';
1650 PL_in_utf8_turkic_locale = TRUE;
1651 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s is turkic\n",
1652 __FILE__, __LINE__, newctype));
1653 }
1654 else {
b8df1494 1655 PL_in_utf8_turkic_locale = FALSE;
30d8090d 1656 }
b8df1494 1657
7d4bcc4a
KW
1658# ifdef MB_CUR_MAX
1659
8c6180a9 1660 /* We only handle single-byte locales (outside of UTF-8 ones; so if
d35fca5f 1661 * this locale requires more than one byte, there are going to be
8c6180a9 1662 * problems. */
9c8a6dc2
KW
1663 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1664 "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
1665 __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
1666
8b7358b9
KW
1667 if ( check_for_problems && MB_CUR_MAX > 1
1668 && ! PL_in_utf8_CTYPE_locale
ba1a4362
KW
1669
1670 /* Some platforms return MB_CUR_MAX > 1 for even the "C"
1671 * locale. Just assume that the implementation for them (plus
1672 * for POSIX) is correct and the > 1 value is spurious. (Since
1673 * these are specially handled to never be considered UTF-8
1674 * locales, as long as this is the only problem, everything
1675 * should work fine */
1676 && strNE(newctype, "C") && strNE(newctype, "POSIX"))
1677 {
8c6180a9
KW
1678 multi_byte_locale = TRUE;
1679 }
7d4bcc4a
KW
1680
1681# endif
8c6180a9 1682
30d8090d
KW
1683 /* If we found problems and we want them output, do so */
1684 if ( (UNLIKELY(bad_count) || UNLIKELY(multi_byte_locale))
1685 && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
1686 {
8b7358b9
KW
1687 if (UNLIKELY(bad_count) && PL_in_utf8_CTYPE_locale) {
1688 PL_warn_locale = Perl_newSVpvf(aTHX_
1689 "Locale '%s' contains (at least) the following characters"
578a6a87
KW
1690 " which have\nunexpected meanings: %s\nThe Perl program"
1691 " will use the expected meanings",
8b7358b9 1692 newctype, bad_chars_list);
8b7358b9
KW
1693 }
1694 else {
1695 PL_warn_locale = Perl_newSVpvf(aTHX_
8c6180a9 1696 "Locale '%s' may not work well.%s%s%s\n",
780fcc9f 1697 newctype,
8c6180a9
KW
1698 (multi_byte_locale)
1699 ? " Some characters in it are not recognized by"
1700 " Perl."
1701 : "",
1702 (bad_count)
1703 ? "\nThe following characters (and maybe others)"
1704 " may not have the same meaning as the Perl"
1705 " program expects:\n"
1706 : "",
1707 (bad_count)
1708 ? bad_chars_list
1709 : ""
1710 );
8b7358b9
KW
1711 }
1712
b119c2be
KW
1713# ifdef HAS_NL_LANGINFO
1714
1715 Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s",
1716 /* parameter FALSE is a don't care here */
4e6826bf 1717 my_nl_langinfo(CODESET, FALSE));
b119c2be
KW
1718
1719# endif
1720
8b7358b9
KW
1721 Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n");
1722
cc9eaeb0 1723 /* If we are actually in the scope of the locale or are debugging,
bddebb56
KW
1724 * output the message now. If not in that scope, we save the
1725 * message to be output at the first operation using this locale,
1726 * if that actually happens. Most programs don't use locales, so
1727 * they are immune to bad ones. */
cc9eaeb0 1728 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
780fcc9f 1729
780fcc9f
KW
1730 /* The '0' below suppresses a bogus gcc compiler warning */
1731 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
bddebb56 1732
bddebb56
KW
1733 if (IN_LC(LC_CTYPE)) {
1734 SvREFCNT_dec_NN(PL_warn_locale);
1735 PL_warn_locale = NULL;
1736 }
780fcc9f 1737 }
baa60164 1738 }
31f05a37 1739 }
98994639
HS
1740
1741#endif /* USE_LOCALE_CTYPE */
7d4bcc4a 1742
98994639
HS
1743}
1744
98994639 1745void
2726666d
KW
1746Perl__warn_problematic_locale()
1747{
2726666d
KW
1748
1749#ifdef USE_LOCALE_CTYPE
1750
5f04a188
KW
1751 dTHX;
1752
1753 /* Internal-to-core function that outputs the message in PL_warn_locale,
1754 * and then NULLS it. Should be called only through the macro
1755 * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */
1756
2726666d 1757 if (PL_warn_locale) {
2726666d
KW
1758 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
1759 SvPVX(PL_warn_locale),
1760 0 /* dummy to avoid compiler warning */ );
2726666d
KW
1761 SvREFCNT_dec_NN(PL_warn_locale);
1762 PL_warn_locale = NULL;
1763 }
1764
1765#endif
1766
1767}
1768
a4f00dcc
KW
1769STATIC void
1770S_new_collate(pTHX_ const char *newcoll)
98994639 1771{
7d4bcc4a
KW
1772
1773#ifndef USE_LOCALE_COLLATE
1774
1775 PERL_UNUSED_ARG(newcoll);
1776 PERL_UNUSED_CONTEXT;
1777
1778#else
0d071d52 1779
291a84fb 1780 /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
0d071d52
KW
1781 * core Perl this and that 'newcoll' is the name of the new locale.
1782 *
d35fca5f
KW
1783 * The design of locale collation is that every locale change is given an
1784 * index 'PL_collation_ix'. The first time a string particpates in an
1785 * operation that requires collation while locale collation is active, it
1786 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
1787 * magic includes the collation index, and the transformation of the string
1788 * by strxfrm(), q.v. That transformation is used when doing comparisons,
1789 * instead of the string itself. If a string changes, the magic is
1790 * cleared. The next time the locale changes, the index is incremented,
1791 * and so we know during a comparison that the transformation is not
1792 * necessarily still valid, and so is recomputed. Note that if the locale
1793 * changes enough times, the index could wrap (a U32), and it is possible
1794 * that a transformation would improperly be considered valid, leading to
1795 * an unlikely bug */
0d071d52 1796
98994639 1797 if (! newcoll) {
1604cfb0
MS
1798 if (PL_collation_name) {
1799 ++PL_collation_ix;
1800 Safefree(PL_collation_name);
1801 PL_collation_name = NULL;
1802 }
1803 PL_collation_standard = TRUE;
00bf60ca 1804 is_standard_collation:
1604cfb0
MS
1805 PL_collxfrm_base = 0;
1806 PL_collxfrm_mult = 2;
165a1c52 1807 PL_in_utf8_COLLATE_locale = FALSE;
f28f4d2a 1808 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 1809 PL_strxfrm_max_cp = 0;
1604cfb0 1810 return;
98994639
HS
1811 }
1812
d35fca5f 1813 /* If this is not the same locale as currently, set the new one up */
98994639 1814 if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
1604cfb0
MS
1815 ++PL_collation_ix;
1816 Safefree(PL_collation_name);
1817 PL_collation_name = stdize_locale(savepv(newcoll));
1818 PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
00bf60ca
KW
1819 if (PL_collation_standard) {
1820 goto is_standard_collation;
1821 }
98994639 1822
165a1c52 1823 PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
f28f4d2a 1824 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 1825 PL_strxfrm_max_cp = 0;
165a1c52 1826
59c018b9
KW
1827 /* A locale collation definition includes primary, secondary, tertiary,
1828 * etc. weights for each character. To sort, the primary weights are
1829 * used, and only if they compare equal, then the secondary weights are
1830 * used, and only if they compare equal, then the tertiary, etc.
1831 *
1832 * strxfrm() works by taking the input string, say ABC, and creating an
1833 * output transformed string consisting of first the primary weights,
1834 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
1835 * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
1836 * may not have weights at every level. In our example, let's say B
1837 * doesn't have a tertiary weight, and A doesn't have a secondary
1838 * weight. The constructed string is then going to be
1839 * A¹B¹C¹ B²C² A³C³ ....
1840 * This has the desired effect that strcmp() will look at the secondary
1841 * or tertiary weights only if the strings compare equal at all higher
1842 * priority weights. The spaces shown here, like in
c342d20e 1843 * "A¹B¹C¹ A²B²C² "
59c018b9
KW
1844 * are not just for readability. In the general case, these must
1845 * actually be bytes, which we will call here 'separator weights'; and
1846 * they must be smaller than any other weight value, but since these
1847 * are C strings, only the terminating one can be a NUL (some
1848 * implementations may include a non-NUL separator weight just before
1849 * the NUL). Implementations tend to reserve 01 for the separator
1850 * weights. They are needed so that a shorter string's secondary
1851 * weights won't be misconstrued as primary weights of a longer string,
1852 * etc. By making them smaller than any other weight, the shorter
1853 * string will sort first. (Actually, if all secondary weights are
1854 * smaller than all primary ones, there is no need for a separator
1855 * weight between those two levels, etc.)
1856 *
1857 * The length of the transformed string is roughly a linear function of
1858 * the input string. It's not exactly linear because some characters
1859 * don't have weights at all levels. When we call strxfrm() we have to
1860 * allocate some memory to hold the transformed string. The
1861 * calculations below try to find coefficients 'm' and 'b' for this
1862 * locale so that m*x + b equals how much space we need, given the size
1863 * of the input string in 'x'. If we calculate too small, we increase
1864 * the size as needed, and call strxfrm() again, but it is better to
1865 * get it right the first time to avoid wasted expensive string
1866 * transformations. */
1867
1604cfb0 1868 {
79f120c8
KW
1869 /* We use the string below to find how long the tranformation of it
1870 * is. Almost all locales are supersets of ASCII, or at least the
1871 * ASCII letters. We use all of them, half upper half lower,
1872 * because if we used fewer, we might hit just the ones that are
1873 * outliers in a particular locale. Most of the strings being
1874 * collated will contain a preponderance of letters, and even if
1875 * they are above-ASCII, they are likely to have the same number of
1876 * weight levels as the ASCII ones. It turns out that digits tend
1877 * to have fewer levels, and some punctuation has more, but those
1878 * are relatively sparse in text, and khw believes this gives a
1879 * reasonable result, but it could be changed if experience so
1880 * dictates. */
1881 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
1882 char * x_longer; /* Transformed 'longer' */
1883 Size_t x_len_longer; /* Length of 'x_longer' */
1884
1885 char * x_shorter; /* We also transform a substring of 'longer' */
1886 Size_t x_len_shorter;
1887
a4a439fb 1888 /* _mem_collxfrm() is used get the transformation (though here we
79f120c8
KW
1889 * are interested only in its length). It is used because it has
1890 * the intelligence to handle all cases, but to work, it needs some
1891 * values of 'm' and 'b' to get it started. For the purposes of
1892 * this calculation we use a very conservative estimate of 'm' and
1893 * 'b'. This assumes a weight can be multiple bytes, enough to
1894 * hold any UV on the platform, and there are 5 levels, 4 weight
1895 * bytes, and a trailing NUL. */
1896 PL_collxfrm_base = 5;
1897 PL_collxfrm_mult = 5 * sizeof(UV);
1898
1899 /* Find out how long the transformation really is */
a4a439fb
KW
1900 x_longer = _mem_collxfrm(longer,
1901 sizeof(longer) - 1,
1902 &x_len_longer,
1903
1904 /* We avoid converting to UTF-8 in the
1905 * called function by telling it the
1906 * string is in UTF-8 if the locale is a
1907 * UTF-8 one. Since the string passed
1908 * here is invariant under UTF-8, we can
1909 * claim it's UTF-8 even though it isn't.
1910 * */
1911 PL_in_utf8_COLLATE_locale);
79f120c8
KW
1912 Safefree(x_longer);
1913
1914 /* Find out how long the transformation of a substring of 'longer'
1915 * is. Together the lengths of these transformations are
1916 * sufficient to calculate 'm' and 'b'. The substring is all of
1917 * 'longer' except the first character. This minimizes the chances
1918 * of being swayed by outliers */
a4a439fb 1919 x_shorter = _mem_collxfrm(longer + 1,
79f120c8 1920 sizeof(longer) - 2,
a4a439fb
KW
1921 &x_len_shorter,
1922 PL_in_utf8_COLLATE_locale);
79f120c8
KW
1923 Safefree(x_shorter);
1924
1925 /* If the results are nonsensical for this simple test, the whole
1926 * locale definition is suspect. Mark it so that locale collation
1927 * is not active at all for it. XXX Should we warn? */
1928 if ( x_len_shorter == 0
1929 || x_len_longer == 0
1930 || x_len_shorter >= x_len_longer)
1931 {
1932 PL_collxfrm_mult = 0;
1933 PL_collxfrm_base = 0;
1934 }
1935 else {
1936 SSize_t base; /* Temporary */
1937
1938 /* We have both: m * strlen(longer) + b = x_len_longer
1939 * m * strlen(shorter) + b = x_len_shorter;
1940 * subtracting yields:
1941 * m * (strlen(longer) - strlen(shorter))
1942 * = x_len_longer - x_len_shorter
1943 * But we have set things up so that 'shorter' is 1 byte smaller
1944 * than 'longer'. Hence:
1945 * m = x_len_longer - x_len_shorter
1946 *
1947 * But if something went wrong, make sure the multiplier is at
1948 * least 1.
1949 */
1950 if (x_len_longer > x_len_shorter) {
1951 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
1952 }
1953 else {
1954 PL_collxfrm_mult = 1;
1955 }
1956
1957 /* mx + b = len
1958 * so: b = len - mx
1959 * but in case something has gone wrong, make sure it is
1960 * non-negative */
1961 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
1962 if (base < 0) {
1963 base = 0;
1964 }
1965
1966 /* Add 1 for the trailing NUL */
1967 PL_collxfrm_base = base + 1;
1968 }
58eebef2 1969
7d4bcc4a
KW
1970# ifdef DEBUGGING
1971
58eebef2
KW
1972 if (DEBUG_L_TEST || debug_initialization) {
1973 PerlIO_printf(Perl_debug_log,
b07929e4
KW
1974 "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
1975 "x_len_longer=%zu,"
1976 " collate multipler=%zu, collate base=%zu\n",
58eebef2
KW
1977 __FILE__, __LINE__,
1978 PL_in_utf8_COLLATE_locale,
1979 x_len_shorter, x_len_longer,
1980 PL_collxfrm_mult, PL_collxfrm_base);
1981 }
7d4bcc4a
KW
1982# endif
1983
1604cfb0 1984 }
98994639
HS
1985 }
1986
1987#endif /* USE_LOCALE_COLLATE */
7d4bcc4a 1988
98994639
HS
1989}
1990
d36adde0
KW
1991#endif
1992
d2b24094 1993#ifdef WIN32
b8cc575c 1994
6c332036
TC
1995#define USE_WSETLOCALE
1996
1997#ifdef USE_WSETLOCALE
1998
1999STATIC char *
2000S_wrap_wsetlocale(pTHX_ int category, const char *locale) {
2001 wchar_t *wlocale;
2002 wchar_t *wresult;
2003 char *result;
2004
2005 if (locale) {
2006 int req_size =
2007 MultiByteToWideChar(CP_UTF8, 0, locale, -1, NULL, 0);
2008
2009 if (!req_size) {
2010 errno = EINVAL;
2011 return NULL;
2012 }
2013
2014 Newx(wlocale, req_size, wchar_t);
2015 if (!MultiByteToWideChar(CP_UTF8, 0, locale, -1, wlocale, req_size)) {
2016 Safefree(wlocale);
2017 errno = EINVAL;
2018 return NULL;
2019 }
2020 }
2021 else {
2022 wlocale = NULL;
2023 }
2024 wresult = _wsetlocale(category, wlocale);
2025 Safefree(wlocale);
2026 if (wresult) {
2027 int req_size =
2028 WideCharToMultiByte(CP_UTF8, 0, wresult, -1, NULL, 0, NULL, NULL);
2029 Newx(result, req_size, char);
2030 SAVEFREEPV(result); /* is there something better we can do here? */
2031 if (!WideCharToMultiByte(CP_UTF8, 0, wresult, -1,
2032 result, req_size, NULL, NULL)) {
2033 errno = EINVAL;
2034 return NULL;
2035 }
2036 }
2037 else {
2038 result = NULL;
2039 }
2040
2041 return result;
2042}
2043
2044#endif
2045
a4f00dcc 2046STATIC char *
b8cc575c 2047S_win32_setlocale(pTHX_ int category, const char* locale)
b385bb4d
KW
2048{
2049 /* This, for Windows, emulates POSIX setlocale() behavior. There is no
7d4bcc4a
KW
2050 * difference between the two unless the input locale is "", which normally
2051 * means on Windows to get the machine default, which is set via the
2052 * computer's "Regional and Language Options" (or its current equivalent).
2053 * In POSIX, it instead means to find the locale from the user's
2054 * environment. This routine changes the Windows behavior to first look in
2055 * the environment, and, if anything is found, use that instead of going to
2056 * the machine default. If there is no environment override, the machine
2057 * default is used, by calling the real setlocale() with "".
2058 *
2059 * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
2060 * use the particular category's variable if set; otherwise to use the LANG
2061 * variable. */
b385bb4d 2062
175c4cf9 2063 bool override_LC_ALL = FALSE;
89f7b9aa 2064 char * result;
e5f10d49 2065 unsigned int i;
89f7b9aa 2066
b385bb4d 2067 if (locale && strEQ(locale, "")) {
7d4bcc4a
KW
2068
2069# ifdef LC_ALL
2070
b385bb4d
KW
2071 locale = PerlEnv_getenv("LC_ALL");
2072 if (! locale) {
e5f10d49
KW
2073 if (category == LC_ALL) {
2074 override_LC_ALL = TRUE;
2075 }
2076 else {
7d4bcc4a
KW
2077
2078# endif
7d4bcc4a 2079
e5f10d49
KW
2080 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2081 if (category == categories[i]) {
2082 locale = PerlEnv_getenv(category_names[i]);
2083 goto found_locale;
2084 }
2085 }
7d4bcc4a 2086
b385bb4d 2087 locale = PerlEnv_getenv("LANG");
481465ea 2088 if (! locale) {
b385bb4d
KW
2089 locale = "";
2090 }
e5f10d49
KW
2091
2092 found_locale: ;
7d4bcc4a
KW
2093
2094# ifdef LC_ALL
2095
e5f10d49 2096 }
b385bb4d 2097 }
7d4bcc4a
KW
2098
2099# endif
2100
b385bb4d
KW
2101 }
2102
6c332036
TC
2103#ifdef USE_WSETLOCALE
2104 result = S_wrap_wsetlocale(aTHX_ category, locale);
2105#else
89f7b9aa 2106 result = setlocale(category, locale);
6c332036 2107#endif
48015184
KW
2108 DEBUG_L(STMT_START {
2109 dSAVE_ERRNO;
2110 PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
2111 setlocale_debug_string(category, locale, result));
2112 RESTORE_ERRNO;
2113 } STMT_END);
89f7b9aa 2114
481465ea 2115 if (! override_LC_ALL) {
89f7b9aa
KW
2116 return result;
2117 }
2118
dfd77d7a 2119 /* Here the input category was LC_ALL, and we have set it to what is in the
481465ea
KW
2120 * LANG variable or the system default if there is no LANG. But these have
2121 * lower priority than the other LC_foo variables, so override it for each
2122 * one that is set. (If they are set to "", it means to use the same thing
2123 * we just set LC_ALL to, so can skip) */
7d4bcc4a 2124
948523db 2125 for (i = 0; i < LC_ALL_INDEX; i++) {
e5f10d49
KW
2126 result = PerlEnv_getenv(category_names[i]);
2127 if (result && strNE(result, "")) {
6c332036 2128#ifdef USE_WSETLOCALE
17a1e9ac 2129 S_wrap_wsetlocale(aTHX_ categories[i], result);
6c332036 2130#else
17a1e9ac 2131 setlocale(categories[i], result);
6c332036 2132#endif
e5f10d49
KW
2133 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
2134 __FILE__, __LINE__,
2135 setlocale_debug_string(categories[i], result, "not captured")));
2136 }
89f7b9aa 2137 }
7d4bcc4a 2138
bbc98134 2139 result = setlocale(LC_ALL, NULL);
48015184
KW
2140 DEBUG_L(STMT_START {
2141 dSAVE_ERRNO;
2142 PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
bbc98134 2143 __FILE__, __LINE__,
48015184
KW
2144 setlocale_debug_string(LC_ALL, NULL, result));
2145 RESTORE_ERRNO;
2146 } STMT_END);
89f7b9aa 2147
bbc98134 2148 return result;
b385bb4d
KW
2149}
2150
2151#endif
2152
9aac5db8 2153/*
9aac5db8
KW
2154=for apidoc Perl_setlocale
2155
2156This is an (almost) drop-in replacement for the system L<C<setlocale(3)>>,
2157taking the same parameters, and returning the same information, except that it
9487427b
KW
2158returns the correct underlying C<LC_NUMERIC> locale. Regular C<setlocale> will
2159instead return C<C> if the underlying locale has a non-dot decimal point
2160character, or a non-empty thousands separator for displaying floating point
2161numbers. This is because perl keeps that locale category such that it has a
2162dot and empty separator, changing the locale briefly during the operations
2163where the underlying one is required. C<Perl_setlocale> knows about this, and
2164compensates; regular C<setlocale> doesn't.
9aac5db8 2165
e9bc6d6b 2166Another reason it isn't completely a drop-in replacement is that it is
9aac5db8 2167declared to return S<C<const char *>>, whereas the system setlocale omits the
163deff3
KW
2168C<const> (presumably because its API was specified long ago, and can't be
2169updated; it is illegal to change the information C<setlocale> returns; doing
2170so leads to segfaults.)
9aac5db8 2171
e9bc6d6b
KW
2172Finally, C<Perl_setlocale> works under all circumstances, whereas plain
2173C<setlocale> can be completely ineffective on some platforms under some
2174configurations.
2175
9aac5db8 2176C<Perl_setlocale> should not be used to change the locale except on systems
e9bc6d6b
KW
2177where the predefined variable C<${^SAFE_LOCALES}> is 1. On some such systems,
2178the system C<setlocale()> is ineffective, returning the wrong information, and
2179failing to actually change the locale. C<Perl_setlocale>, however works
2180properly in all circumstances.
9aac5db8
KW
2181
2182The return points to a per-thread static buffer, which is overwritten the next
2183time C<Perl_setlocale> is called from the same thread.
2184
2185=cut
2186
2187*/
2188
2189const char *
2190Perl_setlocale(const int category, const char * locale)
a4f00dcc
KW
2191{
2192 /* This wraps POSIX::setlocale() */
2193
52129632 2194#ifndef USE_LOCALE
d36adde0
KW
2195
2196 PERL_UNUSED_ARG(category);
2197 PERL_UNUSED_ARG(locale);
2198
2199 return "C";
2200
2201#else
2202
9aac5db8
KW
2203 const char * retval;
2204 const char * newlocale;
2205 dSAVEDERRNO;
a4f00dcc 2206 dTHX;
d36adde0 2207 DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
a4f00dcc 2208
a4f00dcc
KW
2209#ifdef USE_LOCALE_NUMERIC
2210
291a84fb
KW
2211 /* A NULL locale means only query what the current one is. We have the
2212 * LC_NUMERIC name saved, because we are normally switched into the C
9487427b
KW
2213 * (or equivalent) locale for it. For an LC_ALL query, switch back to get
2214 * the correct results. All other categories don't require special
2215 * handling */
a4f00dcc
KW
2216 if (locale == NULL) {
2217 if (category == LC_NUMERIC) {
9aac5db8
KW
2218
2219 /* We don't have to copy this return value, as it is a per-thread
2220 * variable, and won't change until a future setlocale */
2221 return PL_numeric_name;
a4f00dcc
KW
2222 }
2223
7d4bcc4a 2224# ifdef LC_ALL
a4f00dcc 2225
9aac5db8
KW
2226 else if (category == LC_ALL) {
2227 STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
a4f00dcc
KW
2228 }
2229
7d4bcc4a 2230# endif
a4f00dcc
KW
2231
2232 }
2233
2234#endif
2235
33e5a354
KW
2236 retval = save_to_buffer(do_setlocale_r(category, locale),
2237 &PL_setlocale_buf, &PL_setlocale_bufsize, 0);
9aac5db8
KW
2238 SAVE_ERRNO;
2239
2240#if defined(USE_LOCALE_NUMERIC) && defined(LC_ALL)
2241
2242 if (locale == NULL && category == LC_ALL) {
2243 RESTORE_LC_NUMERIC();
2244 }
2245
2246#endif
a4f00dcc
KW
2247
2248 DEBUG_L(PerlIO_printf(Perl_debug_log,
2249 "%s:%d: %s\n", __FILE__, __LINE__,
2250 setlocale_debug_string(category, locale, retval)));
7d4bcc4a 2251
9aac5db8
KW
2252 RESTORE_ERRNO;
2253
2254 if (! retval) {
a4f00dcc
KW
2255 return NULL;
2256 }
2257
9aac5db8 2258 /* If locale == NULL, we are just querying the state */
a4f00dcc 2259 if (locale == NULL) {
a4f00dcc
KW
2260 return retval;
2261 }
a4f00dcc 2262
1159483a
KW
2263 /* Now that have switched locales, we have to update our records to
2264 * correspond. */
a4f00dcc 2265
1159483a 2266 switch (category) {
a4f00dcc 2267
1159483a 2268#ifdef USE_LOCALE_CTYPE
a4f00dcc 2269
1159483a
KW
2270 case LC_CTYPE:
2271 new_ctype(retval);
2272 break;
a4f00dcc 2273
1159483a 2274#endif
a4f00dcc
KW
2275#ifdef USE_LOCALE_COLLATE
2276
1159483a
KW
2277 case LC_COLLATE:
2278 new_collate(retval);
2279 break;
a4f00dcc 2280
1159483a 2281#endif
a4f00dcc
KW
2282#ifdef USE_LOCALE_NUMERIC
2283
1159483a
KW
2284 case LC_NUMERIC:
2285 new_numeric(retval);
2286 break;
a4f00dcc 2287
1159483a
KW
2288#endif
2289#ifdef LC_ALL
a4f00dcc 2290
1159483a 2291 case LC_ALL:
a4f00dcc 2292
1159483a
KW
2293 /* LC_ALL updates all the things we care about. The values may not
2294 * be the same as 'retval', as the locale "" may have set things
2295 * individually */
a4f00dcc 2296
1159483a 2297# ifdef USE_LOCALE_CTYPE
a4f00dcc 2298
b0bde642 2299 newlocale = savepv(do_setlocale_c(LC_CTYPE, NULL));
1159483a 2300 new_ctype(newlocale);
b0bde642 2301 Safefree(newlocale);
a4f00dcc 2302
1159483a
KW
2303# endif /* USE_LOCALE_CTYPE */
2304# ifdef USE_LOCALE_COLLATE
2305
b0bde642 2306 newlocale = savepv(do_setlocale_c(LC_COLLATE, NULL));
1159483a 2307 new_collate(newlocale);
b0bde642 2308 Safefree(newlocale);
a4f00dcc 2309
7d4bcc4a 2310# endif
1159483a 2311# ifdef USE_LOCALE_NUMERIC
a4f00dcc 2312
b0bde642 2313 newlocale = savepv(do_setlocale_c(LC_NUMERIC, NULL));
1159483a 2314 new_numeric(newlocale);
b0bde642 2315 Safefree(newlocale);
a4f00dcc 2316
1159483a
KW
2317# endif /* USE_LOCALE_NUMERIC */
2318#endif /* LC_ALL */
a4f00dcc 2319
1159483a
KW
2320 default:
2321 break;
a4f00dcc
KW
2322 }
2323
2324 return retval;
2325
d36adde0
KW
2326#endif
2327
f7416781
KW
2328}
2329
2330PERL_STATIC_INLINE const char *
2331S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset)
2332{
2333 /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size',
2334 * growing it if necessary */
2335
0b6697c5 2336 Size_t string_size;
f7416781
KW
2337
2338 PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
2339
0b6697c5
KW
2340 if (! string) {
2341 return NULL;
2342 }
2343
2344 string_size = strlen(string) + offset + 1;
2345
f7416781
KW
2346 if (*buf_size == 0) {
2347 Newx(*buf, string_size, char);
2348 *buf_size = string_size;
2349 }
2350 else if (string_size > *buf_size) {
2351 Renew(*buf, string_size, char);
2352 *buf_size = string_size;
2353 }
2354
2355 Copy(string, *buf + offset, string_size - offset, char);
2356 return *buf;
2357}
2358
2359/*
2360
f7416781
KW
2361=for apidoc Perl_langinfo
2362
ce181bf6 2363This is an (almost) drop-in replacement for the system C<L<nl_langinfo(3)>>,
f7416781
KW
2364taking the same C<item> parameter values, and returning the same information.
2365But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
2366of Perl's locale handling from your code, and can be used on systems that lack
2367a native C<nl_langinfo>.
2368
2369Expanding on these:
2370
2371=over
2372
2373=item *
2374
ce181bf6
KW
2375The reason it isn't quite a drop-in replacement is actually an advantage. The
2376only difference is that it returns S<C<const char *>>, whereas plain
2377C<nl_langinfo()> returns S<C<char *>>, but you are (only by documentation)
2378forbidden to write into the buffer. By declaring this C<const>, the compiler
2379enforces this restriction, so if it is violated, you know at compilation time,
2380rather than getting segfaults at runtime.
2381
2382=item *
2383
69e2275e 2384It delivers the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
f7416781
KW
2385without you having to write extra code. The reason for the extra code would be
2386because these are from the C<LC_NUMERIC> locale category, which is normally
9487427b
KW
2387kept set by Perl so that the radix is a dot, and the separator is the empty
2388string, no matter what the underlying locale is supposed to be, and so to get
2389the expected results, you have to temporarily toggle into the underlying
2390locale, and later toggle back. (You could use plain C<nl_langinfo> and
2391C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this but then you wouldn't get
2392the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
2393(or equivalent) locale would break a lot of CPAN, which is expecting the radix
2394(decimal point) character to be a dot.)
ce181bf6
KW
2395
2396=item *
2397
2398The system function it replaces can have its static return buffer trashed,
f1460a66 2399not only by a subsequent call to that function, but by a C<freelocale>,
ce181bf6
KW
2400C<setlocale>, or other locale change. The returned buffer of this function is
2401not changed until the next call to it, so the buffer is never in a trashed
2402state.
f7416781
KW
2403
2404=item *
2405
ce181bf6
KW
2406Its return buffer is per-thread, so it also is never overwritten by a call to
2407this function from another thread; unlike the function it replaces.
2408
2409=item *
2410
2411But most importantly, it works on systems that don't have C<nl_langinfo>, such
2412as Windows, hence makes your code more portable. Of the fifty-some possible
2413items specified by the POSIX 2008 standard,
f7416781 2414L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
8d72e74e
KW
2415only one is completely unimplemented, though on non-Windows platforms, another
2416significant one is also not implemented). It uses various techniques to
2417recover the other items, including calling C<L<localeconv(3)>>, and
2418C<L<strftime(3)>>, both of which are specified in C89, so should be always be
2419available. Later C<strftime()> versions have additional capabilities; C<""> is
2420returned for those not available on your system.
ce181bf6
KW
2421
2422It is important to note that when called with an item that is recovered by
2423using C<localeconv>, the buffer from any previous explicit call to
2424C<localeconv> will be overwritten. This means you must save that buffer's
c3997e39
KW
2425contents if you need to access them after a call to this function. (But note
2426that you might not want to be using C<localeconv()> directly anyway, because of
2427issues like the ones listed in the second item of this list (above) for
2428C<RADIXCHAR> and C<THOUSEP>. You can use the methods given in L<perlcall> to
2429call L<POSIX/localeconv> and avoid all the issues, but then you have a hash to
2430unpack).
9f0bc911 2431
6201c220
KW
2432The details for those items which may deviate from what this emulation returns
2433and what a native C<nl_langinfo()> would return are specified in
2434L<I18N::Langinfo>.
f7416781 2435
ce181bf6
KW
2436=back
2437
f7416781
KW
2438When using C<Perl_langinfo> on systems that don't have a native
2439C<nl_langinfo()>, you must
2440
2441 #include "perl_langinfo.h"
2442
2443before the C<perl.h> C<#include>. You can replace your C<langinfo.h>
2444C<#include> with this one. (Doing it this way keeps out the symbols that plain
c3997e39
KW
2445C<langinfo.h> would try to import into the namespace for code that doesn't need
2446it.)
f7416781 2447
f7416781
KW
2448The original impetus for C<Perl_langinfo()> was so that code that needs to
2449find out the current currency symbol, floating point radix character, or digit
2450grouping separator can use, on all systems, the simpler and more
2451thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
2452pain to make thread-friendly. For other fields returned by C<localeconv>, it
2453is better to use the methods given in L<perlcall> to call
2454L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
2455
2456=cut
2457
2458*/
2459
2460const char *
2461#ifdef HAS_NL_LANGINFO
2462Perl_langinfo(const nl_item item)
2463#else
2464Perl_langinfo(const int item)
2465#endif
2466{
f61748ac
KW
2467 return my_nl_langinfo(item, TRUE);
2468}
2469
59a15af0 2470STATIC const char *
f61748ac
KW
2471#ifdef HAS_NL_LANGINFO
2472S_my_nl_langinfo(const nl_item item, bool toggle)
2473#else
2474S_my_nl_langinfo(const int item, bool toggle)
2475#endif
2476{
ae74815b 2477 dTHX;
0b6697c5 2478 const char * retval;
f7416781 2479
d36adde0
KW
2480#ifdef USE_LOCALE_NUMERIC
2481
5a854ab3
KW
2482 /* We only need to toggle into the underlying LC_NUMERIC locale for these
2483 * two items, and only if not already there */
4e6826bf 2484 if (toggle && (( item != RADIXCHAR && item != THOUSEP)
5a854ab3 2485 || PL_numeric_underlying))
d36adde0
KW
2486
2487#endif /* No toggling needed if not using LC_NUMERIC */
2488
5a854ab3 2489 toggle = FALSE;
5a854ab3 2490
ab340fff 2491#if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
ee90eb2d 2492# if ! defined(HAS_THREAD_SAFE_NL_LANGINFO_L) \
8fdf38a3 2493 || ! defined(HAS_POSIX_2008_LOCALE)
f7416781 2494
ab340fff 2495 /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
ae74815b
KW
2496 * for those items dependent on it. This must be copied to a buffer before
2497 * switching back, as some systems destroy the buffer when setlocale() is
2498 * called */
f7416781 2499
038d3702
KW
2500 {
2501 DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
2502
b2fee59c 2503 if (toggle) {
038d3702 2504 STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
b2fee59c 2505 }
f7416781 2506
7953f73f
KW
2507 /* Prevent interference from another thread executing this code
2508 * section. */
2509 NL_LANGINFO_LOCK;
ee90eb2d 2510
628ff75a
KW
2511 /* Copy to a per-thread buffer, which is also one that won't be
2512 * destroyed by a subsequent setlocale(), such as the
2513 * RESTORE_LC_NUMERIC may do just below. */
0b6697c5
KW
2514 retval = save_to_buffer(nl_langinfo(item),
2515 &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
7953f73f 2516 NL_LANGINFO_UNLOCK;
5acc4454 2517
b2fee59c 2518 if (toggle) {
038d3702 2519 RESTORE_LC_NUMERIC();
b2fee59c 2520 }
f7416781
KW
2521 }
2522
ab340fff
KW
2523# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */
2524
5a854ab3 2525 {
b2fee59c
KW
2526 bool do_free = FALSE;
2527 locale_t cur = uselocale((locale_t) 0);
ab340fff 2528
b2fee59c
KW
2529 if (cur == LC_GLOBAL_LOCALE) {
2530 cur = duplocale(LC_GLOBAL_LOCALE);
2531 do_free = TRUE;
2532 }
ab340fff 2533
d36adde0
KW
2534# ifdef USE_LOCALE_NUMERIC
2535
b2fee59c 2536 if (toggle) {
e1aa2579
KW
2537 if (PL_underlying_numeric_obj) {
2538 cur = PL_underlying_numeric_obj;
2539 }
2540 else {
2541 cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
2542 do_free = TRUE;
2543 }
b2fee59c 2544 }
ab340fff 2545
d36adde0
KW
2546# endif
2547
628ff75a
KW
2548 /* We have to save it to a buffer, because the freelocale() just below
2549 * can invalidate the internal one */
0b6697c5
KW
2550 retval = save_to_buffer(nl_langinfo_l(item, cur),
2551 &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
ee90eb2d 2552
b2fee59c
KW
2553 if (do_free) {
2554 freelocale(cur);
2555 }
5a854ab3 2556 }
ab340fff 2557
c1566110
KW
2558# endif
2559
0b6697c5 2560 if (strEQ(retval, "")) {
4e6826bf 2561 if (item == YESSTR) {
c1566110
KW
2562 return "yes";
2563 }
4e6826bf 2564 if (item == NOSTR) {
c1566110
KW
2565 return "no";
2566 }
2567 }
2568
0b6697c5 2569 return retval;
ab340fff 2570
f7416781 2571#else /* Below, emulate nl_langinfo as best we can */
43dd6b15
KW
2572
2573 {
2574
f7416781
KW
2575# ifdef HAS_LOCALECONV
2576
43dd6b15 2577 const struct lconv* lc;
628ff75a 2578 const char * temp;
038d3702 2579 DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
f7416781 2580
69c5e0db
KW
2581# ifdef TS_W32_BROKEN_LOCALECONV
2582
2583 const char * save_global;
2584 const char * save_thread;
2585 int needed_size;
2586 char * ptr;
2587 char * e;
2588 char * item_start;
2589
2590# endif
f7416781
KW
2591# endif
2592# ifdef HAS_STRFTIME
2593
43dd6b15
KW
2594 struct tm tm;
2595 bool return_format = FALSE; /* Return the %format, not the value */
2596 const char * format;
f7416781
KW
2597
2598# endif
2599
43dd6b15
KW
2600 /* We copy the results to a per-thread buffer, even if not
2601 * multi-threaded. This is in part to simplify this code, and partly
2602 * because we need a buffer anyway for strftime(), and partly because a
2603 * call of localeconv() could otherwise wipe out the buffer, and the
2604 * programmer would not be expecting this, as this is a nl_langinfo()
2605 * substitute after all, so s/he might be thinking their localeconv()
2606 * is safe until another localeconv() call. */
f7416781 2607
43dd6b15
KW
2608 switch (item) {
2609 Size_t len;
f7416781 2610
8d72e74e 2611 /* This is unimplemented */
4e6826bf 2612 case ERA: /* For use with strftime() %E modifier */
f7416781 2613
43dd6b15
KW
2614 default:
2615 return "";
f7416781 2616
43dd6b15 2617 /* We use only an English set, since we don't know any more */
4e6826bf
KW
2618 case YESEXPR: return "^[+1yY]";
2619 case YESSTR: return "yes";
2620 case NOEXPR: return "^[-0nN]";
2621 case NOSTR: return "no";
2622
8d72e74e
KW
2623 case CODESET:
2624
2625# ifndef WIN32
2626
2627 /* On non-windows, this is unimplemented, in part because of
2628 * inconsistencies between vendors. The Darwin native
2629 * nl_langinfo() implementation simply looks at everything past
2630 * any dot in the name, but that doesn't work for other
2631 * vendors. Many Linux locales that don't have UTF-8 in their
2632 * names really are UTF-8, for example; z/OS locales that do
2633 * have UTF-8 in their names, aren't really UTF-8 */
2634 return "";
2635
2636# else
2637
2638 { /* But on Windows, the name does seem to be consistent, so
2639 use that. */
2640 const char * p;
2641 const char * first;
2642 Size_t offset = 0;
2643 const char * name = my_setlocale(LC_CTYPE, NULL);
2644
2645 if (isNAME_C_OR_POSIX(name)) {
2646 return "ANSI_X3.4-1968";
2647 }
2648
2649 /* Find the dot in the locale name */
2650 first = (const char *) strchr(name, '.');
2651 if (! first) {
2652 first = name;
2653 goto has_nondigit;
2654 }
2655
2656 /* Look at everything past the dot */
2657 first++;
2658 p = first;
2659
2660 while (*p) {
2661 if (! isDIGIT(*p)) {
2662 goto has_nondigit;
2663 }
2664
2665 p++;
2666 }
2667
2668 /* Here everything past the dot is a digit. Treat it as a
2669 * code page */
18503cbd 2670 retval = save_to_buffer("CP", &PL_langinfo_buf,
32a62865 2671 &PL_langinfo_bufsize, 0);
8d72e74e 2672 offset = STRLENs("CP");
f7416781 2673
8d72e74e
KW
2674 has_nondigit:
2675
2676 retval = save_to_buffer(first, &PL_langinfo_buf,
2677 &PL_langinfo_bufsize, offset);
2678 }
2679
2680 break;
2681
2682# endif
f7416781
KW
2683# ifdef HAS_LOCALECONV
2684
4e6826bf 2685 case CRNCYSTR:
f7416781 2686
43dd6b15
KW
2687 /* We don't bother with localeconv_l() because any system that
2688 * has it is likely to also have nl_langinfo() */
291a84fb 2689
8609fe00
KW
2690 LOCALECONV_LOCK; /* Prevent interference with other threads
2691 using localeconv() */
69c5e0db
KW
2692
2693# ifdef TS_W32_BROKEN_LOCALECONV
2694
2695 /* This is a workaround for a Windows bug prior to VS 15.
2696 * What we do here is, while locked, switch to the global
2697 * locale so localeconv() works; then switch back just before
2698 * the unlock. This can screw things up if some thread is
2699 * already using the global locale while assuming no other is.
2700 * A different workaround would be to call GetCurrencyFormat on
2701 * a known value, and parse it; patches welcome
2702 *
2703 * We have to use LC_ALL instead of LC_MONETARY because of
2704 * another bug in Windows */
2705
2706 save_thread = savepv(my_setlocale(LC_ALL, NULL));
2707 _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
2708 save_global= savepv(my_setlocale(LC_ALL, NULL));
2709 my_setlocale(LC_ALL, save_thread);
2710
2711# endif
5acc4454 2712
43dd6b15
KW
2713 lc = localeconv();
2714 if ( ! lc
2715 || ! lc->currency_symbol
2716 || strEQ("", lc->currency_symbol))
2717 {
8609fe00 2718 LOCALECONV_UNLOCK;
43dd6b15
KW
2719 return "";
2720 }
f7416781 2721
43dd6b15 2722 /* Leave the first spot empty to be filled in below */
0b6697c5
KW
2723 retval = save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
2724 &PL_langinfo_bufsize, 1);
43dd6b15
KW
2725 if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
2726 { /* khw couldn't figure out how the localedef specifications
2727 would show that the $ should replace the radix; this is
2728 just a guess as to how it might work.*/
a34edee3 2729 PL_langinfo_buf[0] = '.';
43dd6b15
KW
2730 }
2731 else if (lc->p_cs_precedes) {
a34edee3 2732 PL_langinfo_buf[0] = '-';
43dd6b15
KW
2733 }
2734 else {
a34edee3 2735 PL_langinfo_buf[0] = '+';
43dd6b15 2736 }
f7416781 2737
69c5e0db
KW
2738# ifdef TS_W32_BROKEN_LOCALECONV
2739
2740 my_setlocale(LC_ALL, save_global);
2741 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
2742 my_setlocale(LC_ALL, save_thread);
2743 Safefree(save_global);
2744 Safefree(save_thread);
2745
2746# endif
2747
8609fe00 2748 LOCALECONV_UNLOCK;
43dd6b15 2749 break;
f7416781 2750
69c5e0db
KW
2751# ifdef TS_W32_BROKEN_LOCALECONV
2752
4e6826bf 2753 case RADIXCHAR:
69c5e0db
KW
2754
2755 /* For this, we output a known simple floating point number to
2756 * a buffer, and parse it, looking for the radix */
2757
2758 if (toggle) {
2759 STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
2760 }
2761
2762 if (PL_langinfo_bufsize < 10) {
2763 PL_langinfo_bufsize = 10;
2764 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
2765 }
2766
2767 needed_size = my_snprintf(PL_langinfo_buf, PL_langinfo_bufsize,
2768 "%.1f", 1.5);
2769 if (needed_size >= (int) PL_langinfo_bufsize) {
2770 PL_langinfo_bufsize = needed_size + 1;
2771 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
2772 needed_size = my_snprintf(PL_langinfo_buf, PL_langinfo_bufsize,
2773 "%.1f", 1.5);
2774 assert(needed_size < (int) PL_langinfo_bufsize);
2775 }
2776
2777 ptr = PL_langinfo_buf;
2778 e = PL_langinfo_buf + PL_langinfo_bufsize;
2779
2780 /* Find the '1' */
2781 while (ptr < e && *ptr != '1') {
2782 ptr++;
2783 }
2784 ptr++;
2785
2786 /* Find the '5' */
2787 item_start = ptr;
2788 while (ptr < e && *ptr != '5') {
2789 ptr++;
2790 }
2791
2792 /* Everything in between is the radix string */
2793 if (ptr >= e) {
2794 PL_langinfo_buf[0] = '?';
2795 PL_langinfo_buf[1] = '\0';
2796 }
2797 else {
2798 *ptr = '\0';
2799 Move(item_start, PL_langinfo_buf, ptr - PL_langinfo_buf, char);
2800 }
2801
2802 if (toggle) {
2803 RESTORE_LC_NUMERIC();
2804 }
2805
2806 retval = PL_langinfo_buf;
2807 break;
2808
2809# else
2810
2811 case RADIXCHAR: /* No special handling needed */
2812
2813# endif
2814
4e6826bf 2815 case THOUSEP:
f7416781 2816
43dd6b15 2817 if (toggle) {
038d3702 2818 STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
c0d737a8 2819 }
f7416781 2820
8609fe00
KW
2821 LOCALECONV_LOCK; /* Prevent interference with other threads
2822 using localeconv() */
69c5e0db
KW
2823
2824# ifdef TS_W32_BROKEN_LOCALECONV
2825
2826 /* This should only be for the thousands separator. A
2827 * different work around would be to use GetNumberFormat on a
2828 * known value and parse the result to find the separator */
2829 save_thread = savepv(my_setlocale(LC_ALL, NULL));
2830 _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
2831 save_global = savepv(my_setlocale(LC_ALL, NULL));
2832 my_setlocale(LC_ALL, save_thread);
2833# if 0
2834 /* This is the start of code that for broken Windows replaces
2835 * the above and below code, and instead calls
2836 * GetNumberFormat() and then would parse that to find the
2837 * thousands separator. It needs to handle UTF-16 vs -8
2838 * issues. */
2839
2840 needed_size = GetNumberFormatEx(PL_numeric_name, 0, "1234.5", NULL, PL_langinfo_buf, PL_langinfo_bufsize);
2841 DEBUG_L(PerlIO_printf(Perl_debug_log,
2842 "%s: %d: return from GetNumber, count=%d, val=%s\n",
2843 __FILE__, __LINE__, needed_size, PL_langinfo_buf));
2844
2845# endif
2846# endif
5acc4454 2847
43dd6b15
KW
2848 lc = localeconv();
2849 if (! lc) {
628ff75a 2850 temp = "";
33394adc 2851 }
43dd6b15 2852 else {
4e6826bf 2853 temp = (item == RADIXCHAR)
43dd6b15
KW
2854 ? lc->decimal_point
2855 : lc->thousands_sep;
628ff75a
KW
2856 if (! temp) {
2857 temp = "";
43dd6b15
KW
2858 }
2859 }
f7416781 2860
0b6697c5
KW
2861 retval = save_to_buffer(temp, &PL_langinfo_buf,
2862 &PL_langinfo_bufsize, 0);
f7416781 2863
69c5e0db
KW
2864# ifdef TS_W32_BROKEN_LOCALECONV
2865
2866 my_setlocale(LC_ALL, save_global);
2867 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
2868 my_setlocale(LC_ALL, save_thread);
2869 Safefree(save_global);
2870 Safefree(save_thread);
2871
2872# endif
2873
8609fe00 2874 LOCALECONV_UNLOCK;
5acc4454 2875
43dd6b15 2876 if (toggle) {
038d3702 2877 RESTORE_LC_NUMERIC();
43dd6b15 2878 }
f7416781 2879
43dd6b15 2880 break;
f7416781
KW
2881
2882# endif
2883# ifdef HAS_STRFTIME
2884
43dd6b15
KW
2885 /* These are defined by C89, so we assume that strftime supports
2886 * them, and so are returned unconditionally; they may not be what
2887 * the locale actually says, but should give good enough results
2888 * for someone using them as formats (as opposed to trying to parse
2889 * them to figure out what the locale says). The other format
2890 * items are actually tested to verify they work on the platform */
4e6826bf
KW
2891 case D_FMT: return "%x";
2892 case T_FMT: return "%X";
2893 case D_T_FMT: return "%c";
43dd6b15
KW
2894
2895 /* These formats are only available in later strfmtime's */
4e6826bf 2896 case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM:
43dd6b15
KW
2897
2898 /* The rest can be gotten from most versions of strftime(). */
4e6826bf
KW
2899 case ABDAY_1: case ABDAY_2: case ABDAY_3:
2900 case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7:
2901 case ALT_DIGITS:
2902 case AM_STR: case PM_STR:
2903 case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4:
2904 case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8:
2905 case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12:
2906 case DAY_1: case DAY_2: case DAY_3: case DAY_4:
2907 case DAY_5: case DAY_6: case DAY_7:
2908 case MON_1: case MON_2: case MON_3: case MON_4:
2909 case MON_5: case MON_6: case MON_7: case MON_8:
2910 case MON_9: case MON_10: case MON_11: case MON_12:
43dd6b15 2911
43dd6b15
KW
2912 init_tm(&tm); /* Precaution against core dumps */
2913 tm.tm_sec = 30;
2914 tm.tm_min = 30;
2915 tm.tm_hour = 6;
2916 tm.tm_year = 2017 - 1900;
2917 tm.tm_wday = 0;
2918 tm.tm_mon = 0;
639a7c35
KW
2919
2920 GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
2921
43dd6b15
KW
2922 switch (item) {
2923 default:
43dd6b15
KW
2924 Perl_croak(aTHX_
2925 "panic: %s: %d: switch case: %d problem",
2926 __FILE__, __LINE__, item);
2927 NOT_REACHED; /* NOTREACHED */
2928
4e6826bf
KW
2929 case PM_STR: tm.tm_hour = 18;
2930 case AM_STR:
43dd6b15
KW
2931 format = "%p";
2932 break;
2933
4e6826bf
KW
2934 case ABDAY_7: tm.tm_wday++;
2935 case ABDAY_6: tm.tm_wday++;
2936 case ABDAY_5: tm.tm_wday++;
2937 case ABDAY_4: tm.tm_wday++;
2938 case ABDAY_3: tm.tm_wday++;
2939 case ABDAY_2: tm.tm_wday++;
2940 case ABDAY_1:
43dd6b15
KW
2941 format = "%a";
2942 break;
2943
4e6826bf
KW
2944 case DAY_7: tm.tm_wday++;
2945 case DAY_6: tm.tm_wday++;
2946 case DAY_5: tm.tm_wday++;
2947 case DAY_4: tm.tm_wday++;
2948 case DAY_3: tm.tm_wday++;
2949 case DAY_2: tm.tm_wday++;
2950 case DAY_1:
43dd6b15
KW
2951 format = "%A";
2952 break;
2953
4e6826bf
KW
2954 case ABMON_12: tm.tm_mon++;
2955 case ABMON_11: tm.tm_mon++;
2956 case ABMON_10: tm.tm_mon++;
2957 case ABMON_9: tm.tm_mon++;
2958 case ABMON_8: tm.tm_mon++;
2959 case ABMON_7: tm.tm_mon++;
2960 case ABMON_6: tm.tm_mon++;
2961 case ABMON_5: tm.tm_mon++;
2962 case ABMON_4: tm.tm_mon++;
2963 case ABMON_3: tm.tm_mon++;
2964 case ABMON_2: tm.tm_mon++;
2965 case ABMON_1:
43dd6b15
KW
2966 format = "%b";
2967 break;
2968
4e6826bf
KW
2969 case MON_12: tm.tm_mon++;
2970 case MON_11: tm.tm_mon++;
2971 case MON_10: tm.tm_mon++;
2972 case MON_9: tm.tm_mon++;
2973 case MON_8: tm.tm_mon++;
2974 case MON_7: tm.tm_mon++;
2975 case MON_6: tm.tm_mon++;
2976 case MON_5: tm.tm_mon++;
2977 case MON_4: tm.tm_mon++;
2978 case MON_3: tm.tm_mon++;
2979 case MON_2: tm.tm_mon++;
2980 case MON_1:
43dd6b15
KW
2981 format = "%B";
2982 break;
2983
4e6826bf 2984 case T_FMT_AMPM:
43dd6b15
KW
2985 format = "%r";
2986 return_format = TRUE;
2987 break;
2988
4e6826bf 2989 case ERA_D_FMT:
43dd6b15
KW
2990 format = "%Ex";
2991 return_format = TRUE;
2992 break;
2993
4e6826bf 2994 case ERA_T_FMT:
43dd6b15
KW
2995 format = "%EX";
2996 return_format = TRUE;
2997 break;
2998
4e6826bf 2999 case ERA_D_T_FMT:
43dd6b15
KW
3000 format = "%Ec";
3001 return_format = TRUE;
3002 break;
3003
4e6826bf 3004 case ALT_DIGITS:
43dd6b15
KW
3005 tm.tm_wday = 0;
3006 format = "%Ow"; /* Find the alternate digit for 0 */
3007 break;
3008 }
f7416781 3009
639a7c35
KW
3010 GCC_DIAG_RESTORE_STMT;
3011
43dd6b15
KW
3012 /* We can't use my_strftime() because it doesn't look at
3013 * tm_wday */
3014 while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
3015 format, &tm))
3016 {
3017 /* A zero return means one of:
3018 * a) there wasn't enough space in PL_langinfo_buf
3019 * b) the format, like a plain %p, returns empty
3020 * c) it was an illegal format, though some
3021 * implementations of strftime will just return the
3022 * illegal format as a plain character sequence.
3023 *
3024 * To quickly test for case 'b)', try again but precede
3025 * the format with a plain character. If that result is
3026 * still empty, the problem is either 'a)' or 'c)' */
3027
3028 Size_t format_size = strlen(format) + 1;
3029 Size_t mod_size = format_size + 1;
3030 char * mod_format;
3031 char * temp_result;
3032
3033 Newx(mod_format, mod_size, char);
3034 Newx(temp_result, PL_langinfo_bufsize, char);
6873aa47 3035 *mod_format = ' ';
43dd6b15
KW
3036 my_strlcpy(mod_format + 1, format, mod_size);
3037 len = strftime(temp_result,
3038 PL_langinfo_bufsize,
3039 mod_format, &tm);
3040 Safefree(mod_format);
3041 Safefree(temp_result);
3042
3043 /* If 'len' is non-zero, it means that we had a case like
3044 * %p which means the current locale doesn't use a.m. or
3045 * p.m., and that is valid */
3046 if (len == 0) {
3047
3048 /* Here, still didn't work. If we get well beyond a
3049 * reasonable size, bail out to prevent an infinite
3050 * loop. */
3051
3052 if (PL_langinfo_bufsize > 100 * format_size) {
3053 *PL_langinfo_buf = '\0';
3054 }
3055 else {
3056 /* Double the buffer size to retry; Add 1 in case
3057 * original was 0, so we aren't stuck at 0. */
3058 PL_langinfo_bufsize *= 2;
3059 PL_langinfo_bufsize++;
3060 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
3061 continue;
3062 }
3063 }
f7416781 3064
f7416781 3065 break;
43dd6b15 3066 }
f7416781 3067
43dd6b15
KW
3068 /* Here, we got a result.
3069 *
3070 * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
3071 * alternate format for wday 0. If the value is the same as
3072 * the normal 0, there isn't an alternate, so clear the buffer.
3073 * */
4e6826bf 3074 if ( item == ALT_DIGITS
43dd6b15
KW
3075 && strEQ(PL_langinfo_buf, "0"))
3076 {
3077 *PL_langinfo_buf = '\0';
3078 }
f7416781 3079
43dd6b15
KW
3080 /* ALT_DIGITS is problematic. Experiments on it showed that
3081 * strftime() did not always work properly when going from
3082 * alt-9 to alt-10. Only a few locales have this item defined,
3083 * and in all of them on Linux that khw was able to find,
3084 * nl_langinfo() merely returned the alt-0 character, possibly
3085 * doubled. Most Unicode digits are in blocks of 10
3086 * consecutive code points, so that is sufficient information
3087 * for those scripts, as we can infer alt-1, alt-2, .... But
3088 * for a Japanese locale, a CJK ideographic 0 is returned, and
3089 * the CJK digits are not in code point order, so you can't
3090 * really infer anything. The localedef for this locale did
3091 * specify the succeeding digits, so that strftime() works
3092 * properly on them, without needing to infer anything. But
3093 * the nl_langinfo() return did not give sufficient information
3094 * for the caller to understand what's going on. So until
3095 * there is evidence that it should work differently, this
3096 * returns the alt-0 string for ALT_DIGITS.
3097 *
3098 * wday was chosen because its range is all a single digit.
3099 * Things like tm_sec have two digits as the minimum: '00' */
f7416781 3100
d1b150d9
KW
3101 retval = PL_langinfo_buf;
3102
43dd6b15
KW
3103 /* If to return the format, not the value, overwrite the buffer
3104 * with it. But some strftime()s will keep the original format
3105 * if illegal, so change those to "" */
3106 if (return_format) {
3107 if (strEQ(PL_langinfo_buf, format)) {
f7416781
KW
3108 *PL_langinfo_buf = '\0';
3109 }
43dd6b15 3110 else {
0b6697c5
KW
3111 retval = save_to_buffer(format, &PL_langinfo_buf,
3112 &PL_langinfo_bufsize, 0);
f7416781
KW
3113 }
3114 }
3115
3116 break;
f7416781
KW
3117
3118# endif
3119
43dd6b15 3120 }
f7416781
KW
3121 }
3122
0b6697c5 3123 return retval;
f7416781
KW
3124
3125#endif
3126
a4f00dcc 3127}
b385bb4d 3128
98994639
HS
3129/*
3130 * Initialize locale awareness.
3131 */
3132int
3133Perl_init_i18nl10n(pTHX_ int printwarn)
3134{
0e92a118
KW
3135 /* printwarn is
3136 *
3137 * 0 if not to output warning when setup locale is bad
3138 * 1 if to output warning based on value of PERL_BADLANG
3139 * >1 if to output regardless of PERL_BADLANG
3140 *
3141 * returns
98994639 3142 * 1 = set ok or not applicable,
0e92a118
KW
3143 * 0 = fallback to a locale of lower priority
3144 * -1 = fallback to all locales failed, not even to the C locale
6b058d42
KW
3145 *
3146 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
3147 * set, debugging information is output.
3148 *
b85af0ab
KW
3149 * This looks more complicated than it is, mainly due to the #ifdefs and
3150 * error handling.
6b058d42 3151 *
b85af0ab
KW
3152 * Besides some asserts, data structure initialization, and specific
3153 * platform complications, this routine is effectively just two things.
6b058d42 3154 *
b85af0ab
KW
3155 * a) setlocale(LC_ALL, "");
3156 *
3157 * which sets LC_ALL to the values in the current environment.
3158 *
3159 * And for each individual category 'foo' whose value we care about:
3160 *
3161 * b) save_foo = setlocale(LC_foo, NULL); handle_foo(save_foo);
3162 *
3163 * (We don't tend to care about categories like LC_PAPER, for example.)
3164 *
3165 * But there are complications. On systems without LC_ALL, it emulates
3166 * step a) by looping through all the categories, and doing
3167 *
3168 * setlocale(LC_foo, "");
3169 *
3170 * on each.
3171 *
3172 * And it has to deal with if this is an embedded perl, whose locale
3173 * doesn't come from the environment, but has been set up by the caller.
3174 * This is pretty simply handled: the "" in the setlocale calls is not a
3175 * string constant, but a variable which is set to NULL in the embedded
3176 * case.
3177 *
3178 * But the major complication is handling failure and doing fallback.
3179 * There is an array, trial_locales, the elements of which are looped over
3180 * until the locale is successfully set. The array is initialized with
3181 * just one element, for
3182 * setlocale(LC_ALL, $NULL_or_empty)
3183 * If that works, as it almost always does, there's no more elements and
3184 * the loop iterates just the once. Otherwise elements are added for each
3185 * of the environment variables that POSIX dictates should control the
3186 * program, in priority order, with a final one being "C". The loop is
3187 * repeated until the first one succeeds. If all fail, we limp along with
3188 * whatever state we got to. If there is no LC_ALL, an inner loop is run
3189 * through all categories (making things look complex).
3190 *
3191 * A further complication is that Windows has an additional fallback, the
3192 * user-default ANSI code page obtained from the operating system. This is
3193 * added as yet another loop iteration, just before the final "C"
6b058d42
KW
3194 *
3195 * On Ultrix, the locale MUST come from the environment, so there is
3196 * preliminary code to set it. I (khw) am not sure that it is necessary,
3197 * and that this couldn't be folded into the loop, but barring any real
3198 * platforms to test on, it's staying as-is
b85af0ab 3199 */
1565c085 3200
0e92a118
KW
3201 int ok = 1;
3202
7d4bcc4a
KW
3203#ifndef USE_LOCALE
3204
3205 PERL_UNUSED_ARG(printwarn);
3206
3207#else /* USE_LOCALE */
7d4bcc4a
KW
3208# ifdef __GLIBC__
3209
24f3e849 3210 const char * const language = PerlEnv_getenv("LANGUAGE");
7d4bcc4a
KW
3211
3212# endif
65ebb059 3213
ccd65d51
KW
3214 /* NULL uses the existing already set up locale */
3215 const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
3216 ? NULL
3217 : "";
c3fcd832
KW
3218 const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */
3219 unsigned int trial_locales_count;
24f3e849
KW
3220 const char * const lc_all = PerlEnv_getenv("LC_ALL");
3221 const char * const lang = PerlEnv_getenv("LANG");
98994639 3222 bool setlocale_failure = FALSE;
65ebb059 3223 unsigned int i;
175c4cf9
KW
3224
3225 /* A later getenv() could zap this, so only use here */
3226 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
3227
3228 const bool locwarn = (printwarn > 1
e5f10d49
KW
3229 || ( printwarn
3230 && ( ! bad_lang_use_once
22ff3130 3231 || (
e5f10d49
KW
3232 /* disallow with "" or "0" */
3233 *bad_lang_use_once
3234 && strNE("0", bad_lang_use_once)))));
ea92aad8 3235
291a84fb 3236 /* setlocale() return vals; not copied so must be looked at immediately */
8de4332b 3237 const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
291a84fb
KW
3238
3239 /* current locale for given category; should have been copied so aren't
3240 * volatile */
8de4332b 3241 const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
291a84fb 3242
7d4bcc4a
KW
3243# ifdef WIN32
3244
6bce99ee
JH
3245 /* In some systems you can find out the system default locale
3246 * and use that as the fallback locale. */
7d4bcc4a
KW
3247# define SYSTEM_DEFAULT_LOCALE
3248# endif
3249# ifdef SYSTEM_DEFAULT_LOCALE
3250
65ebb059 3251 const char *system_default_locale = NULL;
98994639 3252
7d4bcc4a 3253# endif
948523db
KW
3254
3255# ifndef DEBUGGING
3256# define DEBUG_LOCALE_INIT(a,b,c)
3257# else
7d4bcc4a 3258
8298454c 3259 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
7d4bcc4a
KW
3260
3261# define DEBUG_LOCALE_INIT(category, locale, result) \
1604cfb0
MS
3262 STMT_START { \
3263 if (debug_initialization) { \
2fcc0ca9
KW
3264 PerlIO_printf(Perl_debug_log, \
3265 "%s:%d: %s\n", \
3266 __FILE__, __LINE__, \
a4f00dcc 3267 setlocale_debug_string(category, \
2fcc0ca9
KW
3268 locale, \
3269 result)); \
3270 } \
1604cfb0 3271 } STMT_END
2fcc0ca9 3272
948523db
KW
3273/* Make sure the parallel arrays are properly set up */
3274# ifdef USE_LOCALE_NUMERIC
3275 assert(categories[LC_NUMERIC_INDEX] == LC_NUMERIC);
3276 assert(strEQ(category_names[LC_NUMERIC_INDEX], "LC_NUMERIC"));
e9bc6d6b
KW
3277# ifdef USE_POSIX_2008_LOCALE
3278 assert(category_masks[LC_NUMERIC_INDEX] == LC_NUMERIC_MASK);
3279# endif
948523db
KW
3280# endif
3281# ifdef USE_LOCALE_CTYPE
3282 assert(categories[LC_CTYPE_INDEX] == LC_CTYPE);
3283 assert(strEQ(category_names[LC_CTYPE_INDEX], "LC_CTYPE"));
e9bc6d6b
KW
3284# ifdef USE_POSIX_2008_LOCALE
3285 assert(category_masks[LC_CTYPE_INDEX] == LC_CTYPE_MASK);
3286# endif
948523db
KW
3287# endif
3288# ifdef USE_LOCALE_COLLATE
3289 assert(categories[LC_COLLATE_INDEX] == LC_COLLATE);
3290 assert(strEQ(category_names[LC_COLLATE_INDEX], "LC_COLLATE"));
e9bc6d6b
KW
3291# ifdef USE_POSIX_2008_LOCALE
3292 assert(category_masks[LC_COLLATE_INDEX] == LC_COLLATE_MASK);
3293# endif
948523db
KW
3294# endif
3295# ifdef USE_LOCALE_TIME
3296 assert(categories[LC_TIME_INDEX] == LC_TIME);
3297 assert(strEQ(category_names[LC_TIME_INDEX], "LC_TIME"));
e9bc6d6b
KW
3298# ifdef USE_POSIX_2008_LOCALE
3299 assert(category_masks[LC_TIME_INDEX] == LC_TIME_MASK);
3300# endif
948523db
KW
3301# endif
3302# ifdef USE_LOCALE_MESSAGES
3303 assert(categories[LC_MESSAGES_INDEX] == LC_MESSAGES);
3304 assert(strEQ(category_names[LC_MESSAGES_INDEX], "LC_MESSAGES"));
e9bc6d6b
KW
3305# ifdef USE_POSIX_2008_LOCALE
3306 assert(category_masks[LC_MESSAGES_INDEX] == LC_MESSAGES_MASK);
3307# endif
948523db
KW
3308# endif
3309# ifdef USE_LOCALE_MONETARY
3310 assert(categories[LC_MONETARY_INDEX] == LC_MONETARY);
3311 assert(strEQ(category_names[LC_MONETARY_INDEX], "LC_MONETARY"));
e9bc6d6b
KW
3312# ifdef USE_POSIX_2008_LOCALE
3313 assert(category_masks[LC_MONETARY_INDEX] == LC_MONETARY_MASK);
3314# endif
948523db 3315# endif
9821811f
KW
3316# ifdef USE_LOCALE_ADDRESS
3317 assert(categories[LC_ADDRESS_INDEX] == LC_ADDRESS);
3318 assert(strEQ(category_names[LC_ADDRESS_INDEX], "LC_ADDRESS"));
e9bc6d6b
KW
3319# ifdef USE_POSIX_2008_LOCALE
3320 assert(category_masks[LC_ADDRESS_INDEX] == LC_ADDRESS_MASK);
3321# endif
9821811f
KW
3322# endif
3323# ifdef USE_LOCALE_IDENTIFICATION
3324 assert(categories[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION);
3325 assert(strEQ(category_names[LC_IDENTIFICATION_INDEX], "LC_IDENTIFICATION"));
e9bc6d6b
KW
3326# ifdef USE_POSIX_2008_LOCALE
3327 assert(category_masks[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION_MASK);
3328# endif
9821811f
KW
3329# endif
3330# ifdef USE_LOCALE_MEASUREMENT
3331 assert(categories[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT);
3332 assert(strEQ(category_names[LC_MEASUREMENT_INDEX], "LC_MEASUREMENT"));
e9bc6d6b
KW
3333# ifdef USE_POSIX_2008_LOCALE
3334 assert(category_masks[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT_MASK);
3335# endif
9821811f
KW
3336# endif
3337# ifdef USE_LOCALE_PAPER
3338 assert(categories[LC_PAPER_INDEX] == LC_PAPER);
3339 assert(strEQ(category_names[LC_PAPER_INDEX], "LC_PAPER"));
e9bc6d6b
KW
3340# ifdef USE_POSIX_2008_LOCALE
3341 assert(category_masks[LC_PAPER_INDEX] == LC_PAPER_MASK);
3342# endif
9821811f
KW
3343# endif
3344# ifdef USE_LOCALE_TELEPHONE
3345 assert(categories[LC_TELEPHONE_INDEX] == LC_TELEPHONE);
3346 assert(strEQ(category_names[LC_TELEPHONE_INDEX], "LC_TELEPHONE"));
e9bc6d6b
KW
3347# ifdef USE_POSIX_2008_LOCALE
3348 assert(category_masks[LC_TELEPHONE_INDEX] == LC_TELEPHONE_MASK);
3349# endif
9821811f 3350# endif
36dbc955
KW
3351# ifdef USE_LOCALE_SYNTAX
3352 assert(categories[LC_SYNTAX_INDEX] == LC_SYNTAX);
3353 assert(strEQ(category_names[LC_SYNTAX_INDEX], "LC_SYNTAX"));
3354# ifdef USE_POSIX_2008_LOCALE
3355 assert(category_masks[LC_SYNTAX_INDEX] == LC_SYNTAX_MASK);
3356# endif
3357# endif
3358# ifdef USE_LOCALE_TOD
3359 assert(categories[LC_TOD_INDEX] == LC_TOD);
3360 assert(strEQ(category_names[LC_TOD_INDEX], "LC_TOD"));
3361# ifdef USE_POSIX_2008_LOCALE
3362 assert(category_masks[LC_TOD_INDEX] == LC_TOD_MASK);
3363# endif
3364# endif
948523db
KW
3365# ifdef LC_ALL
3366 assert(categories[LC_ALL_INDEX] == LC_ALL);
3367 assert(strEQ(category_names[LC_ALL_INDEX], "LC_ALL"));
3368 assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX);
e9bc6d6b
KW
3369# ifdef USE_POSIX_2008_LOCALE
3370 assert(category_masks[LC_ALL_INDEX] == LC_ALL_MASK);
3371# endif
948523db
KW
3372# endif
3373# endif /* DEBUGGING */
47280b20 3374
5a6637f0
KW
3375 /* Initialize the per-thread mbrFOO() state variables. See POSIX.xs for
3376 * why these particular incantations are used. */
d2c9cb53
KW
3377#ifdef HAS_MBRLEN
3378 memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
3379#endif
5a6637f0
KW
3380#ifdef HAS_MBRTOWC
3381 memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
3382#endif
3383#ifdef HAS_WCTOMBR
3384 wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
3385#endif
d2c9cb53 3386
47280b20
KW
3387 /* Initialize the cache of the program's UTF-8ness for the always known
3388 * locales C and POSIX */
3389 my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness,
3390 sizeof(PL_locale_utf8ness));
3391
8a2e41ff
HS
3392 /* See https://github.com/Perl/perl5/issues/17824 */
3393 Zero(curlocales, NOMINAL_LC_ALL_INDEX, char *);
3394
e9bc6d6b
KW
3395# ifdef USE_THREAD_SAFE_LOCALE
3396# ifdef WIN32
3397
3398 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
3399
3400# endif
3401# endif
39e69e77 3402# ifdef USE_POSIX_2008_LOCALE
e9bc6d6b
KW
3403
3404 PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0);
3405 if (! PL_C_locale_obj) {
3406 Perl_croak_nocontext(
3407 "panic: Cannot create POSIX 2008 C locale object; errno=%d", errno);
3408 }
3409 if (DEBUG_Lv_TEST || debug_initialization) {
3410 PerlIO_printf(Perl_debug_log, "%s:%d: created C object %p\n", __FILE__, __LINE__, PL_C_locale_obj);
3411 }
3412
3413# endif
3414
78f7c09f
KW
3415# ifdef USE_LOCALE_NUMERIC
3416
3ca88433
KW
3417 PL_numeric_radix_sv = newSVpvs(".");
3418
78f7c09f
KW
3419# endif
3420
e9bc6d6b
KW
3421# if defined(USE_POSIX_2008_LOCALE) && ! defined(HAS_QUERYLOCALE)
3422
3423 /* Initialize our records. If we have POSIX 2008, we have LC_ALL */
3424 do_setlocale_c(LC_ALL, my_setlocale(LC_ALL, NULL));
3425
3426# endif
b56c4436 3427# ifdef LOCALE_ENVIRON_REQUIRED
98994639
HS
3428
3429 /*
3430 * Ultrix setlocale(..., "") fails if there are no environment
3431 * variables from which to get a locale name.
3432 */
3433
b56c4436
KW
3434# ifndef LC_ALL
3435# error Ultrix without LC_ALL not implemented
3436# else
7d4bcc4a 3437
b56c4436
KW
3438 {
3439 bool done = FALSE;
ea92aad8
KW
3440 if (lang) {
3441 sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, setlocale_init);
3442 DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result[LC_ALL_INDEX]);
3443 if (sl_result[LC_ALL_INDEX])
3444 done = TRUE;
3445 else
e5f10d49 3446 setlocale_failure = TRUE;
ea92aad8
KW
3447 }
3448 if (! setlocale_failure) {
3449 const char * locale_param;
3450 for (i = 0; i < LC_ALL_INDEX; i++) {
3451 locale_param = (! done && (lang || PerlEnv_getenv(category_names[i])))
3452 ? setlocale_init
3453 : NULL;
3454 sl_result[i] = do_setlocale_r(categories[i], locale_param);
3455 if (! sl_result[i]) {
3456 setlocale_failure = TRUE;
3457 }
3458 DEBUG_LOCALE_INIT(categories[i], locale_param, sl_result[i]);
e5f10d49 3459 }
c835d6be 3460 }
7d4bcc4a
KW
3461 }
3462
3463# endif /* LC_ALL */
e5f10d49 3464# endif /* LOCALE_ENVIRON_REQUIRED */
98994639 3465
65ebb059 3466 /* We try each locale in the list until we get one that works, or exhaust
20a240df
KW
3467 * the list. Normally the loop is executed just once. But if setting the
3468 * locale fails, inside the loop we add fallback trials to the array and so
3469 * will execute the loop multiple times */
c3fcd832
KW
3470 trial_locales[0] = setlocale_init;
3471 trial_locales_count = 1;
7d4bcc4a 3472
65ebb059
KW
3473 for (i= 0; i < trial_locales_count; i++) {
3474 const char * trial_locale = trial_locales[i];
3475
3476 if (i > 0) {
3477
3478 /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
3479 * when i==0, but I (khw) don't think that behavior makes much
3480 * sense */
3481 setlocale_failure = FALSE;
3482
7d4bcc4a 3483# ifdef SYSTEM_DEFAULT_LOCALE
291a84fb 3484# ifdef WIN32 /* Note that assumes Win32 has LC_ALL */
7d4bcc4a 3485
65ebb059
KW
3486 /* On Windows machines, an entry of "" after the 0th means to use
3487 * the system default locale, which we now proceed to get. */
3488 if (strEQ(trial_locale, "")) {
3489 unsigned int j;
3490
3491 /* Note that this may change the locale, but we are going to do
3492 * that anyway just below */
837ce802 3493 system_default_locale = do_setlocale_c(LC_ALL, "");
5d1187d1 3494 DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
65ebb059 3495
7d4bcc4a 3496 /* Skip if invalid or if it's already on the list of locales to
65ebb059
KW
3497 * try */
3498 if (! system_default_locale) {
3499 goto next_iteration;
3500 }
3501 for (j = 0; j < trial_locales_count; j++) {
3502 if (strEQ(system_default_locale, trial_locales[j])) {
3503 goto next_iteration;
3504 }
3505 }
3506
3507 trial_locale = system_default_locale;
3508 }
ec0202b5
KW
3509# else
3510# error SYSTEM_DEFAULT_LOCALE only implemented for Win32
3511# endif
7d4bcc4a 3512# endif /* SYSTEM_DEFAULT_LOCALE */
291a84fb
KW
3513
3514 } /* For i > 0 */
65ebb059 3515
7d4bcc4a
KW
3516# ifdef LC_ALL
3517
948523db
KW
3518 sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, trial_locale);
3519 DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result[LC_ALL_INDEX]);
3520 if (! sl_result[LC_ALL_INDEX]) {
49c85077 3521 setlocale_failure = TRUE;
7cd8b568
KW
3522 }
3523 else {
3524 /* Since LC_ALL succeeded, it should have changed all the other
3525 * categories it can to its value; so we massage things so that the
3526 * setlocales below just return their category's current values.
3527 * This adequately handles the case in NetBSD where LC_COLLATE may
3528 * not be defined for a locale, and setting it individually will
7d4bcc4a 3529 * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
7cd8b568
KW
3530 * the POSIX locale. */
3531 trial_locale = NULL;
3532 }
7d4bcc4a
KW
3533
3534# endif /* LC_ALL */
98994639 3535
e5f10d49
KW
3536 if (! setlocale_failure) {
3537 unsigned int j;
3538 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
3539 curlocales[j]
3540 = savepv(do_setlocale_r(categories[j], trial_locale));
3541 if (! curlocales[j]) {
3542 setlocale_failure = TRUE;
3543 }
3544 DEBUG_LOCALE_INIT(categories[j], trial_locale, curlocales[j]);
3545 }
c835d6be 3546
f6665bcd 3547 if (LIKELY(! setlocale_failure)) { /* All succeeded */
e5f10d49 3548 break; /* Exit trial_locales loop */
49c85077 3549 }
65ebb059 3550 }
98994639 3551
49c85077
KW
3552 /* Here, something failed; will need to try a fallback. */
3553 ok = 0;
65ebb059 3554
49c85077
KW
3555 if (i == 0) {
3556 unsigned int j;
98994639 3557
65ebb059 3558 if (locwarn) { /* Output failure info only on the first one */
7d4bcc4a
KW
3559
3560# ifdef LC_ALL
98994639 3561
49c85077
KW
3562 PerlIO_printf(Perl_error_log,
3563 "perl: warning: Setting locale failed.\n");
98994639 3564
7d4bcc4a 3565# else /* !LC_ALL */
98994639 3566
49c85077
KW
3567 PerlIO_printf(Perl_error_log,
3568 "perl: warning: Setting locale failed for the categories:\n\t");
7d4bcc4a 3569
e5f10d49
KW
3570 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
3571 if (! curlocales[j]) {
3572 PerlIO_printf(Perl_error_log, category_names[j]);
3573 }
3574 else {
3575 Safefree(curlocales[j]);
3576 }
3577 }
7d4bcc4a 3578
7d4bcc4a 3579# endif /* LC_ALL */
98994639 3580
49c85077
KW
3581 PerlIO_printf(Perl_error_log,
3582 "perl: warning: Please check that your locale settings:\n");
98994639 3583
7d4bcc4a
KW
3584# ifdef __GLIBC__
3585
49c85077
KW
3586 PerlIO_printf(Perl_error_log,
3587 "\tLANGUAGE = %c%s%c,\n",
3588 language ? '"' : '(',
3589 language ? language : "unset",
3590 language ? '"' : ')');
7d4bcc4a 3591# endif
98994639 3592
49c85077
KW
3593 PerlIO_printf(Perl_error_log,
3594 "\tLC_ALL = %c%s%c,\n",
3595 lc_all ? '"' : '(',
3596 lc_all ? lc_all : "unset",
3597 lc_all ? '"' : ')');
98994639 3598
7d4bcc4a
KW
3599# if defined(USE_ENVIRON_ARRAY)
3600
49c85077 3601 {
cd999af9 3602 char **e;
d5e32b93
KW
3603
3604 /* Look through the environment for any variables of the
3605 * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
3606 * already handled above. These are assumed to be locale
3607 * settings. Output them and their values. */
cd999af9 3608 for (e = environ; *e; e++) {
d5e32b93
KW
3609 const STRLEN prefix_len = sizeof("LC_") - 1;
3610 STRLEN uppers_len;
3611
cd999af9 3612 if ( strBEGINs(*e, "LC_")
c8b388b0 3613 && ! strBEGINs(*e, "LC_ALL=")
d5e32b93
KW
3614 && (uppers_len = strspn(*e + prefix_len,
3615 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
3616 && ((*e)[prefix_len + uppers_len] == '='))
cd999af9
KW
3617 {
3618 PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
d5e32b93
KW
3619 (int) (prefix_len + uppers_len), *e,
3620 *e + prefix_len + uppers_len + 1);
cd999af9
KW
3621 }
3622 }
49c85077 3623 }
7d4bcc4a
KW
3624
3625# else
3626
49c85077
KW
3627 PerlIO_printf(Perl_error_log,
3628 "\t(possibly more locale environment variables)\n");
7d4bcc4a
KW
3629
3630# endif
98994639 3631
49c85077
KW
3632 PerlIO_printf(Perl_error_log,
3633 "\tLANG = %c%s%c\n",
3634 lang ? '"' : '(',
3635 lang ? lang : "unset",
3636 lang ? '"' : ')');
98994639 3637
49c85077
KW
3638 PerlIO_printf(Perl_error_log,
3639 " are supported and installed on your system.\n");
3640 }
98994639 3641
65ebb059 3642 /* Calculate what fallback locales to try. We have avoided this
f6bab5f6 3643 * until we have to, because failure is quite unlikely. This will
65ebb059
KW
3644 * usually change the upper bound of the loop we are in.
3645 *
3646 * Since the system's default way of setting the locale has not
3647 * found one that works, We use Perl's defined ordering: LC_ALL,
3648 * LANG, and the C locale. We don't try the same locale twice, so
3649 * don't add to the list if already there. (On POSIX systems, the
3650 * LC_ALL element will likely be a repeat of the 0th element "",
6b058d42
KW
3651 * but there's no harm done by doing it explicitly.
3652 *
3653 * Note that this tries the LC_ALL environment variable even on
3654 * systems which have no LC_ALL locale setting. This may or may
3655 * not have been originally intentional, but there's no real need
3656 * to change the behavior. */
65ebb059
KW
3657 if (lc_all) {
3658 for (j = 0; j < trial_locales_count; j++) {
3659 if (strEQ(lc_all, trial_locales[j])) {
3660 goto done_lc_all;
3661 }
3662 }
3663 trial_locales[trial_locales_count++] = lc_all;
3664 }
3665 done_lc_all:
98994639 3666
65ebb059
KW
3667 if (lang) {
3668 for (j = 0; j < trial_locales_count; j++) {
3669 if (strEQ(lang, trial_locales[j])) {
3670 goto done_lang;
3671 }
3672 }
3673 trial_locales[trial_locales_count++] = lang;
3674 }
3675 done_lang:
3676
7d4bcc4a
KW
3677# if defined(WIN32) && defined(LC_ALL)
3678
65ebb059
KW
3679 /* For Windows, we also try the system default locale before "C".
3680 * (If there exists a Windows without LC_ALL we skip this because
3681 * it gets too complicated. For those, the "C" is the next
3682 * fallback possibility). The "" is the same as the 0th element of
3683 * the array, but the code at the loop above knows to treat it
3684 * differently when not the 0th */
3685 trial_locales[trial_locales_count++] = "";
7d4bcc4a
KW
3686
3687# endif
65ebb059
KW
3688
3689 for (j = 0; j < trial_locales_count; j++) {
3690 if (strEQ("C", trial_locales[j])) {
3691 goto done_C;
3692 }
3693 }
3694 trial_locales[trial_locales_count++] = "C";
98994639 3695
65ebb059
KW
3696 done_C: ;
3697 } /* end of first time through the loop */
98994639 3698
7d4bcc4a
KW
3699# ifdef WIN32
3700
65ebb059 3701 next_iteration: ;
7d4bcc4a
KW
3702
3703# endif
65ebb059
KW
3704
3705 } /* end of looping through the trial locales */
3706
3707 if (ok < 1) { /* If we tried to fallback */
3708 const char* msg;
3709 if (! setlocale_failure) { /* fallback succeeded */
3710 msg = "Falling back to";
3711 }
3712 else { /* fallback failed */
e5f10d49 3713 unsigned int j;
98994639 3714
65ebb059
KW
3715 /* We dropped off the end of the loop, so have to decrement i to
3716 * get back to the value the last time through */
3717 i--;
98994639 3718
65ebb059
KW
3719 ok = -1;
3720 msg = "Failed to fall back to";
3721
3722 /* To continue, we should use whatever values we've got */
7d4bcc4a 3723
e5f10d49
KW
3724 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
3725 Safefree(curlocales[j]);
3726 curlocales[j] = savepv(do_setlocale_r(categories[j], NULL));
3727 DEBUG_LOCALE_INIT(categories[j], NULL, curlocales[j]);
3728 }
65ebb059
KW
3729 }
3730
3731 if (locwarn) {
3732 const char * description;
3733 const char * name = "";
3734 if (strEQ(trial_locales[i], "C")) {
3735 description = "the standard locale";
3736 name = "C";
3737 }
7d4bcc4a
KW
3738
3739# ifdef SYSTEM_DEFAULT_LOCALE
3740
65ebb059
KW
3741 else if (strEQ(trial_locales[i], "")) {
3742 description = "the system default locale";
3743 if (system_default_locale) {
3744 name = system_default_locale;
3745 }
3746 }
7d4bcc4a
KW
3747
3748# endif /* SYSTEM_DEFAULT_LOCALE */
3749
65ebb059
KW
3750 else {
3751 description = "a fallback locale";
3752 name = trial_locales[i];
3753 }
3754 if (name && strNE(name, "")) {
3755 PerlIO_printf(Perl_error_log,
3756 "perl: warning: %s %s (\"%s\").\n", msg, description, name);
3757 }
3758 else {
3759 PerlIO_printf(Perl_error_log,
3760 "perl: warning: %s %s.\n", msg, description);
3761 }
3762 }
3763 } /* End of tried to fallback */
98994639 3764
e5f10d49
KW
3765 /* Done with finding the locales; update our records */
3766
7d4bcc4a
KW
3767# ifdef USE_LOCALE_CTYPE
3768
948523db 3769 new_ctype(curlocales[LC_CTYPE_INDEX]);
98994639 3770
e5f10d49 3771# endif
7d4bcc4a
KW
3772# ifdef USE_LOCALE_COLLATE
3773
948523db 3774 new_collate(curlocales[LC_COLLATE_INDEX]);
98994639 3775
e5f10d49 3776# endif
7d4bcc4a
KW
3777# ifdef USE_LOCALE_NUMERIC
3778
948523db 3779 new_numeric(curlocales[LC_NUMERIC_INDEX]);
e5f10d49
KW
3780
3781# endif
3782
948523db 3783 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
47280b20 3784
e9bc6d6b 3785# if defined(USE_ITHREADS) && ! defined(USE_THREAD_SAFE_LOCALE)
47280b20
KW
3786
3787 /* This caches whether each category's locale is UTF-8 or not. This
3788 * may involve changing the locale. It is ok to do this at
e9bc6d6b
KW
3789 * initialization time before any threads have started, but not later
3790 * unless thread-safe operations are used.
47280b20
KW
3791 * Caching means that if the program heeds our dictate not to change
3792 * locales in threaded applications, this data will remain valid, and
9de6fe47
KW
3793 * it may get queried without having to change locales. If the
3794 * environment is such that all categories have the same locale, this
3795 * isn't needed, as the code will not change the locale; but this
3796 * handles the uncommon case where the environment has disparate
3797 * locales for the categories */
47280b20
KW
3798 (void) _is_cur_LC_category_utf8(categories[i]);
3799
3800# endif
3801
e5f10d49
KW
3802 Safefree(curlocales[i]);
3803 }
b310b053 3804
7d4bcc4a
KW
3805# if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
3806
49c85077 3807 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
50bf02bd
KW
3808 * locale is UTF-8. The call to new_ctype() just above has already
3809 * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
3810 * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
3811 * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
3812 * STDIN, STDOUT, STDERR, _and_ the default open discipline. */
3813 PL_utf8locale = PL_in_utf8_CTYPE_locale;
49c85077 3814
a05d7ebb 3815 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
fde18df1
JH
3816 This is an alternative to using the -C command line switch
3817 (the -C if present will override this). */
3818 {
1604cfb0
MS
3819 const char *p = PerlEnv_getenv("PERL_UNICODE");
3820 PL_unicode = p ? parse_unicode_opts(&p) : 0;
3821 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
3822 PL_utf8cache = -1;
b310b053
JH
3823 }
3824
7d4bcc4a 3825# endif
e3305790 3826#endif /* USE_LOCALE */
2fcc0ca9 3827#ifdef DEBUGGING
7d4bcc4a 3828
2fcc0ca9 3829 /* So won't continue to output stuff */
27cdc72e 3830 DEBUG_INITIALIZATION_set(FALSE);
7d4bcc4a 3831
2fcc0ca9
KW
3832#endif
3833
98994639
HS
3834 return ok;
3835}
3836
98994639
HS
3837#ifdef USE_LOCALE_COLLATE
3838
a4a439fb 3839char *
a4a439fb
KW
3840Perl__mem_collxfrm(pTHX_ const char *input_string,
3841 STRLEN len, /* Length of 'input_string' */
3842 STRLEN *xlen, /* Set to length of returned string
3843 (not including the collation index
3844 prefix) */
3845 bool utf8 /* Is the input in UTF-8? */
6696cfa7 3846 )
98994639 3847{
a4a439fb
KW
3848
3849 /* _mem_collxfrm() is a bit like strxfrm() but with two important
3850 * differences. First, it handles embedded NULs. Second, it allocates a bit
3851 * more memory than needed for the transformed data itself. The real
55e5378d 3852 * transformed data begins at offset COLLXFRM_HDR_LEN. *xlen is set to
a4a439fb
KW
3853 * the length of that, and doesn't include the collation index size.
3854 * Please see sv_collxfrm() to see how this is used. */
3855
55e5378d
KW
3856#define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
3857
6696cfa7
KW
3858 char * s = (char *) input_string;
3859 STRLEN s_strlen = strlen(input_string);
79f120c8 3860 char *xbuf = NULL;
55e5378d 3861 STRLEN xAlloc; /* xalloc is a reserved word in VC */
17f41037 3862 STRLEN length_in_chars;
c664130f 3863 bool first_time = TRUE; /* Cleared after first loop iteration */
98994639 3864
a4a439fb
KW
3865 PERL_ARGS_ASSERT__MEM_COLLXFRM;
3866
3867 /* Must be NUL-terminated */
3868 assert(*(input_string + len) == '\0');
7918f24d 3869
79f120c8
KW
3870 /* If this locale has defective collation, skip */
3871 if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
c7202dee
KW
3872 DEBUG_L(PerlIO_printf(Perl_debug_log,
3873 "_mem_collxfrm: locale's collation is defective\n"));
79f120c8
KW
3874 goto bad;
3875 }
3876
6696cfa7
KW
3877 /* Replace any embedded NULs with the control that sorts before any others.
3878 * This will give as good as possible results on strings that don't
3879 * otherwise contain that character, but otherwise there may be
3880 * less-than-perfect results with that character and NUL. This is
fdc080f3 3881 * unavoidable unless we replace strxfrm with our own implementation. */
fd43f63c
KW
3882 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
3883 NUL */
6696cfa7
KW
3884 char * e = s + len;
3885 char * sans_nuls;
fdc080f3 3886 STRLEN sans_nuls_len;
94762aa0 3887 int try_non_controls;
afc4976f
KW
3888 char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
3889 making sure 2nd byte is NUL.
3890 */
3891 STRLEN this_replacement_len;
3892
1e4c9676
KW
3893 /* If we don't know what non-NUL control character sorts lowest for
3894 * this locale, find it */
f28f4d2a 3895 if (PL_strxfrm_NUL_replacement == '\0') {
6696cfa7 3896 int j;
afc4976f 3897 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
6696cfa7
KW
3898 includes the collation index
3899 prefixed. */
3900
91c0e2e0 3901 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
94762aa0
KW
3902
3903 /* Unlikely, but it may be that no control will work to replace
1e4c9676
KW
3904 * NUL, in which case we instead look for any character. Controls
3905 * are preferred because collation order is, in general, context
3906 * sensitive, with adjoining characters affecting the order, and
3907 * controls are less likely to have such interactions, allowing the
3908 * NUL-replacement to stand on its own. (Another way to look at it
3909 * is to imagine what would happen if the NUL were replaced by a
3910 * combining character; it wouldn't work out all that well.) */
94762aa0
KW
3911 for (try_non_controls = 0;
3912 try_non_controls < 2;
3913 try_non_controls++)
3914 {
d4ff9586
KW
3915 /* Look through all legal code points (NUL isn't) */
3916 for (j = 1; j < 256; j++) {
3917 char * x; /* j's xfrm plus collation index */
3918 STRLEN x_len; /* length of 'x' */
3919 STRLEN trial_len = 1;
736a4fed 3920 char cur_source[] = { '\0', '\0' };
d4ff9586 3921
736a4fed
KW
3922 /* Skip non-controls the first time through the loop. The
3923 * controls in a UTF-8 locale are the L1 ones */
afc4976f
KW
3924 if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
3925 ? ! isCNTRL_L1(j)
3926 : ! isCNTRL_LC(j))
3927 {
d4ff9586 3928 continue;
6696cfa7 3929 }
6696cfa7 3930
736a4fed
KW
3931 /* Create a 1-char string of the current code point */
3932 cur_source[0] = (char) j;
3933
d4ff9586
KW
3934 /* Then transform it */
3935 x = _mem_collxfrm(cur_source, trial_len, &x_len,
afc4976f 3936 0 /* The string is not in UTF-8 */);
6696cfa7 3937
1e4c9676 3938 /* Ignore any character that didn't successfully transform.
d4ff9586
KW
3939 * */
3940 if (! x) {
3941 continue;
3942 }
6696cfa7 3943
d4ff9586
KW
3944 /* If this character's transformation is lower than
3945 * the current lowest, this one becomes the lowest */
3946 if ( cur_min_x == NULL
3947 || strLT(x + COLLXFRM_HDR_LEN,
3948 cur_min_x + COLLXFRM_HDR_LEN))
3949 {
f28f4d2a 3950 PL_strxfrm_NUL_replacement = j;
62d66863 3951 Safefree(cur_min_x);
d4ff9586 3952 cur_min_x = x;
d4ff9586
KW
3953 }
3954 else {
3955 Safefree(x);
3956 }
1e4c9676 3957 } /* end of loop through all 255 characters */
6696cfa7 3958
1e4c9676 3959 /* Stop looking if found */
94762aa0
KW
3960 if (cur_min_x) {
3961 break;
3962 }
3963
3964 /* Unlikely, but possible, if there aren't any controls that
3965 * work in the locale, repeat the loop, looking for any
3966 * character that works */
3967 DEBUG_L(PerlIO_printf(Perl_debug_log,
3968 "_mem_collxfrm: No control worked. Trying non-controls\n"));
1e4c9676 3969 } /* End of loop to try first the controls, then any char */
6696cfa7 3970
94762aa0
KW
3971 if (! cur_min_x) {
3972 DEBUG_L(PerlIO_printf(Perl_debug_log,
3973 "_mem_collxfrm: Couldn't find any character to replace"
3974 " embedded NULs in locale %s with", PL_collation_name));
3975 goto bad;
58eebef2
KW
3976 }
3977
94762aa0
KW
3978 DEBUG_L(PerlIO_printf(Perl_debug_log,
3979 "_mem_collxfrm: Replacing embedded NULs in locale %s with "
f28f4d2a 3980 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
94762aa0 3981
6696cfa7 3982 Safefree(cur_min_x);
1e4c9676 3983 } /* End of determining the character that is to replace NULs */
afc4976f
KW
3984
3985 /* If the replacement is variant under UTF-8, it must match the
291a84fb 3986 * UTF8-ness of the original */
f28f4d2a
KW
3987 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
3988 this_replacement_char[0] =
3989 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
3990 this_replacement_char[1] =
3991 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
afc4976f
KW
3992 this_replacement_len = 2;
3993 }
3994 else {
f28f4d2a 3995 this_replacement_char[0] = PL_strxfrm_NUL_replacement;
afc4976f
KW
3996 /* this_replacement_char[1] = '\0' was done at initialization */
3997 this_replacement_len = 1;
6696cfa7
KW
3998 }
3999
4000 /* The worst case length for the replaced string would be if every
4001 * character in it is NUL. Multiply that by the length of each
4002 * replacement, and allow for a trailing NUL */
afc4976f 4003 sans_nuls_len = (len * this_replacement_len) + 1;
fdc080f3 4004 Newx(sans_nuls, sans_nuls_len, char);
6696cfa7
KW
4005 *sans_nuls = '\0';
4006
6696cfa7
KW
4007 /* Replace each NUL with the lowest collating control. Loop until have
4008 * exhausted all the NULs */
4009 while (s + s_strlen < e) {
6069d6c5 4010 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
4011
4012 /* Do the actual replacement */
6069d6c5 4013 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
6696cfa7
KW
4014
4015 /* Move past the input NUL */
4016 s += s_strlen + 1;
4017 s_strlen = strlen(s);
4018 }
4019
4020 /* And add anything that trails the final NUL */
6069d6c5 4021 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
4022
4023 /* Switch so below we transform this modified string */
4024 s = sans_nuls;
4025 len = strlen(s);
1e4c9676 4026 } /* End of replacing NULs */
6696cfa7 4027
a4a439fb
KW
4028 /* Make sure the UTF8ness of the string and locale match */
4029 if (utf8 != PL_in_utf8_COLLATE_locale) {
9de6fe47 4030 /* XXX convert above Unicode to 10FFFF? */
a4a439fb
KW
4031 const char * const t = s; /* Temporary so we can later find where the
4032 input was */
4033
4034 /* Here they don't match. Change the string's to be what the locale is
4035 * expecting */
4036
4037 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
4038 s = (char *) bytes_to_utf8((const U8 *) s, &len);
4039 utf8 = TRUE;
4040 }
4041 else { /* locale is not UTF-8; but input is; downgrade the input */
4042
4043 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
4044
4045 /* If the downgrade was successful we are done, but if the input
4046 * contains things that require UTF-8 to represent, have to do
4047 * damage control ... */
4048 if (UNLIKELY(utf8)) {
4049
4050 /* What we do is construct a non-UTF-8 string with
4051 * 1) the characters representable by a single byte converted
4052 * to be so (if necessary);
4053 * 2) and the rest converted to collate the same as the
4054 * highest collating representable character. That makes
4055 * them collate at the end. This is similar to how we
4056 * handle embedded NULs, but we use the highest collating
4057 * code point instead of the smallest. Like the NUL case,
4058 * this isn't perfect, but is the best we can reasonably
4059 * do. Every above-255 code point will sort the same as
4060 * the highest-sorting 0-255 code point. If that code
4061 * point can combine in a sequence with some other code
4062 * points for weight calculations, us changing something to
4063 * be it can adversely affect the results. But in most
4064 * cases, it should work reasonably. And note that this is
4065 * really an illegal situation: using code points above 255
4066 * on a locale where only 0-255 are valid. If two strings
4067 * sort entirely equal, then the sort order for the
4068 * above-255 code points will be in code point order. */
4069
4070 utf8 = FALSE;
4071
4072 /* If we haven't calculated the code point with the maximum
4073 * collating order for this locale, do so now */
4074 if (! PL_strxfrm_max_cp) {
4075 int j;
4076
4077 /* The current transformed string that collates the
4078 * highest (except it also includes the prefixed collation
4079 * index. */
4080 char * cur_max_x = NULL;
4081
4082 /* Look through all legal code points (NUL isn't) */
4083 for (j = 1; j < 256; j++) {
4084 char * x;
4085 STRLEN x_len;
736a4fed 4086 char cur_source[] = { '\0', '\0' };
a4a439fb 4087
736a4fed
KW
4088 /* Create a 1-char string of the current code point */
4089 cur_source[0] = (char) j;
a4a439fb
KW
4090
4091 /* Then transform it */
4092 x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
4093
4094 /* If something went wrong (which it shouldn't), just
4095 * ignore this code point */
94762aa0 4096 if (! x) {
a4a439fb
KW
4097 continue;
4098 }
4099
4100 /* If this character's transformation is higher than
4101 * the current highest, this one becomes the highest */
4102 if ( cur_max_x == NULL
55e5378d
KW
4103 || strGT(x + COLLXFRM_HDR_LEN,
4104 cur_max_x + COLLXFRM_HDR_LEN))
a4a439fb
KW
4105 {
4106 PL_strxfrm_max_cp = j;
62d66863 4107 Safefree(cur_max_x);
a4a439fb
KW
4108 cur_max_x = x;
4109 }
4110 else {
4111 Safefree(x);
4112 }
4113 }
4114
94762aa0
KW
4115 if (! cur_max_x) {
4116 DEBUG_L(PerlIO_printf(Perl_debug_log,
4117 "_mem_collxfrm: Couldn't find any character to"
4118 " replace above-Latin1 chars in locale %s with",
4119 PL_collation_name));
4120 goto bad;
4121 }
4122
58eebef2
KW
4123 DEBUG_L(PerlIO_printf(Perl_debug_log,
4124 "_mem_collxfrm: highest 1-byte collating character"
4125 " in locale %s is 0x%02X\n",
4126 PL_collation_name,
4127 PL_strxfrm_max_cp));
58eebef2 4128
a4a439fb
KW
4129 Safefree(cur_max_x);
4130 }
4131
4132 /* Here we know which legal code point collates the highest.
4133 * We are ready to construct the non-UTF-8 string. The length
4134 * will be at least 1 byte smaller than the input string
4135 * (because we changed at least one 2-byte character into a
4136 * single byte), but that is eaten up by the trailing NUL */
4137 Newx(s, len, char);
4138
4139 {
4140 STRLEN i;
4141 STRLEN d= 0;
042d9e50 4142 char * e = (char *) t + len;
a4a439fb
KW
4143
4144 for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
4145 U8 cur_char = t[i];
4146 if (UTF8_IS_INVARIANT(cur_char)) {
4147 s[d++] = cur_char;
4148 }
042d9e50 4149 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
a4a439fb
KW
4150 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
4151 }
4152 else { /* Replace illegal cp with highest collating
4153 one */
4154 s[d++] = PL_strxfrm_max_cp;
4155 }
4156 }
4157 s[d++] = '\0';
4158 Renew(s, d, char); /* Free up unused space */
4159 }
4160 }
4161 }
4162
4163 /* Here, we have constructed a modified version of the input. It could
4164 * be that we already had a modified copy before we did this version.
4165 * If so, that copy is no longer needed */
4166 if (t != input_string) {
4167 Safefree(t);
4168 }
4169 }
4170
17f41037
KW
4171 length_in_chars = (utf8)
4172 ? utf8_length((U8 *) s, (U8 *) s + len)
4173 : len;
4174
59c018b9
KW
4175 /* The first element in the output is the collation id, used by
4176 * sv_collxfrm(); then comes the space for the transformed string. The
4177 * equation should give us a good estimate as to how much is needed */
55e5378d 4178 xAlloc = COLLXFRM_HDR_LEN
a4a439fb 4179 + PL_collxfrm_base
17f41037 4180 + (PL_collxfrm_mult * length_in_chars);
a02a5408 4181 Newx(xbuf, xAlloc, char);
c7202dee
KW
4182 if (UNLIKELY(! xbuf)) {
4183 DEBUG_L(PerlIO_printf(Perl_debug_log,
4184 "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
1604cfb0 4185 goto bad;
c7202dee 4186 }
98994639 4187
d35fca5f 4188 /* Store the collation id */
98994639 4189 *(U32*)xbuf = PL_collation_ix;
d35fca5f
KW
4190
4191 /* Then the transformation of the input. We loop until successful, or we
4192 * give up */
4ebeff16 4193 for (;;) {
1adab0a7 4194
55e5378d 4195 *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
4ebeff16
KW
4196
4197 /* If the transformed string occupies less space than we told strxfrm()
4198 * was available, it means it successfully transformed the whole
4199 * string. */
55e5378d 4200 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
17f41037 4201
1adab0a7
KW
4202 /* Some systems include a trailing NUL in the returned length.
4203 * Ignore it, using a loop in case multiple trailing NULs are
4204 * returned. */
4205 while ( (*xlen) > 0
4206 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
4207 {
4208 (*xlen)--;
4209 }
4210
17f41037
KW
4211 /* If the first try didn't get it, it means our prediction was low.
4212 * Modify the coefficients so that we predict a larger value in any
4213 * future transformations */
4214 if (! first_time) {
4215 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */
4216 STRLEN computed_guess = PL_collxfrm_base
4217 + (PL_collxfrm_mult * length_in_chars);
e1c30f0c
KW
4218
4219 /* On zero-length input, just keep current slope instead of
4220 * dividing by 0 */
4221 const STRLEN new_m = (length_in_chars != 0)
4222 ? needed / length_in_chars
4223 : PL_collxfrm_mult;
17f41037
KW
4224
4225 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
4226 "%s: %d: initial size of %zu bytes for a length "
4227 "%zu string was insufficient, %zu needed\n",
17f41037 4228 __FILE__, __LINE__,
b07929e4 4229 computed_guess, length_in_chars, needed));
17f41037
KW
4230
4231 /* If slope increased, use it, but discard this result for
4232 * length 1 strings, as we can't be sure that it's a real slope
4233 * change */
4234 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
7d4bcc4a
KW
4235
4236# ifdef DEBUGGING
4237
17f41037
KW
4238 STRLEN old_m = PL_collxfrm_mult;
4239 STRLEN old_b = PL_collxfrm_base;
7d4bcc4a
KW
4240
4241# endif
4242
17f41037
KW
4243 PL_collxfrm_mult = new_m;
4244 PL_collxfrm_base = 1; /* +1 For trailing NUL */
4245 computed_guess = PL_collxfrm_base
4246 + (PL_collxfrm_mult * length_in_chars);
4247 if (computed_guess < needed) {
4248 PL_collxfrm_base += needed - computed_guess;
4249 }
4250
4251 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
4252 "%s: %d: slope is now %zu; was %zu, base "
4253 "is now %zu; was %zu\n",
17f41037 4254 __FILE__, __LINE__,
b07929e4
KW
4255 PL_collxfrm_mult, old_m,
4256 PL_collxfrm_base, old_b));
17f41037
KW
4257 }
4258 else { /* Slope didn't change, but 'b' did */
4259 const STRLEN new_b = needed
4260 - computed_guess
4261 + PL_collxfrm_base;
4262 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4 4263 "%s: %d: base is now %zu; was %zu\n",
17f41037 4264 __FILE__, __LINE__,
b07929e4 4265 new_b, PL_collxfrm_base));
17f41037
KW
4266 PL_collxfrm_base = new_b;
4267 }
4268 }
4269
4ebeff16
KW
4270 break;
4271 }
bb0f664e 4272
c7202dee
KW
4273 if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
4274 DEBUG_L(PerlIO_printf(Perl_debug_log,
4275 "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
4276 *xlen, PERL_INT_MAX));
4ebeff16 4277 goto bad;
c7202dee 4278 }
d35fca5f 4279
c664130f 4280 /* A well-behaved strxfrm() returns exactly how much space it needs
1adab0a7
KW
4281 * (usually not including the trailing NUL) when it fails due to not
4282 * enough space being provided. Assume that this is the case unless
4283 * it's been proven otherwise */
c664130f 4284 if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
55e5378d 4285 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
c664130f
KW
4286 }
4287 else { /* Here, either:
4288 * 1) The strxfrm() has previously shown bad behavior; or
4289 * 2) It isn't the first time through the loop, which means
4290 * that the strxfrm() is now showing bad behavior, because
4291 * we gave it what it said was needed in the previous
4292 * iteration, and it came back saying it needed still more.
4293 * (Many versions of cygwin fit this. When the buffer size
4294 * isn't sufficient, they return the input size instead of
4295 * how much is needed.)
d4ff9586
KW
4296 * Increase the buffer size by a fixed percentage and try again.
4297 * */
6ddd902c 4298 xAlloc += (xAlloc / 4) + 1;
c664130f 4299 PL_strxfrm_is_behaved = FALSE;
c664130f 4300
7d4bcc4a
KW
4301# ifdef DEBUGGING
4302
58eebef2
KW
4303 if (DEBUG_Lv_TEST || debug_initialization) {
4304 PerlIO_printf(Perl_debug_log,
4305 "_mem_collxfrm required more space than previously calculated"
3e25f6d2
KW
4306 " for locale %s, trying again with new guess=%zu+%zu\n",
4307 PL_collation_name, COLLXFRM_HDR_LEN,
b07929e4 4308 xAlloc - COLLXFRM_HDR_LEN);
58eebef2 4309 }
7d4bcc4a
KW
4310
4311# endif
4312
58eebef2 4313 }
c664130f 4314
4ebeff16 4315 Renew(xbuf, xAlloc, char);
c7202dee
KW
4316 if (UNLIKELY(! xbuf)) {
4317 DEBUG_L(PerlIO_printf(Perl_debug_log,
4318 "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
4ebeff16 4319 goto bad;
c7202dee 4320 }
c664130f
KW
4321
4322 first_time = FALSE;
4ebeff16 4323 }
98994639 4324
6696cfa7 4325
7d4bcc4a
KW
4326# ifdef DEBUGGING
4327
58eebef2 4328 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee
KW
4329
4330 print_collxfrm_input_and_return(s, s + len, xlen, utf8);
4331 PerlIO_printf(Perl_debug_log, "Its xfrm is:");
7e2f38b2
KW
4332 PerlIO_printf(Perl_debug_log, "%s\n",
4333 _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
4334 *xlen, 1));
58eebef2 4335 }
7d4bcc4a
KW
4336
4337# endif
58eebef2 4338
3c5f993e 4339 /* Free up unneeded space; retain ehough for trailing NUL */
55e5378d 4340 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
98994639 4341
6696cfa7
KW
4342 if (s != input_string) {
4343 Safefree(s);
98994639
HS
4344 }
4345
98994639
HS
4346 return xbuf;
4347
4348 bad:
7d4bcc4a
KW
4349
4350# ifdef DEBUGGING
4351
58eebef2 4352 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee 4353 print_collxfrm_input_and_return(s, s + len, NULL, utf8);
58eebef2 4354 }
7d4bcc4a
KW
4355
4356# endif
4357
21dce8f4
KW
4358 Safefree(xbuf);
4359 if (s != input_string) {
4360 Safefree(s);
4361 }
4362 *xlen = 0;
4363
98994639
HS
4364 return NULL;
4365}
4366
7d4bcc4a 4367# ifdef DEBUGGING
c7202dee 4368
4cbaac56 4369STATIC void
c7202dee
KW
4370S_print_collxfrm_input_and_return(pTHX_
4371 const char * const s,
4372 const char * const e,
4373 const STRLEN * const xlen,
4374 const bool is_utf8)
4375{
c7202dee
KW
4376
4377 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
4378
511e4ff7
DM
4379 PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
4380 (UV)PL_collation_ix);
c7202dee 4381 if (xlen) {
08b6dc1d 4382 PerlIO_printf(Perl_debug_log, "%zu", *xlen);
c7202dee
KW
4383 }
4384 else {
4385 PerlIO_printf(Perl_debug_log, "NULL");
4386 }
4387 PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
4388 PL_collation_name);
9c8a6dc2
KW
4389 print_bytes_for_locale(s, e, is_utf8);
4390
4391 PerlIO_printf(Perl_debug_log, "'\n");
4392}
4393
4d9252fd
KW
4394# endif /* DEBUGGING */
4395#endif /* USE_LOCALE_COLLATE */
1c0e67b5
KW
4396#ifdef USE_LOCALE
4397# ifdef DEBUGGING
4d9252fd 4398
9c8a6dc2
KW
4399STATIC void
4400S_print_bytes_for_locale(pTHX_
4401 const char * const s,
4402 const char * const e,
4403 const bool is_utf8)
4404{
4405 const char * t = s;
4406 bool prev_was_printable = TRUE;
4407 bool first_time = TRUE;
4408
4409 PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
c7202dee
KW
4410
4411 while (t < e) {
4412 UV cp = (is_utf8)
4413 ? utf8_to_uvchr_buf((U8 *) t, e, NULL)
4414 : * (U8 *) t;
4415 if (isPRINT(cp)) {
4416 if (! prev_was_printable) {
4417 PerlIO_printf(Perl_debug_log, " ");
4418 }
4419 PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
4420 prev_was_printable = TRUE;
4421 }
4422 else {
4423 if (! first_time) {
4424 PerlIO_printf(Perl_debug_log, " ");
4425 }
147e3846 4426 PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
c7202dee
KW
4427 prev_was_printable = FALSE;
4428 }
4429 t += (is_utf8) ? UTF8SKIP(t) : 1;
4430 first_time = FALSE;
4431 }
c7202dee
KW
4432}
4433
1c0e67b5 4434# endif /* #ifdef DEBUGGING */
8ef6e574 4435
962aa53f
KW
4436STATIC const char *
4437S_switch_category_locale_to_template(pTHX_ const int switch_category, const int template_category, const char * template_locale)
4438{
4439 /* Changes the locale for LC_'switch_category" to that of
4440 * LC_'template_category', if they aren't already the same. If not NULL,
4441 * 'template_locale' is the locale that 'template_category' is in.
4442 *
9de6fe47
KW
4443 * Returns a copy of the name of the original locale for 'switch_category'
4444 * so can be switched back to with the companion function
4445 * restore_switched_locale(), (NULL if no restoral is necessary.) */
962aa53f
KW
4446
4447 char * restore_to_locale = NULL;
4448
4449 if (switch_category == template_category) { /* No changes needed */
4450 return NULL;
4451 }
4452
4453 /* Find the original locale of the category we may need to change, so that
4454 * it can be restored to later */
d6189047
KW
4455 restore_to_locale = stdize_locale(savepv(do_setlocale_r(switch_category,
4456 NULL)));
962aa53f
KW
4457 if (! restore_to_locale) {
4458 Perl_croak(aTHX_
4459 "panic: %s: %d: Could not find current %s locale, errno=%d\n",
4460 __FILE__, __LINE__, category_name(switch_category), errno);
4461 }
962aa53f
KW
4462
4463 /* If the locale of the template category wasn't passed in, find it now */
4464 if (template_locale == NULL) {
4465 template_locale = do_setlocale_r(template_category, NULL);
4466 if (! template_locale) {
4467 Perl_croak(aTHX_
4468 "panic: %s: %d: Could not find current %s locale, errno=%d\n",
4469 __FILE__, __LINE__, category_name(template_category), errno);
4470 }
4471 }
4472
4473 /* It the locales are the same, there's nothing to do */
4474 if (strEQ(restore_to_locale, template_locale)) {
4475 Safefree(restore_to_locale);
4476
4477 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n",
e1c8059f 4478 category_name(switch_category), template_locale));
962aa53f
KW
4479
4480 return NULL;
4481 }
4482
4483 /* Finally, change the locale to the template one */
4484 if (! do_setlocale_r(switch_category, template_locale)) {
4485 Perl_croak(aTHX_
4486 "panic: %s: %d: Could not change %s locale to %s, errno=%d\n",
4487 __FILE__, __LINE__, category_name(switch_category),
4488 template_locale, errno);
4489 }
4490
4491 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale switched to %s\n",
4492 category_name(switch_category), template_locale));
4493
4494 return restore_to_locale;
4495}
4496
4497STATIC void
4498S_restore_switched_locale(pTHX_ const int category, const char * const original_locale)
4499{
9de6fe47
KW
4500 /* Restores the locale for LC_'category' to 'original_locale' (which is a
4501 * copy that will be freed by this function), or do nothing if the latter
4502 * parameter is NULL */
962aa53f
KW
4503
4504 if (original_locale == NULL) {
4505 return;
4506 }
4507
4508 if (! do_setlocale_r(category, original_locale)) {
4509 Perl_croak(aTHX_
4510 "panic: %s: %d: setlocale %s restore to %s failed, errno=%d\n",
4511 __FILE__, __LINE__,
4512 category_name(category), original_locale, errno);
4513 }
4514
4515 Safefree(original_locale);
4516}
4517
51332242
N
4518/* is_cur_LC_category_utf8 uses a small char buffer to avoid malloc/free */
4519#define CUR_LC_BUFFER_SIZE 64
4520
c1284011
KW
4521bool
4522Perl__is_cur_LC_category_utf8(pTHX_ int category)
7d74bb61
KW
4523{
4524 /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
4525 * otherwise. 'category' may not be LC_ALL. If the platform doesn't have
119ee68b 4526 * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
609548d2
KW
4527 * could give the wrong result. The result will very likely be correct for
4528 * languages that have commonly used non-ASCII characters, but for notably
4529 * English, it comes down to if the locale's name ends in something like
9de6fe47
KW
4530 * "UTF-8". It errs on the side of not being a UTF-8 locale.
4531 *
4532 * If the platform is early C89, not containing mbtowc(), or we are
4533 * compiled to not pay attention to LC_CTYPE, this employs heuristics.
4534 * These work very well for non-Latin locales or those whose currency
4535 * symbol isn't a '$' nor plain ASCII text. But without LC_CTYPE and at
4536 * least MB_CUR_MAX, English locales with an ASCII currency symbol depend
4537 * on the name containing UTF-8 or not. */
7d74bb61 4538
47280b20 4539 /* Name of current locale corresponding to the input category */
8de4332b 4540 const char *save_input_locale = NULL;
47280b20
KW
4541
4542 bool is_utf8 = FALSE; /* The return value */
7d74bb61 4543
47280b20
KW
4544 /* The variables below are for the cache of previous lookups using this
4545 * function. The cache is a C string, described at the definition for
4546 * 'C_and_POSIX_utf8ness'.
4547 *
4548 * The first part of the cache is fixed, for the C and POSIX locales. The
4549 * varying part starts just after them. */
4550 char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness);
4551
4552 Size_t utf8ness_cache_size; /* Size of the varying portion */
4553 Size_t input_name_len; /* Length in bytes of save_input_locale */
4554 Size_t input_name_len_with_overhead; /* plus extra chars used to store
4555 the name in the cache */
4556 char * delimited; /* The name plus the delimiters used to store
4557 it in the cache */
51332242 4558 char buffer[CUR_LC_BUFFER_SIZE]; /* small buffer */
47280b20
KW
4559 char * name_pos; /* position of 'delimited' in the cache, or 0
4560 if not there */
4561
4562
7d4bcc4a
KW
4563# ifdef LC_ALL
4564
7d74bb61 4565 assert(category != LC_ALL);
7d4bcc4a
KW
4566
4567# endif
7d74bb61 4568
47280b20 4569 /* Get the desired category's locale */
d6189047 4570 save_input_locale = stdize_locale(savepv(do_setlocale_r(category, NULL)));
7d74bb61 4571 if (! save_input_locale) {
47280b20 4572 Perl_croak(aTHX_
d707d779
KW
4573 "panic: %s: %d: Could not find current %s locale, errno=%d\n",
4574 __FILE__, __LINE__, category_name(category), errno);
7d74bb61 4575 }
47280b20 4576
47280b20
KW
4577 DEBUG_L(PerlIO_printf(Perl_debug_log,
4578 "Current locale for %s is %s\n",
4579 category_name(category), save_input_locale));
4580
4581 input_name_len = strlen(save_input_locale);
4582
4583 /* In our cache, each name is accompanied by two delimiters and a single
4584 * utf8ness digit */
4585 input_name_len_with_overhead = input_name_len + 3;
4586
51332242
N
4587 if ( input_name_len_with_overhead <= CUR_LC_BUFFER_SIZE ) {
4588 /* we can use the buffer, avoid a malloc */
4589 delimited = buffer;
4590 } else { /* need a malloc */
4591 /* Allocate and populate space for a copy of the name surrounded by the
4592 * delimiters */
4593 Newx(delimited, input_name_len_with_overhead, char);
4594 }
4595
47280b20
KW
4596 delimited[0] = UTF8NESS_SEP[0];
4597 Copy(save_input_locale, delimited + 1, input_name_len, char);
4598 delimited[input_name_len+1] = UTF8NESS_PREFIX[0];
4599 delimited[input_name_len+2] = '\0';
4600
4601 /* And see if that is in the cache */
4602 name_pos = instr(PL_locale_utf8ness, delimited);
4603 if (name_pos) {
4604 is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0';
4605
4606# ifdef DEBUGGING
4607
4608 if (DEBUG_Lv_TEST || debug_initialization) {
4609 PerlIO_printf(Perl_debug_log, "UTF8ness for locale %s=%d, \n",
4610 save_input_locale, is_utf8);
4611 }
4612
4613# endif
4614
4615 /* And, if not already in that position, move it to the beginning of
4616 * the non-constant portion of the list, since it is the most recently
4617 * used. (We don't have to worry about overflow, since just moving
4618 * existing names around) */
4619 if (name_pos > utf8ness_cache) {
4620 Move(utf8ness_cache,
4621 utf8ness_cache + input_name_len_with_overhead,
4622 name_pos - utf8ness_cache, char);
4623 Copy(delimited,
4624 utf8ness_cache,
4625 input_name_len_with_overhead - 1, char);
4626 utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
4627 }
4628
51332242
N
4629 /* free only when not using the buffer */
4630 if ( delimited != buffer ) Safefree(delimited);
b07fffd1 4631 Safefree(save_input_locale);
47280b20 4632 return is_utf8;
7d74bb61
KW
4633 }
4634
47280b20
KW
4635 /* Here we don't have stored the utf8ness for the input locale. We have to
4636 * calculate it */
4637
94646a69 4638# if defined(USE_LOCALE_CTYPE) \
6201c220 4639 && ( defined(HAS_NL_LANGINFO) \
94646a69 4640 || (defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)))
7d74bb61 4641
0dec74cd 4642 {
962aa53f
KW
4643 const char *original_ctype_locale
4644 = switch_category_locale_to_template(LC_CTYPE,
4645 category,
4646 save_input_locale);
69014004 4647
7d74bb61 4648 /* Here the current LC_CTYPE is set to the locale of the category whose
0dec74cd 4649 * information is desired. This means that nl_langinfo() and mbtowc()
1d958db2 4650 * should give the correct results */
119ee68b 4651
0dec74cd
KW
4652# ifdef MB_CUR_MAX /* But we can potentially rule out UTF-8ness, avoiding
4653 calling the functions if we have this */
4654
4655 /* Standard UTF-8 needs at least 4 bytes to represent the maximum
4656 * Unicode code point. */
4657
de3a3072
KW
4658 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s: %d: MB_CUR_MAX=%d\n",
4659 __FILE__, __LINE__, (int) MB_CUR_MAX));
0dec74cd
KW
4660 if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) {
4661 is_utf8 = FALSE;
b5b2847c
KW
4662 restore_switched_locale(LC_CTYPE, original_ctype_locale);
4663 goto finish_and_return;
0dec74cd
KW
4664 }
4665
4666# endif
6201c220 4667# if defined(HAS_NL_LANGINFO)
7d4bcc4a 4668
0dec74cd
KW
4669 { /* The task is easiest if the platform has this POSIX 2001 function.
4670 Except on some platforms it can wrongly return "", so have to have
4671 a fallback. And it can return that it's UTF-8, even if there are
4672 variances from that. For example, Turkish locales may use the
4673 alternate dotted I rules, and sometimes it appears to be a
4674 defective locale definition. XXX We should probably check for
9de6fe47
KW
4675 these in the Latin1 range and warn (but on glibc, requires
4676 iswalnum() etc. due to their not handling 80-FF correctly */
4e6826bf 4677 const char *codeset = my_nl_langinfo(CODESET, FALSE);
c70a3e68 4678 /* FALSE => already in dest locale */
119ee68b 4679
af84e886 4680 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
c70a3e68
KW
4681 "\tnllanginfo returned CODESET '%s'\n", codeset));
4682
4683 if (codeset && strNE(codeset, "")) {
1d958db2 4684
1d075173
KW
4685 /* If the implementation of foldEQ() somehow were
4686 * to change to not go byte-by-byte, this could
4687 * read past end of string, as only one length is
4688 * checked. But currently, a premature NUL will
4689 * compare false, and it will stop there */
4690 is_utf8 = cBOOL( foldEQ(codeset, STR_WITH_LEN("UTF-8"))
4691 || foldEQ(codeset, STR_WITH_LEN("UTF8")));
1d958db2 4692
69014004
KW
4693 DEBUG_L(PerlIO_printf(Perl_debug_log,
4694 "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
4695 codeset, is_utf8));
b5b2847c
KW
4696 restore_switched_locale(LC_CTYPE, original_ctype_locale);
4697 goto finish_and_return;
1d958db2 4698 }
119ee68b
KW
4699 }
4700
7d4bcc4a 4701# endif
94646a69 4702# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
0dec74cd
KW
4703 /* We can see if this is a UTF-8-like locale if have mbtowc(). It was a
4704 * late adder to C89, so very likely to have it. However, testing has
4705 * shown that, like nl_langinfo() above, there are locales that are not
4706 * strictly UTF-8 that this will return that they are */
69014004 4707
0dec74cd 4708 {
119ee68b 4709 wchar_t wc;
51fc4b19 4710 int len;
48015184 4711 dSAVEDERRNO;
51fc4b19 4712
94646a69
KW
4713# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS)
4714
4715 mbstate_t ps;
4716
4717# endif
4718
4719 /* mbrtowc() and mbtowc() convert a byte string to a wide
4720 * character. Feed a byte string to one of them and check that the
4721 * result is the expected Unicode code point */
4722
4723# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS)
4724 /* Prefer this function if available, as it's reentrant */
4725
a16bc7c9 4726 memzero(&ps, sizeof(ps));;
94646a69
KW
4727 PERL_UNUSED_RESULT(mbrtowc(&wc, NULL, 0, &ps)); /* Reset any shift
4728 state */
48015184 4729 SETERRNO(0, 0);
94646a69 4730 len = mbrtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8), &ps);
48015184 4731 SAVE_ERRNO;
94646a69
KW
4732
4733# else
0dec74cd 4734
7953f73f 4735 MBTOWC_LOCK;
856b881c 4736 PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
48015184 4737 SETERRNO(0, 0);
b1d4925c 4738 len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
48015184 4739 SAVE_ERRNO;
7953f73f 4740 MBTOWC_UNLOCK;
51fc4b19 4741
94646a69
KW
4742# endif
4743
48015184 4744 RESTORE_ERRNO;
af84e886 4745 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
0dec74cd 4746 "\treturn from mbtowc; len=%d; code_point=%x; errno=%d\n",
48015184 4747 len, (unsigned int) wc, GET_ERRNO));
b1d4925c 4748
0dec74cd
KW
4749 is_utf8 = cBOOL( len == STRLENs(REPLACEMENT_CHARACTER_UTF8)
4750 && wc == (wchar_t) UNICODE_REPLACEMENT);
119ee68b 4751 }
7d4bcc4a 4752
17dd77cd
FP
4753# endif
4754
962aa53f 4755 restore_switched_locale(LC_CTYPE, original_ctype_locale);
47280b20 4756 goto finish_and_return;
7d74bb61 4757 }
119ee68b 4758
0dec74cd 4759# else
7d74bb61 4760
0dec74cd
KW
4761 /* Here, we must have a C89 compiler that doesn't have mbtowc(). Next
4762 * try looking at the currency symbol to see if it disambiguates
4763 * things. Often that will be in the native script, and if the symbol
4764 * isn't in UTF-8, we know that the locale isn't. If it is non-ASCII
4765 * UTF-8, we infer that the locale is too, as the odds of a non-UTF8
4766 * string being valid UTF-8 are quite small */
fa9b773e 4767
660ee27b
KW
4768# ifdef USE_LOCALE_MONETARY
4769
4770 /* If have LC_MONETARY, we can look at the currency symbol. Often that
4771 * will be in the native script. We do this one first because there is
4772 * just one string to examine, so potentially avoids work */
7d4bcc4a 4773
9db16864
KW
4774 {
4775 const char *original_monetary_locale
660ee27b
KW
4776 = switch_category_locale_to_template(LC_MONETARY,
4777 category,
4778 save_input_locale);
9db16864 4779 bool only_ascii = FALSE;
660ee27b 4780 const U8 * currency_string
4e6826bf 4781 = (const U8 *) my_nl_langinfo(CRNCYSTR, FALSE);
660ee27b
KW
4782 /* 2nd param not relevant for this item */
4783 const U8 * first_variant;
fa9b773e 4784
660ee27b
KW
4785 assert( *currency_string == '-'
4786 || *currency_string == '+'
4787 || *currency_string == '.');
97f4de96 4788
660ee27b 4789 currency_string++;
fa9b773e 4790
660ee27b 4791 if (is_utf8_invariant_string_loc(currency_string, 0, &first_variant))
9db16864
KW
4792 {
4793 DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
4794 only_ascii = TRUE;
4795 }
4796 else {
660ee27b 4797 is_utf8 = is_strict_utf8_string(first_variant, 0);
9db16864 4798 }
fa9b773e 4799
9db16864 4800 restore_switched_locale(LC_MONETARY, original_monetary_locale);
fa9b773e 4801
9db16864 4802 if (! only_ascii) {
fa9b773e 4803
9db16864
KW
4804 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
4805 * otherwise assume the locale is UTF-8 if and only if the symbol
4806 * is non-ascii UTF-8. */
af84e886 4807 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
9db16864
KW
4808 save_input_locale, is_utf8));
4809 goto finish_and_return;
4810 }
13542a67 4811 }
fa9b773e 4812
660ee27b 4813# endif /* USE_LOCALE_MONETARY */
7d4bcc4a 4814# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
15f7e74e 4815
9db16864
KW
4816 /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
4817 * the names of the months and weekdays, timezone, and am/pm indicator */
4818 {
4819 const char *original_time_locale
4820 = switch_category_locale_to_template(LC_TIME,
4821 category,
4822 save_input_locale);
4823 int hour = 10;
4824 bool is_dst = FALSE;
4825 int dom = 1;
4826 int month = 0;
4827 int i;
4828 char * formatted_time;
4829
4830 /* Here the current LC_TIME is set to the locale of the category
4831 * whose information is desired. Look at all the days of the week and
4832 * month names, and the timezone and am/pm indicator for UTF-8 variant
4833 * characters. The first such a one found will tell us if the locale
4834 * is UTF-8 or not */
4835
4836 for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
4837 formatted_time = my_strftime("%A %B %Z %p",
4838 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
4839 if ( ! formatted_time
4840 || is_utf8_invariant_string((U8 *) formatted_time, 0))
4841 {
15f7e74e 4842
9db16864
KW
4843 /* Here, we didn't find a non-ASCII. Try the next time through
4844 * with the complemented dst and am/pm, and try with the next
4845 * weekday. After we have gotten all weekdays, try the next
4846 * month */
4847 is_dst = ! is_dst;
4848 hour = (hour + 12) % 24;
4849 dom++;
4850 if (i > 6) {
4851 month++;
4852 }
4853 continue;
15f7e74e 4854 }
9db16864
KW
4855
4856 /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
4857 * false otherwise. But first, restore LC_TIME to its original
4858 * locale if we changed it */
4859 restore_switched_locale(LC_TIME, original_time_locale);
4860
af84e886 4861 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
9db16864
KW
4862 save_input_locale,
4863 is_utf8_string((U8 *) formatted_time, 0)));
4864 is_utf8 = is_utf8_string((U8 *) formatted_time, 0);
4865 goto finish_and_return;
15f7e74e
KW
4866 }
4867
9db16864
KW
4868 /* Falling off the end of the loop indicates all the names were just
4869 * ASCII. Go on to the next test. If we changed it, restore LC_TIME
4870 * to its original locale */
962aa53f 4871 restore_switched_locale(LC_TIME, original_time_locale);
af84e886 4872 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
15f7e74e
KW
4873 }
4874
7d4bcc4a 4875# endif
15f7e74e 4876
7d4bcc4a 4877# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
855aeb93 4878
9db16864
KW
4879 /* This code is ifdefd out because it was found to not be necessary in testing
4880 * on our dromedary test machine, which has over 700 locales. There, this
4881 * added no value to looking at the currency symbol and the time strings. I
4882 * left it in so as to avoid rewriting it if real-world experience indicates
4883 * that dromedary is an outlier. Essentially, instead of returning abpve if we
4884 * haven't found illegal utf8, we continue on and examine all the strerror()
4885 * messages on the platform for utf8ness. If all are ASCII, we still don't
4886 * know the answer; but otherwise we have a pretty good indication of the
4887 * utf8ness. The reason this doesn't help much is that the messages may not
4888 * have been translated into the locale. The currency symbol and time strings
4889 * are much more likely to have been translated. */
4890 {
4891 int e;
4892 bool non_ascii = FALSE;
4893 const char *original_messages_locale
4894 = switch_category_locale_to_template(LC_MESSAGES,
4895 category,
4896 save_input_locale);
4897 const char * errmsg = NULL;
4898
4899 /* Here the current LC_MESSAGES is set to the locale of the category
4900 * whose information is desired. Look through all the messages. We
4901 * can't use Strerror() here because it may expand to code that
4902 * segfaults in miniperl */
4903
4904 for (e = 0; e <= sys_nerr; e++) {
4905 errno = 0;
4906 errmsg = sys_errlist[e];
4907 if (errno || !errmsg) {
4908 break;
4909 }
4910 errmsg = savepv(errmsg);
4911 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
4912 non_ascii = TRUE;
4913 is_utf8 = is_utf8_string((U8 *) errmsg, 0);
4914 break;
4915 }
855aeb93 4916 }
9db16864 4917 Safefree(errmsg);
855aeb93 4918
9db16864 4919 restore_switched_locale(LC_MESSAGES, original_messages_locale);
855aeb93 4920
9db16864 4921 if (non_ascii) {
5857e934 4922
9db16864
KW
4923 /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
4924 * any non-ascii means it is one; otherwise we assume it isn't */
af84e886 4925 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
9db16864
KW
4926 save_input_locale,
4927 is_utf8));
4928 goto finish_and_return;
4929 }
855aeb93 4930
9db16864
KW
4931 DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
4932 }
855aeb93 4933
7d4bcc4a 4934# endif
0dec74cd 4935# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
92c0a900 4936 UTF-8 locale */
7d4bcc4a 4937
97f4de96
KW
4938 /* As a last resort, look at the locale name to see if it matches
4939 * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
4940 * return of setlocale(), is actually defined to be opaque, so we can't
4941 * really rely on the absence of various substrings in the name to indicate
4942 * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
4943 * be a UTF-8 locale. Similarly for the other common names */
4944
0dec74cd 4945 {
c36d8df8 4946 const Size_t final_pos = strlen(save_input_locale) - 1;
0dec74cd 4947
c36d8df8
KW
4948 if (final_pos >= 3) {
4949 const char *name = save_input_locale;
97f4de96 4950
c36d8df8
KW
4951 /* Find next 'U' or 'u' and look from there */
4952 while ((name += strcspn(name, "Uu") + 1)
4953 <= save_input_locale + final_pos - 2)
97f4de96 4954 {
c36d8df8
KW
4955 if ( isALPHA_FOLD_NE(*name, 't')
4956 || isALPHA_FOLD_NE(*(name + 1), 'f'))
4957 {
4958 continue;
4959 }
4960 name += 2;
4961 if (*(name) == '-') {
4962 if ((name > save_input_locale + final_pos - 1)) {
4963 break;
4964 }
4965 name++;
4966 }
4967 if (*(name) == '8') {
4968 DEBUG_L(PerlIO_printf(Perl_debug_log,
4969 "Locale %s ends with UTF-8 in name\n",
4970 save_input_locale));
4971 is_utf8 = TRUE;
4972 goto finish_and_return;
97f4de96 4973 }
97f4de96 4974 }
c36d8df8
KW
4975 DEBUG_L(PerlIO_printf(Perl_debug_log,
4976 "Locale %s doesn't end with UTF-8 in name\n",
4977 save_input_locale));
97f4de96 4978 }
97f4de96 4979
4d8d465a 4980# ifdef WIN32
7d4bcc4a 4981
c36d8df8
KW
4982 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
4983 if (memENDs(save_input_locale, final_pos, "65001")) {
4984 DEBUG_L(PerlIO_printf(Perl_debug_log,
8a0832a1 4985 "Locale %s ends with 65001 in name, is UTF-8 locale\n",
97f4de96 4986 save_input_locale));
c36d8df8
KW
4987 is_utf8 = TRUE;
4988 goto finish_and_return;
4989 }
7d4bcc4a 4990
4d8d465a 4991# endif
17dd77cd 4992 }
4d8d465a 4993# endif
97f4de96
KW
4994
4995 /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
4996 * since we are about to return FALSE anyway, there is no point in doing
4997 * this extra work */
7d4bcc4a 4998
0dec74cd 4999# if 0
97f4de96
KW
5000 if (instr(save_input_locale, "8859")) {
5001 DEBUG_L(PerlIO_printf(Perl_debug_log,
5002 "Locale %s has 8859 in name, not UTF-8 locale\n",
5003 save_input_locale));
47280b20
KW
5004 is_utf8 = FALSE;
5005 goto finish_and_return;
97f4de96 5006 }
0dec74cd 5007# endif
97f4de96 5008
69014004
KW
5009 DEBUG_L(PerlIO_printf(Perl_debug_log,
5010 "Assuming locale %s is not a UTF-8 locale\n",
5011 save_input_locale));
47280b20
KW
5012 is_utf8 = FALSE;
5013
0dec74cd
KW
5014# endif /* the code that is compiled when no modern LC_CTYPE */
5015
47280b20
KW
5016 finish_and_return:
5017
5018 /* Cache this result so we don't have to go through all this next time. */
5019 utf8ness_cache_size = sizeof(PL_locale_utf8ness)
5020 - (utf8ness_cache - PL_locale_utf8ness);
5021
5022 /* But we can't save it if it is too large for the total space available */
5023 if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) {
5024 Size_t utf8ness_cache_len = strlen(utf8ness_cache);
5025
5026 /* Here it can fit, but we may need to clear out the oldest cached
5027 * result(s) to do so. Check */
5028 if (utf8ness_cache_len + input_name_len_with_overhead
5029 >= utf8ness_cache_size)
5030 {
5031 /* Here we have to clear something out to make room for this.
5032 * Start looking at the rightmost place where it could fit and find
5033 * the beginning of the entry that extends past that. */
5034 char * cutoff = (char *) my_memrchr(utf8ness_cache,
5035 UTF8NESS_SEP[0],
5036 utf8ness_cache_size
5037 - input_name_len_with_overhead);
5038
5039 assert(cutoff);
5040 assert(cutoff >= utf8ness_cache);
5041
5042 /* This and all subsequent entries must be removed */
5043 *cutoff = '\0';
5044 utf8ness_cache_len = strlen(utf8ness_cache);
5045 }
5046
5047 /* Make space for the new entry */
5048 Move(utf8ness_cache,
5049 utf8ness_cache + input_name_len_with_overhead,
5050 utf8ness_cache_len + 1 /* Incl. trailing NUL */, char);
5051
5052 /* And insert it */
5053 Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char);
5054 utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
5055
72299e00 5056 if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1] & ~1) != '0') {
47280b20 5057 Perl_croak(aTHX_
7fcf9272
KW
5058 "panic: %s: %d: Corrupt utf8ness_cache=%s\nlen=%zu,"
5059 " inserted_name=%s, its_len=%zu\n",
47280b20
KW
5060 __FILE__, __LINE__,
5061 PL_locale_utf8ness, strlen(PL_locale_utf8ness),
5062 delimited, input_name_len_with_overhead);
5063 }
5064 }
5065
5066# ifdef DEBUGGING
5067
de3a3072
KW
5068 if (DEBUG_Lv_TEST) {
5069 const char * s = PL_locale_utf8ness;
5070
5071 /* Audit the structure */
5072 while (s < PL_locale_utf8ness + strlen(PL_locale_utf8ness)) {
5073 const char *e;
5074
5075 if (*s != UTF8NESS_SEP[0]) {
5076 Perl_croak(aTHX_
5077 "panic: %s: %d: Corrupt utf8ness_cache: missing"
5078 " separator %.*s<-- HERE %s\n",
5079 __FILE__, __LINE__,
fe301c93 5080 (int) (s - PL_locale_utf8ness), PL_locale_utf8ness,
de3a3072
KW
5081 s);
5082 }
5083 s++;
5084 e = strchr(s, UTF8NESS_PREFIX[0]);
5085 if (! e) {
01b91050 5086 e = PL_locale_utf8ness + strlen(PL_locale_utf8ness);
de3a3072
KW
5087 Perl_croak(aTHX_
5088 "panic: %s: %d: Corrupt utf8ness_cache: missing"
5089 " separator %.*s<-- HERE %s\n",
5090 __FILE__, __LINE__,
fe301c93 5091 (int) (e - PL_locale_utf8ness), PL_locale_utf8ness,
de3a3072
KW
5092 e);
5093 }
5094 e++;
5095 if (*e != '0' && *e != '1') {
5096 Perl_croak(aTHX_
5097 "panic: %s: %d: Corrupt utf8ness_cache: utf8ness"
5098 " must be [01] %.*s<-- HERE %s\n",
5099 __FILE__, __LINE__,
fe301c93
KW
5100 (int) (e + 1 - PL_locale_utf8ness),
5101 PL_locale_utf8ness, e + 1);
de3a3072
KW
5102 }
5103 if (ninstr(PL_locale_utf8ness, s, s-1, e)) {
5104 Perl_croak(aTHX_
5105 "panic: %s: %d: Corrupt utf8ness_cache: entry"
5106 " has duplicate %.*s<-- HERE %s\n",
5107 __FILE__, __LINE__,
fe301c93 5108 (int) (e - PL_locale_utf8ness), PL_locale_utf8ness,
de3a3072
KW
5109 e);
5110 }
5111 s = e + 1;
5112 }
5113 }
5114
47280b20 5115 if (DEBUG_Lv_TEST || debug_initialization) {
de3a3072 5116
47280b20
KW
5117 PerlIO_printf(Perl_debug_log,
5118 "PL_locale_utf8ness is now %s; returning %d\n",
5119 PL_locale_utf8ness, is_utf8);
5120 }
5121
5122# endif
5123
51332242
N
5124 /* free only when not using the buffer */
5125 if ( delimited != buffer ) Safefree(delimited);
fa9b773e 5126 Safefree(save_input_locale);
47280b20 5127 return is_utf8;
7d74bb61
KW
5128}
5129
8ef6e574 5130#endif
7d74bb61 5131
d6ded950
KW
5132bool
5133Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
5134{
5135 /* Internal function which returns if we are in the scope of a pragma that
5136 * enables the locale category 'category'. 'compiling' should indicate if
5137 * this is during the compilation phase (TRUE) or not (FALSE). */
5138
5139 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
5140
363d7d4a
JK
5141 SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0);
5142 if (! these_categories || these_categories == &PL_sv_placeholder) {
d6ded950
KW
5143 return FALSE;
5144 }
5145
5146 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
5147 * a valid unsigned */
5148 assert(category >= -1);
363d7d4a 5149 return cBOOL(SvUV(these_categories) & (1U << (category + 1)));
d6ded950
KW
5150}
5151
2c6ee1a7 5152char *
6ebbc862
KW
5153Perl_my_strerror(pTHX_ const int errnum)
5154{
5155 /* Returns a mortalized copy of the text of the error message associated
5156 * with 'errnum'. It uses the current locale's text unless the platform
5157 * doesn't have the LC_MESSAGES category or we are not being called from
5158 * within the scope of 'use locale'. In the former case, it uses whatever
5159 * strerror returns; in the latter case it uses the text from the C locale.
5160 *
5161 * The function just calls strerror(), but temporarily switches, if needed,
5162 * to the C locale */
5163
5164 char *errstr;
5165
52770946 5166#ifndef USE_LOCALE_MESSAGES
6ebbc862 5167
52770946
KW
5168 /* If platform doesn't have messages category, we don't do any switching to
5169 * the C locale; we just use whatever strerror() returns */
5170
5171 errstr = savepv(Strerror(errnum));
5172
5173#else /* Has locale messages */
5174
5175 const bool within_locale_scope = IN_LC(LC_MESSAGES);
2c6ee1a7 5176
39e69e77 5177# ifndef USE_ITHREADS
7aaa36b1 5178
39e69e77
KW
5179 /* This function is trivial without threads. */
5180 if (within_locale_scope) {
5181 errstr = savepv(strerror(errnum));
5182 }
5183 else {
b0bde642 5184 const char * save_locale = savepv(do_setlocale_c(LC_MESSAGES, NULL));
39e69e77
KW
5185
5186 do_setlocale_c(LC_MESSAGES, "C");
5187 errstr = savepv(strerror(errnum));
5188 do_setlocale_c(LC_MESSAGES, save_locale);
b0bde642 5189 Safefree(save_locale);
39e69e77
KW
5190 }
5191
fd639d5f 5192# elif defined(USE_POSIX_2008_LOCALE) \
8fdf38a3 5193 && defined(HAS_STRERROR_L)
39e69e77
KW
5194
5195 /* This function is also trivial if we don't have to worry about thread
5196 * safety and have strerror_l(), as it handles the switch of locales so we
5197 * don't have to deal with that. We don't have to worry about thread
5198 * safety if strerror_r() is also available. Both it and strerror_l() are
5199 * thread-safe. Plain strerror() isn't thread safe. But on threaded
5200 * builds when strerror_r() is available, the apparent call to strerror()
5201 * below is actually a macro that behind-the-scenes calls strerror_r(). */
43cb6651 5202
39e69e77 5203# ifdef HAS_STRERROR_R
7aaa36b1
KW
5204
5205 if (within_locale_scope) {
4eb27fc5 5206 errstr = savepv(strerror(errnum));
7aaa36b1
KW
5207 }
5208 else {
4eb27fc5 5209 errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
7aaa36b1
KW
5210 }
5211
43cb6651
KW
5212# else
5213
5214 /* Here we have strerror_l(), but not strerror_r() and we are on a
5215 * threaded-build. We use strerror_l() for everything, constructing a
5216 * locale to pass to it if necessary */
5217
5218 bool do_free = FALSE;
5219 locale_t locale_to_use;
5220
5221 if (within_locale_scope) {
5222 locale_to_use = uselocale((locale_t) 0);
5223 if (locale_to_use == LC_GLOBAL_LOCALE) {
5224 locale_to_use = duplocale(LC_GLOBAL_LOCALE);
5225 do_free = TRUE;
5226 }
5227 }
5228 else { /* Use C locale if not within 'use locale' scope */
5229 locale_to_use = PL_C_locale_obj;
5230 }
5231
5232 errstr = savepv(strerror_l(errnum, locale_to_use));
5233
5234 if (do_free) {
5235 freelocale(locale_to_use);
5236 }
5237
5238# endif
5239# else /* Doesn't have strerror_l() */
7aaa36b1 5240
8de4332b 5241 const char * save_locale = NULL;
c9dda6da 5242 bool locale_is_C = FALSE;
2c6ee1a7 5243
9de6fe47 5244 /* We have a critical section to prevent another thread from executing this
e9bc6d6b 5245 * same code at the same time. (On thread-safe perls, the LOCK is a
9de6fe47
KW
5246 * no-op.) Since this is the only place in core that changes LC_MESSAGES
5247 * (unless the user has called setlocale(), this works to prevent races. */
7953f73f 5248 SETLOCALE_LOCK;
6ebbc862 5249
9c8a6dc2
KW
5250 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5251 "my_strerror called with errnum %d\n", errnum));
6ebbc862 5252 if (! within_locale_scope) {
837ce802 5253 save_locale = do_setlocale_c(LC_MESSAGES, NULL);
c9dda6da 5254 if (! save_locale) {
d523eeeb 5255 SETLOCALE_UNLOCK;
d707d779
KW
5256 Perl_croak(aTHX_
5257 "panic: %s: %d: Could not find current LC_MESSAGES locale,"
5258 " errno=%d\n", __FILE__, __LINE__, errno);
c9dda6da
KW
5259 }
5260 else {
5261 locale_is_C = isNAME_C_OR_POSIX(save_locale);
2c6ee1a7 5262
c9dda6da
KW
5263 /* Switch to the C locale if not already in it */
5264 if (! locale_is_C) {
2c6ee1a7 5265
c9dda6da
KW
5266 /* The setlocale() just below likely will zap 'save_locale', so
5267 * create a copy. */
5268 save_locale = savepv(save_locale);
a83cff23
KW
5269 if (! do_setlocale_c(LC_MESSAGES, "C")) {
5270
5271 /* If, for some reason, the locale change failed, we
5272 * soldier on as best as possible under the circumstances,
5273 * using the current locale, and clear save_locale, so we
5274 * don't try to change back. On z/0S, all setlocale()
5275 * calls fail after you've created a thread. This is their
5276 * way of making sure the entire process is always a single
5277 * locale. This means that 'use locale' is always in place
5278 * for messages under these circumstances. */
5279 Safefree(save_locale);
5280 save_locale = NULL;
5281 }
c9dda6da 5282 }
6ebbc862 5283 }
6ebbc862 5284 } /* end of ! within_locale_scope */
9c8a6dc2
KW
5285 else {
5286 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n",
5287 __FILE__, __LINE__));
5288 }
a0b53297 5289
9c8a6dc2
KW
5290 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5291 "Any locale change has been done; about to call Strerror\n"));
52770946 5292 errstr = savepv(Strerror(errnum));
6ebbc862
KW
5293
5294 if (! within_locale_scope) {
c9dda6da 5295 if (save_locale && ! locale_is_C) {
837ce802 5296 if (! do_setlocale_c(LC_MESSAGES, save_locale)) {
d523eeeb 5297 SETLOCALE_UNLOCK;
d707d779 5298 Perl_croak(aTHX_
cfaae47f
KW
5299 "panic: %s: %d: setlocale restore to '%s' failed, errno=%d\n",
5300 __FILE__, __LINE__, save_locale, errno);
c9dda6da 5301 }
6ebbc862
KW
5302 Safefree(save_locale);
5303 }
5304 }
5305
7953f73f 5306 SETLOCALE_UNLOCK;
6ebbc862 5307
7aaa36b1 5308# endif /* End of doesn't have strerror_l */
d36adde0 5309# ifdef DEBUGGING
6affbbf0
KW
5310
5311 if (DEBUG_Lv_TEST) {
5312 PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '");
5313 print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);
5314 PerlIO_printf(Perl_debug_log, "'\n");
5315 }
5316
d36adde0
KW
5317# endif
5318#endif /* End of does have locale messages */
2c6ee1a7 5319
52770946 5320 SAVEFREEPV(errstr);
6ebbc862 5321 return errstr;
2c6ee1a7
KW
5322}
5323
66610fdd 5324/*
747c467a 5325
58e641fb
KW
5326=for apidoc switch_to_global_locale
5327
75920755 5328On systems without locale support, or on typical single-threaded builds, or on
58e641fb
KW
5329platforms that do not support per-thread locale operations, this function does
5330nothing. On such systems that do have locale support, only a locale global to
5331the whole program is available.
5332
5333On multi-threaded builds on systems that do have per-thread locale operations,
5334this function converts the thread it is running in to use the global locale.
5335This is for code that has not yet or cannot be updated to handle multi-threaded
5336locale operation. As long as only a single thread is so-converted, everything
5337works fine, as all the other threads continue to ignore the global one, so only
5338this thread looks at it.
5339
69c5e0db
KW
5340However, on Windows systems this isn't quite true prior to Visual Studio 15,
5341at which point Microsoft fixed a bug. A race can occur if you use the
5342following operations on earlier Windows platforms:
5343
5344=over
5345
5346=item L<POSIX::localeconv|POSIX/localeconv>
5347
5348=item L<I18N::Langinfo>, items C<CRNCYSTR> and C<THOUSEP>
5349
5350=item L<perlapi/Perl_langinfo>, items C<CRNCYSTR> and C<THOUSEP>
5351
5352=back
5353
5354The first item is not fixable (except by upgrading to a later Visual Studio
5355release), but it would be possible to work around the latter two items by using
5356the Windows API functions C<GetNumberFormat> and C<GetCurrencyFormat>; patches
5357welcome.
5358
58e641fb
KW
5359Without this function call, threads that use the L<C<setlocale(3)>> system
5360function will not work properly, as all the locale-sensitive functions will
5361look at the per-thread locale, and C<setlocale> will have no effect on this
5362thread.
5363
5364Perl code should convert to either call
5365L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in for the system
5366C<setlocale>) or use the methods given in L<perlcall> to call
5367L<C<POSIX::setlocale>|POSIX/setlocale>. Either one will transparently properly
5368handle all cases of single- vs multi-thread, POSIX 2008-supported or not.
5369
5370Non-Perl libraries, such as C<gtk>, that call the system C<setlocale> can
5371continue to work if this function is called before transferring control to the
5372library.
5373
5374Upon return from the code that needs to use the global locale,
5375L<C<sync_locale()>|perlapi/sync_locale> should be called to restore the safe
5376multi-thread operation.
5377
5378=cut
5379*/
5380
5381void
5382Perl_switch_to_global_locale()
5383{
5384
5385#ifdef USE_THREAD_SAFE_LOCALE
5386# ifdef WIN32
5387
5388 _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
5389
5390# else
5391# ifdef HAS_QUERYLOCALE
5392
5393 setlocale(LC_ALL, querylocale(LC_ALL_MASK, uselocale((locale_t) 0)));
5394
5395# else
5396
5397 {
5398 unsigned int i;
5399
5400 for (i = 0; i < LC_ALL_INDEX; i++) {
5401 setlocale(categories[i], do_setlocale_r(categories[i], NULL));
5402 }
5403 }
5404
5405# endif
5406
5407 uselocale(LC_GLOBAL_LOCALE);
5408
5409# endif
5410#endif
5411
5412}
5413
5414/*
5415
747c467a
KW
5416=for apidoc sync_locale
5417
b2e9ba0c
KW
5418L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or
5419change the locale (though changing the locale is antisocial and dangerous on
5420multi-threaded systems that don't have multi-thread safe locale operations.
5421(See L<perllocale/Multi-threaded operation>). Using the system
5422L<C<setlocale(3)>> should be avoided. Nevertheless, certain non-Perl libraries
5423called from XS, such as C<Gtk> do so, and this can't be changed. When the
5424locale is changed by XS code that didn't use
5425L<C<Perl_setlocale>|perlapi/Perl_setlocale>, Perl needs to be told that the
5426locale has changed. Use this function to do so, before returning to Perl.
5427
5428The return value is a boolean: TRUE if the global locale at the time of call
5429was in effect; and FALSE if a per-thread locale was in effect. This can be
5430used by the caller that needs to restore things as-they-were to decide whether
5431or not to call
5432L<C<Perl_switch_to_global_locale>|perlapi/switch_to_global_locale>.
747c467a
KW
5433
5434=cut
5435*/
5436
b2e9ba0c
KW
5437bool
5438Perl_sync_locale()
747c467a 5439{
d36adde0
KW
5440
5441#ifndef USE_LOCALE
5442
5443 return TRUE;
5444
5445#else
5446
e9bc6d6b 5447 const char * newlocale;
b2e9ba0c
KW
5448 dTHX;
5449
d36adde0 5450# ifdef USE_POSIX_2008_LOCALE
b2e9ba0c
KW
5451
5452 bool was_in_global_locale = FALSE;
5453 locale_t cur_obj = uselocale((locale_t) 0);
5454
5455 /* On Windows, unless the foreign code has turned off the thread-safe
5456 * locale setting, any plain setlocale() will have affected what we see, so
5457 * no need to worry. Otherwise, If the foreign code has done a plain
5458 * setlocale(), it will only affect the global locale on POSIX systems, but
5459 * will affect the */
5460 if (cur_obj == LC_GLOBAL_LOCALE) {
5461
d36adde0 5462# ifdef HAS_QUERY_LOCALE
b2e9ba0c
KW
5463
5464 do_setlocale_c(LC_ALL, setlocale(LC_ALL, NULL));
5465
d36adde0 5466# else
b2e9ba0c
KW
5467
5468 unsigned int i;
5469
5470 /* We can't trust that we can read the LC_ALL format on the
5471 * platform, so do them individually */
5472 for (i = 0; i < LC_ALL_INDEX; i++) {
5473 do_setlocale_r(categories[i], setlocale(categories[i], NULL));
5474 }
747c467a 5475
d36adde0 5476# endif
b2e9ba0c
KW
5477
5478 was_in_global_locale = TRUE;
5479 }
5480
d36adde0 5481# else
b2e9ba0c
KW
5482
5483 bool was_in_global_locale = TRUE;
5484
d36adde0
KW
5485# endif
5486# ifdef USE_LOCALE_CTYPE
7d4bcc4a 5487
b0bde642 5488 newlocale = savepv(do_setlocale_c(LC_CTYPE, NULL));
9f82ea3e
KW
5489 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5490 "%s:%d: %s\n", __FILE__, __LINE__,
5491 setlocale_debug_string(LC_CTYPE, NULL, newlocale)));
5492 new_ctype(newlocale);
b0bde642 5493 Safefree(newlocale);
747c467a 5494
d36adde0
KW
5495# endif /* USE_LOCALE_CTYPE */
5496# ifdef USE_LOCALE_COLLATE
7d4bcc4a 5497
b0bde642 5498 newlocale = savepv(do_setlocale_c(LC_COLLATE, NULL));
9f82ea3e
KW
5499 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5500 "%s:%d: %s\n", __FILE__, __LINE__,
5501 setlocale_debug_string(LC_COLLATE, NULL, newlocale)));
5502 new_collate(newlocale);
b0bde642 5503 Safefree(newlocale);
747c467a 5504
d36adde0
KW
5505# endif
5506# ifdef USE_LOCALE_NUMERIC
7d4bcc4a 5507
b0bde642 5508 newlocale = savepv(do_setlocale_c(LC_NUMERIC, NULL));
9f82ea3e
KW
5509 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5510 "%s:%d: %s\n", __FILE__, __LINE__,
5511 setlocale_debug_string(LC_NUMERIC, NULL, newlocale)));
5512 new_numeric(newlocale);
b0bde642 5513 Safefree(newlocale);
7d4bcc4a 5514
d36adde0 5515# endif /* USE_LOCALE_NUMERIC */
747c467a 5516
b2e9ba0c 5517 return was_in_global_locale;
d36adde0
KW
5518
5519#endif
5520
747c467a
KW
5521}
5522
5d1187d1
KW
5523#if defined(DEBUGGING) && defined(USE_LOCALE)
5524
a4f00dcc
KW
5525STATIC char *
5526S_setlocale_debug_string(const int category, /* category number,
5d1187d1
KW
5527 like LC_ALL */
5528 const char* const locale, /* locale name */
5529
5530 /* return value from setlocale() when attempting to
5531 * set 'category' to 'locale' */
5532 const char* const retval)
5533{
5534 /* Returns a pointer to a NUL-terminated string in static storage with
5535 * added text about the info passed in. This is not thread safe and will
5536 * be overwritten by the next call, so this should be used just to
fa07b8e5 5537 * formulate a string to immediately print or savepv() on. */
5d1187d1 5538
8c3a0f6c 5539 static char ret[256];
7d4bcc4a 5540
e5f10d49 5541 my_strlcpy(ret, "setlocale(", sizeof(ret));
b09aaf40 5542 my_strlcat(ret, category_name(category), sizeof(ret));
fa07b8e5 5543 my_strlcat(ret, ", ", sizeof(ret));
5d1187d1
KW
5544
5545 if (locale) {
fa07b8e5
KW
5546 my_strlcat(ret, "\"", sizeof(ret));
5547 my_strlcat(ret, locale, sizeof(ret));
5548 my_strlcat(ret, "\"", sizeof(ret));
5d1187d1
KW
5549 }
5550 else {
fa07b8e5 5551 my_strlcat(ret, "NULL", sizeof(ret));
5d1187d1
KW
5552 }
5553
fa07b8e5 5554 my_strlcat(ret, ") returned ", sizeof(ret));
5d1187d1
KW
5555
5556 if (retval) {
fa07b8e5
KW
5557 my_strlcat(ret, "\"", sizeof(ret));
5558 my_strlcat(ret, retval, sizeof(ret));
5559 my_strlcat(ret, "\"", sizeof(ret));
5d1187d1
KW
5560 }
5561 else {
fa07b8e5 5562 my_strlcat(ret, "NULL", sizeof(ret));
5d1187d1
KW
5563 }
5564
5565 assert(strlen(ret) < sizeof(ret));
5566
5567 return ret;
5568}
5569
5570#endif
747c467a 5571
e9bc6d6b
KW
5572void
5573Perl_thread_locale_init()
5574{
5575 /* Called from a thread on startup*/
5576
5577#ifdef USE_THREAD_SAFE_LOCALE
5578
5579 dTHX_DEBUGGING;
5580
5581 /* C starts the new thread in the global C locale. If we are thread-safe,
5582 * we want to not be in the global locale */
5583
5584 DEBUG_L(PerlIO_printf(Perl_debug_log,
5585 "%s:%d: new thread, initial locale is %s; calling setlocale\n",
5586 __FILE__, __LINE__, setlocale(LC_ALL, NULL)));
5587
5588# ifdef WIN32
5589
5590 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
5591
5592# else
5593
5594 Perl_setlocale(LC_ALL, "C");
5595
5596# endif
5597#endif
5598
5599}
5600
5601void
5602Perl_thread_locale_term()
5603{
5604 /* Called from a thread as it gets ready to terminate */
5605
5606#ifdef USE_THREAD_SAFE_LOCALE
5607
5608 /* C starts the new thread in the global C locale. If we are thread-safe,
5609 * we want to not be in the global locale */
5610
5611# ifndef WIN32
5612
5613 { /* Free up */
5614 locale_t cur_obj = uselocale(LC_GLOBAL_LOCALE);
e72200e7 5615 if (cur_obj != LC_GLOBAL_LOCALE && cur_obj != PL_C_locale_obj) {
e9bc6d6b
KW
5616 freelocale(cur_obj);
5617 }
5618 }
5619
5620# endif
5621#endif
5622
5623}
747c467a
KW
5624
5625/*
14d04a33 5626 * ex: set ts=8 sts=4 sw=4 et:
37442d52 5627 */