This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
locale.c: Refactor some derived #defines
[perl5.git] / locale.c
CommitLineData
98994639
HS
1/* locale.c
2 *
1129b882
NC
3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
98994639
HS
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550 12 * A Elbereth Gilthoniel,
cdad3b53 13 * silivren penna míriel
4ac71550 14 * o menel aglar elenath!
cdad3b53 15 * Na-chaered palan-díriel
4ac71550
TC
16 * o galadhremmin ennorath,
17 * Fanuilos, le linnathon
18 * nef aear, si nef aearon!
19 *
20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
98994639
HS
21 */
22
166f8a29
DM
23/* utility functions for handling locale-specific stuff like what
24 * character represents the decimal point.
0d071d52 25 *
7d4bcc4a
KW
26 * All C programs have an underlying locale. Perl code generally doesn't pay
27 * any attention to it except within the scope of a 'use locale'. For most
0d071d52
KW
28 * categories, it accomplishes this by just using different operations if it is
29 * in such scope than if not. However, various libc functions called by Perl
30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31 * are used to toggle between the current locale and the C locale depending on
a9ad02a8
KW
32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is
33 * switched to the C locale for outputting the message unless within the scope
34 * of 'use locale'.
e9bc6d6b
KW
35 *
36 * This code now has multi-thread-safe locale handling on systems that support
37 * that. This is completely transparent to most XS code. On earlier systems,
38 * it would be possible to emulate thread-safe locales, but this likely would
39 * involve a lot of locale switching, and would require XS code changes.
40 * Macros could be written so that the code wouldn't have to know which type of
fe1c1494
KW
41 * system is being used.
42 *
43 * Table-driven code is used for simplicity and clarity, as many operations
44 * differ only in which category is being worked on. However the system
45 * categories need not be small contiguous integers, so do not lend themselves
46 * to table lookup. Instead we have created our own equivalent values which
47 * are all small contiguous non-negative integers, and translation functions
48 * between the two sets. For category 'LC_foo', the name of our index is
49 * LC_foo_INDEX_. Various parallel tables, indexed by these, are used.
50 *
51 * Many of the macros and functions in this file have one of the suffixes '_c',
52 * '_r', or '_i'. khw found these useful in remembering what type of locale
53 * category to use as their parameter. '_r' takes an int category number as
54 * passed to setlocale(), like LC_ALL, LC_CTYPE, etc. The 'r' indicates that
55 * the value isn't known until runtime. '_c' also indicates such a category
56 * number, but its value is known at compile time. These are both converted
57 * into unsigned indexes into various tables of category information, where the
58 * real work is generally done. The tables are generated at compile-time based
59 * on platform characteristics and Configure options. They hide from the code
60 * many of the vagaries of the different locale implementations out there. You
61 * may have already guessed that '_i' indicates the parameter is such an
62 * unsigned index. Converting from '_r' to '_i' requires run-time lookup.
63 * '_c' is used to get cpp to do this at compile time. To avoid the runtime
64 * expense, the code is structured to use '_r' at the API level, and once
65 * converted, everything possible is done using the table indexes.
66 *
67 * On unthreaded perls, most operations expand out to just the basic
68 * setlocale() calls. The same is true on threaded perls on modern Windows
69 * systems where the same API, after set up, is used for thread-safe locale
70 * handling. On other systems, there is a completely different API, specified
71 * in POSIX 2008, to do thread-safe locales. On these systems, our
72 * emulate_setlocale_i() function is used to hide the different API from the
73 * outside. This makes it completely transparent to most XS code.
74 *
75 * A huge complicating factor is that the LC_NUMERIC category is normally held
76 * in the C locale, except during those relatively rare times when it needs to
77 * be in the underlying locale. There is a bunch of code to accomplish this,
78 * and to allow easy switches from one state to the other.
79 *
80 * z/OS (os390) is an outlier. Locales really don't work under threads when
81 * either the radix character isn't a dot, or attempts are made to change
82 * locales after the first thread is created. The reason is that IBM has made
83 * it thread-safe by refusing to change locales (returning failure if
84 * attempted) any time after an application has called pthread_create() to
85 * create another thread. The expectation is that an application will set up
86 * its locale information before the first fork, and be stable thereafter. But
87 * perl toggles LC_NUMERIC if the locale's radix character isn't a dot, as do
88 * the other toggles, which are less common.
166f8a29
DM
89 */
90
4767484d
KW
91/* If the environment says to, we can output debugging information during
92 * initialization. This is done before option parsing, and before any thread
93 * creation, so can be a file-level static. (Must come before #including
94 * perl.h) */
95#ifdef DEBUGGING
96static int debug_initialization = 0;
97# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
98# define DEBUG_LOCALE_INITIALIZATION_ debug_initialization
99#else
100# define debug_initialization 0
101# define DEBUG_INITIALIZATION_set(v)
102#endif
103
98994639
HS
104#include "EXTERN.h"
105#define PERL_IN_LOCALE_C
f7416781 106#include "perl_langinfo.h"
98994639
HS
107#include "perl.h"
108
a4af207c
JH
109#include "reentr.h"
110
0dec74cd
KW
111#ifdef I_WCHAR
112# include <wchar.h>
113#endif
5b64f24c
KW
114#ifdef I_WCTYPE
115# include <wctype.h>
116#endif
0dec74cd 117
48015184
KW
118
119/* Returns the Unix errno portion; ignoring any others. This is a macro here
120 * instead of putting it into perl.h, because unclear to khw what should be
121 * done generally. */
122#define GET_ERRNO saved_errno
123
63e5b0d7
KW
124/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
125 * return of setlocale(), then this is extremely likely to be the C or POSIX
126 * locale. However, the output of setlocale() is documented to be opaque, but
127 * the odds are extremely small that it would return these two strings for some
128 * other locale. Note that VMS in these two locales includes many non-ASCII
129 * characters as controls and punctuation (below are hex bytes):
130 * cntrl: 84-97 9B-9F
131 * punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
132 * Oddly, none there are listed as alphas, though some represent alphabetics
133 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
134#define isNAME_C_OR_POSIX(name) \
135 ( (name) != NULL \
136 && (( *(name) == 'C' && (*(name + 1)) == '\0') \
137 || strEQ((name), "POSIX")))
138
8ef6e574
KW
139#ifdef USE_LOCALE
140
47280b20
KW
141/* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far
142 * looked up. This is in the form of a C string: */
143
fe1c1494
KW
144# define UTF8NESS_SEP "\v"
145# define UTF8NESS_PREFIX "\f"
47280b20
KW
146
147/* So, the string looks like:
98994639 148 *
47280b20 149 * \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0
98994639 150 *
47280b20
KW
151 * where the digit 0 after the \a indicates that the locale starting just
152 * after the preceding \v is not UTF-8, and the digit 1 mean it is. */
153
154STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1);
155STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1);
156
fe1c1494 157# define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \
47280b20
KW
158 UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0"
159
160/* The cache is initialized to C_and_POSIX_utf8ness at start up. These are
161 * kept there always. The remining portion of the cache is LRU, with the
162 * oldest looked-up locale at the tail end */
163
d188fe6d
KW
164# ifdef DEBUGGING
165# define setlocale_debug_string_c(category, locale, result) \
166 setlocale_debug_string_i(category##_INDEX_, locale, result)
167# define setlocale_debug_string_r(category, locale, result) \
168 setlocale_debug_string_i(get_category_index(category, locale), \
169 locale, result)
170# endif
171
fe1c1494
KW
172/* Two parallel arrays indexed by our mapping of category numbers into small
173 * non-negative indexes; first the locale categories Perl uses on this system,
174 * used to do the inverse mapping. The second array is their names. These
175 * arrays are in mostly arbitrary order. */
e5f10d49 176
e129f2eb 177STATIC const int categories[] = {
e5f10d49
KW
178
179# ifdef USE_LOCALE_NUMERIC
180 LC_NUMERIC,
181# endif
182# ifdef USE_LOCALE_CTYPE
183 LC_CTYPE,
184# endif
185# ifdef USE_LOCALE_COLLATE
186 LC_COLLATE,
187# endif
188# ifdef USE_LOCALE_TIME
189 LC_TIME,
190# endif
191# ifdef USE_LOCALE_MESSAGES
192 LC_MESSAGES,
193# endif
194# ifdef USE_LOCALE_MONETARY
195 LC_MONETARY,
196# endif
9821811f
KW
197# ifdef USE_LOCALE_ADDRESS
198 LC_ADDRESS,
199# endif
200# ifdef USE_LOCALE_IDENTIFICATION
201 LC_IDENTIFICATION,
202# endif
203# ifdef USE_LOCALE_MEASUREMENT
204 LC_MEASUREMENT,
205# endif
206# ifdef USE_LOCALE_PAPER
207 LC_PAPER,
208# endif
209# ifdef USE_LOCALE_TELEPHONE
210 LC_TELEPHONE,
211# endif
36dbc955
KW
212# ifdef USE_LOCALE_SYNTAX
213 LC_SYNTAX,
214# endif
215# ifdef USE_LOCALE_TOD
216 LC_TOD,
217# endif
e5f10d49
KW
218# ifdef LC_ALL
219 LC_ALL,
220# endif
7e1cacef
KW
221
222 /* Placeholder as a precaution if code fails to check the return of
223 * get_category_index(), which returns this element to indicate an error */
224 -1
e5f10d49
KW
225};
226
227/* The top-most real element is LC_ALL */
228
e129f2eb 229STATIC const char * const category_names[] = {
e5f10d49
KW
230
231# ifdef USE_LOCALE_NUMERIC
232 "LC_NUMERIC",
233# endif
234# ifdef USE_LOCALE_CTYPE
235 "LC_CTYPE",
236# endif
237# ifdef USE_LOCALE_COLLATE
238 "LC_COLLATE",
239# endif
240# ifdef USE_LOCALE_TIME
241 "LC_TIME",
242# endif
243# ifdef USE_LOCALE_MESSAGES
244 "LC_MESSAGES",
245# endif
246# ifdef USE_LOCALE_MONETARY
247 "LC_MONETARY",
248# endif
9821811f
KW
249# ifdef USE_LOCALE_ADDRESS
250 "LC_ADDRESS",
251# endif
252# ifdef USE_LOCALE_IDENTIFICATION
253 "LC_IDENTIFICATION",
254# endif
255# ifdef USE_LOCALE_MEASUREMENT
256 "LC_MEASUREMENT",
257# endif
258# ifdef USE_LOCALE_PAPER
259 "LC_PAPER",
260# endif
261# ifdef USE_LOCALE_TELEPHONE
262 "LC_TELEPHONE",
263# endif
36dbc955
KW
264# ifdef USE_LOCALE_SYNTAX
265 "LC_SYNTAX",
266# endif
267# ifdef USE_LOCALE_TOD
268 "LC_TOD",
269# endif
e5f10d49
KW
270# ifdef LC_ALL
271 "LC_ALL",
272# endif
7e1cacef
KW
273
274 /* Placeholder as a precaution if code fails to check the return of
275 * get_category_index(), which returns this element to indicate an error */
276 NULL
277};
e5f10d49 278
c7dc1ea8
KW
279/* A few categories require additional setup when they are changed. This table
280 * points to the functions that do that setup */
281STATIC void (*update_functions[]) (pTHX_ const char *) = {
282# ifdef USE_LOCALE_NUMERIC
283 S_new_numeric,
284# endif
285# ifdef USE_LOCALE_CTYPE
286 S_new_ctype,
287# endif
288# ifdef USE_LOCALE_COLLATE
289 S_new_collate,
290# endif
291# ifdef USE_LOCALE_TIME
292 NULL,
293# endif
294# ifdef USE_LOCALE_MESSAGES
295 NULL,
296# endif
297# ifdef USE_LOCALE_MONETARY
298 NULL,
299# endif
300# ifdef USE_LOCALE_ADDRESS
301 NULL,
302# endif
303# ifdef USE_LOCALE_IDENTIFICATION
304 NULL,
305# endif
306# ifdef USE_LOCALE_MEASUREMENT
307 NULL,
308# endif
309# ifdef USE_LOCALE_PAPER
310 NULL,
311# endif
312# ifdef USE_LOCALE_TELEPHONE
313 NULL,
314# endif
315# ifdef USE_LOCALE_SYNTAX
316 NULL,
317# endif
318# ifdef USE_LOCALE_TOD
319 NULL,
320# endif
321 /* No harm done to have this even without an LC_ALL */
322 S_new_LC_ALL,
323
324 /* Placeholder as a precaution if code fails to check the return of
325 * get_category_index(), which returns this element to indicate an error */
326 NULL
327};
328
e5f10d49
KW
329# ifdef LC_ALL
330
331 /* On systems with LC_ALL, it is kept in the highest index position. (-2
332 * to account for the final unused placeholder element.) */
333# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
e5f10d49
KW
334# else
335
336 /* On systems without LC_ALL, we pretend it is there, one beyond the real
337 * top element, hence in the unused placeholder element. */
338# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
e5f10d49
KW
339# endif
340
341/* Pretending there is an LC_ALL element just above allows us to avoid most
342 * special cases. Most loops through these arrays in the code below are
343 * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'. They will work
344 * on either type of system. But the code must be written to not access the
4fd76d49
KW
345 * element at 'LC_ALL_INDEX_' except on platforms that have it. This can be
346 * checked for at compile time by using the #define LC_ALL_INDEX_ which is only
948523db 347 * defined if we do have LC_ALL. */
e5f10d49 348
7e1cacef
KW
349STATIC unsigned int
350S_get_category_index(const int category, const char * locale)
351{
352 /* Given a category, return the equivalent internal index we generally use
353 * instead.
354 *
355 * 'locale' is for use in any generated diagnostics, and may be NULL
356 *
357 * Some sort of hash could be used instead of this loop, but the number of
358 * elements is so far at most 12 */
359
360 unsigned int i;
fef64ad9 361 const char * conditional_warn_text = "; can't set it to ";
7e1cacef
KW
362
363 PERL_ARGS_ASSERT_GET_CATEGORY_INDEX;
364
365# ifdef LC_ALL
366 for (i = 0; i <= LC_ALL_INDEX_; i++)
367# else
368 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)
369# endif
370 {
371 if (category == categories[i]) {
372 dTHX_DEBUGGING;
373 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
374 "%s:%d: index of category %d (%s) is %d\n",
375 __FILE__, __LINE__, category, category_names[i], i));
376 return i;
377 }
378 }
379
380 /* Here, we don't know about this category, so can't handle it. */
fef64ad9 381
7e1cacef 382 if (! locale) {
fef64ad9
KW
383 locale = "";
384 conditional_warn_text = "";
7e1cacef 385 }
fef64ad9
KW
386
387 /* diag_listed_as: Unknown locale category %d; can't set it to %s */
7e1cacef 388 Perl_warner_nocontext(packWARN(WARN_LOCALE),
fef64ad9
KW
389 "Unknown locale category %d%s%s",
390 category, conditional_warn_text, locale);
7e1cacef 391
01c966a3
KW
392# ifdef EINVAL
393
394 SETERRNO(EINVAL, LIB_INVARG);
395
396# endif
397
7e1cacef
KW
398 /* Return an out-of-bounds value */
399 return NOMINAL_LC_ALL_INDEX + 1;
400}
401
b09aaf40
KW
402STATIC const char *
403S_category_name(const int category)
404{
fef64ad9 405 unsigned int index;
b09aaf40 406
fef64ad9 407 index = get_category_index(category, NULL);
b09aaf40 408
fef64ad9
KW
409 if (index <= NOMINAL_LC_ALL_INDEX) {
410 return category_names[index];
b09aaf40
KW
411 }
412
fef64ad9 413 return Perl_form_nocontext("%d (unknown)", category);
b09aaf40
KW
414}
415
948523db 416#endif /* ifdef USE_LOCALE */
8ef6e574 417
94202fb8
KW
418#define setlocale_failure_panic_c( \
419 cat, current, failed, caller_0_line, caller_1_line) \
420 setlocale_failure_panic_i(cat##_INDEX_, current, failed, \
421 caller_0_line, caller_1_line)
422
fe1c1494
KW
423/* porcelain_setlocale() presents a consistent POSIX-compliant interface to
424 * setlocale(). Windows requres a customized base-level setlocale() */
e9bc6d6b 425#ifdef WIN32
fb24522c 426# define porcelain_setlocale(cat, locale) win32_setlocale(cat, locale)
e9bc6d6b 427#else
d19f9f01
KW
428# define porcelain_setlocale(cat, locale) \
429 ((const char *) setlocale(cat, locale))
e9bc6d6b
KW
430#endif
431
32899cfb
KW
432/* The next layer up is to catch vagaries and bugs in the libc setlocale return
433 * value */
434#ifdef stdize_locale
435# define stdized_setlocale(cat, locale) \
436 stdize_locale(cat, porcelain_setlocale(cat, locale), \
437 &PL_stdize_locale_buf, &PL_stdize_locale_bufsize, __LINE__)
438#else
439# define stdized_setlocale(cat, locale) porcelain_setlocale(cat, locale)
440#endif
441
3980cddb 442/* The next many lines form a layer above the close-to-the-metal 'porcelain'
32899cfb
KW
443 * and 'stdized' macros. They are used to present a uniform API to the rest of
444 * the code in this file in spite of the disparate underlying implementations.
445 * */
fe1c1494 446
e9bc6d6b
KW
447#ifndef USE_POSIX_2008_LOCALE
448
fe1c1494
KW
449/* For non-threaded perls (which we are not to use the POSIX 2008 API on), or a
450 * thread-safe Windows one in which threading is invisible to us, the added
451 * layer just calls the base-level functions. See the introductory comments in
452 * this file for the meaning of the suffixes '_c', '_r', '_i'. */
453
50ffb02b
KW
454# define setlocale_r(cat, locale) stdized_setlocale(cat, locale)
455# define setlocale_i(i, locale) setlocale_r(categories[i], locale)
456# define setlocale_c(cat, locale) setlocale_r(cat, locale)
3980cddb 457
94202fb8
KW
458# define void_setlocale_i(i, locale) \
459 STMT_START { \
460 if (! porcelain_setlocale(categories[i], locale)) { \
461 setlocale_failure_panic_i(i, NULL, locale, __LINE__, 0); \
462 NOT_REACHED; /* NOTREACHED */ \
463 } \
464 } STMT_END
465# define void_setlocale_c(cat, locale) \
466 void_setlocale_i(cat##_INDEX_, locale)
467# define void_setlocale_r(cat, locale) \
468 void_setlocale_i(get_category_index(cat, locale), locale)
3980cddb 469
50ffb02b 470# define bool_setlocale_r(cat, locale) \
32899cfb
KW
471 cBOOL(porcelain_setlocale(cat, locale))
472# define bool_setlocale_i(i, locale) \
473 bool_setlocale_c(categories[i], locale)
50ffb02b 474# define bool_setlocale_c(cat, locale) bool_setlocale_r(cat, locale)
a683a66a 475
50ffb02b
KW
476# define querylocale_r(cat) setlocale_r(cat, NULL)
477# define querylocale_c(cat) querylocale_r(cat)
a683a66a
KW
478# define querylocale_i(i) querylocale_c(categories[i])
479
45cef8fb 480# define FIX_GLIBC_LC_MESSAGES_BUG(i)
e9bc6d6b 481
32899cfb 482#else /* Below is defined(POSIX 2008) */
e9bc6d6b 483
fe1c1494
KW
484/* Here, there is a completely different API to get thread-safe locales. We
485 * emulate the setlocale() API with our own function(s). setlocale categories,
486 * like LC_NUMERIC, are not valid here for the POSIX 2008 API. Instead, there
487 * are equivalents, like LC_NUMERIC_MASK, which we use instead, converting to
488 * by using get_category_index() followed by table lookup. */
489
3980cddb
KW
490# define setlocale_i(i, locale) emulate_setlocale_i(i, locale)
491# define setlocale_c(cat, locale) setlocale_i(cat##_INDEX_, locale)
492# define setlocale_r(cat, locale) \
493 setlocale_i(get_category_index(cat, locale), locale)
494
495# define void_setlocale_i(i, locale) ((void) setlocale_i(i, locale))
496# define void_setlocale_c(cat, locale) \
497 void_setlocale_i(cat##_INDEX_, locale)
498# define void_setlocale_r(cat, locale) ((void) setlocale_r(cat, locale))
499
500# define bool_setlocale_i(i, locale) cBOOL(setlocale_i(i, locale))
501# define bool_setlocale_c(cat, locale) \
502 bool_setlocale_i(cat##_INDEX_, locale)
503# define bool_setlocale_r(cat, locale) cBOOL(setlocale_r(cat, locale))
e9bc6d6b 504
a683a66a
KW
505# define querylocale_i(i) my_querylocale_i(i)
506# define querylocale_c(cat) querylocale_i(cat##_INDEX_)
507# define querylocale_r(cat) querylocale_i(get_category_index(cat,NULL))
508
67c13e2e 509# ifndef USE_QUERYLOCALE
434e0069 510# define USE_PL_CURLOCALES
67c13e2e 511# else
d92b203c
KW
512# define isSINGLE_BIT_SET(mask) isPOWER_OF_2(mask)
513
514 /* This code used to think querylocale() was valid on LC_ALL. Make sure
515 * all instances of that have been removed */
516# define QUERYLOCALE_ASSERT(index) \
517 __ASSERT_(isSINGLE_BIT_SET(category_masks[index]))
67c13e2e
KW
518# if ! defined(HAS_QUERYLOCALE) && defined(_NL_LOCALE_NAME)
519# define querylocale_l(index, locale_obj) \
d92b203c
KW
520 (QUERYLOCALE_ASSERT(index) \
521 nl_langinfo_l(_NL_LOCALE_NAME(categories[index]), locale_obj))
67c13e2e
KW
522# else
523# define querylocale_l(index, locale_obj) \
d92b203c
KW
524 (QUERYLOCALE_ASSERT(index) \
525 querylocale(category_masks[index], locale_obj))
67c13e2e 526# endif
434e0069 527# endif
45cef8fb 528# if ! defined(__GLIBC__) || ! defined(USE_LOCALE_MESSAGES)
45cef8fb 529# define FIX_GLIBC_LC_MESSAGES_BUG(i)
45cef8fb
KW
530# else /* Invalidate glibc cache of loaded translations, see [perl #134264] */
531
532# include <libintl.h>
533# define FIX_GLIBC_LC_MESSAGES_BUG(i) \
534 STMT_START { \
4fd76d49 535 if ((i) == LC_MESSAGES_INDEX_) { \
45cef8fb
KW
536 textdomain(textdomain(NULL)); \
537 } \
538 } STMT_END
539
540# endif
541
c7dc1ea8 542/* A fourth array, parallel to the ones above to map from category to its
e9bc6d6b 543 * equivalent mask */
e129f2eb 544STATIC const int category_masks[] = {
e9bc6d6b
KW
545# ifdef USE_LOCALE_NUMERIC
546 LC_NUMERIC_MASK,
547# endif
548# ifdef USE_LOCALE_CTYPE
549 LC_CTYPE_MASK,
550# endif
551# ifdef USE_LOCALE_COLLATE
552 LC_COLLATE_MASK,
553# endif
554# ifdef USE_LOCALE_TIME
555 LC_TIME_MASK,
556# endif
557# ifdef USE_LOCALE_MESSAGES
558 LC_MESSAGES_MASK,
559# endif
560# ifdef USE_LOCALE_MONETARY
561 LC_MONETARY_MASK,
562# endif
563# ifdef USE_LOCALE_ADDRESS
564 LC_ADDRESS_MASK,
565# endif
566# ifdef USE_LOCALE_IDENTIFICATION
567 LC_IDENTIFICATION_MASK,
568# endif
569# ifdef USE_LOCALE_MEASUREMENT
570 LC_MEASUREMENT_MASK,
571# endif
572# ifdef USE_LOCALE_PAPER
573 LC_PAPER_MASK,
574# endif
575# ifdef USE_LOCALE_TELEPHONE
576 LC_TELEPHONE_MASK,
577# endif
36dbc955
KW
578# ifdef USE_LOCALE_SYNTAX
579 LC_SYNTAX_MASK,
580# endif
581# ifdef USE_LOCALE_TOD
582 LC_TOD_MASK,
583# endif
e9bc6d6b
KW
584 /* LC_ALL can't be turned off by a Configure
585 * option, and in Posix 2008, should always be
586 * here, so compile it in unconditionally.
587 * This could catch some glitches at compile
588 * time */
7e1cacef
KW
589 LC_ALL_MASK,
590
591 /* Placeholder as a precaution if code fails to check the return of
592 * get_category_index(), which returns this element to indicate an error */
593 0
fe1c1494 594};
e9bc6d6b 595
935c09ec
KW
596# define my_querylocale_c(cat) my_querylocale_i(cat##_INDEX_)
597
e9bc6d6b 598STATIC const char *
935c09ec 599S_my_querylocale_i(pTHX_ const unsigned int index)
e9bc6d6b 600{
935c09ec
KW
601 /* This function returns the name of the locale category given by the input
602 * index into our parallel tables of them.
e9bc6d6b
KW
603 *
604 * POSIX 2008, for some sick reason, chose not to provide a method to find
935c09ec
KW
605 * the category name of a locale, discarding a basic linguistic tenet that
606 * for any object, people will create a name for it. Some vendors have
607 * created a querylocale() function to do just that. This function is a
608 * lot simpler to implement on systems that have this. Otherwise, we have
609 * to keep track of what the locale has been set to, so that we can return
610 * its name so as to emulate setlocale(). It's also possible for C code in
611 * some library to change the locale without us knowing it, though as of
e9bc6d6b
KW
612 * September 2017, there are no occurrences in CPAN of uselocale(). Some
613 * libraries do use setlocale(), but that changes the global locale, and
935c09ec 614 * threads using per-thread locales will just ignore those changes. */
e9bc6d6b 615
4bcc858c 616 int category;
935c09ec 617 const locale_t cur_obj = uselocale((locale_t) 0);
d92b203c 618 const char * retval;
e9bc6d6b 619
935c09ec 620 PERL_ARGS_ASSERT_MY_QUERYLOCALE_I;
4bcc858c 621 assert(index <= NOMINAL_LC_ALL_INDEX);
e9bc6d6b 622
4bcc858c 623 category = categories[index];
e9bc6d6b 624
935c09ec 625 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: my_querylocale_i %p\n",
94d521c0 626 __FILE__, __LINE__, cur_obj));
e9bc6d6b 627 if (cur_obj == LC_GLOBAL_LOCALE) {
d92b203c 628 retval = porcelain_setlocale(category, NULL);
e9bc6d6b 629 }
d92b203c 630 else {
e9bc6d6b 631
67c13e2e 632# ifdef USE_QUERYLOCALE
e9bc6d6b 633
d92b203c
KW
634 /* We don't currently keep records when there is querylocale(), so have
635 * to get it anew each time */
636 retval = (index == LC_ALL_INDEX_)
637 ? calculate_LC_ALL(cur_obj)
638 : querylocale_l(index, cur_obj);
e9bc6d6b 639
d2b24094 640# else
ecdda939 641
d92b203c
KW
642 /* But we do have up-to-date values when we keep our own records */
643 retval = PL_curlocales[index];
e9bc6d6b 644
d92b203c 645# endif
e9bc6d6b 646
e9bc6d6b
KW
647 }
648
94d521c0 649 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
d92b203c
KW
650 "%s:%d: my_querylocale_i(%s) returning '%s'\n",
651 __FILE__, __LINE__, category_names[index], retval));
652 return retval;
935c09ec
KW
653}
654
655STATIC const char *
656S_emulate_setlocale_i(pTHX_ const unsigned int index, const char * locale)
657{
658 /* This function effectively performs a setlocale() on just the current
659 * thread; thus it is thread-safe. It does this by using the POSIX 2008
660 * locale functions to emulate the behavior of setlocale(). Similar to
661 * regular setlocale(), the return from this function points to memory that
662 * can be overwritten by other system calls, so needs to be copied
663 * immediately if you need to retain it. The difference here is that
664 * system calls besides another setlocale() can overwrite it.
665 *
666 * By doing this, most locale-sensitive functions become thread-safe. The
667 * exceptions are mostly those that return a pointer to static memory.
668 *
669 * This function takes our internal index of the 'category' setlocale is
670 * called with, and the 'locale' to set the category to. It uses the
671 * index to find the category mask that the POSIX 2008 functions use. */
672
673 int mask;
674 int category;
675 locale_t old_obj;
676 locale_t new_obj;
677
678 PERL_ARGS_ASSERT_EMULATE_SETLOCALE_I;
679 assert(index <= NOMINAL_LC_ALL_INDEX);
680
681 mask = category_masks[index];
682 category = categories[index];
683
684 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
685 "%s:%d: emulate_setlocale_i input=%d (%s), mask=0x%x, \"%s\", %d\n",
686 __FILE__, __LINE__, category, category_name(category), mask,
687 locale, index));
e9bc6d6b 688
935c09ec
KW
689 /* If just querying what the existing locale is ... */
690 if (locale == NULL) {
691 return my_querylocale_i(index);
692 }
e9bc6d6b 693
67c13e2e 694# ifndef USE_QUERYLOCALE
e9bc6d6b
KW
695
696 if (strEQ(locale, "")) {
697
698 /* For non-querylocale() systems, we do the setting of "" ourselves to
699 * be sure that we really know what's going on. We follow the Linux
700 * documented behavior (but if that differs from the actual behavior,
701 * this won't work exactly as the OS implements). We go out and
702 * examine the environment based on our understanding of how the system
fe1c1494
KW
703 * works, and use that to figure things out.
704 *
705 * Another option would be to toggle to the global locale, and do a
706 * straight setlocale(LC_ALL, ""). But that could cause races with any
707 * other thread that has also switched. That's probably a rare event,
708 * and we could have a global boolean that indicates if any thread has
709 * switched, but we'd still need the following backup code anyway. The
710 * only real reason to make the switch is because some alien library
711 * that can't be changed, like GTk, is doing its own setlocales, */
e9bc6d6b
KW
712
713 const char * const lc_all = PerlEnv_getenv("LC_ALL");
714
715 /* Use any "LC_ALL" environment variable, as it overrides everything
716 * else. */
717 if (lc_all && strNE(lc_all, "")) {
718 locale = lc_all;
719 }
720 else {
721
722 /* Otherwise, we need to dig deeper. Unless overridden, the
723 * default is the LANG environment variable; if it doesn't exist,
724 * then "C" */
725
726 const char * default_name;
727
207cc8df 728 default_name = PerlEnv_getenv("LANG");
e9bc6d6b
KW
729
730 if (! default_name || strEQ(default_name, "")) {
929a7f8c 731 default_name = "C";
e9bc6d6b 732 }
e9bc6d6b
KW
733
734 if (category != LC_ALL) {
735 const char * const name = PerlEnv_getenv(category_names[index]);
736
737 /* Here we are setting a single category. Assume will have the
738 * default name */
739 locale = default_name;
740
741 /* But then look for an overriding environment variable */
742 if (name && strNE(name, "")) {
743 locale = name;
744 }
745 }
746 else {
747 bool did_override = FALSE;
748 unsigned int i;
749
750 /* Here, we are getting LC_ALL. Any categories that don't have
751 * a corresponding environment variable set should be set to
752 * LANG, or to "C" if there is no LANG. If no individual
753 * categories differ from this, we can just set LC_ALL. This
754 * is buggy on systems that have extra categories that we don't
755 * know about. If there is an environment variable that sets
756 * that category, we won't know to look for it, and so our use
757 * of LANG or "C" improperly overrides it. On the other hand,
758 * if we don't do what is done here, and there is no
759 * environment variable, the category's locale should be set to
760 * LANG or "C". So there is no good solution. khw thinks the
761 * best is to look at systems to see what categories they have,
762 * and include them, and then to assume that we know the
763 * complete set */
764
4fd76d49 765 for (i = 0; i < LC_ALL_INDEX_; i++) {
e9bc6d6b 766 const char * const env_override
24f3e849 767 = PerlEnv_getenv(category_names[i]);
e9bc6d6b
KW
768 const char * this_locale = ( env_override
769 && strNE(env_override, ""))
770 ? env_override
771 : default_name;
4bcc858c 772 if (! emulate_setlocale_i(i, this_locale))
6435f98d 773 {
6435f98d
KW
774 return NULL;
775 }
e9bc6d6b
KW
776
777 if (strNE(this_locale, default_name)) {
778 did_override = TRUE;
779 }
e9bc6d6b
KW
780 }
781
782 /* If all the categories are the same, we can set LC_ALL to
783 * that */
784 if (! did_override) {
785 locale = default_name;
786 }
787 else {
788
789 /* Here, LC_ALL is no longer valid, as some individual
790 * categories don't match it. We call ourselves
791 * recursively, as that will execute the code that
792 * generates the proper locale string for this situation.
793 * We don't do the remainder of this function, as that is
794 * to update our records, and we've just done that for the
795 * individual categories in the loop above, and doing so
796 * would cause LC_ALL to be done as well */
a683a66a 797 return querylocale_c(LC_ALL);
e9bc6d6b
KW
798 }
799 }
800 }
6aba5c5e 801 } /* End of this being setlocale(LC_foo, "") */
e9bc6d6b
KW
802 else if (strchr(locale, ';')) {
803
804 /* LC_ALL may actually incude a conglomeration of various categories.
805 * Without querylocale, this code uses the glibc (as of this writing)
806 * syntax for representing that, but that is not a stable API, and
807 * other platforms do it differently, so we have to handle all cases
808 * ourselves */
809
f0023d47 810 unsigned int i;
e9bc6d6b
KW
811 const char * s = locale;
812 const char * e = locale + strlen(locale);
813 const char * p = s;
814 const char * category_end;
815 const char * name_start;
816 const char * name_end;
817
f0023d47
KW
818 /* If the string that gives what to set doesn't include all categories,
819 * the omitted ones get set to "C". To get this behavior, first set
820 * all the individual categories to "C", and override the furnished
821 * ones below */
4fd76d49 822 for (i = 0; i < LC_ALL_INDEX_; i++) {
4bcc858c 823 if (! emulate_setlocale_i(i, "C")) {
f0023d47
KW
824 return NULL;
825 }
826 }
827
e9bc6d6b 828 while (s < e) {
e9bc6d6b
KW
829
830 /* Parse through the category */
831 while (isWORDCHAR(*p)) {
832 p++;
833 }
834 category_end = p;
835
836 if (*p++ != '=') {
837 Perl_croak(aTHX_
f052c885
KW
838 "panic: %s: %d: Unexpected character in locale category"
839 " name '%02X",
e9bc6d6b
KW
840 __FILE__, __LINE__, *(p-1));
841 }
842
843 /* Parse through the locale name */
844 name_start = p;
bee74f4b
KW
845 while (p < e && *p != ';') {
846 if (! isGRAPH(*p)) {
847 Perl_croak(aTHX_
848 "panic: %s: %d: Unexpected character in locale name '%02X",
f1b080c5 849 __FILE__, __LINE__, *p);
bee74f4b 850 }
e9bc6d6b
KW
851 p++;
852 }
853 name_end = p;
854
bee74f4b
KW
855 /* Space past the semi-colon */
856 if (p < e) {
857 p++;
e9bc6d6b
KW
858 }
859
860 /* Find the index of the category name in our lists */
4fd76d49 861 for (i = 0; i < LC_ALL_INDEX_; i++) {
3c76a412 862 char * individ_locale;
e9bc6d6b
KW
863
864 /* Keep going if this isn't the index. The strnNE() avoids a
865 * Perl_form(), but would fail if ever a category name could be
866 * a substring of another one, like if there were a
867 * "LC_TIME_DATE" */
868 if strnNE(s, category_names[i], category_end - s) {
869 continue;
870 }
871
872 /* If this index is for the single category we're changing, we
873 * have found the locale to set it to. */
874 if (category == categories[i]) {
875 locale = Perl_form(aTHX_ "%.*s",
876 (int) (name_end - name_start),
877 name_start);
878 goto ready_to_set;
879 }
880
3c76a412 881 assert(category == LC_ALL);
bee74f4b
KW
882 individ_locale = Perl_form(aTHX_ "%.*s",
883 (int) (name_end - name_start), name_start);
4bcc858c 884 if (! emulate_setlocale_i(i, individ_locale))
6435f98d
KW
885 {
886 return NULL;
887 }
e9bc6d6b
KW
888 }
889
890 s = p;
891 }
892
893 /* Here we have set all the individual categories by recursive calls.
894 * These collectively should have fixed up LC_ALL, so can just query
895 * what that now is */
896 assert(category == LC_ALL);
897
a683a66a 898 return querylocale_c(LC_ALL);
6aba5c5e
KW
899 } /* End of this being setlocale(LC_ALL,
900 "LC_CTYPE=foo;LC_NUMERIC=bar;...") */
e9bc6d6b
KW
901
902 ready_to_set: ;
903
f8115d60
KW
904 /* Here at the end of having to deal with the absence of querylocale().
905 * Some cases have already been fully handled by recursive calls to this
906 * function. But at this point, we haven't dealt with those, but are now
907 * prepared to, knowing what the locale name to set this category to is.
908 * This would have come for free if this system had had querylocale() */
909
e9bc6d6b
KW
910# endif /* end of ! querylocale */
911
f8115d60
KW
912 assert(PL_C_locale_obj);
913
914 /* Switching locales generally entails freeing the current one's space (at
fe1c1494
KW
915 * the C library's discretion), hence we can't be using that locale at the
916 * time of the switch (this wasn't obvious to khw from the man pages). So
917 * switch to a known locale object that we don't otherwise mess with; the
918 * function returns the locale object in effect prior to the switch. */
f8115d60
KW
919 old_obj = uselocale(PL_C_locale_obj);
920
94d521c0 921 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4bcc858c 922 "%s:%d: emulate_setlocale_i was using %p\n",
94d521c0 923 __FILE__, __LINE__, old_obj));
f8115d60
KW
924
925 if (! old_obj) {
f8115d60 926 dSAVE_ERRNO;
94d521c0 927 DEBUG_L(PerlIO_printf(Perl_debug_log,
4bcc858c 928 "%s:%d: emulate_setlocale_i switching to C"
94d521c0 929 " failed: %d\n", __FILE__, __LINE__, GET_ERRNO));
f8115d60 930 RESTORE_ERRNO;
f8115d60
KW
931
932 return NULL;
933 }
934
94d521c0 935 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4bcc858c 936 "%s:%d: emulate_setlocale_i now using %p\n",
94d521c0 937 __FILE__, __LINE__, PL_C_locale_obj));
f8115d60 938
fe1c1494
KW
939 /* If this call is to switch LC_ALL to the 'C' locale, it already exists,
940 * and in fact, we already have switched to it (in preparation for what
6aba5c5e 941 * normally is to come). But since we're already there, continue to use
70bd6bc8
KW
942 * it instead of trying to create a new locale */
943 if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(locale)) {
944
94d521c0
KW
945 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
946 "%s:%d: will stay in C object\n",
947 __FILE__, __LINE__));
70bd6bc8
KW
948
949 new_obj = PL_C_locale_obj;
950
951 /* We already had switched to the C locale in preparation for freeing
b22e9937 952 * 'old_obj' */
70bd6bc8
KW
953 if (old_obj != LC_GLOBAL_LOCALE && old_obj != PL_C_locale_obj) {
954 freelocale(old_obj);
955 }
956 }
957 else {
b22e9937
KW
958 /* If we weren't in a thread safe locale, set so that newlocale() below
959 * which uses 'old_obj', uses an empty one. Same for our reserved C
960 * object. The latter is defensive coding, so that, even if there is
961 * some bug, we will never end up trying to modify either of these, as
fe1c1494 962 * newlocale() just below would otherwise do. */
b22e9937
KW
963 if (old_obj == LC_GLOBAL_LOCALE || old_obj == PL_C_locale_obj) {
964 old_obj = (locale_t) 0;
965 }
f8115d60 966
b22e9937
KW
967 /* Ready to create a new locale by modification of the exising one */
968 new_obj = newlocale(mask, locale, old_obj);
e9bc6d6b 969
b22e9937
KW
970 if (! new_obj) {
971 dSAVE_ERRNO;
e9bc6d6b 972
94d521c0 973 DEBUG_L(PerlIO_printf(Perl_debug_log,
4bcc858c 974 "%s:%d: emulate_setlocale_i creating new object"
94d521c0 975 " failed: %d\n", __FILE__, __LINE__, GET_ERRNO));
e9bc6d6b 976
b22e9937 977 if (! uselocale(old_obj)) {
94d521c0 978 DEBUG_L(PerlIO_printf(Perl_debug_log,
b22e9937 979 "%s:%d: switching back failed: %d\n",
94d521c0 980 __FILE__, __LINE__, GET_ERRNO));
b22e9937
KW
981 }
982 RESTORE_ERRNO;
983 return NULL;
e9bc6d6b 984 }
e9bc6d6b 985
94d521c0 986 DEBUG_Lv(STMT_START {
b22e9937 987 PerlIO_printf(Perl_debug_log,
4bcc858c 988 "%s:%d: emulate_setlocale_i created %p",
b22e9937 989 __FILE__, __LINE__, new_obj);
94d521c0 990 if (old_obj) PerlIO_printf(Perl_debug_log,
b22e9937 991 "; should have freed %p", old_obj);
b22e9937 992 PerlIO_printf(Perl_debug_log, "\n");
94d521c0 993 } STMT_END);
e9bc6d6b 994
b22e9937
KW
995 /* And switch into it */
996 if (! uselocale(new_obj)) {
997 dSAVE_ERRNO;
e9bc6d6b 998
94d521c0 999 DEBUG_L(PerlIO_printf(Perl_debug_log,
4bcc858c 1000 "%s:%d: emulate_setlocale_i switching to new object"
94d521c0 1001 " failed\n", __FILE__, __LINE__));
e9bc6d6b 1002
b22e9937 1003 if (! uselocale(old_obj)) {
e9bc6d6b 1004
94d521c0 1005 DEBUG_L(PerlIO_printf(Perl_debug_log,
b22e9937 1006 "%s:%d: switching back failed: %d\n",
94d521c0 1007 __FILE__, __LINE__, GET_ERRNO));
e9bc6d6b 1008
b22e9937
KW
1009 }
1010 freelocale(new_obj);
1011 RESTORE_ERRNO;
1012 return NULL;
e9bc6d6b 1013 }
70bd6bc8 1014 }
e9bc6d6b 1015
fe1c1494 1016 /* Here, we are using 'new_obj' which matches the input 'locale'. */
94d521c0 1017 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4bcc858c 1018 "%s:%d: emulate_setlocale_i now using %p\n",
94d521c0 1019 __FILE__, __LINE__, new_obj));
e9bc6d6b
KW
1020
1021 /* We are done, except for updating our records (if the system doesn't keep
1022 * them) and in the case of locale "", we don't actually know what the
1023 * locale that got switched to is, as it came from the environment. So
1024 * have to find it */
1025
67c13e2e 1026# ifdef USE_QUERYLOCALE
e9bc6d6b
KW
1027
1028 if (strEQ(locale, "")) {
d92b203c 1029 locale = querylocale_i(index);
e9bc6d6b
KW
1030 }
1031
1032# else
1033
1034 /* Here, 'locale' is the return value */
1035
1036 /* Without querylocale(), we have to update our records */
1037
1038 if (category == LC_ALL) {
1039 unsigned int i;
1040
1041 /* For LC_ALL, we change all individual categories to correspond */
1042 /* PL_curlocales is a parallel array, so has same
1043 * length as 'categories' */
4fd76d49 1044 for (i = 0; i <= LC_ALL_INDEX_; i++) {
e9bc6d6b
KW
1045 Safefree(PL_curlocales[i]);
1046 PL_curlocales[i] = savepv(locale);
1047 }
45cef8fb 1048
4fd76d49 1049 FIX_GLIBC_LC_MESSAGES_BUG(LC_MESSAGES_INDEX_);
e9bc6d6b
KW
1050 }
1051 else {
1052
d92b203c 1053 /* Otherwise, update the single category, plus LC_ALL */
e9bc6d6b 1054
d92b203c
KW
1055 Safefree(PL_curlocales[index]);
1056 PL_curlocales[index] = savepv(locale);
1057
1058 if ( PL_curlocales[LC_ALL_INDEX_] == NULL
1059 || strNE(PL_curlocales[LC_ALL_INDEX_], locale))
fe1c1494 1060 {
4fd76d49 1061 Safefree(PL_curlocales[LC_ALL_INDEX_]);
d92b203c
KW
1062 PL_curlocales[LC_ALL_INDEX_] =
1063 savepv(calculate_LC_ALL(PL_curlocales));
e9bc6d6b
KW
1064 }
1065
45cef8fb 1066 FIX_GLIBC_LC_MESSAGES_BUG(index);
e9bc6d6b
KW
1067 }
1068
1069# endif
1070
1071 return locale;
1072}
1073
3980cddb
KW
1074#endif /* End of the various implementations of the setlocale and
1075 querylocale macros used in the remainder of this program */
e9bc6d6b 1076
d36adde0 1077#ifdef USE_LOCALE
837ce802 1078
32899cfb
KW
1079/* So far, the locale strings returned by modern 2008-compliant systems have
1080 * been fine */
1081
1082STATIC const char *
1083S_stdize_locale(pTHX_ const int category,
1084 const char *input_locale,
1085 const char **buf,
1086 Size_t *buf_size,
1087 const line_t caller_line)
54ddc3dd 1088{
32899cfb
KW
1089 /* The return value of setlocale() is opaque, but is required to be usable
1090 * as input to a future setlocale() to create the same state.
1091 * Unfortunately not all systems are compliant. But most often they are of
1092 * a very restricted set of forms that this file has been coded to expect.
54ddc3dd 1093 *
32899cfb 1094 * There are some outliers, though, that this function tries to tame:
54ddc3dd 1095 *
32899cfb
KW
1096 * 1) A new-line. This function chomps any \n characters
1097 * 2) foo=bar. 'bar' is what is generally meant, and the foo= part is
1098 * stripped. This form is legal for LC_ALL. When found in
1099 * that category group, the function calls itself
1100 * recursively on each possible component category to make
1101 * sure the individual categories are ok.
1102 *
1103 * If no changes to the input were made, it is returned; otherwise the
1104 * changed version is stored into memory at *buf, with *buf_size set to its
1105 * new value, and *buf is returned.
1106 */
54ddc3dd 1107
32899cfb
KW
1108 const char * first_bad;
1109 const char * retval;
54ddc3dd
KW
1110
1111 PERL_ARGS_ASSERT_STDIZE_LOCALE;
1112
32899cfb
KW
1113 if (input_locale == NULL) {
1114 return NULL;
1115 }
1116
1117 first_bad = strpbrk(input_locale, "=\n");
1118
1119 /* Most likely, there isn't a problem with the input */
1120 if (LIKELY(! first_bad)) {
1121 return input_locale;
1122 }
1123
1124# ifdef LC_ALL
1125
1126 /* But if there is, and the category is LC_ALL, we have to look at each
1127 * component category */
1128 if (category == LC_ALL) {
1129 const char * individ_locales[LC_ALL_INDEX_];
1130 bool made_changes = FALSE;
1131 unsigned int i;
1132
1133 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1134 Size_t this_size = 0;
1135 individ_locales[i] = stdize_locale(categories[i],
1136 porcelain_setlocale(categories[i],
1137 NULL),
1138 &individ_locales[i],
1139 &this_size,
1140 caller_line);
1141
1142 /* If the size didn't change, it means this category did not have
1143 * to be adjusted, and individ_locales[i] points to the buffer
1144 * returned by porcelain_setlocale(); we have to copy that before
1145 * it's called again in the next iteration */
1146 if (this_size == 0) {
1147 individ_locales[i] = savepv(individ_locales[i]);
54ddc3dd 1148 }
32899cfb
KW
1149 else {
1150 made_changes = TRUE;
1151 }
1152 }
1153
1154 /* If all the individual categories were ok as-is, this was a false
1155 * alarm. We must have seen an '=' which was a legal occurrence in
1156 * this combination locale */
1157 if (! made_changes) {
1158 retval = input_locale; /* The input can be returned unchanged */
1159 }
1160 else {
1161 retval = save_to_buffer(querylocale_c(LC_ALL), buf, buf_size, 0);
1162 }
1163
1164 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1165 Safefree(individ_locales[i]);
54ddc3dd 1166 }
32899cfb
KW
1167
1168 return retval;
54ddc3dd
KW
1169 }
1170
32899cfb
KW
1171# endif
1172
1173 /* Here, there was a problem in an individual category. This means that at
1174 * least one adjustment will be necessary. Create a modifiable copy */
1175 retval = save_to_buffer(input_locale, buf, buf_size, 0);
1176
1177 if (*first_bad != '=') {
54ddc3dd 1178
32899cfb
KW
1179 /* Translate the found position into terms of the copy */
1180 first_bad = retval + (first_bad - input_locale);
1181 }
1182 else { /* An '=' */
1183
1184 /* It is unlikely that the return is so screwed-up that it contains
1185 * multiple equals signs, but handle that case by stripping all of
1186 * them. */
1187 const char * final_equals = strrchr(retval, '=');
1188
1189 /* The length passed here causes the move to include the terminating
1190 * NUL */
1191 Move(final_equals + 1, retval, strlen(final_equals), char);
1192
1193 /* See if there are additional problems; if not, we're good to return.
1194 * */
1195 first_bad = strpbrk(retval, "\n");
1196
1197 if (! first_bad) {
1198 return retval;
1199 }
1200 }
1201
1202 /* Here, the problem must be a \n. Get rid of it and what follows.
1203 * (Originally, only a trailing \n was stripped. Unsure what to do if not
1204 * trailing) */
1205 *((char *) first_bad) = '\0';
1206 return retval;
54ddc3dd
KW
1207}
1208
6df0bfb6 1209#if defined(USE_POSIX_2008_LOCALE)
32899cfb 1210
d92b203c
KW
1211STATIC
1212const char *
1213
1214# ifdef USE_QUERYLOCALE
1215S_calculate_LC_ALL(pTHX_ const locale_t cur_obj)
1216# else
1217S_calculate_LC_ALL(pTHX_ const char ** individ_locales)
1218# endif
1219
1220{
1221 /* For POSIX 2008, we have to figure out LC_ALL ourselves when needed.
1222 * querylocale(), on systems that have it, doesn't tend to work for LC_ALL.
1223 * So we have to construct the answer ourselves based on the passed in
1224 * data, which is either a locale_t object, for systems with querylocale(),
1225 * or an array we keep updated to the proper values, otherwise.
1226 *
1227 * This returns a mortalized string containing the locale name(s) of
1228 * LC_ALL.
1229 *
1230 * If all individual categories are the same locale, we can just set LC_ALL
1231 * to that locale. But if not, we have to create an aggregation of all the
1232 * categories on the system. Platforms differ as to the syntax they use
1233 * for these non-uniform locales for LC_ALL. Some use a '/' or other
1234 * delimiter of the locales with a predetermined order of categories; a
1235 * Configure probe would be needed to tell us how to decipher those. glibc
1236 * uses a series of name=value pairs, like
1237 * LC_NUMERIC=C;LC_TIME=en_US.UTF-8;...
1238 * The syntax we use for our aggregation doesn't much matter, as we take
1239 * care not to use the native setlocale() function on whatever style is
1240 * chosen. But, it would be possible for someone to call Perl_setlocale()
1241 * using a native style we don't understand. So far no one has complained.
1242 *
1243 * For systems that have categories we don't know about, the algorithm
1244 * below won't know about those missing categories, leading to potential
1245 * bugs for code that looks at them. If there is an environment variable
1246 * that sets that category, we won't know to look for it, and so our use of
1247 * LANG or "C" improperly overrides it. On the other hand, if we don't do
1248 * what is done here, and there is no environment variable, the category's
1249 * locale should be set to LANG or "C". So there is no good solution. khw
1250 * thinks the best is to make sure we have a complete list of possible
1251 * categories, adding new ones as they show up on obscure platforms.
1252 */
1253
1254 unsigned int i;
1255 Size_t names_len = 0;
1256 bool are_all_categories_the_same_locale = TRUE;
1257 char * aggregate_locale;
1258 char * previous_start = NULL;
36540e72 1259 char * this_start = NULL;
d92b203c
KW
1260 Size_t entry_len = 0;
1261
1262 PERL_ARGS_ASSERT_CALCULATE_LC_ALL;
1263
1264 /* First calculate the needed size for the string listing the categories
1265 * and their locales. */
1266 for (i = 0; i < LC_ALL_INDEX_; i++) {
1267
1268# ifdef USE_QUERYLOCALE
1269 const char * entry = querylocale_l(i, cur_obj);
1270# else
1271 const char * entry = individ_locales[i];
1272# endif
1273
1274 if (entry == NULL) continue; /* XXX Temporary */
1275 names_len += strlen(category_names[i])
1276 + 1 /* '=' */
1277 + strlen(entry)
1278 + 1; /* ';' */
1279 }
1280
1281 names_len++; /* Trailing '\0' */
1282
1283 /* Allocate enough space for the aggregated string */
1284 SAVEFREEPV(Newxz(aggregate_locale, names_len, char));
1285
1286 /* Then fill it in */
1287 for (i = 0; i < LC_ALL_INDEX_; i++) {
1288 Size_t new_len;
1289
1290# ifdef USE_QUERYLOCALE
1291 const char * entry = querylocale_l(i, cur_obj);
1292# else
1293 const char * entry = individ_locales[i];
1294# endif
1295
1296 if (entry == NULL) continue; /* XXX Temporary */
1297 new_len = my_strlcat(aggregate_locale, category_names[i], names_len);
1298 assert(new_len <= names_len);
1299 new_len = my_strlcat(aggregate_locale, "=", names_len);
1300 assert(new_len <= names_len);
1301
1302 this_start = aggregate_locale + strlen(aggregate_locale);
1303 entry_len = strlen(entry);
1304
1305 new_len = my_strlcat(aggregate_locale, entry, names_len);
1306 assert(new_len <= names_len);
1307 new_len = my_strlcat(aggregate_locale, ";", names_len);
1308 assert(new_len <= names_len);
1309 PERL_UNUSED_VAR(new_len); /* Only used in DEBUGGING */
1310
1311 if ( i > 0
1312 && are_all_categories_the_same_locale
1313 && memNE(previous_start, this_start, entry_len + 1))
1314 {
1315 are_all_categories_the_same_locale = FALSE;
1316 }
1317 else {
1318 previous_start = this_start;
1319 }
1320 }
1321
1322 /* If they are all the same, just return any one of them */
1323 if (are_all_categories_the_same_locale) {
1324 aggregate_locale = this_start;
1325 aggregate_locale[entry_len] = '\0';
1326 }
1327
1328 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1329 "%s:%d: calculate_LC_ALL returning '%s'\n",
1330 __FILE__, __LINE__, aggregate_locale));
1331
1332 return aggregate_locale;
1333}
6df0bfb6 1334#endif /*defined(USE_POSIX_2008_LOCALE)*/
d92b203c 1335
8befffe0 1336#ifndef USE_POSIX_2008_LOCALE
12f1a329 1337
a4f00dcc 1338STATIC void
94202fb8
KW
1339S_setlocale_failure_panic_i(pTHX_
1340 const unsigned int cat_index,
1341 const char * current,
1342 const char * failed,
1343 const line_t caller_0_line,
1344 const line_t caller_1_line)
1345{
1346 const int cat = categories[cat_index];
1347 const char * name = category_names[cat_index];
1348 dSAVE_ERRNO;
1349
1350 PERL_ARGS_ASSERT_SETLOCALE_FAILURE_PANIC_I;
1351
1352#ifdef USE_C_BACKTRACE
1353 dump_c_backtrace(Perl_debug_log, 20, 1);
1354#endif
1355
1356 SETLOCALE_UNLOCK;
1357
1358 if (current == NULL) {
1359 current = querylocale_i(cat_index);
1360 }
1361
1362 RESTORE_ERRNO;
ee178617
KW
1363 Perl_croak(aTHX_ "panic: %s: %" LINE_Tf "d:(%" LINE_Tf
1364 "): Can't change locale for" " %s(%d)"
94202fb8
KW
1365 " from '%s' to '%s'; errno=%d\n",
1366 __FILE__, caller_0_line, caller_1_line, name, cat,
1367 current, failed, errno);
1368 NOT_REACHED; /* NOTREACHED */
1369}
1370
12f1a329
KW
1371#endif
1372
94202fb8 1373STATIC void
86799d2d 1374S_set_numeric_radix(pTHX_ const bool use_locale)
98994639 1375{
86799d2d
KW
1376 /* If 'use_locale' is FALSE, set to use a dot for the radix character. If
1377 * TRUE, use the radix character derived from the current locale */
7d4bcc4a 1378
fe1c1494 1379# if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \
86799d2d 1380 || defined(HAS_NL_LANGINFO))
98994639 1381
87f8e8e7 1382 const char * radix = (use_locale)
4e6826bf 1383 ? my_nl_langinfo(RADIXCHAR, FALSE)
87f8e8e7
KW
1384 /* FALSE => already in dest locale */
1385 : ".";
2213a3be 1386
87f8e8e7 1387 sv_setpv(PL_numeric_radix_sv, radix);
86799d2d 1388
87f8e8e7
KW
1389 /* If this is valid UTF-8 that isn't totally ASCII, and we are in
1390 * a UTF-8 locale, then mark the radix as being in UTF-8 */
1391 if (is_utf8_non_invariant_string((U8 *) SvPVX(PL_numeric_radix_sv),
7a393424 1392 SvCUR(PL_numeric_radix_sv))
87f8e8e7
KW
1393 && _is_cur_LC_category_utf8(LC_NUMERIC))
1394 {
1395 SvUTF8_on(PL_numeric_radix_sv);
1396 }
86799d2d 1397
94d521c0 1398 DEBUG_L(PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
3ca88433 1399 SvPVX(PL_numeric_radix_sv),
94d521c0 1400 cBOOL(SvUTF8(PL_numeric_radix_sv))));
fe1c1494 1401# else
d36adde0
KW
1402
1403 PERL_UNUSED_ARG(use_locale);
1404
fe1c1494 1405# endif /* USE_LOCALE_NUMERIC and can find the radix char */
7d4bcc4a 1406
98994639
HS
1407}
1408
398eeea9
KW
1409STATIC void
1410S_new_numeric(pTHX_ const char *newnum)
98994639 1411{
7d4bcc4a 1412
fe1c1494 1413# ifndef USE_LOCALE_NUMERIC
7d4bcc4a
KW
1414
1415 PERL_UNUSED_ARG(newnum);
1416
fe1c1494 1417# else
0d071d52 1418
291a84fb 1419 /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell
fe1c1494
KW
1420 * core Perl this and that 'newnum' is the name of the new locale, and we
1421 * are switched into it. It installs this locale as the current underlying
1422 * default, and then switches to the C locale, if necessary, so that the
1423 * code that has traditionally expected the radix character to be a dot may
1424 * continue to do so.
0d071d52
KW
1425 *
1426 * The default locale and the C locale can be toggled between by use of the
5792c642
KW
1427 * set_numeric_underlying() and set_numeric_standard() functions, which
1428 * should probably not be called directly, but only via macros like
0d071d52
KW
1429 * SET_NUMERIC_STANDARD() in perl.h.
1430 *
1431 * The toggling is necessary mainly so that a non-dot radix decimal point
fe1c1494
KW
1432 * character can be input and output, while allowing internal calculations
1433 * to use a dot.
0d071d52
KW
1434 *
1435 * This sets several interpreter-level variables:
bb304765 1436 * PL_numeric_name The underlying locale's name: a copy of 'newnum'
892e6465 1437 * PL_numeric_underlying A boolean indicating if the toggled state is such
7738054c
KW
1438 * that the current locale is the program's underlying
1439 * locale
1440 * PL_numeric_standard An int indicating if the toggled state is such
4c68b815
KW
1441 * that the current locale is the C locale or
1442 * indistinguishable from the C locale. If non-zero, it
1443 * is in C; if > 1, it means it may not be toggled away
7738054c 1444 * from C.
4c68b815
KW
1445 * PL_numeric_underlying_is_standard A bool kept by this function
1446 * indicating that the underlying locale and the standard
1447 * C locale are indistinguishable for the purposes of
1448 * LC_NUMERIC. This happens when both of the above two
1449 * variables are true at the same time. (Toggling is a
1450 * no-op under these circumstances.) This variable is
1451 * used to avoid having to recalculate.
fe1c1494
KW
1452 * PL_numeric_radix_sv Contains the string that code should use for the
1453 * decimal point. It is set to either a dot or the
1454 * program's underlying locale's radix character string,
1455 * depending on the situation.
1456 * PL_underlying_numeric_obj = (only on POSIX 2008 platforms) An object
1457 * with everything set up properly so as to avoid work on
1458 * such platforms.
398eeea9 1459 */
0d071d52 1460
b03f34cf 1461 char *save_newnum;
98994639
HS
1462
1463 if (! newnum) {
1604cfb0
MS
1464 Safefree(PL_numeric_name);
1465 PL_numeric_name = NULL;
1466 PL_numeric_standard = TRUE;
1467 PL_numeric_underlying = TRUE;
1468 PL_numeric_underlying_is_standard = TRUE;
1469 return;
98994639
HS
1470 }
1471
32899cfb 1472 save_newnum = savepv(newnum);
892e6465 1473 PL_numeric_underlying = TRUE;
4c68b815
KW
1474 PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
1475
fe1c1494 1476# ifndef TS_W32_BROKEN_LOCALECONV
69c5e0db 1477
4c68b815 1478 /* If its name isn't C nor POSIX, it could still be indistinguishable from
69c5e0db
KW
1479 * them. But on broken Windows systems calling my_nl_langinfo() for
1480 * THOUSEP can currently (but rarely) cause a race, so avoid doing that,
1481 * and just always change the locale if not C nor POSIX on those systems */
4c68b815 1482 if (! PL_numeric_standard) {
4e6826bf 1483 PL_numeric_standard = cBOOL(strEQ(".", my_nl_langinfo(RADIXCHAR,
4c68b815 1484 FALSE /* Don't toggle locale */ ))
4e6826bf 1485 && strEQ("", my_nl_langinfo(THOUSEP, FALSE)));
4c68b815 1486 }
abe1abcf 1487
fe1c1494 1488# endif
69c5e0db 1489
4c68b815 1490 /* Save the new name if it isn't the same as the previous one, if any */
b03f34cf 1491 if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
fe1c1494 1492 /* Save the locale name for future use */
1604cfb0
MS
1493 Safefree(PL_numeric_name);
1494 PL_numeric_name = save_newnum;
b03f34cf 1495 }
abe1abcf 1496 else {
1604cfb0 1497 Safefree(save_newnum);
abe1abcf 1498 }
4c28b29c 1499
4c68b815
KW
1500 PL_numeric_underlying_is_standard = PL_numeric_standard;
1501
4b06b4d0 1502# ifdef USE_POSIX_2008_LOCALE
e1aa2579 1503
fe1c1494 1504 /* We keep a special object for easy switching to */
e1aa2579
KW
1505 PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
1506 PL_numeric_name,
1507 PL_underlying_numeric_obj);
1508
fe1c1494 1509# endif
e1aa2579 1510
94d521c0
KW
1511 DEBUG_L( PerlIO_printf(Perl_debug_log,
1512 "Called new_numeric with %s, PL_numeric_name=%s\n",
1513 newnum, PL_numeric_name));
4c68b815 1514
fe1c1494
KW
1515 /* Keep LC_NUMERIC so that it has the C locale radix and thousands
1516 * separator. This is for XS modules, so they don't have to worry about
1517 * the radix being a non-dot. (Core operations that need the underlying
1518 * locale change to it temporarily). */
84f1d29f
KW
1519 if (PL_numeric_standard) {
1520 set_numeric_radix(0);
1521 }
1522 else {
1523 set_numeric_standard();
1524 }
4c28b29c 1525
fe1c1494 1526# endif
7d4bcc4a 1527
98994639
HS
1528}
1529
1530void
1531Perl_set_numeric_standard(pTHX)
1532{
7d4bcc4a 1533
fe1c1494 1534# ifdef USE_LOCALE_NUMERIC
7d4bcc4a 1535
fe1c1494
KW
1536 /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
1537 * default.
1538 *
1539 * Most code should use the macro SET_NUMERIC_STANDARD() in perl.h
1540 * instead of calling this directly. The macro avoids calling this routine
1541 * if toggling isn't necessary according to our records (which could be
1542 * wrong if some XS code has changed the locale behind our back) */
0d071d52 1543
94d521c0
KW
1544 DEBUG_L(PerlIO_printf(Perl_debug_log,
1545 "Setting LC_NUMERIC locale to standard C\n"));
7bb8f702 1546
3980cddb 1547 void_setlocale_c(LC_NUMERIC, "C");
7bb8f702
KW
1548 PL_numeric_standard = TRUE;
1549 PL_numeric_underlying = PL_numeric_underlying_is_standard;
1550 set_numeric_radix(0);
1551
fe1c1494 1552# endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 1553
98994639
HS
1554}
1555
1556void
5792c642 1557Perl_set_numeric_underlying(pTHX)
98994639 1558{
7d4bcc4a 1559
fe1c1494 1560# ifdef USE_LOCALE_NUMERIC
7d4bcc4a 1561
fe1c1494
KW
1562 /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
1563 * default.
1564 *
1565 * Most code should use the macro SET_NUMERIC_UNDERLYING() in perl.h
7d4bcc4a
KW
1566 * instead of calling this directly. The macro avoids calling this routine
1567 * if toggling isn't necessary according to our records (which could be
1568 * wrong if some XS code has changed the locale behind our back) */
a9b8c0d8 1569
94d521c0
KW
1570 DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to %s\n",
1571 PL_numeric_name));
7bb8f702 1572
3980cddb 1573 void_setlocale_c(LC_NUMERIC, PL_numeric_name);
7bb8f702
KW
1574 PL_numeric_standard = PL_numeric_underlying_is_standard;
1575 PL_numeric_underlying = TRUE;
1576 set_numeric_radix(! PL_numeric_standard);
1577
fe1c1494 1578# endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 1579
98994639
HS
1580}
1581
1582/*
1583 * Set up for a new ctype locale.
1584 */
a4f00dcc
KW
1585STATIC void
1586S_new_ctype(pTHX_ const char *newctype)
98994639 1587{
7d4bcc4a 1588
fe1c1494 1589# ifndef USE_LOCALE_CTYPE
7d4bcc4a 1590
7d4bcc4a
KW
1591 PERL_UNUSED_ARG(newctype);
1592 PERL_UNUSED_CONTEXT;
1593
fe1c1494 1594# else
0d071d52 1595
291a84fb 1596 /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
0d071d52
KW
1597 * core Perl this and that 'newctype' is the name of the new locale.
1598 *
1599 * This function sets up the folding arrays for all 256 bytes, assuming
1600 * that tofold() is tolc() since fold case is not a concept in POSIX,
1601 *
1602 * Any code changing the locale (outside this file) should use
9aac5db8
KW
1603 * Perl_setlocale or POSIX::setlocale, which call this function. Therefore
1604 * this function should be called directly only from this file and from
0d071d52
KW
1605 * POSIX::setlocale() */
1606
013f4e03 1607 unsigned int i;
98994639 1608
8b7358b9
KW
1609 /* Don't check for problems if we are suppressing the warnings */
1610 bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
30d8090d 1611 bool maybe_utf8_turkic = FALSE;
8b7358b9 1612
7918f24d
NC
1613 PERL_ARGS_ASSERT_NEW_CTYPE;
1614
215c5139
KW
1615 /* We will replace any bad locale warning with 1) nothing if the new one is
1616 * ok; or 2) a new warning for the bad new locale */
1617 if (PL_warn_locale) {
1618 SvREFCNT_dec_NN(PL_warn_locale);
1619 PL_warn_locale = NULL;
1620 }
1621
c1284011 1622 PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
31f05a37
KW
1623
1624 /* A UTF-8 locale gets standard rules. But note that code still has to
1625 * handle this specially because of the three problematic code points */
1626 if (PL_in_utf8_CTYPE_locale) {
1627 Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
30d8090d
KW
1628
1629 /* UTF-8 locales can have special handling for 'I' and 'i' if they are
fe1c1494
KW
1630 * Turkic. Make sure these two are the only anomalies. (We don't
1631 * require towupper and towlower because they aren't in C89.) */
5b64f24c 1632
fe1c1494 1633# if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER)
5b64f24c 1634
fe1c1494 1635 if (towupper('i') == 0x130 && towlower('I') == 0x131)
5b64f24c 1636
fe1c1494 1637# else
5b64f24c 1638
fe1c1494 1639 if (toU8_UPPER_LC('i') == 'i' && toU8_LOWER_LC('I') == 'I')
5b64f24c 1640
fe1c1494
KW
1641# endif
1642
1643 {
1644 /* This is how we determine it really is Turkic */
30d8090d
KW
1645 check_for_problems = TRUE;
1646 maybe_utf8_turkic = TRUE;
1647 }
31f05a37 1648 }
8b7358b9
KW
1649
1650 /* We don't populate the other lists if a UTF-8 locale, but do check that
1651 * everything works as expected, unless checking turned off */
1652 if (check_for_problems || ! PL_in_utf8_CTYPE_locale) {
8c6180a9
KW
1653 /* Assume enough space for every character being bad. 4 spaces each
1654 * for the 94 printable characters that are output like "'x' "; and 5
1655 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
1656 * NUL */
8b7358b9 1657 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
8c6180a9
KW
1658 bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
1659 to start */
1660 unsigned int bad_count = 0; /* Count of bad characters */
1661
baa60164 1662 for (i = 0; i < 256; i++) {
8b7358b9 1663 if (! PL_in_utf8_CTYPE_locale) {
dcbeec20
KW
1664 if (isU8_UPPER_LC(i))
1665 PL_fold_locale[i] = (U8) toU8_LOWER_LC(i);
1666 else if (isU8_LOWER_LC(i))
1667 PL_fold_locale[i] = (U8) toU8_UPPER_LC(i);
bd6d0898
KW
1668 else
1669 PL_fold_locale[i] = (U8) i;
8b7358b9 1670 }
8c6180a9
KW
1671
1672 /* If checking for locale problems, see if the native ASCII-range
1673 * printables plus \n and \t are in their expected categories in
1674 * the new locale. If not, this could mean big trouble, upending
1675 * Perl's and most programs' assumptions, like having a
1676 * metacharacter with special meaning become a \w. Fortunately,
1677 * it's very rare to find locales that aren't supersets of ASCII
1678 * nowadays. It isn't a problem for most controls to be changed
1679 * into something else; we check only \n and \t, though perhaps \r
1680 * could be an issue as well. */
7d4bcc4a 1681 if ( check_for_problems
8c6180a9
KW
1682 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
1683 {
8b7358b9 1684 bool is_bad = FALSE;
6726b4f4 1685 char name[4] = { '\0' };
8b7358b9
KW
1686
1687 /* Convert the name into a string */
6726b4f4 1688 if (isGRAPH_A(i)) {
8b7358b9
KW
1689 name[0] = i;
1690 name[1] = '\0';
1691 }
1692 else if (i == '\n') {
6726b4f4
KW
1693 my_strlcpy(name, "\\n", sizeof(name));
1694 }
1695 else if (i == '\t') {
1696 my_strlcpy(name, "\\t", sizeof(name));
8b7358b9
KW
1697 }
1698 else {
6726b4f4
KW
1699 assert(i == ' ');
1700 my_strlcpy(name, "' '", sizeof(name));
8b7358b9
KW
1701 }
1702
1703 /* Check each possibe class */
dcbeec20 1704 if (UNLIKELY(cBOOL(isU8_ALPHANUMERIC_LC(i)) != cBOOL(isALPHANUMERIC_A(i)))) {
8b7358b9
KW
1705 is_bad = TRUE;
1706 DEBUG_L(PerlIO_printf(Perl_debug_log,
1707 "isalnum('%s') unexpectedly is %d\n",
dcbeec20 1708 name, cBOOL(isU8_ALPHANUMERIC_LC(i))));
8b7358b9 1709 }
dcbeec20 1710 if (UNLIKELY(cBOOL(isU8_ALPHA_LC(i)) != cBOOL(isALPHA_A(i)))) {
8b7358b9
KW
1711 is_bad = TRUE;
1712 DEBUG_L(PerlIO_printf(Perl_debug_log,
1713 "isalpha('%s') unexpectedly is %d\n",
dcbeec20 1714 name, cBOOL(isU8_ALPHA_LC(i))));
8b7358b9 1715 }
dcbeec20 1716 if (UNLIKELY(cBOOL(isU8_DIGIT_LC(i)) != cBOOL(isDIGIT_A(i)))) {
8b7358b9
KW
1717 is_bad = TRUE;
1718 DEBUG_L(PerlIO_printf(Perl_debug_log,
1719 "isdigit('%s') unexpectedly is %d\n",
dcbeec20 1720 name, cBOOL(isU8_DIGIT_LC(i))));
8b7358b9 1721 }
dcbeec20 1722 if (UNLIKELY(cBOOL(isU8_GRAPH_LC(i)) != cBOOL(isGRAPH_A(i)))) {
8b7358b9
KW
1723 is_bad = TRUE;
1724 DEBUG_L(PerlIO_printf(Perl_debug_log,
1725 "isgraph('%s') unexpectedly is %d\n",
dcbeec20 1726 name, cBOOL(isU8_GRAPH_LC(i))));
8b7358b9 1727 }
dcbeec20 1728 if (UNLIKELY(cBOOL(isU8_LOWER_LC(i)) != cBOOL(isLOWER_A(i)))) {
8b7358b9
KW
1729 is_bad = TRUE;
1730 DEBUG_L(PerlIO_printf(Perl_debug_log,
1731 "islower('%s') unexpectedly is %d\n",
dcbeec20 1732 name, cBOOL(isU8_LOWER_LC(i))));
8b7358b9 1733 }
dcbeec20 1734 if (UNLIKELY(cBOOL(isU8_PRINT_LC(i)) != cBOOL(isPRINT_A(i)))) {
8b7358b9
KW
1735 is_bad = TRUE;
1736 DEBUG_L(PerlIO_printf(Perl_debug_log,
1737 "isprint('%s') unexpectedly is %d\n",
dcbeec20 1738 name, cBOOL(isU8_PRINT_LC(i))));
8b7358b9 1739 }
dcbeec20 1740 if (UNLIKELY(cBOOL(isU8_PUNCT_LC(i)) != cBOOL(isPUNCT_A(i)))) {
8b7358b9
KW
1741 is_bad = TRUE;
1742 DEBUG_L(PerlIO_printf(Perl_debug_log,
1743 "ispunct('%s') unexpectedly is %d\n",
dcbeec20 1744 name, cBOOL(isU8_PUNCT_LC(i))));
8b7358b9 1745 }
dcbeec20 1746 if (UNLIKELY(cBOOL(isU8_SPACE_LC(i)) != cBOOL(isSPACE_A(i)))) {
8b7358b9
KW
1747 is_bad = TRUE;
1748 DEBUG_L(PerlIO_printf(Perl_debug_log,
1749 "isspace('%s') unexpectedly is %d\n",
dcbeec20 1750 name, cBOOL(isU8_SPACE_LC(i))));
8b7358b9 1751 }
dcbeec20 1752 if (UNLIKELY(cBOOL(isU8_UPPER_LC(i)) != cBOOL(isUPPER_A(i)))) {
8b7358b9
KW
1753 is_bad = TRUE;
1754 DEBUG_L(PerlIO_printf(Perl_debug_log,
1755 "isupper('%s') unexpectedly is %d\n",
dcbeec20 1756 name, cBOOL(isU8_UPPER_LC(i))));
8b7358b9 1757 }
dcbeec20 1758 if (UNLIKELY(cBOOL(isU8_XDIGIT_LC(i))!= cBOOL(isXDIGIT_A(i)))) {
8b7358b9
KW
1759 is_bad = TRUE;
1760 DEBUG_L(PerlIO_printf(Perl_debug_log,
1761 "isxdigit('%s') unexpectedly is %d\n",
dcbeec20 1762 name, cBOOL(isU8_XDIGIT_LC(i))));
8b7358b9 1763 }
dcbeec20 1764 if (UNLIKELY(toU8_LOWER_LC(i) != (int) toLOWER_A(i))) {
8b7358b9
KW
1765 is_bad = TRUE;
1766 DEBUG_L(PerlIO_printf(Perl_debug_log,
1767 "tolower('%s')=0x%x instead of the expected 0x%x\n",
dcbeec20 1768 name, toU8_LOWER_LC(i), (int) toLOWER_A(i)));
8b7358b9 1769 }
dcbeec20 1770 if (UNLIKELY(toU8_UPPER_LC(i) != (int) toUPPER_A(i))) {
8b7358b9
KW
1771 is_bad = TRUE;
1772 DEBUG_L(PerlIO_printf(Perl_debug_log,
1773 "toupper('%s')=0x%x instead of the expected 0x%x\n",
dcbeec20 1774 name, toU8_UPPER_LC(i), (int) toUPPER_A(i)));
8b7358b9
KW
1775 }
1776 if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i)))) {
1777 is_bad = TRUE;
1778 DEBUG_L(PerlIO_printf(Perl_debug_log,
1779 "'\\n' (=%02X) is not a control\n", (int) i));
1780 }
1781
1782 /* Add to the list; Separate multiple entries with a blank */
1783 if (is_bad) {
1784 if (bad_count) {
1785 my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list));
8c6180a9 1786 }
8b7358b9
KW
1787 my_strlcat(bad_chars_list, name, sizeof(bad_chars_list));
1788 bad_count++;
8c6180a9
KW
1789 }
1790 }
1791 }
1792
30d8090d
KW
1793 if (bad_count == 2 && maybe_utf8_turkic) {
1794 bad_count = 0;
1795 *bad_chars_list = '\0';
1796 PL_fold_locale['I'] = 'I';
1797 PL_fold_locale['i'] = 'i';
1798 PL_in_utf8_turkic_locale = TRUE;
1799 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s is turkic\n",
1800 __FILE__, __LINE__, newctype));
1801 }
1802 else {
b8df1494 1803 PL_in_utf8_turkic_locale = FALSE;
30d8090d 1804 }
b8df1494 1805
7d4bcc4a
KW
1806# ifdef MB_CUR_MAX
1807
8c6180a9 1808 /* We only handle single-byte locales (outside of UTF-8 ones; so if
d35fca5f 1809 * this locale requires more than one byte, there are going to be
8c6180a9 1810 * problems. */
9c8a6dc2
KW
1811 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1812 "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
1813 __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
1814
8b7358b9
KW
1815 if ( check_for_problems && MB_CUR_MAX > 1
1816 && ! PL_in_utf8_CTYPE_locale
ba1a4362
KW
1817
1818 /* Some platforms return MB_CUR_MAX > 1 for even the "C"
1819 * locale. Just assume that the implementation for them (plus
1820 * for POSIX) is correct and the > 1 value is spurious. (Since
1821 * these are specially handled to never be considered UTF-8
1822 * locales, as long as this is the only problem, everything
1823 * should work fine */
1824 && strNE(newctype, "C") && strNE(newctype, "POSIX"))
1825 {
8c6180a9
KW
1826 multi_byte_locale = TRUE;
1827 }
7d4bcc4a
KW
1828
1829# endif
8c6180a9 1830
30d8090d
KW
1831 /* If we found problems and we want them output, do so */
1832 if ( (UNLIKELY(bad_count) || UNLIKELY(multi_byte_locale))
1833 && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
1834 {
8b7358b9
KW
1835 if (UNLIKELY(bad_count) && PL_in_utf8_CTYPE_locale) {
1836 PL_warn_locale = Perl_newSVpvf(aTHX_
1837 "Locale '%s' contains (at least) the following characters"
578a6a87
KW
1838 " which have\nunexpected meanings: %s\nThe Perl program"
1839 " will use the expected meanings",
8b7358b9 1840 newctype, bad_chars_list);
8b7358b9
KW
1841 }
1842 else {
1843 PL_warn_locale = Perl_newSVpvf(aTHX_
8c6180a9 1844 "Locale '%s' may not work well.%s%s%s\n",
780fcc9f 1845 newctype,
8c6180a9
KW
1846 (multi_byte_locale)
1847 ? " Some characters in it are not recognized by"
1848 " Perl."
1849 : "",
1850 (bad_count)
1851 ? "\nThe following characters (and maybe others)"
1852 " may not have the same meaning as the Perl"
1853 " program expects:\n"
1854 : "",
1855 (bad_count)
1856 ? bad_chars_list
1857 : ""
1858 );
8b7358b9
KW
1859 }
1860
b119c2be
KW
1861# ifdef HAS_NL_LANGINFO
1862
1863 Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s",
1864 /* parameter FALSE is a don't care here */
4e6826bf 1865 my_nl_langinfo(CODESET, FALSE));
b119c2be
KW
1866
1867# endif
1868
8b7358b9
KW
1869 Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n");
1870
cc9eaeb0 1871 /* If we are actually in the scope of the locale or are debugging,
bddebb56
KW
1872 * output the message now. If not in that scope, we save the
1873 * message to be output at the first operation using this locale,
1874 * if that actually happens. Most programs don't use locales, so
1875 * they are immune to bad ones. */
cc9eaeb0 1876 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
780fcc9f 1877
780fcc9f 1878 /* The '0' below suppresses a bogus gcc compiler warning */
fe1c1494
KW
1879 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale),
1880 0);
bddebb56 1881
bddebb56
KW
1882 if (IN_LC(LC_CTYPE)) {
1883 SvREFCNT_dec_NN(PL_warn_locale);
1884 PL_warn_locale = NULL;
1885 }
780fcc9f 1886 }
baa60164 1887 }
31f05a37 1888 }
98994639 1889
fe1c1494 1890# endif /* USE_LOCALE_CTYPE */
7d4bcc4a 1891
98994639
HS
1892}
1893
98994639 1894void
2726666d
KW
1895Perl__warn_problematic_locale()
1896{
2726666d 1897
fe1c1494 1898# ifdef USE_LOCALE_CTYPE
2726666d 1899
5f04a188
KW
1900 dTHX;
1901
1902 /* Internal-to-core function that outputs the message in PL_warn_locale,
1903 * and then NULLS it. Should be called only through the macro
1629a27e 1904 * CHECK_AND_WARN_PROBLEMATIC_LOCALE_ */
5f04a188 1905
2726666d 1906 if (PL_warn_locale) {
2726666d
KW
1907 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
1908 SvPVX(PL_warn_locale),
1909 0 /* dummy to avoid compiler warning */ );
2726666d
KW
1910 SvREFCNT_dec_NN(PL_warn_locale);
1911 PL_warn_locale = NULL;
1912 }
1913
fe1c1494 1914# endif
2726666d
KW
1915
1916}
1917
a4f00dcc 1918STATIC void
c7dc1ea8
KW
1919S_new_LC_ALL(pTHX_ const char *unused)
1920{
1921 unsigned int i;
1922
1923 /* LC_ALL updates all the things we care about. */
1924
1925 PERL_UNUSED_ARG(unused);
1926
1927 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1928 if (update_functions[i]) {
1f9a6108
KW
1929 const char * this_locale = savepv(querylocale_i(i));
1930 update_functions[i](aTHX_ this_locale);
1931 Safefree(this_locale);
c7dc1ea8
KW
1932 }
1933 }
1934}
1935
1936STATIC void
a4f00dcc 1937S_new_collate(pTHX_ const char *newcoll)
98994639 1938{
7d4bcc4a 1939
fe1c1494 1940# ifndef USE_LOCALE_COLLATE
7d4bcc4a
KW
1941
1942 PERL_UNUSED_ARG(newcoll);
1943 PERL_UNUSED_CONTEXT;
1944
fe1c1494 1945# else
0d071d52 1946
291a84fb 1947 /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
0d071d52
KW
1948 * core Perl this and that 'newcoll' is the name of the new locale.
1949 *
d35fca5f
KW
1950 * The design of locale collation is that every locale change is given an
1951 * index 'PL_collation_ix'. The first time a string particpates in an
1952 * operation that requires collation while locale collation is active, it
1953 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
1954 * magic includes the collation index, and the transformation of the string
1955 * by strxfrm(), q.v. That transformation is used when doing comparisons,
1956 * instead of the string itself. If a string changes, the magic is
1957 * cleared. The next time the locale changes, the index is incremented,
1958 * and so we know during a comparison that the transformation is not
1959 * necessarily still valid, and so is recomputed. Note that if the locale
1960 * changes enough times, the index could wrap (a U32), and it is possible
1961 * that a transformation would improperly be considered valid, leading to
1962 * an unlikely bug */
0d071d52 1963
98994639 1964 if (! newcoll) {
1604cfb0
MS
1965 if (PL_collation_name) {
1966 ++PL_collation_ix;
1967 Safefree(PL_collation_name);
1968 PL_collation_name = NULL;
1969 }
1970 PL_collation_standard = TRUE;
00bf60ca 1971 is_standard_collation:
1604cfb0
MS
1972 PL_collxfrm_base = 0;
1973 PL_collxfrm_mult = 2;
165a1c52 1974 PL_in_utf8_COLLATE_locale = FALSE;
f28f4d2a 1975 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 1976 PL_strxfrm_max_cp = 0;
1604cfb0 1977 return;
98994639
HS
1978 }
1979
d35fca5f 1980 /* If this is not the same locale as currently, set the new one up */
98994639 1981 if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
1604cfb0
MS
1982 ++PL_collation_ix;
1983 Safefree(PL_collation_name);
32899cfb 1984 PL_collation_name = savepv(newcoll);
1604cfb0 1985 PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
00bf60ca
KW
1986 if (PL_collation_standard) {
1987 goto is_standard_collation;
1988 }
98994639 1989
165a1c52 1990 PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
f28f4d2a 1991 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 1992 PL_strxfrm_max_cp = 0;
165a1c52 1993
59c018b9
KW
1994 /* A locale collation definition includes primary, secondary, tertiary,
1995 * etc. weights for each character. To sort, the primary weights are
1996 * used, and only if they compare equal, then the secondary weights are
1997 * used, and only if they compare equal, then the tertiary, etc.
1998 *
1999 * strxfrm() works by taking the input string, say ABC, and creating an
2000 * output transformed string consisting of first the primary weights,
2001 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
2002 * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
2003 * may not have weights at every level. In our example, let's say B
2004 * doesn't have a tertiary weight, and A doesn't have a secondary
2005 * weight. The constructed string is then going to be
2006 * A¹B¹C¹ B²C² A³C³ ....
2007 * This has the desired effect that strcmp() will look at the secondary
2008 * or tertiary weights only if the strings compare equal at all higher
2009 * priority weights. The spaces shown here, like in
c342d20e 2010 * "A¹B¹C¹ A²B²C² "
59c018b9
KW
2011 * are not just for readability. In the general case, these must
2012 * actually be bytes, which we will call here 'separator weights'; and
2013 * they must be smaller than any other weight value, but since these
2014 * are C strings, only the terminating one can be a NUL (some
2015 * implementations may include a non-NUL separator weight just before
2016 * the NUL). Implementations tend to reserve 01 for the separator
2017 * weights. They are needed so that a shorter string's secondary
2018 * weights won't be misconstrued as primary weights of a longer string,
2019 * etc. By making them smaller than any other weight, the shorter
2020 * string will sort first. (Actually, if all secondary weights are
2021 * smaller than all primary ones, there is no need for a separator
2022 * weight between those two levels, etc.)
2023 *
2024 * The length of the transformed string is roughly a linear function of
2025 * the input string. It's not exactly linear because some characters
2026 * don't have weights at all levels. When we call strxfrm() we have to
2027 * allocate some memory to hold the transformed string. The
2028 * calculations below try to find coefficients 'm' and 'b' for this
2029 * locale so that m*x + b equals how much space we need, given the size
2030 * of the input string in 'x'. If we calculate too small, we increase
2031 * the size as needed, and call strxfrm() again, but it is better to
2032 * get it right the first time to avoid wasted expensive string
2033 * transformations. */
2034
1604cfb0 2035 {
79f120c8
KW
2036 /* We use the string below to find how long the tranformation of it
2037 * is. Almost all locales are supersets of ASCII, or at least the
2038 * ASCII letters. We use all of them, half upper half lower,
2039 * because if we used fewer, we might hit just the ones that are
2040 * outliers in a particular locale. Most of the strings being
2041 * collated will contain a preponderance of letters, and even if
2042 * they are above-ASCII, they are likely to have the same number of
2043 * weight levels as the ASCII ones. It turns out that digits tend
2044 * to have fewer levels, and some punctuation has more, but those
2045 * are relatively sparse in text, and khw believes this gives a
2046 * reasonable result, but it could be changed if experience so
2047 * dictates. */
2048 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
2049 char * x_longer; /* Transformed 'longer' */
2050 Size_t x_len_longer; /* Length of 'x_longer' */
2051
2052 char * x_shorter; /* We also transform a substring of 'longer' */
2053 Size_t x_len_shorter;
2054
a4a439fb 2055 /* _mem_collxfrm() is used get the transformation (though here we
79f120c8
KW
2056 * are interested only in its length). It is used because it has
2057 * the intelligence to handle all cases, but to work, it needs some
2058 * values of 'm' and 'b' to get it started. For the purposes of
2059 * this calculation we use a very conservative estimate of 'm' and
2060 * 'b'. This assumes a weight can be multiple bytes, enough to
2061 * hold any UV on the platform, and there are 5 levels, 4 weight
2062 * bytes, and a trailing NUL. */
2063 PL_collxfrm_base = 5;
2064 PL_collxfrm_mult = 5 * sizeof(UV);
2065
2066 /* Find out how long the transformation really is */
a4a439fb
KW
2067 x_longer = _mem_collxfrm(longer,
2068 sizeof(longer) - 1,
2069 &x_len_longer,
2070
2071 /* We avoid converting to UTF-8 in the
2072 * called function by telling it the
2073 * string is in UTF-8 if the locale is a
2074 * UTF-8 one. Since the string passed
2075 * here is invariant under UTF-8, we can
2076 * claim it's UTF-8 even though it isn't.
2077 * */
2078 PL_in_utf8_COLLATE_locale);
79f120c8
KW
2079 Safefree(x_longer);
2080
2081 /* Find out how long the transformation of a substring of 'longer'
2082 * is. Together the lengths of these transformations are
2083 * sufficient to calculate 'm' and 'b'. The substring is all of
2084 * 'longer' except the first character. This minimizes the chances
2085 * of being swayed by outliers */
a4a439fb 2086 x_shorter = _mem_collxfrm(longer + 1,
79f120c8 2087 sizeof(longer) - 2,
a4a439fb
KW
2088 &x_len_shorter,
2089 PL_in_utf8_COLLATE_locale);
79f120c8
KW
2090 Safefree(x_shorter);
2091
2092 /* If the results are nonsensical for this simple test, the whole
2093 * locale definition is suspect. Mark it so that locale collation
2094 * is not active at all for it. XXX Should we warn? */
2095 if ( x_len_shorter == 0
2096 || x_len_longer == 0
2097 || x_len_shorter >= x_len_longer)
2098 {
2099 PL_collxfrm_mult = 0;
2100 PL_collxfrm_base = 0;
2101 }
2102 else {
2103 SSize_t base; /* Temporary */
2104
2105 /* We have both: m * strlen(longer) + b = x_len_longer
2106 * m * strlen(shorter) + b = x_len_shorter;
2107 * subtracting yields:
2108 * m * (strlen(longer) - strlen(shorter))
2109 * = x_len_longer - x_len_shorter
2110 * But we have set things up so that 'shorter' is 1 byte smaller
2111 * than 'longer'. Hence:
2112 * m = x_len_longer - x_len_shorter
2113 *
2114 * But if something went wrong, make sure the multiplier is at
2115 * least 1.
2116 */
2117 if (x_len_longer > x_len_shorter) {
2118 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
2119 }
2120 else {
2121 PL_collxfrm_mult = 1;
2122 }
2123
2124 /* mx + b = len
2125 * so: b = len - mx
2126 * but in case something has gone wrong, make sure it is
2127 * non-negative */
2128 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
2129 if (base < 0) {
2130 base = 0;
2131 }
2132
2133 /* Add 1 for the trailing NUL */
2134 PL_collxfrm_base = base + 1;
2135 }
58eebef2 2136
94d521c0 2137 DEBUG_L(PerlIO_printf(Perl_debug_log,
b07929e4
KW
2138 "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
2139 "x_len_longer=%zu,"
2140 " collate multipler=%zu, collate base=%zu\n",
58eebef2
KW
2141 __FILE__, __LINE__,
2142 PL_in_utf8_COLLATE_locale,
2143 x_len_shorter, x_len_longer,
94d521c0 2144 PL_collxfrm_mult, PL_collxfrm_base));
1604cfb0 2145 }
98994639
HS
2146 }
2147
fe1c1494 2148# endif /* USE_LOCALE_COLLATE */
7d4bcc4a 2149
98994639
HS
2150}
2151
fe1c1494 2152#endif /* USE_LOCALE */
d36adde0 2153
d2b24094 2154#ifdef WIN32
b8cc575c 2155
6c332036
TC
2156#define USE_WSETLOCALE
2157
2158#ifdef USE_WSETLOCALE
2159
2160STATIC char *
2161S_wrap_wsetlocale(pTHX_ int category, const char *locale) {
2162 wchar_t *wlocale;
2163 wchar_t *wresult;
2164 char *result;
2165
2166 if (locale) {
2167 int req_size =
2168 MultiByteToWideChar(CP_UTF8, 0, locale, -1, NULL, 0);
2169
2170 if (!req_size) {
2171 errno = EINVAL;
2172 return NULL;
2173 }
2174
2175 Newx(wlocale, req_size, wchar_t);
2176 if (!MultiByteToWideChar(CP_UTF8, 0, locale, -1, wlocale, req_size)) {
2177 Safefree(wlocale);
2178 errno = EINVAL;
2179 return NULL;
2180 }
2181 }
2182 else {
2183 wlocale = NULL;
2184 }
2185 wresult = _wsetlocale(category, wlocale);
2186 Safefree(wlocale);
2187 if (wresult) {
2188 int req_size =
2189 WideCharToMultiByte(CP_UTF8, 0, wresult, -1, NULL, 0, NULL, NULL);
2190 Newx(result, req_size, char);
2191 SAVEFREEPV(result); /* is there something better we can do here? */
2192 if (!WideCharToMultiByte(CP_UTF8, 0, wresult, -1,
2193 result, req_size, NULL, NULL)) {
2194 errno = EINVAL;
2195 return NULL;
2196 }
2197 }
2198 else {
2199 result = NULL;
2200 }
2201
2202 return result;
2203}
2204
2205#endif
2206
a4f00dcc 2207STATIC char *
b8cc575c 2208S_win32_setlocale(pTHX_ int category, const char* locale)
b385bb4d
KW
2209{
2210 /* This, for Windows, emulates POSIX setlocale() behavior. There is no
7d4bcc4a
KW
2211 * difference between the two unless the input locale is "", which normally
2212 * means on Windows to get the machine default, which is set via the
2213 * computer's "Regional and Language Options" (or its current equivalent).
2214 * In POSIX, it instead means to find the locale from the user's
2215 * environment. This routine changes the Windows behavior to first look in
2216 * the environment, and, if anything is found, use that instead of going to
2217 * the machine default. If there is no environment override, the machine
2218 * default is used, by calling the real setlocale() with "".
2219 *
2220 * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
2221 * use the particular category's variable if set; otherwise to use the LANG
2222 * variable. */
b385bb4d 2223
175c4cf9 2224 bool override_LC_ALL = FALSE;
89f7b9aa 2225 char * result;
e5f10d49 2226 unsigned int i;
89f7b9aa 2227
b385bb4d 2228 if (locale && strEQ(locale, "")) {
7d4bcc4a
KW
2229
2230# ifdef LC_ALL
2231
b385bb4d
KW
2232 locale = PerlEnv_getenv("LC_ALL");
2233 if (! locale) {
e5f10d49
KW
2234 if (category == LC_ALL) {
2235 override_LC_ALL = TRUE;
2236 }
2237 else {
7d4bcc4a
KW
2238
2239# endif
7d4bcc4a 2240
e5f10d49
KW
2241 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2242 if (category == categories[i]) {
2243 locale = PerlEnv_getenv(category_names[i]);
2244 goto found_locale;
2245 }
2246 }
7d4bcc4a 2247
b385bb4d 2248 locale = PerlEnv_getenv("LANG");
481465ea 2249 if (! locale) {
b385bb4d
KW
2250 locale = "";
2251 }
e5f10d49
KW
2252
2253 found_locale: ;
7d4bcc4a
KW
2254
2255# ifdef LC_ALL
2256
e5f10d49 2257 }
b385bb4d 2258 }
7d4bcc4a
KW
2259
2260# endif
2261
b385bb4d
KW
2262 }
2263
6c332036
TC
2264#ifdef USE_WSETLOCALE
2265 result = S_wrap_wsetlocale(aTHX_ category, locale);
2266#else
89f7b9aa 2267 result = setlocale(category, locale);
6c332036 2268#endif
48015184
KW
2269 DEBUG_L(STMT_START {
2270 dSAVE_ERRNO;
2271 PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
d188fe6d 2272 setlocale_debug_string_r(category, locale, result));
48015184
KW
2273 RESTORE_ERRNO;
2274 } STMT_END);
89f7b9aa 2275
481465ea 2276 if (! override_LC_ALL) {
89f7b9aa
KW
2277 return result;
2278 }
2279
dfd77d7a 2280 /* Here the input category was LC_ALL, and we have set it to what is in the
481465ea
KW
2281 * LANG variable or the system default if there is no LANG. But these have
2282 * lower priority than the other LC_foo variables, so override it for each
2283 * one that is set. (If they are set to "", it means to use the same thing
2284 * we just set LC_ALL to, so can skip) */
7d4bcc4a 2285
4fd76d49 2286 for (i = 0; i < LC_ALL_INDEX_; i++) {
e5f10d49
KW
2287 result = PerlEnv_getenv(category_names[i]);
2288 if (result && strNE(result, "")) {
6c332036 2289#ifdef USE_WSETLOCALE
17a1e9ac 2290 S_wrap_wsetlocale(aTHX_ categories[i], result);
6c332036 2291#else
17a1e9ac 2292 setlocale(categories[i], result);
6c332036 2293#endif
e5f10d49
KW
2294 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
2295 __FILE__, __LINE__,
d188fe6d 2296 setlocale_debug_string_i(i, result, "not captured")));
e5f10d49 2297 }
89f7b9aa 2298 }
7d4bcc4a 2299
bbc98134 2300 result = setlocale(LC_ALL, NULL);
48015184
KW
2301 DEBUG_L(STMT_START {
2302 dSAVE_ERRNO;
2303 PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
bbc98134 2304 __FILE__, __LINE__,
d188fe6d 2305 setlocale_debug_string_c(LC_ALL, NULL, result));
48015184
KW
2306 RESTORE_ERRNO;
2307 } STMT_END);
89f7b9aa 2308
bbc98134 2309 return result;
b385bb4d
KW
2310}
2311
2312#endif
2313
9aac5db8 2314/*
9aac5db8
KW
2315=for apidoc Perl_setlocale
2316
2317This is an (almost) drop-in replacement for the system L<C<setlocale(3)>>,
2318taking the same parameters, and returning the same information, except that it
9487427b
KW
2319returns the correct underlying C<LC_NUMERIC> locale. Regular C<setlocale> will
2320instead return C<C> if the underlying locale has a non-dot decimal point
2321character, or a non-empty thousands separator for displaying floating point
2322numbers. This is because perl keeps that locale category such that it has a
2323dot and empty separator, changing the locale briefly during the operations
2324where the underlying one is required. C<Perl_setlocale> knows about this, and
2325compensates; regular C<setlocale> doesn't.
9aac5db8 2326
e9bc6d6b 2327Another reason it isn't completely a drop-in replacement is that it is
9aac5db8 2328declared to return S<C<const char *>>, whereas the system setlocale omits the
163deff3
KW
2329C<const> (presumably because its API was specified long ago, and can't be
2330updated; it is illegal to change the information C<setlocale> returns; doing
2331so leads to segfaults.)
9aac5db8 2332
e9bc6d6b
KW
2333Finally, C<Perl_setlocale> works under all circumstances, whereas plain
2334C<setlocale> can be completely ineffective on some platforms under some
2335configurations.
2336
9aac5db8 2337C<Perl_setlocale> should not be used to change the locale except on systems
e9bc6d6b
KW
2338where the predefined variable C<${^SAFE_LOCALES}> is 1. On some such systems,
2339the system C<setlocale()> is ineffective, returning the wrong information, and
2340failing to actually change the locale. C<Perl_setlocale>, however works
2341properly in all circumstances.
9aac5db8
KW
2342
2343The return points to a per-thread static buffer, which is overwritten the next
2344time C<Perl_setlocale> is called from the same thread.
2345
2346=cut
2347
2348*/
2349
2350const char *
2351Perl_setlocale(const int category, const char * locale)
a4f00dcc
KW
2352{
2353 /* This wraps POSIX::setlocale() */
2354
52129632 2355#ifndef USE_LOCALE
d36adde0
KW
2356
2357 PERL_UNUSED_ARG(category);
2358 PERL_UNUSED_ARG(locale);
2359
2360 return "C";
2361
2362#else
2363
9aac5db8 2364 const char * retval;
c7dc1ea8 2365 unsigned int cat_index;
9aac5db8 2366 dSAVEDERRNO;
a4f00dcc 2367 dTHX;
0bb37942
KW
2368
2369 /* A NULL locale means only query what the current one is. */
2370 if (locale == NULL) {
2371
6a7aca81 2372# ifndef USE_LOCALE_NUMERIC
0bb37942 2373
6a7aca81
KW
2374 /* Without LC_NUMERIC, it's trivial; we just return the value */
2375 return querylocale_r(category);
a4f00dcc 2376
6a7aca81 2377# else
a4f00dcc 2378
0bb37942
KW
2379 /* We have the LC_NUMERIC name saved, because we are normally switched
2380 * into the C locale (or equivalent) for it. */
a4f00dcc 2381 if (category == LC_NUMERIC) {
9aac5db8
KW
2382
2383 /* We don't have to copy this return value, as it is a per-thread
2384 * variable, and won't change until a future setlocale */
2385 return PL_numeric_name;
a4f00dcc
KW
2386 }
2387
6a7aca81
KW
2388# ifndef LC_ALL
2389
2390 /* Without LC_ALL, just return the value */
2391 return querylocale_r(category);
2392
2393# else
2394
2395 /* Here, LC_ALL is available on this platform. It's the one
2396 * complicating category (because it can contain a toggled LC_NUMERIC
2397 * value), for all the remaining ones (we took care of LC_NUMERIC
2398 * above), just return the value */
2399 if (category != LC_ALL) {
2400 return querylocale_r(category);
2401 }
2402
2403 bool toggled = FALSE;
a4f00dcc 2404
0bb37942
KW
2405 /* For an LC_ALL query, switch back to the underlying numeric locale
2406 * (if we aren't there already) so as to get the correct results. Our
2407 * records for all the other categories are valid without switching */
6a7aca81 2408 if (! PL_numeric_underlying) {
996e6b9e
KW
2409 set_numeric_underlying();
2410 toggled = TRUE;
a4f00dcc
KW
2411 }
2412
6a7aca81 2413 retval = querylocale_c(LC_ALL);
996e6b9e
KW
2414
2415 if (toggled) {
0bb37942 2416
996e6b9e
KW
2417 /* This toggling back could destroy 'retval' */
2418 retval = save_to_buffer(retval,
2419 &PL_setlocale_buf, &PL_setlocale_bufsize, 0);
2420 set_numeric_standard();
6a7aca81 2421 }
0bb37942
KW
2422
2423 return retval;
6a7aca81
KW
2424
2425# endif /* Has LC_ALL */
2426# endif /* Has LC_NUMERIC */
2427
0bb37942 2428 } /* End of querying the current locale */
a4f00dcc 2429
c7dc1ea8
KW
2430 cat_index = get_category_index(category, NULL);
2431 retval = save_to_buffer(setlocale_i(cat_index, locale),
33e5a354 2432 &PL_setlocale_buf, &PL_setlocale_bufsize, 0);
9aac5db8
KW
2433 SAVE_ERRNO;
2434
a4f00dcc
KW
2435 DEBUG_L(PerlIO_printf(Perl_debug_log,
2436 "%s:%d: %s\n", __FILE__, __LINE__,
d188fe6d 2437 setlocale_debug_string_r(category, locale, retval)));
7d4bcc4a 2438
9aac5db8
KW
2439 RESTORE_ERRNO;
2440
2441 if (! retval) {
a4f00dcc
KW
2442 return NULL;
2443 }
2444
fe1c1494
KW
2445 /* Now that have changed locales, we have to update our records to
2446 * correspond. Only certain categories have extra work to update. */
c7dc1ea8
KW
2447 if (update_functions[cat_index]) {
2448 update_functions[cat_index](aTHX_ retval);
a4f00dcc
KW
2449 }
2450
2451 return retval;
2452
d36adde0
KW
2453#endif
2454
f7416781
KW
2455}
2456
2457PERL_STATIC_INLINE const char *
f3e90222 2458S_save_to_buffer(const char * string, const char **buf, Size_t *buf_size,
fe1c1494 2459 const Size_t offset)
f7416781 2460{
fe1c1494
KW
2461 /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size
2462 * 'buf_size', growing it if necessary */
f7416781 2463
0b6697c5 2464 Size_t string_size;
f7416781
KW
2465
2466 PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
2467
0b6697c5
KW
2468 if (! string) {
2469 return NULL;
2470 }
2471
2472 string_size = strlen(string) + offset + 1;
2473
f7416781
KW
2474 if (*buf_size == 0) {
2475 Newx(*buf, string_size, char);
2476 *buf_size = string_size;
2477 }
2478 else if (string_size > *buf_size) {
2479 Renew(*buf, string_size, char);
2480 *buf_size = string_size;
2481 }
2482
2483 Copy(string, *buf + offset, string_size - offset, char);
2484 return *buf;
2485}
2486
2487/*
2488
f7416781
KW
2489=for apidoc Perl_langinfo
2490
ce181bf6 2491This is an (almost) drop-in replacement for the system C<L<nl_langinfo(3)>>,
f7416781
KW
2492taking the same C<item> parameter values, and returning the same information.
2493But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
2494of Perl's locale handling from your code, and can be used on systems that lack
2495a native C<nl_langinfo>.
2496
2497Expanding on these:
2498
2499=over
2500
2501=item *
2502
ce181bf6
KW
2503The reason it isn't quite a drop-in replacement is actually an advantage. The
2504only difference is that it returns S<C<const char *>>, whereas plain
2505C<nl_langinfo()> returns S<C<char *>>, but you are (only by documentation)
2506forbidden to write into the buffer. By declaring this C<const>, the compiler
2507enforces this restriction, so if it is violated, you know at compilation time,
2508rather than getting segfaults at runtime.
2509
2510=item *
2511
69e2275e 2512It delivers the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
f7416781
KW
2513without you having to write extra code. The reason for the extra code would be
2514because these are from the C<LC_NUMERIC> locale category, which is normally
9487427b
KW
2515kept set by Perl so that the radix is a dot, and the separator is the empty
2516string, no matter what the underlying locale is supposed to be, and so to get
2517the expected results, you have to temporarily toggle into the underlying
2518locale, and later toggle back. (You could use plain C<nl_langinfo> and
2519C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this but then you wouldn't get
2520the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
2521(or equivalent) locale would break a lot of CPAN, which is expecting the radix
2522(decimal point) character to be a dot.)
ce181bf6
KW
2523
2524=item *
2525
2526The system function it replaces can have its static return buffer trashed,
f1460a66 2527not only by a subsequent call to that function, but by a C<freelocale>,
ce181bf6
KW
2528C<setlocale>, or other locale change. The returned buffer of this function is
2529not changed until the next call to it, so the buffer is never in a trashed
2530state.
f7416781
KW
2531
2532=item *
2533
ce181bf6
KW
2534Its return buffer is per-thread, so it also is never overwritten by a call to
2535this function from another thread; unlike the function it replaces.
2536
2537=item *
2538
2539But most importantly, it works on systems that don't have C<nl_langinfo>, such
2540as Windows, hence makes your code more portable. Of the fifty-some possible
2541items specified by the POSIX 2008 standard,
f7416781 2542L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
8d72e74e
KW
2543only one is completely unimplemented, though on non-Windows platforms, another
2544significant one is also not implemented). It uses various techniques to
2545recover the other items, including calling C<L<localeconv(3)>>, and
2546C<L<strftime(3)>>, both of which are specified in C89, so should be always be
2547available. Later C<strftime()> versions have additional capabilities; C<""> is
2548returned for those not available on your system.
ce181bf6
KW
2549
2550It is important to note that when called with an item that is recovered by
2551using C<localeconv>, the buffer from any previous explicit call to
2552C<localeconv> will be overwritten. This means you must save that buffer's
c3997e39
KW
2553contents if you need to access them after a call to this function. (But note
2554that you might not want to be using C<localeconv()> directly anyway, because of
2555issues like the ones listed in the second item of this list (above) for
2556C<RADIXCHAR> and C<THOUSEP>. You can use the methods given in L<perlcall> to
2557call L<POSIX/localeconv> and avoid all the issues, but then you have a hash to
2558unpack).
9f0bc911 2559
6201c220
KW
2560The details for those items which may deviate from what this emulation returns
2561and what a native C<nl_langinfo()> would return are specified in
2562L<I18N::Langinfo>.
f7416781 2563
ce181bf6
KW
2564=back
2565
f7416781
KW
2566When using C<Perl_langinfo> on systems that don't have a native
2567C<nl_langinfo()>, you must
2568
2569 #include "perl_langinfo.h"
2570
2571before the C<perl.h> C<#include>. You can replace your C<langinfo.h>
2572C<#include> with this one. (Doing it this way keeps out the symbols that plain
c3997e39
KW
2573C<langinfo.h> would try to import into the namespace for code that doesn't need
2574it.)
f7416781 2575
f7416781
KW
2576The original impetus for C<Perl_langinfo()> was so that code that needs to
2577find out the current currency symbol, floating point radix character, or digit
2578grouping separator can use, on all systems, the simpler and more
2579thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
2580pain to make thread-friendly. For other fields returned by C<localeconv>, it
2581is better to use the methods given in L<perlcall> to call
2582L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
2583
2584=cut
2585
2586*/
2587
2588const char *
2589#ifdef HAS_NL_LANGINFO
2590Perl_langinfo(const nl_item item)
2591#else
2592Perl_langinfo(const int item)
2593#endif
2594{
f61748ac
KW
2595 return my_nl_langinfo(item, TRUE);
2596}
2597
59a15af0 2598STATIC const char *
fe1c1494 2599# ifdef HAS_NL_LANGINFO
f61748ac 2600S_my_nl_langinfo(const nl_item item, bool toggle)
fe1c1494 2601# else
f61748ac 2602S_my_nl_langinfo(const int item, bool toggle)
fe1c1494 2603# endif
f61748ac 2604{
ae74815b 2605 dTHX;
0b6697c5 2606 const char * retval;
f7416781 2607
fe1c1494 2608# ifdef USE_LOCALE_NUMERIC
d36adde0 2609
5a854ab3
KW
2610 /* We only need to toggle into the underlying LC_NUMERIC locale for these
2611 * two items, and only if not already there */
4e6826bf 2612 if (toggle && (( item != RADIXCHAR && item != THOUSEP)
5a854ab3 2613 || PL_numeric_underlying))
d36adde0 2614
fe1c1494 2615# endif /* No toggling needed if not using LC_NUMERIC */
d36adde0 2616
5a854ab3 2617 toggle = FALSE;
5a854ab3 2618
fe1c1494
KW
2619/*--------------------------------------------------------------------------*/
2620/* Above is the common beginning to all the implementations of my_langinfo().
2621 * Below are the various completions */
2622# if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
ee90eb2d 2623# if ! defined(HAS_THREAD_SAFE_NL_LANGINFO_L) \
4b06b4d0 2624 || ! defined(USE_POSIX_2008_LOCALE)
f7416781 2625
ab340fff 2626 /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
ae74815b
KW
2627 * for those items dependent on it. This must be copied to a buffer before
2628 * switching back, as some systems destroy the buffer when setlocale() is
2629 * called */
f7416781 2630
038d3702
KW
2631 {
2632 DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
2633
b2fee59c 2634 if (toggle) {
038d3702 2635 STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
b2fee59c 2636 }
f7416781 2637
7953f73f
KW
2638 /* Prevent interference from another thread executing this code
2639 * section. */
2640 NL_LANGINFO_LOCK;
ee90eb2d 2641
628ff75a
KW
2642 /* Copy to a per-thread buffer, which is also one that won't be
2643 * destroyed by a subsequent setlocale(), such as the
2644 * RESTORE_LC_NUMERIC may do just below. */
0b6697c5 2645 retval = save_to_buffer(nl_langinfo(item),
f3e90222
KW
2646 ((const char **) &PL_langinfo_buf),
2647 &PL_langinfo_bufsize, 0);
7953f73f 2648 NL_LANGINFO_UNLOCK;
5acc4454 2649
b2fee59c 2650 if (toggle) {
038d3702 2651 RESTORE_LC_NUMERIC();
b2fee59c 2652 }
f7416781 2653 }
fe1c1494
KW
2654/*--------------------------------------------------------------------------*/
2655# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the
2656 locale. */
ab340fff 2657
5a854ab3 2658 {
b2fee59c
KW
2659 bool do_free = FALSE;
2660 locale_t cur = uselocale((locale_t) 0);
ab340fff 2661
b2fee59c
KW
2662 if (cur == LC_GLOBAL_LOCALE) {
2663 cur = duplocale(LC_GLOBAL_LOCALE);
2664 do_free = TRUE;
2665 }
ab340fff 2666
d36adde0
KW
2667# ifdef USE_LOCALE_NUMERIC
2668
b2fee59c 2669 if (toggle) {
e1aa2579
KW
2670 if (PL_underlying_numeric_obj) {
2671 cur = PL_underlying_numeric_obj;
2672 }
2673 else {
2674 cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
2675 do_free = TRUE;
2676 }
b2fee59c 2677 }
ab340fff 2678
d36adde0
KW
2679# endif
2680
628ff75a
KW
2681 /* We have to save it to a buffer, because the freelocale() just below
2682 * can invalidate the internal one */
0b6697c5 2683 retval = save_to_buffer(nl_langinfo_l(item, cur),
f3e90222
KW
2684 ((const char **) &PL_langinfo_buf),
2685 &PL_langinfo_bufsize, 0);
ee90eb2d 2686
b2fee59c
KW
2687 if (do_free) {
2688 freelocale(cur);
2689 }
5a854ab3 2690 }
ab340fff 2691
c1566110
KW
2692# endif
2693
fe1c1494 2694 /* We can return 'yes' and 'no' even if we didn't get a result */
0b6697c5 2695 if (strEQ(retval, "")) {
4e6826bf 2696 if (item == YESSTR) {
c1566110
KW
2697 return "yes";
2698 }
4e6826bf 2699 if (item == NOSTR) {
c1566110
KW
2700 return "no";
2701 }
2702 }
2703
0b6697c5 2704 return retval;
fe1c1494
KW
2705/*--------------------------------------------------------------------------*/
2706# else /* Below, emulate nl_langinfo as best we can */
43dd6b15
KW
2707
2708 {
2709
f7416781
KW
2710# ifdef HAS_LOCALECONV
2711
43dd6b15 2712 const struct lconv* lc;
628ff75a 2713 const char * temp;
038d3702 2714 DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
f7416781 2715
69c5e0db
KW
2716# ifdef TS_W32_BROKEN_LOCALECONV
2717
2718 const char * save_global;
2719 const char * save_thread;
2720 int needed_size;
2721 char * ptr;
2722 char * e;
2723 char * item_start;
2724
2725# endif
f7416781
KW
2726# endif
2727# ifdef HAS_STRFTIME
2728
43dd6b15
KW
2729 struct tm tm;
2730 bool return_format = FALSE; /* Return the %format, not the value */
2731 const char * format;
f7416781
KW
2732
2733# endif
2734
43dd6b15
KW
2735 /* We copy the results to a per-thread buffer, even if not
2736 * multi-threaded. This is in part to simplify this code, and partly
2737 * because we need a buffer anyway for strftime(), and partly because a
2738 * call of localeconv() could otherwise wipe out the buffer, and the
2739 * programmer would not be expecting this, as this is a nl_langinfo()
2740 * substitute after all, so s/he might be thinking their localeconv()
2741 * is safe until another localeconv() call. */
f7416781 2742
43dd6b15
KW
2743 switch (item) {
2744 Size_t len;
f7416781 2745
8d72e74e 2746 /* This is unimplemented */
4e6826bf 2747 case ERA: /* For use with strftime() %E modifier */
f7416781 2748
43dd6b15
KW
2749 default:
2750 return "";
f7416781 2751
43dd6b15 2752 /* We use only an English set, since we don't know any more */
4e6826bf
KW
2753 case YESEXPR: return "^[+1yY]";
2754 case YESSTR: return "yes";
2755 case NOEXPR: return "^[-0nN]";
2756 case NOSTR: return "no";
2757
8d72e74e
KW
2758 case CODESET:
2759
2760# ifndef WIN32
2761
2762 /* On non-windows, this is unimplemented, in part because of
2763 * inconsistencies between vendors. The Darwin native
2764 * nl_langinfo() implementation simply looks at everything past
2765 * any dot in the name, but that doesn't work for other
2766 * vendors. Many Linux locales that don't have UTF-8 in their
2767 * names really are UTF-8, for example; z/OS locales that do
2768 * have UTF-8 in their names, aren't really UTF-8 */
2769 return "";
2770
2771# else
2772
2773 { /* But on Windows, the name does seem to be consistent, so
2774 use that. */
2775 const char * p;
2776 const char * first;
2777 Size_t offset = 0;
9b271b93 2778 const char * name = querylocale_c(LC_CTYPE);
8d72e74e
KW
2779
2780 if (isNAME_C_OR_POSIX(name)) {
2781 return "ANSI_X3.4-1968";
2782 }
2783
2784 /* Find the dot in the locale name */
2785 first = (const char *) strchr(name, '.');
2786 if (! first) {
2787 first = name;
2788 goto has_nondigit;
2789 }
2790
2791 /* Look at everything past the dot */
2792 first++;
2793 p = first;
2794
2795 while (*p) {
2796 if (! isDIGIT(*p)) {
2797 goto has_nondigit;
2798 }
2799
2800 p++;
2801 }
2802
2803 /* Here everything past the dot is a digit. Treat it as a
2804 * code page */
f3e90222
KW
2805 retval = save_to_buffer("CP",
2806 ((const char **) &PL_langinfo_buf),
2807 &PL_langinfo_bufsize, 0);
8d72e74e 2808 offset = STRLENs("CP");
f7416781 2809
8d72e74e
KW
2810 has_nondigit:
2811
f3e90222
KW
2812 retval = save_to_buffer(first,
2813 ((const char **) &PL_langinfo_buf),
8d72e74e
KW
2814 &PL_langinfo_bufsize, offset);
2815 }
2816
2817 break;
2818
2819# endif
f7416781
KW
2820# ifdef HAS_LOCALECONV
2821
4e6826bf 2822 case CRNCYSTR:
f7416781 2823
43dd6b15
KW
2824 /* We don't bother with localeconv_l() because any system that
2825 * has it is likely to also have nl_langinfo() */
291a84fb 2826
8609fe00
KW
2827 LOCALECONV_LOCK; /* Prevent interference with other threads
2828 using localeconv() */
69c5e0db
KW
2829
2830# ifdef TS_W32_BROKEN_LOCALECONV
2831
2832 /* This is a workaround for a Windows bug prior to VS 15.
2833 * What we do here is, while locked, switch to the global
2834 * locale so localeconv() works; then switch back just before
2835 * the unlock. This can screw things up if some thread is
2836 * already using the global locale while assuming no other is.
2837 * A different workaround would be to call GetCurrencyFormat on
2838 * a known value, and parse it; patches welcome
2839 *
2840 * We have to use LC_ALL instead of LC_MONETARY because of
2841 * another bug in Windows */
2842
9b271b93 2843 save_thread = savepv(querylocale_c(LC_ALL));
69c5e0db 2844 _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
9b271b93 2845 save_global= savepv(querylocale_c(LC_ALL));
3980cddb 2846 void_setlocale_c(LC_ALL, save_thread);
69c5e0db
KW
2847
2848# endif
5acc4454 2849
43dd6b15 2850 lc = localeconv();
fe1c1494 2851
43dd6b15
KW
2852 if ( ! lc
2853 || ! lc->currency_symbol
2854 || strEQ("", lc->currency_symbol))
2855 {
8609fe00 2856 LOCALECONV_UNLOCK;
43dd6b15
KW
2857 return "";
2858 }
f7416781 2859
43dd6b15 2860 /* Leave the first spot empty to be filled in below */
f3e90222
KW
2861 retval = save_to_buffer(lc->currency_symbol,
2862 ((const char **) &PL_langinfo_buf),
0b6697c5 2863 &PL_langinfo_bufsize, 1);
43dd6b15
KW
2864 if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
2865 { /* khw couldn't figure out how the localedef specifications
2866 would show that the $ should replace the radix; this is
2867 just a guess as to how it might work.*/
a34edee3 2868 PL_langinfo_buf[0] = '.';
43dd6b15
KW
2869 }
2870 else if (lc->p_cs_precedes) {
a34edee3 2871 PL_langinfo_buf[0] = '-';
43dd6b15
KW
2872 }
2873 else {
a34edee3 2874 PL_langinfo_buf[0] = '+';
43dd6b15 2875 }
f7416781 2876
69c5e0db
KW
2877# ifdef TS_W32_BROKEN_LOCALECONV
2878
3980cddb 2879 void_setlocale_c(LC_ALL, save_global);
69c5e0db 2880 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
3980cddb 2881 void_setlocale_c(LC_ALL, save_thread);
69c5e0db
KW
2882 Safefree(save_global);
2883 Safefree(save_thread);
2884
2885# endif
2886
8609fe00 2887 LOCALECONV_UNLOCK;
43dd6b15 2888 break;
f7416781 2889
69c5e0db
KW
2890# ifdef TS_W32_BROKEN_LOCALECONV
2891
4e6826bf 2892 case RADIXCHAR:
69c5e0db
KW
2893
2894 /* For this, we output a known simple floating point number to
2895 * a buffer, and parse it, looking for the radix */
2896
2897 if (toggle) {
2898 STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
2899 }
2900
2901 if (PL_langinfo_bufsize < 10) {
2902 PL_langinfo_bufsize = 10;
2903 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
2904 }
2905
2906 needed_size = my_snprintf(PL_langinfo_buf, PL_langinfo_bufsize,
2907 "%.1f", 1.5);
2908 if (needed_size >= (int) PL_langinfo_bufsize) {
2909 PL_langinfo_bufsize = needed_size + 1;
2910 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
fe1c1494
KW
2911 needed_size
2912 = my_snprintf(PL_langinfo_buf, PL_langinfo_bufsize,
69c5e0db
KW
2913 "%.1f", 1.5);
2914 assert(needed_size < (int) PL_langinfo_bufsize);
2915 }
2916
2917 ptr = PL_langinfo_buf;
2918 e = PL_langinfo_buf + PL_langinfo_bufsize;
2919
2920 /* Find the '1' */
2921 while (ptr < e && *ptr != '1') {
2922 ptr++;
2923 }
2924 ptr++;
2925
2926 /* Find the '5' */
2927 item_start = ptr;
2928 while (ptr < e && *ptr != '5') {
2929 ptr++;
2930 }
2931
2932 /* Everything in between is the radix string */
2933 if (ptr >= e) {
2934 PL_langinfo_buf[0] = '?';
2935 PL_langinfo_buf[1] = '\0';
2936 }
2937 else {
2938 *ptr = '\0';
fe1c1494
KW
2939 Move(item_start, PL_langinfo_buf, ptr - PL_langinfo_buf,
2940 char);
69c5e0db
KW
2941 }
2942
2943 if (toggle) {
2944 RESTORE_LC_NUMERIC();
2945 }
2946
2947 retval = PL_langinfo_buf;
2948 break;
2949
2950# else
2951
2952 case RADIXCHAR: /* No special handling needed */
2953
2954# endif
2955
4e6826bf 2956 case THOUSEP:
f7416781 2957
43dd6b15 2958 if (toggle) {
038d3702 2959 STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
c0d737a8 2960 }
f7416781 2961
8609fe00
KW
2962 LOCALECONV_LOCK; /* Prevent interference with other threads
2963 using localeconv() */
69c5e0db
KW
2964
2965# ifdef TS_W32_BROKEN_LOCALECONV
2966
2967 /* This should only be for the thousands separator. A
2968 * different work around would be to use GetNumberFormat on a
2969 * known value and parse the result to find the separator */
9b271b93 2970 save_thread = savepv(querylocale_c(LC_ALL));
69c5e0db 2971 _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
9b271b93 2972 save_global = savepv(querylocale_c(LC_ALL));
3980cddb 2973 void_setlocale_c(LC_ALL, save_thread);
69c5e0db
KW
2974# if 0
2975 /* This is the start of code that for broken Windows replaces
2976 * the above and below code, and instead calls
2977 * GetNumberFormat() and then would parse that to find the
2978 * thousands separator. It needs to handle UTF-16 vs -8
2979 * issues. */
2980
fe1c1494
KW
2981 needed_size = GetNumberFormatEx(PL_numeric_name, 0, "1234.5",
2982 NULL, PL_langinfo_buf, PL_langinfo_bufsize);
69c5e0db
KW
2983 DEBUG_L(PerlIO_printf(Perl_debug_log,
2984 "%s: %d: return from GetNumber, count=%d, val=%s\n",
2985 __FILE__, __LINE__, needed_size, PL_langinfo_buf));
2986
2987# endif
2988# endif
5acc4454 2989
43dd6b15
KW
2990 lc = localeconv();
2991 if (! lc) {
628ff75a 2992 temp = "";
33394adc 2993 }
43dd6b15 2994 else {
4e6826bf 2995 temp = (item == RADIXCHAR)
43dd6b15
KW
2996 ? lc->decimal_point
2997 : lc->thousands_sep;
628ff75a
KW
2998 if (! temp) {
2999 temp = "";
43dd6b15
KW
3000 }
3001 }
f7416781 3002
f3e90222
KW
3003 retval = save_to_buffer(temp,
3004 ((const char **) &PL_langinfo_buf),
0b6697c5 3005 &PL_langinfo_bufsize, 0);
f7416781 3006
69c5e0db
KW
3007# ifdef TS_W32_BROKEN_LOCALECONV
3008
3980cddb 3009 void_setlocale_c(LC_ALL, save_global);
69c5e0db 3010 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
3980cddb 3011 void_setlocale_c(LC_ALL, save_thread);
69c5e0db
KW
3012 Safefree(save_global);
3013 Safefree(save_thread);
3014
3015# endif
3016
8609fe00 3017 LOCALECONV_UNLOCK;
5acc4454 3018
43dd6b15 3019 if (toggle) {
038d3702 3020 RESTORE_LC_NUMERIC();
43dd6b15 3021 }
f7416781 3022
43dd6b15 3023 break;
f7416781
KW
3024
3025# endif
3026# ifdef HAS_STRFTIME
3027
43dd6b15
KW
3028 /* These are defined by C89, so we assume that strftime supports
3029 * them, and so are returned unconditionally; they may not be what
3030 * the locale actually says, but should give good enough results
3031 * for someone using them as formats (as opposed to trying to parse
3032 * them to figure out what the locale says). The other format
3033 * items are actually tested to verify they work on the platform */
4e6826bf
KW
3034 case D_FMT: return "%x";
3035 case T_FMT: return "%X";
3036 case D_T_FMT: return "%c";
43dd6b15
KW
3037
3038 /* These formats are only available in later strfmtime's */
4e6826bf 3039 case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM:
43dd6b15
KW
3040
3041 /* The rest can be gotten from most versions of strftime(). */
4e6826bf
KW
3042 case ABDAY_1: case ABDAY_2: case ABDAY_3:
3043 case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7:
3044 case ALT_DIGITS:
3045 case AM_STR: case PM_STR:
3046 case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4:
3047 case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8:
3048 case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12:
3049 case DAY_1: case DAY_2: case DAY_3: case DAY_4:
3050 case DAY_5: case DAY_6: case DAY_7:
3051 case MON_1: case MON_2: case MON_3: case MON_4:
3052 case MON_5: case MON_6: case MON_7: case MON_8:
3053 case MON_9: case MON_10: case MON_11: case MON_12:
43dd6b15 3054
43dd6b15
KW
3055 init_tm(&tm); /* Precaution against core dumps */
3056 tm.tm_sec = 30;
3057 tm.tm_min = 30;
3058 tm.tm_hour = 6;
3059 tm.tm_year = 2017 - 1900;
3060 tm.tm_wday = 0;
3061 tm.tm_mon = 0;
639a7c35
KW
3062
3063 GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
3064
43dd6b15
KW
3065 switch (item) {
3066 default:
43dd6b15
KW
3067 Perl_croak(aTHX_
3068 "panic: %s: %d: switch case: %d problem",
3069 __FILE__, __LINE__, item);
3070 NOT_REACHED; /* NOTREACHED */
3071
4e6826bf
KW
3072 case PM_STR: tm.tm_hour = 18;
3073 case AM_STR:
43dd6b15
KW
3074 format = "%p";
3075 break;
3076
4e6826bf
KW
3077 case ABDAY_7: tm.tm_wday++;
3078 case ABDAY_6: tm.tm_wday++;
3079 case ABDAY_5: tm.tm_wday++;
3080 case ABDAY_4: tm.tm_wday++;
3081 case ABDAY_3: tm.tm_wday++;
3082 case ABDAY_2: tm.tm_wday++;
3083 case ABDAY_1:
43dd6b15
KW
3084 format = "%a";
3085 break;
3086
4e6826bf
KW
3087 case DAY_7: tm.tm_wday++;
3088 case DAY_6: tm.tm_wday++;
3089 case DAY_5: tm.tm_wday++;
3090 case DAY_4: tm.tm_wday++;
3091 case DAY_3: tm.tm_wday++;
3092 case DAY_2: tm.tm_wday++;
3093 case DAY_1:
43dd6b15
KW
3094 format = "%A";
3095 break;
3096
4e6826bf
KW
3097 case ABMON_12: tm.tm_mon++;
3098 case ABMON_11: tm.tm_mon++;
3099 case ABMON_10: tm.tm_mon++;
3100 case ABMON_9: tm.tm_mon++;
3101 case ABMON_8: tm.tm_mon++;
3102 case ABMON_7: tm.tm_mon++;
3103 case ABMON_6: tm.tm_mon++;
3104 case ABMON_5: tm.tm_mon++;
3105 case ABMON_4: tm.tm_mon++;
3106 case ABMON_3: tm.tm_mon++;
3107 case ABMON_2: tm.tm_mon++;
3108 case ABMON_1:
43dd6b15
KW
3109 format = "%b";
3110 break;
3111
4e6826bf
KW
3112 case MON_12: tm.tm_mon++;
3113 case MON_11: tm.tm_mon++;
3114 case MON_10: tm.tm_mon++;
3115 case MON_9: tm.tm_mon++;
3116 case MON_8: tm.tm_mon++;
3117 case MON_7: tm.tm_mon++;
3118 case MON_6: tm.tm_mon++;
3119 case MON_5: tm.tm_mon++;
3120 case MON_4: tm.tm_mon++;
3121 case MON_3: tm.tm_mon++;
3122 case MON_2: tm.tm_mon++;
3123 case MON_1:
43dd6b15
KW
3124 format = "%B";
3125 break;
3126
4e6826bf 3127 case T_FMT_AMPM:
43dd6b15
KW
3128 format = "%r";
3129 return_format = TRUE;
3130 break;
3131
4e6826bf 3132 case ERA_D_FMT:
43dd6b15
KW
3133 format = "%Ex";
3134 return_format = TRUE;
3135 break;
3136
4e6826bf 3137 case ERA_T_FMT:
43dd6b15
KW
3138 format = "%EX";
3139 return_format = TRUE;
3140 break;
3141
4e6826bf 3142 case ERA_D_T_FMT:
43dd6b15
KW
3143 format = "%Ec";
3144 return_format = TRUE;
3145 break;
3146
4e6826bf 3147 case ALT_DIGITS:
43dd6b15
KW
3148 tm.tm_wday = 0;
3149 format = "%Ow"; /* Find the alternate digit for 0 */
3150 break;
3151 }
f7416781 3152
639a7c35
KW
3153 GCC_DIAG_RESTORE_STMT;
3154
43dd6b15
KW
3155 /* We can't use my_strftime() because it doesn't look at
3156 * tm_wday */
3157 while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
3158 format, &tm))
3159 {
3160 /* A zero return means one of:
3161 * a) there wasn't enough space in PL_langinfo_buf
3162 * b) the format, like a plain %p, returns empty
3163 * c) it was an illegal format, though some
3164 * implementations of strftime will just return the
3165 * illegal format as a plain character sequence.
3166 *
3167 * To quickly test for case 'b)', try again but precede
3168 * the format with a plain character. If that result is
3169 * still empty, the problem is either 'a)' or 'c)' */
3170
3171 Size_t format_size = strlen(format) + 1;
3172 Size_t mod_size = format_size + 1;
3173 char * mod_format;
3174 char * temp_result;
3175
3176 Newx(mod_format, mod_size, char);
3177 Newx(temp_result, PL_langinfo_bufsize, char);
6873aa47 3178 *mod_format = ' ';
43dd6b15
KW
3179 my_strlcpy(mod_format + 1, format, mod_size);
3180 len = strftime(temp_result,
3181 PL_langinfo_bufsize,
3182 mod_format, &tm);
3183 Safefree(mod_format);
3184 Safefree(temp_result);
3185
3186 /* If 'len' is non-zero, it means that we had a case like
3187 * %p which means the current locale doesn't use a.m. or
3188 * p.m., and that is valid */
3189 if (len == 0) {
3190
3191 /* Here, still didn't work. If we get well beyond a
3192 * reasonable size, bail out to prevent an infinite
3193 * loop. */
3194
3195 if (PL_langinfo_bufsize > 100 * format_size) {
3196 *PL_langinfo_buf = '\0';
3197 }
3198 else {
3199 /* Double the buffer size to retry; Add 1 in case
3200 * original was 0, so we aren't stuck at 0. */
3201 PL_langinfo_bufsize *= 2;
3202 PL_langinfo_bufsize++;
3203 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
3204 continue;
3205 }
3206 }
f7416781 3207
f7416781 3208 break;
43dd6b15 3209 }
f7416781 3210
43dd6b15
KW
3211 /* Here, we got a result.
3212 *
fe1c1494
KW
3213 * If the item is 'ALT_DIGITS', 'PL_langinfo_buf' contains the
3214 * alternate format for wday 0. If the value is the same as the
3215 * normal 0, there isn't an alternate, so clear the buffer. */
3216 if (item == ALT_DIGITS && strEQ(PL_langinfo_buf, "0")) {
43dd6b15
KW
3217 *PL_langinfo_buf = '\0';
3218 }
f7416781 3219
43dd6b15 3220 /* ALT_DIGITS is problematic. Experiments on it showed that
fe1c1494
KW
3221 * strftime() did not always work properly when going from alt-9 to
3222 * alt-10. Only a few locales have this item defined, and in all
3223 * of them on Linux that khw was able to find, nl_langinfo() merely
3224 * returned the alt-0 character, possibly doubled. Most Unicode
3225 * digits are in blocks of 10 consecutive code points, so that is
3226 * sufficient information for such scripts, as we can infer alt-1,
3227 * alt-2, .... But for a Japanese locale, a CJK ideographic 0 is
3228 * returned, and the CJK digits are not in code point order, so you
3229 * can't really infer anything. The localedef for this locale did
3230 * specify the succeeding digits, so that strftime() works properly
3231 * on them, without needing to infer anything. But the
3232 * nl_langinfo() return did not give sufficient information for the
3233 * caller to understand what's going on. So until there is
3234 * evidence that it should work differently, this returns the alt-0
3235 * string for ALT_DIGITS.
43dd6b15
KW
3236 *
3237 * wday was chosen because its range is all a single digit.
3238 * Things like tm_sec have two digits as the minimum: '00' */
f7416781 3239
d1b150d9
KW
3240 retval = PL_langinfo_buf;
3241
43dd6b15 3242 /* If to return the format, not the value, overwrite the buffer
fe1c1494
KW
3243 * with it. But some strftime()s will keep the original format if
3244 * illegal, so change those to "" */
43dd6b15
KW
3245 if (return_format) {
3246 if (strEQ(PL_langinfo_buf, format)) {
f7416781
KW
3247 *PL_langinfo_buf = '\0';
3248 }
43dd6b15 3249 else {
f3e90222
KW
3250 retval = save_to_buffer(format,
3251 ((const char **) &PL_langinfo_buf),
0b6697c5 3252 &PL_langinfo_bufsize, 0);
f7416781
KW
3253 }
3254 }
3255
3256 break;
f7416781
KW
3257
3258# endif
3259
43dd6b15 3260 }
f7416781
KW
3261 }
3262
0b6697c5 3263 return retval;
f7416781 3264
fe1c1494
KW
3265# endif
3266/*--------------------------------------------------------------------------*/
a4f00dcc 3267}
b385bb4d 3268
98994639
HS
3269/*
3270 * Initialize locale awareness.
3271 */
3272int
3273Perl_init_i18nl10n(pTHX_ int printwarn)
3274{
0e92a118
KW
3275 /* printwarn is
3276 *
3277 * 0 if not to output warning when setup locale is bad
3278 * 1 if to output warning based on value of PERL_BADLANG
3279 * >1 if to output regardless of PERL_BADLANG
3280 *
3281 * returns
98994639 3282 * 1 = set ok or not applicable,
0e92a118
KW
3283 * 0 = fallback to a locale of lower priority
3284 * -1 = fallback to all locales failed, not even to the C locale
6b058d42
KW
3285 *
3286 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
3287 * set, debugging information is output.
3288 *
b85af0ab
KW
3289 * This looks more complicated than it is, mainly due to the #ifdefs and
3290 * error handling.
6b058d42 3291 *
b85af0ab 3292 * Besides some asserts, data structure initialization, and specific
fe1c1494
KW
3293 * platform complications, this routine is effectively represented by this
3294 * pseudo-code:
b85af0ab 3295 *
fe1c1494
KW
3296 * setlocale(LC_ALL, ""); x
3297 * foreach (subcategory) { x
3298 * curlocales[f(subcategory)] = setlocale(subcategory, NULL); x
3299 * } x
3300 * if (platform_so_requires) {
3301 * foreach (subcategory) {
3302 * PL_curlocales[f(subcategory)] = curlocales[f(subcategory)]
3303 * }
3304 * }
3305 * foreach (subcategory) {
3306 * if (needs_special_handling[f(subcategory)] &this_subcat_handler
3307 * }
b85af0ab 3308 *
fe1c1494
KW
3309 * This sets all the categories to the values in the current environment,
3310 * saves them temporarily in curlocales[] until they can be handled and/or
3311 * on some platforms saved in a per-thread array PL_curlocales[].
b85af0ab 3312 *
fe1c1494
KW
3313 * f(foo) is a mapping from the opaque system category numbers to small
3314 * non-negative integers used most everywhere in this file as indices into
3315 * arrays (such as curlocales[]) so the program doesn't have to otherwise
3316 * deal with the opaqueness.
b85af0ab 3317 *
fe1c1494
KW
3318 * If the platform doesn't have LC_ALL, the lines marked 'x' above are
3319 * effectively replaced by:
3320 * foreach (subcategory) { y
3321 * curlocales[f(subcategory)] = setlocale(subcategory, ""); y
3322 * } y
b85af0ab 3323 *
fe1c1494
KW
3324 * The only differences being the lack of an LC_ALL call, and using ""
3325 * instead of NULL in the setlocale calls.
b85af0ab 3326 *
fe1c1494 3327 * But there are, of course, complications.
b85af0ab 3328 *
fe1c1494
KW
3329 * it has to deal with if this is an embedded perl, whose locale doesn't
3330 * come from the environment, but has been set up by the caller. This is
3331 * pretty simply handled: the "" in the setlocale calls is not a string
3332 * constant, but a variable which is set to NULL in the embedded case.
b85af0ab 3333 *
fe1c1494
KW
3334 * But the major complication is handling failure and doing fallback. All
3335 * the code marked 'x' or 'y' above is actually enclosed in an outer loop,
3336 * using the array trial_locales[]. On entry, trial_locales[] is
3337 * initialized to just one entry, containing the NULL or "" locale argument
3338 * shown above. If, as is almost always the case, everything works, it
3339 * exits after just the one iteration, going on to the next step.
b85af0ab 3340 *
fe1c1494
KW
3341 * But if there is a failure, the code tries its best to honor the
3342 * environment as much as possible. It self-modifies trial_locales[] to
3343 * have more elements, one for each of the POSIX-specified settings from
3344 * the environment, such as LANG, ending in the ultimate fallback, the C
3345 * locale. Thus if there is something bogus with a higher priority
3346 * environment variable, it will try with the next highest, until something
3347 * works. If everything fails, it limps along with whatever state it got
3348 * to.
b85af0ab
KW
3349 *
3350 * A further complication is that Windows has an additional fallback, the
3351 * user-default ANSI code page obtained from the operating system. This is
3352 * added as yet another loop iteration, just before the final "C"
6b058d42 3353 *
1aa8e6dd
KW
3354 * A slight complication is that in embedded Perls, the locale may already
3355 * be set-up, and we don't want to get it from the normal environment
3356 * variables. This is handled by having a special environment variable
3357 * indicate we're in this situation. We simply set setlocale's 2nd
3358 * parameter to be a NULL instead of "". That indicates to setlocale that
3359 * it is not to change anything, but to return the current value,
3360 * effectively initializing perl's db to what the locale already is.
3361 *
3362 * We play the same trick with NULL if a LC_ALL succeeds. We call
3363 * setlocale() on the individual categores with NULL to get their existing
3364 * values for our db, instead of trying to change them.
3365 * */
1565c085 3366
0e92a118
KW
3367 int ok = 1;
3368
7d4bcc4a
KW
3369#ifndef USE_LOCALE
3370
3371 PERL_UNUSED_ARG(printwarn);
3372
3373#else /* USE_LOCALE */
7d4bcc4a
KW
3374# ifdef __GLIBC__
3375
24f3e849 3376 const char * const language = PerlEnv_getenv("LANGUAGE");
7d4bcc4a
KW
3377
3378# endif
65ebb059 3379
ccd65d51
KW
3380 /* NULL uses the existing already set up locale */
3381 const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
3382 ? NULL
3383 : "";
c3fcd832
KW
3384 const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */
3385 unsigned int trial_locales_count;
24f3e849
KW
3386 const char * const lc_all = PerlEnv_getenv("LC_ALL");
3387 const char * const lang = PerlEnv_getenv("LANG");
98994639 3388 bool setlocale_failure = FALSE;
65ebb059 3389 unsigned int i;
175c4cf9
KW
3390
3391 /* A later getenv() could zap this, so only use here */
3392 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
3393
3394 const bool locwarn = (printwarn > 1
e5f10d49
KW
3395 || ( printwarn
3396 && ( ! bad_lang_use_once
22ff3130 3397 || (
e5f10d49
KW
3398 /* disallow with "" or "0" */
3399 *bad_lang_use_once
3400 && strNE("0", bad_lang_use_once)))));
ea92aad8 3401
291a84fb 3402 /* setlocale() return vals; not copied so must be looked at immediately */
8de4332b 3403 const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
291a84fb
KW
3404
3405 /* current locale for given category; should have been copied so aren't
3406 * volatile */
8de4332b 3407 const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
291a84fb 3408
7d4bcc4a
KW
3409# ifdef WIN32
3410
6bce99ee
JH
3411 /* In some systems you can find out the system default locale
3412 * and use that as the fallback locale. */
7d4bcc4a
KW
3413# define SYSTEM_DEFAULT_LOCALE
3414# endif
3415# ifdef SYSTEM_DEFAULT_LOCALE
3416
65ebb059 3417 const char *system_default_locale = NULL;
98994639 3418
7d4bcc4a 3419# endif
948523db
KW
3420
3421# ifndef DEBUGGING
3422# define DEBUG_LOCALE_INIT(a,b,c)
3423# else
7d4bcc4a 3424
8298454c 3425 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
7d4bcc4a 3426
d188fe6d 3427# define DEBUG_LOCALE_INIT(cat_index, locale, result) \
94d521c0
KW
3428 DEBUG_L(PerlIO_printf(Perl_debug_log, \
3429 "%s:%d: %s\n", __FILE__, __LINE__, \
d188fe6d 3430 setlocale_debug_string_i(cat_index, locale, result)));
2fcc0ca9 3431
948523db
KW
3432/* Make sure the parallel arrays are properly set up */
3433# ifdef USE_LOCALE_NUMERIC
4fd76d49
KW
3434 assert(categories[LC_NUMERIC_INDEX_] == LC_NUMERIC);
3435 assert(strEQ(category_names[LC_NUMERIC_INDEX_], "LC_NUMERIC"));
e9bc6d6b 3436# ifdef USE_POSIX_2008_LOCALE
4fd76d49 3437 assert(category_masks[LC_NUMERIC_INDEX_] == LC_NUMERIC_MASK);
e9bc6d6b 3438# endif
948523db
KW
3439# endif
3440# ifdef USE_LOCALE_CTYPE
4fd76d49
KW
3441 assert(categories[LC_CTYPE_INDEX_] == LC_CTYPE);
3442 assert(strEQ(category_names[LC_CTYPE_INDEX_], "LC_CTYPE"));
e9bc6d6b 3443# ifdef USE_POSIX_2008_LOCALE
4fd76d49 3444 assert(category_masks[LC_CTYPE_INDEX_] == LC_CTYPE_MASK);
e9bc6d6b 3445# endif
948523db
KW
3446# endif
3447# ifdef USE_LOCALE_COLLATE
4fd76d49
KW
3448 assert(categories[LC_COLLATE_INDEX_] == LC_COLLATE);
3449 assert(strEQ(category_names[LC_COLLATE_INDEX_], "LC_COLLATE"));
e9bc6d6b 3450# ifdef USE_POSIX_2008_LOCALE
4fd76d49 3451 assert(category_masks[LC_COLLATE_INDEX_] == LC_COLLATE_MASK);
e9bc6d6b 3452# endif
948523db
KW
3453# endif
3454# ifdef USE_LOCALE_TIME
4fd76d49
KW
3455 assert(categories[LC_TIME_INDEX_] == LC_TIME);
3456 assert(strEQ(category_names[LC_TIME_INDEX_], "LC_TIME"));
e9bc6d6b 3457# ifdef USE_POSIX_2008_LOCALE
4fd76d49 3458 assert(category_masks[LC_TIME_INDEX_] == LC_TIME_MASK);
e9bc6d6b 3459# endif
948523db
KW
3460# endif
3461# ifdef USE_LOCALE_MESSAGES
4fd76d49
KW
3462 assert(categories[LC_MESSAGES_INDEX_] == LC_MESSAGES);
3463 assert(strEQ(category_names[LC_MESSAGES_INDEX_], "LC_MESSAGES"));
e9bc6d6b 3464# ifdef USE_POSIX_2008_LOCALE
4fd76d49 3465 assert(category_masks[LC_MESSAGES_INDEX_] == LC_MESSAGES_MASK);
e9bc6d6b 3466# endif
948523db
KW
3467# endif
3468# ifdef USE_LOCALE_MONETARY
4fd76d49
KW
3469 assert(categories[LC_MONETARY_INDEX_] == LC_MONETARY);
3470 assert(strEQ(category_names[LC_MONETARY_INDEX_], "LC_MONETARY"));
e9bc6d6b 3471# ifdef USE_POSIX_2008_LOCALE
4fd76d49 3472 assert(category_masks[LC_MONETARY_INDEX_] == LC_MONETARY_MASK);
e9bc6d6b 3473# endif
948523db 3474# endif
9821811f 3475# ifdef USE_LOCALE_ADDRESS
4fd76d49
KW
3476 assert(categories[LC_ADDRESS_INDEX_] == LC_ADDRESS);
3477 assert(strEQ(category_names[LC_ADDRESS_INDEX_], "LC_ADDRESS"));
e9bc6d6b 3478# ifdef USE_POSIX_2008_LOCALE
4fd76d49 3479 assert(category_masks[LC_ADDRESS_INDEX_] == LC_ADDRESS_MASK);
e9bc6d6b 3480# endif
9821811f
KW
3481# endif
3482# ifdef USE_LOCALE_IDENTIFICATION
4fd76d49
KW
3483 assert(categories[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION);
3484 assert(strEQ(category_names[LC_IDENTIFICATION_INDEX_], "LC_IDENTIFICATION"));
e9bc6d6b 3485# ifdef USE_POSIX_2008_LOCALE
4fd76d49 3486 assert(category_masks[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION_MASK);
e9bc6d6b 3487# endif
9821811f
KW
3488# endif
3489# ifdef USE_LOCALE_MEASUREMENT
4fd76d49
KW
3490 assert(categories[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT);
3491 assert(strEQ(category_names[LC_MEASUREMENT_INDEX_], "LC_MEASUREMENT"));
e9bc6d6b 3492# ifdef USE_POSIX_2008_LOCALE
4fd76d49 3493 assert(category_masks[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT_MASK);
e9bc6d6b 3494# endif
9821811f
KW
3495# endif
3496# ifdef USE_LOCALE_PAPER
4fd76d49
KW
3497 assert(categories[LC_PAPER_INDEX_] == LC_PAPER);
3498 assert(strEQ(category_names[LC_PAPER_INDEX_], "LC_PAPER"));
e9bc6d6b 3499# ifdef USE_POSIX_2008_LOCALE
4fd76d49 3500 assert(category_masks[LC_PAPER_INDEX_] == LC_PAPER_MASK);
e9bc6d6b 3501# endif
9821811f
KW
3502# endif
3503# ifdef USE_LOCALE_TELEPHONE
4fd76d49
KW
3504 assert(categories[LC_TELEPHONE_INDEX_] == LC_TELEPHONE);
3505 assert(strEQ(category_names[LC_TELEPHONE_INDEX_], "LC_TELEPHONE"));
e9bc6d6b 3506# ifdef USE_POSIX_2008_LOCALE
4fd76d49 3507 assert(category_masks[LC_TELEPHONE_INDEX_] == LC_TELEPHONE_MASK);
e9bc6d6b 3508# endif
9821811f 3509# endif
36dbc955 3510# ifdef USE_LOCALE_SYNTAX
4fd76d49
KW
3511 assert(categories[LC_SYNTAX_INDEX_] == LC_SYNTAX);
3512 assert(strEQ(category_names[LC_SYNTAX_INDEX_], "LC_SYNTAX"));
36dbc955 3513# ifdef USE_POSIX_2008_LOCALE
4fd76d49 3514 assert(category_masks[LC_SYNTAX_INDEX_] == LC_SYNTAX_MASK);
36dbc955
KW
3515# endif
3516# endif
3517# ifdef USE_LOCALE_TOD
4fd76d49
KW
3518 assert(categories[LC_TOD_INDEX_] == LC_TOD);
3519 assert(strEQ(category_names[LC_TOD_INDEX_], "LC_TOD"));
36dbc955 3520# ifdef USE_POSIX_2008_LOCALE
4fd76d49 3521 assert(category_masks[LC_TOD_INDEX_] == LC_TOD_MASK);
36dbc955
KW
3522# endif
3523# endif
948523db 3524# ifdef LC_ALL
4fd76d49
KW
3525 assert(categories[LC_ALL_INDEX_] == LC_ALL);
3526 assert(strEQ(category_names[LC_ALL_INDEX_], "LC_ALL"));
3527 assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX_);
e9bc6d6b 3528# ifdef USE_POSIX_2008_LOCALE
4fd76d49 3529 assert(category_masks[LC_ALL_INDEX_] == LC_ALL_MASK);
e9bc6d6b 3530# endif
948523db
KW
3531# endif
3532# endif /* DEBUGGING */
47280b20 3533
5a6637f0
KW
3534 /* Initialize the per-thread mbrFOO() state variables. See POSIX.xs for
3535 * why these particular incantations are used. */
fe1c1494 3536# ifdef HAS_MBRLEN
d2c9cb53 3537 memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
fe1c1494
KW
3538# endif
3539# ifdef HAS_MBRTOWC
5a6637f0 3540 memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
fe1c1494
KW
3541# endif
3542# ifdef HAS_WCTOMBR
5a6637f0 3543 wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
fe1c1494 3544# endif
d2c9cb53 3545
47280b20
KW
3546 /* Initialize the cache of the program's UTF-8ness for the always known
3547 * locales C and POSIX */
3548 my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness,
3549 sizeof(PL_locale_utf8ness));
3550
8a2e41ff
HS
3551 /* See https://github.com/Perl/perl5/issues/17824 */
3552 Zero(curlocales, NOMINAL_LC_ALL_INDEX, char *);
3553
e9bc6d6b
KW
3554# ifdef USE_THREAD_SAFE_LOCALE
3555# ifdef WIN32
3556
3557 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
3558
3559# endif
3560# endif
39e69e77 3561# ifdef USE_POSIX_2008_LOCALE
e9bc6d6b
KW
3562
3563 PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0);
3564 if (! PL_C_locale_obj) {
3565 Perl_croak_nocontext(
3566 "panic: Cannot create POSIX 2008 C locale object; errno=%d", errno);
3567 }
e9bc6d6b 3568
94d521c0
KW
3569 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3570 "%s:%d: created C object %p\n",
3571 __FILE__, __LINE__, PL_C_locale_obj));
e9bc6d6b 3572# endif
78f7c09f
KW
3573# ifdef USE_LOCALE_NUMERIC
3574
3ca88433
KW
3575 PL_numeric_radix_sv = newSVpvs(".");
3576
78f7c09f 3577# endif
434e0069 3578# ifdef USE_PL_CURLOCALES
e9bc6d6b
KW
3579
3580 /* Initialize our records. If we have POSIX 2008, we have LC_ALL */
3980cddb 3581 void_setlocale_c(LC_ALL, porcelain_setlocale(LC_ALL, NULL));
e9bc6d6b
KW
3582
3583# endif
98994639 3584
65ebb059 3585 /* We try each locale in the list until we get one that works, or exhaust
20a240df
KW
3586 * the list. Normally the loop is executed just once. But if setting the
3587 * locale fails, inside the loop we add fallback trials to the array and so
3588 * will execute the loop multiple times */
c3fcd832
KW
3589 trial_locales[0] = setlocale_init;
3590 trial_locales_count = 1;
7d4bcc4a 3591
65ebb059
KW
3592 for (i= 0; i < trial_locales_count; i++) {
3593 const char * trial_locale = trial_locales[i];
503bf960 3594 setlocale_failure = FALSE;
65ebb059
KW
3595
3596 if (i > 0) {
7d4bcc4a 3597# ifdef SYSTEM_DEFAULT_LOCALE
291a84fb 3598# ifdef WIN32 /* Note that assumes Win32 has LC_ALL */
7d4bcc4a 3599
65ebb059
KW
3600 /* On Windows machines, an entry of "" after the 0th means to use
3601 * the system default locale, which we now proceed to get. */
3602 if (strEQ(trial_locale, "")) {
3603 unsigned int j;
3604
3605 /* Note that this may change the locale, but we are going to do
3606 * that anyway just below */
3980cddb 3607 system_default_locale = setlocale_c(LC_ALL, "");
d188fe6d 3608 DEBUG_LOCALE_INIT(LC_ALL_INDEX_, "", system_default_locale);
65ebb059 3609
7d4bcc4a 3610 /* Skip if invalid or if it's already on the list of locales to
65ebb059
KW
3611 * try */
3612 if (! system_default_locale) {
3613 goto next_iteration;
3614 }
3615 for (j = 0; j < trial_locales_count; j++) {
3616 if (strEQ(system_default_locale, trial_locales[j])) {
3617 goto next_iteration;
3618 }
3619 }
3620
3621 trial_locale = system_default_locale;
3622 }
ec0202b5
KW
3623# else
3624# error SYSTEM_DEFAULT_LOCALE only implemented for Win32
3625# endif
7d4bcc4a 3626# endif /* SYSTEM_DEFAULT_LOCALE */
291a84fb
KW
3627
3628 } /* For i > 0 */
65ebb059 3629
7d4bcc4a
KW
3630# ifdef LC_ALL
3631
3980cddb 3632 sl_result[LC_ALL_INDEX_] = setlocale_c(LC_ALL, trial_locale);
d188fe6d 3633 DEBUG_LOCALE_INIT(LC_ALL_INDEX_, trial_locale, sl_result[LC_ALL_INDEX_]);
4fd76d49 3634 if (! sl_result[LC_ALL_INDEX_]) {
49c85077 3635 setlocale_failure = TRUE;
7cd8b568
KW
3636 }
3637 else {
3638 /* Since LC_ALL succeeded, it should have changed all the other
3639 * categories it can to its value; so we massage things so that the
3640 * setlocales below just return their category's current values.
3641 * This adequately handles the case in NetBSD where LC_COLLATE may
3642 * not be defined for a locale, and setting it individually will
7d4bcc4a 3643 * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
7cd8b568
KW
3644 * the POSIX locale. */
3645 trial_locale = NULL;
3646 }
7d4bcc4a
KW
3647
3648# endif /* LC_ALL */
98994639 3649
e5f10d49
KW
3650 if (! setlocale_failure) {
3651 unsigned int j;
3652 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
cf59ced9 3653 Safefree(curlocales[j]);
3980cddb 3654 curlocales[j] = setlocale_i(j, trial_locale);
e5f10d49
KW
3655 if (! curlocales[j]) {
3656 setlocale_failure = TRUE;
3657 }
a683a66a 3658 curlocales[j] = savepv(curlocales[j]);
d188fe6d 3659 DEBUG_LOCALE_INIT(j, trial_locale, curlocales[j]);
e5f10d49 3660 }
c835d6be 3661
f6665bcd 3662 if (LIKELY(! setlocale_failure)) { /* All succeeded */
e5f10d49 3663 break; /* Exit trial_locales loop */
49c85077 3664 }
65ebb059 3665 }
98994639 3666
49c85077
KW
3667 /* Here, something failed; will need to try a fallback. */
3668 ok = 0;
65ebb059 3669
49c85077
KW
3670 if (i == 0) {
3671 unsigned int j;
98994639 3672
65ebb059 3673 if (locwarn) { /* Output failure info only on the first one */
7d4bcc4a
KW
3674
3675# ifdef LC_ALL
98994639 3676
49c85077
KW
3677 PerlIO_printf(Perl_error_log,
3678 "perl: warning: Setting locale failed.\n");
98994639 3679
7d4bcc4a 3680# else /* !LC_ALL */
98994639 3681
49c85077 3682 PerlIO_printf(Perl_error_log,
71ce8c74 3683 "perl: warning: Setting locale failed for the categories:\n");
7d4bcc4a 3684
e5f10d49
KW
3685 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
3686 if (! curlocales[j]) {
71ce8c74 3687 PerlIO_printf(Perl_error_log, "\t%s\n", category_names[j]);
e5f10d49 3688 }
e5f10d49 3689 }
7d4bcc4a 3690
7d4bcc4a 3691# endif /* LC_ALL */
98994639 3692
49c85077
KW
3693 PerlIO_printf(Perl_error_log,
3694 "perl: warning: Please check that your locale settings:\n");
98994639 3695
7d4bcc4a
KW
3696# ifdef __GLIBC__
3697
49c85077
KW
3698 PerlIO_printf(Perl_error_log,
3699 "\tLANGUAGE = %c%s%c,\n",
3700 language ? '"' : '(',
3701 language ? language : "unset",
3702 language ? '"' : ')');
7d4bcc4a 3703# endif
98994639 3704
49c85077
KW
3705 PerlIO_printf(Perl_error_log,
3706 "\tLC_ALL = %c%s%c,\n",
3707 lc_all ? '"' : '(',
3708 lc_all ? lc_all : "unset",
3709 lc_all ? '"' : ')');
98994639 3710
7d4bcc4a
KW
3711# if defined(USE_ENVIRON_ARRAY)
3712
49c85077 3713 {
cd999af9 3714 char **e;
d5e32b93
KW
3715
3716 /* Look through the environment for any variables of the
3717 * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
3718 * already handled above. These are assumed to be locale
3719 * settings. Output them and their values. */
cd999af9 3720 for (e = environ; *e; e++) {
d5e32b93
KW
3721 const STRLEN prefix_len = sizeof("LC_") - 1;
3722 STRLEN uppers_len;
3723
cd999af9 3724 if ( strBEGINs(*e, "LC_")
c8b388b0 3725 && ! strBEGINs(*e, "LC_ALL=")
d5e32b93
KW
3726 && (uppers_len = strspn(*e + prefix_len,
3727 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
3728 && ((*e)[prefix_len + uppers_len] == '='))
cd999af9
KW
3729 {
3730 PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
d5e32b93
KW
3731 (int) (prefix_len + uppers_len), *e,
3732 *e + prefix_len + uppers_len + 1);
cd999af9
KW
3733 }
3734 }
49c85077 3735 }
7d4bcc4a
KW
3736
3737# else
3738
49c85077
KW
3739 PerlIO_printf(Perl_error_log,
3740 "\t(possibly more locale environment variables)\n");
7d4bcc4a
KW
3741
3742# endif
98994639 3743
49c85077
KW
3744 PerlIO_printf(Perl_error_log,
3745 "\tLANG = %c%s%c\n",
3746 lang ? '"' : '(',
3747 lang ? lang : "unset",
3748 lang ? '"' : ')');
98994639 3749
49c85077
KW
3750 PerlIO_printf(Perl_error_log,
3751 " are supported and installed on your system.\n");
3752 }
98994639 3753
65ebb059 3754 /* Calculate what fallback locales to try. We have avoided this
f6bab5f6 3755 * until we have to, because failure is quite unlikely. This will
65ebb059
KW
3756 * usually change the upper bound of the loop we are in.
3757 *
3758 * Since the system's default way of setting the locale has not
3759 * found one that works, We use Perl's defined ordering: LC_ALL,
3760 * LANG, and the C locale. We don't try the same locale twice, so
3761 * don't add to the list if already there. (On POSIX systems, the
3762 * LC_ALL element will likely be a repeat of the 0th element "",
6b058d42
KW
3763 * but there's no harm done by doing it explicitly.
3764 *
3765 * Note that this tries the LC_ALL environment variable even on
3766 * systems which have no LC_ALL locale setting. This may or may
3767 * not have been originally intentional, but there's no real need
3768 * to change the behavior. */
65ebb059
KW
3769 if (lc_all) {
3770 for (j = 0; j < trial_locales_count; j++) {
3771 if (strEQ(lc_all, trial_locales[j])) {
3772 goto done_lc_all;
3773 }
3774 }
3775 trial_locales[trial_locales_count++] = lc_all;
3776 }
3777 done_lc_all:
98994639 3778
65ebb059
KW
3779 if (lang) {
3780 for (j = 0; j < trial_locales_count; j++) {
3781 if (strEQ(lang, trial_locales[j])) {
3782 goto done_lang;
3783 }
3784 }
3785 trial_locales[trial_locales_count++] = lang;
3786 }
3787 done_lang:
3788
7d4bcc4a
KW
3789# if defined(WIN32) && defined(LC_ALL)
3790
65ebb059
KW
3791 /* For Windows, we also try the system default locale before "C".
3792 * (If there exists a Windows without LC_ALL we skip this because
3793 * it gets too complicated. For those, the "C" is the next
3794 * fallback possibility). The "" is the same as the 0th element of
3795 * the array, but the code at the loop above knows to treat it
3796 * differently when not the 0th */
3797 trial_locales[trial_locales_count++] = "";
7d4bcc4a
KW
3798
3799# endif
65ebb059
KW
3800
3801 for (j = 0; j < trial_locales_count; j++) {
3802 if (strEQ("C", trial_locales[j])) {
3803 goto done_C;
3804 }
3805 }
3806 trial_locales[trial_locales_count++] = "C";
98994639 3807
65ebb059
KW
3808 done_C: ;
3809 } /* end of first time through the loop */
98994639 3810
7d4bcc4a
KW
3811# ifdef WIN32
3812
65ebb059 3813 next_iteration: ;
7d4bcc4a
KW
3814
3815# endif
65ebb059
KW
3816
3817 } /* end of looping through the trial locales */
3818
3819 if (ok < 1) { /* If we tried to fallback */
3820 const char* msg;
3821 if (! setlocale_failure) { /* fallback succeeded */
3822 msg = "Falling back to";
3823 }
3824 else { /* fallback failed */
e5f10d49 3825 unsigned int j;
98994639 3826
65ebb059
KW
3827 /* We dropped off the end of the loop, so have to decrement i to
3828 * get back to the value the last time through */
3829 i--;
98994639 3830
65ebb059
KW
3831 ok = -1;
3832 msg = "Failed to fall back to";
3833
3834 /* To continue, we should use whatever values we've got */
7d4bcc4a 3835
e5f10d49
KW
3836 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
3837 Safefree(curlocales[j]);
a683a66a 3838 curlocales[j] = savepv(querylocale_i(j));
d188fe6d 3839 DEBUG_LOCALE_INIT(j, NULL, curlocales[j]);
e5f10d49 3840 }
65ebb059
KW
3841 }
3842
3843 if (locwarn) {
3844 const char * description;
3845 const char * name = "";
3846 if (strEQ(trial_locales[i], "C")) {
3847 description = "the standard locale";
3848 name = "C";
3849 }
7d4bcc4a
KW
3850
3851# ifdef SYSTEM_DEFAULT_LOCALE
3852
65ebb059
KW
3853 else if (strEQ(trial_locales[i], "")) {
3854 description = "the system default locale";
3855 if (system_default_locale) {
3856 name = system_default_locale;
3857 }
3858 }
7d4bcc4a
KW
3859
3860# endif /* SYSTEM_DEFAULT_LOCALE */
3861
65ebb059
KW
3862 else {
3863 description = "a fallback locale";
3864 name = trial_locales[i];
3865 }
3866 if (name && strNE(name, "")) {
3867 PerlIO_printf(Perl_error_log,
3868 "perl: warning: %s %s (\"%s\").\n", msg, description, name);
3869 }
3870 else {
3871 PerlIO_printf(Perl_error_log,
3872 "perl: warning: %s %s.\n", msg, description);
3873 }
3874 }
3875 } /* End of tried to fallback */
98994639 3876
e5f10d49 3877 /* Done with finding the locales; update our records */
c7dc1ea8 3878 new_LC_ALL(NULL);
e5f10d49 3879
948523db 3880 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
47280b20 3881
8751a11f 3882# if defined(USE_LOCALE_THREADS) && ! defined(USE_THREAD_SAFE_LOCALE)
47280b20
KW
3883
3884 /* This caches whether each category's locale is UTF-8 or not. This
3885 * may involve changing the locale. It is ok to do this at
e9bc6d6b
KW
3886 * initialization time before any threads have started, but not later
3887 * unless thread-safe operations are used.
47280b20
KW
3888 * Caching means that if the program heeds our dictate not to change
3889 * locales in threaded applications, this data will remain valid, and
9de6fe47
KW
3890 * it may get queried without having to change locales. If the
3891 * environment is such that all categories have the same locale, this
3892 * isn't needed, as the code will not change the locale; but this
3893 * handles the uncommon case where the environment has disparate
3894 * locales for the categories */
47280b20
KW
3895 (void) _is_cur_LC_category_utf8(categories[i]);
3896
3897# endif
3898
e5f10d49
KW
3899 Safefree(curlocales[i]);
3900 }
b310b053 3901
7d4bcc4a
KW
3902# if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
3903
49c85077 3904 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
50bf02bd
KW
3905 * locale is UTF-8. The call to new_ctype() just above has already
3906 * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
3907 * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
3908 * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
3909 * STDIN, STDOUT, STDERR, _and_ the default open discipline. */
3910 PL_utf8locale = PL_in_utf8_CTYPE_locale;
49c85077 3911
a05d7ebb 3912 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
fde18df1
JH
3913 This is an alternative to using the -C command line switch
3914 (the -C if present will override this). */
3915 {
1604cfb0
MS
3916 const char *p = PerlEnv_getenv("PERL_UNICODE");
3917 PL_unicode = p ? parse_unicode_opts(&p) : 0;
3918 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
3919 PL_utf8cache = -1;
b310b053
JH
3920 }
3921
7d4bcc4a 3922# endif
e3305790 3923#endif /* USE_LOCALE */
7d4bcc4a 3924
2fcc0ca9 3925 /* So won't continue to output stuff */
27cdc72e 3926 DEBUG_INITIALIZATION_set(FALSE);
7d4bcc4a 3927
98994639
HS
3928 return ok;
3929}
3930
98994639
HS
3931#ifdef USE_LOCALE_COLLATE
3932
a4a439fb 3933char *
a4a439fb
KW
3934Perl__mem_collxfrm(pTHX_ const char *input_string,
3935 STRLEN len, /* Length of 'input_string' */
3936 STRLEN *xlen, /* Set to length of returned string
3937 (not including the collation index
3938 prefix) */
3939 bool utf8 /* Is the input in UTF-8? */
6696cfa7 3940 )
98994639 3941{
fe1c1494
KW
3942 /* _mem_collxfrm() is like strxfrm() but with two important differences.
3943 * First, it handles embedded NULs. Second, it allocates a bit more memory
3944 * than needed for the transformed data itself. The real transformed data
3945 * begins at offset COLLXFRM_HDR_LEN. *xlen is set to the length of that,
3946 * and doesn't include the collation index size.
3947 *
3948 * It is the caller's responsibility to eventually free the memory returned
3949 * by this function.
3950 *
a4a439fb
KW
3951 * Please see sv_collxfrm() to see how this is used. */
3952
fe1c1494 3953# define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
55e5378d 3954
6696cfa7
KW
3955 char * s = (char *) input_string;
3956 STRLEN s_strlen = strlen(input_string);
79f120c8 3957 char *xbuf = NULL;
55e5378d 3958 STRLEN xAlloc; /* xalloc is a reserved word in VC */
17f41037 3959 STRLEN length_in_chars;
c664130f 3960 bool first_time = TRUE; /* Cleared after first loop iteration */
98994639 3961
a4a439fb
KW
3962 PERL_ARGS_ASSERT__MEM_COLLXFRM;
3963
3964 /* Must be NUL-terminated */
3965 assert(*(input_string + len) == '\0');
7918f24d 3966
79f120c8
KW
3967 /* If this locale has defective collation, skip */
3968 if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
c7202dee
KW
3969 DEBUG_L(PerlIO_printf(Perl_debug_log,
3970 "_mem_collxfrm: locale's collation is defective\n"));
79f120c8
KW
3971 goto bad;
3972 }
3973
6696cfa7
KW
3974 /* Replace any embedded NULs with the control that sorts before any others.
3975 * This will give as good as possible results on strings that don't
3976 * otherwise contain that character, but otherwise there may be
3977 * less-than-perfect results with that character and NUL. This is
fdc080f3 3978 * unavoidable unless we replace strxfrm with our own implementation. */
fd43f63c
KW
3979 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
3980 NUL */
6696cfa7
KW
3981 char * e = s + len;
3982 char * sans_nuls;
fdc080f3 3983 STRLEN sans_nuls_len;
94762aa0 3984 int try_non_controls;
afc4976f
KW
3985 char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
3986 making sure 2nd byte is NUL.
3987 */
3988 STRLEN this_replacement_len;
3989
1e4c9676
KW
3990 /* If we don't know what non-NUL control character sorts lowest for
3991 * this locale, find it */
f28f4d2a 3992 if (PL_strxfrm_NUL_replacement == '\0') {
6696cfa7 3993 int j;
afc4976f 3994 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
6696cfa7
KW
3995 includes the collation index
3996 prefixed. */
3997
91c0e2e0 3998 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
94762aa0
KW
3999
4000 /* Unlikely, but it may be that no control will work to replace
1e4c9676
KW
4001 * NUL, in which case we instead look for any character. Controls
4002 * are preferred because collation order is, in general, context
4003 * sensitive, with adjoining characters affecting the order, and
4004 * controls are less likely to have such interactions, allowing the
4005 * NUL-replacement to stand on its own. (Another way to look at it
4006 * is to imagine what would happen if the NUL were replaced by a
4007 * combining character; it wouldn't work out all that well.) */
94762aa0
KW
4008 for (try_non_controls = 0;
4009 try_non_controls < 2;
4010 try_non_controls++)
4011 {
d4ff9586
KW
4012 /* Look through all legal code points (NUL isn't) */
4013 for (j = 1; j < 256; j++) {
4014 char * x; /* j's xfrm plus collation index */
4015 STRLEN x_len; /* length of 'x' */
4016 STRLEN trial_len = 1;
736a4fed 4017 char cur_source[] = { '\0', '\0' };
d4ff9586 4018
736a4fed
KW
4019 /* Skip non-controls the first time through the loop. The
4020 * controls in a UTF-8 locale are the L1 ones */
afc4976f
KW
4021 if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
4022 ? ! isCNTRL_L1(j)
4023 : ! isCNTRL_LC(j))
4024 {
d4ff9586 4025 continue;
6696cfa7 4026 }
6696cfa7 4027
736a4fed
KW
4028 /* Create a 1-char string of the current code point */
4029 cur_source[0] = (char) j;
4030
d4ff9586
KW
4031 /* Then transform it */
4032 x = _mem_collxfrm(cur_source, trial_len, &x_len,
afc4976f 4033 0 /* The string is not in UTF-8 */);
6696cfa7 4034
1e4c9676 4035 /* Ignore any character that didn't successfully transform.
d4ff9586
KW
4036 * */
4037 if (! x) {
4038 continue;
4039 }
6696cfa7 4040
d4ff9586
KW
4041 /* If this character's transformation is lower than
4042 * the current lowest, this one becomes the lowest */
4043 if ( cur_min_x == NULL
4044 || strLT(x + COLLXFRM_HDR_LEN,
4045 cur_min_x + COLLXFRM_HDR_LEN))
4046 {
f28f4d2a 4047 PL_strxfrm_NUL_replacement = j;
62d66863 4048 Safefree(cur_min_x);
d4ff9586 4049 cur_min_x = x;
d4ff9586
KW
4050 }
4051 else {
4052 Safefree(x);
4053 }
1e4c9676 4054 } /* end of loop through all 255 characters */
6696cfa7 4055
1e4c9676 4056 /* Stop looking if found */
94762aa0
KW
4057 if (cur_min_x) {
4058 break;
4059 }
4060
4061 /* Unlikely, but possible, if there aren't any controls that
4062 * work in the locale, repeat the loop, looking for any
4063 * character that works */
4064 DEBUG_L(PerlIO_printf(Perl_debug_log,
4065 "_mem_collxfrm: No control worked. Trying non-controls\n"));
1e4c9676 4066 } /* End of loop to try first the controls, then any char */
6696cfa7 4067
94762aa0
KW
4068 if (! cur_min_x) {
4069 DEBUG_L(PerlIO_printf(Perl_debug_log,
4070 "_mem_collxfrm: Couldn't find any character to replace"
4071 " embedded NULs in locale %s with", PL_collation_name));
4072 goto bad;
58eebef2
KW
4073 }
4074
94762aa0
KW
4075 DEBUG_L(PerlIO_printf(Perl_debug_log,
4076 "_mem_collxfrm: Replacing embedded NULs in locale %s with "
f28f4d2a 4077 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
94762aa0 4078
6696cfa7 4079 Safefree(cur_min_x);
1e4c9676 4080 } /* End of determining the character that is to replace NULs */
afc4976f
KW
4081
4082 /* If the replacement is variant under UTF-8, it must match the
291a84fb 4083 * UTF8-ness of the original */
f28f4d2a
KW
4084 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
4085 this_replacement_char[0] =
4086 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
4087 this_replacement_char[1] =
4088 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
afc4976f
KW
4089 this_replacement_len = 2;
4090 }
4091 else {
f28f4d2a 4092 this_replacement_char[0] = PL_strxfrm_NUL_replacement;
afc4976f
KW
4093 /* this_replacement_char[1] = '\0' was done at initialization */
4094 this_replacement_len = 1;
6696cfa7
KW
4095 }
4096
4097 /* The worst case length for the replaced string would be if every
4098 * character in it is NUL. Multiply that by the length of each
4099 * replacement, and allow for a trailing NUL */
afc4976f 4100 sans_nuls_len = (len * this_replacement_len) + 1;
fdc080f3 4101 Newx(sans_nuls, sans_nuls_len, char);
6696cfa7
KW
4102 *sans_nuls = '\0';
4103
6696cfa7
KW
4104 /* Replace each NUL with the lowest collating control. Loop until have
4105 * exhausted all the NULs */
4106 while (s + s_strlen < e) {
6069d6c5 4107 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
4108
4109 /* Do the actual replacement */
6069d6c5 4110 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
6696cfa7
KW
4111
4112 /* Move past the input NUL */
4113 s += s_strlen + 1;
4114 s_strlen = strlen(s);
4115 }
4116
4117 /* And add anything that trails the final NUL */
6069d6c5 4118 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
4119
4120 /* Switch so below we transform this modified string */
4121 s = sans_nuls;
4122 len = strlen(s);
1e4c9676 4123 } /* End of replacing NULs */
6696cfa7 4124
a4a439fb
KW
4125 /* Make sure the UTF8ness of the string and locale match */
4126 if (utf8 != PL_in_utf8_COLLATE_locale) {
9de6fe47 4127 /* XXX convert above Unicode to 10FFFF? */
a4a439fb
KW
4128 const char * const t = s; /* Temporary so we can later find where the
4129 input was */
4130
4131 /* Here they don't match. Change the string's to be what the locale is
4132 * expecting */
4133
4134 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
4135 s = (char *) bytes_to_utf8((const U8 *) s, &len);
4136 utf8 = TRUE;
4137 }
4138 else { /* locale is not UTF-8; but input is; downgrade the input */
4139
4140 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
4141
4142 /* If the downgrade was successful we are done, but if the input
4143 * contains things that require UTF-8 to represent, have to do
4144 * damage control ... */
4145 if (UNLIKELY(utf8)) {
4146
4147 /* What we do is construct a non-UTF-8 string with
4148 * 1) the characters representable by a single byte converted
4149 * to be so (if necessary);
4150 * 2) and the rest converted to collate the same as the
4151 * highest collating representable character. That makes
4152 * them collate at the end. This is similar to how we
4153 * handle embedded NULs, but we use the highest collating
4154 * code point instead of the smallest. Like the NUL case,
4155 * this isn't perfect, but is the best we can reasonably
4156 * do. Every above-255 code point will sort the same as
4157 * the highest-sorting 0-255 code point. If that code
4158 * point can combine in a sequence with some other code
4159 * points for weight calculations, us changing something to
4160 * be it can adversely affect the results. But in most
4161 * cases, it should work reasonably. And note that this is
4162 * really an illegal situation: using code points above 255
4163 * on a locale where only 0-255 are valid. If two strings
4164 * sort entirely equal, then the sort order for the
4165 * above-255 code points will be in code point order. */
4166
4167 utf8 = FALSE;
4168
4169 /* If we haven't calculated the code point with the maximum
4170 * collating order for this locale, do so now */
4171 if (! PL_strxfrm_max_cp) {
4172 int j;
4173
4174 /* The current transformed string that collates the
4175 * highest (except it also includes the prefixed collation
4176 * index. */
4177 char * cur_max_x = NULL;
4178
4179 /* Look through all legal code points (NUL isn't) */
4180 for (j = 1; j < 256; j++) {
4181 char * x;
4182 STRLEN x_len;
736a4fed 4183 char cur_source[] = { '\0', '\0' };
a4a439fb 4184
736a4fed
KW
4185 /* Create a 1-char string of the current code point */
4186 cur_source[0] = (char) j;
a4a439fb
KW
4187
4188 /* Then transform it */
4189 x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
4190
4191 /* If something went wrong (which it shouldn't), just
4192 * ignore this code point */
94762aa0 4193 if (! x) {
a4a439fb
KW
4194 continue;
4195 }
4196
4197 /* If this character's transformation is higher than
4198 * the current highest, this one becomes the highest */
4199 if ( cur_max_x == NULL
55e5378d
KW
4200 || strGT(x + COLLXFRM_HDR_LEN,
4201 cur_max_x + COLLXFRM_HDR_LEN))
a4a439fb
KW
4202 {
4203 PL_strxfrm_max_cp = j;
62d66863 4204 Safefree(cur_max_x);
a4a439fb
KW
4205 cur_max_x = x;
4206 }
4207 else {
4208 Safefree(x);
4209 }
4210 }
4211
94762aa0
KW
4212 if (! cur_max_x) {
4213 DEBUG_L(PerlIO_printf(Perl_debug_log,
4214 "_mem_collxfrm: Couldn't find any character to"
4215 " replace above-Latin1 chars in locale %s with",
4216 PL_collation_name));
4217 goto bad;
4218 }
4219
58eebef2
KW
4220 DEBUG_L(PerlIO_printf(Perl_debug_log,
4221 "_mem_collxfrm: highest 1-byte collating character"
4222 " in locale %s is 0x%02X\n",
4223 PL_collation_name,
4224 PL_strxfrm_max_cp));
58eebef2 4225
a4a439fb
KW
4226 Safefree(cur_max_x);
4227 }
4228
4229 /* Here we know which legal code point collates the highest.
4230 * We are ready to construct the non-UTF-8 string. The length
4231 * will be at least 1 byte smaller than the input string
4232 * (because we changed at least one 2-byte character into a
4233 * single byte), but that is eaten up by the trailing NUL */
4234 Newx(s, len, char);
4235
4236 {
4237 STRLEN i;
4238 STRLEN d= 0;
042d9e50 4239 char * e = (char *) t + len;
a4a439fb
KW
4240
4241 for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
4242 U8 cur_char = t[i];
4243 if (UTF8_IS_INVARIANT(cur_char)) {
4244 s[d++] = cur_char;
4245 }
042d9e50 4246 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
a4a439fb
KW
4247 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
4248 }
4249 else { /* Replace illegal cp with highest collating
4250 one */
4251 s[d++] = PL_strxfrm_max_cp;
4252 }
4253 }
4254 s[d++] = '\0';
4255 Renew(s, d, char); /* Free up unused space */
4256 }
4257 }
4258 }
4259
4260 /* Here, we have constructed a modified version of the input. It could
4261 * be that we already had a modified copy before we did this version.
4262 * If so, that copy is no longer needed */
4263 if (t != input_string) {
4264 Safefree(t);
4265 }
4266 }
4267
17f41037
KW
4268 length_in_chars = (utf8)
4269 ? utf8_length((U8 *) s, (U8 *) s + len)
4270 : len;
4271
59c018b9
KW
4272 /* The first element in the output is the collation id, used by
4273 * sv_collxfrm(); then comes the space for the transformed string. The
4274 * equation should give us a good estimate as to how much is needed */
55e5378d 4275 xAlloc = COLLXFRM_HDR_LEN
a4a439fb 4276 + PL_collxfrm_base
17f41037 4277 + (PL_collxfrm_mult * length_in_chars);
a02a5408 4278 Newx(xbuf, xAlloc, char);
c7202dee
KW
4279 if (UNLIKELY(! xbuf)) {
4280 DEBUG_L(PerlIO_printf(Perl_debug_log,
4281 "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
1604cfb0 4282 goto bad;
c7202dee 4283 }
98994639 4284
d35fca5f 4285 /* Store the collation id */
98994639 4286 *(U32*)xbuf = PL_collation_ix;
d35fca5f
KW
4287
4288 /* Then the transformation of the input. We loop until successful, or we
4289 * give up */
4ebeff16 4290 for (;;) {
1adab0a7 4291
55e5378d 4292 *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
4ebeff16
KW
4293
4294 /* If the transformed string occupies less space than we told strxfrm()
4295 * was available, it means it successfully transformed the whole
4296 * string. */
55e5378d 4297 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
17f41037 4298
1adab0a7
KW
4299 /* Some systems include a trailing NUL in the returned length.
4300 * Ignore it, using a loop in case multiple trailing NULs are
4301 * returned. */
4302 while ( (*xlen) > 0
4303 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
4304 {
4305 (*xlen)--;
4306 }
4307
17f41037
KW
4308 /* If the first try didn't get it, it means our prediction was low.
4309 * Modify the coefficients so that we predict a larger value in any
4310 * future transformations */
4311 if (! first_time) {
4312 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */
4313 STRLEN computed_guess = PL_collxfrm_base
4314 + (PL_collxfrm_mult * length_in_chars);
e1c30f0c
KW
4315
4316 /* On zero-length input, just keep current slope instead of
4317 * dividing by 0 */
4318 const STRLEN new_m = (length_in_chars != 0)
4319 ? needed / length_in_chars
4320 : PL_collxfrm_mult;
17f41037
KW
4321
4322 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
4323 "%s: %d: initial size of %zu bytes for a length "
4324 "%zu string was insufficient, %zu needed\n",
17f41037 4325 __FILE__, __LINE__,
b07929e4 4326 computed_guess, length_in_chars, needed));
17f41037
KW
4327
4328 /* If slope increased, use it, but discard this result for
4329 * length 1 strings, as we can't be sure that it's a real slope
4330 * change */
4331 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
7d4bcc4a
KW
4332
4333# ifdef DEBUGGING
4334
17f41037
KW
4335 STRLEN old_m = PL_collxfrm_mult;
4336 STRLEN old_b = PL_collxfrm_base;
7d4bcc4a
KW
4337
4338# endif
4339
17f41037
KW
4340 PL_collxfrm_mult = new_m;
4341 PL_collxfrm_base = 1; /* +1 For trailing NUL */
4342 computed_guess = PL_collxfrm_base
4343 + (PL_collxfrm_mult * length_in_chars);
4344 if (computed_guess < needed) {
4345 PL_collxfrm_base += needed - computed_guess;
4346 }
4347
4348 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
4349 "%s: %d: slope is now %zu; was %zu, base "
4350 "is now %zu; was %zu\n",
17f41037 4351 __FILE__, __LINE__,
b07929e4
KW
4352 PL_collxfrm_mult, old_m,
4353 PL_collxfrm_base, old_b));
17f41037
KW
4354 }
4355 else { /* Slope didn't change, but 'b' did */
4356 const STRLEN new_b = needed
4357 - computed_guess
4358 + PL_collxfrm_base;
4359 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4 4360 "%s: %d: base is now %zu; was %zu\n",
17f41037 4361 __FILE__, __LINE__,
b07929e4 4362 new_b, PL_collxfrm_base));
17f41037
KW
4363 PL_collxfrm_base = new_b;
4364 }
4365 }
4366
4ebeff16
KW
4367 break;
4368 }
bb0f664e 4369
c7202dee
KW
4370 if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
4371 DEBUG_L(PerlIO_printf(Perl_debug_log,
4372 "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
4373 *xlen, PERL_INT_MAX));
4ebeff16 4374 goto bad;
c7202dee 4375 }
d35fca5f 4376
c664130f 4377 /* A well-behaved strxfrm() returns exactly how much space it needs
1adab0a7
KW
4378 * (usually not including the trailing NUL) when it fails due to not
4379 * enough space being provided. Assume that this is the case unless
4380 * it's been proven otherwise */
c664130f 4381 if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
55e5378d 4382 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
c664130f
KW
4383 }
4384 else { /* Here, either:
4385 * 1) The strxfrm() has previously shown bad behavior; or
4386 * 2) It isn't the first time through the loop, which means
4387 * that the strxfrm() is now showing bad behavior, because
4388 * we gave it what it said was needed in the previous
4389 * iteration, and it came back saying it needed still more.
4390 * (Many versions of cygwin fit this. When the buffer size
4391 * isn't sufficient, they return the input size instead of
4392 * how much is needed.)
d4ff9586
KW
4393 * Increase the buffer size by a fixed percentage and try again.
4394 * */
6ddd902c 4395 xAlloc += (xAlloc / 4) + 1;
c664130f 4396 PL_strxfrm_is_behaved = FALSE;
c664130f 4397
94d521c0
KW
4398 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4399 "_mem_collxfrm required more space than previously"
4400 " calculated for locale %s, trying again with new"
4401 " guess=%zu+%zu\n",
3e25f6d2 4402 PL_collation_name, COLLXFRM_HDR_LEN,
94d521c0 4403 xAlloc - COLLXFRM_HDR_LEN));
58eebef2 4404 }
c664130f 4405
4ebeff16 4406 Renew(xbuf, xAlloc, char);
c7202dee
KW
4407 if (UNLIKELY(! xbuf)) {
4408 DEBUG_L(PerlIO_printf(Perl_debug_log,
4409 "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
4ebeff16 4410 goto bad;
c7202dee 4411 }
c664130f
KW
4412
4413 first_time = FALSE;
4ebeff16 4414 }
98994639 4415
94d521c0
KW
4416 DEBUG_Lv((print_collxfrm_input_and_return(s, s + len, xlen, utf8),
4417 PerlIO_printf(Perl_debug_log, "Its xfrm is:"),
7e2f38b2
KW
4418 PerlIO_printf(Perl_debug_log, "%s\n",
4419 _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
94d521c0 4420 *xlen, 1))));
58eebef2 4421
fe1c1494 4422 /* Free up unneeded space; retain enough for trailing NUL */
55e5378d 4423 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
98994639 4424
6696cfa7
KW
4425 if (s != input_string) {
4426 Safefree(s);
98994639
HS
4427 }
4428
98994639
HS
4429 return xbuf;
4430
4431 bad:
7d4bcc4a 4432
94d521c0 4433 DEBUG_Lv(print_collxfrm_input_and_return(s, s + len, NULL, utf8));
7d4bcc4a 4434
21dce8f4
KW
4435 Safefree(xbuf);
4436 if (s != input_string) {
4437 Safefree(s);
4438 }
4439 *xlen = 0;
4440
98994639
HS
4441 return NULL;
4442}
4443
7d4bcc4a 4444# ifdef DEBUGGING
c7202dee 4445
4cbaac56 4446STATIC void
c7202dee
KW
4447S_print_collxfrm_input_and_return(pTHX_
4448 const char * const s,
4449 const char * const e,
4450 const STRLEN * const xlen,
4451 const bool is_utf8)
4452{
c7202dee
KW
4453
4454 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
4455
511e4ff7
DM
4456 PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
4457 (UV)PL_collation_ix);
c7202dee 4458 if (xlen) {
08b6dc1d 4459 PerlIO_printf(Perl_debug_log, "%zu", *xlen);
c7202dee
KW
4460 }
4461 else {
4462 PerlIO_printf(Perl_debug_log, "NULL");
4463 }
4464 PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
4465 PL_collation_name);
9c8a6dc2
KW
4466 print_bytes_for_locale(s, e, is_utf8);
4467
4468 PerlIO_printf(Perl_debug_log, "'\n");
4469}
4470
4d9252fd
KW
4471# endif /* DEBUGGING */
4472#endif /* USE_LOCALE_COLLATE */
fe1c1494 4473
1c0e67b5
KW
4474#ifdef USE_LOCALE
4475# ifdef DEBUGGING
4d9252fd 4476
9c8a6dc2
KW
4477STATIC void
4478S_print_bytes_for_locale(pTHX_
4479 const char * const s,
4480 const char * const e,
4481 const bool is_utf8)
4482{
4483 const char * t = s;
4484 bool prev_was_printable = TRUE;
4485 bool first_time = TRUE;
4486
4487 PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
c7202dee
KW
4488
4489 while (t < e) {
4490 UV cp = (is_utf8)
4491 ? utf8_to_uvchr_buf((U8 *) t, e, NULL)
4492 : * (U8 *) t;
4493 if (isPRINT(cp)) {
4494 if (! prev_was_printable) {
4495 PerlIO_printf(Perl_debug_log, " ");
4496 }
4497 PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
4498 prev_was_printable = TRUE;
4499 }
4500 else {
4501 if (! first_time) {
4502 PerlIO_printf(Perl_debug_log, " ");
4503 }
147e3846 4504 PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
c7202dee
KW
4505 prev_was_printable = FALSE;
4506 }
4507 t += (is_utf8) ? UTF8SKIP(t) : 1;
4508 first_time = FALSE;
4509 }
c7202dee
KW
4510}
4511
1c0e67b5 4512# endif /* #ifdef DEBUGGING */
8ef6e574 4513
962aa53f 4514STATIC const char *
fe1c1494
KW
4515S_switch_category_locale_to_template(pTHX_ const int switch_category,
4516 const int template_category,
4517 const char * template_locale)
962aa53f
KW
4518{
4519 /* Changes the locale for LC_'switch_category" to that of
4520 * LC_'template_category', if they aren't already the same. If not NULL,
4521 * 'template_locale' is the locale that 'template_category' is in.
4522 *
9de6fe47
KW
4523 * Returns a copy of the name of the original locale for 'switch_category'
4524 * so can be switched back to with the companion function
4525 * restore_switched_locale(), (NULL if no restoral is necessary.) */
962aa53f 4526
b0fb28a3 4527 const char * restore_to_locale = NULL;
962aa53f
KW
4528
4529 if (switch_category == template_category) { /* No changes needed */
4530 return NULL;
4531 }
4532
4533 /* Find the original locale of the category we may need to change, so that
4534 * it can be restored to later */
32899cfb 4535 restore_to_locale = querylocale_r(switch_category);
962aa53f
KW
4536 if (! restore_to_locale) {
4537 Perl_croak(aTHX_
4538 "panic: %s: %d: Could not find current %s locale, errno=%d\n",
4539 __FILE__, __LINE__, category_name(switch_category), errno);
4540 }
32899cfb 4541 restore_to_locale = savepv(restore_to_locale);
962aa53f
KW
4542
4543 /* If the locale of the template category wasn't passed in, find it now */
4544 if (template_locale == NULL) {
a683a66a 4545 template_locale = querylocale_r(template_category);
962aa53f
KW
4546 if (! template_locale) {
4547 Perl_croak(aTHX_
4548 "panic: %s: %d: Could not find current %s locale, errno=%d\n",
4549 __FILE__, __LINE__, category_name(template_category), errno);
4550 }
4551 }
4552
4553 /* It the locales are the same, there's nothing to do */
4554 if (strEQ(restore_to_locale, template_locale)) {
4555 Safefree(restore_to_locale);
4556
4557 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n",
e1c8059f 4558 category_name(switch_category), template_locale));
962aa53f
KW
4559
4560 return NULL;
4561 }
4562
4563 /* Finally, change the locale to the template one */
3980cddb 4564 if (! bool_setlocale_r(switch_category, template_locale)) {
962aa53f
KW
4565 Perl_croak(aTHX_
4566 "panic: %s: %d: Could not change %s locale to %s, errno=%d\n",
4567 __FILE__, __LINE__, category_name(switch_category),
4568 template_locale, errno);
4569 }
4570
4571 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale switched to %s\n",
4572 category_name(switch_category), template_locale));
4573
4574 return restore_to_locale;
4575}
4576
4577STATIC void
fe1c1494
KW
4578S_restore_switched_locale(pTHX_ const int category,
4579 const char * const original_locale)
962aa53f 4580{
9de6fe47
KW
4581 /* Restores the locale for LC_'category' to 'original_locale' (which is a
4582 * copy that will be freed by this function), or do nothing if the latter
4583 * parameter is NULL */
962aa53f
KW
4584
4585 if (original_locale == NULL) {
4586 return;
4587 }
4588
3980cddb 4589 if (! bool_setlocale_r(category, original_locale)) {
962aa53f
KW
4590 Perl_croak(aTHX_
4591 "panic: %s: %d: setlocale %s restore to %s failed, errno=%d\n",
4592 __FILE__, __LINE__,
4593 category_name(category), original_locale, errno);
4594 }
4595
4596 Safefree(original_locale);
4597}
4598
51332242 4599/* is_cur_LC_category_utf8 uses a small char buffer to avoid malloc/free */
fe1c1494 4600# define CUR_LC_BUFFER_SIZE 64
51332242 4601
c1284011
KW
4602bool
4603Perl__is_cur_LC_category_utf8(pTHX_ int category)
7d74bb61
KW
4604{
4605 /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
4606 * otherwise. 'category' may not be LC_ALL. If the platform doesn't have
119ee68b 4607 * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
609548d2
KW
4608 * could give the wrong result. The result will very likely be correct for
4609 * languages that have commonly used non-ASCII characters, but for notably
4610 * English, it comes down to if the locale's name ends in something like
9de6fe47
KW
4611 * "UTF-8". It errs on the side of not being a UTF-8 locale.
4612 *
4613 * If the platform is early C89, not containing mbtowc(), or we are
4614 * compiled to not pay attention to LC_CTYPE, this employs heuristics.
4615 * These work very well for non-Latin locales or those whose currency
4616 * symbol isn't a '$' nor plain ASCII text. But without LC_CTYPE and at
4617 * least MB_CUR_MAX, English locales with an ASCII currency symbol depend
4618 * on the name containing UTF-8 or not. */
7d74bb61 4619
47280b20 4620 /* Name of current locale corresponding to the input category */
8de4332b 4621 const char *save_input_locale = NULL;
47280b20
KW
4622
4623 bool is_utf8 = FALSE; /* The return value */
7d74bb61 4624
47280b20
KW
4625 /* The variables below are for the cache of previous lookups using this
4626 * function. The cache is a C string, described at the definition for
4627 * 'C_and_POSIX_utf8ness'.
4628 *
4629 * The first part of the cache is fixed, for the C and POSIX locales. The
4630 * varying part starts just after them. */
4631 char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness);
4632
4633 Size_t utf8ness_cache_size; /* Size of the varying portion */
4634 Size_t input_name_len; /* Length in bytes of save_input_locale */
4635 Size_t input_name_len_with_overhead; /* plus extra chars used to store
4636 the name in the cache */
4637 char * delimited; /* The name plus the delimiters used to store
4638 it in the cache */
51332242 4639 char buffer[CUR_LC_BUFFER_SIZE]; /* small buffer */
47280b20
KW
4640 char * name_pos; /* position of 'delimited' in the cache, or 0
4641 if not there */
4642
4643
7d4bcc4a
KW
4644# ifdef LC_ALL
4645
7d74bb61 4646 assert(category != LC_ALL);
7d4bcc4a
KW
4647
4648# endif
7d74bb61 4649
47280b20 4650 /* Get the desired category's locale */
32899cfb 4651 save_input_locale = querylocale_r(category);
7d74bb61 4652 if (! save_input_locale) {
47280b20 4653 Perl_croak(aTHX_
d707d779
KW
4654 "panic: %s: %d: Could not find current %s locale, errno=%d\n",
4655 __FILE__, __LINE__, category_name(category), errno);
7d74bb61 4656 }
32899cfb 4657 save_input_locale = savepv(save_input_locale);
47280b20 4658
47280b20
KW
4659 DEBUG_L(PerlIO_printf(Perl_debug_log,
4660 "Current locale for %s is %s\n",
4661 category_name(category), save_input_locale));
4662
4663 input_name_len = strlen(save_input_locale);
4664
4665 /* In our cache, each name is accompanied by two delimiters and a single
4666 * utf8ness digit */
4667 input_name_len_with_overhead = input_name_len + 3;
4668
51332242
N
4669 if ( input_name_len_with_overhead <= CUR_LC_BUFFER_SIZE ) {
4670 /* we can use the buffer, avoid a malloc */
4671 delimited = buffer;
4672 } else { /* need a malloc */
4673 /* Allocate and populate space for a copy of the name surrounded by the
4674 * delimiters */
4675 Newx(delimited, input_name_len_with_overhead, char);
4676 }
4677
47280b20
KW
4678 delimited[0] = UTF8NESS_SEP[0];
4679 Copy(save_input_locale, delimited + 1, input_name_len, char);
4680 delimited[input_name_len+1] = UTF8NESS_PREFIX[0];
4681 delimited[input_name_len+2] = '\0';
4682
4683 /* And see if that is in the cache */
4684 name_pos = instr(PL_locale_utf8ness, delimited);
4685 if (name_pos) {
4686 is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0';
4687
94d521c0
KW
4688 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4689 "UTF8ness for locale %s=%d, \n",
4690 save_input_locale, is_utf8));
47280b20
KW
4691
4692 /* And, if not already in that position, move it to the beginning of
4693 * the non-constant portion of the list, since it is the most recently
4694 * used. (We don't have to worry about overflow, since just moving
4695 * existing names around) */
4696 if (name_pos > utf8ness_cache) {
4697 Move(utf8ness_cache,
4698 utf8ness_cache + input_name_len_with_overhead,
4699 name_pos - utf8ness_cache, char);
4700 Copy(delimited,
4701 utf8ness_cache,
4702 input_name_len_with_overhead - 1, char);
4703 utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
4704 }
4705
51332242
N
4706 /* free only when not using the buffer */
4707 if ( delimited != buffer ) Safefree(delimited);
b07fffd1 4708 Safefree(save_input_locale);
47280b20 4709 return is_utf8;
7d74bb61
KW
4710 }
4711
47280b20
KW
4712 /* Here we don't have stored the utf8ness for the input locale. We have to
4713 * calculate it */
4714
94646a69 4715# if defined(USE_LOCALE_CTYPE) \
6201c220 4716 && ( defined(HAS_NL_LANGINFO) \
94646a69 4717 || (defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)))
7d74bb61 4718
0dec74cd 4719 {
962aa53f
KW
4720 const char *original_ctype_locale
4721 = switch_category_locale_to_template(LC_CTYPE,
4722 category,
4723 save_input_locale);
69014004 4724
7d74bb61 4725 /* Here the current LC_CTYPE is set to the locale of the category whose
0dec74cd 4726 * information is desired. This means that nl_langinfo() and mbtowc()
1d958db2 4727 * should give the correct results */
119ee68b 4728
0dec74cd
KW
4729# ifdef MB_CUR_MAX /* But we can potentially rule out UTF-8ness, avoiding
4730 calling the functions if we have this */
4731
4732 /* Standard UTF-8 needs at least 4 bytes to represent the maximum
4733 * Unicode code point. */
4734
de3a3072
KW
4735 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s: %d: MB_CUR_MAX=%d\n",
4736 __FILE__, __LINE__, (int) MB_CUR_MAX));
0dec74cd
KW
4737 if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) {
4738 is_utf8 = FALSE;
b5b2847c
KW
4739 restore_switched_locale(LC_CTYPE, original_ctype_locale);
4740 goto finish_and_return;
0dec74cd
KW
4741 }
4742
4743# endif
6201c220 4744# if defined(HAS_NL_LANGINFO)
7d4bcc4a 4745
0dec74cd
KW
4746 { /* The task is easiest if the platform has this POSIX 2001 function.
4747 Except on some platforms it can wrongly return "", so have to have
4748 a fallback. And it can return that it's UTF-8, even if there are
4749 variances from that. For example, Turkish locales may use the
4750 alternate dotted I rules, and sometimes it appears to be a
4751 defective locale definition. XXX We should probably check for
9de6fe47
KW
4752 these in the Latin1 range and warn (but on glibc, requires
4753 iswalnum() etc. due to their not handling 80-FF correctly */
4e6826bf 4754 const char *codeset = my_nl_langinfo(CODESET, FALSE);
c70a3e68 4755 /* FALSE => already in dest locale */
119ee68b 4756
af84e886 4757 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
c70a3e68
KW
4758 "\tnllanginfo returned CODESET '%s'\n", codeset));
4759
4760 if (codeset && strNE(codeset, "")) {
1d958db2 4761
1d075173
KW
4762 /* If the implementation of foldEQ() somehow were
4763 * to change to not go byte-by-byte, this could
4764 * read past end of string, as only one length is
4765 * checked. But currently, a premature NUL will
4766 * compare false, and it will stop there */
c3c9077b
KW
4767 is_utf8 = cBOOL( foldEQ(codeset, "UTF-8", STRLENs("UTF-8"))
4768 || foldEQ(codeset, "UTF8", STRLENs("UTF8")));
1d958db2 4769
69014004
KW
4770 DEBUG_L(PerlIO_printf(Perl_debug_log,
4771 "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
4772 codeset, is_utf8));
b5b2847c
KW
4773 restore_switched_locale(LC_CTYPE, original_ctype_locale);
4774 goto finish_and_return;
1d958db2 4775 }
119ee68b
KW
4776 }
4777
7d4bcc4a 4778# endif
94646a69 4779# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
0dec74cd
KW
4780 /* We can see if this is a UTF-8-like locale if have mbtowc(). It was a
4781 * late adder to C89, so very likely to have it. However, testing has
4782 * shown that, like nl_langinfo() above, there are locales that are not
4783 * strictly UTF-8 that this will return that they are */
69014004 4784
0dec74cd 4785 {
119ee68b 4786 wchar_t wc;
51fc4b19 4787 int len;
48015184 4788 dSAVEDERRNO;
51fc4b19 4789
8751a11f 4790# if defined(HAS_MBRTOWC) && defined(USE_LOCALE_THREADS)
94646a69
KW
4791
4792 mbstate_t ps;
4793
4794# endif
4795
4796 /* mbrtowc() and mbtowc() convert a byte string to a wide
4797 * character. Feed a byte string to one of them and check that the
4798 * result is the expected Unicode code point */
4799
8751a11f 4800# if defined(HAS_MBRTOWC) && defined(USE_LOCALE_THREADS)
94646a69
KW
4801 /* Prefer this function if available, as it's reentrant */
4802
a16bc7c9 4803 memzero(&ps, sizeof(ps));;
94646a69
KW
4804 PERL_UNUSED_RESULT(mbrtowc(&wc, NULL, 0, &ps)); /* Reset any shift
4805 state */
48015184 4806 SETERRNO(0, 0);
94646a69 4807 len = mbrtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8), &ps);
48015184 4808 SAVE_ERRNO;
94646a69
KW
4809
4810# else
0dec74cd 4811
7953f73f 4812 MBTOWC_LOCK;
856b881c 4813 PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
48015184 4814 SETERRNO(0, 0);
b1d4925c 4815 len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
48015184 4816 SAVE_ERRNO;
7953f73f 4817 MBTOWC_UNLOCK;
51fc4b19 4818
94646a69
KW
4819# endif
4820
48015184 4821 RESTORE_ERRNO;
af84e886 4822 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
0dec74cd 4823 "\treturn from mbtowc; len=%d; code_point=%x; errno=%d\n",
48015184 4824 len, (unsigned int) wc, GET_ERRNO));
b1d4925c 4825
0dec74cd
KW
4826 is_utf8 = cBOOL( len == STRLENs(REPLACEMENT_CHARACTER_UTF8)
4827 && wc == (wchar_t) UNICODE_REPLACEMENT);
119ee68b 4828 }
7d4bcc4a 4829
17dd77cd
FP
4830# endif
4831
962aa53f 4832 restore_switched_locale(LC_CTYPE, original_ctype_locale);
47280b20 4833 goto finish_and_return;
7d74bb61 4834 }
119ee68b 4835
0dec74cd 4836# else
7d74bb61 4837
0dec74cd
KW
4838 /* Here, we must have a C89 compiler that doesn't have mbtowc(). Next
4839 * try looking at the currency symbol to see if it disambiguates
4840 * things. Often that will be in the native script, and if the symbol
4841 * isn't in UTF-8, we know that the locale isn't. If it is non-ASCII
4842 * UTF-8, we infer that the locale is too, as the odds of a non-UTF8
4843 * string being valid UTF-8 are quite small */
fa9b773e 4844
660ee27b
KW
4845# ifdef USE_LOCALE_MONETARY
4846
4847 /* If have LC_MONETARY, we can look at the currency symbol. Often that
4848 * will be in the native script. We do this one first because there is
4849 * just one string to examine, so potentially avoids work */
7d4bcc4a 4850
9db16864
KW
4851 {
4852 const char *original_monetary_locale
660ee27b
KW
4853 = switch_category_locale_to_template(LC_MONETARY,
4854 category,
4855 save_input_locale);
9db16864 4856 bool only_ascii = FALSE;
660ee27b 4857 const U8 * currency_string
4e6826bf 4858 = (const U8 *) my_nl_langinfo(CRNCYSTR, FALSE);
660ee27b
KW
4859 /* 2nd param not relevant for this item */
4860 const U8 * first_variant;
fa9b773e 4861
660ee27b
KW
4862 assert( *currency_string == '-'
4863 || *currency_string == '+'
4864 || *currency_string == '.');
97f4de96 4865
660ee27b 4866 currency_string++;
fa9b773e 4867
660ee27b 4868 if (is_utf8_invariant_string_loc(currency_string, 0, &first_variant))
9db16864 4869 {
fe1c1494
KW
4870 DEBUG_L(PerlIO_printf(Perl_debug_log,
4871 "Couldn't get currency symbol for %s, or contains"
4872 " only ASCII; can't use for determining if UTF-8"
4873 " locale\n", save_input_locale));
9db16864
KW
4874 only_ascii = TRUE;
4875 }
4876 else {
660ee27b 4877 is_utf8 = is_strict_utf8_string(first_variant, 0);
9db16864 4878 }
fa9b773e 4879
9db16864 4880 restore_switched_locale(LC_MONETARY, original_monetary_locale);
fa9b773e 4881
9db16864 4882 if (! only_ascii) {
fa9b773e 4883
9db16864
KW
4884 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
4885 * otherwise assume the locale is UTF-8 if and only if the symbol
4886 * is non-ascii UTF-8. */
fe1c1494
KW
4887 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4888 "\t?Currency symbol for %s is UTF-8=%d\n",
9db16864
KW
4889 save_input_locale, is_utf8));
4890 goto finish_and_return;
4891 }
13542a67 4892 }
fa9b773e 4893
660ee27b 4894# endif /* USE_LOCALE_MONETARY */
7d4bcc4a 4895# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
15f7e74e 4896
9db16864
KW
4897 /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
4898 * the names of the months and weekdays, timezone, and am/pm indicator */
4899 {
4900 const char *original_time_locale
4901 = switch_category_locale_to_template(LC_TIME,
4902 category,
4903 save_input_locale);
4904 int hour = 10;
4905 bool is_dst = FALSE;
4906 int dom = 1;
4907 int month = 0;
4908 int i;
4909 char * formatted_time;
4910
4911 /* Here the current LC_TIME is set to the locale of the category
fe1c1494
KW
4912 * whose information is desired. Look at all the days of the week
4913 * and month names, and the timezone and am/pm indicator for UTF-8
4914 * variant characters. The first such a one found will tell us if
4915 * the locale is UTF-8 or not */
9db16864
KW
4916
4917 for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
4918 formatted_time = my_strftime("%A %B %Z %p",
4919 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
4920 if ( ! formatted_time
4921 || is_utf8_invariant_string((U8 *) formatted_time, 0))
4922 {
15f7e74e 4923
fe1c1494
KW
4924 /* Here, we didn't find a non-ASCII. Try the next time
4925 * through with the complemented dst and am/pm, and try
4926 * with the next weekday. After we have gotten all
4927 * weekdays, try the next month */
9db16864
KW
4928 is_dst = ! is_dst;
4929 hour = (hour + 12) % 24;
4930 dom++;
4931 if (i > 6) {
4932 month++;
4933 }
4934 continue;
15f7e74e 4935 }
9db16864
KW
4936
4937 /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
4938 * false otherwise. But first, restore LC_TIME to its original
4939 * locale if we changed it */
4940 restore_switched_locale(LC_TIME, original_time_locale);
4941
fe1c1494
KW
4942 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4943 "\t?time-related strings for %s are UTF-8=%d\n",
9db16864
KW
4944 save_input_locale,
4945 is_utf8_string((U8 *) formatted_time, 0)));
4946 is_utf8 = is_utf8_string((U8 *) formatted_time, 0);
4947 goto finish_and_return;
15f7e74e
KW
4948 }
4949
9db16864
KW
4950 /* Falling off the end of the loop indicates all the names were just
4951 * ASCII. Go on to the next test. If we changed it, restore LC_TIME
4952 * to its original locale */
962aa53f 4953 restore_switched_locale(LC_TIME, original_time_locale);
fe1c1494
KW
4954 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4955 "All time-related words for %s contain only ASCII;"
4956 " can't use for determining if UTF-8 locale\n",
4957 save_input_locale));
15f7e74e
KW
4958 }
4959
7d4bcc4a 4960# endif
15f7e74e 4961
7d4bcc4a 4962# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
855aeb93 4963
fe1c1494
KW
4964 /* This code is ifdefd out because it was found to not be necessary in
4965 * testing on our dromedary test machine, which has over 700 locales.
4966 * There, this added no value to looking at the currency symbol and the
4967 * time strings. I left it in so as to avoid rewriting it if real-world
4968 * experience indicates that dromedary is an outlier. Essentially, instead
4969 * of returning abpve if we haven't found illegal utf8, we continue on and
4970 * examine all the strerror() messages on the platform for utf8ness. If
4971 * all are ASCII, we still don't know the answer; but otherwise we have a
4972 * pretty good indication of the utf8ness. The reason this doesn't help
4973 * much is that the messages may not have been translated into the locale.
4974 * The currency symbol and time strings are much more likely to have been
4975 * translated. */
9db16864
KW
4976 {
4977 int e;
4978 bool non_ascii = FALSE;
4979 const char *original_messages_locale
4980 = switch_category_locale_to_template(LC_MESSAGES,
4981 category,
4982 save_input_locale);
4983 const char * errmsg = NULL;
4984
4985 /* Here the current LC_MESSAGES is set to the locale of the category
4986 * whose information is desired. Look through all the messages. We
4987 * can't use Strerror() here because it may expand to code that
4988 * segfaults in miniperl */
4989
4990 for (e = 0; e <= sys_nerr; e++) {
4991 errno = 0;
4992 errmsg = sys_errlist[e];
4993 if (errno || !errmsg) {
4994 break;
4995 }
4996 errmsg = savepv(errmsg);
4997 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
4998 non_ascii = TRUE;
4999 is_utf8 = is_utf8_string((U8 *) errmsg, 0);
5000 break;
5001 }
855aeb93 5002 }
9db16864 5003 Safefree(errmsg);
855aeb93 5004
9db16864 5005 restore_switched_locale(LC_MESSAGES, original_messages_locale);
855aeb93 5006
9db16864 5007 if (non_ascii) {
5857e934 5008
fe1c1494
KW
5009 /* Any non-UTF-8 message means not a UTF-8 locale; if all are
5010 * valid, any non-ascii means it is one; otherwise we assume it
5011 * isn't */
5012 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5013 "\t?error messages for %s are UTF-8=%d\n",
9db16864
KW
5014 save_input_locale,
5015 is_utf8));
5016 goto finish_and_return;
5017 }
855aeb93 5018
fe1c1494
KW
5019 DEBUG_L(PerlIO_printf(Perl_debug_log,
5020 "All error messages for %s contain only ASCII;"
5021 " can't use for determining if UTF-8 locale\n",
5022 save_input_locale));
9db16864 5023 }
855aeb93 5024
7d4bcc4a 5025# endif
0dec74cd 5026# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
92c0a900 5027 UTF-8 locale */
7d4bcc4a 5028
97f4de96
KW
5029 /* As a last resort, look at the locale name to see if it matches
5030 * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
5031 * return of setlocale(), is actually defined to be opaque, so we can't
5032 * really rely on the absence of various substrings in the name to indicate
5033 * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
5034 * be a UTF-8 locale. Similarly for the other common names */
5035
0dec74cd 5036 {
c36d8df8 5037 const Size_t final_pos = strlen(save_input_locale) - 1;
0dec74cd 5038
c36d8df8
KW
5039 if (final_pos >= 3) {
5040 const char *name = save_input_locale;
97f4de96 5041
c36d8df8
KW
5042 /* Find next 'U' or 'u' and look from there */
5043 while ((name += strcspn(name, "Uu") + 1)
5044 <= save_input_locale + final_pos - 2)
97f4de96 5045 {
c36d8df8
KW
5046 if ( isALPHA_FOLD_NE(*name, 't')
5047 || isALPHA_FOLD_NE(*(name + 1), 'f'))
5048 {
5049 continue;
5050 }
5051 name += 2;
5052 if (*(name) == '-') {
5053 if ((name > save_input_locale + final_pos - 1)) {
5054 break;
5055 }
5056 name++;
5057 }
5058 if (*(name) == '8') {
5059 DEBUG_L(PerlIO_printf(Perl_debug_log,
5060 "Locale %s ends with UTF-8 in name\n",
5061 save_input_locale));
5062 is_utf8 = TRUE;
5063 goto finish_and_return;
97f4de96 5064 }
97f4de96 5065 }
c36d8df8
KW
5066 DEBUG_L(PerlIO_printf(Perl_debug_log,
5067 "Locale %s doesn't end with UTF-8 in name\n",
5068 save_input_locale));
97f4de96 5069 }
97f4de96 5070
4d8d465a 5071# ifdef WIN32
7d4bcc4a 5072
c36d8df8
KW
5073 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
5074 if (memENDs(save_input_locale, final_pos, "65001")) {
5075 DEBUG_L(PerlIO_printf(Perl_debug_log,
8a0832a1 5076 "Locale %s ends with 65001 in name, is UTF-8 locale\n",
97f4de96 5077 save_input_locale));
c36d8df8
KW
5078 is_utf8 = TRUE;
5079 goto finish_and_return;
5080 }
7d4bcc4a 5081
4d8d465a 5082# endif
17dd77cd 5083 }
4d8d465a 5084# endif
97f4de96
KW
5085
5086 /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
5087 * since we are about to return FALSE anyway, there is no point in doing
5088 * this extra work */
7d4bcc4a 5089
0dec74cd 5090# if 0
97f4de96
KW
5091 if (instr(save_input_locale, "8859")) {
5092 DEBUG_L(PerlIO_printf(Perl_debug_log,
5093 "Locale %s has 8859 in name, not UTF-8 locale\n",
5094 save_input_locale));
47280b20
KW
5095 is_utf8 = FALSE;
5096 goto finish_and_return;
97f4de96 5097 }
0dec74cd 5098# endif
97f4de96 5099
69014004
KW
5100 DEBUG_L(PerlIO_printf(Perl_debug_log,
5101 "Assuming locale %s is not a UTF-8 locale\n",
5102 save_input_locale));
47280b20
KW
5103 is_utf8 = FALSE;
5104
0dec74cd
KW
5105# endif /* the code that is compiled when no modern LC_CTYPE */
5106
47280b20
KW
5107 finish_and_return:
5108
5109 /* Cache this result so we don't have to go through all this next time. */
5110 utf8ness_cache_size = sizeof(PL_locale_utf8ness)
5111 - (utf8ness_cache - PL_locale_utf8ness);
5112
5113 /* But we can't save it if it is too large for the total space available */
5114 if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) {
5115 Size_t utf8ness_cache_len = strlen(utf8ness_cache);
5116
5117 /* Here it can fit, but we may need to clear out the oldest cached
5118 * result(s) to do so. Check */
5119 if (utf8ness_cache_len + input_name_len_with_overhead
5120 >= utf8ness_cache_size)
5121 {
5122 /* Here we have to clear something out to make room for this.
5123 * Start looking at the rightmost place where it could fit and find
5124 * the beginning of the entry that extends past that. */
5125 char * cutoff = (char *) my_memrchr(utf8ness_cache,
5126 UTF8NESS_SEP[0],
5127 utf8ness_cache_size
5128 - input_name_len_with_overhead);
5129
5130 assert(cutoff);
5131 assert(cutoff >= utf8ness_cache);
5132
5133 /* This and all subsequent entries must be removed */
5134 *cutoff = '\0';
5135 utf8ness_cache_len = strlen(utf8ness_cache);
5136 }
5137
5138 /* Make space for the new entry */
5139 Move(utf8ness_cache,
5140 utf8ness_cache + input_name_len_with_overhead,
5141 utf8ness_cache_len + 1 /* Incl. trailing NUL */, char);
5142
5143 /* And insert it */
5144 Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char);
5145 utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
5146
72299e00 5147 if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1] & ~1) != '0') {
47280b20 5148 Perl_croak(aTHX_
7fcf9272
KW
5149 "panic: %s: %d: Corrupt utf8ness_cache=%s\nlen=%zu,"
5150 " inserted_name=%s, its_len=%zu\n",
47280b20
KW
5151 __FILE__, __LINE__,
5152 PL_locale_utf8ness, strlen(PL_locale_utf8ness),
5153 delimited, input_name_len_with_overhead);
5154 }
5155 }
5156
5157# ifdef DEBUGGING
5158
de3a3072
KW
5159 if (DEBUG_Lv_TEST) {
5160 const char * s = PL_locale_utf8ness;
5161
5162 /* Audit the structure */
5163 while (s < PL_locale_utf8ness + strlen(PL_locale_utf8ness)) {
5164 const char *e;
5165
5166 if (*s != UTF8NESS_SEP[0]) {
5167 Perl_croak(aTHX_
5168 "panic: %s: %d: Corrupt utf8ness_cache: missing"
5169 " separator %.*s<-- HERE %s\n",
5170 __FILE__, __LINE__,
fe301c93 5171 (int) (s - PL_locale_utf8ness), PL_locale_utf8ness,
de3a3072
KW
5172 s);
5173 }
5174 s++;
5175 e = strchr(s, UTF8NESS_PREFIX[0]);
5176 if (! e) {
01b91050 5177 e = PL_locale_utf8ness + strlen(PL_locale_utf8ness);
de3a3072
KW
5178 Perl_croak(aTHX_
5179 "panic: %s: %d: Corrupt utf8ness_cache: missing"
5180 " separator %.*s<-- HERE %s\n",
5181 __FILE__, __LINE__,
fe301c93 5182 (int) (e - PL_locale_utf8ness), PL_locale_utf8ness,
de3a3072
KW
5183 e);
5184 }
5185 e++;
5186 if (*e != '0' && *e != '1') {
5187 Perl_croak(aTHX_
5188 "panic: %s: %d: Corrupt utf8ness_cache: utf8ness"
5189 " must be [01] %.*s<-- HERE %s\n",
5190 __FILE__, __LINE__,
fe301c93
KW
5191 (int) (e + 1 - PL_locale_utf8ness),
5192 PL_locale_utf8ness, e + 1);
de3a3072
KW
5193 }
5194 if (ninstr(PL_locale_utf8ness, s, s-1, e)) {
5195 Perl_croak(aTHX_
5196 "panic: %s: %d: Corrupt utf8ness_cache: entry"
5197 " has duplicate %.*s<-- HERE %s\n",
5198 __FILE__, __LINE__,
fe301c93 5199 (int) (e - PL_locale_utf8ness), PL_locale_utf8ness,
de3a3072
KW
5200 e);
5201 }
5202 s = e + 1;
5203 }
5204 }
5205
94d521c0 5206 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
47280b20 5207 "PL_locale_utf8ness is now %s; returning %d\n",
94d521c0 5208 PL_locale_utf8ness, is_utf8));
47280b20
KW
5209
5210# endif
5211
51332242
N
5212 /* free only when not using the buffer */
5213 if ( delimited != buffer ) Safefree(delimited);
fa9b773e 5214 Safefree(save_input_locale);
47280b20 5215 return is_utf8;
7d74bb61
KW
5216}
5217
8ef6e574 5218#endif
7d74bb61 5219
d6ded950
KW
5220bool
5221Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
5222{
5223 /* Internal function which returns if we are in the scope of a pragma that
5224 * enables the locale category 'category'. 'compiling' should indicate if
5225 * this is during the compilation phase (TRUE) or not (FALSE). */
5226
5227 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
5228
363d7d4a
JK
5229 SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0);
5230 if (! these_categories || these_categories == &PL_sv_placeholder) {
d6ded950
KW
5231 return FALSE;
5232 }
5233
5234 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
5235 * a valid unsigned */
5236 assert(category >= -1);
363d7d4a 5237 return cBOOL(SvUV(these_categories) & (1U << (category + 1)));
d6ded950
KW
5238}
5239
2c6ee1a7 5240char *
6ebbc862
KW
5241Perl_my_strerror(pTHX_ const int errnum)
5242{
5243 /* Returns a mortalized copy of the text of the error message associated
5244 * with 'errnum'. It uses the current locale's text unless the platform
5245 * doesn't have the LC_MESSAGES category or we are not being called from
5246 * within the scope of 'use locale'. In the former case, it uses whatever
5247 * strerror returns; in the latter case it uses the text from the C locale.
5248 *
5249 * The function just calls strerror(), but temporarily switches, if needed,
5250 * to the C locale */
5251
5252 char *errstr;
5253
52770946 5254#ifndef USE_LOCALE_MESSAGES
6ebbc862 5255
52770946
KW
5256 /* If platform doesn't have messages category, we don't do any switching to
5257 * the C locale; we just use whatever strerror() returns */
5258
5259 errstr = savepv(Strerror(errnum));
5260
5261#else /* Has locale messages */
5262
5263 const bool within_locale_scope = IN_LC(LC_MESSAGES);
2c6ee1a7 5264
8751a11f 5265# ifndef USE_LOCALE_THREADS
7aaa36b1 5266
39e69e77
KW
5267 /* This function is trivial without threads. */
5268 if (within_locale_scope) {
f9ea7fff 5269 errstr = savepv(Strerror(errnum));
39e69e77
KW
5270 }
5271 else {
a683a66a 5272 const char * save_locale = savepv(querylocale_c(LC_MESSAGES));
39e69e77 5273
3980cddb 5274 void_setlocale_c(LC_MESSAGES, "C");
f9ea7fff 5275 errstr = savepv(Strerror(errnum));
3980cddb 5276 void_setlocale_c(LC_MESSAGES, save_locale);
b0bde642 5277 Safefree(save_locale);
39e69e77
KW
5278 }
5279
fe1c1494 5280# elif defined(USE_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
39e69e77
KW
5281
5282 /* This function is also trivial if we don't have to worry about thread
5283 * safety and have strerror_l(), as it handles the switch of locales so we
5284 * don't have to deal with that. We don't have to worry about thread
5285 * safety if strerror_r() is also available. Both it and strerror_l() are
5286 * thread-safe. Plain strerror() isn't thread safe. But on threaded
5287 * builds when strerror_r() is available, the apparent call to strerror()
5288 * below is actually a macro that behind-the-scenes calls strerror_r(). */
43cb6651 5289
39e69e77 5290# ifdef HAS_STRERROR_R
7aaa36b1
KW
5291
5292 if (within_locale_scope) {
f9ea7fff 5293 errstr = savepv(Strerror(errnum));
7aaa36b1
KW
5294 }
5295 else {
4eb27fc5 5296 errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
7aaa36b1
KW
5297 }
5298
43cb6651
KW
5299# else
5300
5301 /* Here we have strerror_l(), but not strerror_r() and we are on a
5302 * threaded-build. We use strerror_l() for everything, constructing a
5303 * locale to pass to it if necessary */
5304
5305 bool do_free = FALSE;
5306 locale_t locale_to_use;
5307
5308 if (within_locale_scope) {
5309 locale_to_use = uselocale((locale_t) 0);
5310 if (locale_to_use == LC_GLOBAL_LOCALE) {
5311 locale_to_use = duplocale(LC_GLOBAL_LOCALE);
5312 do_free = TRUE;
5313 }
5314 }
5315 else { /* Use C locale if not within 'use locale' scope */
5316 locale_to_use = PL_C_locale_obj;
5317 }
5318
5319 errstr = savepv(strerror_l(errnum, locale_to_use));
5320
5321 if (do_free) {
5322 freelocale(locale_to_use);
5323 }
5324
5325# endif
5326# else /* Doesn't have strerror_l() */
7aaa36b1 5327
8de4332b 5328 const char * save_locale = NULL;
c9dda6da 5329 bool locale_is_C = FALSE;
2c6ee1a7 5330
9de6fe47 5331 /* We have a critical section to prevent another thread from executing this
e9bc6d6b 5332 * same code at the same time. (On thread-safe perls, the LOCK is a
9de6fe47
KW
5333 * no-op.) Since this is the only place in core that changes LC_MESSAGES
5334 * (unless the user has called setlocale(), this works to prevent races. */
7953f73f 5335 SETLOCALE_LOCK;
6ebbc862 5336
9c8a6dc2
KW
5337 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5338 "my_strerror called with errnum %d\n", errnum));
6ebbc862 5339 if (! within_locale_scope) {
a683a66a 5340 save_locale = querylocale_c(LC_MESSAGES);
c9dda6da 5341 if (! save_locale) {
d523eeeb 5342 SETLOCALE_UNLOCK;
d707d779
KW
5343 Perl_croak(aTHX_
5344 "panic: %s: %d: Could not find current LC_MESSAGES locale,"
5345 " errno=%d\n", __FILE__, __LINE__, errno);
c9dda6da
KW
5346 }
5347 else {
5348 locale_is_C = isNAME_C_OR_POSIX(save_locale);
2c6ee1a7 5349
c9dda6da
KW
5350 /* Switch to the C locale if not already in it */
5351 if (! locale_is_C) {
2c6ee1a7 5352
c9dda6da
KW
5353 /* The setlocale() just below likely will zap 'save_locale', so
5354 * create a copy. */
5355 save_locale = savepv(save_locale);
3980cddb 5356 if (! bool_setlocale_c(LC_MESSAGES, "C")) {
a83cff23
KW
5357
5358 /* If, for some reason, the locale change failed, we
5359 * soldier on as best as possible under the circumstances,
5360 * using the current locale, and clear save_locale, so we
5361 * don't try to change back. On z/0S, all setlocale()
5362 * calls fail after you've created a thread. This is their
5363 * way of making sure the entire process is always a single
5364 * locale. This means that 'use locale' is always in place
5365 * for messages under these circumstances. */
5366 Safefree(save_locale);
5367 save_locale = NULL;
5368 }
c9dda6da 5369 }
6ebbc862 5370 }
6ebbc862 5371 } /* end of ! within_locale_scope */
9c8a6dc2
KW
5372 else {
5373 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n",
5374 __FILE__, __LINE__));
5375 }
a0b53297 5376
9c8a6dc2
KW
5377 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5378 "Any locale change has been done; about to call Strerror\n"));
52770946 5379 errstr = savepv(Strerror(errnum));
6ebbc862
KW
5380
5381 if (! within_locale_scope) {
c9dda6da 5382 if (save_locale && ! locale_is_C) {
3980cddb 5383 if (! bool_setlocale_c(LC_MESSAGES, save_locale)) {
d523eeeb 5384 SETLOCALE_UNLOCK;
d707d779 5385 Perl_croak(aTHX_
cfaae47f
KW
5386 "panic: %s: %d: setlocale restore to '%s' failed, errno=%d\n",
5387 __FILE__, __LINE__, save_locale, errno);
c9dda6da 5388 }
6ebbc862
KW
5389 Safefree(save_locale);
5390 }
5391 }
5392
7953f73f 5393 SETLOCALE_UNLOCK;
6ebbc862 5394
7aaa36b1 5395# endif /* End of doesn't have strerror_l */
6affbbf0 5396
94d521c0
KW
5397 DEBUG_Lv((PerlIO_printf(Perl_debug_log,
5398 "Strerror returned; saving a copy: '"),
5399 print_bytes_for_locale(errstr, errstr + strlen(errstr), 0),
5400 PerlIO_printf(Perl_debug_log, "'\n")));
6affbbf0 5401
d36adde0 5402#endif /* End of does have locale messages */
2c6ee1a7 5403
52770946 5404 SAVEFREEPV(errstr);
6ebbc862 5405 return errstr;
2c6ee1a7
KW
5406}
5407
66610fdd 5408/*
747c467a 5409
58e641fb
KW
5410=for apidoc switch_to_global_locale
5411
75920755 5412On systems without locale support, or on typical single-threaded builds, or on
58e641fb
KW
5413platforms that do not support per-thread locale operations, this function does
5414nothing. On such systems that do have locale support, only a locale global to
5415the whole program is available.
5416
5417On multi-threaded builds on systems that do have per-thread locale operations,
5418this function converts the thread it is running in to use the global locale.
5419This is for code that has not yet or cannot be updated to handle multi-threaded
5420locale operation. As long as only a single thread is so-converted, everything
5421works fine, as all the other threads continue to ignore the global one, so only
5422this thread looks at it.
5423
69c5e0db
KW
5424However, on Windows systems this isn't quite true prior to Visual Studio 15,
5425at which point Microsoft fixed a bug. A race can occur if you use the
5426following operations on earlier Windows platforms:
5427
5428=over
5429
5430=item L<POSIX::localeconv|POSIX/localeconv>
5431
5432=item L<I18N::Langinfo>, items C<CRNCYSTR> and C<THOUSEP>
5433
5434=item L<perlapi/Perl_langinfo>, items C<CRNCYSTR> and C<THOUSEP>
5435
5436=back
5437
5438The first item is not fixable (except by upgrading to a later Visual Studio
5439release), but it would be possible to work around the latter two items by using
5440the Windows API functions C<GetNumberFormat> and C<GetCurrencyFormat>; patches
5441welcome.
5442
58e641fb
KW
5443Without this function call, threads that use the L<C<setlocale(3)>> system
5444function will not work properly, as all the locale-sensitive functions will
5445look at the per-thread locale, and C<setlocale> will have no effect on this
5446thread.
5447
5448Perl code should convert to either call
5449L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in for the system
5450C<setlocale>) or use the methods given in L<perlcall> to call
5451L<C<POSIX::setlocale>|POSIX/setlocale>. Either one will transparently properly
5452handle all cases of single- vs multi-thread, POSIX 2008-supported or not.
5453
5454Non-Perl libraries, such as C<gtk>, that call the system C<setlocale> can
5455continue to work if this function is called before transferring control to the
5456library.
5457
5458Upon return from the code that needs to use the global locale,
5459L<C<sync_locale()>|perlapi/sync_locale> should be called to restore the safe
5460multi-thread operation.
5461
5462=cut
5463*/
5464
5465void
5466Perl_switch_to_global_locale()
5467{
4bcc858c 5468 dTHX;
58e641fb
KW
5469
5470#ifdef USE_THREAD_SAFE_LOCALE
5471# ifdef WIN32
5472
5473 _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
5474
5475# else
58e641fb
KW
5476
5477 {
5478 unsigned int i;
5479
4fd76d49 5480 for (i = 0; i < LC_ALL_INDEX_; i++) {
a683a66a 5481 setlocale(categories[i], querylocale_i(i));
58e641fb
KW
5482 }
5483 }
5484
58e641fb
KW
5485 uselocale(LC_GLOBAL_LOCALE);
5486
5487# endif
5488#endif
5489
5490}
5491
5492/*
5493
747c467a
KW
5494=for apidoc sync_locale
5495
b2e9ba0c
KW
5496L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or
5497change the locale (though changing the locale is antisocial and dangerous on
5498multi-threaded systems that don't have multi-thread safe locale operations.
5499(See L<perllocale/Multi-threaded operation>). Using the system
5500L<C<setlocale(3)>> should be avoided. Nevertheless, certain non-Perl libraries
5501called from XS, such as C<Gtk> do so, and this can't be changed. When the
5502locale is changed by XS code that didn't use
5503L<C<Perl_setlocale>|perlapi/Perl_setlocale>, Perl needs to be told that the
5504locale has changed. Use this function to do so, before returning to Perl.
5505
5506The return value is a boolean: TRUE if the global locale at the time of call
5507was in effect; and FALSE if a per-thread locale was in effect. This can be
5508used by the caller that needs to restore things as-they-were to decide whether
5509or not to call
5510L<C<Perl_switch_to_global_locale>|perlapi/switch_to_global_locale>.
747c467a
KW
5511
5512=cut
5513*/
5514
b2e9ba0c
KW
5515bool
5516Perl_sync_locale()
747c467a 5517{
d36adde0
KW
5518
5519#ifndef USE_LOCALE
5520
5521 return TRUE;
5522
5523#else
5524
e9bc6d6b 5525 const char * newlocale;
b2e9ba0c
KW
5526 dTHX;
5527
d36adde0 5528# ifdef USE_POSIX_2008_LOCALE
b2e9ba0c
KW
5529
5530 bool was_in_global_locale = FALSE;
5531 locale_t cur_obj = uselocale((locale_t) 0);
5532
5533 /* On Windows, unless the foreign code has turned off the thread-safe
5534 * locale setting, any plain setlocale() will have affected what we see, so
5535 * no need to worry. Otherwise, If the foreign code has done a plain
5536 * setlocale(), it will only affect the global locale on POSIX systems, but
5537 * will affect the */
5538 if (cur_obj == LC_GLOBAL_LOCALE) {
5539
d36adde0 5540# ifdef HAS_QUERY_LOCALE
b2e9ba0c 5541
a683a66a 5542 void_setlocale_c(LC_ALL, querylocale_c(LC_ALL));
b2e9ba0c 5543
d36adde0 5544# else
b2e9ba0c
KW
5545
5546 unsigned int i;
5547
5548 /* We can't trust that we can read the LC_ALL format on the
5549 * platform, so do them individually */
4fd76d49 5550 for (i = 0; i < LC_ALL_INDEX_; i++) {
3980cddb 5551 void_setlocale_i(i, querylocale_i(i));
b2e9ba0c 5552 }
747c467a 5553
d36adde0 5554# endif
b2e9ba0c
KW
5555
5556 was_in_global_locale = TRUE;
5557 }
5558
d36adde0 5559# else
b2e9ba0c
KW
5560
5561 bool was_in_global_locale = TRUE;
5562
d36adde0
KW
5563# endif
5564# ifdef USE_LOCALE_CTYPE
7d4bcc4a 5565
a683a66a 5566 newlocale = savepv(querylocale_c(LC_CTYPE));
9f82ea3e
KW
5567 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5568 "%s:%d: %s\n", __FILE__, __LINE__,
d188fe6d 5569 setlocale_debug_string_c(LC_CTYPE, NULL, newlocale)));
9f82ea3e 5570 new_ctype(newlocale);
b0bde642 5571 Safefree(newlocale);
747c467a 5572
d36adde0
KW
5573# endif /* USE_LOCALE_CTYPE */
5574# ifdef USE_LOCALE_COLLATE
7d4bcc4a 5575
a683a66a 5576 newlocale = savepv(querylocale_c(LC_COLLATE));
9f82ea3e
KW
5577 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5578 "%s:%d: %s\n", __FILE__, __LINE__,
d188fe6d 5579 setlocale_debug_string_c(LC_COLLATE, NULL, newlocale)));
9f82ea3e 5580 new_collate(newlocale);
b0bde642 5581 Safefree(newlocale);
747c467a 5582
d36adde0
KW
5583# endif
5584# ifdef USE_LOCALE_NUMERIC
7d4bcc4a 5585
a683a66a 5586 newlocale = savepv(querylocale_c(LC_NUMERIC));
9f82ea3e
KW
5587 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5588 "%s:%d: %s\n", __FILE__, __LINE__,
d188fe6d 5589 setlocale_debug_string_c(LC_NUMERIC, NULL, newlocale)));
9f82ea3e 5590 new_numeric(newlocale);
b0bde642 5591 Safefree(newlocale);
7d4bcc4a 5592
d36adde0 5593# endif /* USE_LOCALE_NUMERIC */
747c467a 5594
b2e9ba0c 5595 return was_in_global_locale;
d36adde0
KW
5596
5597#endif
5598
747c467a
KW
5599}
5600
5d1187d1
KW
5601#if defined(DEBUGGING) && defined(USE_LOCALE)
5602
a4f00dcc 5603STATIC char *
d188fe6d
KW
5604S_setlocale_debug_string_i(const unsigned cat_index,
5605 const char* const locale, /* Optional locale name */
5d1187d1
KW
5606
5607 /* return value from setlocale() when attempting to
5608 * set 'category' to 'locale' */
5609 const char* const retval)
5610{
5611 /* Returns a pointer to a NUL-terminated string in static storage with
5612 * added text about the info passed in. This is not thread safe and will
5613 * be overwritten by the next call, so this should be used just to
fa07b8e5 5614 * formulate a string to immediately print or savepv() on. */
5d1187d1 5615
73af6c07 5616 static char ret[1024];
d188fe6d 5617 assert(cat_index <= NOMINAL_LC_ALL_INDEX);
7d4bcc4a 5618
e5f10d49 5619 my_strlcpy(ret, "setlocale(", sizeof(ret));
d188fe6d 5620 my_strlcat(ret, category_names[cat_index], sizeof(ret));
fa07b8e5 5621 my_strlcat(ret, ", ", sizeof(ret));
5d1187d1
KW
5622
5623 if (locale) {
fa07b8e5
KW
5624 my_strlcat(ret, "\"", sizeof(ret));
5625 my_strlcat(ret, locale, sizeof(ret));
5626 my_strlcat(ret, "\"", sizeof(ret));
5d1187d1
KW
5627 }
5628 else {
fa07b8e5 5629 my_strlcat(ret, "NULL", sizeof(ret));
5d1187d1
KW
5630 }
5631
fa07b8e5 5632 my_strlcat(ret, ") returned ", sizeof(ret));
5d1187d1
KW
5633
5634 if (retval) {
fa07b8e5
KW
5635 my_strlcat(ret, "\"", sizeof(ret));
5636 my_strlcat(ret, retval, sizeof(ret));
5637 my_strlcat(ret, "\"", sizeof(ret));
5d1187d1
KW
5638 }
5639 else {
fa07b8e5 5640 my_strlcat(ret, "NULL", sizeof(ret));
5d1187d1
KW
5641 }
5642
5643 assert(strlen(ret) < sizeof(ret));
5644
5645 return ret;
5646}
5647
5648#endif
747c467a 5649
e9bc6d6b
KW
5650void
5651Perl_thread_locale_init()
5652{
5653 /* Called from a thread on startup*/
5654
5655#ifdef USE_THREAD_SAFE_LOCALE
5656
5657 dTHX_DEBUGGING;
5658
e9bc6d6b
KW
5659
5660 DEBUG_L(PerlIO_printf(Perl_debug_log,
5661 "%s:%d: new thread, initial locale is %s; calling setlocale\n",
5662 __FILE__, __LINE__, setlocale(LC_ALL, NULL)));
5663
5664# ifdef WIN32
5665
fe1c1494 5666 /* On Windows, make sure new thread has per-thread locales enabled */
e9bc6d6b
KW
5667 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
5668
5669# else
5670
fe1c1494 5671 /* This thread starts off in the C locale */
e9bc6d6b
KW
5672 Perl_setlocale(LC_ALL, "C");
5673
5674# endif
5675#endif
5676
5677}
5678
5679void
5680Perl_thread_locale_term()
5681{
5682 /* Called from a thread as it gets ready to terminate */
5683
4b06b4d0 5684#ifdef USE_POSIX_2008_LOCALE
e9bc6d6b
KW
5685
5686 /* C starts the new thread in the global C locale. If we are thread-safe,
5687 * we want to not be in the global locale */
5688
e9bc6d6b
KW
5689 { /* Free up */
5690 locale_t cur_obj = uselocale(LC_GLOBAL_LOCALE);
e72200e7 5691 if (cur_obj != LC_GLOBAL_LOCALE && cur_obj != PL_C_locale_obj) {
e9bc6d6b
KW
5692 freelocale(cur_obj);
5693 }
5694 }
5695
e9bc6d6b
KW
5696#endif
5697
5698}
747c467a
KW
5699
5700/*
14d04a33 5701 * ex: set ts=8 sts=4 sw=4 et:
37442d52 5702 */