This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Promote v5.36 usage and feature bundles doc
[perl5.git] / locale.c
CommitLineData
98994639
HS
1/* locale.c
2 *
1129b882
NC
3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
98994639
HS
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550 12 * A Elbereth Gilthoniel,
cdad3b53 13 * silivren penna míriel
4ac71550 14 * o menel aglar elenath!
cdad3b53 15 * Na-chaered palan-díriel
4ac71550
TC
16 * o galadhremmin ennorath,
17 * Fanuilos, le linnathon
18 * nef aear, si nef aearon!
19 *
20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
98994639
HS
21 */
22
166f8a29
DM
23/* utility functions for handling locale-specific stuff like what
24 * character represents the decimal point.
0d071d52 25 *
7d4bcc4a
KW
26 * All C programs have an underlying locale. Perl code generally doesn't pay
27 * any attention to it except within the scope of a 'use locale'. For most
0d071d52
KW
28 * categories, it accomplishes this by just using different operations if it is
29 * in such scope than if not. However, various libc functions called by Perl
30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31 * are used to toggle between the current locale and the C locale depending on
a9ad02a8
KW
32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is
33 * switched to the C locale for outputting the message unless within the scope
34 * of 'use locale'.
e9bc6d6b
KW
35 *
36 * This code now has multi-thread-safe locale handling on systems that support
37 * that. This is completely transparent to most XS code. On earlier systems,
38 * it would be possible to emulate thread-safe locales, but this likely would
39 * involve a lot of locale switching, and would require XS code changes.
40 * Macros could be written so that the code wouldn't have to know which type of
fe1c1494
KW
41 * system is being used.
42 *
43 * Table-driven code is used for simplicity and clarity, as many operations
44 * differ only in which category is being worked on. However the system
45 * categories need not be small contiguous integers, so do not lend themselves
46 * to table lookup. Instead we have created our own equivalent values which
47 * are all small contiguous non-negative integers, and translation functions
48 * between the two sets. For category 'LC_foo', the name of our index is
49 * LC_foo_INDEX_. Various parallel tables, indexed by these, are used.
50 *
51 * Many of the macros and functions in this file have one of the suffixes '_c',
52 * '_r', or '_i'. khw found these useful in remembering what type of locale
53 * category to use as their parameter. '_r' takes an int category number as
54 * passed to setlocale(), like LC_ALL, LC_CTYPE, etc. The 'r' indicates that
55 * the value isn't known until runtime. '_c' also indicates such a category
56 * number, but its value is known at compile time. These are both converted
57 * into unsigned indexes into various tables of category information, where the
58 * real work is generally done. The tables are generated at compile-time based
59 * on platform characteristics and Configure options. They hide from the code
60 * many of the vagaries of the different locale implementations out there. You
61 * may have already guessed that '_i' indicates the parameter is such an
62 * unsigned index. Converting from '_r' to '_i' requires run-time lookup.
63 * '_c' is used to get cpp to do this at compile time. To avoid the runtime
64 * expense, the code is structured to use '_r' at the API level, and once
65 * converted, everything possible is done using the table indexes.
66 *
67 * On unthreaded perls, most operations expand out to just the basic
68 * setlocale() calls. The same is true on threaded perls on modern Windows
69 * systems where the same API, after set up, is used for thread-safe locale
70 * handling. On other systems, there is a completely different API, specified
71 * in POSIX 2008, to do thread-safe locales. On these systems, our
72 * emulate_setlocale_i() function is used to hide the different API from the
73 * outside. This makes it completely transparent to most XS code.
74 *
75 * A huge complicating factor is that the LC_NUMERIC category is normally held
76 * in the C locale, except during those relatively rare times when it needs to
77 * be in the underlying locale. There is a bunch of code to accomplish this,
78 * and to allow easy switches from one state to the other.
79 *
80 * z/OS (os390) is an outlier. Locales really don't work under threads when
81 * either the radix character isn't a dot, or attempts are made to change
82 * locales after the first thread is created. The reason is that IBM has made
83 * it thread-safe by refusing to change locales (returning failure if
84 * attempted) any time after an application has called pthread_create() to
85 * create another thread. The expectation is that an application will set up
86 * its locale information before the first fork, and be stable thereafter. But
87 * perl toggles LC_NUMERIC if the locale's radix character isn't a dot, as do
88 * the other toggles, which are less common.
166f8a29
DM
89 */
90
4767484d
KW
91/* If the environment says to, we can output debugging information during
92 * initialization. This is done before option parsing, and before any thread
93 * creation, so can be a file-level static. (Must come before #including
94 * perl.h) */
95#ifdef DEBUGGING
96static int debug_initialization = 0;
97# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
98# define DEBUG_LOCALE_INITIALIZATION_ debug_initialization
99#else
100# define debug_initialization 0
101# define DEBUG_INITIALIZATION_set(v)
102#endif
103
b26541a9
KW
104#define DEBUG_PRE_STMTS dSAVE_ERRNO; \
105 PerlIO_printf(Perl_debug_log, "%s: %" LINE_Tf ": ", __FILE__, __LINE__);
0494a7e2
KW
106#define DEBUG_POST_STMTS RESTORE_ERRNO;
107
98994639
HS
108#include "EXTERN.h"
109#define PERL_IN_LOCALE_C
f7416781 110#include "perl_langinfo.h"
98994639
HS
111#include "perl.h"
112
a4af207c
JH
113#include "reentr.h"
114
0dec74cd
KW
115#ifdef I_WCHAR
116# include <wchar.h>
117#endif
5b64f24c
KW
118#ifdef I_WCTYPE
119# include <wctype.h>
120#endif
0dec74cd 121
22392ba2
KW
122PERL_STATIC_INLINE const char *
123S_mortalized_pv_copy(pTHX_ const char * const pv)
124{
125 PERL_ARGS_ASSERT_MORTALIZED_PV_COPY;
126
127 /* Copies the input pv, and arranges for it to be freed at an unspecified
128 * later time. */
129
130 if (pv == NULL) {
131 return NULL;
132 }
133
134 const char * copy = savepv(pv);
135 SAVEFREEPV(copy);
136 return copy;
137}
138
48015184
KW
139
140/* Returns the Unix errno portion; ignoring any others. This is a macro here
141 * instead of putting it into perl.h, because unclear to khw what should be
142 * done generally. */
143#define GET_ERRNO saved_errno
144
2fbc73dd
KW
145/* Default values come from the C locale */
146static const char C_codeset[] = "ANSI_X3.4-1968";
147static const char C_decimal_point[] = ".";
148static const char C_thousands_sep[] = "";
149
63e5b0d7
KW
150/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
151 * return of setlocale(), then this is extremely likely to be the C or POSIX
152 * locale. However, the output of setlocale() is documented to be opaque, but
153 * the odds are extremely small that it would return these two strings for some
154 * other locale. Note that VMS in these two locales includes many non-ASCII
155 * characters as controls and punctuation (below are hex bytes):
156 * cntrl: 84-97 9B-9F
157 * punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
158 * Oddly, none there are listed as alphas, though some represent alphabetics
159 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
160#define isNAME_C_OR_POSIX(name) \
161 ( (name) != NULL \
162 && (( *(name) == 'C' && (*(name + 1)) == '\0') \
163 || strEQ((name), "POSIX")))
164
3871c541
KW
165#if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO)
166# define HAS_SOME_LANGINFO
167#endif
168#if defined(HAS_LOCALECONV) || defined(HAS_LOCALECONV_L)
169# define HAS_SOME_LOCALECONV
170#endif
171
fffcb78f
KW
172#define my_langinfo_c(item, category, locale, retbufp, retbuf_sizep, utf8ness) \
173 my_langinfo_i(item, category##_INDEX_, locale, retbufp, \
174 retbuf_sizep, utf8ness)
fc46dd0d 175
8ef6e574
KW
176#ifdef USE_LOCALE
177
47280b20
KW
178/* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far
179 * looked up. This is in the form of a C string: */
180
fe1c1494
KW
181# define UTF8NESS_SEP "\v"
182# define UTF8NESS_PREFIX "\f"
47280b20
KW
183
184/* So, the string looks like:
98994639 185 *
47280b20 186 * \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0
98994639 187 *
47280b20
KW
188 * where the digit 0 after the \a indicates that the locale starting just
189 * after the preceding \v is not UTF-8, and the digit 1 mean it is. */
190
191STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1);
192STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1);
193
fe1c1494 194# define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \
47280b20
KW
195 UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0"
196
197/* The cache is initialized to C_and_POSIX_utf8ness at start up. These are
198 * kept there always. The remining portion of the cache is LRU, with the
199 * oldest looked-up locale at the tail end */
200
d188fe6d
KW
201# ifdef DEBUGGING
202# define setlocale_debug_string_c(category, locale, result) \
203 setlocale_debug_string_i(category##_INDEX_, locale, result)
204# define setlocale_debug_string_r(category, locale, result) \
205 setlocale_debug_string_i(get_category_index(category, locale), \
206 locale, result)
207# endif
208
497bfce4
KW
209# define toggle_locale_i(index, locale) \
210 S_toggle_locale_i(aTHX_ index, locale, __LINE__)
211# define toggle_locale_c(cat, locale) toggle_locale_i(cat##_INDEX_, locale)
212# define restore_toggled_locale_i(index, locale) \
213 S_restore_toggled_locale_i(aTHX_ index, locale, __LINE__)
214# define restore_toggled_locale_c(cat, locale) \
215 restore_toggled_locale_i(cat##_INDEX_, locale)
216
fe1c1494
KW
217/* Two parallel arrays indexed by our mapping of category numbers into small
218 * non-negative indexes; first the locale categories Perl uses on this system,
219 * used to do the inverse mapping. The second array is their names. These
220 * arrays are in mostly arbitrary order. */
e5f10d49 221
e129f2eb 222STATIC const int categories[] = {
e5f10d49 223
e5f10d49
KW
224# ifdef USE_LOCALE_CTYPE
225 LC_CTYPE,
226# endif
bfcd7d51
KW
227# ifdef USE_LOCALE_NUMERIC
228 LC_NUMERIC,
229# endif
e5f10d49
KW
230# ifdef USE_LOCALE_COLLATE
231 LC_COLLATE,
232# endif
233# ifdef USE_LOCALE_TIME
234 LC_TIME,
235# endif
236# ifdef USE_LOCALE_MESSAGES
237 LC_MESSAGES,
238# endif
239# ifdef USE_LOCALE_MONETARY
240 LC_MONETARY,
241# endif
9821811f
KW
242# ifdef USE_LOCALE_ADDRESS
243 LC_ADDRESS,
244# endif
245# ifdef USE_LOCALE_IDENTIFICATION
246 LC_IDENTIFICATION,
247# endif
248# ifdef USE_LOCALE_MEASUREMENT
249 LC_MEASUREMENT,
250# endif
251# ifdef USE_LOCALE_PAPER
252 LC_PAPER,
253# endif
254# ifdef USE_LOCALE_TELEPHONE
255 LC_TELEPHONE,
256# endif
36dbc955
KW
257# ifdef USE_LOCALE_SYNTAX
258 LC_SYNTAX,
259# endif
260# ifdef USE_LOCALE_TOD
261 LC_TOD,
262# endif
e5f10d49
KW
263# ifdef LC_ALL
264 LC_ALL,
265# endif
7e1cacef
KW
266
267 /* Placeholder as a precaution if code fails to check the return of
268 * get_category_index(), which returns this element to indicate an error */
269 -1
e5f10d49
KW
270};
271
272/* The top-most real element is LC_ALL */
273
e129f2eb 274STATIC const char * const category_names[] = {
e5f10d49 275
e5f10d49
KW
276# ifdef USE_LOCALE_CTYPE
277 "LC_CTYPE",
278# endif
bfcd7d51
KW
279# ifdef USE_LOCALE_NUMERIC
280 "LC_NUMERIC",
281# endif
e5f10d49
KW
282# ifdef USE_LOCALE_COLLATE
283 "LC_COLLATE",
284# endif
285# ifdef USE_LOCALE_TIME
286 "LC_TIME",
287# endif
288# ifdef USE_LOCALE_MESSAGES
289 "LC_MESSAGES",
290# endif
291# ifdef USE_LOCALE_MONETARY
292 "LC_MONETARY",
293# endif
9821811f
KW
294# ifdef USE_LOCALE_ADDRESS
295 "LC_ADDRESS",
296# endif
297# ifdef USE_LOCALE_IDENTIFICATION
298 "LC_IDENTIFICATION",
299# endif
300# ifdef USE_LOCALE_MEASUREMENT
301 "LC_MEASUREMENT",
302# endif
303# ifdef USE_LOCALE_PAPER
304 "LC_PAPER",
305# endif
306# ifdef USE_LOCALE_TELEPHONE
307 "LC_TELEPHONE",
308# endif
36dbc955
KW
309# ifdef USE_LOCALE_SYNTAX
310 "LC_SYNTAX",
311# endif
312# ifdef USE_LOCALE_TOD
313 "LC_TOD",
314# endif
e5f10d49
KW
315# ifdef LC_ALL
316 "LC_ALL",
317# endif
7e1cacef
KW
318
319 /* Placeholder as a precaution if code fails to check the return of
320 * get_category_index(), which returns this element to indicate an error */
321 NULL
322};
e5f10d49 323
c7dc1ea8
KW
324/* A few categories require additional setup when they are changed. This table
325 * points to the functions that do that setup */
326STATIC void (*update_functions[]) (pTHX_ const char *) = {
c7dc1ea8
KW
327# ifdef USE_LOCALE_CTYPE
328 S_new_ctype,
329# endif
bfcd7d51
KW
330# ifdef USE_LOCALE_NUMERIC
331 S_new_numeric,
332# endif
c7dc1ea8
KW
333# ifdef USE_LOCALE_COLLATE
334 S_new_collate,
335# endif
336# ifdef USE_LOCALE_TIME
337 NULL,
338# endif
339# ifdef USE_LOCALE_MESSAGES
340 NULL,
341# endif
342# ifdef USE_LOCALE_MONETARY
343 NULL,
344# endif
345# ifdef USE_LOCALE_ADDRESS
346 NULL,
347# endif
348# ifdef USE_LOCALE_IDENTIFICATION
349 NULL,
350# endif
351# ifdef USE_LOCALE_MEASUREMENT
352 NULL,
353# endif
354# ifdef USE_LOCALE_PAPER
355 NULL,
356# endif
357# ifdef USE_LOCALE_TELEPHONE
358 NULL,
359# endif
360# ifdef USE_LOCALE_SYNTAX
361 NULL,
362# endif
363# ifdef USE_LOCALE_TOD
364 NULL,
365# endif
366 /* No harm done to have this even without an LC_ALL */
367 S_new_LC_ALL,
368
369 /* Placeholder as a precaution if code fails to check the return of
370 * get_category_index(), which returns this element to indicate an error */
371 NULL
372};
373
e5f10d49
KW
374# ifdef LC_ALL
375
376 /* On systems with LC_ALL, it is kept in the highest index position. (-2
377 * to account for the final unused placeholder element.) */
378# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
e5f10d49
KW
379# else
380
381 /* On systems without LC_ALL, we pretend it is there, one beyond the real
382 * top element, hence in the unused placeholder element. */
383# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
e5f10d49
KW
384# endif
385
386/* Pretending there is an LC_ALL element just above allows us to avoid most
387 * special cases. Most loops through these arrays in the code below are
388 * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'. They will work
389 * on either type of system. But the code must be written to not access the
4fd76d49
KW
390 * element at 'LC_ALL_INDEX_' except on platforms that have it. This can be
391 * checked for at compile time by using the #define LC_ALL_INDEX_ which is only
948523db 392 * defined if we do have LC_ALL. */
e5f10d49 393
7e1cacef
KW
394STATIC unsigned int
395S_get_category_index(const int category, const char * locale)
396{
397 /* Given a category, return the equivalent internal index we generally use
398 * instead.
399 *
400 * 'locale' is for use in any generated diagnostics, and may be NULL
401 *
402 * Some sort of hash could be used instead of this loop, but the number of
403 * elements is so far at most 12 */
404
405 unsigned int i;
fef64ad9 406 const char * conditional_warn_text = "; can't set it to ";
7e1cacef
KW
407
408 PERL_ARGS_ASSERT_GET_CATEGORY_INDEX;
409
410# ifdef LC_ALL
411 for (i = 0; i <= LC_ALL_INDEX_; i++)
412# else
413 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)
414# endif
415 {
416 if (category == categories[i]) {
417 dTHX_DEBUGGING;
418 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b26541a9
KW
419 "index of category %d (%s) is %d\n",
420 category, category_names[i], i));
7e1cacef
KW
421 return i;
422 }
423 }
424
425 /* Here, we don't know about this category, so can't handle it. */
fef64ad9 426
7e1cacef 427 if (! locale) {
fef64ad9
KW
428 locale = "";
429 conditional_warn_text = "";
7e1cacef 430 }
fef64ad9
KW
431
432 /* diag_listed_as: Unknown locale category %d; can't set it to %s */
7e1cacef 433 Perl_warner_nocontext(packWARN(WARN_LOCALE),
fef64ad9
KW
434 "Unknown locale category %d%s%s",
435 category, conditional_warn_text, locale);
7e1cacef 436
01c966a3
KW
437# ifdef EINVAL
438
439 SETERRNO(EINVAL, LIB_INVARG);
440
441# endif
442
7e1cacef
KW
443 /* Return an out-of-bounds value */
444 return NOMINAL_LC_ALL_INDEX + 1;
445}
446
b09aaf40
KW
447STATIC const char *
448S_category_name(const int category)
449{
fef64ad9 450 unsigned int index;
b09aaf40 451
fef64ad9 452 index = get_category_index(category, NULL);
b09aaf40 453
fef64ad9
KW
454 if (index <= NOMINAL_LC_ALL_INDEX) {
455 return category_names[index];
b09aaf40
KW
456 }
457
fef64ad9 458 return Perl_form_nocontext("%d (unknown)", category);
b09aaf40
KW
459}
460
948523db 461#endif /* ifdef USE_LOCALE */
20ad7b23
KW
462
463void
464Perl_force_locale_unlock()
465{
466
467#if defined(USE_LOCALE_THREADS)
468
469 dTHX;
470# ifdef LOCALE_UNLOCK_
471 LOCALE_UNLOCK_;
472# endif
473
474#endif
475
476}
477
827a4a05
KW
478#ifdef USE_POSIX_2008_LOCALE
479
480STATIC locale_t
481S_use_curlocale_scratch(pTHX)
482{
483 /* This function is used to hide from the caller the case where the current
484 * locale_t object in POSIX 2008 is the global one, which is illegal in
485 * many of the P2008 API calls. This checks for that and, if necessary
486 * creates a proper P2008 object. Any prior object is deleted, as is any
487 * remaining object during global destruction. */
488
489 locale_t cur = uselocale((locale_t) 0);
490
491 if (cur != LC_GLOBAL_LOCALE) {
492 return cur;
493 }
494
495 if (PL_scratch_locale_obj) {
496 freelocale(PL_scratch_locale_obj);
497 }
498
499 PL_scratch_locale_obj = duplocale(LC_GLOBAL_LOCALE);
500 return PL_scratch_locale_obj;
501}
502
503#endif
8ef6e574 504
20ad7b23
KW
505void
506Perl_locale_panic(const char * msg,
507 const char * file_name,
508 const line_t line,
509 const int errnum)
510{
511 dTHX;
512
513 PERL_ARGS_ASSERT_LOCALE_PANIC;
514
515 force_locale_unlock();
516
517#ifdef USE_C_BACKTRACE
518 dump_c_backtrace(Perl_debug_log, 20, 1);
519#endif
520
521 /* diag_listed_as: panic: %s */
522 Perl_croak(aTHX_ "%s: %d: panic: %s; errno=%d\n",
523 file_name, line, msg, errnum);
524}
525
94202fb8
KW
526#define setlocale_failure_panic_c( \
527 cat, current, failed, caller_0_line, caller_1_line) \
528 setlocale_failure_panic_i(cat##_INDEX_, current, failed, \
529 caller_0_line, caller_1_line)
530
fe1c1494
KW
531/* porcelain_setlocale() presents a consistent POSIX-compliant interface to
532 * setlocale(). Windows requres a customized base-level setlocale() */
e9bc6d6b 533#ifdef WIN32
fb24522c 534# define porcelain_setlocale(cat, locale) win32_setlocale(cat, locale)
e9bc6d6b 535#else
d19f9f01
KW
536# define porcelain_setlocale(cat, locale) \
537 ((const char *) setlocale(cat, locale))
e9bc6d6b
KW
538#endif
539
32899cfb
KW
540/* The next layer up is to catch vagaries and bugs in the libc setlocale return
541 * value */
542#ifdef stdize_locale
543# define stdized_setlocale(cat, locale) \
544 stdize_locale(cat, porcelain_setlocale(cat, locale), \
545 &PL_stdize_locale_buf, &PL_stdize_locale_bufsize, __LINE__)
546#else
547# define stdized_setlocale(cat, locale) porcelain_setlocale(cat, locale)
548#endif
549
3980cddb 550/* The next many lines form a layer above the close-to-the-metal 'porcelain'
32899cfb
KW
551 * and 'stdized' macros. They are used to present a uniform API to the rest of
552 * the code in this file in spite of the disparate underlying implementations.
553 * */
fe1c1494 554
e9bc6d6b
KW
555#ifndef USE_POSIX_2008_LOCALE
556
fe1c1494
KW
557/* For non-threaded perls (which we are not to use the POSIX 2008 API on), or a
558 * thread-safe Windows one in which threading is invisible to us, the added
559 * layer just calls the base-level functions. See the introductory comments in
560 * this file for the meaning of the suffixes '_c', '_r', '_i'. */
561
50ffb02b
KW
562# define setlocale_r(cat, locale) stdized_setlocale(cat, locale)
563# define setlocale_i(i, locale) setlocale_r(categories[i], locale)
564# define setlocale_c(cat, locale) setlocale_r(cat, locale)
3980cddb 565
94202fb8
KW
566# define void_setlocale_i(i, locale) \
567 STMT_START { \
568 if (! porcelain_setlocale(categories[i], locale)) { \
569 setlocale_failure_panic_i(i, NULL, locale, __LINE__, 0); \
570 NOT_REACHED; /* NOTREACHED */ \
571 } \
572 } STMT_END
573# define void_setlocale_c(cat, locale) \
574 void_setlocale_i(cat##_INDEX_, locale)
575# define void_setlocale_r(cat, locale) \
576 void_setlocale_i(get_category_index(cat, locale), locale)
3980cddb 577
50ffb02b 578# define bool_setlocale_r(cat, locale) \
32899cfb
KW
579 cBOOL(porcelain_setlocale(cat, locale))
580# define bool_setlocale_i(i, locale) \
581 bool_setlocale_c(categories[i], locale)
50ffb02b 582# define bool_setlocale_c(cat, locale) bool_setlocale_r(cat, locale)
a683a66a 583
22392ba2
KW
584/* All the querylocale...() forms return a mortalized copy. If you need
585 * something stable across calls, you need to savepv() the result yourself */
586
587# define querylocale_r(cat) mortalized_pv_copy(setlocale_r(cat, NULL))
50ffb02b 588# define querylocale_c(cat) querylocale_r(cat)
a683a66a
KW
589# define querylocale_i(i) querylocale_c(categories[i])
590
32899cfb 591#else /* Below is defined(POSIX 2008) */
e9bc6d6b 592
fe1c1494
KW
593/* Here, there is a completely different API to get thread-safe locales. We
594 * emulate the setlocale() API with our own function(s). setlocale categories,
595 * like LC_NUMERIC, are not valid here for the POSIX 2008 API. Instead, there
596 * are equivalents, like LC_NUMERIC_MASK, which we use instead, converting to
597 * by using get_category_index() followed by table lookup. */
598
1a63cba7
KW
599# define emulate_setlocale_c(cat, locale, recalc_LC_ALL, line) \
600 emulate_setlocale_i(cat##_INDEX_, locale, recalc_LC_ALL, line)
f2780a3f 601
20be777a 602 /* A wrapper for the macros below. */
11d8dd19 603# define common_emulate_setlocale(i, locale) \
20be777a 604 emulate_setlocale_i(i, locale, YES_RECALC_LC_ALL, __LINE__)
11d8dd19
KW
605
606# define setlocale_i(i, locale) common_emulate_setlocale(i, locale)
3980cddb
KW
607# define setlocale_c(cat, locale) setlocale_i(cat##_INDEX_, locale)
608# define setlocale_r(cat, locale) \
609 setlocale_i(get_category_index(cat, locale), locale)
610
611# define void_setlocale_i(i, locale) ((void) setlocale_i(i, locale))
612# define void_setlocale_c(cat, locale) \
613 void_setlocale_i(cat##_INDEX_, locale)
614# define void_setlocale_r(cat, locale) ((void) setlocale_r(cat, locale))
615
616# define bool_setlocale_i(i, locale) cBOOL(setlocale_i(i, locale))
617# define bool_setlocale_c(cat, locale) \
618 bool_setlocale_i(cat##_INDEX_, locale)
619# define bool_setlocale_r(cat, locale) cBOOL(setlocale_r(cat, locale))
e9bc6d6b 620
22392ba2 621# define querylocale_i(i) mortalized_pv_copy(my_querylocale_i(i))
a683a66a
KW
622# define querylocale_c(cat) querylocale_i(cat##_INDEX_)
623# define querylocale_r(cat) querylocale_i(get_category_index(cat,NULL))
624
67c13e2e 625# ifndef USE_QUERYLOCALE
434e0069 626# define USE_PL_CURLOCALES
67c13e2e 627# else
d92b203c
KW
628# define isSINGLE_BIT_SET(mask) isPOWER_OF_2(mask)
629
630 /* This code used to think querylocale() was valid on LC_ALL. Make sure
631 * all instances of that have been removed */
632# define QUERYLOCALE_ASSERT(index) \
633 __ASSERT_(isSINGLE_BIT_SET(category_masks[index]))
67c13e2e
KW
634# if ! defined(HAS_QUERYLOCALE) && defined(_NL_LOCALE_NAME)
635# define querylocale_l(index, locale_obj) \
d92b203c 636 (QUERYLOCALE_ASSERT(index) \
22392ba2
KW
637 mortalized_pv_copy(nl_langinfo_l( \
638 _NL_LOCALE_NAME(categories[index]), locale_obj)))
67c13e2e
KW
639# else
640# define querylocale_l(index, locale_obj) \
22392ba2
KW
641 (QUERYLOCALE_ASSERT(index) \
642 mortalized_pv_copy(querylocale(category_masks[index], locale_obj)))
67c13e2e 643# endif
434e0069 644# endif
74a90e54
KW
645# if defined(__GLIBC__) && defined(USE_LOCALE_MESSAGES)
646# define HAS_GLIBC_LC_MESSAGES_BUG
45cef8fb 647# include <libintl.h>
45cef8fb
KW
648# endif
649
c7dc1ea8 650/* A fourth array, parallel to the ones above to map from category to its
e9bc6d6b 651 * equivalent mask */
e129f2eb 652STATIC const int category_masks[] = {
e9bc6d6b
KW
653# ifdef USE_LOCALE_CTYPE
654 LC_CTYPE_MASK,
655# endif
bfcd7d51
KW
656# ifdef USE_LOCALE_NUMERIC
657 LC_NUMERIC_MASK,
658# endif
e9bc6d6b
KW
659# ifdef USE_LOCALE_COLLATE
660 LC_COLLATE_MASK,
661# endif
662# ifdef USE_LOCALE_TIME
663 LC_TIME_MASK,
664# endif
665# ifdef USE_LOCALE_MESSAGES
666 LC_MESSAGES_MASK,
667# endif
668# ifdef USE_LOCALE_MONETARY
669 LC_MONETARY_MASK,
670# endif
671# ifdef USE_LOCALE_ADDRESS
672 LC_ADDRESS_MASK,
673# endif
674# ifdef USE_LOCALE_IDENTIFICATION
675 LC_IDENTIFICATION_MASK,
676# endif
677# ifdef USE_LOCALE_MEASUREMENT
678 LC_MEASUREMENT_MASK,
679# endif
680# ifdef USE_LOCALE_PAPER
681 LC_PAPER_MASK,
682# endif
683# ifdef USE_LOCALE_TELEPHONE
684 LC_TELEPHONE_MASK,
685# endif
36dbc955
KW
686# ifdef USE_LOCALE_SYNTAX
687 LC_SYNTAX_MASK,
688# endif
689# ifdef USE_LOCALE_TOD
690 LC_TOD_MASK,
691# endif
e9bc6d6b
KW
692 /* LC_ALL can't be turned off by a Configure
693 * option, and in Posix 2008, should always be
694 * here, so compile it in unconditionally.
695 * This could catch some glitches at compile
696 * time */
7e1cacef
KW
697 LC_ALL_MASK,
698
699 /* Placeholder as a precaution if code fails to check the return of
700 * get_category_index(), which returns this element to indicate an error */
701 0
fe1c1494 702};
e9bc6d6b 703
935c09ec
KW
704# define my_querylocale_c(cat) my_querylocale_i(cat##_INDEX_)
705
e9bc6d6b 706STATIC const char *
935c09ec 707S_my_querylocale_i(pTHX_ const unsigned int index)
e9bc6d6b 708{
935c09ec
KW
709 /* This function returns the name of the locale category given by the input
710 * index into our parallel tables of them.
e9bc6d6b
KW
711 *
712 * POSIX 2008, for some sick reason, chose not to provide a method to find
935c09ec
KW
713 * the category name of a locale, discarding a basic linguistic tenet that
714 * for any object, people will create a name for it. Some vendors have
715 * created a querylocale() function to do just that. This function is a
716 * lot simpler to implement on systems that have this. Otherwise, we have
717 * to keep track of what the locale has been set to, so that we can return
718 * its name so as to emulate setlocale(). It's also possible for C code in
719 * some library to change the locale without us knowing it, though as of
e9bc6d6b
KW
720 * September 2017, there are no occurrences in CPAN of uselocale(). Some
721 * libraries do use setlocale(), but that changes the global locale, and
935c09ec 722 * threads using per-thread locales will just ignore those changes. */
e9bc6d6b 723
4bcc858c 724 int category;
935c09ec 725 const locale_t cur_obj = uselocale((locale_t) 0);
d92b203c 726 const char * retval;
e9bc6d6b 727
935c09ec 728 PERL_ARGS_ASSERT_MY_QUERYLOCALE_I;
4bcc858c 729 assert(index <= NOMINAL_LC_ALL_INDEX);
e9bc6d6b 730
4bcc858c 731 category = categories[index];
e9bc6d6b 732
b26541a9
KW
733 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "my_querylocale_i(%s) on %p\n",
734 category_names[index], cur_obj));
e9bc6d6b 735 if (cur_obj == LC_GLOBAL_LOCALE) {
d92b203c 736 retval = porcelain_setlocale(category, NULL);
e9bc6d6b 737 }
d92b203c 738 else {
e9bc6d6b 739
67c13e2e 740# ifdef USE_QUERYLOCALE
e9bc6d6b 741
d92b203c
KW
742 /* We don't currently keep records when there is querylocale(), so have
743 * to get it anew each time */
744 retval = (index == LC_ALL_INDEX_)
745 ? calculate_LC_ALL(cur_obj)
746 : querylocale_l(index, cur_obj);
e9bc6d6b 747
d2b24094 748# else
ecdda939 749
d92b203c
KW
750 /* But we do have up-to-date values when we keep our own records */
751 retval = PL_curlocales[index];
e9bc6d6b 752
d92b203c 753# endif
e9bc6d6b 754
e9bc6d6b
KW
755 }
756
94d521c0 757 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b26541a9
KW
758 "my_querylocale_i(%s) returning '%s'\n",
759 category_names[index], retval));
d92b203c 760 return retval;
935c09ec
KW
761}
762
11d8dd19
KW
763# ifdef USE_PL_CURLOCALES
764
765STATIC const char *
766S_update_PL_curlocales_i(pTHX_
767 const unsigned int index,
768 const char * new_locale,
20be777a 769 recalc_lc_all_t recalc_LC_ALL)
11d8dd19
KW
770{
771 /* This is a helper function for emulate_setlocale_i(), mostly used to
772 * make that function easier to read. */
773
774 PERL_ARGS_ASSERT_UPDATE_PL_CURLOCALES_I;
775 assert(index <= NOMINAL_LC_ALL_INDEX);
776
777 if (index == LC_ALL_INDEX_) {
778 unsigned int i;
779
780 /* For LC_ALL, we change all individual categories to correspond */
781 /* PL_curlocales is a parallel array, so has same
782 * length as 'categories' */
783 for (i = 0; i < LC_ALL_INDEX_; i++) {
784 Safefree(PL_curlocales[i]);
785 PL_curlocales[i] = savepv(new_locale);
786 }
787
20be777a 788 recalc_LC_ALL = YES_RECALC_LC_ALL;
11d8dd19
KW
789 }
790 else {
791
792 /* Update the single category's record */
793 Safefree(PL_curlocales[index]);
794 PL_curlocales[index] = savepv(new_locale);
795
20be777a
KW
796 if (recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION) {
797 recalc_LC_ALL = (index == NOMINAL_LC_ALL_INDEX - 1)
798 ? YES_RECALC_LC_ALL
799 : DONT_RECALC_LC_ALL;
11d8dd19
KW
800 }
801 }
802
20be777a 803 if (recalc_LC_ALL == YES_RECALC_LC_ALL) {
11d8dd19
KW
804 Safefree(PL_curlocales[LC_ALL_INDEX_]);
805 PL_curlocales[LC_ALL_INDEX_] =
806 savepv(calculate_LC_ALL(PL_curlocales));
807 }
808
809 return PL_curlocales[index];
810}
f2780a3f 811
0d17a9c8
KW
812# endif /* Need PL_curlocales[] */
813
f2780a3f 814STATIC const char *
1a63cba7 815S_setlocale_from_aggregate_LC_ALL(pTHX_ const char * locale, const line_t line)
f2780a3f
KW
816{
817 /* This function parses the value of the LC_ALL locale, assuming glibc
818 * syntax, and sets each individual category on the system to the proper
819 * value.
820 *
821 * This is likely to only ever be called from one place, so exists to make
822 * the calling function easier to read by moving this ancillary code out of
823 * the main line.
824 *
825 * The locale for each category is independent of the other categories.
826 * Often, they are all the same, but certainly not always. Perl, in fact,
827 * usually keeps LC_NUMERIC in the C locale, regardless of the underlying
828 * locale. LC_ALL has to be able to represent the case of when there are
829 * varying locales. Platforms have differing ways of representing this.
830 * Because of this, the code in this file goes to lengths to avoid the
831 * issue, generally looping over the component categories instead of
832 * referring to them in the aggregate, wherever possible. However, there
833 * are cases where we have to parse our own constructed aggregates, which use
834 * the glibc syntax. */
835
22392ba2 836 const char * locale_on_entry = querylocale_c(LC_ALL);
f2780a3f
KW
837
838 PERL_ARGS_ASSERT_SETLOCALE_FROM_AGGREGATE_LC_ALL;
839
840 /* If the string that gives what to set doesn't include all categories,
841 * the omitted ones get set to "C". To get this behavior, first set
842 * all the individual categories to "C", and override the furnished
11d8dd19
KW
843 * ones below. FALSE => No need to recalculate LC_ALL, as this is a
844 * temporary state */
20be777a 845 if (! emulate_setlocale_c(LC_ALL, "C", DONT_RECALC_LC_ALL, line)) {
0998f1fd 846 setlocale_failure_panic_c(LC_ALL, locale_on_entry,
1a63cba7 847 "C", __LINE__, line);
0998f1fd 848 NOT_REACHED; /* NOTREACHED */
f2780a3f
KW
849 }
850
2ee51ed7
KW
851 const char * s = locale;
852 const char * e = locale + strlen(locale);
f2780a3f 853 while (s < e) {
2ee51ed7 854 const char * p = s;
f2780a3f
KW
855
856 /* Parse through the category */
857 while (isWORDCHAR(*p)) {
858 p++;
859 }
2ee51ed7
KW
860
861 const char * category_end = p;
f2780a3f
KW
862
863 if (*p++ != '=') {
20ad7b23
KW
864 locale_panic_(Perl_form(aTHX_
865 "Unexpected character in locale category name '%02X", *(p-1)));
f2780a3f
KW
866 }
867
868 /* Parse through the locale name */
2ee51ed7 869 const char * name_start = p;
f2780a3f
KW
870 while (p < e && *p != ';') {
871 if (! isGRAPH(*p)) {
20ad7b23
KW
872 locale_panic_(Perl_form(aTHX_
873 "Unexpected character in locale name '%02X", *p));
f2780a3f
KW
874 }
875 p++;
876 }
2ee51ed7
KW
877
878 const char * name_end = p;
f2780a3f
KW
879
880 /* Space past the semi-colon */
881 if (p < e) {
882 p++;
883 }
884
885 /* Find the index of the category name in our lists */
2ee51ed7 886 for (PERL_UINT_FAST8_T i = 0; i < LC_ALL_INDEX_; i++) {
f2780a3f 887
2ee51ed7
KW
888 /* Keep going if this index doesn't point to the category being
889 * parsed. The strnNE() avoids a Perl_form(), but would fail if
890 * ever a category name could be a substring of another one, e.g.,
891 * if there were a "LC_TIME_DATE" */
f2780a3f
KW
892 if strnNE(s, category_names[i], category_end - s) {
893 continue;
894 }
895
2ee51ed7
KW
896 /* Here i points to the category being parsed. Now isolate the
897 * locale it is being changed to */
898 const char * individ_locale = Perl_form(aTHX_ "%.*s",
f2780a3f 899 (int) (name_end - name_start), name_start);
2ee51ed7 900
11d8dd19
KW
901 /* And do the change. FALSE => Don't recalculate LC_ALL; we'll do
902 * it ourselves after the loop */
20be777a
KW
903 if (! emulate_setlocale_i(i, individ_locale,
904 DONT_RECALC_LC_ALL, line))
905 {
2ee51ed7 906
11d8dd19 907 /* But if we have to back out, do fix up LC_ALL */
20be777a
KW
908 if (! emulate_setlocale_c(LC_ALL, locale_on_entry,
909 YES_RECALC_LC_ALL, line))
1a63cba7 910 {
f2780a3f 911 setlocale_failure_panic_i(i, individ_locale,
1a63cba7 912 locale, __LINE__, line);
f2780a3f
KW
913 NOT_REACHED; /* NOTREACHED */
914 }
2ee51ed7 915
2ee51ed7
KW
916 /* Reverting to the entry value succeeded, but the operation
917 * failed to go to the requested locale. */
f2780a3f
KW
918 return NULL;
919 }
920
2ee51ed7
KW
921 /* Found and handled the desired category. Quit the inner loop to
922 * try the next category */
f2780a3f
KW
923 break;
924 }
925
2ee51ed7 926 /* Finished with this category; iterate to the next one in the input */
f2780a3f
KW
927 s = p;
928 }
929
0d17a9c8
KW
930# ifdef USE_PL_CURLOCALES
931
2ee51ed7
KW
932 /* Here we have set all the individual categories. Update the LC_ALL entry
933 * as well. We can't just use the input 'locale' as the value may omit
934 * categories whose locale is 'C'. khw thinks it's better to store a
935 * complete LC_ALL. So calculate it. */
936 const char * retval = savepv(calculate_LC_ALL(PL_curlocales));
f2780a3f
KW
937 Safefree(PL_curlocales[LC_ALL_INDEX_]);
938 PL_curlocales[LC_ALL_INDEX_] = retval;
939
0d17a9c8
KW
940# else
941
942 const char * retval = querylocale_c(LC_ALL);
943
944# endif
945
f2780a3f
KW
946 return retval;
947}
948
0d17a9c8
KW
949# ifndef USE_QUERYLOCALE
950
9444acab
KW
951STATIC const char *
952S_find_locale_from_environment(pTHX_ const unsigned int index)
953{
954 /* On systems without querylocale(), it is problematic getting the results
955 * of the POSIX 2008 equivalent of setlocale(category, "") (which gets the
956 * locale from the environment).
957 *
958 * To ensure that we know exactly what those values are, we do the setting
959 * ourselves, using the documented algorithm (assuming the documentation is
960 * correct) rather than use "" as the locale. This will lead to results
961 * that differ from native behavior if the native behavior differs from the
962 * standard documented value, but khw believes it is better to know what's
963 * going on, even if different from native, than to just guess.
964 *
965 * Another option would be, in a critical section, to save the global
966 * locale's current value, and do a straight setlocale(LC_ALL, ""). That
967 * would return our desired values, destroying the global locale's, which
968 * we would then restore. But that could cause races with any other thread
969 * that is using the global locale and isn't using the mutex. And, the
970 * only reason someone would have done that is because they are calling a
971 * library function, like in gtk, that calls setlocale(), and which can't
972 * be changed to use the mutex. That wouldn't be a problem if this were to
973 * be done before any threads had switched, say during perl construction
974 * time. But this code would still be needed for the general case. */
975
976 const char * default_name;
977 unsigned int i;
978 const char * locale_names[LC_ALL_INDEX_];
979
980 /* We rely on PerlEnv_getenv() returning a mortalized copy */
981 const char * const lc_all = PerlEnv_getenv("LC_ALL");
982
983 /* Use any "LC_ALL" environment variable, as it overrides everything
984 * else. */
985 if (lc_all && strNE(lc_all, "")) {
986 return lc_all;
987 }
988
989 /* Otherwise, we need to dig deeper. Unless overridden, the default is
990 * the LANG environment variable; "C" if it doesn't exist. */
991 default_name = PerlEnv_getenv("LANG");
992 if (! default_name || strEQ(default_name, "")) {
993 default_name = "C";
994 }
995
996 /* If setting an individual category, use its corresponding value found in
997 * the environment, if any; otherwise use the default we already
998 * calculated. */
999 if (index != LC_ALL_INDEX_) {
1000 const char * const new_value = PerlEnv_getenv(category_names[index]);
1001
1002 return (new_value && strNE(new_value, ""))
1003 ? new_value
1004 : default_name;
1005 }
1006
1007 /* Here, we are getting LC_ALL. Any categories that don't have a
1008 * corresponding environment variable set should be set to 'default_name'
1009 *
1010 * Simply find the values for all categories, and call the function to
1011 * compute LC_ALL. */
1012 for (i = 0; i < LC_ALL_INDEX_; i++) {
1013 const char * const env_override = PerlEnv_getenv(category_names[i]);
1014
1015 locale_names[i] = (env_override && strNE(env_override, ""))
1016 ? env_override
1017 : default_name;
1018
1019 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b26541a9
KW
1020 "find_locale_from_environment i=%d, name=%s, locale=%s\n",
1021 i, category_names[i], locale_names[i]));
9444acab
KW
1022 }
1023
1024 return calculate_LC_ALL(locale_names);
1025}
1026
0d17a9c8 1027# endif
f2780a3f 1028
935c09ec 1029STATIC const char *
11d8dd19 1030S_emulate_setlocale_i(pTHX_
20be777a
KW
1031
1032 /* Our internal index of the 'category' setlocale is
1033 called with */
1034 const unsigned int index,
1035
1036 const char * new_locale, /* The locale to set the category to */
1037 const recalc_lc_all_t recalc_LC_ALL, /* Explained below */
1038 const line_t line /* Called from this line number */
1039 )
935c09ec 1040{
66a28734
KW
1041 PERL_ARGS_ASSERT_EMULATE_SETLOCALE_I;
1042 assert(index <= NOMINAL_LC_ALL_INDEX);
1043
935c09ec
KW
1044 /* This function effectively performs a setlocale() on just the current
1045 * thread; thus it is thread-safe. It does this by using the POSIX 2008
1046 * locale functions to emulate the behavior of setlocale(). Similar to
1047 * regular setlocale(), the return from this function points to memory that
1048 * can be overwritten by other system calls, so needs to be copied
1049 * immediately if you need to retain it. The difference here is that
1050 * system calls besides another setlocale() can overwrite it.
1051 *
1052 * By doing this, most locale-sensitive functions become thread-safe. The
1053 * exceptions are mostly those that return a pointer to static memory.
1054 *
20be777a
KW
1055 * This function may be called in a tight loop that iterates over all
1056 * categories. Because LC_ALL is not a "real" category, but merely the sum
1057 * of all the other ones, such loops don't include LC_ALL. On systems that
1058 * have querylocale() or similar, the current LC_ALL value is immediately
1059 * retrievable; on systems lacking that feature, we have to keep track of
1060 * LC_ALL ourselves. We could do that on each iteration, only to throw it
1061 * away on the next, but the calculation is more than a trivial amount of
1062 * work. Instead, the 'recalc_LC_ALL' parameter is set to
1063 * RECALCULATE_LC_ALL_ON_FINAL_INTERATION to only do the calculation once.
1064 * This function calls itself recursively in such a loop.
11d8dd19 1065 *
20be777a
KW
1066 * When not in such a loop, the parameter is set to the other enum values
1067 * DONT_RECALC_LC_ALL or YES_RECALC_LC_ALL. */
935c09ec 1068
66a28734
KW
1069 int mask = category_masks[index];
1070 const locale_t entry_obj = uselocale((locale_t) 0);
1071 const char * locale_on_entry = querylocale_i(index);
935c09ec
KW
1072
1073 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
66a28734
KW
1074 "emulate_setlocale_i input=%d (%s), mask=0x%x,"
1075 " new locale=\"%s\", current locale=\"%s\","
1076 "index=%d, object=%p\n",
1077 categories[index], category_name(categories[index]), mask,
1078 ((new_locale == NULL) ? "(nil)" : new_locale),
1079 locale_on_entry, index, entry_obj));
1080
1081 /* Return the already-calculated info if just querying what the existing
1082 * locale is */
9ec822ee 1083 if (new_locale == NULL) {
66a28734 1084 return locale_on_entry;
935c09ec 1085 }
e9bc6d6b 1086
19e550d7
KW
1087 /* Here, trying to change the locale, but it is a no-op if the new boss is
1088 * the same as the old boss. Except this routine is called when converting
1089 * from the global locale, so in that case we will create a per-thread
1090 * locale below (with the current values). Bitter experience also
1091 * indicates that newlocale() can free up the basis locale memory if we
1092 * call it with the new and old being the same. */
1093 if ( entry_obj != LC_GLOBAL_LOCALE
1094 && locale_on_entry
1095 && strEQ(new_locale, locale_on_entry))
1096 {
1097 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1098 "(%" LINE_Tf "): emulate_setlocale_i"
1099 " no-op to change to what it already was\n",
1100 line));
42192b39
KW
1101
1102# ifdef USE_PL_CURLOCALES
1103
1104 /* On the final iteration of a loop that needs to recalculate LC_ALL, do
1105 * so. If no iteration changed anything, LC_ALL also doesn't change,
1106 * but khw believes the complexity needed to keep track of that isn't
1107 * worth it. */
1108 if (UNLIKELY( recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION
1109 && index == NOMINAL_LC_ALL_INDEX - 1))
1110 {
1111 Safefree(PL_curlocales[LC_ALL_INDEX_]);
1112 PL_curlocales[LC_ALL_INDEX_] =
1113 savepv(calculate_LC_ALL(PL_curlocales));
1114 }
1115
1116# endif
1117
19e550d7
KW
1118 return locale_on_entry;
1119 }
1120
67c13e2e 1121# ifndef USE_QUERYLOCALE
e9bc6d6b 1122
0d17a9c8
KW
1123 /* Without a querylocale() mechanism, we have to figure out ourselves what
1124 * happens with setting a locale to "" */
9ec822ee 1125 if (strEQ(new_locale, "")) {
9444acab
KW
1126 new_locale = find_locale_from_environment(index);
1127 }
c61e7ca7 1128
0d17a9c8
KW
1129# endif
1130
1131 /* So far, it has worked that a semi-colon in the locale name means that
1132 * the category is LC_ALL and it subsumes categories which don't all have
1133 * the same locale. This is the glibc syntax. */
c61e7ca7 1134 if (strchr(new_locale, ';')) {
0d17a9c8 1135 assert(index == LC_ALL_INDEX_);
1a63cba7 1136 return setlocale_from_aggregate_LC_ALL(new_locale, line);
f2780a3f 1137 }
e9bc6d6b 1138
74a90e54
KW
1139# ifdef HAS_GLIBC_LC_MESSAGES_BUG
1140
1141 /* For this bug, if the LC_MESSAGES locale changes, we have to do an
1142 * expensive workaround. Save the current value so we can later determine
1143 * if it changed. */
66a28734 1144 const char * old_messages_locale = NULL;
74a90e54
KW
1145 if ( (index == LC_MESSAGES_INDEX_ || index == LC_ALL_INDEX_)
1146 && LIKELY(PL_phase != PERL_PHASE_CONSTRUCT))
1147 {
22392ba2 1148 old_messages_locale = querylocale_c(LC_MESSAGES);
74a90e54
KW
1149 }
1150
74a90e54 1151# endif
e9bc6d6b 1152
f8115d60
KW
1153 assert(PL_C_locale_obj);
1154
2ee51ed7
KW
1155 /* Now ready to switch to the input 'new_locale' */
1156
f8115d60 1157 /* Switching locales generally entails freeing the current one's space (at
fe1c1494
KW
1158 * the C library's discretion), hence we can't be using that locale at the
1159 * time of the switch (this wasn't obvious to khw from the man pages). So
c2c9cf19
KW
1160 * switch to a known locale object that we don't otherwise mess with. */
1161 if (! uselocale(PL_C_locale_obj)) {
f8115d60 1162
947b444c
KW
1163 /* Not being able to change to the C locale is severe; don't keep
1164 * going. */
1165 setlocale_failure_panic_i(index, locale_on_entry, "C", __LINE__, line);
1166 NOT_REACHED; /* NOTREACHED */
f8115d60
KW
1167 }
1168
94d521c0 1169 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b26541a9
KW
1170 "(%" LINE_Tf "): emulate_setlocale_i now using C"
1171 " object=%p\n", line, PL_C_locale_obj));
f8115d60 1172
66a28734
KW
1173 locale_t new_obj;
1174
2ee51ed7
KW
1175 /* We created a (never changing) object at start-up for LC_ALL being in the
1176 * C locale. If this call is to switch to LC_ALL=>C, simply use that
1177 * object. But in fact, we already have switched to it just above, in
1178 * preparation for the general case. Since we're already there, no need to
1179 * do further switching. */
9ec822ee 1180 if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(new_locale)) {
b26541a9
KW
1181 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "(%" LINE_Tf "):"
1182 " emulate_setlocale_i will stay"
1183 " in C object\n", line));
70bd6bc8
KW
1184 new_obj = PL_C_locale_obj;
1185
2ee51ed7 1186 /* And free the old object if it isn't a special one */
c2c9cf19
KW
1187 if (entry_obj != LC_GLOBAL_LOCALE && entry_obj != PL_C_locale_obj) {
1188 freelocale(entry_obj);
70bd6bc8
KW
1189 }
1190 }
2ee51ed7 1191 else { /* Here is the general case, not to LC_ALL=>C */
43c2ca2c 1192 locale_t basis_obj = entry_obj;
5ce9bebe 1193
43c2ca2c
KW
1194 /* Specially handle two objects */
1195 if (basis_obj == LC_GLOBAL_LOCALE || basis_obj == PL_C_locale_obj) {
1196
1197 /* For these two objects, we make duplicates to hand to newlocale()
1198 * below. For LC_GLOBAL_LOCALE, this is because newlocale()
1199 * doesn't necessarily accept it as input (the results are
1200 * undefined). For PL_C_locale_obj, it is so that it never gets
1201 * modified, as otherwise newlocale() is free to do so */
1202 basis_obj = duplocale(entry_obj);
1203 if (! basis_obj) {
b5016ef4
KW
1204 locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): duplocale failed",
1205 line));
43c2ca2c 1206 NOT_REACHED; /* NOTREACHED */
5ce9bebe 1207 }
43c2ca2c
KW
1208
1209 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1210 "(%" LINE_Tf "): emulate_setlocale_i"
1211 " created %p by duping the input\n",
1212 line, basis_obj));
b22e9937 1213 }
f8115d60 1214
b22e9937 1215 /* Ready to create a new locale by modification of the exising one */
43c2ca2c 1216 new_obj = newlocale(mask, new_locale, basis_obj);
e9bc6d6b 1217
b22e9937 1218 if (! new_obj) {
94d521c0 1219 DEBUG_L(PerlIO_printf(Perl_debug_log,
b26541a9 1220 " (%" LINE_Tf "): emulate_setlocale_i"
43c2ca2c
KW
1221 " creating new object from %p failed:"
1222 " errno=%d\n",
1223 line, basis_obj, GET_ERRNO));
e9bc6d6b 1224
2ee51ed7
KW
1225 /* Failed. Likely this is because the proposed new locale isn't
1226 * valid on this system. But we earlier switched to the LC_ALL=>C
1227 * locale in anticipation of it succeeding, Now have to switch
1228 * back to the state upon entry */
43c2ca2c 1229 if (! uselocale(entry_obj)) {
947b444c
KW
1230 setlocale_failure_panic_i(index, "switching back to",
1231 locale_on_entry, __LINE__, line);
1232 NOT_REACHED; /* NOTREACHED */
b22e9937 1233 }
5ce9bebe
KW
1234
1235# ifdef USE_PL_CURLOCALES
1236
c2c9cf19 1237 if (entry_obj == LC_GLOBAL_LOCALE) {
5ce9bebe
KW
1238 /* Here, we are back in the global locale. We may never have
1239 * set PL_curlocales. If the locale change had succeeded, the
1240 * code would have then set them up, but since it didn't, do so
1241 * here. khw isn't sure if this prevents some issues or not,
1242 * but tis is defensive coding. The system setlocale() returns
f14a9aff 1243 * the desired information. This will calculate LC_ALL's entry
20be777a 1244 * only on the final iteration */
f14a9aff 1245 for (PERL_UINT_FAST8_T i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
5ce9bebe
KW
1246 update_PL_curlocales_i(i,
1247 porcelain_setlocale(categories[i], NULL),
20be777a 1248 RECALCULATE_LC_ALL_ON_FINAL_INTERATION);
5ce9bebe
KW
1249 }
1250 }
1251# endif
1252
b22e9937 1253 return NULL;
e9bc6d6b 1254 }
e9bc6d6b 1255
43c2ca2c
KW
1256 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1257 "(%" LINE_Tf "): emulate_setlocale_i created %p"
1258 " while freeing %p\n", line, new_obj, basis_obj));
e9bc6d6b 1259
2ee51ed7
KW
1260 /* Here, successfully created an object representing the desired
1261 * locale; now switch into it */
b22e9937 1262 if (! uselocale(new_obj)) {
b22e9937 1263 freelocale(new_obj);
947b444c
KW
1264 locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): emulate_setlocale_i"
1265 " switching into new locale failed",
1266 line));
e9bc6d6b 1267 }
70bd6bc8 1268 }
e9bc6d6b 1269
9ec822ee 1270 /* Here, we are using 'new_obj' which matches the input 'new_locale'. */
94d521c0 1271 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b26541a9 1272 "(%" LINE_Tf "): emulate_setlocale_i now using %p\n", line, new_obj));
e9bc6d6b
KW
1273
1274 /* We are done, except for updating our records (if the system doesn't keep
1275 * them) and in the case of locale "", we don't actually know what the
1276 * locale that got switched to is, as it came from the environment. So
1277 * have to find it */
1278
67c13e2e 1279# ifdef USE_QUERYLOCALE
e9bc6d6b 1280
9ec822ee
KW
1281 if (strEQ(new_locale, "")) {
1282 new_locale = querylocale_i(index);
e9bc6d6b
KW
1283 }
1284
11d8dd19 1285 PERL_UNUSED_ARG(recalc_LC_ALL);
e9bc6d6b 1286
11d8dd19 1287# else
d92b203c 1288
11d8dd19 1289 new_locale = update_PL_curlocales_i(index, new_locale, recalc_LC_ALL);
74a90e54
KW
1290
1291# endif
1292# ifdef HAS_GLIBC_LC_MESSAGES_BUG
1293
1294 /* Invalidate the glibc cache of loaded translations if the locale has changed,
1295 * see [perl #134264] */
1296 if (old_messages_locale) {
1297 if (strNE(old_messages_locale, my_querylocale_c(LC_MESSAGES))) {
1298 textdomain(textdomain(NULL));
1299 }
e9bc6d6b
KW
1300 }
1301
1302# endif
1303
9ec822ee 1304 return new_locale;
e9bc6d6b
KW
1305}
1306
3980cddb
KW
1307#endif /* End of the various implementations of the setlocale and
1308 querylocale macros used in the remainder of this program */
e9bc6d6b 1309
d36adde0 1310#ifdef USE_LOCALE
837ce802 1311
32899cfb
KW
1312/* So far, the locale strings returned by modern 2008-compliant systems have
1313 * been fine */
1314
1315STATIC const char *
1316S_stdize_locale(pTHX_ const int category,
1317 const char *input_locale,
1318 const char **buf,
1319 Size_t *buf_size,
1320 const line_t caller_line)
54ddc3dd 1321{
32899cfb
KW
1322 /* The return value of setlocale() is opaque, but is required to be usable
1323 * as input to a future setlocale() to create the same state.
1324 * Unfortunately not all systems are compliant. But most often they are of
1325 * a very restricted set of forms that this file has been coded to expect.
54ddc3dd 1326 *
32899cfb 1327 * There are some outliers, though, that this function tries to tame:
54ddc3dd 1328 *
32899cfb
KW
1329 * 1) A new-line. This function chomps any \n characters
1330 * 2) foo=bar. 'bar' is what is generally meant, and the foo= part is
1331 * stripped. This form is legal for LC_ALL. When found in
1332 * that category group, the function calls itself
1333 * recursively on each possible component category to make
1334 * sure the individual categories are ok.
1335 *
1336 * If no changes to the input were made, it is returned; otherwise the
1337 * changed version is stored into memory at *buf, with *buf_size set to its
1338 * new value, and *buf is returned.
1339 */
54ddc3dd 1340
32899cfb
KW
1341 const char * first_bad;
1342 const char * retval;
54ddc3dd
KW
1343
1344 PERL_ARGS_ASSERT_STDIZE_LOCALE;
1345
32899cfb
KW
1346 if (input_locale == NULL) {
1347 return NULL;
1348 }
1349
1350 first_bad = strpbrk(input_locale, "=\n");
1351
1352 /* Most likely, there isn't a problem with the input */
1353 if (LIKELY(! first_bad)) {
1354 return input_locale;
1355 }
1356
1357# ifdef LC_ALL
1358
1359 /* But if there is, and the category is LC_ALL, we have to look at each
1360 * component category */
1361 if (category == LC_ALL) {
1362 const char * individ_locales[LC_ALL_INDEX_];
1363 bool made_changes = FALSE;
1364 unsigned int i;
1365
1366 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1367 Size_t this_size = 0;
1368 individ_locales[i] = stdize_locale(categories[i],
1369 porcelain_setlocale(categories[i],
1370 NULL),
1371 &individ_locales[i],
1372 &this_size,
1373 caller_line);
1374
1375 /* If the size didn't change, it means this category did not have
1376 * to be adjusted, and individ_locales[i] points to the buffer
1377 * returned by porcelain_setlocale(); we have to copy that before
1378 * it's called again in the next iteration */
1379 if (this_size == 0) {
1380 individ_locales[i] = savepv(individ_locales[i]);
54ddc3dd 1381 }
32899cfb
KW
1382 else {
1383 made_changes = TRUE;
1384 }
1385 }
1386
1387 /* If all the individual categories were ok as-is, this was a false
1388 * alarm. We must have seen an '=' which was a legal occurrence in
1389 * this combination locale */
1390 if (! made_changes) {
1391 retval = input_locale; /* The input can be returned unchanged */
1392 }
1393 else {
0b8fa716 1394 retval = save_to_buffer(querylocale_c(LC_ALL), buf, buf_size);
32899cfb
KW
1395 }
1396
1397 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1398 Safefree(individ_locales[i]);
54ddc3dd 1399 }
32899cfb
KW
1400
1401 return retval;
54ddc3dd
KW
1402 }
1403
c117e8b0
KW
1404# else /* else no LC_ALL */
1405
1406 PERL_UNUSED_ARG(category);
1407 PERL_UNUSED_ARG(caller_line);
1408
32899cfb
KW
1409# endif
1410
1411 /* Here, there was a problem in an individual category. This means that at
1412 * least one adjustment will be necessary. Create a modifiable copy */
0b8fa716 1413 retval = save_to_buffer(input_locale, buf, buf_size);
32899cfb
KW
1414
1415 if (*first_bad != '=') {
54ddc3dd 1416
32899cfb
KW
1417 /* Translate the found position into terms of the copy */
1418 first_bad = retval + (first_bad - input_locale);
1419 }
1420 else { /* An '=' */
1421
1422 /* It is unlikely that the return is so screwed-up that it contains
1423 * multiple equals signs, but handle that case by stripping all of
1424 * them. */
1425 const char * final_equals = strrchr(retval, '=');
1426
1427 /* The length passed here causes the move to include the terminating
1428 * NUL */
1429 Move(final_equals + 1, retval, strlen(final_equals), char);
1430
1431 /* See if there are additional problems; if not, we're good to return.
1432 * */
1433 first_bad = strpbrk(retval, "\n");
1434
1435 if (! first_bad) {
1436 return retval;
1437 }
1438 }
1439
1440 /* Here, the problem must be a \n. Get rid of it and what follows.
1441 * (Originally, only a trailing \n was stripped. Unsure what to do if not
1442 * trailing) */
1443 *((char *) first_bad) = '\0';
1444 return retval;
54ddc3dd
KW
1445}
1446
6df0bfb6 1447#if defined(USE_POSIX_2008_LOCALE)
32899cfb 1448
d92b203c
KW
1449STATIC
1450const char *
1451
1452# ifdef USE_QUERYLOCALE
1453S_calculate_LC_ALL(pTHX_ const locale_t cur_obj)
1454# else
1455S_calculate_LC_ALL(pTHX_ const char ** individ_locales)
1456# endif
1457
1458{
1459 /* For POSIX 2008, we have to figure out LC_ALL ourselves when needed.
1460 * querylocale(), on systems that have it, doesn't tend to work for LC_ALL.
1461 * So we have to construct the answer ourselves based on the passed in
1462 * data, which is either a locale_t object, for systems with querylocale(),
1463 * or an array we keep updated to the proper values, otherwise.
1464 *
1465 * This returns a mortalized string containing the locale name(s) of
1466 * LC_ALL.
1467 *
1468 * If all individual categories are the same locale, we can just set LC_ALL
1469 * to that locale. But if not, we have to create an aggregation of all the
1470 * categories on the system. Platforms differ as to the syntax they use
1471 * for these non-uniform locales for LC_ALL. Some use a '/' or other
1472 * delimiter of the locales with a predetermined order of categories; a
1473 * Configure probe would be needed to tell us how to decipher those. glibc
1474 * uses a series of name=value pairs, like
1475 * LC_NUMERIC=C;LC_TIME=en_US.UTF-8;...
1476 * The syntax we use for our aggregation doesn't much matter, as we take
1477 * care not to use the native setlocale() function on whatever style is
1478 * chosen. But, it would be possible for someone to call Perl_setlocale()
1479 * using a native style we don't understand. So far no one has complained.
1480 *
1481 * For systems that have categories we don't know about, the algorithm
1482 * below won't know about those missing categories, leading to potential
1483 * bugs for code that looks at them. If there is an environment variable
1484 * that sets that category, we won't know to look for it, and so our use of
1485 * LANG or "C" improperly overrides it. On the other hand, if we don't do
1486 * what is done here, and there is no environment variable, the category's
1487 * locale should be set to LANG or "C". So there is no good solution. khw
1488 * thinks the best is to make sure we have a complete list of possible
1489 * categories, adding new ones as they show up on obscure platforms.
1490 */
1491
1492 unsigned int i;
1493 Size_t names_len = 0;
1494 bool are_all_categories_the_same_locale = TRUE;
1495 char * aggregate_locale;
1496 char * previous_start = NULL;
36540e72 1497 char * this_start = NULL;
d92b203c
KW
1498 Size_t entry_len = 0;
1499
1500 PERL_ARGS_ASSERT_CALCULATE_LC_ALL;
1501
1502 /* First calculate the needed size for the string listing the categories
1503 * and their locales. */
1504 for (i = 0; i < LC_ALL_INDEX_; i++) {
1505
1506# ifdef USE_QUERYLOCALE
1507 const char * entry = querylocale_l(i, cur_obj);
1508# else
1509 const char * entry = individ_locales[i];
1510# endif
1511
d92b203c
KW
1512 names_len += strlen(category_names[i])
1513 + 1 /* '=' */
1514 + strlen(entry)
1515 + 1; /* ';' */
1516 }
1517
1518 names_len++; /* Trailing '\0' */
1519
1520 /* Allocate enough space for the aggregated string */
1521 SAVEFREEPV(Newxz(aggregate_locale, names_len, char));
1522
1523 /* Then fill it in */
1524 for (i = 0; i < LC_ALL_INDEX_; i++) {
1525 Size_t new_len;
1526
1527# ifdef USE_QUERYLOCALE
1528 const char * entry = querylocale_l(i, cur_obj);
1529# else
1530 const char * entry = individ_locales[i];
1531# endif
1532
d92b203c
KW
1533 new_len = my_strlcat(aggregate_locale, category_names[i], names_len);
1534 assert(new_len <= names_len);
1535 new_len = my_strlcat(aggregate_locale, "=", names_len);
1536 assert(new_len <= names_len);
1537
1538 this_start = aggregate_locale + strlen(aggregate_locale);
1539 entry_len = strlen(entry);
1540
1541 new_len = my_strlcat(aggregate_locale, entry, names_len);
1542 assert(new_len <= names_len);
1543 new_len = my_strlcat(aggregate_locale, ";", names_len);
1544 assert(new_len <= names_len);
1545 PERL_UNUSED_VAR(new_len); /* Only used in DEBUGGING */
1546
1547 if ( i > 0
1548 && are_all_categories_the_same_locale
1549 && memNE(previous_start, this_start, entry_len + 1))
1550 {
1551 are_all_categories_the_same_locale = FALSE;
1552 }
1553 else {
1554 previous_start = this_start;
1555 }
1556 }
1557
1558 /* If they are all the same, just return any one of them */
1559 if (are_all_categories_the_same_locale) {
1560 aggregate_locale = this_start;
1561 aggregate_locale[entry_len] = '\0';
1562 }
1563
1564 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b26541a9
KW
1565 "calculate_LC_ALL returning '%s'\n",
1566 aggregate_locale));
d92b203c
KW
1567
1568 return aggregate_locale;
1569}
6df0bfb6 1570#endif /*defined(USE_POSIX_2008_LOCALE)*/
d92b203c 1571
a4f00dcc 1572STATIC void
94202fb8
KW
1573S_setlocale_failure_panic_i(pTHX_
1574 const unsigned int cat_index,
1575 const char * current,
1576 const char * failed,
1577 const line_t caller_0_line,
1578 const line_t caller_1_line)
1579{
20ad7b23 1580 dSAVE_ERRNO;
94202fb8
KW
1581 const int cat = categories[cat_index];
1582 const char * name = category_names[cat_index];
94202fb8
KW
1583
1584 PERL_ARGS_ASSERT_SETLOCALE_FAILURE_PANIC_I;
1585
94202fb8
KW
1586 if (current == NULL) {
1587 current = querylocale_i(cat_index);
1588 }
1589
20ad7b23
KW
1590 Perl_locale_panic(Perl_form(aTHX_ "(%" LINE_Tf
1591 "): Can't change locale for %s(%d)"
1592 " from '%s' to '%s'",
1593 caller_1_line, name, cat,
1594 current, failed),
1595 __FILE__, caller_0_line, GET_ERRNO);
94202fb8
KW
1596 NOT_REACHED; /* NOTREACHED */
1597}
1598
fc76ce84
KW
1599/* Any of these will allow us to find the RADIX */
1600# if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_SOME_LANGINFO) \
1601 || defined(HAS_SOME_LOCALECONV) \
1602 || defined(HAS_SNPRINTF))
1603# define CAN_CALCULATE_RADIX
1604# endif
1605
94202fb8 1606STATIC void
86799d2d 1607S_set_numeric_radix(pTHX_ const bool use_locale)
98994639 1608{
86799d2d
KW
1609 /* If 'use_locale' is FALSE, set to use a dot for the radix character. If
1610 * TRUE, use the radix character derived from the current locale */
7d4bcc4a 1611
fc76ce84 1612# ifdef CAN_CALCULATE_RADIX
98994639 1613
fffcb78f 1614 utf8ness_t utf8ness = UTF8NESS_IMMATERIAL;
fc46dd0d 1615 const char * radix;
07492de5 1616 const char * scratch_buffer = NULL;
fc46dd0d
KW
1617
1618 if (! use_locale) {
1619 radix = C_decimal_point;
1620 }
1621 else {
1622 radix = my_langinfo_c(RADIXCHAR, LC_NUMERIC,
1623 PL_numeric_name,
fffcb78f 1624 &scratch_buffer, NULL, &utf8ness);
fc46dd0d 1625 }
2213a3be 1626
87f8e8e7 1627 sv_setpv(PL_numeric_radix_sv, radix);
07492de5 1628 Safefree(scratch_buffer);
86799d2d 1629
fffcb78f 1630 if (utf8ness == UTF8NESS_YES) {
87f8e8e7
KW
1631 SvUTF8_on(PL_numeric_radix_sv);
1632 }
86799d2d 1633
94d521c0 1634 DEBUG_L(PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
3ca88433 1635 SvPVX(PL_numeric_radix_sv),
94d521c0 1636 cBOOL(SvUTF8(PL_numeric_radix_sv))));
fe1c1494 1637# else
d36adde0
KW
1638
1639 PERL_UNUSED_ARG(use_locale);
1640
fe1c1494 1641# endif /* USE_LOCALE_NUMERIC and can find the radix char */
7d4bcc4a 1642
98994639
HS
1643}
1644
398eeea9
KW
1645STATIC void
1646S_new_numeric(pTHX_ const char *newnum)
98994639 1647{
7d4bcc4a 1648
fe1c1494 1649# ifndef USE_LOCALE_NUMERIC
7d4bcc4a
KW
1650
1651 PERL_UNUSED_ARG(newnum);
1652
fe1c1494 1653# else
0d071d52 1654
291a84fb 1655 /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell
fe1c1494
KW
1656 * core Perl this and that 'newnum' is the name of the new locale, and we
1657 * are switched into it. It installs this locale as the current underlying
1658 * default, and then switches to the C locale, if necessary, so that the
1659 * code that has traditionally expected the radix character to be a dot may
1660 * continue to do so.
0d071d52
KW
1661 *
1662 * The default locale and the C locale can be toggled between by use of the
5792c642
KW
1663 * set_numeric_underlying() and set_numeric_standard() functions, which
1664 * should probably not be called directly, but only via macros like
0d071d52
KW
1665 * SET_NUMERIC_STANDARD() in perl.h.
1666 *
1667 * The toggling is necessary mainly so that a non-dot radix decimal point
fe1c1494
KW
1668 * character can be input and output, while allowing internal calculations
1669 * to use a dot.
0d071d52
KW
1670 *
1671 * This sets several interpreter-level variables:
bb304765 1672 * PL_numeric_name The underlying locale's name: a copy of 'newnum'
892e6465 1673 * PL_numeric_underlying A boolean indicating if the toggled state is such
7738054c
KW
1674 * that the current locale is the program's underlying
1675 * locale
1676 * PL_numeric_standard An int indicating if the toggled state is such
4c68b815
KW
1677 * that the current locale is the C locale or
1678 * indistinguishable from the C locale. If non-zero, it
1679 * is in C; if > 1, it means it may not be toggled away
7738054c 1680 * from C.
4c68b815
KW
1681 * PL_numeric_underlying_is_standard A bool kept by this function
1682 * indicating that the underlying locale and the standard
1683 * C locale are indistinguishable for the purposes of
1684 * LC_NUMERIC. This happens when both of the above two
1685 * variables are true at the same time. (Toggling is a
1686 * no-op under these circumstances.) This variable is
1687 * used to avoid having to recalculate.
fe1c1494
KW
1688 * PL_numeric_radix_sv Contains the string that code should use for the
1689 * decimal point. It is set to either a dot or the
1690 * program's underlying locale's radix character string,
1691 * depending on the situation.
1692 * PL_underlying_numeric_obj = (only on POSIX 2008 platforms) An object
1693 * with everything set up properly so as to avoid work on
1694 * such platforms.
398eeea9 1695 */
0d071d52 1696
b03f34cf 1697 char *save_newnum;
98994639
HS
1698
1699 if (! newnum) {
1604cfb0 1700 Safefree(PL_numeric_name);
7b9f2e4d 1701 PL_numeric_name = savepv("C");
1604cfb0
MS
1702 PL_numeric_standard = TRUE;
1703 PL_numeric_underlying = TRUE;
1704 PL_numeric_underlying_is_standard = TRUE;
1705 return;
98994639
HS
1706 }
1707
32899cfb 1708 save_newnum = savepv(newnum);
892e6465 1709 PL_numeric_underlying = TRUE;
4c68b815
KW
1710 PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
1711
fe1c1494 1712# ifndef TS_W32_BROKEN_LOCALECONV
69c5e0db 1713
4c68b815 1714 /* If its name isn't C nor POSIX, it could still be indistinguishable from
aa8dc9b1 1715 * them. But on broken Windows systems calling my_langinfo() for
69c5e0db
KW
1716 * THOUSEP can currently (but rarely) cause a race, so avoid doing that,
1717 * and just always change the locale if not C nor POSIX on those systems */
4c68b815 1718 if (! PL_numeric_standard) {
07492de5
KW
1719 const char * scratch_buffer = NULL;
1720 PL_numeric_standard = strEQ(C_decimal_point,
fc46dd0d
KW
1721 my_langinfo_c(RADIXCHAR, LC_NUMERIC,
1722 save_newnum,
fffcb78f 1723 &scratch_buffer, NULL, NULL));
07492de5 1724 Safefree(scratch_buffer);
fc46dd0d 1725 scratch_buffer = NULL;
07492de5
KW
1726
1727 PL_numeric_standard &= strEQ(C_thousands_sep,
fc46dd0d
KW
1728 my_langinfo_c(THOUSEP, LC_NUMERIC,
1729 save_newnum,
fffcb78f 1730 &scratch_buffer, NULL, NULL));
07492de5 1731 Safefree(scratch_buffer);
4c68b815 1732 }
abe1abcf 1733
fe1c1494 1734# endif
69c5e0db 1735
4c68b815 1736 /* Save the new name if it isn't the same as the previous one, if any */
7b9f2e4d 1737 if (strNE(PL_numeric_name, save_newnum)) {
fe1c1494 1738 /* Save the locale name for future use */
1604cfb0
MS
1739 Safefree(PL_numeric_name);
1740 PL_numeric_name = save_newnum;
b03f34cf 1741 }
abe1abcf 1742 else {
1604cfb0 1743 Safefree(save_newnum);
abe1abcf 1744 }
4c28b29c 1745
4c68b815
KW
1746 PL_numeric_underlying_is_standard = PL_numeric_standard;
1747
4b06b4d0 1748# ifdef USE_POSIX_2008_LOCALE
e1aa2579 1749
fe1c1494 1750 /* We keep a special object for easy switching to */
e1aa2579
KW
1751 PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
1752 PL_numeric_name,
1753 PL_underlying_numeric_obj);
1754
fe1c1494 1755# endif
e1aa2579 1756
94d521c0
KW
1757 DEBUG_L( PerlIO_printf(Perl_debug_log,
1758 "Called new_numeric with %s, PL_numeric_name=%s\n",
1759 newnum, PL_numeric_name));
4c68b815 1760
fe1c1494
KW
1761 /* Keep LC_NUMERIC so that it has the C locale radix and thousands
1762 * separator. This is for XS modules, so they don't have to worry about
1763 * the radix being a non-dot. (Core operations that need the underlying
1764 * locale change to it temporarily). */
84f1d29f
KW
1765 if (PL_numeric_standard) {
1766 set_numeric_radix(0);
1767 }
1768 else {
1769 set_numeric_standard();
1770 }
4c28b29c 1771
fe1c1494 1772# endif
7d4bcc4a 1773
98994639
HS
1774}
1775
1776void
1777Perl_set_numeric_standard(pTHX)
1778{
7d4bcc4a 1779
fe1c1494 1780# ifdef USE_LOCALE_NUMERIC
7d4bcc4a 1781
fe1c1494
KW
1782 /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
1783 * default.
1784 *
1785 * Most code should use the macro SET_NUMERIC_STANDARD() in perl.h
1786 * instead of calling this directly. The macro avoids calling this routine
1787 * if toggling isn't necessary according to our records (which could be
1788 * wrong if some XS code has changed the locale behind our back) */
0d071d52 1789
94d521c0
KW
1790 DEBUG_L(PerlIO_printf(Perl_debug_log,
1791 "Setting LC_NUMERIC locale to standard C\n"));
7bb8f702 1792
3980cddb 1793 void_setlocale_c(LC_NUMERIC, "C");
7bb8f702
KW
1794 PL_numeric_standard = TRUE;
1795 PL_numeric_underlying = PL_numeric_underlying_is_standard;
1796 set_numeric_radix(0);
1797
fe1c1494 1798# endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 1799
98994639
HS
1800}
1801
1802void
5792c642 1803Perl_set_numeric_underlying(pTHX)
98994639 1804{
7d4bcc4a 1805
fe1c1494 1806# ifdef USE_LOCALE_NUMERIC
7d4bcc4a 1807
fe1c1494
KW
1808 /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
1809 * default.
1810 *
1811 * Most code should use the macro SET_NUMERIC_UNDERLYING() in perl.h
7d4bcc4a
KW
1812 * instead of calling this directly. The macro avoids calling this routine
1813 * if toggling isn't necessary according to our records (which could be
1814 * wrong if some XS code has changed the locale behind our back) */
a9b8c0d8 1815
94d521c0
KW
1816 DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to %s\n",
1817 PL_numeric_name));
7bb8f702 1818
3980cddb 1819 void_setlocale_c(LC_NUMERIC, PL_numeric_name);
7bb8f702
KW
1820 PL_numeric_standard = PL_numeric_underlying_is_standard;
1821 PL_numeric_underlying = TRUE;
1822 set_numeric_radix(! PL_numeric_standard);
1823
fe1c1494 1824# endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 1825
98994639
HS
1826}
1827
1828/*
1829 * Set up for a new ctype locale.
1830 */
a4f00dcc
KW
1831STATIC void
1832S_new_ctype(pTHX_ const char *newctype)
98994639 1833{
7d4bcc4a 1834
fe1c1494 1835# ifndef USE_LOCALE_CTYPE
7d4bcc4a 1836
7d4bcc4a
KW
1837 PERL_UNUSED_ARG(newctype);
1838 PERL_UNUSED_CONTEXT;
1839
fe1c1494 1840# else
0d071d52 1841
291a84fb 1842 /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
0d071d52
KW
1843 * core Perl this and that 'newctype' is the name of the new locale.
1844 *
1845 * This function sets up the folding arrays for all 256 bytes, assuming
1846 * that tofold() is tolc() since fold case is not a concept in POSIX,
1847 *
1848 * Any code changing the locale (outside this file) should use
9aac5db8
KW
1849 * Perl_setlocale or POSIX::setlocale, which call this function. Therefore
1850 * this function should be called directly only from this file and from
0d071d52
KW
1851 * POSIX::setlocale() */
1852
013f4e03 1853 unsigned int i;
98994639 1854
8b7358b9
KW
1855 /* Don't check for problems if we are suppressing the warnings */
1856 bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
30d8090d 1857 bool maybe_utf8_turkic = FALSE;
8b7358b9 1858
7918f24d
NC
1859 PERL_ARGS_ASSERT_NEW_CTYPE;
1860
1a63cba7
KW
1861 DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering new_ctype(%s)\n", newctype));
1862
215c5139
KW
1863 /* We will replace any bad locale warning with 1) nothing if the new one is
1864 * ok; or 2) a new warning for the bad new locale */
1865 if (PL_warn_locale) {
1866 SvREFCNT_dec_NN(PL_warn_locale);
1867 PL_warn_locale = NULL;
1868 }
1869
a0eeb339 1870 PL_in_utf8_CTYPE_locale = is_locale_utf8(newctype);
31f05a37
KW
1871
1872 /* A UTF-8 locale gets standard rules. But note that code still has to
1873 * handle this specially because of the three problematic code points */
1874 if (PL_in_utf8_CTYPE_locale) {
1875 Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
30d8090d
KW
1876
1877 /* UTF-8 locales can have special handling for 'I' and 'i' if they are
fe1c1494
KW
1878 * Turkic. Make sure these two are the only anomalies. (We don't
1879 * require towupper and towlower because they aren't in C89.) */
5b64f24c 1880
fe1c1494 1881# if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER)
5b64f24c 1882
fe1c1494 1883 if (towupper('i') == 0x130 && towlower('I') == 0x131)
5b64f24c 1884
fe1c1494 1885# else
5b64f24c 1886
fe1c1494 1887 if (toU8_UPPER_LC('i') == 'i' && toU8_LOWER_LC('I') == 'I')
5b64f24c 1888
fe1c1494
KW
1889# endif
1890
1891 {
1892 /* This is how we determine it really is Turkic */
30d8090d
KW
1893 check_for_problems = TRUE;
1894 maybe_utf8_turkic = TRUE;
1895 }
31f05a37 1896 }
8b7358b9
KW
1897
1898 /* We don't populate the other lists if a UTF-8 locale, but do check that
1899 * everything works as expected, unless checking turned off */
1900 if (check_for_problems || ! PL_in_utf8_CTYPE_locale) {
8c6180a9
KW
1901 /* Assume enough space for every character being bad. 4 spaces each
1902 * for the 94 printable characters that are output like "'x' "; and 5
1903 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
1904 * NUL */
8b7358b9 1905 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
8c6180a9
KW
1906 bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
1907 to start */
1908 unsigned int bad_count = 0; /* Count of bad characters */
1909
baa60164 1910 for (i = 0; i < 256; i++) {
8b7358b9 1911 if (! PL_in_utf8_CTYPE_locale) {
dcbeec20
KW
1912 if (isU8_UPPER_LC(i))
1913 PL_fold_locale[i] = (U8) toU8_LOWER_LC(i);
1914 else if (isU8_LOWER_LC(i))
1915 PL_fold_locale[i] = (U8) toU8_UPPER_LC(i);
bd6d0898
KW
1916 else
1917 PL_fold_locale[i] = (U8) i;
8b7358b9 1918 }
8c6180a9
KW
1919
1920 /* If checking for locale problems, see if the native ASCII-range
1921 * printables plus \n and \t are in their expected categories in
1922 * the new locale. If not, this could mean big trouble, upending
1923 * Perl's and most programs' assumptions, like having a
1924 * metacharacter with special meaning become a \w. Fortunately,
1925 * it's very rare to find locales that aren't supersets of ASCII
1926 * nowadays. It isn't a problem for most controls to be changed
1927 * into something else; we check only \n and \t, though perhaps \r
1928 * could be an issue as well. */
7d4bcc4a 1929 if ( check_for_problems
8c6180a9
KW
1930 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
1931 {
8b7358b9 1932 bool is_bad = FALSE;
6726b4f4 1933 char name[4] = { '\0' };
8b7358b9
KW
1934
1935 /* Convert the name into a string */
6726b4f4 1936 if (isGRAPH_A(i)) {
8b7358b9
KW
1937 name[0] = i;
1938 name[1] = '\0';
1939 }
1940 else if (i == '\n') {
6726b4f4
KW
1941 my_strlcpy(name, "\\n", sizeof(name));
1942 }
1943 else if (i == '\t') {
1944 my_strlcpy(name, "\\t", sizeof(name));
8b7358b9
KW
1945 }
1946 else {
6726b4f4
KW
1947 assert(i == ' ');
1948 my_strlcpy(name, "' '", sizeof(name));
8b7358b9
KW
1949 }
1950
1951 /* Check each possibe class */
dcbeec20 1952 if (UNLIKELY(cBOOL(isU8_ALPHANUMERIC_LC(i)) != cBOOL(isALPHANUMERIC_A(i)))) {
8b7358b9
KW
1953 is_bad = TRUE;
1954 DEBUG_L(PerlIO_printf(Perl_debug_log,
58236863 1955 "isalnum('%s') unexpectedly is %x\n",
dcbeec20 1956 name, cBOOL(isU8_ALPHANUMERIC_LC(i))));
8b7358b9 1957 }
dcbeec20 1958 if (UNLIKELY(cBOOL(isU8_ALPHA_LC(i)) != cBOOL(isALPHA_A(i)))) {
8b7358b9
KW
1959 is_bad = TRUE;
1960 DEBUG_L(PerlIO_printf(Perl_debug_log,
58236863 1961 "isalpha('%s') unexpectedly is %x\n",
dcbeec20 1962 name, cBOOL(isU8_ALPHA_LC(i))));
8b7358b9 1963 }
dcbeec20 1964 if (UNLIKELY(cBOOL(isU8_DIGIT_LC(i)) != cBOOL(isDIGIT_A(i)))) {
8b7358b9
KW
1965 is_bad = TRUE;
1966 DEBUG_L(PerlIO_printf(Perl_debug_log,
58236863 1967 "isdigit('%s') unexpectedly is %x\n",
dcbeec20 1968 name, cBOOL(isU8_DIGIT_LC(i))));
8b7358b9 1969 }
dcbeec20 1970 if (UNLIKELY(cBOOL(isU8_GRAPH_LC(i)) != cBOOL(isGRAPH_A(i)))) {
8b7358b9
KW
1971 is_bad = TRUE;
1972 DEBUG_L(PerlIO_printf(Perl_debug_log,
58236863 1973 "isgraph('%s') unexpectedly is %x\n",
dcbeec20 1974 name, cBOOL(isU8_GRAPH_LC(i))));
8b7358b9 1975 }
dcbeec20 1976 if (UNLIKELY(cBOOL(isU8_LOWER_LC(i)) != cBOOL(isLOWER_A(i)))) {
8b7358b9
KW
1977 is_bad = TRUE;
1978 DEBUG_L(PerlIO_printf(Perl_debug_log,
58236863 1979 "islower('%s') unexpectedly is %x\n",
dcbeec20 1980 name, cBOOL(isU8_LOWER_LC(i))));
8b7358b9 1981 }
dcbeec20 1982 if (UNLIKELY(cBOOL(isU8_PRINT_LC(i)) != cBOOL(isPRINT_A(i)))) {
8b7358b9
KW
1983 is_bad = TRUE;
1984 DEBUG_L(PerlIO_printf(Perl_debug_log,
58236863 1985 "isprint('%s') unexpectedly is %x\n",
dcbeec20 1986 name, cBOOL(isU8_PRINT_LC(i))));
8b7358b9 1987 }
dcbeec20 1988 if (UNLIKELY(cBOOL(isU8_PUNCT_LC(i)) != cBOOL(isPUNCT_A(i)))) {
8b7358b9
KW
1989 is_bad = TRUE;
1990 DEBUG_L(PerlIO_printf(Perl_debug_log,
58236863 1991 "ispunct('%s') unexpectedly is %x\n",
dcbeec20 1992 name, cBOOL(isU8_PUNCT_LC(i))));
8b7358b9 1993 }
dcbeec20 1994 if (UNLIKELY(cBOOL(isU8_SPACE_LC(i)) != cBOOL(isSPACE_A(i)))) {
8b7358b9
KW
1995 is_bad = TRUE;
1996 DEBUG_L(PerlIO_printf(Perl_debug_log,
58236863 1997 "isspace('%s') unexpectedly is %x\n",
dcbeec20 1998 name, cBOOL(isU8_SPACE_LC(i))));
8b7358b9 1999 }
dcbeec20 2000 if (UNLIKELY(cBOOL(isU8_UPPER_LC(i)) != cBOOL(isUPPER_A(i)))) {
8b7358b9
KW
2001 is_bad = TRUE;
2002 DEBUG_L(PerlIO_printf(Perl_debug_log,
58236863 2003 "isupper('%s') unexpectedly is %x\n",
dcbeec20 2004 name, cBOOL(isU8_UPPER_LC(i))));
8b7358b9 2005 }
dcbeec20 2006 if (UNLIKELY(cBOOL(isU8_XDIGIT_LC(i))!= cBOOL(isXDIGIT_A(i)))) {
8b7358b9
KW
2007 is_bad = TRUE;
2008 DEBUG_L(PerlIO_printf(Perl_debug_log,
58236863 2009 "isxdigit('%s') unexpectedly is %x\n",
dcbeec20 2010 name, cBOOL(isU8_XDIGIT_LC(i))));
8b7358b9 2011 }
dcbeec20 2012 if (UNLIKELY(toU8_LOWER_LC(i) != (int) toLOWER_A(i))) {
8b7358b9
KW
2013 is_bad = TRUE;
2014 DEBUG_L(PerlIO_printf(Perl_debug_log,
2015 "tolower('%s')=0x%x instead of the expected 0x%x\n",
dcbeec20 2016 name, toU8_LOWER_LC(i), (int) toLOWER_A(i)));
8b7358b9 2017 }
dcbeec20 2018 if (UNLIKELY(toU8_UPPER_LC(i) != (int) toUPPER_A(i))) {
8b7358b9
KW
2019 is_bad = TRUE;
2020 DEBUG_L(PerlIO_printf(Perl_debug_log,
2021 "toupper('%s')=0x%x instead of the expected 0x%x\n",
dcbeec20 2022 name, toU8_UPPER_LC(i), (int) toUPPER_A(i)));
8b7358b9
KW
2023 }
2024 if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i)))) {
2025 is_bad = TRUE;
2026 DEBUG_L(PerlIO_printf(Perl_debug_log,
2027 "'\\n' (=%02X) is not a control\n", (int) i));
2028 }
2029
2030 /* Add to the list; Separate multiple entries with a blank */
2031 if (is_bad) {
2032 if (bad_count) {
2033 my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list));
8c6180a9 2034 }
8b7358b9
KW
2035 my_strlcat(bad_chars_list, name, sizeof(bad_chars_list));
2036 bad_count++;
8c6180a9
KW
2037 }
2038 }
2039 }
2040
30d8090d
KW
2041 if (bad_count == 2 && maybe_utf8_turkic) {
2042 bad_count = 0;
2043 *bad_chars_list = '\0';
2044 PL_fold_locale['I'] = 'I';
2045 PL_fold_locale['i'] = 'i';
2046 PL_in_utf8_turkic_locale = TRUE;
b26541a9 2047 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s is turkic\n", newctype));
30d8090d
KW
2048 }
2049 else {
b8df1494 2050 PL_in_utf8_turkic_locale = FALSE;
30d8090d 2051 }
b8df1494 2052
7d4bcc4a
KW
2053# ifdef MB_CUR_MAX
2054
8c6180a9 2055 /* We only handle single-byte locales (outside of UTF-8 ones; so if
d35fca5f 2056 * this locale requires more than one byte, there are going to be
8c6180a9 2057 * problems. */
9c8a6dc2 2058 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b26541a9
KW
2059 "check_for_problems=%d, MB_CUR_MAX=%d\n",
2060 check_for_problems, (int) MB_CUR_MAX));
9c8a6dc2 2061
8b7358b9
KW
2062 if ( check_for_problems && MB_CUR_MAX > 1
2063 && ! PL_in_utf8_CTYPE_locale
ba1a4362
KW
2064
2065 /* Some platforms return MB_CUR_MAX > 1 for even the "C"
2066 * locale. Just assume that the implementation for them (plus
2067 * for POSIX) is correct and the > 1 value is spurious. (Since
2068 * these are specially handled to never be considered UTF-8
2069 * locales, as long as this is the only problem, everything
2070 * should work fine */
2071 && strNE(newctype, "C") && strNE(newctype, "POSIX"))
2072 {
8c6180a9
KW
2073 multi_byte_locale = TRUE;
2074 }
7d4bcc4a
KW
2075
2076# endif
8c6180a9 2077
30d8090d
KW
2078 /* If we found problems and we want them output, do so */
2079 if ( (UNLIKELY(bad_count) || UNLIKELY(multi_byte_locale))
2080 && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
2081 {
8b7358b9
KW
2082 if (UNLIKELY(bad_count) && PL_in_utf8_CTYPE_locale) {
2083 PL_warn_locale = Perl_newSVpvf(aTHX_
2084 "Locale '%s' contains (at least) the following characters"
578a6a87
KW
2085 " which have\nunexpected meanings: %s\nThe Perl program"
2086 " will use the expected meanings",
8b7358b9 2087 newctype, bad_chars_list);
8b7358b9
KW
2088 }
2089 else {
2090 PL_warn_locale = Perl_newSVpvf(aTHX_
8c6180a9 2091 "Locale '%s' may not work well.%s%s%s\n",
780fcc9f 2092 newctype,
8c6180a9
KW
2093 (multi_byte_locale)
2094 ? " Some characters in it are not recognized by"
2095 " Perl."
2096 : "",
2097 (bad_count)
2098 ? "\nThe following characters (and maybe others)"
2099 " may not have the same meaning as the Perl"
2100 " program expects:\n"
2101 : "",
2102 (bad_count)
2103 ? bad_chars_list
2104 : ""
2105 );
8b7358b9
KW
2106 }
2107
3871c541 2108# ifdef HAS_SOME_LANGINFO
b119c2be 2109
07492de5 2110 const char * scratch_buffer = NULL;
b119c2be 2111 Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s",
fc46dd0d
KW
2112 my_langinfo_c(CODESET, LC_CTYPE,
2113 newctype,
fffcb78f
KW
2114 &scratch_buffer, NULL,
2115 NULL));
07492de5 2116 Safefree(scratch_buffer);
b119c2be
KW
2117
2118# endif
2119
8b7358b9
KW
2120 Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n");
2121
cc9eaeb0 2122 /* If we are actually in the scope of the locale or are debugging,
bddebb56
KW
2123 * output the message now. If not in that scope, we save the
2124 * message to be output at the first operation using this locale,
2125 * if that actually happens. Most programs don't use locales, so
2126 * they are immune to bad ones. */
cc9eaeb0 2127 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
780fcc9f 2128
780fcc9f 2129 /* The '0' below suppresses a bogus gcc compiler warning */
fe1c1494
KW
2130 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale),
2131 0);
bddebb56 2132
bddebb56
KW
2133 if (IN_LC(LC_CTYPE)) {
2134 SvREFCNT_dec_NN(PL_warn_locale);
2135 PL_warn_locale = NULL;
2136 }
780fcc9f 2137 }
baa60164 2138 }
31f05a37 2139 }
98994639 2140
fe1c1494 2141# endif /* USE_LOCALE_CTYPE */
7d4bcc4a 2142
98994639
HS
2143}
2144
98994639 2145void
2726666d
KW
2146Perl__warn_problematic_locale()
2147{
2726666d 2148
fe1c1494 2149# ifdef USE_LOCALE_CTYPE
2726666d 2150
5f04a188
KW
2151 dTHX;
2152
2153 /* Internal-to-core function that outputs the message in PL_warn_locale,
2154 * and then NULLS it. Should be called only through the macro
1629a27e 2155 * CHECK_AND_WARN_PROBLEMATIC_LOCALE_ */
5f04a188 2156
2726666d 2157 if (PL_warn_locale) {
2726666d
KW
2158 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2159 SvPVX(PL_warn_locale),
2160 0 /* dummy to avoid compiler warning */ );
2726666d
KW
2161 SvREFCNT_dec_NN(PL_warn_locale);
2162 PL_warn_locale = NULL;
2163 }
2164
fe1c1494 2165# endif
2726666d
KW
2166
2167}
2168
a4f00dcc 2169STATIC void
c7dc1ea8
KW
2170S_new_LC_ALL(pTHX_ const char *unused)
2171{
2172 unsigned int i;
2173
2174 /* LC_ALL updates all the things we care about. */
2175
2176 PERL_UNUSED_ARG(unused);
2177
2178 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2179 if (update_functions[i]) {
22392ba2 2180 const char * this_locale = querylocale_i(i);
1f9a6108 2181 update_functions[i](aTHX_ this_locale);
c7dc1ea8
KW
2182 }
2183 }
2184}
2185
2186STATIC void
a4f00dcc 2187S_new_collate(pTHX_ const char *newcoll)
98994639 2188{
7d4bcc4a 2189
fe1c1494 2190# ifndef USE_LOCALE_COLLATE
7d4bcc4a
KW
2191
2192 PERL_UNUSED_ARG(newcoll);
2193 PERL_UNUSED_CONTEXT;
2194
fe1c1494 2195# else
0d071d52 2196
291a84fb 2197 /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
0d071d52
KW
2198 * core Perl this and that 'newcoll' is the name of the new locale.
2199 *
d35fca5f
KW
2200 * The design of locale collation is that every locale change is given an
2201 * index 'PL_collation_ix'. The first time a string particpates in an
2202 * operation that requires collation while locale collation is active, it
2203 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
2204 * magic includes the collation index, and the transformation of the string
2205 * by strxfrm(), q.v. That transformation is used when doing comparisons,
2206 * instead of the string itself. If a string changes, the magic is
2207 * cleared. The next time the locale changes, the index is incremented,
2208 * and so we know during a comparison that the transformation is not
2209 * necessarily still valid, and so is recomputed. Note that if the locale
2210 * changes enough times, the index could wrap (a U32), and it is possible
2211 * that a transformation would improperly be considered valid, leading to
2212 * an unlikely bug */
0d071d52 2213
98994639 2214 if (! newcoll) {
7b9f2e4d
KW
2215 ++PL_collation_ix;
2216 Safefree(PL_collation_name);
2217 PL_collation_name = NULL;
1604cfb0 2218 PL_collation_standard = TRUE;
00bf60ca 2219 is_standard_collation:
1604cfb0
MS
2220 PL_collxfrm_base = 0;
2221 PL_collxfrm_mult = 2;
165a1c52 2222 PL_in_utf8_COLLATE_locale = FALSE;
f28f4d2a 2223 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 2224 PL_strxfrm_max_cp = 0;
1604cfb0 2225 return;
98994639
HS
2226 }
2227
d35fca5f 2228 /* If this is not the same locale as currently, set the new one up */
7b9f2e4d 2229 if (strNE(PL_collation_name, newcoll)) {
1604cfb0
MS
2230 ++PL_collation_ix;
2231 Safefree(PL_collation_name);
32899cfb 2232 PL_collation_name = savepv(newcoll);
1604cfb0 2233 PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
00bf60ca
KW
2234 if (PL_collation_standard) {
2235 goto is_standard_collation;
2236 }
98994639 2237
a0eeb339 2238 PL_in_utf8_COLLATE_locale = is_locale_utf8(newcoll);
f28f4d2a 2239 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 2240 PL_strxfrm_max_cp = 0;
165a1c52 2241
59c018b9
KW
2242 /* A locale collation definition includes primary, secondary, tertiary,
2243 * etc. weights for each character. To sort, the primary weights are
2244 * used, and only if they compare equal, then the secondary weights are
2245 * used, and only if they compare equal, then the tertiary, etc.
2246 *
2247 * strxfrm() works by taking the input string, say ABC, and creating an
2248 * output transformed string consisting of first the primary weights,
2249 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
2250 * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
2251 * may not have weights at every level. In our example, let's say B
2252 * doesn't have a tertiary weight, and A doesn't have a secondary
2253 * weight. The constructed string is then going to be
2254 * A¹B¹C¹ B²C² A³C³ ....
2255 * This has the desired effect that strcmp() will look at the secondary
2256 * or tertiary weights only if the strings compare equal at all higher
2257 * priority weights. The spaces shown here, like in
c342d20e 2258 * "A¹B¹C¹ A²B²C² "
59c018b9
KW
2259 * are not just for readability. In the general case, these must
2260 * actually be bytes, which we will call here 'separator weights'; and
2261 * they must be smaller than any other weight value, but since these
2262 * are C strings, only the terminating one can be a NUL (some
2263 * implementations may include a non-NUL separator weight just before
2264 * the NUL). Implementations tend to reserve 01 for the separator
2265 * weights. They are needed so that a shorter string's secondary
2266 * weights won't be misconstrued as primary weights of a longer string,
2267 * etc. By making them smaller than any other weight, the shorter
2268 * string will sort first. (Actually, if all secondary weights are
2269 * smaller than all primary ones, there is no need for a separator
2270 * weight between those two levels, etc.)
2271 *
2272 * The length of the transformed string is roughly a linear function of
2273 * the input string. It's not exactly linear because some characters
2274 * don't have weights at all levels. When we call strxfrm() we have to
2275 * allocate some memory to hold the transformed string. The
2276 * calculations below try to find coefficients 'm' and 'b' for this
2277 * locale so that m*x + b equals how much space we need, given the size
2278 * of the input string in 'x'. If we calculate too small, we increase
2279 * the size as needed, and call strxfrm() again, but it is better to
2280 * get it right the first time to avoid wasted expensive string
2281 * transformations. */
2282
1604cfb0 2283 {
79f120c8
KW
2284 /* We use the string below to find how long the tranformation of it
2285 * is. Almost all locales are supersets of ASCII, or at least the
2286 * ASCII letters. We use all of them, half upper half lower,
2287 * because if we used fewer, we might hit just the ones that are
2288 * outliers in a particular locale. Most of the strings being
2289 * collated will contain a preponderance of letters, and even if
2290 * they are above-ASCII, they are likely to have the same number of
2291 * weight levels as the ASCII ones. It turns out that digits tend
2292 * to have fewer levels, and some punctuation has more, but those
2293 * are relatively sparse in text, and khw believes this gives a
2294 * reasonable result, but it could be changed if experience so
2295 * dictates. */
2296 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
2297 char * x_longer; /* Transformed 'longer' */
2298 Size_t x_len_longer; /* Length of 'x_longer' */
2299
2300 char * x_shorter; /* We also transform a substring of 'longer' */
2301 Size_t x_len_shorter;
2302
a4a439fb 2303 /* _mem_collxfrm() is used get the transformation (though here we
79f120c8
KW
2304 * are interested only in its length). It is used because it has
2305 * the intelligence to handle all cases, but to work, it needs some
2306 * values of 'm' and 'b' to get it started. For the purposes of
2307 * this calculation we use a very conservative estimate of 'm' and
2308 * 'b'. This assumes a weight can be multiple bytes, enough to
2309 * hold any UV on the platform, and there are 5 levels, 4 weight
2310 * bytes, and a trailing NUL. */
2311 PL_collxfrm_base = 5;
2312 PL_collxfrm_mult = 5 * sizeof(UV);
2313
2314 /* Find out how long the transformation really is */
a4a439fb
KW
2315 x_longer = _mem_collxfrm(longer,
2316 sizeof(longer) - 1,
2317 &x_len_longer,
2318
2319 /* We avoid converting to UTF-8 in the
2320 * called function by telling it the
2321 * string is in UTF-8 if the locale is a
2322 * UTF-8 one. Since the string passed
2323 * here is invariant under UTF-8, we can
2324 * claim it's UTF-8 even though it isn't.
2325 * */
2326 PL_in_utf8_COLLATE_locale);
79f120c8
KW
2327 Safefree(x_longer);
2328
2329 /* Find out how long the transformation of a substring of 'longer'
2330 * is. Together the lengths of these transformations are
2331 * sufficient to calculate 'm' and 'b'. The substring is all of
2332 * 'longer' except the first character. This minimizes the chances
2333 * of being swayed by outliers */
a4a439fb 2334 x_shorter = _mem_collxfrm(longer + 1,
79f120c8 2335 sizeof(longer) - 2,
a4a439fb
KW
2336 &x_len_shorter,
2337 PL_in_utf8_COLLATE_locale);
79f120c8
KW
2338 Safefree(x_shorter);
2339
2340 /* If the results are nonsensical for this simple test, the whole
2341 * locale definition is suspect. Mark it so that locale collation
2342 * is not active at all for it. XXX Should we warn? */
2343 if ( x_len_shorter == 0
2344 || x_len_longer == 0
2345 || x_len_shorter >= x_len_longer)
2346 {
2347 PL_collxfrm_mult = 0;
2348 PL_collxfrm_base = 0;
2349 }
2350 else {
2351 SSize_t base; /* Temporary */
2352
2353 /* We have both: m * strlen(longer) + b = x_len_longer
2354 * m * strlen(shorter) + b = x_len_shorter;
2355 * subtracting yields:
2356 * m * (strlen(longer) - strlen(shorter))
2357 * = x_len_longer - x_len_shorter
2358 * But we have set things up so that 'shorter' is 1 byte smaller
2359 * than 'longer'. Hence:
2360 * m = x_len_longer - x_len_shorter
2361 *
2362 * But if something went wrong, make sure the multiplier is at
2363 * least 1.
2364 */
2365 if (x_len_longer > x_len_shorter) {
2366 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
2367 }
2368 else {
2369 PL_collxfrm_mult = 1;
2370 }
2371
2372 /* mx + b = len
2373 * so: b = len - mx
2374 * but in case something has gone wrong, make sure it is
2375 * non-negative */
2376 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
2377 if (base < 0) {
2378 base = 0;
2379 }
2380
2381 /* Add 1 for the trailing NUL */
2382 PL_collxfrm_base = base + 1;
2383 }
58eebef2 2384
94d521c0 2385 DEBUG_L(PerlIO_printf(Perl_debug_log,
b26541a9 2386 "?UTF-8 locale=%d; x_len_shorter=%zu, "
b07929e4
KW
2387 "x_len_longer=%zu,"
2388 " collate multipler=%zu, collate base=%zu\n",
58eebef2
KW
2389 PL_in_utf8_COLLATE_locale,
2390 x_len_shorter, x_len_longer,
94d521c0 2391 PL_collxfrm_mult, PL_collxfrm_base));
1604cfb0 2392 }
98994639
HS
2393 }
2394
fe1c1494 2395# endif /* USE_LOCALE_COLLATE */
7d4bcc4a 2396
98994639
HS
2397}
2398
fe1c1494 2399#endif /* USE_LOCALE */
d36adde0 2400
d2b24094 2401#ifdef WIN32
b8cc575c 2402
27edef3b
KW
2403wchar_t *
2404Perl_Win_utf8_string_to_wstring(const char * utf8_string)
2405{
2406 wchar_t *wstring;
2407
2408 int req_size = MultiByteToWideChar(CP_UTF8, 0, utf8_string, -1, NULL, 0);
2409 if (! req_size) {
2410 errno = EINVAL;
2411 return NULL;
2412 }
2413
2414 Newx(wstring, req_size, wchar_t);
2415
2416 if (! MultiByteToWideChar(CP_UTF8, 0, utf8_string, -1, wstring, req_size))
2417 {
2418 Safefree(wstring);
2419 errno = EINVAL;
2420 return NULL;
2421 }
2422
2423 return wstring;
2424}
2425
2426char *
2427Perl_Win_wstring_to_utf8_string(const wchar_t * wstring)
2428{
2429 char *utf8_string;
2430
2431 int req_size =
2432 WideCharToMultiByte(CP_UTF8, 0, wstring, -1, NULL, 0, NULL, NULL);
2433
2434 Newx(utf8_string, req_size, char);
2435
2436 if (! WideCharToMultiByte(CP_UTF8, 0, wstring, -1, utf8_string,
2437 req_size, NULL, NULL))
2438 {
2439 Safefree(utf8_string);
2440 errno = EINVAL;
2441 return NULL;
2442 }
2443
2444 return utf8_string;
2445}
2446
6c332036
TC
2447#define USE_WSETLOCALE
2448
2449#ifdef USE_WSETLOCALE
2450
2451STATIC char *
2452S_wrap_wsetlocale(pTHX_ int category, const char *locale) {
27edef3b 2453 wchar_t *wlocale = NULL;
6c332036
TC
2454 wchar_t *wresult;
2455 char *result;
2456
2457 if (locale) {
27edef3b
KW
2458 wlocale = Win_utf8_string_to_wstring(locale);
2459 if (! wlocale) {
6c332036
TC
2460 return NULL;
2461 }
2462 }
2463 else {
2464 wlocale = NULL;
2465 }
27edef3b 2466
6c332036
TC
2467 wresult = _wsetlocale(category, wlocale);
2468 Safefree(wlocale);
27edef3b
KW
2469
2470 if (! wresult) {
6c332036
TC
2471 return NULL;
2472 }
27edef3b
KW
2473
2474 result = Win_wstring_to_utf8_string(wresult);
2475 SAVEFREEPV(result); /* is there something better we can do here? */
6c332036
TC
2476
2477 return result;
2478}
2479
2480#endif
2481
a4f00dcc 2482STATIC char *
b8cc575c 2483S_win32_setlocale(pTHX_ int category, const char* locale)
b385bb4d
KW
2484{
2485 /* This, for Windows, emulates POSIX setlocale() behavior. There is no
7d4bcc4a
KW
2486 * difference between the two unless the input locale is "", which normally
2487 * means on Windows to get the machine default, which is set via the
2488 * computer's "Regional and Language Options" (or its current equivalent).
2489 * In POSIX, it instead means to find the locale from the user's
2490 * environment. This routine changes the Windows behavior to first look in
2491 * the environment, and, if anything is found, use that instead of going to
2492 * the machine default. If there is no environment override, the machine
2493 * default is used, by calling the real setlocale() with "".
2494 *
2495 * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
2496 * use the particular category's variable if set; otherwise to use the LANG
2497 * variable. */
b385bb4d 2498
175c4cf9 2499 bool override_LC_ALL = FALSE;
89f7b9aa 2500 char * result;
e5f10d49 2501 unsigned int i;
89f7b9aa 2502
b385bb4d 2503 if (locale && strEQ(locale, "")) {
7d4bcc4a
KW
2504
2505# ifdef LC_ALL
2506
b385bb4d
KW
2507 locale = PerlEnv_getenv("LC_ALL");
2508 if (! locale) {
e5f10d49
KW
2509 if (category == LC_ALL) {
2510 override_LC_ALL = TRUE;
2511 }
2512 else {
7d4bcc4a
KW
2513
2514# endif
7d4bcc4a 2515
e5f10d49
KW
2516 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2517 if (category == categories[i]) {
2518 locale = PerlEnv_getenv(category_names[i]);
2519 goto found_locale;
2520 }
2521 }
7d4bcc4a 2522
b385bb4d 2523 locale = PerlEnv_getenv("LANG");
481465ea 2524 if (! locale) {
b385bb4d
KW
2525 locale = "";
2526 }
e5f10d49
KW
2527
2528 found_locale: ;
7d4bcc4a
KW
2529
2530# ifdef LC_ALL
2531
e5f10d49 2532 }
b385bb4d 2533 }
7d4bcc4a
KW
2534
2535# endif
2536
b385bb4d
KW
2537 }
2538
6c332036
TC
2539#ifdef USE_WSETLOCALE
2540 result = S_wrap_wsetlocale(aTHX_ category, locale);
2541#else
89f7b9aa 2542 result = setlocale(category, locale);
6c332036 2543#endif
48015184 2544 DEBUG_L(STMT_START {
b26541a9 2545 PerlIO_printf(Perl_debug_log, "%s\n",
d188fe6d 2546 setlocale_debug_string_r(category, locale, result));
48015184 2547 } STMT_END);
89f7b9aa 2548
481465ea 2549 if (! override_LC_ALL) {
89f7b9aa
KW
2550 return result;
2551 }
2552
dfd77d7a 2553 /* Here the input category was LC_ALL, and we have set it to what is in the
481465ea
KW
2554 * LANG variable or the system default if there is no LANG. But these have
2555 * lower priority than the other LC_foo variables, so override it for each
2556 * one that is set. (If they are set to "", it means to use the same thing
2557 * we just set LC_ALL to, so can skip) */
7d4bcc4a 2558
4fd76d49 2559 for (i = 0; i < LC_ALL_INDEX_; i++) {
e5f10d49
KW
2560 result = PerlEnv_getenv(category_names[i]);
2561 if (result && strNE(result, "")) {
6c332036 2562#ifdef USE_WSETLOCALE
17a1e9ac 2563 S_wrap_wsetlocale(aTHX_ categories[i], result);
6c332036 2564#else
17a1e9ac 2565 setlocale(categories[i], result);
6c332036 2566#endif
b26541a9 2567 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s\n",
d188fe6d 2568 setlocale_debug_string_i(i, result, "not captured")));
e5f10d49 2569 }
89f7b9aa 2570 }
7d4bcc4a 2571
bbc98134 2572 result = setlocale(LC_ALL, NULL);
48015184 2573 DEBUG_L(STMT_START {
b26541a9 2574 PerlIO_printf(Perl_debug_log, "%s\n",
d188fe6d 2575 setlocale_debug_string_c(LC_ALL, NULL, result));
48015184 2576 } STMT_END);
89f7b9aa 2577
bbc98134 2578 return result;
b385bb4d
KW
2579}
2580
2581#endif
2582
9aac5db8 2583/*
9aac5db8
KW
2584=for apidoc Perl_setlocale
2585
2586This is an (almost) drop-in replacement for the system L<C<setlocale(3)>>,
2587taking the same parameters, and returning the same information, except that it
9487427b
KW
2588returns the correct underlying C<LC_NUMERIC> locale. Regular C<setlocale> will
2589instead return C<C> if the underlying locale has a non-dot decimal point
2590character, or a non-empty thousands separator for displaying floating point
2591numbers. This is because perl keeps that locale category such that it has a
2592dot and empty separator, changing the locale briefly during the operations
2593where the underlying one is required. C<Perl_setlocale> knows about this, and
2594compensates; regular C<setlocale> doesn't.
9aac5db8 2595
e9bc6d6b 2596Another reason it isn't completely a drop-in replacement is that it is
9aac5db8 2597declared to return S<C<const char *>>, whereas the system setlocale omits the
163deff3
KW
2598C<const> (presumably because its API was specified long ago, and can't be
2599updated; it is illegal to change the information C<setlocale> returns; doing
2600so leads to segfaults.)
9aac5db8 2601
e9bc6d6b
KW
2602Finally, C<Perl_setlocale> works under all circumstances, whereas plain
2603C<setlocale> can be completely ineffective on some platforms under some
2604configurations.
2605
9aac5db8 2606C<Perl_setlocale> should not be used to change the locale except on systems
e9bc6d6b
KW
2607where the predefined variable C<${^SAFE_LOCALES}> is 1. On some such systems,
2608the system C<setlocale()> is ineffective, returning the wrong information, and
2609failing to actually change the locale. C<Perl_setlocale>, however works
2610properly in all circumstances.
9aac5db8
KW
2611
2612The return points to a per-thread static buffer, which is overwritten the next
2613time C<Perl_setlocale> is called from the same thread.
2614
2615=cut
2616
2617*/
2618
d87e68d7
KW
2619#ifndef USE_LOCALE_NUMERIC
2620# define affects_LC_NUMERIC(cat) 0
2621#elif defined(LC_ALL)
2622# define affects_LC_NUMERIC(cat) (cat == LC_NUMERIC || cat == LC_ALL)
2623#else
2624# define affects_LC_NUMERIC(cat) (cat == LC_NUMERIC)
2625#endif
2626
9aac5db8
KW
2627const char *
2628Perl_setlocale(const int category, const char * locale)
a4f00dcc
KW
2629{
2630 /* This wraps POSIX::setlocale() */
2631
52129632 2632#ifndef USE_LOCALE
d36adde0
KW
2633
2634 PERL_UNUSED_ARG(category);
2635 PERL_UNUSED_ARG(locale);
2636
2637 return "C";
2638
2639#else
2640
9aac5db8 2641 const char * retval;
a4f00dcc 2642 dTHX;
0bb37942 2643
1a63cba7
KW
2644 DEBUG_L(PerlIO_printf(Perl_debug_log,
2645 "Entering Perl_setlocale(%d, \"%s\")\n",
2646 category, locale));
2647
0bb37942
KW
2648 /* A NULL locale means only query what the current one is. */
2649 if (locale == NULL) {
2650
6a7aca81 2651# ifndef USE_LOCALE_NUMERIC
0bb37942 2652
6a7aca81 2653 /* Without LC_NUMERIC, it's trivial; we just return the value */
22392ba2 2654 return save_to_buffer(querylocale_r(category),
0b8fa716 2655 &PL_setlocale_buf, &PL_setlocale_bufsize);
6a7aca81 2656# else
a4f00dcc 2657
0bb37942
KW
2658 /* We have the LC_NUMERIC name saved, because we are normally switched
2659 * into the C locale (or equivalent) for it. */
a4f00dcc 2660 if (category == LC_NUMERIC) {
1a63cba7
KW
2661 DEBUG_L(PerlIO_printf(Perl_debug_log,
2662 "Perl_setlocale(LC_NUMERIC, NULL) returning stashed '%s'\n",
2663 PL_numeric_name));
9aac5db8
KW
2664
2665 /* We don't have to copy this return value, as it is a per-thread
2666 * variable, and won't change until a future setlocale */
2667 return PL_numeric_name;
a4f00dcc
KW
2668 }
2669
6a7aca81
KW
2670# ifndef LC_ALL
2671
2672 /* Without LC_ALL, just return the value */
22392ba2 2673 return save_to_buffer(querylocale_r(category),
0b8fa716 2674 &PL_setlocale_buf, &PL_setlocale_bufsize);
6a7aca81
KW
2675
2676# else
2677
2678 /* Here, LC_ALL is available on this platform. It's the one
2679 * complicating category (because it can contain a toggled LC_NUMERIC
2680 * value), for all the remaining ones (we took care of LC_NUMERIC
2681 * above), just return the value */
2682 if (category != LC_ALL) {
22392ba2 2683 return save_to_buffer(querylocale_r(category),
0b8fa716 2684 &PL_setlocale_buf, &PL_setlocale_bufsize);
6a7aca81
KW
2685 }
2686
2687 bool toggled = FALSE;
a4f00dcc 2688
0bb37942
KW
2689 /* For an LC_ALL query, switch back to the underlying numeric locale
2690 * (if we aren't there already) so as to get the correct results. Our
2691 * records for all the other categories are valid without switching */
6a7aca81 2692 if (! PL_numeric_underlying) {
996e6b9e
KW
2693 set_numeric_underlying();
2694 toggled = TRUE;
a4f00dcc
KW
2695 }
2696
6a7aca81 2697 retval = querylocale_c(LC_ALL);
996e6b9e
KW
2698
2699 if (toggled) {
996e6b9e 2700 set_numeric_standard();
6a7aca81 2701 }
0bb37942 2702
1a63cba7
KW
2703 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2704 setlocale_debug_string_r(category, locale, retval)));
22392ba2 2705
0b8fa716 2706 return save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
6a7aca81
KW
2707
2708# endif /* Has LC_ALL */
2709# endif /* Has LC_NUMERIC */
2710
0bb37942 2711 } /* End of querying the current locale */
a4f00dcc 2712
d87e68d7
KW
2713
2714 /* Here, the input has a locale to change to. First find the current
2715 * locale */
1a63cba7 2716 unsigned int cat_index = get_category_index(category, NULL);
d87e68d7
KW
2717 retval = querylocale_i(cat_index);
2718
2719 /* If the new locale is the same as the current one, nothing is actually
2720 * being changed, so do nothing. */
2721 if ( strEQ(retval, locale)
2722 && ( ! affects_LC_NUMERIC(category)
2723
2724# ifdef USE_LOCALE_NUMERIC
2725
2726 || strEQ(locale, PL_numeric_name)
2727
2728# endif
2729
2730 )) {
2731 DEBUG_L(PerlIO_printf(Perl_debug_log,
2732 "Already in requested locale: no action taken\n"));
2733 return save_to_buffer(setlocale_i(cat_index, locale),
0b8fa716 2734 &PL_setlocale_buf, &PL_setlocale_bufsize);
d87e68d7
KW
2735 }
2736
2737 /* Here, an actual change is being requested. Do it */
22392ba2 2738 retval = setlocale_i(cat_index, locale);
9aac5db8 2739 if (! retval) {
1a63cba7
KW
2740 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2741 setlocale_debug_string_i(cat_index, locale, "NULL")));
a4f00dcc
KW
2742 return NULL;
2743 }
2744
0b8fa716 2745 retval = save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
22392ba2 2746
fe1c1494
KW
2747 /* Now that have changed locales, we have to update our records to
2748 * correspond. Only certain categories have extra work to update. */
c7dc1ea8
KW
2749 if (update_functions[cat_index]) {
2750 update_functions[cat_index](aTHX_ retval);
a4f00dcc
KW
2751 }
2752
b26541a9 2753 DEBUG_L(PerlIO_printf(Perl_debug_log, "returning '%s'\n", retval));
1a63cba7 2754
a4f00dcc
KW
2755 return retval;
2756
d36adde0
KW
2757#endif
2758
f7416781
KW
2759}
2760
faef259e
KW
2761#ifdef USE_LOCALE
2762
1b5d1fe8 2763STATIC const char *
0b8fa716 2764S_save_to_buffer(const char * string, const char **buf, Size_t *buf_size)
f7416781 2765{
0b8fa716
KW
2766 /* Copy the NUL-terminated 'string' to a buffer whose address before this
2767 * call began at *buf, and whose available length before this call was
2768 * *buf_size.
2769 *
2770 * If the length of 'string' is greater than the space available, the
2771 * buffer is grown accordingly, which may mean that it gets relocated.
2772 * *buf and *buf_size will be updated to reflect this.
2773 *
2774 * Regardless, the function returns a pointer to where 'string' is now
2775 * stored.
2776 *
2777 * 'string' may be NULL, which means no action gets taken, and NULL is
2778 * returned.
2779 *
2780 * If *buf or 'buf_size' are NULL or *buf_size is 0, the buffer is assumed
2781 * empty, and memory is malloc'd. 'buf-size' being NULL is to be used
2782 * when this is a single use buffer, which will shortly be freed by the
2783 * caller.
2784 */
f7416781 2785
0b6697c5 2786 Size_t string_size;
f7416781
KW
2787
2788 PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
2789
0b6697c5
KW
2790 if (! string) {
2791 return NULL;
2792 }
2793
0b8fa716 2794 string_size = strlen(string) + 1;
0b6697c5 2795
b5644714
KW
2796 if (buf_size == NULL) {
2797 Newx(*buf, string_size, char);
2798 }
2799 else if (*buf_size == 0) {
f7416781
KW
2800 Newx(*buf, string_size, char);
2801 *buf_size = string_size;
2802 }
2803 else if (string_size > *buf_size) {
2804 Renew(*buf, string_size, char);
2805 *buf_size = string_size;
2806 }
2807
0b8fa716
KW
2808 {
2809 dTHX_DEBUGGING;
2810 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2811 "Copying '%s' to %p\n",
2812 ((is_utf8_string((U8 *) string, 0))
2813 ? string
2814 :_byte_dump_string((U8 *) string, strlen(string), 0)),
2815 *buf));
2816 }
2817
2818 Copy(string, *buf, string_size, char);
f7416781
KW
2819 return *buf;
2820}
2821
d5efbc49
KW
2822STATIC utf8ness_t
2823S_get_locale_string_utf8ness_i(pTHX_ const char * locale,
2824 const unsigned cat_index,
2825 const char * string,
2826 const locale_utf8ness_t known_utf8)
2827{
2828 /* Return to indicate if 'string' in the locale given by the input
2829 * arguments should be considered UTF-8 or not.
2830 *
2831 * If the input 'locale' is not NULL, use that for the locale; otherwise
2832 * use the current locale for the category specified by 'cat_index'.
2833 */
2834
2835 Size_t len;
2836 const U8 * first_variant = NULL;
2837
2838 PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I;
2839 assert(cat_index <= NOMINAL_LC_ALL_INDEX);
2840
2841 if (string == NULL) {
2842 return UTF8NESS_NO;
2843 }
2844
2845 if (IN_BYTES) { /* respect 'use bytes' */
2846 return UTF8NESS_NO;
2847 }
2848
2849 len = strlen(string);
2850
2851 /* UTF8ness is immaterial if the representation doesn't vary */
2852 if (is_utf8_invariant_string_loc((U8 *) string, len, &first_variant)) {
2853 return UTF8NESS_IMMATERIAL;
2854 }
2855
2856 /* Can't be UTF-8 if invalid */
2857 if (! is_utf8_string((U8 *) first_variant,
2858 len - ((char *) first_variant - string)))
2859 {
2860 return UTF8NESS_NO;
2861 }
2862
2863 /* Here and below, we know the string is legal UTF-8, containing at least
2864 * one character requiring a sequence of two or more bytes. It is quite
2865 * likely to be UTF-8. But it pays to be paranoid and do further checking.
2866 *
2867 * If we already know the UTF-8ness of the locale, then we immediately know
2868 * what the string is */
2869 if (UNLIKELY(known_utf8 != LOCALE_UTF8NESS_UNKNOWN)) {
2870 if (known_utf8 == LOCALE_IS_UTF8) {
2871 return UTF8NESS_YES;
2872 }
2873 else {
2874 return UTF8NESS_NO;
2875 }
2876 }
2877
2878# if defined(HAS_SOME_LANGINFO) || defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
2879
2880 /* Here, we have available the libc functions that can be used to
2881 * accurately determine the UTF8ness of the underlying locale. If it is a
2882 * UTF-8 locale, the string is UTF-8; otherwise it was coincidental that
2883 * the string is legal UTF-8
2884 *
2885 * However, if the perl is compiled to not pay attention to the category
2886 * being passed in, you might think that that locale is essentially always
2887 * the C locale, so it would make sense to say it isn't UTF-8. But to get
2888 * here, the string has to contain characters unknown in the C locale. And
2889 * in fact, Windows boxes are compiled without LC_MESSAGES, as their
2890 * message catalog isn't really a part of the locale system. But those
2891 * messages really could be UTF-8, and given that the odds are rather small
2892 * of something not being UTF-8 but being syntactically valid UTF-8, khw
2893 * has decided to call such strings as UTF-8. */
2894
2895 if (locale == NULL) {
2896 locale = querylocale_i(cat_index);
2897 }
2898 if (is_locale_utf8(locale)) {
2899 return UTF8NESS_YES;
2900 }
2901
2902 return UTF8NESS_NO;
2903
2904# else
2905
2906 /* Here, we have a valid UTF-8 string containing non-ASCII characters, and
2907 * don't have access to functions to check if the locale is UTF-8 or not.
2908 * Assume that it is. khw tried adding a check that the string is entirely
2909 * in a single Unicode script, but discovered the strftime() timezone is
2910 * user-settable through the environment, which may be in a different
2911 * script than the locale-expected value. */
2912 PERL_UNUSED_ARG(locale);
2913 PERL_UNUSED_ARG(cat_index);
2914
2915 return UTF8NESS_YES;
2916
faef259e
KW
2917#endif
2918
d5efbc49
KW
2919}
2920
2921#endif /* USE_LOCALE */
2922
d5f728f7
KW
2923int
2924Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len)
2925{
2926
2927#if ! defined(HAS_MBRTOWC) && ! defined(HAS_MBTOWC)
2928
2929 PERL_UNUSED_ARG(pwc);
2930 PERL_UNUSED_ARG(s);
2931 PERL_UNUSED_ARG(len);
2932 return -1;
2933
2934#else /* Below we have some form of mbtowc() */
2935# if defined(HAS_MBRTOWC) \
2936 && (defined(USE_LOCALE_THREADS) || ! defined(HAS_MBTOWC))
2937# define USE_MBRTOWC
2938# else
2939# undef USE_MBRTOWC
2940# endif
2941
2942 int retval = -1;
2943
2944 if (s == NULL) { /* Initialize the shift state to all zeros in
2945 PL_mbrtowc_ps. */
2946
2947# if defined(USE_MBRTOWC)
2948
2949 memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
2950 return 0;
2951
2952# else
2953
2954 MBTOWC_LOCK;
2955 SETERRNO(0, 0);
2956 retval = mbtowc(NULL, NULL, 0);
2957 MBTOWC_UNLOCK;
2958 return retval;
2959
2960# endif
2961
2962 }
2963
2964# if defined(USE_MBRTOWC)
2965
2966 SETERRNO(0, 0);
2967 retval = (SSize_t) mbrtowc((wchar_t *) pwc, s, len, &PL_mbrtowc_ps);
2968
2969# else
2970
2971 /* Locking prevents races, but locales can be switched out without locking,
2972 * so this isn't a cure all */
2973 MBTOWC_LOCK;
2974 SETERRNO(0, 0);
2975 retval = mbtowc((wchar_t *) pwc, s, len);
2976 MBTOWC_UNLOCK;
2977
2978# endif
2979
2980 return retval;
2981
2982#endif
2983
2984}
2985
f7416781 2986/*
b7f19487
KW
2987=for apidoc Perl_localeconv
2988
2989This is a thread-safe version of the libc L<localeconv(3)>. It is the same as
2990L<POSIX::localeconv|POSIX/localeconv> (returning a hash of the C<localeconv()>
2991fields), but directly callable from XS code.
2992
2993=cut
2994*/
2995
2996HV *
2997Perl_localeconv(pTHX)
2998{
2999
3000#if ! defined(HAS_SOME_LOCALECONV) \
3001 || (! defined(USE_LOCALE_MONETARY) && ! defined(USE_LOCALE_NUMERIC))
3002
3003 return newHV();
3004
3005#else
3006
0630bfb4 3007 return my_localeconv(0, LOCALE_UTF8NESS_UNKNOWN);
b7f19487
KW
3008
3009#endif
3010
3011}
3012
3013#if defined(HAS_SOME_LOCALECONV) \
3014 && (defined(USE_LOCALE_MONETARY) || defined(USE_LOCALE_NUMERIC))
3015
3016HV *
0630bfb4 3017S_my_localeconv(pTHX_ const int item, const locale_utf8ness_t locale_is_utf8)
b7f19487
KW
3018{
3019 HV * retval;
3020 locale_utf8ness_t numeric_locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
3021 locale_utf8ness_t monetary_locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
3022 HV * (*copy_localeconv)(pTHX_ const struct lconv *,
0630bfb4
KW
3023 const int,
3024 const locale_utf8ness_t,
3025 const locale_utf8ness_t);
b7f19487
KW
3026
3027 /* A thread-safe locale_conv(). The locking mechanisms vary greatly
3028 * depending on platform capabilities. They all share this common set up
3029 * code for the function, and then conditional compilations choose one of
3030 * several terminations.
3031 *
0630bfb4
KW
3032 * There are two use cases:
3033 * 1) Called from POSIX::locale_conv(). This returns lconv() copied to
b7f19487 3034 * a hash, based on the current underlying locale.
0630bfb4
KW
3035 * 2) Certain items that nl_langinfo() provides are also derivable from
3036 * the return of localeconv(). Windows notably doesn't have
3037 * nl_langinfo(), so on that, and actually any platform lacking it,
3038 * my_localeconv() is used to emulate it for those particular items.
3039 * The code to do this is compiled only on such platforms. Rather than
3040 * going to the expense of creating a full hash when only one item is
3041 * needed, just the desired item is returned, in an SV cast to an HV.
b7f19487 3042 *
0630bfb4
KW
3043 * There is a helper function to accomplish each of the two tasks. The
3044 * function pointer just below is set to the appropriate one, and is called
b7f19487 3045 * from each of the various implementations, in the middle of whatever
0630bfb4
KW
3046 * necessary locking/locale swapping have been done. */
3047
3048# ifdef HAS_SOME_LANGINFO
3049
3050 PERL_UNUSED_ARG(item);
3051 PERL_UNUSED_ARG(locale_is_utf8);
3052
3053# ifdef USE_LOCALE_NUMERIC
3054
3055 /* When there is a nl_langinfo, we will only be called for localeconv
3056 * numeric purposes. */
3057 const bool is_localeconv_call = true;
3058
3059# endif
3060
3061# else
3062
3063 /* Note we use this sentinel; this works because this only gets compiled
3064 * when our perl_langinfo.h is used, and that uses negative numbers for all
3065 * the items */
3066 const bool is_localeconv_call = (item == 0);
3067 if (is_localeconv_call)
3068
3069# endif
b7f19487
KW
3070
3071 {
3072 copy_localeconv = S_populate_localeconv;
3073
3074# ifdef USE_LOCALE_NUMERIC
3075
3076 /* Get the UTF8ness of the locales now to avoid repeating this for each
3077 * string returned by localeconv() */
3078 numeric_locale_is_utf8 = (is_locale_utf8(PL_numeric_name))
3079 ? LOCALE_IS_UTF8
3080 : LOCALE_NOT_UTF8;
3081
3082# endif
3083# ifdef USE_LOCALE_MONETARY
3084
3085 monetary_locale_is_utf8 = (is_locale_utf8(querylocale_c(LC_MONETARY)))
3086 ? LOCALE_IS_UTF8
3087 : LOCALE_NOT_UTF8;
3088
3089# endif
3090
3091 }
3092
0630bfb4
KW
3093# ifndef HAS_SOME_LANGINFO
3094
3095 else {
3096 copy_localeconv = S_get_nl_item_from_localeconv;
3097 numeric_locale_is_utf8 = locale_is_utf8;
3098 }
3099
3100# endif
3101
b7f19487
KW
3102 PERL_ARGS_ASSERT_MY_LOCALECONV;
3103/*--------------------------------------------------------------------------*/
3104/* Here, we are done with the common beginning of all the implementations of
3105 * my_localeconv(). Below are the various terminations of the function (except
3106 * the closing '}'. They are separated out because the preprocessor directives
3107 * were making the simple logic hard to follow. Each implementation ends with
3108 * the same few lines. khw decided to keep those separate because he thought
3109 * it was clearer to the reader.
3110 *
3111 * The first distinct termination (of the above common code) are the
3112 * implementations when we have locale_conv_l() and can use it. These are the
3113 * simplest cases, without any locking needed. */
3114# if defined(USE_POSIX_2008_LOCALE) && defined(HAS_LOCALECONV_L)
3115
3116 /* And there are two sub-cases: First (by far the most common) is where we
3117 * are compiled to pay attention to LC_NUMERIC */
3118# ifdef USE_LOCALE_NUMERIC
3119
3120 const locale_t cur = use_curlocale_scratch();
3121 locale_t with_numeric = duplocale(cur);
3122
3123 /* Just create a new locale object with what we've got, but using the
3124 * underlying LC_NUMERIC locale */
3125 with_numeric = newlocale(LC_NUMERIC_MASK, PL_numeric_name, with_numeric);
3126
3127 retval = copy_localeconv(aTHX_ localeconv_l(with_numeric),
0630bfb4 3128 item,
b7f19487
KW
3129 numeric_locale_is_utf8,
3130 monetary_locale_is_utf8);
3131 freelocale(with_numeric);
3132
3133 return retval;
3134
3135/*--------------------------------------------------------------------------*/
3136# else /* Below not paying attention to LC_NUMERIC */
3137
3138 const locale_t cur = use_curlocale_scratch();
3139
3140 retval = copy_localeconv(aTHX_ localeconv_l(cur),
0630bfb4 3141 item,
b7f19487
KW
3142 numeric_locale_is_utf8,
3143 monetary_locale_is_utf8);
3144 return retval;
3145
3146# endif /* Above, using lconv_l(); below plain lconv() */
3147/*--------------------------------------------------------------------------*/
3148# elif ! defined(TS_W32_BROKEN_LOCALECONV) /* Next is regular lconv() */
3149
f5988947
KW
3150 /* There are so many locks because localeconv() deals with two
3151 * categories, and returns in a single global static buffer. Some
3152 * locks might be no-ops on this platform, but not others. We need to
3153 * lock if any one isn't a no-op. */
b7f19487 3154
f5988947
KW
3155# ifdef USE_LOCALE_NUMERIC
3156
3157 LC_NUMERIC_LOCK(0);
0630bfb4
KW
3158 const char * orig_switched_locale = NULL;
3159
3160 /* When called internally, are already switched into the proper numeric
3161 * locale; otherwise must toggle to it */
3162 if (is_localeconv_call) {
3163 orig_switched_locale = toggle_locale_c(LC_NUMERIC, PL_numeric_name);
3164 }
f5988947
KW
3165
3166# endif
b7f19487
KW
3167
3168 LOCALECONV_LOCK;
0630bfb4
KW
3169 retval = copy_localeconv(aTHX_ localeconv(),
3170 item,
3171 numeric_locale_is_utf8,
3172 monetary_locale_is_utf8);
b7f19487 3173 LOCALECONV_UNLOCK;
f5988947
KW
3174
3175# ifdef USE_LOCALE_NUMERIC
3176
0630bfb4
KW
3177 if (orig_switched_locale) {
3178 restore_toggled_locale_c(LC_NUMERIC, orig_switched_locale);
3179 }
f5988947
KW
3180 LC_NUMERIC_UNLOCK;
3181
3182# endif
b7f19487
KW
3183
3184 return retval;
3185
3186/*--------------------------------------------------------------------------*/
3187# else /* defined(TS_W32_BROKEN_LOCALECONV) */
3188
3189 /* Last is a workaround for the broken localeconv() on Windows with
3190 * thread-safe locales prior to VS 15. It looks at the global locale
3191 * instead of the thread one. As a work-around, we toggle to the global
3192 * locale; populate the return; then toggle back. We have to use LC_ALL
3193 * instead of the individual categories because of another bug in Windows.
3194 *
3195 * This introduces a potential race with any other thread that has also
3196 * converted to use the global locale, and doesn't protect its locale calls
3197 * with mutexes. khw can't think of any reason for a thread to do so on
3198 * Windows, as the locale API is the same regardless of thread-safety, except
3199 * if the code is ported from working on another platform where there might
3200 * be some reason to do this. But this is typically due to some
3201 * alien-to-Perl library that thinks it owns locale setting. Such a
3202 * library usn't likely to exist on Windows, so such an application is
3203 * unlikely to be run on Windows
3204 */
3205 bool restore_per_thread = FALSE;
b7f19487 3206
f5988947
KW
3207# ifdef USE_LOCALE_NUMERIC
3208
3209 const char * orig_switched_locale = NULL;
3210
3211 LC_NUMERIC_LOCK(0);
0630bfb4
KW
3212
3213 /* When called internally, are already switched into the proper numeric
3214 * locale; otherwise must toggle to it */
3215 if (is_localeconv_call) {
3216 orig_switched_locale = toggle_locale_c(LC_NUMERIC, PL_numeric_name);
3217 }
f5988947
KW
3218
3219# endif
b7f19487
KW
3220
3221 /* Save the per-thread locale state */
3222 const char * save_thread = querylocale_c(LC_ALL);
3223
3224 /* Change to the global locale, and note if we already were there */
3225 if (_configthreadlocale(_DISABLE_PER_THREAD_LOCALE)
3226 != _DISABLE_PER_THREAD_LOCALE)
3227 {
3228 restore_per_thread = TRUE;
3229 }
3230
3231 /* Save the state of the global locale; then convert to our desired
3232 * state. */
3233 const char * save_global = querylocale_c(LC_ALL);
3234 void_setlocale_c(LC_ALL, save_thread);
3235
3236 /* Safely stash the desired data */
3237 LOCALECONV_LOCK;
0630bfb4
KW
3238 retval = copy_localeconv(aTHX_ localeconv(),
3239 item,
3240 numeric_locale_is_utf8,
3241 monetary_locale_is_utf8);
b7f19487
KW
3242 LOCALECONV_UNLOCK;
3243
3244 /* Restore the global locale's prior state */
3245 void_setlocale_c(LC_ALL, save_global);
3246
3247 /* And back to per-thread locales */
3248 if (restore_per_thread) {
3249 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
3250 }
3251
3252 /* Restore the per-thread locale state */
3253 void_setlocale_c(LC_ALL, save_thread);
3254
f5988947
KW
3255# ifdef USE_LOCALE_NUMERIC
3256
0630bfb4
KW
3257 if (orig_switched_locale) {
3258 restore_toggled_locale_c(LC_NUMERIC, orig_switched_locale);
3259 }
f5988947
KW
3260 LC_NUMERIC_UNLOCK;
3261
3262# endif
b7f19487
KW
3263
3264 return retval;
3265
3266# endif
3267/*--------------------------------------------------------------------------*/
3268}
3269
3270STATIC HV *
3271S_populate_localeconv(pTHX_ const struct lconv *lcbuf,
0630bfb4 3272 const int unused,
b7f19487
KW
3273 const locale_utf8ness_t numeric_locale_is_utf8,
3274 const locale_utf8ness_t monetary_locale_is_utf8)
3275{
3276 /* This returns a mortalized hash containing all the elements returned by
3277 * localeconv(). It is used by Perl_localeconv() and POSIX::localeconv()
3278 */
0630bfb4 3279 PERL_UNUSED_ARG(unused);
b7f19487
KW
3280
3281 struct lconv_offset {
3282 const char *name;
3283 size_t offset;
3284 };
3285
3286 /* Create e.g.,
3287 {"thousands_sep", STRUCT_OFFSET(struct lconv, thousands_sep)},
3288 */
3289# define LCONV_ENTRY(name) \
3290 {STRINGIFY(name), STRUCT_OFFSET(struct lconv, name)}
3291
3292 /* Set up structures containing the documented fields. One structure for
3293 * LC_NUMERIC-controlled strings; one for LC_MONETARY ones, and a final one
3294 * of just numerics. */
3295# ifdef USE_LOCALE_NUMERIC
3296
3297 static const struct lconv_offset lconv_numeric_strings[] = {
3298 LCONV_ENTRY(decimal_point),
3299 LCONV_ENTRY(thousands_sep),
3300# ifndef NO_LOCALECONV_GROUPING
3301 LCONV_ENTRY(grouping),
3302# endif
3303 {NULL, 0}
3304 };
3305
3306# endif
3307# ifdef USE_LOCALE_MONETARY
3308
3309 static const struct lconv_offset lconv_monetary_strings[] = {
3310 LCONV_ENTRY(int_curr_symbol),
3311 LCONV_ENTRY(currency_symbol),
3312 LCONV_ENTRY(mon_decimal_point),
3313# ifndef NO_LOCALECONV_MON_THOUSANDS_SEP
3314 LCONV_ENTRY(mon_thousands_sep),
3315# endif
3316# ifndef NO_LOCALECONV_MON_GROUPING
3317 LCONV_ENTRY(mon_grouping),
3318# endif
3319 LCONV_ENTRY(positive_sign),
3320 LCONV_ENTRY(negative_sign),
3321 {NULL, 0}
3322 };
3323
3324# endif
3325
3326 static const struct lconv_offset lconv_integers[] = {
3327# ifdef USE_LOCALE_MONETARY
3328 LCONV_ENTRY(int_frac_digits),
3329 LCONV_ENTRY(frac_digits),
3330 LCONV_ENTRY(p_cs_precedes),
3331 LCONV_ENTRY(p_sep_by_space),
3332 LCONV_ENTRY(n_cs_precedes),
3333 LCONV_ENTRY(n_sep_by_space),
3334 LCONV_ENTRY(p_sign_posn),
3335 LCONV_ENTRY(n_sign_posn),
3336# ifdef HAS_LC_MONETARY_2008
3337 LCONV_ENTRY(int_p_cs_precedes),
3338 LCONV_ENTRY(int_p_sep_by_space),
3339 LCONV_ENTRY(int_n_cs_precedes),
3340 LCONV_ENTRY(int_n_sep_by_space),
3341 LCONV_ENTRY(int_p_sign_posn),
3342 LCONV_ENTRY(int_n_sign_posn),
3343# endif
3344# endif
3345 {NULL, 0}
3346 };
3347
3348 static const unsigned category_indices[] = {
3349# ifdef USE_LOCALE_NUMERIC
3350 LC_NUMERIC_INDEX_,
3351# endif
3352# ifdef USE_LOCALE_MONETARY
3353 LC_MONETARY_INDEX_,
3354# endif
3355 (unsigned) -1 /* Just so the previous element can always end with a
3356 comma => subtract 1 below for the max loop index */
3357 };
3358
3359 const char *ptr = (const char *) lcbuf;
3360 const struct lconv_offset *integers = lconv_integers;
3361
3362 HV * retval = newHV();
3363 sv_2mortal((SV*)retval);
3364
3365 PERL_ARGS_ASSERT_POPULATE_LOCALECONV;
3366
3367 /* For each enabled category ... */
3368 for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(category_indices) - 1; i++) {
3369 const unsigned cat_index = category_indices[i];
3370 locale_utf8ness_t locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
3371 const char *locale;
3372
3373 /* ( = NULL silences a compiler warning; would segfault if it could
3374 * actually happen.) */
3375 const struct lconv_offset *strings = NULL;
3376
3377# ifdef USE_LOCALE_NUMERIC
3378 if (cat_index == LC_NUMERIC_INDEX_) {
3379 locale_is_utf8 = numeric_locale_is_utf8;
3380 strings = lconv_numeric_strings;
3381 }
0630bfb4
KW
3382# else
3383 PERL_UNUSED_ARG(numeric_locale_is_utf8);
b7f19487
KW
3384# endif
3385# ifdef USE_LOCALE_MONETARY
3386 if (cat_index == LC_MONETARY_INDEX_) {
3387 locale_is_utf8 = monetary_locale_is_utf8;
3388 strings = lconv_monetary_strings;
3389 }
0630bfb4
KW
3390# else
3391 PERL_UNUSED_ARG(monetary_locale_is_utf8);
b7f19487
KW
3392# endif
3393
3394 assert(locale_is_utf8 != LOCALE_UTF8NESS_UNKNOWN);
3395
3396 /* Iterate over the strings structure for this category */
0630bfb4 3397 locale = querylocale_i(cat_index);
b7f19487
KW
3398
3399 while (strings->name) {
3400 const char *value = *((const char **)(ptr + strings->offset));
3401 if (value && *value) {
3402 bool is_utf8 = /* Only make UTF-8 if required to */
3403 (UTF8NESS_YES == (get_locale_string_utf8ness_i(locale,
3404 cat_index,
3405 value,
3406 locale_is_utf8)));
3407 (void) hv_store(retval,
3408 strings->name,
3409 strlen(strings->name),
3410 newSVpvn_utf8(value, strlen(value), is_utf8),
3411 0);
3412 }
3413
3414 strings++;
3415 }
3416 }
3417
3418 while (integers->name) {
3419 const char value = *((const char *)(ptr + integers->offset));
3420
3421 if (value != CHAR_MAX)
3422 (void) hv_store(retval, integers->name,
3423 strlen(integers->name), newSViv(value), 0);
3424 integers++;
3425 }
3426
3427 return retval;
3428}
3429
0630bfb4
KW
3430# ifndef HAS_SOME_LANGINFO
3431
3432STATIC HV *
3433S_get_nl_item_from_localeconv(pTHX_ const struct lconv *lcbuf,
3434 const int item,
3435 const locale_utf8ness_t unused1,
3436 const locale_utf8ness_t unused2)
3437{
3438 /* This is a helper function for my_localeconv(), which is called from
3439 * my_langinfo() to emulate the libc nl_langinfo() function on platforms
3440 * that don't have it available.
3441 *
3442 * This function acts as an extension to my_langinfo(), the intermediate
3443 * my_localeconv() call is to set up the locks and switch into the proper
3444 * locale. That logic exists for other reasons, and by doing it this way,
3445 * it doesn't have to be duplicated.
3446 *
3447 * This function extracts the current value of 'item' in the current locale
3448 * using the localconv() result also passed in, via 'lcbuf'. The other
3449 * parameter is unused, a placeholder so the signature of this function
3450 * matches another that does need it, and so the two functions can be
3451 * referred to by a single function pointer, to simplify the code below */
3452
3453 const char * prefix = "";
3454 const char * temp = NULL;
3455
3456 PERL_ARGS_ASSERT_GET_NL_ITEM_FROM_LOCALECONV;
3457 PERL_UNUSED_ARG(unused1);
3458 PERL_UNUSED_ARG(unused2);
3459
3460 switch (item) {
3461 case CRNCYSTR:
3462 temp = lcbuf->currency_symbol;
3463
3464 if (lcbuf->p_cs_precedes) {
3465
3466 /* khw couldn't find any documentation that CHAR_MAX is the signal,
3467 * but cygwin uses it thusly */
3468 if (lcbuf->p_cs_precedes == CHAR_MAX) {
3469 prefix = ".";
3470 }
3471 else {
3472 prefix = "-";
3473 }
3474 }
3475 else {
3476 prefix = "+";
3477 }
3478
3479 break;
3480
3481 case RADIXCHAR:
3482 temp = lcbuf->decimal_point;
3483 break;
3484
3485 case THOUSEP:
3486 temp = lcbuf->thousands_sep;
3487 break;
3488
3489 default:
3490 locale_panic_(Perl_form(aTHX_
3491 "Unexpected item passed to populate_localeconv: %d", item));
3492 }
3493
3494 return (HV *) Perl_newSVpvf(aTHX_ "%s%s", prefix, temp);
3495}
3496
3497# endif /* ! Has some form of langinfo() */
b7f19487
KW
3498#endif /* Has some form of localeconv() and paying attn to a category it
3499 traffics in */
3500
d0826532
KW
3501#ifndef HAS_SOME_LANGINFO
3502
3503typedef int nl_item; /* Substitute 'int' for emulated nl_langinfo() */
3504
3505#endif
3506
b7f19487 3507/*
f7416781 3508
d0826532
KW
3509=for apidoc Perl_langinfo
3510=for apidoc_item Perl_langinfo8
3511
3512C<Perl_langinfo> is an (almost) drop-in replacement for the system
3513C<L<nl_langinfo(3)>>, taking the same C<item> parameter values, and returning
3514the same information. But it is more thread-safe than regular
3515C<nl_langinfo()>, and hides the quirks of Perl's locale handling from your
3516code, and can be used on systems that lack a native C<nl_langinfo>.
f7416781 3517
d0826532
KW
3518However, you should instead use the improved version of this:
3519L</Perl_langinfo8>, which behaves identically except for an additional
3520parameter, a pointer to a variable declared as L</C<utf8ness_t>>, into which it
3521returns to you how you should treat the returned string with regards to it
3522being encoded in UTF-8 or not.
f7416781 3523
d0826532 3524Concerning the differences between these and plain C<nl_langinfo()>:
f7416781
KW
3525
3526=over
3527
d0826532 3528=item a.
f7416781 3529
d0826532
KW
3530C<Perl_langinfo8> has an extra parameter, described above. Besides this, the
3531other reasons they aren't quite a drop-in replacement is actually an advantage.
3532The C<const>ness of the return allows the compiler to catch attempts to write
3533into the returned buffer, which is illegal and could cause run-time crashes.
ce181bf6 3534
d0826532 3535=item b.
ce181bf6 3536
d0826532 3537They deliver the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
f7416781
KW
3538without you having to write extra code. The reason for the extra code would be
3539because these are from the C<LC_NUMERIC> locale category, which is normally
9487427b
KW
3540kept set by Perl so that the radix is a dot, and the separator is the empty
3541string, no matter what the underlying locale is supposed to be, and so to get
3542the expected results, you have to temporarily toggle into the underlying
3543locale, and later toggle back. (You could use plain C<nl_langinfo> and
3544C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this but then you wouldn't get
3545the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
3546(or equivalent) locale would break a lot of CPAN, which is expecting the radix
3547(decimal point) character to be a dot.)
ce181bf6 3548
d0826532 3549=item c.
ce181bf6 3550
d0826532 3551The system function they replace can have its static return buffer trashed,
f1460a66 3552not only by a subsequent call to that function, but by a C<freelocale>,
d0826532
KW
3553C<setlocale>, or other locale change. The returned buffer of these functions
3554is not changed until the next call to one or the other, so the buffer is never
3555in a trashed state.
f7416781 3556
d0826532 3557=item d.
f7416781 3558
d0826532
KW
3559The return buffer is per-thread, so it also is never overwritten by a call to
3560these functions from another thread; unlike the function it replaces.
ce181bf6 3561
d0826532 3562=item e.
ce181bf6 3563
d0826532
KW
3564But most importantly, they work on systems that don't have C<nl_langinfo>, such
3565as Windows, hence making your code more portable. Of the fifty-some possible
ce181bf6 3566items specified by the POSIX 2008 standard,
f7416781 3567L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
8d72e74e 3568only one is completely unimplemented, though on non-Windows platforms, another
d0826532 3569significant one is not fully implemented). They use various techniques to
8d72e74e
KW
3570recover the other items, including calling C<L<localeconv(3)>>, and
3571C<L<strftime(3)>>, both of which are specified in C89, so should be always be
3572available. Later C<strftime()> versions have additional capabilities; C<""> is
d0826532 3573returned for any item not available on your system.
ce181bf6 3574
d0826532 3575It is important to note that, when called with an item that is recovered by
ce181bf6 3576using C<localeconv>, the buffer from any previous explicit call to
d0826532
KW
3577C<L<localeconv(3)>> will be overwritten. But you shouldn't be using
3578C<localeconv> anyway because it is is very much not thread-safe, and suffers
3579from the same problems outlined in item 'b.' above for the fields it returns that
3580are controlled by the LC_NUMERIC locale category. Instead, avoid all of those
3581problems by calling L</Perl_localeconv>, which is thread-safe; or by using the
3582methods given in L<perlcall> to call
3583L<C<POSIX::localeconv()>|POSIX/localeconv>, which is also thread-safe.
3584
3585=back
9f0bc911 3586
6201c220
KW
3587The details for those items which may deviate from what this emulation returns
3588and what a native C<nl_langinfo()> would return are specified in
3589L<I18N::Langinfo>.
f7416781 3590
d0826532
KW
3591When using C<Perl_langinfo8> (or plain C<Perl_langinfo>) on systems that don't
3592have a native C<nl_langinfo()>, you must
f7416781
KW
3593
3594 #include "perl_langinfo.h"
3595
3596before the C<perl.h> C<#include>. You can replace your C<langinfo.h>
3597C<#include> with this one. (Doing it this way keeps out the symbols that plain
c3997e39
KW
3598C<langinfo.h> would try to import into the namespace for code that doesn't need
3599it.)
f7416781 3600
f7416781
KW
3601=cut
3602
3603*/
3604
3605const char *
f7416781 3606Perl_langinfo(const nl_item item)
f7416781 3607{
d0826532
KW
3608 return Perl_langinfo8(item, NULL);
3609}
3610
3611const char *
3612Perl_langinfo8(const nl_item item, utf8ness_t * utf8ness)
3613{
07492de5 3614 dTHX;
fc46dd0d
KW
3615 unsigned cat_index;
3616
d0826532
KW
3617 PERL_ARGS_ASSERT_PERL_LANGINFO8;
3618
3619 if (utf8ness) { /* Assume for now */
3620 *utf8ness = UTF8NESS_IMMATERIAL;
3621 }
3622
fc46dd0d
KW
3623 /* Find the locale category that controls the input 'item'. If we are not
3624 * paying attention to that category, instead return a default value. Also
3625 * return the default value if there is no way for us to figure out the
3626 * correct value. If we have some form of nl_langinfo(), we can always
3627 * figure it out, but lacking that, there may be alternative methods that
3628 * can be used to recover most of the possible items. Some of those
3629 * methods need libc functions, which may or may not be available. If
3630 * unavailable, we can't compute the correct value, so must here return the
3631 * default. */
faef259e 3632 switch (item) {
fc46dd0d
KW
3633
3634 case CODESET:
faef259e
KW
3635
3636#ifdef USE_LOCALE_CTYPE
faef259e 3637
fc46dd0d
KW
3638 cat_index = LC_CTYPE_INDEX_;
3639 break;
faef259e 3640
fc46dd0d
KW
3641#else
3642 return C_codeset;
faef259e
KW
3643#endif
3644#if defined(USE_LOCALE_MESSAGES) && defined(HAS_SOME_LANGINFO)
faef259e 3645
fc46dd0d
KW
3646 case YESEXPR: case YESSTR: case NOEXPR: case NOSTR:
3647 cat_index = LC_MESSAGES_INDEX_;
3648 break;
3649#else
faef259e
KW
3650 case YESEXPR: return "^[+1yY]";
3651 case YESSTR: return "yes";
3652 case NOEXPR: return "^[-0nN]";
3653 case NOSTR: return "no";
faef259e 3654#endif
fc46dd0d
KW
3655
3656 case CRNCYSTR:
3657
faef259e
KW
3658#if defined(USE_LOCALE_MONETARY) \
3659 && (defined(HAS_SOME_LANGINFO) || defined(HAS_SOME_LOCALECONV))
faef259e 3660
fc46dd0d
KW
3661 cat_index = LC_MONETARY_INDEX_;
3662 break;
3663#else
faef259e 3664 return "-";
faef259e 3665#endif
fc76ce84 3666
fc46dd0d
KW
3667 case RADIXCHAR:
3668
fc76ce84 3669#ifdef CAN_CALCULATE_RADIX
faef259e 3670
fc46dd0d
KW
3671 cat_index = LC_NUMERIC_INDEX_;
3672 break;
3673#else
faef259e 3674 return C_decimal_point;
fc76ce84 3675#endif
fc46dd0d
KW
3676
3677 case THOUSEP:
3678
fc76ce84
KW
3679#if defined(USE_LOCALE_NUMERIC) \
3680 && (defined(HAS_SOME_LANGINFO) || defined(HAS_SOME_LOCALECONV))
fc46dd0d
KW
3681
3682 cat_index = LC_NUMERIC_INDEX_;
3683 break;
fc76ce84 3684#else
faef259e 3685 return C_thousands_sep;
fc46dd0d
KW
3686#endif
3687
3688/* The other possible items are all in LC_TIME. */
3689#ifdef USE_LOCALE_TIME
3690
3691 default:
3692 cat_index = LC_TIME_INDEX_;
3693 break;
faef259e
KW
3694
3695#endif
3696#if ! defined(USE_LOCALE_TIME) || ! defined(HAS_SOME_LANGINFO)
3697
3698 /* If not using LC_TIME, hard code the rest. Or, if there is no
3699 * nl_langinfo(), we use strftime() as an alternative, and it is missing
3700 * functionality to get every single one, so hard-code those */
3701
3702 case ERA: return ""; /* Unimplemented; for use with strftime() %E
3703 modifier */
3704
3705 /* These formats are defined by C89, so we assume that strftime supports
3706 * them, and so are returned unconditionally; they may not be what the
3707 * locale actually says, but should give good enough results for someone
3708 * using them as formats (as opposed to trying to parse them to figure
3709 * out what the locale says). The other format items are actually tested
3710 * to verify they work on the platform */
3711 case D_FMT: return "%x";
3712 case T_FMT: return "%X";
3713 case D_T_FMT: return "%c";
3714
3715# if defined(WIN32) || ! defined(USE_LOCALE_TIME)
3716
3717 /* strftime() on Windows doesn't have the POSIX (beyond C89) extensions
3718 * that would allow it to recover these */
3719 case ERA_D_FMT: return "%x";
3720 case ERA_T_FMT: return "%X";
3721 case ERA_D_T_FMT: return "%c";
3722 case ALT_DIGITS: return "0";
3723
3724# endif
fc46dd0d 3725# ifndef USE_LOCALE_TIME
faef259e
KW
3726
3727 case T_FMT_AMPM: return "%r";
3728 case ABDAY_1: return "Sun";
3729 case ABDAY_2: return "Mon";
3730 case ABDAY_3: return "Tue";
3731 case ABDAY_4: return "Wed";
3732 case ABDAY_5: return "Thu";
3733 case ABDAY_6: return "Fri";
3734 case ABDAY_7: return "Sat";
3735 case AM_STR: return "AM";
3736 case PM_STR: return "PM";
3737 case ABMON_1: return "Jan";
3738 case ABMON_2: return "Feb";
3739 case ABMON_3: return "Mar";
3740 case ABMON_4: return "Apr";
3741 case ABMON_5: return "May";
3742 case ABMON_6: return "Jun";
3743 case ABMON_7: return "Jul";
3744 case ABMON_8: return "Aug";
3745 case ABMON_9: return "Sep";
3746 case ABMON_10: return "Oct";
3747 case ABMON_11: return "Nov";
3748 case ABMON_12: return "Dec";
3749 case DAY_1: return "Sunday";
3750 case DAY_2: return "Monday";
3751 case DAY_3: return "Tuesday";
3752 case DAY_4: return "Wednesday";
3753 case DAY_5: return "Thursday";
3754 case DAY_6: return "Friday";
3755 case DAY_7: return "Saturday";
3756 case MON_1: return "January";
3757 case MON_2: return "February";
3758 case MON_3: return "March";
3759 case MON_4: return "April";
3760 case MON_5: return "May";
3761 case MON_6: return "June";
3762 case MON_7: return "July";
3763 case MON_8: return "August";
3764 case MON_9: return "September";
3765 case MON_10: return "October";
3766 case MON_11: return "November";
3767 case MON_12: return "December";
3768
fc46dd0d 3769# endif
faef259e
KW
3770#endif
3771
3772 } /* End of switch on item */
3773
3774#ifndef USE_LOCALE
3775
3776 Perl_croak_nocontext("panic: Unexpected nl_langinfo() item %d", item);
3777 NOT_REACHED; /* NOTREACHED */
fc46dd0d 3778 PERL_UNUSED_VAR(cat_index);
faef259e
KW
3779
3780#else
fc46dd0d
KW
3781# ifdef USE_LOCALE_NUMERIC
3782
3783 /* Use either the underlying numeric, or the other underlying categories */
3784 if (cat_index == LC_NUMERIC_INDEX_) {
3785 return my_langinfo_c(item, LC_NUMERIC, PL_numeric_name,
d0826532 3786 &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
fc46dd0d
KW
3787 }
3788 else
3789
3790# endif
faef259e 3791
fc46dd0d
KW
3792 {
3793 return my_langinfo_i(item, cat_index, querylocale_i(cat_index),
d0826532 3794 &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
fc46dd0d 3795 }
faef259e
KW
3796
3797#endif
3798
f61748ac
KW
3799}
3800
faef259e
KW
3801#ifdef USE_LOCALE
3802
07492de5
KW
3803/* There are several implementations of my_langinfo, depending on the
3804 * Configuration. They all share the same beginning of the function */
59a15af0 3805STATIC const char *
fc46dd0d 3806S_my_langinfo_i(pTHX_
07492de5 3807 const nl_item item, /* The item to look up */
fc46dd0d
KW
3808 const unsigned int cat_index, /* The locale category that
3809 controls it */
3810 /* The locale to look up 'item' in. */
3811 const char * locale,
07492de5
KW
3812
3813 /* Where to store the result, and where the size of that buffer
3814 * is stored, updated on exit. retbuf_sizep may be NULL for an
3815 * empty-on-entry, single use buffer whose size we don't need
3816 * to keep track of */
3817 const char ** retbufp,
fffcb78f
KW
3818 Size_t * retbuf_sizep,
3819
3820 /* If not NULL, the location to store the UTF8-ness of 'item's
3821 * value, as documented */
3822 utf8ness_t * utf8ness)
f61748ac 3823{
07492de5 3824 const char * retval = NULL;
faef259e 3825
fc46dd0d
KW
3826 PERL_ARGS_ASSERT_MY_LANGINFO_I;
3827 assert(cat_index <= NOMINAL_LC_ALL_INDEX);
5a854ab3 3828
fc46dd0d
KW
3829 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3830 "Entering my_langinfo item=%d, using locale %s\n",
3831 item, locale));
fe1c1494
KW
3832/*--------------------------------------------------------------------------*/
3833/* Above is the common beginning to all the implementations of my_langinfo().
332c5a2a
KW
3834 * Below are the various completions.
3835 *
3836 * Some platforms don't deal well with non-ASCII strings in locale X when
3837 * LC_CTYPE is not in X. (Actually it is probably when X is UTF-8 and LC_CTYPE
3838 * isn't, or vice versa). There is explicit code to bring the categories into
3839 * sync. This doesn't seem to be a problem with nl_langinfo(), so that
3840 * implementation doesn't currently worry about it. But it is a problem on
3841 * Windows boxes, which don't have nl_langinfo(). */
3842
fc46dd0d 3843# if defined(HAS_THREAD_SAFE_NL_LANGINFO_L) && defined(USE_POSIX_2008_LOCALE)
f7416781 3844
332c5a2a
KW
3845 /* Simplest is if we can use nl_langinfo_l()
3846 *
3847 * With it, we can change LC_CTYPE in the same call as the other category */
3848# ifdef USE_LOCALE_CTYPE
3849# define CTYPE_SAFETY_MASK LC_CTYPE_MASK
3850# else
3851# define CTYPE_SAFETY_MASK 0
3852# endif
f7416781 3853
332c5a2a
KW
3854 locale_t cur = newlocale((category_masks[cat_index] | CTYPE_SAFETY_MASK),
3855 locale, (locale_t) 0);
ee90eb2d 3856
fc46dd0d 3857 retval = save_to_buffer(nl_langinfo_l(item, cur), retbufp, retbuf_sizep);
fffcb78f
KW
3858 if (utf8ness) {
3859 *utf8ness = get_locale_string_utf8ness_i(locale, cat_index, retval,
3860 LOCALE_UTF8NESS_UNKNOWN);
3861 }
5acc4454 3862
fc46dd0d 3863 freelocale(cur);
07492de5
KW
3864
3865 return retval;
fe1c1494 3866/*--------------------------------------------------------------------------*/
fc46dd0d 3867# elif defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
ab340fff 3868
fc46dd0d 3869 /* The second version of my_langinfo() is if we have plain nl_langinfo() */
d36adde0 3870
332c5a2a
KW
3871# ifdef USE_LOCALE_CTYPE
3872
3873 /* Ths function sorts out if things actually have to be switched or not,
3874 * for both calls. */
3875 const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
3876
3877# endif
3878
fc46dd0d 3879 const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
ab340fff 3880
fc46dd0d
KW
3881 NL_LANGINFO_LOCK;
3882 retval = save_to_buffer(nl_langinfo(item), retbufp, retbuf_sizep);
3883 NL_LANGINFO_UNLOCK;
d36adde0 3884
fffcb78f
KW
3885 if (utf8ness) {
3886 *utf8ness = get_locale_string_utf8ness_i(locale, cat_index,
3887 retval, LOCALE_UTF8NESS_UNKNOWN);
3888 }
3889
fc46dd0d 3890 restore_toggled_locale_i(cat_index, orig_switched_locale);
ab340fff 3891
332c5a2a
KW
3892# ifdef USE_LOCALE_CTYPE
3893 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
3894# endif
3895
0b6697c5 3896 return retval;
fe1c1494
KW
3897/*--------------------------------------------------------------------------*/
3898# else /* Below, emulate nl_langinfo as best we can */
43dd6b15 3899
fc46dd0d
KW
3900 /* And the third and final completion is where we have to emulate
3901 * nl_langinfo(). There are various possibilities depending on the
3902 * Configuration */
69c5e0db 3903
332c5a2a
KW
3904# ifdef USE_LOCALE_CTYPE
3905
3906 const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
3907
3908# endif
f7416781 3909
fc46dd0d
KW
3910 const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
3911
3912 /* Here, we are in the locale we want information about */
3913
fffcb78f
KW
3914 /* Almost all the items will have ASCII return values. Set that here, and
3915 * override if necessary */
3916 utf8ness_t is_utf8 = UTF8NESS_IMMATERIAL;
3917
fc46dd0d
KW
3918 switch (item) {
3919 default:
3920 retval = "";
3921 break;
3922
fc76ce84
KW
3923 case RADIXCHAR:
3924
3925# if defined(HAS_SNPRINTF) \
3926 && (! defined(HAS_SOME_LOCALECONV) || defined(TS_W32_BROKEN_LOCALECONV))
69c5e0db 3927
ea80ef5d 3928 {
fc76ce84
KW
3929 /* snprintf() can be used to find the radix character by outputting
3930 * a known simple floating point number to a buffer, and parsing
3931 * it, inferring the radix as the bytes separating the integer and
3932 * fractional parts. But localeconv() is more direct, not
3933 * requiring inference, so use it instead of the code just below,
3934 * if (likely) it is available and works ok */
69c5e0db 3935
ea80ef5d
KW
3936 char * floatbuf = NULL;
3937 const Size_t initial_size = 10;
69c5e0db 3938
ea80ef5d 3939 Newx(floatbuf, initial_size, char);
692d08c5
KW
3940
3941 /* 1.5 is exactly representable on binary computers */
ea80ef5d 3942 Size_t needed_size = snprintf(floatbuf, initial_size, "%.1f", 1.5);
692d08c5
KW
3943
3944 /* If our guess wasn't big enough, increase and try again, based on
3945 * the real number that strnprintf() is supposed to return */
3946 if (UNLIKELY(needed_size >= initial_size)) {
ea80ef5d
KW
3947 needed_size++; /* insurance */
3948 Renew(floatbuf, needed_size, char);
3949 Size_t new_needed = snprintf(floatbuf, needed_size, "%.1f", 1.5);
3950 assert(new_needed <= needed_size);
3951 needed_size = new_needed;
3952 }
69c5e0db 3953
ea80ef5d
KW
3954 char * s = floatbuf;
3955 char * e = floatbuf + needed_size;
69c5e0db 3956
ea80ef5d
KW
3957 /* Find the '1' */
3958 while (s < e && *s != '1') {
3959 s++;
3960 }
eb1b97ad 3961
ea80ef5d
KW
3962 if (LIKELY(s < e)) {
3963 s++;
eb1b97ad 3964 }
69c5e0db 3965
ea80ef5d
KW
3966 /* Find the '5' */
3967 char * item_start = s;
3968 while (s < e && *s != '5') {
3969 s++;
3970 }
69c5e0db 3971
ea80ef5d 3972 /* Everything in between is the radix string */
692d08c5 3973 if (LIKELY(s < e)) {
ea80ef5d
KW
3974 *s = '\0';
3975 retval = save_to_buffer(item_start,
3976 (const char **) &PL_langinfo_buf,
3977 &PL_langinfo_bufsize);
3978 Safefree(floatbuf);
fffcb78f
KW
3979
3980 if (utf8ness) {
3981 is_utf8 = get_locale_string_utf8ness_i(locale, cat_index,
3982 retval,
3983 LOCALE_UTF8NESS_UNKNOWN);
3984
3985 }
3986
69c5e0db 3987 break;
ea80ef5d
KW
3988 }
3989
3990 Safefree(floatbuf);
3991 }
69c5e0db 3992
fc76ce84
KW
3993# ifdef HAS_SOME_LOCALECONV /* snprintf() failed; drop down to use
3994 localeconv() */
3995
3996 /* FALLTHROUGH */ \
69c5e0db 3997
fc76ce84 3998# else /* snprintf() failed and no localeconv() */
69c5e0db 3999
fc46dd0d
KW
4000 retval = C_decimal_point;
4001 break;
fc76ce84
KW
4002
4003# endif
69c5e0db 4004# endif
fc76ce84 4005# ifdef HAS_SOME_LOCALECONV
69c5e0db 4006
692d08c5
KW
4007 /* These items are available from localeconv(). (To avoid using
4008 * TS_W32_BROKEN_LOCALECONV, one could use GetNumberFormat and
4009 * GetCurrencyFormat; patches welcome) */
5acc4454 4010
0630bfb4
KW
4011 case CRNCYSTR:
4012 case THOUSEP:
4013 {
4014 SV * string = (SV *) my_localeconv(item, LOCALE_UTF8NESS_UNKNOWN);
69c5e0db 4015
0630bfb4 4016 retval = save_to_buffer(SvPV_nolen(string), retbufp, retbuf_sizep);
5acc4454 4017
fffcb78f
KW
4018 if (utf8ness) {
4019 is_utf8 = get_locale_string_utf8ness_i(locale, cat_index, retval,
4020 LOCALE_UTF8NESS_UNKNOWN);
4021 }
4022
0630bfb4 4023 SvREFCNT_dec_NN(string);
fffcb78f 4024 break;
0630bfb4 4025 }
f7416781 4026
909e71b1
KW
4027# endif /* Some form of localeconv */
4028# ifdef HAS_STRFTIME
4029
4030 /* These formats are only available in later strfmtime's */
4031 case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM:
4032
4033 /* The rest can be gotten from most versions of strftime(). */
4034 case ABDAY_1: case ABDAY_2: case ABDAY_3:
4035 case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7:
4036 case ALT_DIGITS:
4037 case AM_STR: case PM_STR:
4038 case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4:
4039 case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8:
4040 case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12:
4041 case DAY_1: case DAY_2: case DAY_3: case DAY_4:
4042 case DAY_5: case DAY_6: case DAY_7:
4043 case MON_1: case MON_2: case MON_3: case MON_4:
4044 case MON_5: case MON_6: case MON_7: case MON_8:
4045 case MON_9: case MON_10: case MON_11: case MON_12:
dd8a7039
KW
4046 {
4047 const char * format;
4048 bool return_format = FALSE;
4049 int mon = 0;
4050 int mday = 1;
4051 int hour = 6;
639a7c35 4052
909e71b1 4053 GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
639a7c35 4054
909e71b1
KW
4055 switch (item) {
4056 default:
20ad7b23 4057 locale_panic_(Perl_form(aTHX_ "switch case: %d problem", item));
909e71b1
KW
4058 NOT_REACHED; /* NOTREACHED */
4059
4060 case PM_STR: hour = 18;
4061 case AM_STR:
4062 format = "%p";
4063 break;
4064 case ABDAY_7: mday++;
4065 case ABDAY_6: mday++;
4066 case ABDAY_5: mday++;
4067 case ABDAY_4: mday++;
4068 case ABDAY_3: mday++;
4069 case ABDAY_2: mday++;
4070 case ABDAY_1:
4071 format = "%a";
4072 break;
4073 case DAY_7: mday++;
4074 case DAY_6: mday++;
4075 case DAY_5: mday++;
4076 case DAY_4: mday++;
4077 case DAY_3: mday++;
4078 case DAY_2: mday++;
4079 case DAY_1:
4080 format = "%A";
4081 break;
4082 case ABMON_12: mon++;
4083 case ABMON_11: mon++;
4084 case ABMON_10: mon++;
4085 case ABMON_9: mon++;
4086 case ABMON_8: mon++;
4087 case ABMON_7: mon++;
4088 case ABMON_6: mon++;
4089 case ABMON_5: mon++;
4090 case ABMON_4: mon++;
4091 case ABMON_3: mon++;
4092 case ABMON_2: mon++;
4093 case ABMON_1:
4094 format = "%b";
4095 break;
4096 case MON_12: mon++;
4097 case MON_11: mon++;
4098 case MON_10: mon++;
4099 case MON_9: mon++;
4100 case MON_8: mon++;
4101 case MON_7: mon++;
4102 case MON_6: mon++;
4103 case MON_5: mon++;
4104 case MON_4: mon++;
4105 case MON_3: mon++;
4106 case MON_2: mon++;
4107 case MON_1:
4108 format = "%B";
4109 break;
4110 case T_FMT_AMPM:
4111 format = "%r";
4112 return_format = TRUE;
4113 break;
4114 case ERA_D_FMT:
4115 format = "%Ex";
4116 return_format = TRUE;
4117 break;
4118 case ERA_T_FMT:
4119 format = "%EX";
4120 return_format = TRUE;
4121 break;
4122 case ERA_D_T_FMT:
4123 format = "%Ec";
4124 return_format = TRUE;
4125 break;
4126 case ALT_DIGITS:
4127 format = "%Ow"; /* Find the alternate digit for 0 */
4128 break;
4129 }
f7416781 4130
909e71b1 4131 GCC_DIAG_RESTORE_STMT;
639a7c35 4132
dd8a7039
KW
4133 /* The year was deliberately chosen so that January 1 is on the
4134 * first day of the week. Since we're only getting one thing at a
4135 * time, it all works */
fffcb78f
KW
4136 const char * temp = my_strftime8(format, 30, 30, hour, mday, mon,
4137 2011, 0, 0, 0, &is_utf8);
07492de5 4138 retval = save_to_buffer(temp, retbufp, retbuf_sizep);
dd8a7039 4139 Safefree(temp);
f7416781 4140
07492de5
KW
4141 /* If the item is 'ALT_DIGITS', '*retbuf' contains the alternate
4142 * format for wday 0. If the value is the same as the normal 0,
4143 * there isn't an alternate, so clear the buffer.
dd8a7039
KW
4144 *
4145 * (wday was chosen because its range is all a single digit.
4146 * Things like tm_sec have two digits as the minimum: '00'.) */
07492de5 4147 if (item == ALT_DIGITS && strEQ(*retbufp, "0")) {
fc46dd0d
KW
4148 retval = "";
4149 break;
909e71b1 4150 }
f7416781 4151
909e71b1 4152 /* ALT_DIGITS is problematic. Experiments on it showed that
fe1c1494
KW
4153 * strftime() did not always work properly when going from alt-9 to
4154 * alt-10. Only a few locales have this item defined, and in all
4155 * of them on Linux that khw was able to find, nl_langinfo() merely
4156 * returned the alt-0 character, possibly doubled. Most Unicode
4157 * digits are in blocks of 10 consecutive code points, so that is
4158 * sufficient information for such scripts, as we can infer alt-1,
4159 * alt-2, .... But for a Japanese locale, a CJK ideographic 0 is
4160 * returned, and the CJK digits are not in code point order, so you
4161 * can't really infer anything. The localedef for this locale did
4162 * specify the succeeding digits, so that strftime() works properly
4163 * on them, without needing to infer anything. But the
4164 * nl_langinfo() return did not give sufficient information for the
4165 * caller to understand what's going on. So until there is
4166 * evidence that it should work differently, this returns the alt-0
dd8a7039 4167 * string for ALT_DIGITS. */
f7416781 4168
dd8a7039 4169 if (return_format) {
d1b150d9 4170
909e71b1
KW
4171 /* If to return the format, not the value, overwrite the buffer
4172 * with it. But some strftime()s will keep the original format
4173 * if illegal, so change those to "" */
07492de5 4174 if (strEQ(*retbufp, format)) {
909e71b1
KW
4175 retval = "";
4176 }
4177 else {
4178 retval = format;
4179 }
fffcb78f
KW
4180
4181 /* A format is always in ASCII */
4182 is_utf8 = UTF8NESS_IMMATERIAL;
dd8a7039 4183 }
f7416781 4184
dd8a7039
KW
4185 break;
4186 }
f7416781 4187
909e71b1 4188# endif
f7416781 4189
f206e331
KW
4190 case CODESET:
4191
f6e2ab67
KW
4192 /* The trivial case */
4193 if (isNAME_C_OR_POSIX(locale)) {
fc46dd0d
KW
4194 retval = C_codeset;
4195 break;
f6e2ab67 4196 }
f206e331 4197
f6e2ab67 4198# ifdef WIN32
f206e331 4199
f6e2ab67
KW
4200 /* This function retrieves the code page. It is subject to change, but
4201 * is documented and has been stable for many releases */
f4a34202
KW
4202 UINT ___lc_codepage_func(void);
4203
4204 retval = save_to_buffer(Perl_form(aTHX_ "%d", ___lc_codepage_func()),
4205 retbufp, retbuf_sizep);
f6e2ab67
KW
4206 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "locale='%s' cp=%s\n",
4207 locale, retval));
f4a34202
KW
4208 break;
4209
f6e2ab67 4210# else
f4a34202 4211
f6e2ab67
KW
4212 /* The codeset is important, but khw did not figure out a way for it to
4213 * be retrieved on non-Windows boxes without nl_langinfo(). But even
4214 * if we can't get it directly, we can usually determine if it is a
4215 * UTF-8 locale or not. If it is UTF-8, we (correctly) use that for
4216 * the code set. */
f206e331 4217
f6e2ab67
KW
4218# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4219
4220 /* If libc mbtowc() evaluates the bytes that form the REPLACEMENT
4221 * CHARACTER as that Unicode code point, this has to be a UTF-8 locale.
4222 * */
4223
4224 wchar_t wc = 0;
4225 (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0);/* Reset shift state */
4226 int mbtowc_ret = Perl_mbtowc_(aTHX_ &wc,
4227 STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
4228 if (mbtowc_ret >= 0 && wc == UNICODE_REPLACEMENT) {
58399f03
KW
4229 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4230 "mbtowc returned REPLACEMENT\n"));
fc46dd0d
KW
4231 retval = "UTF-8";
4232 break;
f6e2ab67 4233 }
f206e331 4234
f6e2ab67
KW
4235 /* Here, it isn't a UTF-8 locale. */
4236
58399f03
KW
4237# else /* mbtowc() is not available. */
4238
4239 /* Sling together several possibilities, depending on platform
4240 * capabilities and what we found.
4241 *
4242 * For non-English locales or non-dollar currency locales, we likely
4243 * will find out whether a locale is UTF-8 or not */
4244
4245 utf8ness_t is_utf8 = UTF8NESS_UNKNOWN;
4246 const char * scratch_buf = NULL;
4247
4248# if defined(USE_LOCALE_MONETARY) && defined(HAS_SOME_LOCALECONV)
4249
4250 /* Can't use this method unless localeconv() is available, as that's
4251 * the way we find out the currency symbol. */
4252
4253 /* First try looking at the currency symbol (via a recursive call) to
4254 * see if it disambiguates things. Often that will be in the native
4255 * script, and if the symbol isn't legal UTF-8, we know that the locale
4256 * isn't either. */
4257 (void) my_langinfo_c(CRNCYSTR, LC_MONETARY, locale, &scratch_buf, NULL,
4258 &is_utf8);
4259 Safefree(scratch_buf);
4260
f6e2ab67 4261# endif
58399f03
KW
4262# ifdef USE_LOCALE_TIME
4263
4264 /* If we have ruled out being UTF-8, no point in checking further. */
4265 if (is_utf8 != UTF8NESS_NO) {
4266
4267 /* But otherwise do check more. This is done even if the currency
4268 * symbol looks to be UTF-8, just in case that's a false positive.
4269 *
4270 * Look at the LC_TIME entries, like the names of the months or
4271 * weekdays. We quit at the first one that is illegal UTF-8 */
4272
4273 utf8ness_t this_is_utf8 = UTF8NESS_UNKNOWN;
4274 const int times[] = {
4275 DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7,
4276 MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8,
4277 MON_9, MON_10, MON_11, MON_12,
4278 ALT_DIGITS, AM_STR, PM_STR,
4279 ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6,
4280 ABDAY_7,
4281 ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6,
4282 ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12
4283 };
4284
4285 /* The code in the recursive call can handle switching the locales,
4286 * but by doing it here, we avoid switching each iteration of the
4287 * loop */
4288 const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
4289
4290 for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(times); i++) {
4291 scratch_buf = NULL;
4292 (void) my_langinfo_c(times[i], LC_TIME, locale, &scratch_buf,
4293 NULL, &this_is_utf8);
4294 Safefree(scratch_buf);
4295 if (this_is_utf8 == UTF8NESS_NO) {
4296 is_utf8 = UTF8NESS_NO;
4297 break;
4298 }
4299
4300 if (this_is_utf8 == UTF8NESS_YES) {
4301 is_utf8 = UTF8NESS_YES;
4302 }
4303 }
4304
4305 /* Here we have gone through all the LC_TIME elements. is_utf8 has
4306 * been set as follows:
4307 * UTF8NESS_NO If any aren't legal UTF-8
4308 * UTF8NESS_IMMMATERIAL If all are ASCII
4309 * UTF8NESS_YES If all are legal UTF-8 (including
4310 * ASCIIi), and at least one isn't
4311 * ASCII. */
4312
4313 restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
4314 }
f6e2ab67 4315
58399f03
KW
4316# endif /* LC_TIME */
4317
4318 /* If nothing examined above rules out it being UTF-8, and at least one
4319 * thing fits as UTF-8 (and not plain ASCII), assume the codeset is
4320 * UTF-8. */
4321 if (is_utf8 == UTF8NESS_YES) {
4322 retval = "UTF-8";
4323 break;
4324 }
4325
4326 /* Here, nothing examined indicates that the codeset is UTF-8. But
4327 * what is it? The other locale categories are not likely to be of
4328 * further help:
4329 *
4330 * LC_NUMERIC Only a few locales in the world have a non-ASCII radix
4331 * or group separator.
4332 * LC_CTYPE This code wouldn't be compiled if mbtowc() existed and
4333 * was reliable. This is unlikely in C99. There are
4334 * other functions that could be used instead, but are
4335 * they going to exist, and be able to distinguish between
4336 * UTF-8 and 8859-1? Deal with this only if it becomes
4337 * necessary.
4338 * LC_MESSAGES The strings returned from strerror() would seem likely
4339 * candidates, but experience has shown that many systems
4340 * don't actually have translations installed for them.
4341 * They are instead always in English, so everything in
4342 * them is ASCII, which is of no help to us. A Configure
4343 * probe could possibly be written to see if this platform
4344 * has non-ASCII error messages. But again, wait until it
4345 * turns out to be an actual problem. */
4346
4347# endif /* ! mbtowc() */
4348
4349 /* Rejoin the mbtowc available/not-available cases.
4350 *
4351 * We got here only because we haven't been able to find the codeset.
4352 * The only other option khw could think of is to see if the codeset is
4353 * part of the locale name. This is very less than ideal; often there
4354 * is no code set in the name; and at other times they even lie.
f6e2ab67
KW
4355 *
4356 * Find any dot in the locale name */
4357 retval = (const char *) strchr(locale, '.');
58399f03
KW
4358 if (! retval) {
4359 retval = ""; /* Alas, no dot */
4360 break;
4361 }
f206e331 4362
58399f03
KW
4363 /* Use everything past the dot */
4364 retval++;
f206e331 4365
f6e2ab67
KW
4366# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4367
58399f03
KW
4368 /* When these functions, are available, they were tried earlier and
4369 * indicated that the locale did not act like a proper UTF-8 one. So
4370 * if it claims to be UTF-8, it is a lie */
f6e2ab67
KW
4371 if (is_codeset_name_UTF8(retval)) {
4372 retval = "";
58399f03 4373 break;
f6e2ab67 4374 }
58399f03
KW
4375
4376# endif
4377
4378 /* Otherwise the code set name is considered to be everything past the
4379 * dot. */
4380 retval = save_to_buffer(retval, retbufp, retbuf_sizep);
f206e331
KW
4381
4382 break;
4383
f6e2ab67 4384# endif
f206e331 4385
f6e2ab67 4386 } /* Giant switch() of nl_langinfo() items */
f7416781 4387
fc46dd0d 4388 restore_toggled_locale_i(cat_index, orig_switched_locale);
332c5a2a
KW
4389
4390# ifdef USE_LOCALE_CTYPE
4391 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
4392# endif
4393
fffcb78f
KW
4394 if (utf8ness) {
4395 *utf8ness = is_utf8;
4396 }
4397
0b6697c5 4398 return retval;
f7416781 4399
f6e2ab67 4400# endif /* All the implementations of my_langinfo() */
58399f03 4401
fe1c1494 4402/*--------------------------------------------------------------------------*/
f6e2ab67
KW
4403
4404} /* my_langinfo() */
b385bb4d 4405
faef259e
KW
4406#endif /* USE_LOCALE */
4407
7b12ce91
KW
4408char *
4409Perl_my_strftime8(pTHX_ const char *fmt, int sec, int min, int hour, int mday,
4410 int mon, int year, int wday, int yday, int isdst,
4411 utf8ness_t * utf8ness)
4412{ /* Documented in util.c */
4413 char * retval = my_strftime(fmt, sec, min, hour, mday, mon, year, wday,
4414 yday, isdst);
4415
4416 PERL_ARGS_ASSERT_MY_STRFTIME8;
4417
4418 if (utf8ness) {
4419
4420#ifdef USE_LOCALE_TIME
4421 *utf8ness = get_locale_string_utf8ness_i(NULL, LC_TIME_INDEX_,
fffcb78f 4422 retval, LOCALE_UTF8NESS_UNKNOWN);
7b12ce91
KW
4423#else
4424 *utf8ness = UTF8NESS_IMMATERIAL;
4425#endif
4426
4427 }
4428
4429 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "fmt=%s, retval=%s", fmt,
4430 ((is_utf8_string((U8 *) retval, 0))
4431 ? retval
4432 :_byte_dump_string((U8 *) retval, strlen(retval), 0)));
4433 if (utf8ness) PerlIO_printf(Perl_debug_log, "; utf8ness=%d",
4434 (int) *utf8ness);
4435 PerlIO_printf(Perl_debug_log, "\n");
4436 );
4437
4438 return retval;
4439}
4440
98994639
HS
4441/*
4442 * Initialize locale awareness.
4443 */
4444int
4445Perl_init_i18nl10n(pTHX_ int printwarn)
4446{
0e92a118
KW
4447 /* printwarn is
4448 *
4449 * 0 if not to output warning when setup locale is bad
4450 * 1 if to output warning based on value of PERL_BADLANG
4451 * >1 if to output regardless of PERL_BADLANG
4452 *
4453 * returns
98994639 4454 * 1 = set ok or not applicable,
0e92a118
KW
4455 * 0 = fallback to a locale of lower priority
4456 * -1 = fallback to all locales failed, not even to the C locale
6b058d42
KW
4457 *
4458 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
4459 * set, debugging information is output.
4460 *
b85af0ab
KW
4461 * This looks more complicated than it is, mainly due to the #ifdefs and
4462 * error handling.
6b058d42 4463 *
b85af0ab 4464 * Besides some asserts, data structure initialization, and specific
fe1c1494
KW
4465 * platform complications, this routine is effectively represented by this
4466 * pseudo-code:
b85af0ab 4467 *
fe1c1494
KW
4468 * setlocale(LC_ALL, ""); x
4469 * foreach (subcategory) { x
4470 * curlocales[f(subcategory)] = setlocale(subcategory, NULL); x
4471 * } x
4472 * if (platform_so_requires) {
4473 * foreach (subcategory) {
4474 * PL_curlocales[f(subcategory)] = curlocales[f(subcategory)]
4475 * }
4476 * }
4477 * foreach (subcategory) {
4478 * if (needs_special_handling[f(subcategory)] &this_subcat_handler
4479 * }
b85af0ab 4480 *
fe1c1494
KW
4481 * This sets all the categories to the values in the current environment,
4482 * saves them temporarily in curlocales[] until they can be handled and/or
4483 * on some platforms saved in a per-thread array PL_curlocales[].
b85af0ab 4484 *
fe1c1494
KW
4485 * f(foo) is a mapping from the opaque system category numbers to small
4486 * non-negative integers used most everywhere in this file as indices into
4487 * arrays (such as curlocales[]) so the program doesn't have to otherwise
4488 * deal with the opaqueness.
b85af0ab 4489 *
fe1c1494
KW
4490 * If the platform doesn't have LC_ALL, the lines marked 'x' above are
4491 * effectively replaced by:
4492 * foreach (subcategory) { y
4493 * curlocales[f(subcategory)] = setlocale(subcategory, ""); y
4494 * } y
b85af0ab 4495 *
fe1c1494
KW
4496 * The only differences being the lack of an LC_ALL call, and using ""
4497 * instead of NULL in the setlocale calls.
b85af0ab 4498 *
fe1c1494 4499 * But there are, of course, complications.
b85af0ab 4500 *
fe1c1494
KW
4501 * it has to deal with if this is an embedded perl, whose locale doesn't
4502 * come from the environment, but has been set up by the caller. This is
4503 * pretty simply handled: the "" in the setlocale calls is not a string
4504 * constant, but a variable which is set to NULL in the embedded case.
b85af0ab 4505 *
fe1c1494
KW
4506 * But the major complication is handling failure and doing fallback. All
4507 * the code marked 'x' or 'y' above is actually enclosed in an outer loop,
4508 * using the array trial_locales[]. On entry, trial_locales[] is
4509 * initialized to just one entry, containing the NULL or "" locale argument
4510 * shown above. If, as is almost always the case, everything works, it
4511 * exits after just the one iteration, going on to the next step.
b85af0ab 4512 *
fe1c1494
KW
4513 * But if there is a failure, the code tries its best to honor the
4514 * environment as much as possible. It self-modifies trial_locales[] to
4515 * have more elements, one for each of the POSIX-specified settings from
4516 * the environment, such as LANG, ending in the ultimate fallback, the C
4517 * locale. Thus if there is something bogus with a higher priority
4518 * environment variable, it will try with the next highest, until something
4519 * works. If everything fails, it limps along with whatever state it got
4520 * to.
b85af0ab
KW
4521 *
4522 * A further complication is that Windows has an additional fallback, the
4523 * user-default ANSI code page obtained from the operating system. This is
4524 * added as yet another loop iteration, just before the final "C"
6b058d42 4525 *
1aa8e6dd
KW
4526 * A slight complication is that in embedded Perls, the locale may already
4527 * be set-up, and we don't want to get it from the normal environment
4528 * variables. This is handled by having a special environment variable
4529 * indicate we're in this situation. We simply set setlocale's 2nd
4530 * parameter to be a NULL instead of "". That indicates to setlocale that
4531 * it is not to change anything, but to return the current value,
4532 * effectively initializing perl's db to what the locale already is.
4533 *
4534 * We play the same trick with NULL if a LC_ALL succeeds. We call
4535 * setlocale() on the individual categores with NULL to get their existing
4536 * values for our db, instead of trying to change them.
4537 * */
1565c085 4538
0e92a118
KW
4539 int ok = 1;
4540
7d4bcc4a
KW
4541#ifndef USE_LOCALE
4542
4543 PERL_UNUSED_ARG(printwarn);
4544
4545#else /* USE_LOCALE */
7d4bcc4a
KW
4546# ifdef __GLIBC__
4547
24f3e849 4548 const char * const language = PerlEnv_getenv("LANGUAGE");
7d4bcc4a
KW
4549
4550# endif
65ebb059 4551
ccd65d51
KW
4552 /* NULL uses the existing already set up locale */
4553 const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
4554 ? NULL
4555 : "";
938a8fef 4556 typedef struct trial_locales_struct_s {
e57aaf75 4557 const char* trial_locale;
6e1b5a64
B
4558 const char* fallback_desc;
4559 const char* fallback_name;
e57aaf75 4560 } trial_locales_struct;
4fe10b51 4561 /* 5 = 1 each for "", LC_ALL, LANG, (Win32) system default locale, C */
938a8fef 4562 trial_locales_struct trial_locales[5];
c3fcd832 4563 unsigned int trial_locales_count;
24f3e849
KW
4564 const char * const lc_all = PerlEnv_getenv("LC_ALL");
4565 const char * const lang = PerlEnv_getenv("LANG");
98994639 4566 bool setlocale_failure = FALSE;
65ebb059 4567 unsigned int i;
175c4cf9
KW
4568
4569 /* A later getenv() could zap this, so only use here */
4570 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
4571
4572 const bool locwarn = (printwarn > 1
e5f10d49
KW
4573 || ( printwarn
4574 && ( ! bad_lang_use_once
22ff3130 4575 || (
e5f10d49
KW
4576 /* disallow with "" or "0" */
4577 *bad_lang_use_once
4578 && strNE("0", bad_lang_use_once)))));
ea92aad8 4579
291a84fb
KW
4580 /* current locale for given category; should have been copied so aren't
4581 * volatile */
8de4332b 4582 const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
291a84fb 4583
948523db
KW
4584# ifndef DEBUGGING
4585# define DEBUG_LOCALE_INIT(a,b,c)
4586# else
7d4bcc4a 4587
8298454c 4588 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
7d4bcc4a 4589
d188fe6d 4590# define DEBUG_LOCALE_INIT(cat_index, locale, result) \
b26541a9 4591 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n", \
d188fe6d 4592 setlocale_debug_string_i(cat_index, locale, result)));
2fcc0ca9 4593
948523db
KW
4594/* Make sure the parallel arrays are properly set up */
4595# ifdef USE_LOCALE_NUMERIC
4fd76d49
KW
4596 assert(categories[LC_NUMERIC_INDEX_] == LC_NUMERIC);
4597 assert(strEQ(category_names[LC_NUMERIC_INDEX_], "LC_NUMERIC"));
e9bc6d6b 4598# ifdef USE_POSIX_2008_LOCALE
4fd76d49 4599 assert(category_masks[LC_NUMERIC_INDEX_] == LC_NUMERIC_MASK);
e9bc6d6b 4600# endif
948523db
KW
4601# endif
4602# ifdef USE_LOCALE_CTYPE
4fd76d49
KW
4603 assert(categories[LC_CTYPE_INDEX_] == LC_CTYPE);
4604 assert(strEQ(category_names[LC_CTYPE_INDEX_], "LC_CTYPE"));
e9bc6d6b 4605# ifdef USE_POSIX_2008_LOCALE
4fd76d49 4606 assert(category_masks[LC_CTYPE_INDEX_] == LC_CTYPE_MASK);
e9bc6d6b 4607# endif
948523db
KW
4608# endif
4609# ifdef USE_LOCALE_COLLATE
4fd76d49
KW
4610 assert(categories[LC_COLLATE_INDEX_] == LC_COLLATE);
4611 assert(strEQ(category_names[LC_COLLATE_INDEX_], "LC_COLLATE"));
e9bc6d6b 4612# ifdef USE_POSIX_2008_LOCALE
4fd76d49 4613 assert(category_masks[LC_COLLATE_INDEX_] == LC_COLLATE_MASK);
e9bc6d6b 4614# endif
948523db
KW
4615# endif
4616# ifdef USE_LOCALE_TIME
4fd76d49
KW
4617 assert(categories[LC_TIME_INDEX_] == LC_TIME);
4618 assert(strEQ(category_names[LC_TIME_INDEX_], "LC_TIME"));
e9bc6d6b 4619# ifdef USE_POSIX_2008_LOCALE
4fd76d49 4620 assert(category_masks[LC_TIME_INDEX_] == LC_TIME_MASK);
e9bc6d6b 4621# endif
948523db
KW
4622# endif
4623# ifdef USE_LOCALE_MESSAGES
4fd76d49
KW
4624 assert(categories[LC_MESSAGES_INDEX_] == LC_MESSAGES);
4625 assert(strEQ(category_names[LC_MESSAGES_INDEX_], "LC_MESSAGES"));
e9bc6d6b 4626# ifdef USE_POSIX_2008_LOCALE
4fd76d49 4627 assert(category_masks[LC_MESSAGES_INDEX_] == LC_MESSAGES_MASK);
e9bc6d6b 4628# endif
948523db
KW
4629# endif
4630# ifdef USE_LOCALE_MONETARY
4fd76d49
KW
4631 assert(categories[LC_MONETARY_INDEX_] == LC_MONETARY);
4632 assert(strEQ(category_names[LC_MONETARY_INDEX_], "LC_MONETARY"));
e9bc6d6b 4633# ifdef USE_POSIX_2008_LOCALE
4fd76d49 4634 assert(category_masks[LC_MONETARY_INDEX_] == LC_MONETARY_MASK);
e9bc6d6b 4635# endif
948523db 4636# endif
9821811f 4637# ifdef USE_LOCALE_ADDRESS
4fd76d49
KW
4638 assert(categories[LC_ADDRESS_INDEX_] == LC_ADDRESS);
4639 assert(strEQ(category_names[LC_ADDRESS_INDEX_], "LC_ADDRESS"));
e9bc6d6b 4640# ifdef USE_POSIX_2008_LOCALE
4fd76d49 4641 assert(category_masks[LC_ADDRESS_INDEX_] == LC_ADDRESS_MASK);
e9bc6d6b 4642# endif
9821811f
KW
4643# endif
4644# ifdef USE_LOCALE_IDENTIFICATION
4fd76d49
KW
4645 assert(categories[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION);
4646 assert(strEQ(category_names[LC_IDENTIFICATION_INDEX_], "LC_IDENTIFICATION"));
e9bc6d6b 4647# ifdef USE_POSIX_2008_LOCALE
4fd76d49 4648 assert(category_masks[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION_MASK);
e9bc6d6b 4649# endif
9821811f
KW
4650# endif
4651# ifdef USE_LOCALE_MEASUREMENT
4fd76d49
KW
4652 assert(categories[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT);
4653 assert(strEQ(category_names[LC_MEASUREMENT_INDEX_], "LC_MEASUREMENT"));
e9bc6d6b 4654# ifdef USE_POSIX_2008_LOCALE
4fd76d49 4655 assert(category_masks[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT_MASK);
e9bc6d6b 4656# endif
9821811f
KW
4657# endif
4658# ifdef USE_LOCALE_PAPER
4fd76d49
KW
4659 assert(categories[LC_PAPER_INDEX_] == LC_PAPER);
4660 assert(strEQ(category_names[LC_PAPER_INDEX_], "LC_PAPER"));
e9bc6d6b 4661# ifdef USE_POSIX_2008_LOCALE
4fd76d49 4662 assert(category_masks[LC_PAPER_INDEX_] == LC_PAPER_MASK);
e9bc6d6b 4663# endif
9821811f
KW
4664# endif
4665# ifdef USE_LOCALE_TELEPHONE
4fd76d49
KW
4666 assert(categories[LC_TELEPHONE_INDEX_] == LC_TELEPHONE);
4667 assert(strEQ(category_names[LC_TELEPHONE_INDEX_], "LC_TELEPHONE"));
e9bc6d6b 4668# ifdef USE_POSIX_2008_LOCALE
4fd76d49 4669 assert(category_masks[LC_TELEPHONE_INDEX_] == LC_TELEPHONE_MASK);
e9bc6d6b 4670# endif
9821811f 4671# endif
36dbc955 4672# ifdef USE_LOCALE_SYNTAX
4fd76d49
KW
4673 assert(categories[LC_SYNTAX_INDEX_] == LC_SYNTAX);
4674 assert(strEQ(category_names[LC_SYNTAX_INDEX_], "LC_SYNTAX"));
36dbc955 4675# ifdef USE_POSIX_2008_LOCALE
4fd76d49 4676 assert(category_masks[LC_SYNTAX_INDEX_] == LC_SYNTAX_MASK);
36dbc955
KW
4677# endif
4678# endif
4679# ifdef USE_LOCALE_TOD
4fd76d49
KW
4680 assert(categories[LC_TOD_INDEX_] == LC_TOD);
4681 assert(strEQ(category_names[LC_TOD_INDEX_], "LC_TOD"));
36dbc955 4682# ifdef USE_POSIX_2008_LOCALE
4fd76d49 4683 assert(category_masks[LC_TOD_INDEX_] == LC_TOD_MASK);
36dbc955
KW
4684# endif
4685# endif
948523db 4686# ifdef LC_ALL
4fd76d49
KW
4687 assert(categories[LC_ALL_INDEX_] == LC_ALL);
4688 assert(strEQ(category_names[LC_ALL_INDEX_], "LC_ALL"));
b9cb1acb 4689 STATIC_ASSERT_STMT(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX_);
e9bc6d6b 4690# ifdef USE_POSIX_2008_LOCALE
4fd76d49 4691 assert(category_masks[LC_ALL_INDEX_] == LC_ALL_MASK);
e9bc6d6b 4692# endif
948523db
KW
4693# endif
4694# endif /* DEBUGGING */
47280b20 4695
5a6637f0
KW
4696 /* Initialize the per-thread mbrFOO() state variables. See POSIX.xs for
4697 * why these particular incantations are used. */
fe1c1494 4698# ifdef HAS_MBRLEN
d2c9cb53 4699 memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
fe1c1494
KW
4700# endif
4701# ifdef HAS_MBRTOWC
5a6637f0 4702 memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
fe1c1494
KW
4703# endif
4704# ifdef HAS_WCTOMBR
5a6637f0 4705 wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
fe1c1494 4706# endif
d2c9cb53 4707
47280b20
KW
4708 /* Initialize the cache of the program's UTF-8ness for the always known
4709 * locales C and POSIX */
4710 my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness,
4711 sizeof(PL_locale_utf8ness));
4712
8a2e41ff
HS
4713 /* See https://github.com/Perl/perl5/issues/17824 */
4714 Zero(curlocales, NOMINAL_LC_ALL_INDEX, char *);
4715
e9bc6d6b
KW
4716# ifdef USE_THREAD_SAFE_LOCALE
4717# ifdef WIN32
4718
4719 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
4720
4721# endif
4722# endif
39e69e77 4723# ifdef USE_POSIX_2008_LOCALE
e9bc6d6b
KW
4724
4725 PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0);
4726 if (! PL_C_locale_obj) {
20ad7b23
KW
4727 locale_panic_(Perl_form(aTHX_
4728 "Cannot create POSIX 2008 C locale object"));
e9bc6d6b 4729 }
e9bc6d6b 4730
b26541a9
KW
4731 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "created C object %p\n",
4732 PL_C_locale_obj));
e9bc6d6b 4733# endif
78f7c09f
KW
4734# ifdef USE_LOCALE_NUMERIC
4735
2fbc73dd 4736 PL_numeric_radix_sv = newSVpvn(C_decimal_point, strlen(C_decimal_point));
7b9f2e4d
KW
4737 Newx(PL_numeric_name, 2, char);
4738 Copy("C", PL_numeric_name, 2, char);
4739
4740# endif
4741# ifdef USE_LOCALE_COLLATE
4742
4743 Newx(PL_collation_name, 2, char);
4744 Copy("C", PL_collation_name, 2, char);
3ca88433 4745
78f7c09f 4746# endif
434e0069 4747# ifdef USE_PL_CURLOCALES
e9bc6d6b
KW
4748
4749 /* Initialize our records. If we have POSIX 2008, we have LC_ALL */
3980cddb 4750 void_setlocale_c(LC_ALL, porcelain_setlocale(LC_ALL, NULL));
e9bc6d6b
KW
4751
4752# endif
98994639 4753
65ebb059 4754 /* We try each locale in the list until we get one that works, or exhaust
20a240df
KW
4755 * the list. Normally the loop is executed just once. But if setting the
4756 * locale fails, inside the loop we add fallback trials to the array and so
4757 * will execute the loop multiple times */
e57aaf75
B
4758 trial_locales[0] = (trial_locales_struct) {
4759 .trial_locale = setlocale_init,
6e1b5a64
B
4760 .fallback_desc = NULL,
4761 .fallback_name = NULL,
e57aaf75 4762 };
c3fcd832 4763 trial_locales_count = 1;
7d4bcc4a 4764
65ebb059 4765 for (i= 0; i < trial_locales_count; i++) {
e57aaf75 4766 const char * trial_locale = trial_locales[i].trial_locale;
503bf960 4767 setlocale_failure = FALSE;
65ebb059 4768
7d4bcc4a
KW
4769# ifdef LC_ALL
4770
c117e8b0
KW
4771 /* setlocale() return vals; not copied so must be looked at
4772 * immediately. */
4773 const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
a7ff7aca 4774 sl_result[LC_ALL_INDEX_] = stdized_setlocale(LC_ALL, trial_locale);
d188fe6d 4775 DEBUG_LOCALE_INIT(LC_ALL_INDEX_, trial_locale, sl_result[LC_ALL_INDEX_]);
4fd76d49 4776 if (! sl_result[LC_ALL_INDEX_]) {
49c85077 4777 setlocale_failure = TRUE;
7cd8b568
KW
4778 }
4779 else {
4780 /* Since LC_ALL succeeded, it should have changed all the other
4781 * categories it can to its value; so we massage things so that the
4782 * setlocales below just return their category's current values.
4783 * This adequately handles the case in NetBSD where LC_COLLATE may
4784 * not be defined for a locale, and setting it individually will
7d4bcc4a 4785 * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
7cd8b568
KW
4786 * the POSIX locale. */
4787 trial_locale = NULL;
4788 }
7d4bcc4a
KW
4789
4790# endif /* LC_ALL */
98994639 4791
e5f10d49
KW
4792 if (! setlocale_failure) {
4793 unsigned int j;
4794 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
cf59ced9 4795 Safefree(curlocales[j]);
a7ff7aca 4796 curlocales[j] = stdized_setlocale(categories[j], trial_locale);
e5f10d49
KW
4797 if (! curlocales[j]) {
4798 setlocale_failure = TRUE;
4799 }
a683a66a 4800 curlocales[j] = savepv(curlocales[j]);
d188fe6d 4801 DEBUG_LOCALE_INIT(j, trial_locale, curlocales[j]);
e5f10d49 4802 }
c835d6be 4803
f6665bcd 4804 if (LIKELY(! setlocale_failure)) { /* All succeeded */
e5f10d49 4805 break; /* Exit trial_locales loop */
49c85077 4806 }
65ebb059 4807 }
98994639 4808
49c85077
KW
4809 /* Here, something failed; will need to try a fallback. */
4810 ok = 0;
65ebb059 4811
49c85077
KW
4812 if (i == 0) {
4813 unsigned int j;
98994639 4814
65ebb059 4815 if (locwarn) { /* Output failure info only on the first one */
7d4bcc4a
KW
4816
4817# ifdef LC_ALL
98994639 4818
49c85077
KW
4819 PerlIO_printf(Perl_error_log,
4820 "perl: warning: Setting locale failed.\n");
98994639 4821
7d4bcc4a 4822# else /* !LC_ALL */
98994639 4823
49c85077 4824 PerlIO_printf(Perl_error_log,
71ce8c74 4825 "perl: warning: Setting locale failed for the categories:\n");
7d4bcc4a 4826
e5f10d49
KW
4827 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
4828 if (! curlocales[j]) {
71ce8c74 4829 PerlIO_printf(Perl_error_log, "\t%s\n", category_names[j]);
e5f10d49 4830 }
e5f10d49 4831 }
7d4bcc4a 4832
7d4bcc4a 4833# endif /* LC_ALL */
98994639 4834
49c85077
KW
4835 PerlIO_printf(Perl_error_log,
4836 "perl: warning: Please check that your locale settings:\n");
98994639 4837
7d4bcc4a
KW
4838# ifdef __GLIBC__
4839
49c85077
KW
4840 PerlIO_printf(Perl_error_log,
4841 "\tLANGUAGE = %c%s%c,\n",
4842 language ? '"' : '(',
4843 language ? language : "unset",
4844 language ? '"' : ')');
7d4bcc4a 4845# endif
98994639 4846
49c85077
KW
4847 PerlIO_printf(Perl_error_log,
4848 "\tLC_ALL = %c%s%c,\n",
4849 lc_all ? '"' : '(',
4850 lc_all ? lc_all : "unset",
4851 lc_all ? '"' : ')');
98994639 4852
7d4bcc4a
KW
4853# if defined(USE_ENVIRON_ARRAY)
4854
49c85077 4855 {
cd999af9 4856 char **e;
d5e32b93
KW
4857
4858 /* Look through the environment for any variables of the
4859 * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
4860 * already handled above. These are assumed to be locale
4861 * settings. Output them and their values. */
cd999af9 4862 for (e = environ; *e; e++) {
d5e32b93
KW
4863 const STRLEN prefix_len = sizeof("LC_") - 1;
4864 STRLEN uppers_len;
4865
cd999af9 4866 if ( strBEGINs(*e, "LC_")
c8b388b0 4867 && ! strBEGINs(*e, "LC_ALL=")
d5e32b93
KW
4868 && (uppers_len = strspn(*e + prefix_len,
4869 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
4870 && ((*e)[prefix_len + uppers_len] == '='))
cd999af9
KW
4871 {
4872 PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
d5e32b93
KW
4873 (int) (prefix_len + uppers_len), *e,
4874 *e + prefix_len + uppers_len + 1);
cd999af9
KW
4875 }
4876 }
49c85077 4877 }
7d4bcc4a
KW
4878
4879# else
4880
49c85077
KW
4881 PerlIO_printf(Perl_error_log,
4882 "\t(possibly more locale environment variables)\n");
7d4bcc4a
KW
4883
4884# endif
98994639 4885
49c85077
KW
4886 PerlIO_printf(Perl_error_log,
4887 "\tLANG = %c%s%c\n",
4888 lang ? '"' : '(',
4889 lang ? lang : "unset",
4890 lang ? '"' : ')');
98994639 4891
49c85077
KW
4892 PerlIO_printf(Perl_error_log,
4893 " are supported and installed on your system.\n");
4894 }
98994639 4895
65ebb059 4896 /* Calculate what fallback locales to try. We have avoided this
f6bab5f6 4897 * until we have to, because failure is quite unlikely. This will
65ebb059
KW
4898 * usually change the upper bound of the loop we are in.
4899 *
4900 * Since the system's default way of setting the locale has not
4901 * found one that works, We use Perl's defined ordering: LC_ALL,
4902 * LANG, and the C locale. We don't try the same locale twice, so
4903 * don't add to the list if already there. (On POSIX systems, the
4904 * LC_ALL element will likely be a repeat of the 0th element "",
6b058d42
KW
4905 * but there's no harm done by doing it explicitly.
4906 *
4907 * Note that this tries the LC_ALL environment variable even on
4908 * systems which have no LC_ALL locale setting. This may or may
4909 * not have been originally intentional, but there's no real need
4910 * to change the behavior. */
65ebb059
KW
4911 if (lc_all) {
4912 for (j = 0; j < trial_locales_count; j++) {
e57aaf75 4913 if (strEQ(lc_all, trial_locales[j].trial_locale)) {
65ebb059
KW
4914 goto done_lc_all;
4915 }
4916 }
e57aaf75
B
4917 trial_locales[trial_locales_count++] = (trial_locales_struct) {
4918 .trial_locale = lc_all,
6e1b5a64
B
4919 .fallback_desc = (strEQ(lc_all, "C")
4920 ? "the standard locale"
4921 : "a fallback locale"),
4922 .fallback_name = lc_all,
e57aaf75 4923 };
65ebb059
KW
4924 }
4925 done_lc_all:
98994639 4926
65ebb059
KW
4927 if (lang) {
4928 for (j = 0; j < trial_locales_count; j++) {
e57aaf75 4929 if (strEQ(lang, trial_locales[j].trial_locale)) {
65ebb059
KW
4930 goto done_lang;
4931 }
4932 }
e57aaf75
B
4933 trial_locales[trial_locales_count++] = (trial_locales_struct) {
4934 .trial_locale = lang,
6e1b5a64
B
4935 .fallback_desc = (strEQ(lang, "C")
4936 ? "the standard locale"
4937 : "a fallback locale"),
4938 .fallback_name = lang,
e57aaf75 4939 };
65ebb059
KW
4940 }
4941 done_lang:
4942
7d4bcc4a
KW
4943# if defined(WIN32) && defined(LC_ALL)
4944
65ebb059
KW
4945 /* For Windows, we also try the system default locale before "C".
4946 * (If there exists a Windows without LC_ALL we skip this because
4947 * it gets too complicated. For those, the "C" is the next
4fe10b51
B
4948 * fallback possibility). */
4949 {
4950 /* Note that this may change the locale, but we are going to do
c3b01b79
KW
4951 * that anyway.
4952 *
4953 * Our normal Windows setlocale() implementation ignores the
4954 * system default locale to make things work like POSIX. This
4955 * is the only place where we want to consider it, so have to
4956 * use wrap_wsetlocale(). */
4957 const char *system_default_locale =
4958 stdize_locale(LC_ALL,
4959 S_wrap_wsetlocale(aTHX_ LC_ALL, ""),
4960 &PL_stdize_locale_buf,
4961 &PL_stdize_locale_bufsize,
4962 __LINE__);
4fe10b51
B
4963 DEBUG_LOCALE_INIT(LC_ALL_INDEX_, "", system_default_locale);
4964
4965 /* Skip if invalid or if it's already on the list of locales to
4966 * try */
4967 if (! system_default_locale) {
4968 goto done_system_default;
4969 }
4970 for (j = 0; j < trial_locales_count; j++) {
4971 if (strEQ(system_default_locale, trial_locales[j].trial_locale)) {
4972 goto done_system_default;
4973 }
4974 }
4975
4976 trial_locales[trial_locales_count++] = (trial_locales_struct) {
4977 .trial_locale = system_default_locale,
4978 .fallback_desc = (strEQ(system_default_locale, "C")
4979 ? "the standard locale"
4980 : "the system default locale"),
4981 .fallback_name = system_default_locale,
4982 };
4983 }
4984 done_system_default:
7d4bcc4a
KW
4985
4986# endif
65ebb059
KW
4987
4988 for (j = 0; j < trial_locales_count; j++) {
e57aaf75 4989 if (strEQ("C", trial_locales[j].trial_locale)) {
65ebb059
KW
4990 goto done_C;
4991 }
4992 }
e57aaf75
B
4993 trial_locales[trial_locales_count++] = (trial_locales_struct) {
4994 .trial_locale = "C",
6e1b5a64
B
4995 .fallback_desc = "the standard locale",
4996 .fallback_name = "C",
e57aaf75 4997 };
98994639 4998
65ebb059
KW
4999 done_C: ;
5000 } /* end of first time through the loop */
98994639 5001
7d4bcc4a
KW
5002# ifdef WIN32
5003
65ebb059 5004 next_iteration: ;
7d4bcc4a
KW
5005
5006# endif
65ebb059
KW
5007
5008 } /* end of looping through the trial locales */
5009
5010 if (ok < 1) { /* If we tried to fallback */
5011 const char* msg;
5012 if (! setlocale_failure) { /* fallback succeeded */
5013 msg = "Falling back to";
5014 }
5015 else { /* fallback failed */
e5f10d49 5016 unsigned int j;
98994639 5017
65ebb059
KW
5018 /* We dropped off the end of the loop, so have to decrement i to
5019 * get back to the value the last time through */
5020 i--;
98994639 5021
65ebb059
KW
5022 ok = -1;
5023 msg = "Failed to fall back to";
5024
5025 /* To continue, we should use whatever values we've got */
7d4bcc4a 5026
e5f10d49
KW
5027 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
5028 Safefree(curlocales[j]);
a7ff7aca 5029 curlocales[j] = savepv(stdized_setlocale(categories[j], NULL));
d188fe6d 5030 DEBUG_LOCALE_INIT(j, NULL, curlocales[j]);
e5f10d49 5031 }
65ebb059
KW
5032 }
5033
5034 if (locwarn) {
6e1b5a64
B
5035 const char * description = trial_locales[i].fallback_desc;
5036 const char * name = trial_locales[i].fallback_name;
7d4bcc4a 5037
65ebb059
KW
5038 if (name && strNE(name, "")) {
5039 PerlIO_printf(Perl_error_log,
5040 "perl: warning: %s %s (\"%s\").\n", msg, description, name);
5041 }
5042 else {
5043 PerlIO_printf(Perl_error_log,
5044 "perl: warning: %s %s.\n", msg, description);
5045 }
5046 }
5047 } /* End of tried to fallback */
98994639 5048
a7ff7aca
KW
5049# ifdef USE_POSIX_2008_LOCALE
5050
5051 /* The stdized setlocales haven't affected the P2008 locales. Initialize
11d8dd19
KW
5052 * them now, calculating LC_ALL only on the final go round, when all have
5053 * been set. */
a7ff7aca 5054 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
20be777a
KW
5055 (void) emulate_setlocale_i(i, curlocales[i],
5056 RECALCULATE_LC_ALL_ON_FINAL_INTERATION,
5057 __LINE__);
a7ff7aca
KW
5058 }
5059
5060# endif
5061
4e10e6a0 5062 /* Done with finding the locales; update the auxiliary records */
c7dc1ea8 5063 new_LC_ALL(NULL);
e5f10d49 5064
948523db 5065 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
47280b20 5066
8751a11f 5067# if defined(USE_LOCALE_THREADS) && ! defined(USE_THREAD_SAFE_LOCALE)
47280b20
KW
5068
5069 /* This caches whether each category's locale is UTF-8 or not. This
5070 * may involve changing the locale. It is ok to do this at
e9bc6d6b
KW
5071 * initialization time before any threads have started, but not later
5072 * unless thread-safe operations are used.
47280b20
KW
5073 * Caching means that if the program heeds our dictate not to change
5074 * locales in threaded applications, this data will remain valid, and
9de6fe47
KW
5075 * it may get queried without having to change locales. If the
5076 * environment is such that all categories have the same locale, this
5077 * isn't needed, as the code will not change the locale; but this
5078 * handles the uncommon case where the environment has disparate
5079 * locales for the categories */
47280b20
KW
5080 (void) _is_cur_LC_category_utf8(categories[i]);
5081
5082# endif
5083
e5f10d49
KW
5084 Safefree(curlocales[i]);
5085 }
b310b053 5086
7d4bcc4a
KW
5087# if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
5088
49c85077 5089 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
50bf02bd
KW
5090 * locale is UTF-8. The call to new_ctype() just above has already
5091 * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
5092 * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
5093 * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
5094 * STDIN, STDOUT, STDERR, _and_ the default open discipline. */
5095 PL_utf8locale = PL_in_utf8_CTYPE_locale;
49c85077 5096
a05d7ebb 5097 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
fde18df1
JH
5098 This is an alternative to using the -C command line switch
5099 (the -C if present will override this). */
5100 {
1604cfb0
MS
5101 const char *p = PerlEnv_getenv("PERL_UNICODE");
5102 PL_unicode = p ? parse_unicode_opts(&p) : 0;
5103 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
5104 PL_utf8cache = -1;
b310b053
JH
5105 }
5106
7d4bcc4a 5107# endif
e3305790 5108#endif /* USE_LOCALE */
7d4bcc4a 5109
2fcc0ca9 5110 /* So won't continue to output stuff */
27cdc72e 5111 DEBUG_INITIALIZATION_set(FALSE);
7d4bcc4a 5112
98994639
HS
5113 return ok;
5114}
5115
98994639
HS
5116#ifdef USE_LOCALE_COLLATE
5117
a4a439fb 5118char *
a4a439fb
KW
5119Perl__mem_collxfrm(pTHX_ const char *input_string,
5120 STRLEN len, /* Length of 'input_string' */
5121 STRLEN *xlen, /* Set to length of returned string
5122 (not including the collation index
5123 prefix) */
5124 bool utf8 /* Is the input in UTF-8? */
6696cfa7 5125 )
98994639 5126{
fe1c1494
KW
5127 /* _mem_collxfrm() is like strxfrm() but with two important differences.
5128 * First, it handles embedded NULs. Second, it allocates a bit more memory
5129 * than needed for the transformed data itself. The real transformed data
5130 * begins at offset COLLXFRM_HDR_LEN. *xlen is set to the length of that,
5131 * and doesn't include the collation index size.
5132 *
5133 * It is the caller's responsibility to eventually free the memory returned
5134 * by this function.
5135 *
a4a439fb
KW
5136 * Please see sv_collxfrm() to see how this is used. */
5137
fe1c1494 5138# define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
55e5378d 5139
6696cfa7
KW
5140 char * s = (char *) input_string;
5141 STRLEN s_strlen = strlen(input_string);
79f120c8 5142 char *xbuf = NULL;
55e5378d 5143 STRLEN xAlloc; /* xalloc is a reserved word in VC */
17f41037 5144 STRLEN length_in_chars;
c664130f 5145 bool first_time = TRUE; /* Cleared after first loop iteration */
98994639 5146
a4a439fb
KW
5147 PERL_ARGS_ASSERT__MEM_COLLXFRM;
5148
5149 /* Must be NUL-terminated */
5150 assert(*(input_string + len) == '\0');
7918f24d 5151
79f120c8
KW
5152 /* If this locale has defective collation, skip */
5153 if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
c7202dee
KW
5154 DEBUG_L(PerlIO_printf(Perl_debug_log,
5155 "_mem_collxfrm: locale's collation is defective\n"));
79f120c8
KW
5156 goto bad;
5157 }
5158
6696cfa7
KW
5159 /* Replace any embedded NULs with the control that sorts before any others.
5160 * This will give as good as possible results on strings that don't
5161 * otherwise contain that character, but otherwise there may be
5162 * less-than-perfect results with that character and NUL. This is
fdc080f3 5163 * unavoidable unless we replace strxfrm with our own implementation. */
fd43f63c
KW
5164 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
5165 NUL */
6696cfa7
KW
5166 char * e = s + len;
5167 char * sans_nuls;
fdc080f3 5168 STRLEN sans_nuls_len;
94762aa0 5169 int try_non_controls;
afc4976f
KW
5170 char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
5171 making sure 2nd byte is NUL.
5172 */
5173 STRLEN this_replacement_len;
5174
1e4c9676
KW
5175 /* If we don't know what non-NUL control character sorts lowest for
5176 * this locale, find it */
f28f4d2a 5177 if (PL_strxfrm_NUL_replacement == '\0') {
6696cfa7 5178 int j;
afc4976f 5179 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
6696cfa7
KW
5180 includes the collation index
5181 prefixed. */
5182
91c0e2e0 5183 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
94762aa0
KW
5184
5185 /* Unlikely, but it may be that no control will work to replace
1e4c9676
KW
5186 * NUL, in which case we instead look for any character. Controls
5187 * are preferred because collation order is, in general, context
5188 * sensitive, with adjoining characters affecting the order, and
5189 * controls are less likely to have such interactions, allowing the
5190 * NUL-replacement to stand on its own. (Another way to look at it
5191 * is to imagine what would happen if the NUL were replaced by a
5192 * combining character; it wouldn't work out all that well.) */
94762aa0
KW
5193 for (try_non_controls = 0;
5194 try_non_controls < 2;
5195 try_non_controls++)
5196 {
d4ff9586
KW
5197 /* Look through all legal code points (NUL isn't) */
5198 for (j = 1; j < 256; j++) {
5199 char * x; /* j's xfrm plus collation index */
5200 STRLEN x_len; /* length of 'x' */
5201 STRLEN trial_len = 1;
736a4fed 5202 char cur_source[] = { '\0', '\0' };
d4ff9586 5203
736a4fed
KW
5204 /* Skip non-controls the first time through the loop. The
5205 * controls in a UTF-8 locale are the L1 ones */
afc4976f
KW
5206 if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
5207 ? ! isCNTRL_L1(j)
5208 : ! isCNTRL_LC(j))
5209 {
d4ff9586 5210 continue;
6696cfa7 5211 }
6696cfa7 5212
736a4fed
KW
5213 /* Create a 1-char string of the current code point */
5214 cur_source[0] = (char) j;
5215
d4ff9586
KW
5216 /* Then transform it */
5217 x = _mem_collxfrm(cur_source, trial_len, &x_len,
afc4976f 5218 0 /* The string is not in UTF-8 */);
6696cfa7 5219
1e4c9676 5220 /* Ignore any character that didn't successfully transform.
d4ff9586
KW
5221 * */
5222 if (! x) {
5223 continue;
5224 }
6696cfa7 5225
d4ff9586
KW
5226 /* If this character's transformation is lower than
5227 * the current lowest, this one becomes the lowest */
5228 if ( cur_min_x == NULL
5229 || strLT(x + COLLXFRM_HDR_LEN,
5230 cur_min_x + COLLXFRM_HDR_LEN))
5231 {
f28f4d2a 5232 PL_strxfrm_NUL_replacement = j;
62d66863 5233 Safefree(cur_min_x);
d4ff9586 5234 cur_min_x = x;
d4ff9586
KW
5235 }
5236 else {
5237 Safefree(x);
5238 }
1e4c9676 5239 } /* end of loop through all 255 characters */
6696cfa7 5240
1e4c9676 5241 /* Stop looking if found */
94762aa0
KW
5242 if (cur_min_x) {
5243 break;
5244 }
5245
5246 /* Unlikely, but possible, if there aren't any controls that
5247 * work in the locale, repeat the loop, looking for any
5248 * character that works */
5249 DEBUG_L(PerlIO_printf(Perl_debug_log,
5250 "_mem_collxfrm: No control worked. Trying non-controls\n"));
1e4c9676 5251 } /* End of loop to try first the controls, then any char */
6696cfa7 5252
94762aa0
KW
5253 if (! cur_min_x) {
5254 DEBUG_L(PerlIO_printf(Perl_debug_log,
5255 "_mem_collxfrm: Couldn't find any character to replace"
5256 " embedded NULs in locale %s with", PL_collation_name));
5257 goto bad;
58eebef2
KW
5258 }
5259
94762aa0
KW
5260 DEBUG_L(PerlIO_printf(Perl_debug_log,
5261 "_mem_collxfrm: Replacing embedded NULs in locale %s with "
f28f4d2a 5262 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
94762aa0 5263
6696cfa7 5264 Safefree(cur_min_x);
1e4c9676 5265 } /* End of determining the character that is to replace NULs */
afc4976f
KW
5266
5267 /* If the replacement is variant under UTF-8, it must match the
291a84fb 5268 * UTF8-ness of the original */
f28f4d2a
KW
5269 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
5270 this_replacement_char[0] =
5271 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
5272 this_replacement_char[1] =
5273 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
afc4976f
KW
5274 this_replacement_len = 2;
5275 }
5276 else {
f28f4d2a 5277 this_replacement_char[0] = PL_strxfrm_NUL_replacement;
afc4976f
KW
5278 /* this_replacement_char[1] = '\0' was done at initialization */
5279 this_replacement_len = 1;
6696cfa7
KW
5280 }
5281
5282 /* The worst case length for the replaced string would be if every
5283 * character in it is NUL. Multiply that by the length of each
5284 * replacement, and allow for a trailing NUL */
afc4976f 5285 sans_nuls_len = (len * this_replacement_len) + 1;
fdc080f3 5286 Newx(sans_nuls, sans_nuls_len, char);
6696cfa7
KW
5287 *sans_nuls = '\0';
5288
6696cfa7
KW
5289 /* Replace each NUL with the lowest collating control. Loop until have
5290 * exhausted all the NULs */
5291 while (s + s_strlen < e) {
6069d6c5 5292 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
5293
5294 /* Do the actual replacement */
6069d6c5 5295 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
6696cfa7
KW
5296
5297 /* Move past the input NUL */
5298 s += s_strlen + 1;
5299 s_strlen = strlen(s);
5300 }
5301
5302 /* And add anything that trails the final NUL */
6069d6c5 5303 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
5304
5305 /* Switch so below we transform this modified string */
5306 s = sans_nuls;
5307 len = strlen(s);
1e4c9676 5308 } /* End of replacing NULs */
6696cfa7 5309
a4a439fb
KW
5310 /* Make sure the UTF8ness of the string and locale match */
5311 if (utf8 != PL_in_utf8_COLLATE_locale) {
9de6fe47 5312 /* XXX convert above Unicode to 10FFFF? */
a4a439fb
KW
5313 const char * const t = s; /* Temporary so we can later find where the
5314 input was */
5315
5316 /* Here they don't match. Change the string's to be what the locale is
5317 * expecting */
5318
5319 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
5320 s = (char *) bytes_to_utf8((const U8 *) s, &len);
5321 utf8 = TRUE;
5322 }
5323 else { /* locale is not UTF-8; but input is; downgrade the input */
5324
5325 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
5326
5327 /* If the downgrade was successful we are done, but if the input
5328 * contains things that require UTF-8 to represent, have to do
5329 * damage control ... */
5330 if (UNLIKELY(utf8)) {
5331
5332 /* What we do is construct a non-UTF-8 string with
5333 * 1) the characters representable by a single byte converted
5334 * to be so (if necessary);
5335 * 2) and the rest converted to collate the same as the
5336 * highest collating representable character. That makes
5337 * them collate at the end. This is similar to how we
5338 * handle embedded NULs, but we use the highest collating
5339 * code point instead of the smallest. Like the NUL case,
5340 * this isn't perfect, but is the best we can reasonably
5341 * do. Every above-255 code point will sort the same as
5342 * the highest-sorting 0-255 code point. If that code
5343 * point can combine in a sequence with some other code
5344 * points for weight calculations, us changing something to
5345 * be it can adversely affect the results. But in most
5346 * cases, it should work reasonably. And note that this is
5347 * really an illegal situation: using code points above 255
5348 * on a locale where only 0-255 are valid. If two strings
5349 * sort entirely equal, then the sort order for the
5350 * above-255 code points will be in code point order. */
5351
5352 utf8 = FALSE;
5353
5354 /* If we haven't calculated the code point with the maximum
5355 * collating order for this locale, do so now */
5356 if (! PL_strxfrm_max_cp) {
5357 int j;
5358
5359 /* The current transformed string that collates the
5360 * highest (except it also includes the prefixed collation
5361 * index. */
5362 char * cur_max_x = NULL;
5363
5364 /* Look through all legal code points (NUL isn't) */
5365 for (j = 1; j < 256; j++) {
5366 char * x;
5367 STRLEN x_len;
736a4fed 5368 char cur_source[] = { '\0', '\0' };
a4a439fb 5369
736a4fed
KW
5370 /* Create a 1-char string of the current code point */
5371 cur_source[0] = (char) j;
a4a439fb
KW
5372
5373 /* Then transform it */
5374 x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
5375
5376 /* If something went wrong (which it shouldn't), just
5377 * ignore this code point */
94762aa0 5378 if (! x) {
a4a439fb
KW
5379 continue;
5380 }
5381
5382 /* If this character's transformation is higher than
5383 * the current highest, this one becomes the highest */
5384 if ( cur_max_x == NULL
55e5378d
KW
5385 || strGT(x + COLLXFRM_HDR_LEN,
5386 cur_max_x + COLLXFRM_HDR_LEN))
a4a439fb
KW
5387 {
5388 PL_strxfrm_max_cp = j;
62d66863 5389 Safefree(cur_max_x);
a4a439fb
KW
5390 cur_max_x = x;
5391 }
5392 else {
5393 Safefree(x);
5394 }
5395 }
5396
94762aa0
KW
5397 if (! cur_max_x) {
5398 DEBUG_L(PerlIO_printf(Perl_debug_log,
5399 "_mem_collxfrm: Couldn't find any character to"
5400 " replace above-Latin1 chars in locale %s with",
5401 PL_collation_name));
5402 goto bad;
5403 }
5404
58eebef2
KW
5405 DEBUG_L(PerlIO_printf(Perl_debug_log,
5406 "_mem_collxfrm: highest 1-byte collating character"
5407 " in locale %s is 0x%02X\n",
5408 PL_collation_name,
5409 PL_strxfrm_max_cp));
58eebef2 5410
a4a439fb
KW
5411 Safefree(cur_max_x);
5412 }
5413
5414 /* Here we know which legal code point collates the highest.
5415 * We are ready to construct the non-UTF-8 string. The length
5416 * will be at least 1 byte smaller than the input string
5417 * (because we changed at least one 2-byte character into a
5418 * single byte), but that is eaten up by the trailing NUL */
5419 Newx(s, len, char);
5420
5421 {
5422 STRLEN i;
5423 STRLEN d= 0;
042d9e50 5424 char * e = (char *) t + len;
a4a439fb
KW
5425
5426 for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
5427 U8 cur_char = t[i];
5428 if (UTF8_IS_INVARIANT(cur_char)) {
5429 s[d++] = cur_char;
5430 }
042d9e50 5431 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
a4a439fb
KW
5432 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
5433 }
5434 else { /* Replace illegal cp with highest collating
5435 one */
5436 s[d++] = PL_strxfrm_max_cp;
5437 }
5438 }
5439 s[d++] = '\0';
5440 Renew(s, d, char); /* Free up unused space */
5441 }
5442 }
5443 }
5444
5445 /* Here, we have constructed a modified version of the input. It could
5446 * be that we already had a modified copy before we did this version.
5447 * If so, that copy is no longer needed */
5448 if (t != input_string) {
5449 Safefree(t);
5450 }
5451 }
5452
17f41037
KW
5453 length_in_chars = (utf8)
5454 ? utf8_length((U8 *) s, (U8 *) s + len)
5455 : len;
5456
59c018b9
KW
5457 /* The first element in the output is the collation id, used by
5458 * sv_collxfrm(); then comes the space for the transformed string. The
5459 * equation should give us a good estimate as to how much is needed */
55e5378d 5460 xAlloc = COLLXFRM_HDR_LEN
a4a439fb 5461 + PL_collxfrm_base
17f41037 5462 + (PL_collxfrm_mult * length_in_chars);
a02a5408 5463 Newx(xbuf, xAlloc, char);
c7202dee
KW
5464 if (UNLIKELY(! xbuf)) {
5465 DEBUG_L(PerlIO_printf(Perl_debug_log,
5466 "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
1604cfb0 5467 goto bad;
c7202dee 5468 }
98994639 5469
d35fca5f 5470 /* Store the collation id */
98994639 5471 *(U32*)xbuf = PL_collation_ix;
d35fca5f
KW
5472
5473 /* Then the transformation of the input. We loop until successful, or we
5474 * give up */
4ebeff16 5475 for (;;) {
1adab0a7 5476
3b30ed2b 5477 errno = 0;
55e5378d 5478 *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
4ebeff16
KW
5479
5480 /* If the transformed string occupies less space than we told strxfrm()
3b30ed2b 5481 * was available, it means it transformed the whole string. */
55e5378d 5482 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
17f41037 5483
3b30ed2b
KW
5484 /* But there still could have been a problem */
5485 if (errno != 0) {
5486 DEBUG_L(PerlIO_printf(Perl_debug_log,
5487 "strxfrm failed for LC_COLLATE=%s; errno=%d, input=%s\n",
5488 PL_collation_name, errno,
5489 _byte_dump_string((U8 *) s, len, 0)));
5490 goto bad;
5491 }
5492
5493 /* Here, the transformation was successful. Some systems include a
5494 * trailing NUL in the returned length. Ignore it, using a loop in
5495 * case multiple trailing NULs are returned. */
1adab0a7
KW
5496 while ( (*xlen) > 0
5497 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
5498 {
5499 (*xlen)--;
5500 }
5501
17f41037
KW
5502 /* If the first try didn't get it, it means our prediction was low.
5503 * Modify the coefficients so that we predict a larger value in any
5504 * future transformations */
5505 if (! first_time) {
5506 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */
5507 STRLEN computed_guess = PL_collxfrm_base
5508 + (PL_collxfrm_mult * length_in_chars);
e1c30f0c
KW
5509
5510 /* On zero-length input, just keep current slope instead of
5511 * dividing by 0 */
5512 const STRLEN new_m = (length_in_chars != 0)
5513 ? needed / length_in_chars
5514 : PL_collxfrm_mult;
17f41037
KW
5515
5516 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b26541a9 5517 "initial size of %zu bytes for a length "
b07929e4 5518 "%zu string was insufficient, %zu needed\n",
b07929e4 5519 computed_guess, length_in_chars, needed));
17f41037
KW
5520
5521 /* If slope increased, use it, but discard this result for
5522 * length 1 strings, as we can't be sure that it's a real slope
5523 * change */
5524 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
7d4bcc4a
KW
5525
5526# ifdef DEBUGGING
5527
17f41037
KW
5528 STRLEN old_m = PL_collxfrm_mult;
5529 STRLEN old_b = PL_collxfrm_base;
7d4bcc4a
KW
5530
5531# endif
5532
17f41037
KW
5533 PL_collxfrm_mult = new_m;
5534 PL_collxfrm_base = 1; /* +1 For trailing NUL */
5535 computed_guess = PL_collxfrm_base
5536 + (PL_collxfrm_mult * length_in_chars);
5537 if (computed_guess < needed) {
5538 PL_collxfrm_base += needed - computed_guess;
5539 }
5540
5541 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b26541a9 5542 "slope is now %zu; was %zu, base "
b07929e4 5543 "is now %zu; was %zu\n",
b07929e4
KW
5544 PL_collxfrm_mult, old_m,
5545 PL_collxfrm_base, old_b));
17f41037
KW
5546 }
5547 else { /* Slope didn't change, but 'b' did */
5548 const STRLEN new_b = needed
5549 - computed_guess
5550 + PL_collxfrm_base;
5551 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b26541a9 5552 "base is now %zu; was %zu\n", new_b, PL_collxfrm_base));
17f41037
KW
5553 PL_collxfrm_base = new_b;
5554 }
5555 }
5556
4ebeff16
KW
5557 break;
5558 }
bb0f664e 5559
c7202dee
KW
5560 if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
5561 DEBUG_L(PerlIO_printf(Perl_debug_log,
5562 "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
5563 *xlen, PERL_INT_MAX));
4ebeff16 5564 goto bad;
c7202dee 5565 }
d35fca5f 5566
c664130f 5567 /* A well-behaved strxfrm() returns exactly how much space it needs
1adab0a7
KW
5568 * (usually not including the trailing NUL) when it fails due to not
5569 * enough space being provided. Assume that this is the case unless
5570 * it's been proven otherwise */
c664130f 5571 if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
55e5378d 5572 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
c664130f
KW
5573 }
5574 else { /* Here, either:
5575 * 1) The strxfrm() has previously shown bad behavior; or
5576 * 2) It isn't the first time through the loop, which means
5577 * that the strxfrm() is now showing bad behavior, because
5578 * we gave it what it said was needed in the previous
5579 * iteration, and it came back saying it needed still more.
5580 * (Many versions of cygwin fit this. When the buffer size
5581 * isn't sufficient, they return the input size instead of
5582 * how much is needed.)
d4ff9586
KW
5583 * Increase the buffer size by a fixed percentage and try again.
5584 * */
6ddd902c 5585 xAlloc += (xAlloc / 4) + 1;
c664130f 5586 PL_strxfrm_is_behaved = FALSE;
c664130f 5587
94d521c0
KW
5588 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5589 "_mem_collxfrm required more space than previously"
5590 " calculated for locale %s, trying again with new"
5591 " guess=%zu+%zu\n",
3e25f6d2 5592 PL_collation_name, COLLXFRM_HDR_LEN,
94d521c0 5593 xAlloc - COLLXFRM_HDR_LEN));
58eebef2 5594 }
c664130f 5595
4ebeff16 5596 Renew(xbuf, xAlloc, char);
c7202dee
KW
5597 if (UNLIKELY(! xbuf)) {
5598 DEBUG_L(PerlIO_printf(Perl_debug_log,
5599 "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
4ebeff16 5600 goto bad;
c7202dee 5601 }
c664130f
KW
5602
5603 first_time = FALSE;
4ebeff16 5604 }
98994639 5605
94d521c0
KW
5606 DEBUG_Lv((print_collxfrm_input_and_return(s, s + len, xlen, utf8),
5607 PerlIO_printf(Perl_debug_log, "Its xfrm is:"),
7e2f38b2
KW
5608 PerlIO_printf(Perl_debug_log, "%s\n",
5609 _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
94d521c0 5610 *xlen, 1))));
58eebef2 5611
fe1c1494 5612 /* Free up unneeded space; retain enough for trailing NUL */
55e5378d 5613 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
98994639 5614
6696cfa7
KW
5615 if (s != input_string) {
5616 Safefree(s);
98994639
HS
5617 }
5618
98994639
HS
5619 return xbuf;
5620
5621 bad:
7d4bcc4a 5622
94d521c0 5623 DEBUG_Lv(print_collxfrm_input_and_return(s, s + len, NULL, utf8));
7d4bcc4a 5624
21dce8f4
KW
5625 Safefree(xbuf);
5626 if (s != input_string) {
5627 Safefree(s);
5628 }
5629 *xlen = 0;
5630
98994639
HS
5631 return NULL;
5632}
5633
7d4bcc4a 5634# ifdef DEBUGGING
c7202dee 5635
4cbaac56 5636STATIC void
c7202dee
KW
5637S_print_collxfrm_input_and_return(pTHX_
5638 const char * const s,
5639 const char * const e,
5640 const STRLEN * const xlen,
5641 const bool is_utf8)
5642{
c7202dee
KW
5643
5644 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
5645
511e4ff7
DM
5646 PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
5647 (UV)PL_collation_ix);
c7202dee 5648 if (xlen) {
08b6dc1d 5649 PerlIO_printf(Perl_debug_log, "%zu", *xlen);
c7202dee
KW
5650 }
5651 else {
5652 PerlIO_printf(Perl_debug_log, "NULL");
5653 }
5654 PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
5655 PL_collation_name);
9c8a6dc2
KW
5656 print_bytes_for_locale(s, e, is_utf8);
5657
5658 PerlIO_printf(Perl_debug_log, "'\n");
5659}
5660
4d9252fd
KW
5661# endif /* DEBUGGING */
5662#endif /* USE_LOCALE_COLLATE */
fe1c1494 5663
1c0e67b5
KW
5664#ifdef USE_LOCALE
5665# ifdef DEBUGGING
4d9252fd 5666
9c8a6dc2
KW
5667STATIC void
5668S_print_bytes_for_locale(pTHX_
5669 const char * const s,
5670 const char * const e,
5671 const bool is_utf8)
5672{
5673 const char * t = s;
5674 bool prev_was_printable = TRUE;
5675 bool first_time = TRUE;
5676
5677 PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
c7202dee
KW
5678
5679 while (t < e) {
5680 UV cp = (is_utf8)
5681 ? utf8_to_uvchr_buf((U8 *) t, e, NULL)
5682 : * (U8 *) t;
5683 if (isPRINT(cp)) {
5684 if (! prev_was_printable) {
5685 PerlIO_printf(Perl_debug_log, " ");
5686 }
5687 PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
5688 prev_was_printable = TRUE;
5689 }
5690 else {
5691 if (! first_time) {
5692 PerlIO_printf(Perl_debug_log, " ");
5693 }
147e3846 5694 PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
c7202dee
KW
5695 prev_was_printable = FALSE;
5696 }
5697 t += (is_utf8) ? UTF8SKIP(t) : 1;
5698 first_time = FALSE;
5699 }
c7202dee
KW
5700}
5701
1c0e67b5 5702# endif /* #ifdef DEBUGGING */
8ef6e574 5703
962aa53f 5704STATIC const char *
497bfce4
KW
5705S_toggle_locale_i(pTHX_ const unsigned cat_index,
5706 const char * new_locale,
5707 const line_t caller_line)
5708{
5709 /* Changes the locale for the category specified by 'index' to 'new_locale,
5710 * if they aren't already the same.
5711 *
5712 * Returns a copy of the name of the original locale for 'cat_index'
5713 * so can be switched back to with the companion function
5714 * restore_toggled_locale_i(), (NULL if no restoral is necessary.) */
5715
5716 const char * locale_to_restore_to = NULL;
5717
5718 PERL_ARGS_ASSERT_TOGGLE_LOCALE_I;
5719 assert(cat_index <= NOMINAL_LC_ALL_INDEX);
5720
5721 /* Find the original locale of the category we may need to change, so that
5722 * it can be restored to later */
5723
5724 locale_to_restore_to = querylocale_i(cat_index);
5725
5726 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5727 "(%" LINE_Tf "): toggle_locale_i: index=%d(%s), wanted=%s,"
5728 " actual=%s\n",
5729 caller_line, cat_index, category_names[cat_index],
5730 new_locale, locale_to_restore_to));
5731
5732 if (! locale_to_restore_to) {
5733 locale_panic_(Perl_form(aTHX_ "Could not find current %s locale, errno=%d",
5734 category_names[cat_index], errno));
5735 }
5736
5737 /* If the locales are the same, there's nothing to do */
5738 if (strEQ(locale_to_restore_to, new_locale)) {
5739 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5740 "(%d): %s locale unchanged as %s\n",
5741 caller_line, category_names[cat_index],
5742 new_locale));
5743
5744 return NULL;
5745 }
5746
5747 /* Finally, change the locale to the new one */
5748 void_setlocale_i(cat_index, new_locale);
5749
5750 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "(%" LINE_Tf "): %s locale switched to %s\n",
5751 caller_line, category_names[cat_index], new_locale));
5752
5753 return locale_to_restore_to;
5754
5755#ifndef DEBUGGING
5756 PERL_UNUSED_ARG(caller_line);
5757#endif
5758
5759}
5760
5761STATIC void
5762S_restore_toggled_locale_i(pTHX_ const unsigned int cat_index,
5763 const char * restore_locale,
5764 const line_t caller_line)
5765{
5766 /* Restores the locale for LC_category corresponding to cat_indes to
5767 * 'restore_locale' (which is a copy that will be freed by this function),
5768 * or do nothing if the latter parameter is NULL */
5769
5770 PERL_ARGS_ASSERT_RESTORE_TOGGLED_LOCALE_I;
5771 assert(cat_index <= NOMINAL_LC_ALL_INDEX);
5772
5773 if (restore_locale == NULL) {
5774 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5775 "(%" LINE_Tf "): No need to restore %s\n",
5776 caller_line, category_names[cat_index]));
5777 return;
5778 }
5779
5780 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5781 "(%" LINE_Tf "): %s restoring locale to %s\n",
5782 caller_line, category_names[cat_index],
5783 restore_locale));
5784
5785 void_setlocale_i(cat_index, restore_locale);
5786
5787#ifndef DEBUGGING
5788 PERL_UNUSED_ARG(caller_line);
5789#endif
5790
5791}
5792
5793STATIC const char *
fe1c1494
KW
5794S_switch_category_locale_to_template(pTHX_ const int switch_category,
5795 const int template_category,
5796 const char * template_locale)
962aa53f
KW
5797{
5798 /* Changes the locale for LC_'switch_category" to that of
5799 * LC_'template_category', if they aren't already the same. If not NULL,
5800 * 'template_locale' is the locale that 'template_category' is in.
5801 *
9de6fe47
KW
5802 * Returns a copy of the name of the original locale for 'switch_category'
5803 * so can be switched back to with the companion function
5804 * restore_switched_locale(), (NULL if no restoral is necessary.) */
962aa53f 5805
b0fb28a3 5806 const char * restore_to_locale = NULL;
962aa53f
KW
5807
5808 if (switch_category == template_category) { /* No changes needed */
5809 return NULL;
5810 }
5811
5812 /* Find the original locale of the category we may need to change, so that
5813 * it can be restored to later */
32899cfb 5814 restore_to_locale = querylocale_r(switch_category);
962aa53f 5815 if (! restore_to_locale) {
20ad7b23
KW
5816 locale_panic_(Perl_form(aTHX_ "Could not find current %s locale",
5817 category_name(switch_category)));
962aa53f 5818 }
32899cfb 5819 restore_to_locale = savepv(restore_to_locale);
962aa53f
KW
5820
5821 /* If the locale of the template category wasn't passed in, find it now */
5822 if (template_locale == NULL) {
a683a66a 5823 template_locale = querylocale_r(template_category);
962aa53f 5824 if (! template_locale) {
20ad7b23
KW
5825 locale_panic_(Perl_form(aTHX_ "Could not find current %s locale\n",
5826 category_name(template_category)));
962aa53f
KW
5827 }
5828 }
5829
5830 /* It the locales are the same, there's nothing to do */
5831 if (strEQ(restore_to_locale, template_locale)) {
5832 Safefree(restore_to_locale);
5833
5834 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n",
e1c8059f 5835 category_name(switch_category), template_locale));
962aa53f
KW
5836
5837 return NULL;
5838 }
5839
5840 /* Finally, change the locale to the template one */
3980cddb 5841 if (! bool_setlocale_r(switch_category, template_locale)) {
57eeab0b
KW
5842 setlocale_failure_panic_i(get_category_index(switch_category,
5843 NULL),
5844 category_name(switch_category),
5845 template_locale,
5846 __LINE__,
5847 __LINE__);
962aa53f
KW
5848 }
5849
5850 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale switched to %s\n",
5851 category_name(switch_category), template_locale));
5852
5853 return restore_to_locale;
5854}
5855
5856STATIC void
fe1c1494
KW
5857S_restore_switched_locale(pTHX_ const int category,
5858 const char * const original_locale)
962aa53f 5859{
9de6fe47
KW
5860 /* Restores the locale for LC_'category' to 'original_locale' (which is a
5861 * copy that will be freed by this function), or do nothing if the latter
5862 * parameter is NULL */
962aa53f
KW
5863
5864 if (original_locale == NULL) {
5865 return;
5866 }
5867
3980cddb 5868 if (! bool_setlocale_r(category, original_locale)) {
20ad7b23
KW
5869 locale_panic_(Perl_form(aTHX_ "s restoring %s to %s failed",
5870 category_name(category), original_locale));
962aa53f
KW
5871 }
5872
5873 Safefree(original_locale);
5874}
5875
51332242 5876/* is_cur_LC_category_utf8 uses a small char buffer to avoid malloc/free */
fe1c1494 5877# define CUR_LC_BUFFER_SIZE 64
51332242 5878
c1284011
KW
5879bool
5880Perl__is_cur_LC_category_utf8(pTHX_ int category)
7d74bb61
KW
5881{
5882 /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
5883 * otherwise. 'category' may not be LC_ALL. If the platform doesn't have
119ee68b 5884 * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
609548d2
KW
5885 * could give the wrong result. The result will very likely be correct for
5886 * languages that have commonly used non-ASCII characters, but for notably
5887 * English, it comes down to if the locale's name ends in something like
9de6fe47
KW
5888 * "UTF-8". It errs on the side of not being a UTF-8 locale.
5889 *
5890 * If the platform is early C89, not containing mbtowc(), or we are
5891 * compiled to not pay attention to LC_CTYPE, this employs heuristics.
5892 * These work very well for non-Latin locales or those whose currency
5893 * symbol isn't a '$' nor plain ASCII text. But without LC_CTYPE and at
5894 * least MB_CUR_MAX, English locales with an ASCII currency symbol depend
5895 * on the name containing UTF-8 or not. */
7d74bb61 5896
47280b20 5897 /* Name of current locale corresponding to the input category */
8de4332b 5898 const char *save_input_locale = NULL;
47280b20
KW
5899
5900 bool is_utf8 = FALSE; /* The return value */
7d74bb61 5901
47280b20
KW
5902 /* The variables below are for the cache of previous lookups using this
5903 * function. The cache is a C string, described at the definition for
5904 * 'C_and_POSIX_utf8ness'.
5905 *
5906 * The first part of the cache is fixed, for the C and POSIX locales. The
5907 * varying part starts just after them. */
5908 char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness);
5909
5910 Size_t utf8ness_cache_size; /* Size of the varying portion */
5911 Size_t input_name_len; /* Length in bytes of save_input_locale */
5912 Size_t input_name_len_with_overhead; /* plus extra chars used to store
5913 the name in the cache */
5914 char * delimited; /* The name plus the delimiters used to store
5915 it in the cache */
51332242 5916 char buffer[CUR_LC_BUFFER_SIZE]; /* small buffer */
47280b20
KW
5917 char * name_pos; /* position of 'delimited' in the cache, or 0
5918 if not there */
5919
5920
7d4bcc4a
KW
5921# ifdef LC_ALL
5922
7d74bb61 5923 assert(category != LC_ALL);
7d4bcc4a
KW
5924
5925# endif
7d74bb61 5926
47280b20 5927 /* Get the desired category's locale */
32899cfb 5928 save_input_locale = querylocale_r(category);
47280b20 5929
47280b20
KW
5930 DEBUG_L(PerlIO_printf(Perl_debug_log,
5931 "Current locale for %s is %s\n",
5932 category_name(category), save_input_locale));
5933
5934 input_name_len = strlen(save_input_locale);
5935
5936 /* In our cache, each name is accompanied by two delimiters and a single
5937 * utf8ness digit */
5938 input_name_len_with_overhead = input_name_len + 3;
5939
51332242
N
5940 if ( input_name_len_with_overhead <= CUR_LC_BUFFER_SIZE ) {
5941 /* we can use the buffer, avoid a malloc */
5942 delimited = buffer;
5943 } else { /* need a malloc */
5944 /* Allocate and populate space for a copy of the name surrounded by the
5945 * delimiters */
5946 Newx(delimited, input_name_len_with_overhead, char);
5947 }
5948
47280b20
KW
5949 delimited[0] = UTF8NESS_SEP[0];
5950 Copy(save_input_locale, delimited + 1, input_name_len, char);
5951 delimited[input_name_len+1] = UTF8NESS_PREFIX[0];
5952 delimited[input_name_len+2] = '\0';
5953
5954 /* And see if that is in the cache */
5955 name_pos = instr(PL_locale_utf8ness, delimited);
5956 if (name_pos) {
5957 is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0';
5958
94d521c0
KW
5959 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5960 "UTF8ness for locale %s=%d, \n",
5961 save_input_locale, is_utf8));
47280b20
KW
5962
5963 /* And, if not already in that position, move it to the beginning of
5964 * the non-constant portion of the list, since it is the most recently
5965 * used. (We don't have to worry about overflow, since just moving
5966 * existing names around) */
5967 if (name_pos > utf8ness_cache) {
5968 Move(utf8ness_cache,
5969 utf8ness_cache + input_name_len_with_overhead,
5970 name_pos - utf8ness_cache, char);
5971 Copy(delimited,
5972 utf8ness_cache,
5973 input_name_len_with_overhead - 1, char);
5974 utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
5975 }
5976
51332242
N
5977 /* free only when not using the buffer */
5978 if ( delimited != buffer ) Safefree(delimited);
47280b20 5979 return is_utf8;
7d74bb61
KW
5980 }
5981
47280b20
KW
5982 /* Here we don't have stored the utf8ness for the input locale. We have to
5983 * calculate it */
5984
94646a69 5985# if defined(USE_LOCALE_CTYPE) \
3871c541 5986 && ( defined(HAS_SOME_LANGINFO) \
94646a69 5987 || (defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)))
7d74bb61 5988
0dec74cd 5989 {
962aa53f
KW
5990 const char *original_ctype_locale
5991 = switch_category_locale_to_template(LC_CTYPE,
5992 category,
5993 save_input_locale);
69014004 5994
7d74bb61 5995 /* Here the current LC_CTYPE is set to the locale of the category whose
0dec74cd 5996 * information is desired. This means that nl_langinfo() and mbtowc()
1d958db2 5997 * should give the correct results */
119ee68b 5998
0dec74cd
KW
5999# ifdef MB_CUR_MAX /* But we can potentially rule out UTF-8ness, avoiding
6000 calling the functions if we have this */
6001
6002 /* Standard UTF-8 needs at least 4 bytes to represent the maximum
6003 * Unicode code point. */
6004
b26541a9
KW
6005 DEBUG_L(PerlIO_printf(Perl_debug_log, "MB_CUR_MAX=%d\n",
6006 (int) MB_CUR_MAX));
0dec74cd
KW
6007 if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) {
6008 is_utf8 = FALSE;
b5b2847c
KW
6009 restore_switched_locale(LC_CTYPE, original_ctype_locale);
6010 goto finish_and_return;
0dec74cd
KW
6011 }
6012
6013# endif
3871c541 6014# if defined(HAS_SOME_LANGINFO)
7d4bcc4a 6015
0dec74cd
KW
6016 { /* The task is easiest if the platform has this POSIX 2001 function.
6017 Except on some platforms it can wrongly return "", so have to have
6018 a fallback. And it can return that it's UTF-8, even if there are
6019 variances from that. For example, Turkish locales may use the
6020 alternate dotted I rules, and sometimes it appears to be a
6021 defective locale definition. XXX We should probably check for
9de6fe47
KW
6022 these in the Latin1 range and warn (but on glibc, requires
6023 iswalnum() etc. due to their not handling 80-FF correctly */
07492de5 6024 const char * scratch_buffer = NULL;
fc46dd0d
KW
6025 const char *codeset = my_langinfo_c(CODESET, LC_CTYPE,
6026 save_input_locale,
fffcb78f 6027 &scratch_buffer, NULL, NULL);
119ee68b 6028
af84e886 6029 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
c70a3e68
KW
6030 "\tnllanginfo returned CODESET '%s'\n", codeset));
6031
6032 if (codeset && strNE(codeset, "")) {
1d958db2 6033
1d075173
KW
6034 /* If the implementation of foldEQ() somehow were
6035 * to change to not go byte-by-byte, this could
6036 * read past end of string, as only one length is
6037 * checked. But currently, a premature NUL will
6038 * compare false, and it will stop there */
c3c9077b
KW
6039 is_utf8 = cBOOL( foldEQ(codeset, "UTF-8", STRLENs("UTF-8"))
6040 || foldEQ(codeset, "UTF8", STRLENs("UTF8")));
1d958db2 6041
69014004
KW
6042 DEBUG_L(PerlIO_printf(Perl_debug_log,
6043 "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
6044 codeset, is_utf8));
b5b2847c 6045 restore_switched_locale(LC_CTYPE, original_ctype_locale);
07492de5 6046 Safefree(scratch_buffer);
b5b2847c 6047 goto finish_and_return;
1d958db2 6048 }
119ee68b
KW
6049 }
6050
7d4bcc4a 6051# endif
94646a69 6052# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
0dec74cd
KW
6053 /* We can see if this is a UTF-8-like locale if have mbtowc(). It was a
6054 * late adder to C89, so very likely to have it. However, testing has
6055 * shown that, like nl_langinfo() above, there are locales that are not
6056 * strictly UTF-8 that this will return that they are */
0dec74cd 6057 {
e1d27f6b 6058 wchar_t wc = 0;
51fc4b19 6059 int len;
94646a69 6060
d5f728f7
KW
6061 PERL_UNUSED_RESULT(mbtowc_(NULL, NULL, 0));
6062 len = mbtowc_(&wc, REPLACEMENT_CHARACTER_UTF8,
6063 STRLENs(REPLACEMENT_CHARACTER_UTF8));
b1d4925c 6064
0dec74cd
KW
6065 is_utf8 = cBOOL( len == STRLENs(REPLACEMENT_CHARACTER_UTF8)
6066 && wc == (wchar_t) UNICODE_REPLACEMENT);
119ee68b 6067 }
7d4bcc4a 6068
17dd77cd
FP
6069# endif
6070
962aa53f 6071 restore_switched_locale(LC_CTYPE, original_ctype_locale);
47280b20 6072 goto finish_and_return;
7d74bb61 6073 }
119ee68b 6074
0dec74cd 6075# else
7d74bb61 6076
0dec74cd
KW
6077 /* Here, we must have a C89 compiler that doesn't have mbtowc(). Next
6078 * try looking at the currency symbol to see if it disambiguates
6079 * things. Often that will be in the native script, and if the symbol
6080 * isn't in UTF-8, we know that the locale isn't. If it is non-ASCII
6081 * UTF-8, we infer that the locale is too, as the odds of a non-UTF8
6082 * string being valid UTF-8 are quite small */
fa9b773e 6083
660ee27b
KW
6084# ifdef USE_LOCALE_MONETARY
6085
6086 /* If have LC_MONETARY, we can look at the currency symbol. Often that
6087 * will be in the native script. We do this one first because there is
6088 * just one string to examine, so potentially avoids work */
7d4bcc4a 6089
9db16864
KW
6090 {
6091 const char *original_monetary_locale
660ee27b
KW
6092 = switch_category_locale_to_template(LC_MONETARY,
6093 category,
6094 save_input_locale);
9db16864 6095 bool only_ascii = FALSE;
07492de5 6096 const char * scratch_buffer = NULL;
660ee27b 6097 const U8 * currency_string
fc46dd0d
KW
6098 = (const U8 *) my_langinfo_c(CRNCYSTR, LC_MONETARY,
6099 save_input_locale,
fffcb78f 6100 &scratch_buffer, NULL, NULL);
660ee27b
KW
6101 /* 2nd param not relevant for this item */
6102 const U8 * first_variant;
fa9b773e 6103
660ee27b
KW
6104 assert( *currency_string == '-'
6105 || *currency_string == '+'
6106 || *currency_string == '.');
97f4de96 6107
660ee27b 6108 currency_string++;
fa9b773e 6109
660ee27b 6110 if (is_utf8_invariant_string_loc(currency_string, 0, &first_variant))
9db16864 6111 {
fe1c1494
KW
6112 DEBUG_L(PerlIO_printf(Perl_debug_log,
6113 "Couldn't get currency symbol for %s, or contains"
6114 " only ASCII; can't use for determining if UTF-8"
6115 " locale\n", save_input_locale));
9db16864
KW
6116 only_ascii = TRUE;
6117 }
6118 else {
660ee27b 6119 is_utf8 = is_strict_utf8_string(first_variant, 0);
9db16864 6120 }
07492de5 6121 Safefree(scratch_buffer);
fa9b773e 6122
9db16864 6123 restore_switched_locale(LC_MONETARY, original_monetary_locale);
fa9b773e 6124
9db16864 6125 if (! only_ascii) {
fa9b773e 6126
9db16864
KW
6127 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
6128 * otherwise assume the locale is UTF-8 if and only if the symbol
6129 * is non-ascii UTF-8. */
fe1c1494
KW
6130 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6131 "\t?Currency symbol for %s is UTF-8=%d\n",
9db16864
KW
6132 save_input_locale, is_utf8));
6133 goto finish_and_return;
6134 }
13542a67 6135 }
fa9b773e 6136
660ee27b 6137# endif /* USE_LOCALE_MONETARY */
7d4bcc4a 6138# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
15f7e74e 6139
9db16864
KW
6140 /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
6141 * the names of the months and weekdays, timezone, and am/pm indicator */
6142 {
6143 const char *original_time_locale
6144 = switch_category_locale_to_template(LC_TIME,
6145 category,
6146 save_input_locale);
6147 int hour = 10;
6148 bool is_dst = FALSE;
6149 int dom = 1;
6150 int month = 0;
6151 int i;
6152 char * formatted_time;
6153
6154 /* Here the current LC_TIME is set to the locale of the category
fe1c1494
KW
6155 * whose information is desired. Look at all the days of the week
6156 * and month names, and the timezone and am/pm indicator for UTF-8
6157 * variant characters. The first such a one found will tell us if
6158 * the locale is UTF-8 or not */
9db16864
KW
6159
6160 for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
6161 formatted_time = my_strftime("%A %B %Z %p",
6162 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
6163 if ( ! formatted_time
6164 || is_utf8_invariant_string((U8 *) formatted_time, 0))
6165 {
15f7e74e 6166
fe1c1494
KW
6167 /* Here, we didn't find a non-ASCII. Try the next time
6168 * through with the complemented dst and am/pm, and try
6169 * with the next weekday. After we have gotten all
6170 * weekdays, try the next month */
9db16864
KW
6171 is_dst = ! is_dst;
6172 hour = (hour + 12) % 24;
6173 dom++;
6174 if (i > 6) {
6175 month++;
6176 }
e8fd52ac 6177 Safefree(formatted_time);
9db16864 6178 continue;
15f7e74e 6179 }
9db16864
KW
6180
6181 /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
6182 * false otherwise. But first, restore LC_TIME to its original
6183 * locale if we changed it */
6184 restore_switched_locale(LC_TIME, original_time_locale);
6185
fe1c1494
KW
6186 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6187 "\t?time-related strings for %s are UTF-8=%d\n",
9db16864
KW
6188 save_input_locale,
6189 is_utf8_string((U8 *) formatted_time, 0)));
6190 is_utf8 = is_utf8_string((U8 *) formatted_time, 0);
e8fd52ac 6191 Safefree(formatted_time);
9db16864 6192 goto finish_and_return;
15f7e74e
KW
6193 }
6194
9db16864
KW
6195 /* Falling off the end of the loop indicates all the names were just
6196 * ASCII. Go on to the next test. If we changed it, restore LC_TIME
6197 * to its original locale */
962aa53f 6198 restore_switched_locale(LC_TIME, original_time_locale);
fe1c1494
KW
6199 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6200 "All time-related words for %s contain only ASCII;"
6201 " can't use for determining if UTF-8 locale\n",
6202 save_input_locale));
15f7e74e
KW
6203 }
6204
7d4bcc4a 6205# endif
15f7e74e 6206
7d4bcc4a 6207# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
855aeb93 6208
fe1c1494
KW
6209 /* This code is ifdefd out because it was found to not be necessary in
6210 * testing on our dromedary test machine, which has over 700 locales.
6211 * There, this added no value to looking at the currency symbol and the
6212 * time strings. I left it in so as to avoid rewriting it if real-world
6213 * experience indicates that dromedary is an outlier. Essentially, instead
6214 * of returning abpve if we haven't found illegal utf8, we continue on and
6215 * examine all the strerror() messages on the platform for utf8ness. If
6216 * all are ASCII, we still don't know the answer; but otherwise we have a
6217 * pretty good indication of the utf8ness. The reason this doesn't help
6218 * much is that the messages may not have been translated into the locale.
6219 * The currency symbol and time strings are much more likely to have been
6220 * translated. */
9db16864
KW
6221 {
6222 int e;
6223 bool non_ascii = FALSE;
6224 const char *original_messages_locale
6225 = switch_category_locale_to_template(LC_MESSAGES,
6226 category,
6227 save_input_locale);
6228 const char * errmsg = NULL;
6229
6230 /* Here the current LC_MESSAGES is set to the locale of the category
6231 * whose information is desired. Look through all the messages. We
6232 * can't use Strerror() here because it may expand to code that
6233 * segfaults in miniperl */
6234
6235 for (e = 0; e <= sys_nerr; e++) {
6236 errno = 0;
6237 errmsg = sys_errlist[e];
6238 if (errno || !errmsg) {
6239 break;
6240 }
6241 errmsg = savepv(errmsg);
6242 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
6243 non_ascii = TRUE;
6244 is_utf8 = is_utf8_string((U8 *) errmsg, 0);
6245 break;
6246 }
855aeb93 6247 }
9db16864 6248 Safefree(errmsg);
855aeb93 6249
9db16864 6250 restore_switched_locale(LC_MESSAGES, original_messages_locale);
855aeb93 6251
9db16864 6252 if (non_ascii) {
5857e934 6253
fe1c1494
KW
6254 /* Any non-UTF-8 message means not a UTF-8 locale; if all are
6255 * valid, any non-ascii means it is one; otherwise we assume it
6256 * isn't */
6257 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6258 "\t?error messages for %s are UTF-8=%d\n",
9db16864
KW
6259 save_input_locale,
6260 is_utf8));
6261 goto finish_and_return;
6262 }
855aeb93 6263
fe1c1494
KW
6264 DEBUG_L(PerlIO_printf(Perl_debug_log,
6265 "All error messages for %s contain only ASCII;"
6266 " can't use for determining if UTF-8 locale\n",
6267 save_input_locale));
9db16864 6268 }
855aeb93 6269
7d4bcc4a 6270# endif
0dec74cd 6271# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
92c0a900 6272 UTF-8 locale */
7d4bcc4a 6273
97f4de96
KW
6274 /* As a last resort, look at the locale name to see if it matches
6275 * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
6276 * return of setlocale(), is actually defined to be opaque, so we can't
6277 * really rely on the absence of various substrings in the name to indicate
6278 * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
6279 * be a UTF-8 locale. Similarly for the other common names */
6280
0dec74cd 6281 {
c36d8df8 6282 const Size_t final_pos = strlen(save_input_locale) - 1;
0dec74cd 6283
c36d8df8
KW
6284 if (final_pos >= 3) {
6285 const char *name = save_input_locale;
97f4de96 6286
c36d8df8
KW
6287 /* Find next 'U' or 'u' and look from there */
6288 while ((name += strcspn(name, "Uu") + 1)
6289 <= save_input_locale + final_pos - 2)
97f4de96 6290 {
c36d8df8
KW
6291 if ( isALPHA_FOLD_NE(*name, 't')
6292 || isALPHA_FOLD_NE(*(name + 1), 'f'))
6293 {
6294 continue;
6295 }
6296 name += 2;
6297 if (*(name) == '-') {
6298 if ((name > save_input_locale + final_pos - 1)) {
6299 break;
6300 }
6301 name++;
6302 }
6303 if (*(name) == '8') {
6304 DEBUG_L(PerlIO_printf(Perl_debug_log,
6305 "Locale %s ends with UTF-8 in name\n",
6306 save_input_locale));
6307 is_utf8 = TRUE;
6308 goto finish_and_return;
97f4de96 6309 }
97f4de96 6310 }
c36d8df8
KW
6311 DEBUG_L(PerlIO_printf(Perl_debug_log,
6312 "Locale %s doesn't end with UTF-8 in name\n",
6313 save_input_locale));
97f4de96 6314 }
97f4de96 6315
4d8d465a 6316# ifdef WIN32
7d4bcc4a 6317
c36d8df8
KW
6318 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
6319 if (memENDs(save_input_locale, final_pos, "65001")) {
6320 DEBUG_L(PerlIO_printf(Perl_debug_log,
8a0832a1 6321 "Locale %s ends with 65001 in name, is UTF-8 locale\n",
97f4de96 6322 save_input_locale));
c36d8df8
KW
6323 is_utf8 = TRUE;
6324 goto finish_and_return;
6325 }
7d4bcc4a 6326
4d8d465a 6327# endif
17dd77cd 6328 }
4d8d465a 6329# endif
97f4de96
KW
6330
6331 /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
6332 * since we are about to return FALSE anyway, there is no point in doing
6333 * this extra work */
7d4bcc4a 6334
0dec74cd 6335# if 0
97f4de96
KW
6336 if (instr(save_input_locale, "8859")) {
6337 DEBUG_L(PerlIO_printf(Perl_debug_log,
6338 "Locale %s has 8859 in name, not UTF-8 locale\n",
6339 save_input_locale));
47280b20
KW
6340 is_utf8 = FALSE;
6341 goto finish_and_return;
97f4de96 6342 }
0dec74cd 6343# endif
97f4de96 6344
69014004
KW
6345 DEBUG_L(PerlIO_printf(Perl_debug_log,
6346 "Assuming locale %s is not a UTF-8 locale\n",
6347 save_input_locale));
47280b20
KW
6348 is_utf8 = FALSE;
6349
0dec74cd
KW
6350# endif /* the code that is compiled when no modern LC_CTYPE */
6351
47280b20
KW
6352 finish_and_return:
6353
6354 /* Cache this result so we don't have to go through all this next time. */
6355 utf8ness_cache_size = sizeof(PL_locale_utf8ness)
6356 - (utf8ness_cache - PL_locale_utf8ness);
6357
6358 /* But we can't save it if it is too large for the total space available */
6359 if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) {
6360 Size_t utf8ness_cache_len = strlen(utf8ness_cache);
6361
6362 /* Here it can fit, but we may need to clear out the oldest cached
6363 * result(s) to do so. Check */
6364 if (utf8ness_cache_len + input_name_len_with_overhead
6365 >= utf8ness_cache_size)
6366 {
6367 /* Here we have to clear something out to make room for this.
6368 * Start looking at the rightmost place where it could fit and find
6369 * the beginning of the entry that extends past that. */
6370 char * cutoff = (char *) my_memrchr(utf8ness_cache,
6371 UTF8NESS_SEP[0],
6372 utf8ness_cache_size
6373 - input_name_len_with_overhead);
6374
6375 assert(cutoff);
6376 assert(cutoff >= utf8ness_cache);
6377
6378 /* This and all subsequent entries must be removed */
6379 *cutoff = '\0';
6380 utf8ness_cache_len = strlen(utf8ness_cache);
6381 }
6382
6383 /* Make space for the new entry */
6384 Move(utf8ness_cache,
6385 utf8ness_cache + input_name_len_with_overhead,
6386 utf8ness_cache_len + 1 /* Incl. trailing NUL */, char);
6387
6388 /* And insert it */
6389 Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char);
6390 utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
6391
72299e00 6392 if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1] & ~1) != '0') {
20ad7b23
KW
6393 locale_panic_(Perl_form(aTHX_
6394 "Corrupt utf8ness_cache=%s\nlen=%zu,"
6395 " inserted_name=%s, its_len=%zu",
6396 PL_locale_utf8ness, strlen(PL_locale_utf8ness),
6397 delimited, input_name_len_with_overhead));
47280b20
KW
6398 }
6399 }
6400
6401# ifdef DEBUGGING
6402
de3a3072
KW
6403 if (DEBUG_Lv_TEST) {
6404 const char * s = PL_locale_utf8ness;
6405
6406 /* Audit the structure */
6407 while (s < PL_locale_utf8ness + strlen(PL_locale_utf8ness)) {
6408 const char *e;
6409
6410 if (*s != UTF8NESS_SEP[0]) {
20ad7b23
KW
6411 locale_panic_(Perl_form(aTHX_
6412 "Corrupt utf8ness_cache: missing"
6413 " separator %.*s<-- HERE %s",
6414 (int) (s - PL_locale_utf8ness),
6415 PL_locale_utf8ness,
6416 s));
de3a3072
KW
6417 }
6418 s++;
6419 e = strchr(s, UTF8NESS_PREFIX[0]);
6420 if (! e) {
01b91050 6421 e = PL_locale_utf8ness + strlen(PL_locale_utf8ness);
20ad7b23
KW
6422 locale_panic_(Perl_form(aTHX_
6423 "Corrupt utf8ness_cache: missing"
6424 " separator %.*s<-- HERE %s",
6425 (int) (e - PL_locale_utf8ness),
6426 PL_locale_utf8ness,
6427 e));
de3a3072
KW
6428 }
6429 e++;
6430 if (*e != '0' && *e != '1') {
20ad7b23
KW
6431 locale_panic_(Perl_form(aTHX_
6432 "Corrupt utf8ness_cache: utf8ness"
6433 " must be [01] %.*s<-- HERE %s",
6434 (int) (e + 1 - PL_locale_utf8ness),
6435 PL_locale_utf8ness,
6436 e + 1));
de3a3072
KW
6437 }
6438 if (ninstr(PL_locale_utf8ness, s, s-1, e)) {
20ad7b23
KW
6439 locale_panic_(Perl_form(aTHX_
6440 "Corrupt utf8ness_cache: entry"
6441 " has duplicate %.*s<-- HERE %s",
6442 (int) (e - PL_locale_utf8ness),
6443 PL_locale_utf8ness,
6444 e));
de3a3072
KW
6445 }
6446 s = e + 1;
6447 }
6448 }
6449
94d521c0 6450 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
47280b20 6451 "PL_locale_utf8ness is now %s; returning %d\n",
94d521c0 6452 PL_locale_utf8ness, is_utf8));
47280b20
KW
6453
6454# endif
6455
51332242
N
6456 /* free only when not using the buffer */
6457 if ( delimited != buffer ) Safefree(delimited);
47280b20 6458 return is_utf8;
7d74bb61
KW
6459}
6460
9bbcebb1
KW
6461STATIC bool
6462S_is_codeset_name_UTF8(const char * name)
6463{
6464 /* Return a boolean as to if the passed-in name indicates it is a UTF-8
6465 * code set. Several variants are possible */
6466 const Size_t len = strlen(name);
6467
6468 PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8;
6469
6470# ifdef WIN32
6471
6472 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
6473 if (memENDs(name, len, "65001")) {
6474 return TRUE;
6475 }
6476
6477# endif
6478 /* 'UTF8' or 'UTF-8' */
6479 return ( inRANGE(len, 4, 5)
6480 && name[len-1] == '8'
6481 && ( memBEGINs(name, len, "UTF")
6482 || memBEGINs(name, len, "utf"))
6483 && (len == 4 || name[3] == '-'));
6484}
6485
a0eeb339
KW
6486STATIC bool
6487S_is_locale_utf8(pTHX_ const char * locale)
6488{
6489 /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise. It uses
58399f03
KW
6490 * my_langinfo(), which employs various methods to get this information
6491 * if nl_langinfo() isn't available, using heuristics as a last resort, in
6492 * which case, the result will very likely be correct for locales for
6493 * languages that have commonly used non-ASCII characters, but for notably
6494 * English, it comes down to if the locale's name ends in something like
6495 * "UTF-8". It errs on the side of not being a UTF-8 locale. */
a0eeb339
KW
6496
6497# if ! defined(USE_LOCALE_CTYPE) \
6498 || defined(EBCDIC) /* There aren't any real UTF-8 locales at this time */
6499
6500 PERL_UNUSED_ARG(locale);
6501
6502 return FALSE;
6503
6504# else
6505
6506 const char * scratch_buffer = NULL;
6507 const char * codeset = my_langinfo_c(CODESET, LC_CTYPE, locale,
fffcb78f 6508 &scratch_buffer, NULL, NULL);
a0eeb339
KW
6509 bool retval = is_codeset_name_UTF8(codeset);
6510
6511 PERL_ARGS_ASSERT_IS_LOCALE_UTF8;
6512
6513 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6514 "found codeset=%s, is_utf8=%d\n", codeset, retval));
6515
6516 Safefree(scratch_buffer);
6517 return retval;
6518
6519# endif
6520
6521}
6522
6523#endif /* USE_LOCALE */
7d74bb61 6524
d6ded950
KW
6525bool
6526Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
6527{
6528 /* Internal function which returns if we are in the scope of a pragma that
6529 * enables the locale category 'category'. 'compiling' should indicate if
6530 * this is during the compilation phase (TRUE) or not (FALSE). */
6531
6532 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
6533
363d7d4a
JK
6534 SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0);
6535 if (! these_categories || these_categories == &PL_sv_placeholder) {
d6ded950
KW
6536 return FALSE;
6537 }
6538
6539 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
6540 * a valid unsigned */
6541 assert(category >= -1);
363d7d4a 6542 return cBOOL(SvUV(these_categories) & (1U << (category + 1)));
d6ded950
KW
6543}
6544
2c6ee1a7 6545char *
6ebbc862
KW
6546Perl_my_strerror(pTHX_ const int errnum)
6547{
6548 /* Returns a mortalized copy of the text of the error message associated
6549 * with 'errnum'. It uses the current locale's text unless the platform
6550 * doesn't have the LC_MESSAGES category or we are not being called from
6551 * within the scope of 'use locale'. In the former case, it uses whatever
6552 * strerror returns; in the latter case it uses the text from the C locale.
6553 *
6554 * The function just calls strerror(), but temporarily switches, if needed,
6555 * to the C locale */
6556
6557 char *errstr;
6558
52770946 6559#ifndef USE_LOCALE_MESSAGES
6ebbc862 6560
52770946
KW
6561 /* If platform doesn't have messages category, we don't do any switching to
6562 * the C locale; we just use whatever strerror() returns */
6563
6564 errstr = savepv(Strerror(errnum));
6565
6566#else /* Has locale messages */
6567
6568 const bool within_locale_scope = IN_LC(LC_MESSAGES);
2c6ee1a7 6569
8751a11f 6570# ifndef USE_LOCALE_THREADS
7aaa36b1 6571
39e69e77
KW
6572 /* This function is trivial without threads. */
6573 if (within_locale_scope) {
f9ea7fff 6574 errstr = savepv(Strerror(errnum));
39e69e77
KW
6575 }
6576 else {
22392ba2 6577 const char * save_locale = querylocale_c(LC_MESSAGES);
39e69e77 6578
3980cddb 6579 void_setlocale_c(LC_MESSAGES, "C");
f9ea7fff 6580 errstr = savepv(Strerror(errnum));
3980cddb 6581 void_setlocale_c(LC_MESSAGES, save_locale);
39e69e77
KW
6582 }
6583
fe1c1494 6584# elif defined(USE_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
39e69e77
KW
6585
6586 /* This function is also trivial if we don't have to worry about thread
6587 * safety and have strerror_l(), as it handles the switch of locales so we
6588 * don't have to deal with that. We don't have to worry about thread
6589 * safety if strerror_r() is also available. Both it and strerror_l() are
6590 * thread-safe. Plain strerror() isn't thread safe. But on threaded
6591 * builds when strerror_r() is available, the apparent call to strerror()
6592 * below is actually a macro that behind-the-scenes calls strerror_r(). */
43cb6651 6593
39e69e77 6594# ifdef HAS_STRERROR_R
7aaa36b1
KW
6595
6596 if (within_locale_scope) {
f9ea7fff 6597 errstr = savepv(Strerror(errnum));
7aaa36b1
KW
6598 }
6599 else {
4eb27fc5 6600 errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
7aaa36b1
KW
6601 }
6602
43cb6651
KW
6603# else
6604
6605 /* Here we have strerror_l(), but not strerror_r() and we are on a
6606 * threaded-build. We use strerror_l() for everything, constructing a
6607 * locale to pass to it if necessary */
6608
43cb6651
KW
6609 locale_t locale_to_use;
6610
6611 if (within_locale_scope) {
827a4a05 6612 locale_to_use = use_curlocale_scratch();
43cb6651
KW
6613 }
6614 else { /* Use C locale if not within 'use locale' scope */
6615 locale_to_use = PL_C_locale_obj;
6616 }
6617
6618 errstr = savepv(strerror_l(errnum, locale_to_use));
6619
43cb6651
KW
6620# endif
6621# else /* Doesn't have strerror_l() */
7aaa36b1 6622
8de4332b 6623 const char * save_locale = NULL;
c9dda6da 6624 bool locale_is_C = FALSE;
2c6ee1a7 6625
9de6fe47 6626 /* We have a critical section to prevent another thread from executing this
d2be42af
KW
6627 * same code at the same time which could cause LC_MESSAGES to be changed
6628 * to something else while we need it to be constant. (On thread-safe
6629 * perls, the LOCK is a no-op.) Since this is the only place in core that
6630 * changes LC_MESSAGES (unless the user has called setlocale()), this works
6631 * to prevent races. */
7953f73f 6632 SETLOCALE_LOCK;
6ebbc862 6633
9c8a6dc2
KW
6634 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6635 "my_strerror called with errnum %d\n", errnum));
d2be42af
KW
6636
6637 /* If not within locale scope, need to return messages in the C locale */
6638 if (within_locale_scope) {
6639 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "WITHIN locale scope\n"));
6640 }
6641 else {
a683a66a 6642 save_locale = querylocale_c(LC_MESSAGES);
c9dda6da 6643 if (! save_locale) {
d523eeeb 6644 SETLOCALE_UNLOCK;
20ad7b23 6645 locale_panic_("Could not find current LC_MESSAGES locale");
d2be42af 6646 NOT_REACHED; /* NOTREACHED */ \
c9dda6da 6647 }
d2be42af
KW
6648
6649 locale_is_C = isNAME_C_OR_POSIX(save_locale);
6650
6651 /* Switch to the C locale if not already in it */
6652 if (! locale_is_C && ! bool_setlocale_c(LC_MESSAGES, "C")) {
6653
6654 /* If, for some reason, the locale change failed, we soldier on as
6655 * best as possible under the circumstances, using the current
6656 * locale, and clear save_locale, so we don't try to change back.
6657 * On z/0S, all setlocale() calls fail after you've created a
6658 * thread. This is their way of making sure the entire process is
6659 * always a single locale. This means that 'use locale' is always
6660 * in place for messages under these circumstances. */
6661 save_locale = NULL;
6ebbc862 6662 }
6ebbc862 6663 } /* end of ! within_locale_scope */
a0b53297 6664
9c8a6dc2
KW
6665 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6666 "Any locale change has been done; about to call Strerror\n"));
52770946 6667 errstr = savepv(Strerror(errnum));
6ebbc862 6668
d2be42af
KW
6669 /* Switch back if we successully switched */
6670 if ( save_locale
6671 && ! locale_is_C
6672 && ! bool_setlocale_c(LC_MESSAGES, save_locale))
6673 {
6674 SETLOCALE_UNLOCK;
6675 locale_panic_(Perl_form(aTHX_
6676 "setlocale restore to '%s' failed",
6677 save_locale));
6678 NOT_REACHED; /* NOTREACHED */ \
6ebbc862
KW
6679 }
6680
7953f73f 6681 SETLOCALE_UNLOCK;
6ebbc862 6682
7aaa36b1 6683# endif /* End of doesn't have strerror_l */
6affbbf0 6684
94d521c0
KW
6685 DEBUG_Lv((PerlIO_printf(Perl_debug_log,
6686 "Strerror returned; saving a copy: '"),
6687 print_bytes_for_locale(errstr, errstr + strlen(errstr), 0),
6688 PerlIO_printf(Perl_debug_log, "'\n")));
6affbbf0 6689
d36adde0 6690#endif /* End of does have locale messages */
2c6ee1a7 6691
52770946 6692 SAVEFREEPV(errstr);
6ebbc862 6693 return errstr;
2c6ee1a7
KW
6694}
6695
66610fdd 6696/*
747c467a 6697
58e641fb
KW
6698=for apidoc switch_to_global_locale
6699
75920755 6700On systems without locale support, or on typical single-threaded builds, or on
58e641fb
KW
6701platforms that do not support per-thread locale operations, this function does
6702nothing. On such systems that do have locale support, only a locale global to
6703the whole program is available.
6704
6705On multi-threaded builds on systems that do have per-thread locale operations,
6706this function converts the thread it is running in to use the global locale.
6707This is for code that has not yet or cannot be updated to handle multi-threaded
6708locale operation. As long as only a single thread is so-converted, everything
6709works fine, as all the other threads continue to ignore the global one, so only
6710this thread looks at it.
6711
69c5e0db
KW
6712However, on Windows systems this isn't quite true prior to Visual Studio 15,
6713at which point Microsoft fixed a bug. A race can occur if you use the
6714following operations on earlier Windows platforms:
6715
6716=over
6717
6718=item L<POSIX::localeconv|POSIX/localeconv>
6719
6720=item L<I18N::Langinfo>, items C<CRNCYSTR> and C<THOUSEP>
6721
6722=item L<perlapi/Perl_langinfo>, items C<CRNCYSTR> and C<THOUSEP>
6723
6724=back
6725
6726The first item is not fixable (except by upgrading to a later Visual Studio
6727release), but it would be possible to work around the latter two items by using
6728the Windows API functions C<GetNumberFormat> and C<GetCurrencyFormat>; patches
6729welcome.
6730
58e641fb
KW
6731Without this function call, threads that use the L<C<setlocale(3)>> system
6732function will not work properly, as all the locale-sensitive functions will
6733look at the per-thread locale, and C<setlocale> will have no effect on this
6734thread.
6735
6736Perl code should convert to either call
6737L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in for the system
6738C<setlocale>) or use the methods given in L<perlcall> to call
6739L<C<POSIX::setlocale>|POSIX/setlocale>. Either one will transparently properly
6740handle all cases of single- vs multi-thread, POSIX 2008-supported or not.
6741
6742Non-Perl libraries, such as C<gtk>, that call the system C<setlocale> can
6743continue to work if this function is called before transferring control to the
6744library.
6745
6746Upon return from the code that needs to use the global locale,
6747L<C<sync_locale()>|perlapi/sync_locale> should be called to restore the safe
6748multi-thread operation.
6749
6750=cut
6751*/
6752
6753void
6754Perl_switch_to_global_locale()
6755{
4bcc858c 6756 dTHX;
58e641fb
KW
6757
6758#ifdef USE_THREAD_SAFE_LOCALE
6759# ifdef WIN32
6760
6761 _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
6762
6763# else
58e641fb
KW
6764
6765 {
6766 unsigned int i;
6767
4fd76d49 6768 for (i = 0; i < LC_ALL_INDEX_; i++) {
a683a66a 6769 setlocale(categories[i], querylocale_i(i));
58e641fb
KW
6770 }
6771 }
6772
58e641fb
KW
6773 uselocale(LC_GLOBAL_LOCALE);
6774
6775# endif
6776#endif
6777
6778}
6779
6780/*
6781
747c467a
KW
6782=for apidoc sync_locale
6783
b2e9ba0c
KW
6784L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or
6785change the locale (though changing the locale is antisocial and dangerous on
6786multi-threaded systems that don't have multi-thread safe locale operations.
6787(See L<perllocale/Multi-threaded operation>). Using the system
6788L<C<setlocale(3)>> should be avoided. Nevertheless, certain non-Perl libraries
6789called from XS, such as C<Gtk> do so, and this can't be changed. When the
6790locale is changed by XS code that didn't use
6791L<C<Perl_setlocale>|perlapi/Perl_setlocale>, Perl needs to be told that the
6792locale has changed. Use this function to do so, before returning to Perl.
6793
6794The return value is a boolean: TRUE if the global locale at the time of call
6795was in effect; and FALSE if a per-thread locale was in effect. This can be
6796used by the caller that needs to restore things as-they-were to decide whether
6797or not to call
6798L<C<Perl_switch_to_global_locale>|perlapi/switch_to_global_locale>.
747c467a
KW
6799
6800=cut
6801*/
6802
b2e9ba0c
KW
6803bool
6804Perl_sync_locale()
747c467a 6805{
d36adde0
KW
6806
6807#ifndef USE_LOCALE
6808
6809 return TRUE;
6810
6811#else
6812
e9bc6d6b 6813 const char * newlocale;
b2e9ba0c
KW
6814 dTHX;
6815
d36adde0 6816# ifdef USE_POSIX_2008_LOCALE
b2e9ba0c
KW
6817
6818 bool was_in_global_locale = FALSE;
6819 locale_t cur_obj = uselocale((locale_t) 0);
6820
6821 /* On Windows, unless the foreign code has turned off the thread-safe
6822 * locale setting, any plain setlocale() will have affected what we see, so
6823 * no need to worry. Otherwise, If the foreign code has done a plain
6824 * setlocale(), it will only affect the global locale on POSIX systems, but
6825 * will affect the */
6826 if (cur_obj == LC_GLOBAL_LOCALE) {
6827
d36adde0 6828# ifdef HAS_QUERY_LOCALE
b2e9ba0c 6829
a683a66a 6830 void_setlocale_c(LC_ALL, querylocale_c(LC_ALL));
b2e9ba0c 6831
d36adde0 6832# else
b2e9ba0c
KW
6833
6834 unsigned int i;
6835
6836 /* We can't trust that we can read the LC_ALL format on the
6837 * platform, so do them individually */
4fd76d49 6838 for (i = 0; i < LC_ALL_INDEX_; i++) {
3980cddb 6839 void_setlocale_i(i, querylocale_i(i));
b2e9ba0c 6840 }
747c467a 6841
d36adde0 6842# endif
b2e9ba0c
KW
6843
6844 was_in_global_locale = TRUE;
6845 }
6846
d36adde0 6847# else
b2e9ba0c
KW
6848
6849 bool was_in_global_locale = TRUE;
6850
d36adde0
KW
6851# endif
6852# ifdef USE_LOCALE_CTYPE
7d4bcc4a 6853
22392ba2 6854 newlocale = querylocale_c(LC_CTYPE);
9f82ea3e 6855 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b26541a9 6856 "%s\n", setlocale_debug_string_c(LC_CTYPE, NULL, newlocale)));
9f82ea3e 6857 new_ctype(newlocale);
747c467a 6858
d36adde0
KW
6859# endif /* USE_LOCALE_CTYPE */
6860# ifdef USE_LOCALE_COLLATE
7d4bcc4a 6861
22392ba2 6862 newlocale = querylocale_c(LC_COLLATE);
9f82ea3e 6863 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b26541a9 6864 "%s\n", setlocale_debug_string_c(LC_COLLATE, NULL, newlocale)));
9f82ea3e 6865 new_collate(newlocale);
747c467a 6866
d36adde0
KW
6867# endif
6868# ifdef USE_LOCALE_NUMERIC
7d4bcc4a 6869
22392ba2 6870 newlocale = querylocale_c(LC_NUMERIC);
9f82ea3e 6871 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b26541a9 6872 "%s\n", setlocale_debug_string_c(LC_NUMERIC, NULL, newlocale)));
9f82ea3e 6873 new_numeric(newlocale);
7d4bcc4a 6874
d36adde0 6875# endif /* USE_LOCALE_NUMERIC */
747c467a 6876
b2e9ba0c 6877 return was_in_global_locale;
d36adde0
KW
6878
6879#endif
6880
747c467a
KW
6881}
6882
5d1187d1
KW
6883#if defined(DEBUGGING) && defined(USE_LOCALE)
6884
a4f00dcc 6885STATIC char *
d188fe6d
KW
6886S_setlocale_debug_string_i(const unsigned cat_index,
6887 const char* const locale, /* Optional locale name */
5d1187d1
KW
6888
6889 /* return value from setlocale() when attempting to
6890 * set 'category' to 'locale' */
6891 const char* const retval)
6892{
6893 /* Returns a pointer to a NUL-terminated string in static storage with
6894 * added text about the info passed in. This is not thread safe and will
6895 * be overwritten by the next call, so this should be used just to
fa07b8e5 6896 * formulate a string to immediately print or savepv() on. */
5d1187d1 6897
73af6c07 6898 static char ret[1024];
d188fe6d 6899 assert(cat_index <= NOMINAL_LC_ALL_INDEX);
7d4bcc4a 6900
e5f10d49 6901 my_strlcpy(ret, "setlocale(", sizeof(ret));
d188fe6d 6902 my_strlcat(ret, category_names[cat_index], sizeof(ret));
fa07b8e5 6903 my_strlcat(ret, ", ", sizeof(ret));
5d1187d1
KW
6904
6905 if (locale) {
fa07b8e5
KW
6906 my_strlcat(ret, "\"", sizeof(ret));
6907 my_strlcat(ret, locale, sizeof(ret));
6908 my_strlcat(ret, "\"", sizeof(ret));
5d1187d1
KW
6909 }
6910 else {
fa07b8e5 6911 my_strlcat(ret, "NULL", sizeof(ret));
5d1187d1
KW
6912 }
6913
fa07b8e5 6914 my_strlcat(ret, ") returned ", sizeof(ret));
5d1187d1
KW
6915
6916 if (retval) {
fa07b8e5
KW
6917 my_strlcat(ret, "\"", sizeof(ret));
6918 my_strlcat(ret, retval, sizeof(ret));
6919 my_strlcat(ret, "\"", sizeof(ret));
5d1187d1
KW
6920 }
6921 else {
fa07b8e5 6922 my_strlcat(ret, "NULL", sizeof(ret));
5d1187d1
KW
6923 }
6924
6925 assert(strlen(ret) < sizeof(ret));
6926
6927 return ret;
6928}
6929
6930#endif
747c467a 6931
e9bc6d6b
KW
6932void
6933Perl_thread_locale_init()
6934{
6935 /* Called from a thread on startup*/
6936
6937#ifdef USE_THREAD_SAFE_LOCALE
6938
6939 dTHX_DEBUGGING;
6940
e9bc6d6b
KW
6941
6942 DEBUG_L(PerlIO_printf(Perl_debug_log,
b26541a9
KW
6943 "new thread, initial locale is %s; calling setlocale\n",
6944 setlocale(LC_ALL, NULL)));
e9bc6d6b
KW
6945
6946# ifdef WIN32
6947
fe1c1494 6948 /* On Windows, make sure new thread has per-thread locales enabled */
e9bc6d6b
KW
6949 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
6950
6951# else
6952
fe1c1494 6953 /* This thread starts off in the C locale */
e9bc6d6b
KW
6954 Perl_setlocale(LC_ALL, "C");
6955
6956# endif
6957#endif
6958
6959}
6960
6961void
6962Perl_thread_locale_term()
6963{
6964 /* Called from a thread as it gets ready to terminate */
6965
4b06b4d0 6966#ifdef USE_POSIX_2008_LOCALE
e9bc6d6b
KW
6967
6968 /* C starts the new thread in the global C locale. If we are thread-safe,
6969 * we want to not be in the global locale */
6970
e9bc6d6b
KW
6971 { /* Free up */
6972 locale_t cur_obj = uselocale(LC_GLOBAL_LOCALE);
e72200e7 6973 if (cur_obj != LC_GLOBAL_LOCALE && cur_obj != PL_C_locale_obj) {
e9bc6d6b
KW
6974 freelocale(cur_obj);
6975 }
6976 }
6977
e9bc6d6b
KW
6978#endif
6979
6980}
747c467a
KW
6981
6982/*
14d04a33 6983 * ex: set ts=8 sts=4 sw=4 et:
37442d52 6984 */