This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
locale.c: Remove many #if conditionals
[perl5.git] / locale.c
CommitLineData
98994639
HS
1/* locale.c
2 *
1129b882
NC
3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
98994639
HS
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550 12 * A Elbereth Gilthoniel,
cdad3b53 13 * silivren penna míriel
4ac71550 14 * o menel aglar elenath!
cdad3b53 15 * Na-chaered palan-díriel
4ac71550
TC
16 * o galadhremmin ennorath,
17 * Fanuilos, le linnathon
18 * nef aear, si nef aearon!
19 *
20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
98994639
HS
21 */
22
166f8a29
DM
23/* utility functions for handling locale-specific stuff like what
24 * character represents the decimal point.
0d071d52 25 *
7d4bcc4a
KW
26 * All C programs have an underlying locale. Perl code generally doesn't pay
27 * any attention to it except within the scope of a 'use locale'. For most
0d071d52
KW
28 * categories, it accomplishes this by just using different operations if it is
29 * in such scope than if not. However, various libc functions called by Perl
30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31 * are used to toggle between the current locale and the C locale depending on
a9ad02a8
KW
32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is
33 * switched to the C locale for outputting the message unless within the scope
34 * of 'use locale'.
166f8a29
DM
35 */
36
98994639
HS
37#include "EXTERN.h"
38#define PERL_IN_LOCALE_C
f7416781 39#include "perl_langinfo.h"
98994639
HS
40#include "perl.h"
41
a4af207c
JH
42#include "reentr.h"
43
2fcc0ca9
KW
44/* If the environment says to, we can output debugging information during
45 * initialization. This is done before option parsing, and before any thread
46 * creation, so can be a file-level static */
47#ifdef DEBUGGING
7d4bcc4a 48# ifdef PERL_GLOBAL_STRUCT
27cdc72e 49 /* no global syms allowed */
7d4bcc4a
KW
50# define debug_initialization 0
51# define DEBUG_INITIALIZATION_set(v)
52# else
2fcc0ca9 53static bool debug_initialization = FALSE;
7d4bcc4a
KW
54# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
55# endif
2fcc0ca9
KW
56#endif
57
ff1b739b
KW
58/* strlen() of a literal string constant. XXX We might want this more general,
59 * but using it in just this file for now */
60#define STRLENs(s) (sizeof("" s "") - 1)
61
8ef6e574
KW
62#ifdef USE_LOCALE
63
98994639 64/*
0d071d52
KW
65 * Standardize the locale name from a string returned by 'setlocale', possibly
66 * modifying that string.
98994639 67 *
0ef2a2b2 68 * The typical return value of setlocale() is either
98994639
HS
69 * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
70 * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
71 * (the space-separated values represent the various sublocales,
0ef2a2b2 72 * in some unspecified order). This is not handled by this function.
98994639
HS
73 *
74 * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
0ef2a2b2
KW
75 * which is harmful for further use of the string in setlocale(). This
76 * function removes the trailing new line and everything up through the '='
98994639
HS
77 *
78 */
79STATIC char *
80S_stdize_locale(pTHX_ char *locs)
81{
7452cf6a 82 const char * const s = strchr(locs, '=');
98994639
HS
83 bool okay = TRUE;
84
7918f24d
NC
85 PERL_ARGS_ASSERT_STDIZE_LOCALE;
86
8772537c
AL
87 if (s) {
88 const char * const t = strchr(s, '.');
98994639 89 okay = FALSE;
8772537c
AL
90 if (t) {
91 const char * const u = strchr(t, '\n');
92 if (u && (u[1] == 0)) {
93 const STRLEN len = u - s;
94 Move(s + 1, locs, len, char);
95 locs[len] = 0;
96 okay = TRUE;
98994639
HS
97 }
98 }
99 }
100
101 if (!okay)
102 Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
103
104 return locs;
105}
106
e5f10d49
KW
107/* Two parallel arrays; first the locale categories Perl uses on this system;
108 * the second array is their names. These arrays are in mostly arbitrary
109 * order. */
110
111const int categories[] = {
112
113# ifdef USE_LOCALE_NUMERIC
114 LC_NUMERIC,
115# endif
116# ifdef USE_LOCALE_CTYPE
117 LC_CTYPE,
118# endif
119# ifdef USE_LOCALE_COLLATE
120 LC_COLLATE,
121# endif
122# ifdef USE_LOCALE_TIME
123 LC_TIME,
124# endif
125# ifdef USE_LOCALE_MESSAGES
126 LC_MESSAGES,
127# endif
128# ifdef USE_LOCALE_MONETARY
129 LC_MONETARY,
130# endif
131# ifdef LC_ALL
132 LC_ALL,
133# endif
134 -1 /* Placeholder because C doesn't allow a
135 trailing comma, and it would get complicated
136 with all the #ifdef's */
137};
138
139/* The top-most real element is LC_ALL */
140
141const char * category_names[] = {
142
143# ifdef USE_LOCALE_NUMERIC
144 "LC_NUMERIC",
145# endif
146# ifdef USE_LOCALE_CTYPE
147 "LC_CTYPE",
148# endif
149# ifdef USE_LOCALE_COLLATE
150 "LC_COLLATE",
151# endif
152# ifdef USE_LOCALE_TIME
153 "LC_TIME",
154# endif
155# ifdef USE_LOCALE_MESSAGES
156 "LC_MESSAGES",
157# endif
158# ifdef USE_LOCALE_MONETARY
159 "LC_MONETARY",
160# endif
161# ifdef LC_ALL
162 "LC_ALL",
163# endif
164 NULL /* Placeholder */
165 };
166
167# ifdef LC_ALL
168
169 /* On systems with LC_ALL, it is kept in the highest index position. (-2
170 * to account for the final unused placeholder element.) */
171# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
172
173# else
174
175 /* On systems without LC_ALL, we pretend it is there, one beyond the real
176 * top element, hence in the unused placeholder element. */
177# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
178
179# endif
180
181/* Pretending there is an LC_ALL element just above allows us to avoid most
182 * special cases. Most loops through these arrays in the code below are
183 * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'. They will work
184 * on either type of system. But the code must be written to not access the
185 * element at 'NOMINAL_LC_ALL_INDEX' except on platforms that have it. */
186
8ef6e574
KW
187#endif
188
d2b24094
KW
189/* Windows requres a customized base-level setlocale() */
190# ifdef WIN32
191# define my_setlocale(cat, locale) win32_setlocale(cat, locale)
192# else
193# define my_setlocale(cat, locale) setlocale(cat, locale)
194# endif
195
837ce802
KW
196/* Just placeholders for now. "_c" is intended to be called when the category
197 * is a constant known at compile time; "_r", not known until run time */
d2b24094
KW
198# define do_setlocale_c(category, locale) my_setlocale(category, locale)
199# define do_setlocale_r(category, locale) my_setlocale(category, locale)
837ce802 200
a4f00dcc 201STATIC void
86799d2d 202S_set_numeric_radix(pTHX_ const bool use_locale)
98994639 203{
86799d2d
KW
204 /* If 'use_locale' is FALSE, set to use a dot for the radix character. If
205 * TRUE, use the radix character derived from the current locale */
7d4bcc4a 206
86799d2d
KW
207#if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \
208 || defined(HAS_NL_LANGINFO))
98994639 209
86799d2d
KW
210 /* We only set up the radix SV if we are to use a locale radix ... */
211 if (use_locale) {
212 const char * radix = my_nl_langinfo(PERL_RADIXCHAR, FALSE);
213 /* FALSE => already in dest locale */
214
215 /* ... and the character being used isn't a dot */
216 if (strNE(radix, ".")) {
217 if (PL_numeric_radix_sv) {
218 sv_setpv(PL_numeric_radix_sv, radix);
219 }
220 else {
221 PL_numeric_radix_sv = newSVpv(radix, 0);
222 }
223
224 if ( ! is_utf8_invariant_string(
225 (U8 *) SvPVX(PL_numeric_radix_sv), SvCUR(PL_numeric_radix_sv))
226 && is_utf8_string(
227 (U8 *) SvPVX(PL_numeric_radix_sv), SvCUR(PL_numeric_radix_sv))
c1284011 228 && _is_cur_LC_category_utf8(LC_NUMERIC))
28acfe03 229 {
86799d2d 230 SvUTF8_on(PL_numeric_radix_sv);
28acfe03 231 }
86799d2d
KW
232 goto done;
233 }
98994639 234 }
69014004 235
86799d2d
KW
236 SvREFCNT_dec(PL_numeric_radix_sv);
237 PL_numeric_radix_sv = NULL;
238
239 done: ;
240
241# ifdef DEBUGGING
7d4bcc4a 242
2fcc0ca9
KW
243 if (DEBUG_L_TEST || debug_initialization) {
244 PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
69014004 245 (PL_numeric_radix_sv)
37b7e435
KW
246 ? SvPVX(PL_numeric_radix_sv)
247 : "NULL",
248 (PL_numeric_radix_sv)
39eb7305 249 ? cBOOL(SvUTF8(PL_numeric_radix_sv))
2fcc0ca9
KW
250 : 0);
251 }
69014004 252
86799d2d
KW
253# endif
254#endif /* USE_LOCALE_NUMERIC and can find the radix char */
7d4bcc4a 255
98994639
HS
256}
257
a39edc4c
KW
258/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
259 * return of setlocale(), then this is extremely likely to be the C or POSIX
260 * locale. However, the output of setlocale() is documented to be opaque, but
261 * the odds are extremely small that it would return these two strings for some
262 * other locale. Note that VMS in these two locales includes many non-ASCII
263 * characters as controls and punctuation (below are hex bytes):
e70d440c
KW
264 * cntrl: 84-97 9B-9F
265 * punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
a39edc4c
KW
266 * Oddly, none there are listed as alphas, though some represent alphabetics
267 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
7d4bcc4a
KW
268#define isNAME_C_OR_POSIX(name) \
269 ( (name) != NULL \
270 && (( *(name) == 'C' && (*(name + 1)) == '\0') \
271 || strEQ((name), "POSIX")))
a39edc4c 272
98994639 273void
8772537c 274Perl_new_numeric(pTHX_ const char *newnum)
98994639 275{
7d4bcc4a
KW
276
277#ifndef USE_LOCALE_NUMERIC
278
279 PERL_UNUSED_ARG(newnum);
280
281#else
0d071d52
KW
282
283 /* Called after all libc setlocale() calls affecting LC_NUMERIC, to tell
284 * core Perl this and that 'newnum' is the name of the new locale.
285 * It installs this locale as the current underlying default.
286 *
287 * The default locale and the C locale can be toggled between by use of the
5792c642
KW
288 * set_numeric_underlying() and set_numeric_standard() functions, which
289 * should probably not be called directly, but only via macros like
0d071d52
KW
290 * SET_NUMERIC_STANDARD() in perl.h.
291 *
292 * The toggling is necessary mainly so that a non-dot radix decimal point
293 * character can be output, while allowing internal calculations to use a
294 * dot.
295 *
296 * This sets several interpreter-level variables:
bb304765 297 * PL_numeric_name The underlying locale's name: a copy of 'newnum'
892e6465 298 * PL_numeric_underlying A boolean indicating if the toggled state is such
7738054c
KW
299 * that the current locale is the program's underlying
300 * locale
301 * PL_numeric_standard An int indicating if the toggled state is such
302 * that the current locale is the C locale. If non-zero,
303 * it is in C; if > 1, it means it may not be toggled away
304 * from C.
0d071d52
KW
305 * Note that both of the last two variables can be true at the same time,
306 * if the underlying locale is C. (Toggling is a no-op under these
307 * circumstances.)
308 *
309 * Any code changing the locale (outside this file) should use
310 * POSIX::setlocale, which calls this function. Therefore this function
311 * should be called directly only from this file and from
312 * POSIX::setlocale() */
313
b03f34cf 314 char *save_newnum;
98994639
HS
315
316 if (! newnum) {
43c5f42d
NC
317 Safefree(PL_numeric_name);
318 PL_numeric_name = NULL;
98994639 319 PL_numeric_standard = TRUE;
892e6465 320 PL_numeric_underlying = TRUE;
98994639
HS
321 return;
322 }
323
b03f34cf 324 save_newnum = stdize_locale(savepv(newnum));
abe1abcf
KW
325
326 PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
892e6465 327 PL_numeric_underlying = TRUE;
abe1abcf 328
b03f34cf 329 if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
98994639 330 Safefree(PL_numeric_name);
b03f34cf 331 PL_numeric_name = save_newnum;
b03f34cf 332 }
abe1abcf
KW
333 else {
334 Safefree(save_newnum);
335 }
4c28b29c
KW
336
337 /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't
338 * have to worry about the radix being a non-dot. (Core operations that
339 * need the underlying locale change to it temporarily). */
340 set_numeric_standard();
341
98994639 342#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 343
98994639
HS
344}
345
346void
347Perl_set_numeric_standard(pTHX)
348{
7d4bcc4a 349
98994639 350#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 351
28c1bf33
KW
352 /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like
353 * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The
354 * macro avoids calling this routine if toggling isn't necessary according
355 * to our records (which could be wrong if some XS code has changed the
356 * locale behind our back) */
0d071d52 357
837ce802 358 do_setlocale_c(LC_NUMERIC, "C");
a9b8c0d8 359 PL_numeric_standard = TRUE;
892e6465 360 PL_numeric_underlying = isNAME_C_OR_POSIX(PL_numeric_name);
86799d2d 361 set_numeric_radix(0);
7d4bcc4a
KW
362
363# ifdef DEBUGGING
364
2fcc0ca9
KW
365 if (DEBUG_L_TEST || debug_initialization) {
366 PerlIO_printf(Perl_debug_log,
58e4a467 367 "LC_NUMERIC locale now is standard C\n");
2fcc0ca9 368 }
98994639 369
7d4bcc4a 370# endif
98994639 371#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 372
98994639
HS
373}
374
375void
5792c642 376Perl_set_numeric_underlying(pTHX)
98994639 377{
7d4bcc4a 378
98994639 379#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 380
28c1bf33 381 /* Toggle the LC_NUMERIC locale to the current underlying default. Most
7d4bcc4a
KW
382 * code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h
383 * instead of calling this directly. The macro avoids calling this routine
384 * if toggling isn't necessary according to our records (which could be
385 * wrong if some XS code has changed the locale behind our back) */
a9b8c0d8 386
837ce802 387 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
a9b8c0d8 388 PL_numeric_standard = isNAME_C_OR_POSIX(PL_numeric_name);
892e6465 389 PL_numeric_underlying = TRUE;
86799d2d 390 set_numeric_radix(1);
7d4bcc4a
KW
391
392# ifdef DEBUGGING
393
2fcc0ca9
KW
394 if (DEBUG_L_TEST || debug_initialization) {
395 PerlIO_printf(Perl_debug_log,
58e4a467 396 "LC_NUMERIC locale now is %s\n",
2fcc0ca9
KW
397 PL_numeric_name);
398 }
98994639 399
7d4bcc4a 400# endif
98994639 401#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 402
98994639
HS
403}
404
405/*
406 * Set up for a new ctype locale.
407 */
a4f00dcc
KW
408STATIC void
409S_new_ctype(pTHX_ const char *newctype)
98994639 410{
7d4bcc4a
KW
411
412#ifndef USE_LOCALE_CTYPE
413
414 PERL_ARGS_ASSERT_NEW_CTYPE;
415 PERL_UNUSED_ARG(newctype);
416 PERL_UNUSED_CONTEXT;
417
418#else
0d071d52
KW
419
420 /* Called after all libc setlocale() calls affecting LC_CTYPE, to tell
421 * core Perl this and that 'newctype' is the name of the new locale.
422 *
423 * This function sets up the folding arrays for all 256 bytes, assuming
424 * that tofold() is tolc() since fold case is not a concept in POSIX,
425 *
426 * Any code changing the locale (outside this file) should use
427 * POSIX::setlocale, which calls this function. Therefore this function
428 * should be called directly only from this file and from
429 * POSIX::setlocale() */
430
27da23d5 431 dVAR;
68067e4e 432 UV i;
98994639 433
7918f24d
NC
434 PERL_ARGS_ASSERT_NEW_CTYPE;
435
215c5139
KW
436 /* We will replace any bad locale warning with 1) nothing if the new one is
437 * ok; or 2) a new warning for the bad new locale */
438 if (PL_warn_locale) {
439 SvREFCNT_dec_NN(PL_warn_locale);
440 PL_warn_locale = NULL;
441 }
442
c1284011 443 PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
31f05a37
KW
444
445 /* A UTF-8 locale gets standard rules. But note that code still has to
446 * handle this specially because of the three problematic code points */
447 if (PL_in_utf8_CTYPE_locale) {
448 Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
449 }
450 else {
8c6180a9
KW
451 /* Assume enough space for every character being bad. 4 spaces each
452 * for the 94 printable characters that are output like "'x' "; and 5
453 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
454 * NUL */
455 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ];
456
cc9eaeb0
KW
457 /* Don't check for problems if we are suppressing the warnings */
458 bool check_for_problems = ckWARN_d(WARN_LOCALE)
459 || UNLIKELY(DEBUG_L_TEST);
8c6180a9
KW
460 bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
461 to start */
462 unsigned int bad_count = 0; /* Count of bad characters */
463
baa60164
KW
464 for (i = 0; i < 256; i++) {
465 if (isUPPER_LC((U8) i))
466 PL_fold_locale[i] = (U8) toLOWER_LC((U8) i);
467 else if (isLOWER_LC((U8) i))
468 PL_fold_locale[i] = (U8) toUPPER_LC((U8) i);
469 else
470 PL_fold_locale[i] = (U8) i;
8c6180a9
KW
471
472 /* If checking for locale problems, see if the native ASCII-range
473 * printables plus \n and \t are in their expected categories in
474 * the new locale. If not, this could mean big trouble, upending
475 * Perl's and most programs' assumptions, like having a
476 * metacharacter with special meaning become a \w. Fortunately,
477 * it's very rare to find locales that aren't supersets of ASCII
478 * nowadays. It isn't a problem for most controls to be changed
479 * into something else; we check only \n and \t, though perhaps \r
480 * could be an issue as well. */
7d4bcc4a 481 if ( check_for_problems
8c6180a9
KW
482 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
483 {
7d4bcc4a 484 if (( isALPHANUMERIC_A(i) && ! isALPHANUMERIC_LC(i))
8c6180a9
KW
485 || (isPUNCT_A(i) && ! isPUNCT_LC(i))
486 || (isBLANK_A(i) && ! isBLANK_LC(i))
487 || (i == '\n' && ! isCNTRL_LC(i)))
488 {
489 if (bad_count) { /* Separate multiple entries with a
490 blank */
491 bad_chars_list[bad_count++] = ' ';
492 }
493 bad_chars_list[bad_count++] = '\'';
494 if (isPRINT_A(i)) {
495 bad_chars_list[bad_count++] = (char) i;
496 }
497 else {
498 bad_chars_list[bad_count++] = '\\';
499 if (i == '\n') {
500 bad_chars_list[bad_count++] = 'n';
501 }
502 else {
503 assert(i == '\t');
504 bad_chars_list[bad_count++] = 't';
505 }
506 }
507 bad_chars_list[bad_count++] = '\'';
508 bad_chars_list[bad_count] = '\0';
509 }
510 }
511 }
512
7d4bcc4a
KW
513# ifdef MB_CUR_MAX
514
8c6180a9 515 /* We only handle single-byte locales (outside of UTF-8 ones; so if
d35fca5f 516 * this locale requires more than one byte, there are going to be
8c6180a9 517 * problems. */
9c8a6dc2
KW
518 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
519 "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
520 __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
521
ba1a4362
KW
522 if (check_for_problems && MB_CUR_MAX > 1
523
524 /* Some platforms return MB_CUR_MAX > 1 for even the "C"
525 * locale. Just assume that the implementation for them (plus
526 * for POSIX) is correct and the > 1 value is spurious. (Since
527 * these are specially handled to never be considered UTF-8
528 * locales, as long as this is the only problem, everything
529 * should work fine */
530 && strNE(newctype, "C") && strNE(newctype, "POSIX"))
531 {
8c6180a9
KW
532 multi_byte_locale = TRUE;
533 }
7d4bcc4a
KW
534
535# endif
8c6180a9
KW
536
537 if (bad_count || multi_byte_locale) {
780fcc9f 538 PL_warn_locale = Perl_newSVpvf(aTHX_
8c6180a9 539 "Locale '%s' may not work well.%s%s%s\n",
780fcc9f 540 newctype,
8c6180a9
KW
541 (multi_byte_locale)
542 ? " Some characters in it are not recognized by"
543 " Perl."
544 : "",
545 (bad_count)
546 ? "\nThe following characters (and maybe others)"
547 " may not have the same meaning as the Perl"
548 " program expects:\n"
549 : "",
550 (bad_count)
551 ? bad_chars_list
552 : ""
553 );
cc9eaeb0 554 /* If we are actually in the scope of the locale or are debugging,
bddebb56
KW
555 * output the message now. If not in that scope, we save the
556 * message to be output at the first operation using this locale,
557 * if that actually happens. Most programs don't use locales, so
558 * they are immune to bad ones. */
cc9eaeb0 559 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
780fcc9f
KW
560
561 /* We have to save 'newctype' because the setlocale() just
562 * below may destroy it. The next setlocale() further down
563 * should restore it properly so that the intermediate change
564 * here is transparent to this function's caller */
565 const char * const badlocale = savepv(newctype);
566
837ce802 567 do_setlocale_c(LC_CTYPE, "C");
780fcc9f
KW
568
569 /* The '0' below suppresses a bogus gcc compiler warning */
570 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
bddebb56 571
837ce802 572 do_setlocale_c(LC_CTYPE, badlocale);
c0f3a893 573 Safefree(badlocale);
bddebb56
KW
574
575 if (IN_LC(LC_CTYPE)) {
576 SvREFCNT_dec_NN(PL_warn_locale);
577 PL_warn_locale = NULL;
578 }
780fcc9f 579 }
baa60164 580 }
31f05a37 581 }
98994639
HS
582
583#endif /* USE_LOCALE_CTYPE */
7d4bcc4a 584
98994639
HS
585}
586
98994639 587void
2726666d
KW
588Perl__warn_problematic_locale()
589{
2726666d
KW
590
591#ifdef USE_LOCALE_CTYPE
592
5f04a188
KW
593 dTHX;
594
595 /* Internal-to-core function that outputs the message in PL_warn_locale,
596 * and then NULLS it. Should be called only through the macro
597 * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */
598
2726666d
KW
599 if (PL_warn_locale) {
600 /*GCC_DIAG_IGNORE(-Wformat-security); Didn't work */
601 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
602 SvPVX(PL_warn_locale),
603 0 /* dummy to avoid compiler warning */ );
604 /* GCC_DIAG_RESTORE; */
605 SvREFCNT_dec_NN(PL_warn_locale);
606 PL_warn_locale = NULL;
607 }
608
609#endif
610
611}
612
a4f00dcc
KW
613STATIC void
614S_new_collate(pTHX_ const char *newcoll)
98994639 615{
7d4bcc4a
KW
616
617#ifndef USE_LOCALE_COLLATE
618
619 PERL_UNUSED_ARG(newcoll);
620 PERL_UNUSED_CONTEXT;
621
622#else
0d071d52
KW
623
624 /* Called after all libc setlocale() calls affecting LC_COLLATE, to tell
625 * core Perl this and that 'newcoll' is the name of the new locale.
626 *
d35fca5f
KW
627 * The design of locale collation is that every locale change is given an
628 * index 'PL_collation_ix'. The first time a string particpates in an
629 * operation that requires collation while locale collation is active, it
630 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
631 * magic includes the collation index, and the transformation of the string
632 * by strxfrm(), q.v. That transformation is used when doing comparisons,
633 * instead of the string itself. If a string changes, the magic is
634 * cleared. The next time the locale changes, the index is incremented,
635 * and so we know during a comparison that the transformation is not
636 * necessarily still valid, and so is recomputed. Note that if the locale
637 * changes enough times, the index could wrap (a U32), and it is possible
638 * that a transformation would improperly be considered valid, leading to
639 * an unlikely bug */
0d071d52 640
98994639
HS
641 if (! newcoll) {
642 if (PL_collation_name) {
643 ++PL_collation_ix;
644 Safefree(PL_collation_name);
645 PL_collation_name = NULL;
646 }
647 PL_collation_standard = TRUE;
00bf60ca 648 is_standard_collation:
98994639
HS
649 PL_collxfrm_base = 0;
650 PL_collxfrm_mult = 2;
165a1c52 651 PL_in_utf8_COLLATE_locale = FALSE;
f28f4d2a 652 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 653 PL_strxfrm_max_cp = 0;
98994639
HS
654 return;
655 }
656
d35fca5f 657 /* If this is not the same locale as currently, set the new one up */
98994639
HS
658 if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
659 ++PL_collation_ix;
660 Safefree(PL_collation_name);
661 PL_collation_name = stdize_locale(savepv(newcoll));
a39edc4c 662 PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
00bf60ca
KW
663 if (PL_collation_standard) {
664 goto is_standard_collation;
665 }
98994639 666
165a1c52 667 PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
f28f4d2a 668 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 669 PL_strxfrm_max_cp = 0;
165a1c52 670
59c018b9
KW
671 /* A locale collation definition includes primary, secondary, tertiary,
672 * etc. weights for each character. To sort, the primary weights are
673 * used, and only if they compare equal, then the secondary weights are
674 * used, and only if they compare equal, then the tertiary, etc.
675 *
676 * strxfrm() works by taking the input string, say ABC, and creating an
677 * output transformed string consisting of first the primary weights,
678 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
679 * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
680 * may not have weights at every level. In our example, let's say B
681 * doesn't have a tertiary weight, and A doesn't have a secondary
682 * weight. The constructed string is then going to be
683 * A¹B¹C¹ B²C² A³C³ ....
684 * This has the desired effect that strcmp() will look at the secondary
685 * or tertiary weights only if the strings compare equal at all higher
686 * priority weights. The spaces shown here, like in
c342d20e 687 * "A¹B¹C¹ A²B²C² "
59c018b9
KW
688 * are not just for readability. In the general case, these must
689 * actually be bytes, which we will call here 'separator weights'; and
690 * they must be smaller than any other weight value, but since these
691 * are C strings, only the terminating one can be a NUL (some
692 * implementations may include a non-NUL separator weight just before
693 * the NUL). Implementations tend to reserve 01 for the separator
694 * weights. They are needed so that a shorter string's secondary
695 * weights won't be misconstrued as primary weights of a longer string,
696 * etc. By making them smaller than any other weight, the shorter
697 * string will sort first. (Actually, if all secondary weights are
698 * smaller than all primary ones, there is no need for a separator
699 * weight between those two levels, etc.)
700 *
701 * The length of the transformed string is roughly a linear function of
702 * the input string. It's not exactly linear because some characters
703 * don't have weights at all levels. When we call strxfrm() we have to
704 * allocate some memory to hold the transformed string. The
705 * calculations below try to find coefficients 'm' and 'b' for this
706 * locale so that m*x + b equals how much space we need, given the size
707 * of the input string in 'x'. If we calculate too small, we increase
708 * the size as needed, and call strxfrm() again, but it is better to
709 * get it right the first time to avoid wasted expensive string
710 * transformations. */
711
98994639 712 {
79f120c8
KW
713 /* We use the string below to find how long the tranformation of it
714 * is. Almost all locales are supersets of ASCII, or at least the
715 * ASCII letters. We use all of them, half upper half lower,
716 * because if we used fewer, we might hit just the ones that are
717 * outliers in a particular locale. Most of the strings being
718 * collated will contain a preponderance of letters, and even if
719 * they are above-ASCII, they are likely to have the same number of
720 * weight levels as the ASCII ones. It turns out that digits tend
721 * to have fewer levels, and some punctuation has more, but those
722 * are relatively sparse in text, and khw believes this gives a
723 * reasonable result, but it could be changed if experience so
724 * dictates. */
725 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
726 char * x_longer; /* Transformed 'longer' */
727 Size_t x_len_longer; /* Length of 'x_longer' */
728
729 char * x_shorter; /* We also transform a substring of 'longer' */
730 Size_t x_len_shorter;
731
a4a439fb 732 /* _mem_collxfrm() is used get the transformation (though here we
79f120c8
KW
733 * are interested only in its length). It is used because it has
734 * the intelligence to handle all cases, but to work, it needs some
735 * values of 'm' and 'b' to get it started. For the purposes of
736 * this calculation we use a very conservative estimate of 'm' and
737 * 'b'. This assumes a weight can be multiple bytes, enough to
738 * hold any UV on the platform, and there are 5 levels, 4 weight
739 * bytes, and a trailing NUL. */
740 PL_collxfrm_base = 5;
741 PL_collxfrm_mult = 5 * sizeof(UV);
742
743 /* Find out how long the transformation really is */
a4a439fb
KW
744 x_longer = _mem_collxfrm(longer,
745 sizeof(longer) - 1,
746 &x_len_longer,
747
748 /* We avoid converting to UTF-8 in the
749 * called function by telling it the
750 * string is in UTF-8 if the locale is a
751 * UTF-8 one. Since the string passed
752 * here is invariant under UTF-8, we can
753 * claim it's UTF-8 even though it isn't.
754 * */
755 PL_in_utf8_COLLATE_locale);
79f120c8
KW
756 Safefree(x_longer);
757
758 /* Find out how long the transformation of a substring of 'longer'
759 * is. Together the lengths of these transformations are
760 * sufficient to calculate 'm' and 'b'. The substring is all of
761 * 'longer' except the first character. This minimizes the chances
762 * of being swayed by outliers */
a4a439fb 763 x_shorter = _mem_collxfrm(longer + 1,
79f120c8 764 sizeof(longer) - 2,
a4a439fb
KW
765 &x_len_shorter,
766 PL_in_utf8_COLLATE_locale);
79f120c8
KW
767 Safefree(x_shorter);
768
769 /* If the results are nonsensical for this simple test, the whole
770 * locale definition is suspect. Mark it so that locale collation
771 * is not active at all for it. XXX Should we warn? */
772 if ( x_len_shorter == 0
773 || x_len_longer == 0
774 || x_len_shorter >= x_len_longer)
775 {
776 PL_collxfrm_mult = 0;
777 PL_collxfrm_base = 0;
778 }
779 else {
780 SSize_t base; /* Temporary */
781
782 /* We have both: m * strlen(longer) + b = x_len_longer
783 * m * strlen(shorter) + b = x_len_shorter;
784 * subtracting yields:
785 * m * (strlen(longer) - strlen(shorter))
786 * = x_len_longer - x_len_shorter
787 * But we have set things up so that 'shorter' is 1 byte smaller
788 * than 'longer'. Hence:
789 * m = x_len_longer - x_len_shorter
790 *
791 * But if something went wrong, make sure the multiplier is at
792 * least 1.
793 */
794 if (x_len_longer > x_len_shorter) {
795 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
796 }
797 else {
798 PL_collxfrm_mult = 1;
799 }
800
801 /* mx + b = len
802 * so: b = len - mx
803 * but in case something has gone wrong, make sure it is
804 * non-negative */
805 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
806 if (base < 0) {
807 base = 0;
808 }
809
810 /* Add 1 for the trailing NUL */
811 PL_collxfrm_base = base + 1;
812 }
58eebef2 813
7d4bcc4a
KW
814# ifdef DEBUGGING
815
58eebef2
KW
816 if (DEBUG_L_TEST || debug_initialization) {
817 PerlIO_printf(Perl_debug_log,
b07929e4
KW
818 "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
819 "x_len_longer=%zu,"
820 " collate multipler=%zu, collate base=%zu\n",
58eebef2
KW
821 __FILE__, __LINE__,
822 PL_in_utf8_COLLATE_locale,
823 x_len_shorter, x_len_longer,
824 PL_collxfrm_mult, PL_collxfrm_base);
825 }
7d4bcc4a
KW
826# endif
827
98994639
HS
828 }
829 }
830
831#endif /* USE_LOCALE_COLLATE */
7d4bcc4a 832
98994639
HS
833}
834
d2b24094 835#ifdef WIN32
b8cc575c 836
a4f00dcc 837STATIC char *
b8cc575c 838S_win32_setlocale(pTHX_ int category, const char* locale)
b385bb4d
KW
839{
840 /* This, for Windows, emulates POSIX setlocale() behavior. There is no
7d4bcc4a
KW
841 * difference between the two unless the input locale is "", which normally
842 * means on Windows to get the machine default, which is set via the
843 * computer's "Regional and Language Options" (or its current equivalent).
844 * In POSIX, it instead means to find the locale from the user's
845 * environment. This routine changes the Windows behavior to first look in
846 * the environment, and, if anything is found, use that instead of going to
847 * the machine default. If there is no environment override, the machine
848 * default is used, by calling the real setlocale() with "".
849 *
850 * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
851 * use the particular category's variable if set; otherwise to use the LANG
852 * variable. */
b385bb4d 853
175c4cf9 854 bool override_LC_ALL = FALSE;
89f7b9aa 855 char * result;
e5f10d49 856 unsigned int i;
89f7b9aa 857
b385bb4d 858 if (locale && strEQ(locale, "")) {
7d4bcc4a
KW
859
860# ifdef LC_ALL
861
b385bb4d
KW
862 locale = PerlEnv_getenv("LC_ALL");
863 if (! locale) {
e5f10d49
KW
864 if (category == LC_ALL) {
865 override_LC_ALL = TRUE;
866 }
867 else {
7d4bcc4a
KW
868
869# endif
7d4bcc4a 870
e5f10d49
KW
871 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
872 if (category == categories[i]) {
873 locale = PerlEnv_getenv(category_names[i]);
874 goto found_locale;
875 }
876 }
7d4bcc4a 877
b385bb4d 878 locale = PerlEnv_getenv("LANG");
481465ea 879 if (! locale) {
b385bb4d
KW
880 locale = "";
881 }
e5f10d49
KW
882
883 found_locale: ;
7d4bcc4a
KW
884
885# ifdef LC_ALL
886
e5f10d49 887 }
b385bb4d 888 }
7d4bcc4a
KW
889
890# endif
891
b385bb4d
KW
892 }
893
89f7b9aa 894 result = setlocale(category, locale);
bbc98134 895 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
a4f00dcc 896 setlocale_debug_string(category, locale, result)));
89f7b9aa 897
481465ea 898 if (! override_LC_ALL) {
89f7b9aa
KW
899 return result;
900 }
901
dfd77d7a 902 /* Here the input category was LC_ALL, and we have set it to what is in the
481465ea
KW
903 * LANG variable or the system default if there is no LANG. But these have
904 * lower priority than the other LC_foo variables, so override it for each
905 * one that is set. (If they are set to "", it means to use the same thing
906 * we just set LC_ALL to, so can skip) */
7d4bcc4a 907
e5f10d49
KW
908 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
909 result = PerlEnv_getenv(category_names[i]);
910 if (result && strNE(result, "")) {
911 setlocale(categories[i], result);
912 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
913 __FILE__, __LINE__,
914 setlocale_debug_string(categories[i], result, "not captured")));
915 }
89f7b9aa 916 }
7d4bcc4a 917
bbc98134
KW
918 result = setlocale(LC_ALL, NULL);
919 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
920 __FILE__, __LINE__,
a4f00dcc 921 setlocale_debug_string(LC_ALL, NULL, result)));
89f7b9aa 922
bbc98134 923 return result;
b385bb4d
KW
924}
925
926#endif
927
a4f00dcc
KW
928char *
929Perl_setlocale(int category, const char * locale)
930{
931 /* This wraps POSIX::setlocale() */
932
933 char * retval;
1159483a 934 char * newlocale;
a4f00dcc
KW
935 dTHX;
936
a4f00dcc
KW
937#ifdef USE_LOCALE_NUMERIC
938
939 /* A NULL locale means only query what the current one is. We
940 * have the LC_NUMERIC name saved, because we are normally switched
941 * into the C locale for it. Switch back so an LC_ALL query will yield
942 * the correct results; all other categories don't require special
943 * handling */
944 if (locale == NULL) {
945 if (category == LC_NUMERIC) {
946 return savepv(PL_numeric_name);
947 }
948
7d4bcc4a 949# ifdef LC_ALL
a4f00dcc
KW
950
951 else if (category == LC_ALL) {
952 SET_NUMERIC_UNDERLYING();
953 }
954
7d4bcc4a 955# endif
a4f00dcc
KW
956
957 }
958
959#endif
960
e5f10d49
KW
961 /* Save retval since subsequent setlocale() calls may overwrite it. */
962 retval = savepv(do_setlocale_r(category, locale));
a4f00dcc
KW
963
964 DEBUG_L(PerlIO_printf(Perl_debug_log,
965 "%s:%d: %s\n", __FILE__, __LINE__,
966 setlocale_debug_string(category, locale, retval)));
967 if (! retval) {
968 /* Should never happen that a query would return an error, but be
969 * sure and reset to C locale */
970 if (locale == 0) {
971 SET_NUMERIC_STANDARD();
972 }
7d4bcc4a 973
a4f00dcc
KW
974 return NULL;
975 }
976
a4f00dcc
KW
977 /* If locale == NULL, we are just querying the state, but may have switched
978 * to NUMERIC_UNDERLYING. Switch back before returning. */
979 if (locale == NULL) {
980 SET_NUMERIC_STANDARD();
981 return retval;
982 }
a4f00dcc 983
1159483a
KW
984 /* Now that have switched locales, we have to update our records to
985 * correspond. */
a4f00dcc 986
1159483a 987 switch (category) {
a4f00dcc 988
1159483a 989#ifdef USE_LOCALE_CTYPE
a4f00dcc 990
1159483a
KW
991 case LC_CTYPE:
992 new_ctype(retval);
993 break;
a4f00dcc 994
1159483a 995#endif
a4f00dcc
KW
996#ifdef USE_LOCALE_COLLATE
997
1159483a
KW
998 case LC_COLLATE:
999 new_collate(retval);
1000 break;
a4f00dcc 1001
1159483a 1002#endif
a4f00dcc
KW
1003#ifdef USE_LOCALE_NUMERIC
1004
1159483a
KW
1005 case LC_NUMERIC:
1006 new_numeric(retval);
1007 break;
a4f00dcc 1008
1159483a
KW
1009#endif
1010#ifdef LC_ALL
a4f00dcc 1011
1159483a 1012 case LC_ALL:
a4f00dcc 1013
1159483a
KW
1014 /* LC_ALL updates all the things we care about. The values may not
1015 * be the same as 'retval', as the locale "" may have set things
1016 * individually */
a4f00dcc 1017
1159483a 1018# ifdef USE_LOCALE_CTYPE
a4f00dcc 1019
1159483a
KW
1020 newlocale = do_setlocale_c(LC_CTYPE, NULL);
1021 new_ctype(newlocale);
a4f00dcc 1022
1159483a
KW
1023# endif /* USE_LOCALE_CTYPE */
1024# ifdef USE_LOCALE_COLLATE
1025
1026 newlocale = do_setlocale_c(LC_COLLATE, NULL);
1027 new_collate(newlocale);
a4f00dcc 1028
7d4bcc4a 1029# endif
1159483a 1030# ifdef USE_LOCALE_NUMERIC
a4f00dcc 1031
1159483a
KW
1032 newlocale = do_setlocale_c(LC_NUMERIC, NULL);
1033 new_numeric(newlocale);
a4f00dcc 1034
1159483a
KW
1035# endif /* USE_LOCALE_NUMERIC */
1036#endif /* LC_ALL */
a4f00dcc 1037
1159483a
KW
1038 default:
1039 break;
a4f00dcc
KW
1040 }
1041
1042 return retval;
1043
f7416781
KW
1044
1045}
1046
1047PERL_STATIC_INLINE const char *
1048S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset)
1049{
1050 /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size',
1051 * growing it if necessary */
1052
1053 const Size_t string_size = strlen(string) + offset + 1;
1054
1055 PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
1056
1057 if (*buf_size == 0) {
1058 Newx(*buf, string_size, char);
1059 *buf_size = string_size;
1060 }
1061 else if (string_size > *buf_size) {
1062 Renew(*buf, string_size, char);
1063 *buf_size = string_size;
1064 }
1065
1066 Copy(string, *buf + offset, string_size - offset, char);
1067 return *buf;
1068}
1069
1070/*
1071
1072=head1 Locale-related functions and macros
1073
1074=for apidoc Perl_langinfo
1075
7d4bcc4a 1076This is an (almost ª) drop-in replacement for the system C<L<nl_langinfo(3)>>,
f7416781
KW
1077taking the same C<item> parameter values, and returning the same information.
1078But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
1079of Perl's locale handling from your code, and can be used on systems that lack
1080a native C<nl_langinfo>.
1081
1082Expanding on these:
1083
1084=over
1085
1086=item *
1087
1088It delivers the correct results for the C<RADIXCHAR> and C<THOUSESEP> items,
1089without you having to write extra code. The reason for the extra code would be
1090because these are from the C<LC_NUMERIC> locale category, which is normally
1091kept set to the C locale by Perl, no matter what the underlying locale is
1092supposed to be, and so to get the expected results, you have to temporarily
1093toggle into the underlying locale, and later toggle back. (You could use
1094plain C<nl_langinfo> and C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this
1095but then you wouldn't get the other advantages of C<Perl_langinfo()>; not
1096keeping C<LC_NUMERIC> in the C locale would break a lot of CPAN, which is
1097expecting the radix (decimal point) character to be a dot.)
1098
1099=item *
1100
1101Depending on C<item>, it works on systems that don't have C<nl_langinfo>, hence
1102makes your code more portable. Of the fifty-some possible items specified by
1103the POSIX 2008 standard,
1104L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
1105only two are completely unimplemented. It uses various techniques to recover
1106the other items, including calling C<L<localeconv(3)>>, and C<L<strftime(3)>>,
1107both of which are specified in C89, so should be always be available. Later
1108C<strftime()> versions have additional capabilities; C<""> is returned for
1109those not available on your system.
1110
1111The details for those items which may differ from what this emulation returns
1112and what a native C<nl_langinfo()> would return are:
1113
1114=over
1115
1116=item C<CODESET>
1117
1118=item C<ERA>
1119
1120Unimplemented, so returns C<"">.
1121
1122=item C<YESEXPR>
1123
1124=item C<NOEXPR>
1125
1126Only the values for English are returned. Earlier POSIX standards also
1127specified C<YESSTR> and C<NOSTR>, but these have been removed from POSIX 2008,
1128and aren't supported by C<Perl_langinfo>.
1129
1130=item C<D_FMT>
1131
1132Always evaluates to C<%x>, the locale's appropriate date representation.
1133
1134=item C<T_FMT>
1135
1136Always evaluates to C<%X>, the locale's appropriate time representation.
1137
1138=item C<D_T_FMT>
1139
1140Always evaluates to C<%c>, the locale's appropriate date and time
1141representation.
1142
1143=item C<CRNCYSTR>
1144
1145The return may be incorrect for those rare locales where the currency symbol
1146replaces the radix character.
1147Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1148to work differently.
1149
1150=item C<ALT_DIGITS>
1151
1152Currently this gives the same results as Linux does.
1153Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1154to work differently.
1155
1156=item C<ERA_D_FMT>
1157
1158=item C<ERA_T_FMT>
1159
1160=item C<ERA_D_T_FMT>
1161
1162=item C<T_FMT_AMPM>
1163
1164These are derived by using C<strftime()>, and not all versions of that function
1165know about them. C<""> is returned for these on such systems.
1166
1167=back
1168
1169When using C<Perl_langinfo> on systems that don't have a native
1170C<nl_langinfo()>, you must
1171
1172 #include "perl_langinfo.h"
1173
1174before the C<perl.h> C<#include>. You can replace your C<langinfo.h>
1175C<#include> with this one. (Doing it this way keeps out the symbols that plain
1176C<langinfo.h> imports into the namespace for code that doesn't need it.)
1177
1178You also should not use the bare C<langinfo.h> item names, but should preface
1179them with C<PERL_>, so use C<PERL_RADIXCHAR> instead of plain C<RADIXCHAR>.
1180The C<PERL_I<foo>> versions will also work for this function on systems that do
1181have a native C<nl_langinfo>.
1182
1183=item *
1184
1185It is thread-friendly, returning its result in a buffer that won't be
1186overwritten by another thread, so you don't have to code for that possibility.
1187The buffer can be overwritten by the next call to C<nl_langinfo> or
1188C<Perl_langinfo> in the same thread.
1189
1190=item *
1191
7d4bcc4a 1192ª It returns S<C<const char *>>, whereas plain C<nl_langinfo()> returns S<C<char
f7416781
KW
1193*>>, but you are (only by documentation) forbidden to write into the buffer.
1194By declaring this C<const>, the compiler enforces this restriction. The extra
1195C<const> is why this isn't an unequivocal drop-in replacement for
1196C<nl_langinfo>.
1197
1198=back
1199
1200The original impetus for C<Perl_langinfo()> was so that code that needs to
1201find out the current currency symbol, floating point radix character, or digit
1202grouping separator can use, on all systems, the simpler and more
1203thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
1204pain to make thread-friendly. For other fields returned by C<localeconv>, it
1205is better to use the methods given in L<perlcall> to call
1206L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
1207
1208=cut
1209
1210*/
1211
1212const char *
1213#ifdef HAS_NL_LANGINFO
1214Perl_langinfo(const nl_item item)
1215#else
1216Perl_langinfo(const int item)
1217#endif
1218{
f61748ac
KW
1219 return my_nl_langinfo(item, TRUE);
1220}
1221
1222const char *
1223#ifdef HAS_NL_LANGINFO
1224S_my_nl_langinfo(const nl_item item, bool toggle)
1225#else
1226S_my_nl_langinfo(const int item, bool toggle)
1227#endif
1228{
ae74815b 1229 dTHX;
f7416781 1230
ab340fff
KW
1231#if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
1232#if ! defined(HAS_POSIX_2008_LOCALE)
f7416781 1233
ab340fff 1234 /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
ae74815b
KW
1235 * for those items dependent on it. This must be copied to a buffer before
1236 * switching back, as some systems destroy the buffer when setlocale() is
1237 * called */
f7416781
KW
1238
1239 LOCALE_LOCK;
1240
1241 if (toggle) {
1242 if (item == PERL_RADIXCHAR || item == PERL_THOUSEP) {
837ce802 1243 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
f7416781
KW
1244 }
1245 else {
1246 toggle = FALSE;
1247 }
1248 }
1249
1250 save_to_buffer(nl_langinfo(item), &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1251
1252 if (toggle) {
837ce802 1253 do_setlocale_c(LC_NUMERIC, "C");
f7416781
KW
1254 }
1255
1256 LOCALE_UNLOCK;
1257
1258 return PL_langinfo_buf;
1259
ab340fff
KW
1260# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */
1261
1262 bool do_free = FALSE;
1263 locale_t cur = uselocale((locale_t) 0);
1264
1265 if (cur == LC_GLOBAL_LOCALE) {
1266 cur = duplocale(LC_GLOBAL_LOCALE);
1267 do_free = TRUE;
1268 }
1269
1270 if ( toggle
1271 && (item == PERL_RADIXCHAR || item == PERL_THOUSEP))
1272 {
1273 cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
1274 do_free = TRUE;
1275 }
1276
1277 save_to_buffer(nl_langinfo_l(item, cur),
1278 &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1279 if (do_free) {
1280 freelocale(cur);
1281 }
1282
1283 return PL_langinfo_buf;
1284
1285# endif
f7416781 1286#else /* Below, emulate nl_langinfo as best we can */
f7416781
KW
1287# ifdef HAS_LOCALECONV
1288
1289 const struct lconv* lc;
1290
1291# endif
1292# ifdef HAS_STRFTIME
1293
1294 struct tm tm;
1295 bool return_format = FALSE; /* Return the %format, not the value */
1296 const char * format;
1297
1298# endif
1299
1300 /* We copy the results to a per-thread buffer, even if not multi-threaded.
1301 * This is in part to simplify this code, and partly because we need a
1302 * buffer anyway for strftime(), and partly because a call of localeconv()
1303 * could otherwise wipe out the buffer, and the programmer would not be
1304 * expecting this, as this is a nl_langinfo() substitute after all, so s/he
1305 * might be thinking their localeconv() is safe until another localeconv()
1306 * call. */
1307
1308 switch (item) {
1309 Size_t len;
1310 const char * retval;
1311
1312 /* These 2 are unimplemented */
1313 case PERL_CODESET:
1314 case PERL_ERA: /* For use with strftime() %E modifier */
1315
1316 default:
1317 return "";
1318
1319 /* We use only an English set, since we don't know any more */
1320 case PERL_YESEXPR: return "^[+1yY]";
1321 case PERL_NOEXPR: return "^[-0nN]";
1322
1323# ifdef HAS_LOCALECONV
1324
1325 case PERL_CRNCYSTR:
1326
1327 LOCALE_LOCK;
1328
1329 lc = localeconv();
1330 if (! lc || ! lc->currency_symbol || strEQ("", lc->currency_symbol))
1331 {
1332 LOCALE_UNLOCK;
1333 return "";
1334 }
1335
1336 /* Leave the first spot empty to be filled in below */
1337 save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
1338 &PL_langinfo_bufsize, 1);
1339 if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
1340 { /* khw couldn't figure out how the localedef specifications
1341 would show that the $ should replace the radix; this is
1342 just a guess as to how it might work.*/
1343 *PL_langinfo_buf = '.';
1344 }
1345 else if (lc->p_cs_precedes) {
1346 *PL_langinfo_buf = '-';
1347 }
1348 else {
1349 *PL_langinfo_buf = '+';
1350 }
1351
1352 LOCALE_UNLOCK;
1353 break;
1354
1355 case PERL_RADIXCHAR:
1356 case PERL_THOUSEP:
1357
1358 LOCALE_LOCK;
1359
1360 if (toggle) {
837ce802 1361 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
f7416781
KW
1362 }
1363
1364 lc = localeconv();
1365 if (! lc) {
1366 retval = "";
1367 }
1368 else switch (item) {
1369 case PERL_RADIXCHAR:
1370 if (! lc->decimal_point) {
1371 retval = "";
1372 }
1373 else {
1374 retval = lc->decimal_point;
1375 }
1376 break;
1377
1378 case PERL_THOUSEP:
1379 if (! lc->thousands_sep || strEQ("", lc->thousands_sep)) {
1380 retval = "";
1381 }
1382 else {
1383 retval = lc->thousands_sep;
1384 }
1385 break;
1386
1387 default:
1388 LOCALE_UNLOCK;
1389 Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
1390 __FILE__, __LINE__, item);
1391 }
1392
1393 save_to_buffer(retval, &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1394
1395 if (toggle) {
837ce802 1396 do_setlocale_c(LC_NUMERIC, "C");
f7416781
KW
1397 }
1398
1399 LOCALE_UNLOCK;
1400
1401 break;
1402
1403# endif
1404# ifdef HAS_STRFTIME
1405
1406 /* These are defined by C89, so we assume that strftime supports them,
1407 * and so are returned unconditionally; they may not be what the locale
1408 * actually says, but should give good enough results for someone using
1409 * them as formats (as opposed to trying to parse them to figure out
7d4bcc4a 1410 * what the locale says). The other format items are actually tested to
f7416781
KW
1411 * verify they work on the platform */
1412 case PERL_D_FMT: return "%x";
1413 case PERL_T_FMT: return "%X";
1414 case PERL_D_T_FMT: return "%c";
1415
1416 /* These formats are only available in later strfmtime's */
1417 case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT:
1418 case PERL_T_FMT_AMPM:
1419
1420 /* The rest can be gotten from most versions of strftime(). */
1421 case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3:
1422 case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6:
1423 case PERL_ABDAY_7:
1424 case PERL_ALT_DIGITS:
1425 case PERL_AM_STR: case PERL_PM_STR:
1426 case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3:
1427 case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6:
1428 case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9:
1429 case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12:
1430 case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4:
1431 case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7:
1432 case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4:
1433 case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8:
1434 case PERL_MON_9: case PERL_MON_10: case PERL_MON_11: case PERL_MON_12:
1435
1436 LOCALE_LOCK;
1437
1438 init_tm(&tm); /* Precaution against core dumps */
1439 tm.tm_sec = 30;
1440 tm.tm_min = 30;
1441 tm.tm_hour = 6;
1442 tm.tm_year = 2017 - 1900;
1443 tm.tm_wday = 0;
1444 tm.tm_mon = 0;
1445 switch (item) {
1446 default:
1447 LOCALE_UNLOCK;
1448 Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
1449 __FILE__, __LINE__, item);
1450 NOT_REACHED; /* NOTREACHED */
1451
1452 case PERL_PM_STR: tm.tm_hour = 18;
1453 case PERL_AM_STR:
1454 format = "%p";
1455 break;
1456
1457 case PERL_ABDAY_7: tm.tm_wday++;
1458 case PERL_ABDAY_6: tm.tm_wday++;
1459 case PERL_ABDAY_5: tm.tm_wday++;
1460 case PERL_ABDAY_4: tm.tm_wday++;
1461 case PERL_ABDAY_3: tm.tm_wday++;
1462 case PERL_ABDAY_2: tm.tm_wday++;
1463 case PERL_ABDAY_1:
1464 format = "%a";
1465 break;
1466
1467 case PERL_DAY_7: tm.tm_wday++;
1468 case PERL_DAY_6: tm.tm_wday++;
1469 case PERL_DAY_5: tm.tm_wday++;
1470 case PERL_DAY_4: tm.tm_wday++;
1471 case PERL_DAY_3: tm.tm_wday++;
1472 case PERL_DAY_2: tm.tm_wday++;
1473 case PERL_DAY_1:
1474 format = "%A";
1475 break;
1476
1477 case PERL_ABMON_12: tm.tm_mon++;
1478 case PERL_ABMON_11: tm.tm_mon++;
1479 case PERL_ABMON_10: tm.tm_mon++;
1480 case PERL_ABMON_9: tm.tm_mon++;
1481 case PERL_ABMON_8: tm.tm_mon++;
1482 case PERL_ABMON_7: tm.tm_mon++;
1483 case PERL_ABMON_6: tm.tm_mon++;
1484 case PERL_ABMON_5: tm.tm_mon++;
1485 case PERL_ABMON_4: tm.tm_mon++;
1486 case PERL_ABMON_3: tm.tm_mon++;
1487 case PERL_ABMON_2: tm.tm_mon++;
1488 case PERL_ABMON_1:
1489 format = "%b";
1490 break;
1491
1492 case PERL_MON_12: tm.tm_mon++;
1493 case PERL_MON_11: tm.tm_mon++;
1494 case PERL_MON_10: tm.tm_mon++;
1495 case PERL_MON_9: tm.tm_mon++;
1496 case PERL_MON_8: tm.tm_mon++;
1497 case PERL_MON_7: tm.tm_mon++;
1498 case PERL_MON_6: tm.tm_mon++;
1499 case PERL_MON_5: tm.tm_mon++;
1500 case PERL_MON_4: tm.tm_mon++;
1501 case PERL_MON_3: tm.tm_mon++;
1502 case PERL_MON_2: tm.tm_mon++;
1503 case PERL_MON_1:
1504 format = "%B";
1505 break;
1506
1507 case PERL_T_FMT_AMPM:
1508 format = "%r";
1509 return_format = TRUE;
1510 break;
1511
1512 case PERL_ERA_D_FMT:
1513 format = "%Ex";
1514 return_format = TRUE;
1515 break;
1516
1517 case PERL_ERA_T_FMT:
1518 format = "%EX";
1519 return_format = TRUE;
1520 break;
1521
1522 case PERL_ERA_D_T_FMT:
1523 format = "%Ec";
1524 return_format = TRUE;
1525 break;
1526
1527 case PERL_ALT_DIGITS:
1528 tm.tm_wday = 0;
1529 format = "%Ow"; /* Find the alternate digit for 0 */
1530 break;
1531 }
1532
1533 /* We can't use my_strftime() because it doesn't look at tm_wday */
1534 while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
1535 format, &tm))
1536 {
1537 /* A zero return means one of:
1538 * a) there wasn't enough space in PL_langinfo_buf
1539 * b) the format, like a plain %p, returns empty
1540 * c) it was an illegal format, though some implementations of
1541 * strftime will just return the illegal format as a plain
1542 * character sequence.
1543 *
1544 * To quickly test for case 'b)', try again but precede the
1545 * format with a plain character. If that result is still
1546 * empty, the problem is either 'a)' or 'c)' */
1547
1548 Size_t format_size = strlen(format) + 1;
1549 Size_t mod_size = format_size + 1;
1550 char * mod_format;
1551 char * temp_result;
1552
1553 Newx(mod_format, mod_size, char);
1554 Newx(temp_result, PL_langinfo_bufsize, char);
1555 *mod_format = '\a';
1556 my_strlcpy(mod_format + 1, format, mod_size);
1557 len = strftime(temp_result,
1558 PL_langinfo_bufsize,
1559 mod_format, &tm);
1560 Safefree(mod_format);
1561 Safefree(temp_result);
1562
1563 /* If 'len' is non-zero, it means that we had a case like %p
1564 * which means the current locale doesn't use a.m. or p.m., and
1565 * that is valid */
1566 if (len == 0) {
1567
1568 /* Here, still didn't work. If we get well beyond a
1569 * reasonable size, bail out to prevent an infinite loop. */
1570
1571 if (PL_langinfo_bufsize > 100 * format_size) {
1572 *PL_langinfo_buf = '\0';
1573 }
1574 else { /* Double the buffer size to retry; Add 1 in case
1575 original was 0, so we aren't stuck at 0. */
1576 PL_langinfo_bufsize *= 2;
1577 PL_langinfo_bufsize++;
1578 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
1579 continue;
1580 }
1581 }
1582
1583 break;
1584 }
1585
1586 /* Here, we got a result.
1587 *
1588 * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
1589 * alternate format for wday 0. If the value is the same as the
1590 * normal 0, there isn't an alternate, so clear the buffer. */
1591 if ( item == PERL_ALT_DIGITS
1592 && strEQ(PL_langinfo_buf, "0"))
1593 {
1594 *PL_langinfo_buf = '\0';
1595 }
1596
1597 /* ALT_DIGITS is problematic. Experiments on it showed that
1598 * strftime() did not always work properly when going from alt-9 to
1599 * alt-10. Only a few locales have this item defined, and in all
1600 * of them on Linux that khw was able to find, nl_langinfo() merely
1601 * returned the alt-0 character, possibly doubled. Most Unicode
1602 * digits are in blocks of 10 consecutive code points, so that is
1603 * sufficient information for those scripts, as we can infer alt-1,
1604 * alt-2, .... But for a Japanese locale, a CJK ideographic 0 is
1605 * returned, and the CJK digits are not in code point order, so you
1606 * can't really infer anything. The localedef for this locale did
1607 * specify the succeeding digits, so that strftime() works properly
1608 * on them, without needing to infer anything. But the
1609 * nl_langinfo() return did not give sufficient information for the
1610 * caller to understand what's going on. So until there is
1611 * evidence that it should work differently, this returns the alt-0
1612 * string for ALT_DIGITS.
1613 *
1614 * wday was chosen because its range is all a single digit. Things
1615 * like tm_sec have two digits as the minimum: '00' */
1616
1617 LOCALE_UNLOCK;
1618
1619 /* If to return the format, not the value, overwrite the buffer
1620 * with it. But some strftime()s will keep the original format if
1621 * illegal, so change those to "" */
1622 if (return_format) {
1623 if (strEQ(PL_langinfo_buf, format)) {
1624 *PL_langinfo_buf = '\0';
1625 }
1626 else {
1627 save_to_buffer(format, &PL_langinfo_buf,
1628 &PL_langinfo_bufsize, 0);
1629 }
1630 }
1631
1632 break;
1633
1634# endif
1635
1636 }
1637
1638 return PL_langinfo_buf;
1639
1640#endif
1641
a4f00dcc 1642}
b385bb4d 1643
98994639
HS
1644/*
1645 * Initialize locale awareness.
1646 */
1647int
1648Perl_init_i18nl10n(pTHX_ int printwarn)
1649{
0e92a118
KW
1650 /* printwarn is
1651 *
1652 * 0 if not to output warning when setup locale is bad
1653 * 1 if to output warning based on value of PERL_BADLANG
1654 * >1 if to output regardless of PERL_BADLANG
1655 *
1656 * returns
98994639 1657 * 1 = set ok or not applicable,
0e92a118
KW
1658 * 0 = fallback to a locale of lower priority
1659 * -1 = fallback to all locales failed, not even to the C locale
6b058d42
KW
1660 *
1661 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
1662 * set, debugging information is output.
1663 *
1664 * This looks more complicated than it is, mainly due to the #ifdefs.
1665 *
1666 * We try to set LC_ALL to the value determined by the environment. If
1667 * there is no LC_ALL on this platform, we try the individual categories we
1668 * know about. If this works, we are done.
1669 *
1670 * But if it doesn't work, we have to do something else. We search the
1671 * environment variables ourselves instead of relying on the system to do
1672 * it. We look at, in order, LC_ALL, LANG, a system default locale (if we
1673 * think there is one), and the ultimate fallback "C". This is all done in
1674 * the same loop as above to avoid duplicating code, but it makes things
7d4bcc4a
KW
1675 * more complex. The 'trial_locales' array is initialized with just one
1676 * element; it causes the behavior described in the paragraph above this to
1677 * happen. If that fails, we add elements to 'trial_locales', and do extra
1678 * loop iterations to cause the behavior described in this paragraph.
6b058d42
KW
1679 *
1680 * On Ultrix, the locale MUST come from the environment, so there is
1681 * preliminary code to set it. I (khw) am not sure that it is necessary,
1682 * and that this couldn't be folded into the loop, but barring any real
1683 * platforms to test on, it's staying as-is
1684 *
1685 * A slight complication is that in embedded Perls, the locale may already
1686 * be set-up, and we don't want to get it from the normal environment
1687 * variables. This is handled by having a special environment variable
1688 * indicate we're in this situation. We simply set setlocale's 2nd
1689 * parameter to be a NULL instead of "". That indicates to setlocale that
1690 * it is not to change anything, but to return the current value,
1691 * effectively initializing perl's db to what the locale already is.
1692 *
1693 * We play the same trick with NULL if a LC_ALL succeeds. We call
1694 * setlocale() on the individual categores with NULL to get their existing
1695 * values for our db, instead of trying to change them.
1696 * */
98994639 1697
0e92a118
KW
1698 int ok = 1;
1699
7d4bcc4a
KW
1700#ifndef USE_LOCALE
1701
1702 PERL_UNUSED_ARG(printwarn);
1703
1704#else /* USE_LOCALE */
7d4bcc4a
KW
1705# ifdef __GLIBC__
1706
175c4cf9 1707 const char * const language = savepv(PerlEnv_getenv("LANGUAGE"));
7d4bcc4a
KW
1708
1709# endif
65ebb059 1710
ccd65d51
KW
1711 /* NULL uses the existing already set up locale */
1712 const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
1713 ? NULL
1714 : "";
c3fcd832
KW
1715 const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */
1716 unsigned int trial_locales_count;
175c4cf9
KW
1717 const char * const lc_all = savepv(PerlEnv_getenv("LC_ALL"));
1718 const char * const lang = savepv(PerlEnv_getenv("LANG"));
98994639 1719 bool setlocale_failure = FALSE;
65ebb059 1720 unsigned int i;
175c4cf9
KW
1721
1722 /* A later getenv() could zap this, so only use here */
1723 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
1724
1725 const bool locwarn = (printwarn > 1
e5f10d49
KW
1726 || ( printwarn
1727 && ( ! bad_lang_use_once
22ff3130 1728 || (
e5f10d49
KW
1729 /* disallow with "" or "0" */
1730 *bad_lang_use_once
1731 && strNE("0", bad_lang_use_once)))));
0e92a118 1732 bool done = FALSE;
e5f10d49
KW
1733 char * sl_result[NOMINAL_LC_ALL_INDEX + 1]; /* setlocale() return vals;
1734 not copied so must be
1735 looked at immediately */
1736 char * curlocales[NOMINAL_LC_ALL_INDEX + 1]; /* current locale for given
1737 category; should have been
1738 copied so aren't volatile
1739 */
5d1187d1 1740 char * locale_param;
7d4bcc4a
KW
1741
1742# ifdef WIN32
1743
6bce99ee
JH
1744 /* In some systems you can find out the system default locale
1745 * and use that as the fallback locale. */
7d4bcc4a
KW
1746# define SYSTEM_DEFAULT_LOCALE
1747# endif
1748# ifdef SYSTEM_DEFAULT_LOCALE
1749
65ebb059 1750 const char *system_default_locale = NULL;
98994639 1751
7d4bcc4a
KW
1752# endif
1753# ifdef DEBUGGING
1754
8298454c 1755 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
7d4bcc4a
KW
1756
1757# define DEBUG_LOCALE_INIT(category, locale, result) \
2fcc0ca9
KW
1758 STMT_START { \
1759 if (debug_initialization) { \
1760 PerlIO_printf(Perl_debug_log, \
1761 "%s:%d: %s\n", \
1762 __FILE__, __LINE__, \
a4f00dcc 1763 setlocale_debug_string(category, \
2fcc0ca9
KW
1764 locale, \
1765 result)); \
1766 } \
1767 } STMT_END
2fcc0ca9 1768
7d4bcc4a
KW
1769# else
1770# define DEBUG_LOCALE_INIT(a,b,c)
1771# endif
1772
1773# ifndef LOCALE_ENVIRON_REQUIRED
1774
0e92a118 1775 PERL_UNUSED_VAR(done);
5d1187d1 1776 PERL_UNUSED_VAR(locale_param);
7d4bcc4a
KW
1777
1778# else
98994639
HS
1779
1780 /*
1781 * Ultrix setlocale(..., "") fails if there are no environment
1782 * variables from which to get a locale name.
1783 */
1784
7d4bcc4a
KW
1785# ifdef LC_ALL
1786
98994639 1787 if (lang) {
e5f10d49
KW
1788 sl_result[NOMINAL_LC_ALL_INDEX] = do_setlocale_c(LC_ALL, setlocale_init);
1789 DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result[NOMINAL_LC_ALL_INDEX]);
1790 if (sl_result[NOMINAL_LC_ALL_INDEX])
98994639
HS
1791 done = TRUE;
1792 else
1793 setlocale_failure = TRUE;
1794 }
5d1187d1 1795 if (! setlocale_failure) {
e5f10d49
KW
1796 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1797 locale_param = (! done && (lang || PerlEnv_getenv(category_names[i])))
1798 ? setlocale_init
1799 : NULL;
1800 sl_result[i] = do_setlocale_r(categories[i], locale_param);
1801 if (! sl_result[i]) {
1802 setlocale_failure = TRUE;
1803 }
1804 DEBUG_LOCALE_INIT(categories[i], locale_param, sl_result[i]);
c835d6be 1805 }
7d4bcc4a
KW
1806 }
1807
1808# endif /* LC_ALL */
e5f10d49 1809# endif /* LOCALE_ENVIRON_REQUIRED */
98994639 1810
65ebb059 1811 /* We try each locale in the list until we get one that works, or exhaust
20a240df
KW
1812 * the list. Normally the loop is executed just once. But if setting the
1813 * locale fails, inside the loop we add fallback trials to the array and so
1814 * will execute the loop multiple times */
c3fcd832
KW
1815 trial_locales[0] = setlocale_init;
1816 trial_locales_count = 1;
7d4bcc4a 1817
65ebb059
KW
1818 for (i= 0; i < trial_locales_count; i++) {
1819 const char * trial_locale = trial_locales[i];
1820
1821 if (i > 0) {
1822
1823 /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
1824 * when i==0, but I (khw) don't think that behavior makes much
1825 * sense */
1826 setlocale_failure = FALSE;
1827
7d4bcc4a
KW
1828# ifdef SYSTEM_DEFAULT_LOCALE
1829# ifdef WIN32
1830
65ebb059
KW
1831 /* On Windows machines, an entry of "" after the 0th means to use
1832 * the system default locale, which we now proceed to get. */
1833 if (strEQ(trial_locale, "")) {
1834 unsigned int j;
1835
1836 /* Note that this may change the locale, but we are going to do
1837 * that anyway just below */
837ce802 1838 system_default_locale = do_setlocale_c(LC_ALL, "");
5d1187d1 1839 DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
65ebb059 1840
7d4bcc4a 1841 /* Skip if invalid or if it's already on the list of locales to
65ebb059
KW
1842 * try */
1843 if (! system_default_locale) {
1844 goto next_iteration;
1845 }
1846 for (j = 0; j < trial_locales_count; j++) {
1847 if (strEQ(system_default_locale, trial_locales[j])) {
1848 goto next_iteration;
1849 }
1850 }
1851
1852 trial_locale = system_default_locale;
1853 }
7d4bcc4a
KW
1854# endif /* WIN32 */
1855# endif /* SYSTEM_DEFAULT_LOCALE */
65ebb059
KW
1856 }
1857
7d4bcc4a
KW
1858# ifdef LC_ALL
1859
e5f10d49
KW
1860 sl_result[NOMINAL_LC_ALL_INDEX] = do_setlocale_c(LC_ALL, trial_locale);
1861 DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result[NOMINAL_LC_ALL_INDEX]);
1862 if (! sl_result[NOMINAL_LC_ALL_INDEX]) {
49c85077 1863 setlocale_failure = TRUE;
7cd8b568
KW
1864 }
1865 else {
1866 /* Since LC_ALL succeeded, it should have changed all the other
1867 * categories it can to its value; so we massage things so that the
1868 * setlocales below just return their category's current values.
1869 * This adequately handles the case in NetBSD where LC_COLLATE may
1870 * not be defined for a locale, and setting it individually will
7d4bcc4a 1871 * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
7cd8b568
KW
1872 * the POSIX locale. */
1873 trial_locale = NULL;
1874 }
7d4bcc4a
KW
1875
1876# endif /* LC_ALL */
98994639 1877
e5f10d49
KW
1878 if (! setlocale_failure) {
1879 unsigned int j;
1880 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
1881 curlocales[j]
1882 = savepv(do_setlocale_r(categories[j], trial_locale));
1883 if (! curlocales[j]) {
1884 setlocale_failure = TRUE;
1885 }
1886 DEBUG_LOCALE_INIT(categories[j], trial_locale, curlocales[j]);
1887 }
c835d6be 1888
e5f10d49
KW
1889 if (! setlocale_failure) { /* All succeeded */
1890 break; /* Exit trial_locales loop */
49c85077 1891 }
65ebb059 1892 }
98994639 1893
49c85077
KW
1894 /* Here, something failed; will need to try a fallback. */
1895 ok = 0;
65ebb059 1896
49c85077
KW
1897 if (i == 0) {
1898 unsigned int j;
98994639 1899
65ebb059 1900 if (locwarn) { /* Output failure info only on the first one */
7d4bcc4a
KW
1901
1902# ifdef LC_ALL
98994639 1903
49c85077
KW
1904 PerlIO_printf(Perl_error_log,
1905 "perl: warning: Setting locale failed.\n");
98994639 1906
7d4bcc4a 1907# else /* !LC_ALL */
98994639 1908
49c85077
KW
1909 PerlIO_printf(Perl_error_log,
1910 "perl: warning: Setting locale failed for the categories:\n\t");
7d4bcc4a 1911
e5f10d49
KW
1912 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
1913 if (! curlocales[j]) {
1914 PerlIO_printf(Perl_error_log, category_names[j]);
1915 }
1916 else {
1917 Safefree(curlocales[j]);
1918 }
1919 }
7d4bcc4a 1920
a782673d 1921 PerlIO_printf(Perl_error_log, "and possibly others\n");
98994639 1922
7d4bcc4a 1923# endif /* LC_ALL */
98994639 1924
49c85077
KW
1925 PerlIO_printf(Perl_error_log,
1926 "perl: warning: Please check that your locale settings:\n");
98994639 1927
7d4bcc4a
KW
1928# ifdef __GLIBC__
1929
49c85077
KW
1930 PerlIO_printf(Perl_error_log,
1931 "\tLANGUAGE = %c%s%c,\n",
1932 language ? '"' : '(',
1933 language ? language : "unset",
1934 language ? '"' : ')');
7d4bcc4a 1935# endif
98994639 1936
49c85077
KW
1937 PerlIO_printf(Perl_error_log,
1938 "\tLC_ALL = %c%s%c,\n",
1939 lc_all ? '"' : '(',
1940 lc_all ? lc_all : "unset",
1941 lc_all ? '"' : ')');
98994639 1942
7d4bcc4a
KW
1943# if defined(USE_ENVIRON_ARRAY)
1944
49c85077 1945 {
cd999af9 1946 char **e;
d5e32b93
KW
1947
1948 /* Look through the environment for any variables of the
1949 * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
1950 * already handled above. These are assumed to be locale
1951 * settings. Output them and their values. */
cd999af9 1952 for (e = environ; *e; e++) {
d5e32b93
KW
1953 const STRLEN prefix_len = sizeof("LC_") - 1;
1954 STRLEN uppers_len;
1955
cd999af9 1956 if ( strBEGINs(*e, "LC_")
c8b388b0 1957 && ! strBEGINs(*e, "LC_ALL=")
d5e32b93
KW
1958 && (uppers_len = strspn(*e + prefix_len,
1959 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
1960 && ((*e)[prefix_len + uppers_len] == '='))
cd999af9
KW
1961 {
1962 PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
d5e32b93
KW
1963 (int) (prefix_len + uppers_len), *e,
1964 *e + prefix_len + uppers_len + 1);
cd999af9
KW
1965 }
1966 }
49c85077 1967 }
7d4bcc4a
KW
1968
1969# else
1970
49c85077
KW
1971 PerlIO_printf(Perl_error_log,
1972 "\t(possibly more locale environment variables)\n");
7d4bcc4a
KW
1973
1974# endif
98994639 1975
49c85077
KW
1976 PerlIO_printf(Perl_error_log,
1977 "\tLANG = %c%s%c\n",
1978 lang ? '"' : '(',
1979 lang ? lang : "unset",
1980 lang ? '"' : ')');
98994639 1981
49c85077
KW
1982 PerlIO_printf(Perl_error_log,
1983 " are supported and installed on your system.\n");
1984 }
98994639 1985
65ebb059 1986 /* Calculate what fallback locales to try. We have avoided this
f6bab5f6 1987 * until we have to, because failure is quite unlikely. This will
65ebb059
KW
1988 * usually change the upper bound of the loop we are in.
1989 *
1990 * Since the system's default way of setting the locale has not
1991 * found one that works, We use Perl's defined ordering: LC_ALL,
1992 * LANG, and the C locale. We don't try the same locale twice, so
1993 * don't add to the list if already there. (On POSIX systems, the
1994 * LC_ALL element will likely be a repeat of the 0th element "",
6b058d42
KW
1995 * but there's no harm done by doing it explicitly.
1996 *
1997 * Note that this tries the LC_ALL environment variable even on
1998 * systems which have no LC_ALL locale setting. This may or may
1999 * not have been originally intentional, but there's no real need
2000 * to change the behavior. */
65ebb059
KW
2001 if (lc_all) {
2002 for (j = 0; j < trial_locales_count; j++) {
2003 if (strEQ(lc_all, trial_locales[j])) {
2004 goto done_lc_all;
2005 }
2006 }
2007 trial_locales[trial_locales_count++] = lc_all;
2008 }
2009 done_lc_all:
98994639 2010
65ebb059
KW
2011 if (lang) {
2012 for (j = 0; j < trial_locales_count; j++) {
2013 if (strEQ(lang, trial_locales[j])) {
2014 goto done_lang;
2015 }
2016 }
2017 trial_locales[trial_locales_count++] = lang;
2018 }
2019 done_lang:
2020
7d4bcc4a
KW
2021# if defined(WIN32) && defined(LC_ALL)
2022
65ebb059
KW
2023 /* For Windows, we also try the system default locale before "C".
2024 * (If there exists a Windows without LC_ALL we skip this because
2025 * it gets too complicated. For those, the "C" is the next
2026 * fallback possibility). The "" is the same as the 0th element of
2027 * the array, but the code at the loop above knows to treat it
2028 * differently when not the 0th */
2029 trial_locales[trial_locales_count++] = "";
7d4bcc4a
KW
2030
2031# endif
65ebb059
KW
2032
2033 for (j = 0; j < trial_locales_count; j++) {
2034 if (strEQ("C", trial_locales[j])) {
2035 goto done_C;
2036 }
2037 }
2038 trial_locales[trial_locales_count++] = "C";
98994639 2039
65ebb059
KW
2040 done_C: ;
2041 } /* end of first time through the loop */
98994639 2042
7d4bcc4a
KW
2043# ifdef WIN32
2044
65ebb059 2045 next_iteration: ;
7d4bcc4a
KW
2046
2047# endif
65ebb059
KW
2048
2049 } /* end of looping through the trial locales */
2050
2051 if (ok < 1) { /* If we tried to fallback */
2052 const char* msg;
2053 if (! setlocale_failure) { /* fallback succeeded */
2054 msg = "Falling back to";
2055 }
2056 else { /* fallback failed */
e5f10d49 2057 unsigned int j;
98994639 2058
65ebb059
KW
2059 /* We dropped off the end of the loop, so have to decrement i to
2060 * get back to the value the last time through */
2061 i--;
98994639 2062
65ebb059
KW
2063 ok = -1;
2064 msg = "Failed to fall back to";
2065
2066 /* To continue, we should use whatever values we've got */
7d4bcc4a 2067
e5f10d49
KW
2068 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
2069 Safefree(curlocales[j]);
2070 curlocales[j] = savepv(do_setlocale_r(categories[j], NULL));
2071 DEBUG_LOCALE_INIT(categories[j], NULL, curlocales[j]);
2072 }
65ebb059
KW
2073 }
2074
2075 if (locwarn) {
2076 const char * description;
2077 const char * name = "";
2078 if (strEQ(trial_locales[i], "C")) {
2079 description = "the standard locale";
2080 name = "C";
2081 }
7d4bcc4a
KW
2082
2083# ifdef SYSTEM_DEFAULT_LOCALE
2084
65ebb059
KW
2085 else if (strEQ(trial_locales[i], "")) {
2086 description = "the system default locale";
2087 if (system_default_locale) {
2088 name = system_default_locale;
2089 }
2090 }
7d4bcc4a
KW
2091
2092# endif /* SYSTEM_DEFAULT_LOCALE */
2093
65ebb059
KW
2094 else {
2095 description = "a fallback locale";
2096 name = trial_locales[i];
2097 }
2098 if (name && strNE(name, "")) {
2099 PerlIO_printf(Perl_error_log,
2100 "perl: warning: %s %s (\"%s\").\n", msg, description, name);
2101 }
2102 else {
2103 PerlIO_printf(Perl_error_log,
2104 "perl: warning: %s %s.\n", msg, description);
2105 }
2106 }
2107 } /* End of tried to fallback */
98994639 2108
e5f10d49
KW
2109 /* Done with finding the locales; update our records */
2110
2111 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2112 switch (categories[i]) {
2113
7d4bcc4a
KW
2114# ifdef USE_LOCALE_CTYPE
2115
e5f10d49
KW
2116 case LC_CTYPE:
2117 new_ctype(curlocales[i]);
2118 break;
98994639 2119
e5f10d49 2120# endif
7d4bcc4a
KW
2121# ifdef USE_LOCALE_COLLATE
2122
e5f10d49
KW
2123 case LC_COLLATE:
2124 new_collate(curlocales[i]);
2125 break;
98994639 2126
e5f10d49 2127# endif
7d4bcc4a
KW
2128# ifdef USE_LOCALE_NUMERIC
2129
e5f10d49
KW
2130 case LC_NUMERIC:
2131 new_numeric(curlocales[i]);
2132 break;
2133
2134# endif
2135
2136 default: ;
2137 }
2138
2139 Safefree(curlocales[i]);
2140 }
b310b053 2141
7d4bcc4a
KW
2142# if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
2143
49c85077
KW
2144 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
2145 * locale is UTF-8. If PL_utf8locale and PL_unicode (set by -C or by
2146 * $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the
2147 * PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open
2148 * discipline. */
c1284011 2149 PL_utf8locale = _is_cur_LC_category_utf8(LC_CTYPE);
49c85077 2150
a05d7ebb 2151 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
fde18df1
JH
2152 This is an alternative to using the -C command line switch
2153 (the -C if present will override this). */
2154 {
dd374669 2155 const char *p = PerlEnv_getenv("PERL_UNICODE");
a05d7ebb 2156 PL_unicode = p ? parse_unicode_opts(&p) : 0;
5a22a2bb
NC
2157 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
2158 PL_utf8cache = -1;
b310b053
JH
2159 }
2160
7d4bcc4a 2161# endif
7d4bcc4a
KW
2162# ifdef __GLIBC__
2163
175c4cf9 2164 Safefree(language);
7d4bcc4a
KW
2165
2166# endif
175c4cf9
KW
2167
2168 Safefree(lc_all);
2169 Safefree(lang);
2170
e3305790 2171#endif /* USE_LOCALE */
2fcc0ca9 2172#ifdef DEBUGGING
7d4bcc4a 2173
2fcc0ca9 2174 /* So won't continue to output stuff */
27cdc72e 2175 DEBUG_INITIALIZATION_set(FALSE);
7d4bcc4a 2176
2fcc0ca9
KW
2177#endif
2178
98994639
HS
2179 return ok;
2180}
2181
98994639
HS
2182#ifdef USE_LOCALE_COLLATE
2183
a4a439fb 2184char *
a4a439fb
KW
2185Perl__mem_collxfrm(pTHX_ const char *input_string,
2186 STRLEN len, /* Length of 'input_string' */
2187 STRLEN *xlen, /* Set to length of returned string
2188 (not including the collation index
2189 prefix) */
2190 bool utf8 /* Is the input in UTF-8? */
6696cfa7 2191 )
98994639 2192{
a4a439fb
KW
2193
2194 /* _mem_collxfrm() is a bit like strxfrm() but with two important
2195 * differences. First, it handles embedded NULs. Second, it allocates a bit
2196 * more memory than needed for the transformed data itself. The real
55e5378d 2197 * transformed data begins at offset COLLXFRM_HDR_LEN. *xlen is set to
a4a439fb
KW
2198 * the length of that, and doesn't include the collation index size.
2199 * Please see sv_collxfrm() to see how this is used. */
2200
55e5378d
KW
2201#define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
2202
6696cfa7
KW
2203 char * s = (char *) input_string;
2204 STRLEN s_strlen = strlen(input_string);
79f120c8 2205 char *xbuf = NULL;
55e5378d 2206 STRLEN xAlloc; /* xalloc is a reserved word in VC */
17f41037 2207 STRLEN length_in_chars;
c664130f 2208 bool first_time = TRUE; /* Cleared after first loop iteration */
98994639 2209
a4a439fb
KW
2210 PERL_ARGS_ASSERT__MEM_COLLXFRM;
2211
2212 /* Must be NUL-terminated */
2213 assert(*(input_string + len) == '\0');
7918f24d 2214
79f120c8
KW
2215 /* If this locale has defective collation, skip */
2216 if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
c7202dee
KW
2217 DEBUG_L(PerlIO_printf(Perl_debug_log,
2218 "_mem_collxfrm: locale's collation is defective\n"));
79f120c8
KW
2219 goto bad;
2220 }
2221
6696cfa7
KW
2222 /* Replace any embedded NULs with the control that sorts before any others.
2223 * This will give as good as possible results on strings that don't
2224 * otherwise contain that character, but otherwise there may be
2225 * less-than-perfect results with that character and NUL. This is
fdc080f3 2226 * unavoidable unless we replace strxfrm with our own implementation. */
fd43f63c
KW
2227 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
2228 NUL */
6696cfa7
KW
2229 char * e = s + len;
2230 char * sans_nuls;
fdc080f3 2231 STRLEN sans_nuls_len;
94762aa0 2232 int try_non_controls;
afc4976f
KW
2233 char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
2234 making sure 2nd byte is NUL.
2235 */
2236 STRLEN this_replacement_len;
2237
1e4c9676
KW
2238 /* If we don't know what non-NUL control character sorts lowest for
2239 * this locale, find it */
f28f4d2a 2240 if (PL_strxfrm_NUL_replacement == '\0') {
6696cfa7 2241 int j;
afc4976f 2242 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
6696cfa7
KW
2243 includes the collation index
2244 prefixed. */
2245
91c0e2e0 2246 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
94762aa0
KW
2247
2248 /* Unlikely, but it may be that no control will work to replace
1e4c9676
KW
2249 * NUL, in which case we instead look for any character. Controls
2250 * are preferred because collation order is, in general, context
2251 * sensitive, with adjoining characters affecting the order, and
2252 * controls are less likely to have such interactions, allowing the
2253 * NUL-replacement to stand on its own. (Another way to look at it
2254 * is to imagine what would happen if the NUL were replaced by a
2255 * combining character; it wouldn't work out all that well.) */
94762aa0
KW
2256 for (try_non_controls = 0;
2257 try_non_controls < 2;
2258 try_non_controls++)
2259 {
d4ff9586
KW
2260 /* Look through all legal code points (NUL isn't) */
2261 for (j = 1; j < 256; j++) {
2262 char * x; /* j's xfrm plus collation index */
2263 STRLEN x_len; /* length of 'x' */
2264 STRLEN trial_len = 1;
736a4fed 2265 char cur_source[] = { '\0', '\0' };
d4ff9586 2266
736a4fed
KW
2267 /* Skip non-controls the first time through the loop. The
2268 * controls in a UTF-8 locale are the L1 ones */
afc4976f
KW
2269 if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
2270 ? ! isCNTRL_L1(j)
2271 : ! isCNTRL_LC(j))
2272 {
d4ff9586 2273 continue;
6696cfa7 2274 }
6696cfa7 2275
736a4fed
KW
2276 /* Create a 1-char string of the current code point */
2277 cur_source[0] = (char) j;
2278
d4ff9586
KW
2279 /* Then transform it */
2280 x = _mem_collxfrm(cur_source, trial_len, &x_len,
afc4976f 2281 0 /* The string is not in UTF-8 */);
6696cfa7 2282
1e4c9676 2283 /* Ignore any character that didn't successfully transform.
d4ff9586
KW
2284 * */
2285 if (! x) {
2286 continue;
2287 }
6696cfa7 2288
d4ff9586
KW
2289 /* If this character's transformation is lower than
2290 * the current lowest, this one becomes the lowest */
2291 if ( cur_min_x == NULL
2292 || strLT(x + COLLXFRM_HDR_LEN,
2293 cur_min_x + COLLXFRM_HDR_LEN))
2294 {
f28f4d2a 2295 PL_strxfrm_NUL_replacement = j;
d4ff9586 2296 cur_min_x = x;
d4ff9586
KW
2297 }
2298 else {
2299 Safefree(x);
2300 }
1e4c9676 2301 } /* end of loop through all 255 characters */
6696cfa7 2302
1e4c9676 2303 /* Stop looking if found */
94762aa0
KW
2304 if (cur_min_x) {
2305 break;
2306 }
2307
2308 /* Unlikely, but possible, if there aren't any controls that
2309 * work in the locale, repeat the loop, looking for any
2310 * character that works */
2311 DEBUG_L(PerlIO_printf(Perl_debug_log,
2312 "_mem_collxfrm: No control worked. Trying non-controls\n"));
1e4c9676 2313 } /* End of loop to try first the controls, then any char */
6696cfa7 2314
94762aa0
KW
2315 if (! cur_min_x) {
2316 DEBUG_L(PerlIO_printf(Perl_debug_log,
2317 "_mem_collxfrm: Couldn't find any character to replace"
2318 " embedded NULs in locale %s with", PL_collation_name));
2319 goto bad;
58eebef2
KW
2320 }
2321
94762aa0
KW
2322 DEBUG_L(PerlIO_printf(Perl_debug_log,
2323 "_mem_collxfrm: Replacing embedded NULs in locale %s with "
f28f4d2a 2324 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
94762aa0 2325
6696cfa7 2326 Safefree(cur_min_x);
1e4c9676 2327 } /* End of determining the character that is to replace NULs */
afc4976f
KW
2328
2329 /* If the replacement is variant under UTF-8, it must match the
2330 * UTF8-ness as the original */
f28f4d2a
KW
2331 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
2332 this_replacement_char[0] =
2333 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
2334 this_replacement_char[1] =
2335 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
afc4976f
KW
2336 this_replacement_len = 2;
2337 }
2338 else {
f28f4d2a 2339 this_replacement_char[0] = PL_strxfrm_NUL_replacement;
afc4976f
KW
2340 /* this_replacement_char[1] = '\0' was done at initialization */
2341 this_replacement_len = 1;
6696cfa7
KW
2342 }
2343
2344 /* The worst case length for the replaced string would be if every
2345 * character in it is NUL. Multiply that by the length of each
2346 * replacement, and allow for a trailing NUL */
afc4976f 2347 sans_nuls_len = (len * this_replacement_len) + 1;
fdc080f3 2348 Newx(sans_nuls, sans_nuls_len, char);
6696cfa7
KW
2349 *sans_nuls = '\0';
2350
6696cfa7
KW
2351 /* Replace each NUL with the lowest collating control. Loop until have
2352 * exhausted all the NULs */
2353 while (s + s_strlen < e) {
6069d6c5 2354 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
2355
2356 /* Do the actual replacement */
6069d6c5 2357 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
6696cfa7
KW
2358
2359 /* Move past the input NUL */
2360 s += s_strlen + 1;
2361 s_strlen = strlen(s);
2362 }
2363
2364 /* And add anything that trails the final NUL */
6069d6c5 2365 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
2366
2367 /* Switch so below we transform this modified string */
2368 s = sans_nuls;
2369 len = strlen(s);
1e4c9676 2370 } /* End of replacing NULs */
6696cfa7 2371
a4a439fb
KW
2372 /* Make sure the UTF8ness of the string and locale match */
2373 if (utf8 != PL_in_utf8_COLLATE_locale) {
2374 const char * const t = s; /* Temporary so we can later find where the
2375 input was */
2376
2377 /* Here they don't match. Change the string's to be what the locale is
2378 * expecting */
2379
2380 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
2381 s = (char *) bytes_to_utf8((const U8 *) s, &len);
2382 utf8 = TRUE;
2383 }
2384 else { /* locale is not UTF-8; but input is; downgrade the input */
2385
2386 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
2387
2388 /* If the downgrade was successful we are done, but if the input
2389 * contains things that require UTF-8 to represent, have to do
2390 * damage control ... */
2391 if (UNLIKELY(utf8)) {
2392
2393 /* What we do is construct a non-UTF-8 string with
2394 * 1) the characters representable by a single byte converted
2395 * to be so (if necessary);
2396 * 2) and the rest converted to collate the same as the
2397 * highest collating representable character. That makes
2398 * them collate at the end. This is similar to how we
2399 * handle embedded NULs, but we use the highest collating
2400 * code point instead of the smallest. Like the NUL case,
2401 * this isn't perfect, but is the best we can reasonably
2402 * do. Every above-255 code point will sort the same as
2403 * the highest-sorting 0-255 code point. If that code
2404 * point can combine in a sequence with some other code
2405 * points for weight calculations, us changing something to
2406 * be it can adversely affect the results. But in most
2407 * cases, it should work reasonably. And note that this is
2408 * really an illegal situation: using code points above 255
2409 * on a locale where only 0-255 are valid. If two strings
2410 * sort entirely equal, then the sort order for the
2411 * above-255 code points will be in code point order. */
2412
2413 utf8 = FALSE;
2414
2415 /* If we haven't calculated the code point with the maximum
2416 * collating order for this locale, do so now */
2417 if (! PL_strxfrm_max_cp) {
2418 int j;
2419
2420 /* The current transformed string that collates the
2421 * highest (except it also includes the prefixed collation
2422 * index. */
2423 char * cur_max_x = NULL;
2424
2425 /* Look through all legal code points (NUL isn't) */
2426 for (j = 1; j < 256; j++) {
2427 char * x;
2428 STRLEN x_len;
736a4fed 2429 char cur_source[] = { '\0', '\0' };
a4a439fb 2430
736a4fed
KW
2431 /* Create a 1-char string of the current code point */
2432 cur_source[0] = (char) j;
a4a439fb
KW
2433
2434 /* Then transform it */
2435 x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
2436
2437 /* If something went wrong (which it shouldn't), just
2438 * ignore this code point */
94762aa0 2439 if (! x) {
a4a439fb
KW
2440 continue;
2441 }
2442
2443 /* If this character's transformation is higher than
2444 * the current highest, this one becomes the highest */
2445 if ( cur_max_x == NULL
55e5378d
KW
2446 || strGT(x + COLLXFRM_HDR_LEN,
2447 cur_max_x + COLLXFRM_HDR_LEN))
a4a439fb
KW
2448 {
2449 PL_strxfrm_max_cp = j;
2450 cur_max_x = x;
2451 }
2452 else {
2453 Safefree(x);
2454 }
2455 }
2456
94762aa0
KW
2457 if (! cur_max_x) {
2458 DEBUG_L(PerlIO_printf(Perl_debug_log,
2459 "_mem_collxfrm: Couldn't find any character to"
2460 " replace above-Latin1 chars in locale %s with",
2461 PL_collation_name));
2462 goto bad;
2463 }
2464
58eebef2
KW
2465 DEBUG_L(PerlIO_printf(Perl_debug_log,
2466 "_mem_collxfrm: highest 1-byte collating character"
2467 " in locale %s is 0x%02X\n",
2468 PL_collation_name,
2469 PL_strxfrm_max_cp));
58eebef2 2470
a4a439fb
KW
2471 Safefree(cur_max_x);
2472 }
2473
2474 /* Here we know which legal code point collates the highest.
2475 * We are ready to construct the non-UTF-8 string. The length
2476 * will be at least 1 byte smaller than the input string
2477 * (because we changed at least one 2-byte character into a
2478 * single byte), but that is eaten up by the trailing NUL */
2479 Newx(s, len, char);
2480
2481 {
2482 STRLEN i;
2483 STRLEN d= 0;
042d9e50 2484 char * e = (char *) t + len;
a4a439fb
KW
2485
2486 for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
2487 U8 cur_char = t[i];
2488 if (UTF8_IS_INVARIANT(cur_char)) {
2489 s[d++] = cur_char;
2490 }
042d9e50 2491 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
a4a439fb
KW
2492 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
2493 }
2494 else { /* Replace illegal cp with highest collating
2495 one */
2496 s[d++] = PL_strxfrm_max_cp;
2497 }
2498 }
2499 s[d++] = '\0';
2500 Renew(s, d, char); /* Free up unused space */
2501 }
2502 }
2503 }
2504
2505 /* Here, we have constructed a modified version of the input. It could
2506 * be that we already had a modified copy before we did this version.
2507 * If so, that copy is no longer needed */
2508 if (t != input_string) {
2509 Safefree(t);
2510 }
2511 }
2512
17f41037
KW
2513 length_in_chars = (utf8)
2514 ? utf8_length((U8 *) s, (U8 *) s + len)
2515 : len;
2516
59c018b9
KW
2517 /* The first element in the output is the collation id, used by
2518 * sv_collxfrm(); then comes the space for the transformed string. The
2519 * equation should give us a good estimate as to how much is needed */
55e5378d 2520 xAlloc = COLLXFRM_HDR_LEN
a4a439fb 2521 + PL_collxfrm_base
17f41037 2522 + (PL_collxfrm_mult * length_in_chars);
a02a5408 2523 Newx(xbuf, xAlloc, char);
c7202dee
KW
2524 if (UNLIKELY(! xbuf)) {
2525 DEBUG_L(PerlIO_printf(Perl_debug_log,
2526 "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
98994639 2527 goto bad;
c7202dee 2528 }
98994639 2529
d35fca5f 2530 /* Store the collation id */
98994639 2531 *(U32*)xbuf = PL_collation_ix;
d35fca5f
KW
2532
2533 /* Then the transformation of the input. We loop until successful, or we
2534 * give up */
4ebeff16 2535 for (;;) {
1adab0a7 2536
55e5378d 2537 *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
4ebeff16
KW
2538
2539 /* If the transformed string occupies less space than we told strxfrm()
2540 * was available, it means it successfully transformed the whole
2541 * string. */
55e5378d 2542 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
17f41037 2543
1adab0a7
KW
2544 /* Some systems include a trailing NUL in the returned length.
2545 * Ignore it, using a loop in case multiple trailing NULs are
2546 * returned. */
2547 while ( (*xlen) > 0
2548 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
2549 {
2550 (*xlen)--;
2551 }
2552
17f41037
KW
2553 /* If the first try didn't get it, it means our prediction was low.
2554 * Modify the coefficients so that we predict a larger value in any
2555 * future transformations */
2556 if (! first_time) {
2557 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */
2558 STRLEN computed_guess = PL_collxfrm_base
2559 + (PL_collxfrm_mult * length_in_chars);
e1c30f0c
KW
2560
2561 /* On zero-length input, just keep current slope instead of
2562 * dividing by 0 */
2563 const STRLEN new_m = (length_in_chars != 0)
2564 ? needed / length_in_chars
2565 : PL_collxfrm_mult;
17f41037
KW
2566
2567 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
2568 "%s: %d: initial size of %zu bytes for a length "
2569 "%zu string was insufficient, %zu needed\n",
17f41037 2570 __FILE__, __LINE__,
b07929e4 2571 computed_guess, length_in_chars, needed));
17f41037
KW
2572
2573 /* If slope increased, use it, but discard this result for
2574 * length 1 strings, as we can't be sure that it's a real slope
2575 * change */
2576 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
7d4bcc4a
KW
2577
2578# ifdef DEBUGGING
2579
17f41037
KW
2580 STRLEN old_m = PL_collxfrm_mult;
2581 STRLEN old_b = PL_collxfrm_base;
7d4bcc4a
KW
2582
2583# endif
2584
17f41037
KW
2585 PL_collxfrm_mult = new_m;
2586 PL_collxfrm_base = 1; /* +1 For trailing NUL */
2587 computed_guess = PL_collxfrm_base
2588 + (PL_collxfrm_mult * length_in_chars);
2589 if (computed_guess < needed) {
2590 PL_collxfrm_base += needed - computed_guess;
2591 }
2592
2593 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
2594 "%s: %d: slope is now %zu; was %zu, base "
2595 "is now %zu; was %zu\n",
17f41037 2596 __FILE__, __LINE__,
b07929e4
KW
2597 PL_collxfrm_mult, old_m,
2598 PL_collxfrm_base, old_b));
17f41037
KW
2599 }
2600 else { /* Slope didn't change, but 'b' did */
2601 const STRLEN new_b = needed
2602 - computed_guess
2603 + PL_collxfrm_base;
2604 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4 2605 "%s: %d: base is now %zu; was %zu\n",
17f41037 2606 __FILE__, __LINE__,
b07929e4 2607 new_b, PL_collxfrm_base));
17f41037
KW
2608 PL_collxfrm_base = new_b;
2609 }
2610 }
2611
4ebeff16
KW
2612 break;
2613 }
bb0f664e 2614
c7202dee
KW
2615 if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
2616 DEBUG_L(PerlIO_printf(Perl_debug_log,
2617 "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
2618 *xlen, PERL_INT_MAX));
4ebeff16 2619 goto bad;
c7202dee 2620 }
d35fca5f 2621
c664130f 2622 /* A well-behaved strxfrm() returns exactly how much space it needs
1adab0a7
KW
2623 * (usually not including the trailing NUL) when it fails due to not
2624 * enough space being provided. Assume that this is the case unless
2625 * it's been proven otherwise */
c664130f 2626 if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
55e5378d 2627 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
c664130f
KW
2628 }
2629 else { /* Here, either:
2630 * 1) The strxfrm() has previously shown bad behavior; or
2631 * 2) It isn't the first time through the loop, which means
2632 * that the strxfrm() is now showing bad behavior, because
2633 * we gave it what it said was needed in the previous
2634 * iteration, and it came back saying it needed still more.
2635 * (Many versions of cygwin fit this. When the buffer size
2636 * isn't sufficient, they return the input size instead of
2637 * how much is needed.)
d4ff9586
KW
2638 * Increase the buffer size by a fixed percentage and try again.
2639 * */
6ddd902c 2640 xAlloc += (xAlloc / 4) + 1;
c664130f 2641 PL_strxfrm_is_behaved = FALSE;
c664130f 2642
7d4bcc4a
KW
2643# ifdef DEBUGGING
2644
58eebef2
KW
2645 if (DEBUG_Lv_TEST || debug_initialization) {
2646 PerlIO_printf(Perl_debug_log,
2647 "_mem_collxfrm required more space than previously calculated"
b07929e4 2648 " for locale %s, trying again with new guess=%d+%zu\n",
58eebef2 2649 PL_collation_name, (int) COLLXFRM_HDR_LEN,
b07929e4 2650 xAlloc - COLLXFRM_HDR_LEN);
58eebef2 2651 }
7d4bcc4a
KW
2652
2653# endif
2654
58eebef2 2655 }
c664130f 2656
4ebeff16 2657 Renew(xbuf, xAlloc, char);
c7202dee
KW
2658 if (UNLIKELY(! xbuf)) {
2659 DEBUG_L(PerlIO_printf(Perl_debug_log,
2660 "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
4ebeff16 2661 goto bad;
c7202dee 2662 }
c664130f
KW
2663
2664 first_time = FALSE;
4ebeff16 2665 }
98994639 2666
6696cfa7 2667
7d4bcc4a
KW
2668# ifdef DEBUGGING
2669
58eebef2 2670 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee
KW
2671
2672 print_collxfrm_input_and_return(s, s + len, xlen, utf8);
2673 PerlIO_printf(Perl_debug_log, "Its xfrm is:");
7e2f38b2
KW
2674 PerlIO_printf(Perl_debug_log, "%s\n",
2675 _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
2676 *xlen, 1));
58eebef2 2677 }
7d4bcc4a
KW
2678
2679# endif
58eebef2 2680
3c5f993e 2681 /* Free up unneeded space; retain ehough for trailing NUL */
55e5378d 2682 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
98994639 2683
6696cfa7
KW
2684 if (s != input_string) {
2685 Safefree(s);
98994639
HS
2686 }
2687
98994639
HS
2688 return xbuf;
2689
2690 bad:
2691 Safefree(xbuf);
6696cfa7
KW
2692 if (s != input_string) {
2693 Safefree(s);
2694 }
98994639 2695 *xlen = 0;
7d4bcc4a
KW
2696
2697# ifdef DEBUGGING
2698
58eebef2 2699 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee 2700 print_collxfrm_input_and_return(s, s + len, NULL, utf8);
58eebef2 2701 }
7d4bcc4a
KW
2702
2703# endif
2704
98994639
HS
2705 return NULL;
2706}
2707
7d4bcc4a 2708# ifdef DEBUGGING
c7202dee 2709
4cbaac56 2710STATIC void
c7202dee
KW
2711S_print_collxfrm_input_and_return(pTHX_
2712 const char * const s,
2713 const char * const e,
2714 const STRLEN * const xlen,
2715 const bool is_utf8)
2716{
c7202dee
KW
2717
2718 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
2719
511e4ff7
DM
2720 PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
2721 (UV)PL_collation_ix);
c7202dee 2722 if (xlen) {
08b6dc1d 2723 PerlIO_printf(Perl_debug_log, "%zu", *xlen);
c7202dee
KW
2724 }
2725 else {
2726 PerlIO_printf(Perl_debug_log, "NULL");
2727 }
2728 PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
2729 PL_collation_name);
9c8a6dc2
KW
2730 print_bytes_for_locale(s, e, is_utf8);
2731
2732 PerlIO_printf(Perl_debug_log, "'\n");
2733}
2734
2735STATIC void
2736S_print_bytes_for_locale(pTHX_
2737 const char * const s,
2738 const char * const e,
2739 const bool is_utf8)
2740{
2741 const char * t = s;
2742 bool prev_was_printable = TRUE;
2743 bool first_time = TRUE;
2744
2745 PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
c7202dee
KW
2746
2747 while (t < e) {
2748 UV cp = (is_utf8)
2749 ? utf8_to_uvchr_buf((U8 *) t, e, NULL)
2750 : * (U8 *) t;
2751 if (isPRINT(cp)) {
2752 if (! prev_was_printable) {
2753 PerlIO_printf(Perl_debug_log, " ");
2754 }
2755 PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
2756 prev_was_printable = TRUE;
2757 }
2758 else {
2759 if (! first_time) {
2760 PerlIO_printf(Perl_debug_log, " ");
2761 }
147e3846 2762 PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
c7202dee
KW
2763 prev_was_printable = FALSE;
2764 }
2765 t += (is_utf8) ? UTF8SKIP(t) : 1;
2766 first_time = FALSE;
2767 }
c7202dee
KW
2768}
2769
7d4bcc4a 2770# endif /* #ifdef DEBUGGING */
98994639 2771#endif /* USE_LOCALE_COLLATE */
58eebef2 2772
8ef6e574
KW
2773#ifdef USE_LOCALE
2774
c1284011
KW
2775bool
2776Perl__is_cur_LC_category_utf8(pTHX_ int category)
7d74bb61
KW
2777{
2778 /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
2779 * otherwise. 'category' may not be LC_ALL. If the platform doesn't have
119ee68b 2780 * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
609548d2
KW
2781 * could give the wrong result. The result will very likely be correct for
2782 * languages that have commonly used non-ASCII characters, but for notably
2783 * English, it comes down to if the locale's name ends in something like
2784 * "UTF-8". It errs on the side of not being a UTF-8 locale. */
7d74bb61
KW
2785
2786 char *save_input_locale = NULL;
7d74bb61
KW
2787 STRLEN final_pos;
2788
7d4bcc4a
KW
2789# ifdef LC_ALL
2790
7d74bb61 2791 assert(category != LC_ALL);
7d4bcc4a
KW
2792
2793# endif
7d74bb61
KW
2794
2795 /* First dispose of the trivial cases */
837ce802 2796 save_input_locale = do_setlocale_r(category, NULL);
7d74bb61 2797 if (! save_input_locale) {
69014004
KW
2798 DEBUG_L(PerlIO_printf(Perl_debug_log,
2799 "Could not find current locale for category %d\n",
2800 category));
7d74bb61
KW
2801 return FALSE; /* XXX maybe should croak */
2802 }
b07fffd1 2803 save_input_locale = stdize_locale(savepv(save_input_locale));
a39edc4c 2804 if (isNAME_C_OR_POSIX(save_input_locale)) {
69014004
KW
2805 DEBUG_L(PerlIO_printf(Perl_debug_log,
2806 "Current locale for category %d is %s\n",
2807 category, save_input_locale));
b07fffd1 2808 Safefree(save_input_locale);
7d74bb61
KW
2809 return FALSE;
2810 }
2811
7d4bcc4a 2812# if defined(USE_LOCALE_CTYPE) \
1d958db2 2813 && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET)))
7d74bb61 2814
1d958db2 2815 { /* Next try nl_langinfo or MB_CUR_MAX if available */
7d74bb61
KW
2816
2817 char *save_ctype_locale = NULL;
119ee68b 2818 bool is_utf8;
7d74bb61 2819
119ee68b 2820 if (category != LC_CTYPE) { /* These work only on LC_CTYPE */
7d74bb61
KW
2821
2822 /* Get the current LC_CTYPE locale */
837ce802 2823 save_ctype_locale = do_setlocale_c(LC_CTYPE, NULL);
7d74bb61 2824 if (! save_ctype_locale) {
69014004
KW
2825 DEBUG_L(PerlIO_printf(Perl_debug_log,
2826 "Could not find current locale for LC_CTYPE\n"));
7d74bb61
KW
2827 goto cant_use_nllanginfo;
2828 }
4f72bb37 2829 save_ctype_locale = stdize_locale(savepv(save_ctype_locale));
7d74bb61
KW
2830
2831 /* If LC_CTYPE and the desired category use the same locale, this
2832 * means that finding the value for LC_CTYPE is the same as finding
2833 * the value for the desired category. Otherwise, switch LC_CTYPE
2834 * to the desired category's locale */
2835 if (strEQ(save_ctype_locale, save_input_locale)) {
2836 Safefree(save_ctype_locale);
2837 save_ctype_locale = NULL;
2838 }
837ce802 2839 else if (! do_setlocale_c(LC_CTYPE, save_input_locale)) {
69014004
KW
2840 DEBUG_L(PerlIO_printf(Perl_debug_log,
2841 "Could not change LC_CTYPE locale to %s\n",
2842 save_input_locale));
7d74bb61
KW
2843 Safefree(save_ctype_locale);
2844 goto cant_use_nllanginfo;
2845 }
2846 }
2847
69014004
KW
2848 DEBUG_L(PerlIO_printf(Perl_debug_log, "Current LC_CTYPE locale=%s\n",
2849 save_input_locale));
2850
7d74bb61 2851 /* Here the current LC_CTYPE is set to the locale of the category whose
1d958db2
KW
2852 * information is desired. This means that nl_langinfo() and MB_CUR_MAX
2853 * should give the correct results */
119ee68b 2854
7d4bcc4a 2855# if defined(HAS_NL_LANGINFO) && defined(CODESET)
c70a3e68 2856 /* The task is easiest if has this POSIX 2001 function */
7d4bcc4a 2857
1d958db2 2858 {
c70a3e68
KW
2859 const char *codeset = my_nl_langinfo(PERL_CODESET, FALSE);
2860 /* FALSE => already in dest locale */
119ee68b 2861
c70a3e68
KW
2862 DEBUG_L(PerlIO_printf(Perl_debug_log,
2863 "\tnllanginfo returned CODESET '%s'\n", codeset));
2864
2865 if (codeset && strNE(codeset, "")) {
1d958db2
KW
2866 /* If we switched LC_CTYPE, switch back */
2867 if (save_ctype_locale) {
837ce802 2868 do_setlocale_c(LC_CTYPE, save_ctype_locale);
1d958db2
KW
2869 Safefree(save_ctype_locale);
2870 }
2871
abc1d81d
KW
2872 is_utf8 = ( ( strlen(codeset) == STRLENs("UTF-8")
2873 && foldEQ(codeset, STR_WITH_LEN("UTF-8")))
2874 || ( strlen(codeset) == STRLENs("UTF8")
2875 && foldEQ(codeset, STR_WITH_LEN("UTF8"))));
1d958db2 2876
69014004
KW
2877 DEBUG_L(PerlIO_printf(Perl_debug_log,
2878 "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
2879 codeset, is_utf8));
1d958db2
KW
2880 Safefree(save_input_locale);
2881 return is_utf8;
2882 }
119ee68b
KW
2883 }
2884
7d4bcc4a
KW
2885# endif
2886# ifdef MB_CUR_MAX
1d958db2
KW
2887
2888 /* Here, either we don't have nl_langinfo, or it didn't return a
2889 * codeset. Try MB_CUR_MAX */
2890
119ee68b
KW
2891 /* Standard UTF-8 needs at least 4 bytes to represent the maximum
2892 * Unicode code point. Since UTF-8 is the only non-single byte
2893 * encoding we handle, we just say any such encoding is UTF-8, and if
2894 * turns out to be wrong, other things will fail */
2895 is_utf8 = MB_CUR_MAX >= 4;
2896
69014004
KW
2897 DEBUG_L(PerlIO_printf(Perl_debug_log,
2898 "\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n",
2899 (int) MB_CUR_MAX, is_utf8));
2900
119ee68b
KW
2901 Safefree(save_input_locale);
2902
7d4bcc4a 2903# ifdef HAS_MBTOWC
119ee68b
KW
2904
2905 /* ... But, most system that have MB_CUR_MAX will also have mbtowc(),
2906 * since they are both in the C99 standard. We can feed a known byte
2907 * string to the latter function, and check that it gives the expected
2908 * result */
2909 if (is_utf8) {
2910 wchar_t wc;
51fc4b19
KW
2911 int len;
2912
856b881c 2913 PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
69014004 2914 errno = 0;
b1d4925c 2915 len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
51fc4b19 2916
b1d4925c
KW
2917
2918 if ( len != STRLENs(REPLACEMENT_CHARACTER_UTF8)
2919 || wc != (wchar_t) 0xFFFD)
119ee68b
KW
2920 {
2921 is_utf8 = FALSE;
b1d4925c
KW
2922 DEBUG_L(PerlIO_printf(Perl_debug_log, "\replacement=U+%x\n",
2923 (unsigned int)wc));
69014004
KW
2924 DEBUG_L(PerlIO_printf(Perl_debug_log,
2925 "\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n",
51fc4b19 2926 len, errno));
119ee68b
KW
2927 }
2928 }
7d4bcc4a
KW
2929
2930# endif
119ee68b 2931
1d958db2
KW
2932 /* If we switched LC_CTYPE, switch back */
2933 if (save_ctype_locale) {
837ce802 2934 do_setlocale_c(LC_CTYPE, save_ctype_locale);
1d958db2 2935 Safefree(save_ctype_locale);
119ee68b 2936 }
7d74bb61 2937
1d958db2 2938 return is_utf8;
7d4bcc4a
KW
2939
2940# endif
2941
7d74bb61 2942 }
119ee68b 2943
7d74bb61
KW
2944 cant_use_nllanginfo:
2945
7d4bcc4a 2946# else /* nl_langinfo should work if available, so don't bother compiling this
0080c90a
KW
2947 fallback code. The final fallback of looking at the name is
2948 compiled, and will be executed if nl_langinfo fails */
7d74bb61 2949
97f4de96
KW
2950 /* nl_langinfo not available or failed somehow. Next try looking at the
2951 * currency symbol to see if it disambiguates things. Often that will be
2952 * in the native script, and if the symbol isn't in UTF-8, we know that the
2953 * locale isn't. If it is non-ASCII UTF-8, we infer that the locale is
609548d2
KW
2954 * too, as the odds of a non-UTF8 string being valid UTF-8 are quite small
2955 * */
fa9b773e 2956
7d4bcc4a
KW
2957# ifdef HAS_LOCALECONV
2958# ifdef USE_LOCALE_MONETARY
2959
fa9b773e
KW
2960 {
2961 char *save_monetary_locale = NULL;
fa9b773e 2962 bool only_ascii = FALSE;
13542a67
KW
2963 bool is_utf8 = FALSE;
2964 struct lconv* lc;
fa9b773e 2965
97f4de96
KW
2966 /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of
2967 * the desired category, if it isn't that locale already */
2968
fa9b773e
KW
2969 if (category != LC_MONETARY) {
2970
837ce802 2971 save_monetary_locale = do_setlocale_c(LC_MONETARY, NULL);
fa9b773e 2972 if (! save_monetary_locale) {
69014004
KW
2973 DEBUG_L(PerlIO_printf(Perl_debug_log,
2974 "Could not find current locale for LC_MONETARY\n"));
fa9b773e
KW
2975 goto cant_use_monetary;
2976 }
4f72bb37 2977 save_monetary_locale = stdize_locale(savepv(save_monetary_locale));
fa9b773e 2978
13542a67
KW
2979 if (strEQ(save_monetary_locale, save_input_locale)) {
2980 Safefree(save_monetary_locale);
2981 save_monetary_locale = NULL;
2982 }
837ce802 2983 else if (! do_setlocale_c(LC_MONETARY, save_input_locale)) {
59c234b4
KW
2984 DEBUG_L(PerlIO_printf(Perl_debug_log,
2985 "Could not change LC_MONETARY locale to %s\n",
2986 save_input_locale));
2987 Safefree(save_monetary_locale);
2988 goto cant_use_monetary;
fa9b773e
KW
2989 }
2990 }
2991
2992 /* Here the current LC_MONETARY is set to the locale of the category
2993 * whose information is desired. */
2994
13542a67
KW
2995 lc = localeconv();
2996 if (! lc
2997 || ! lc->currency_symbol
c5f058df 2998 || is_utf8_invariant_string((U8 *) lc->currency_symbol, 0))
13542a67
KW
2999 {
3000 DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3001 only_ascii = TRUE;
3002 }
3003 else {
3004 is_utf8 = is_utf8_string((U8 *) lc->currency_symbol, 0);
fa9b773e
KW
3005 }
3006
3007 /* If we changed it, restore LC_MONETARY to its original locale */
3008 if (save_monetary_locale) {
837ce802 3009 do_setlocale_c(LC_MONETARY, save_monetary_locale);
fa9b773e
KW
3010 Safefree(save_monetary_locale);
3011 }
3012
13542a67 3013 if (! only_ascii) {
fa9b773e 3014
59c234b4
KW
3015 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
3016 * otherwise assume the locale is UTF-8 if and only if the symbol
3017 * is non-ascii UTF-8. */
3018 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
3019 save_input_locale, is_utf8));
3020 Safefree(save_input_locale);
3021 return is_utf8;
13542a67 3022 }
fa9b773e
KW
3023 }
3024 cant_use_monetary:
3025
7d4bcc4a
KW
3026# endif /* USE_LOCALE_MONETARY */
3027# endif /* HAS_LOCALECONV */
fa9b773e 3028
7d4bcc4a 3029# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
15f7e74e
KW
3030
3031/* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
3032 * the names of the months and weekdays, timezone, and am/pm indicator */
3033 {
3034 char *save_time_locale = NULL;
3035 int hour = 10;
3036 bool is_dst = FALSE;
3037 int dom = 1;
3038 int month = 0;
3039 int i;
3040 char * formatted_time;
3041
3042
3043 /* Like above for LC_MONETARY, we set LC_TIME to the locale of the
3044 * desired category, if it isn't that locale already */
3045
3046 if (category != LC_TIME) {
3047
837ce802 3048 save_time_locale = do_setlocale_c(LC_TIME, NULL);
15f7e74e
KW
3049 if (! save_time_locale) {
3050 DEBUG_L(PerlIO_printf(Perl_debug_log,
3051 "Could not find current locale for LC_TIME\n"));
3052 goto cant_use_time;
3053 }
3054 save_time_locale = stdize_locale(savepv(save_time_locale));
3055
3056 if (strEQ(save_time_locale, save_input_locale)) {
3057 Safefree(save_time_locale);
3058 save_time_locale = NULL;
3059 }
837ce802 3060 else if (! do_setlocale_c(LC_TIME, save_input_locale)) {
15f7e74e
KW
3061 DEBUG_L(PerlIO_printf(Perl_debug_log,
3062 "Could not change LC_TIME locale to %s\n",
3063 save_input_locale));
3064 Safefree(save_time_locale);
3065 goto cant_use_time;
3066 }
3067 }
3068
3069 /* Here the current LC_TIME is set to the locale of the category
3070 * whose information is desired. Look at all the days of the week and
9f10db87 3071 * month names, and the timezone and am/pm indicator for UTF-8 variant
15f7e74e
KW
3072 * characters. The first such a one found will tell us if the locale
3073 * is UTF-8 or not */
3074
3075 for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
3076 formatted_time = my_strftime("%A %B %Z %p",
3ae5cd07 3077 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
c5f058df
KW
3078 if ( ! formatted_time
3079 || is_utf8_invariant_string((U8 *) formatted_time, 0))
3080 {
15f7e74e
KW
3081
3082 /* Here, we didn't find a non-ASCII. Try the next time through
3083 * with the complemented dst and am/pm, and try with the next
3084 * weekday. After we have gotten all weekdays, try the next
3085 * month */
3086 is_dst = ! is_dst;
3087 hour = (hour + 12) % 24;
3088 dom++;
3089 if (i > 6) {
3090 month++;
3091 }
3092 continue;
3093 }
3094
3095 /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
3096 * false otherwise. But first, restore LC_TIME to its original
3097 * locale if we changed it */
3098 if (save_time_locale) {
837ce802 3099 do_setlocale_c(LC_TIME, save_time_locale);
15f7e74e
KW
3100 Safefree(save_time_locale);
3101 }
3102
3103 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
3104 save_input_locale,
3105 is_utf8_string((U8 *) formatted_time, 0)));
3106 Safefree(save_input_locale);
3107 return is_utf8_string((U8 *) formatted_time, 0);
3108 }
3109
3110 /* Falling off the end of the loop indicates all the names were just
3111 * ASCII. Go on to the next test. If we changed it, restore LC_TIME
3112 * to its original locale */
3113 if (save_time_locale) {
837ce802 3114 do_setlocale_c(LC_TIME, save_time_locale);
15f7e74e
KW
3115 Safefree(save_time_locale);
3116 }
3117 DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3118 }
3119 cant_use_time:
3120
7d4bcc4a 3121# endif
15f7e74e 3122
7d4bcc4a 3123# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
855aeb93
JH
3124
3125/* This code is ifdefd out because it was found to not be necessary in testing
5857e934
KW
3126 * on our dromedary test machine, which has over 700 locales. There, this
3127 * added no value to looking at the currency symbol and the time strings. I
3128 * left it in so as to avoid rewriting it if real-world experience indicates
3129 * that dromedary is an outlier. Essentially, instead of returning abpve if we
855aeb93
JH
3130 * haven't found illegal utf8, we continue on and examine all the strerror()
3131 * messages on the platform for utf8ness. If all are ASCII, we still don't
3132 * know the answer; but otherwise we have a pretty good indication of the
5857e934
KW
3133 * utf8ness. The reason this doesn't help much is that the messages may not
3134 * have been translated into the locale. The currency symbol and time strings
3135 * are much more likely to have been translated. */
3136 {
855aeb93 3137 int e;
5857e934
KW
3138 bool is_utf8 = FALSE;
3139 bool non_ascii = FALSE;
855aeb93 3140 char *save_messages_locale = NULL;
5857e934 3141 const char * errmsg = NULL;
855aeb93 3142
5857e934
KW
3143 /* Like above, we set LC_MESSAGES to the locale of the desired
3144 * category, if it isn't that locale already */
855aeb93
JH
3145
3146 if (category != LC_MESSAGES) {
3147
837ce802 3148 save_messages_locale = do_setlocale_c(LC_MESSAGES, NULL);
855aeb93 3149 if (! save_messages_locale) {
5857e934
KW
3150 DEBUG_L(PerlIO_printf(Perl_debug_log,
3151 "Could not find current locale for LC_MESSAGES\n"));
855aeb93
JH
3152 goto cant_use_messages;
3153 }
5857e934 3154 save_messages_locale = stdize_locale(savepv(save_messages_locale));
855aeb93
JH
3155
3156 if (strEQ(save_messages_locale, save_input_locale)) {
5857e934
KW
3157 Safefree(save_messages_locale);
3158 save_messages_locale = NULL;
855aeb93 3159 }
837ce802 3160 else if (! do_setlocale_c(LC_MESSAGES, save_input_locale)) {
5857e934
KW
3161 DEBUG_L(PerlIO_printf(Perl_debug_log,
3162 "Could not change LC_MESSAGES locale to %s\n",
3163 save_input_locale));
855aeb93
JH
3164 Safefree(save_messages_locale);
3165 goto cant_use_messages;
3166 }
3167 }
3168
3169 /* Here the current LC_MESSAGES is set to the locale of the category
5857e934
KW
3170 * whose information is desired. Look through all the messages. We
3171 * can't use Strerror() here because it may expand to code that
3172 * segfaults in miniperl */
855aeb93 3173
5857e934
KW
3174 for (e = 0; e <= sys_nerr; e++) {
3175 errno = 0;
3176 errmsg = sys_errlist[e];
3177 if (errno || !errmsg) {
855aeb93
JH
3178 break;
3179 }
5857e934 3180 errmsg = savepv(errmsg);
c5f058df 3181 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
5857e934
KW
3182 non_ascii = TRUE;
3183 is_utf8 = is_utf8_string((U8 *) errmsg, 0);
3184 break;
855aeb93
JH
3185 }
3186 }
5857e934 3187 Safefree(errmsg);
855aeb93
JH
3188
3189 /* And, if we changed it, restore LC_MESSAGES to its original locale */
3190 if (save_messages_locale) {
837ce802 3191 do_setlocale_c(LC_MESSAGES, save_messages_locale);
855aeb93
JH
3192 Safefree(save_messages_locale);
3193 }
3194
5857e934
KW
3195 if (non_ascii) {
3196
3197 /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
3198 * any non-ascii means it is one; otherwise we assume it isn't */
3199 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
3200 save_input_locale,
3201 is_utf8));
3202 Safefree(save_input_locale);
3203 return is_utf8;
3204 }
855aeb93 3205
5857e934 3206 DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
855aeb93
JH
3207 }
3208 cant_use_messages:
3209
7d4bcc4a
KW
3210# endif
3211# endif /* the code that is compiled when no nl_langinfo */
0080c90a 3212
7d4bcc4a 3213# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
92c0a900 3214 UTF-8 locale */
7d4bcc4a 3215
97f4de96
KW
3216 /* As a last resort, look at the locale name to see if it matches
3217 * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
3218 * return of setlocale(), is actually defined to be opaque, so we can't
3219 * really rely on the absence of various substrings in the name to indicate
3220 * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
3221 * be a UTF-8 locale. Similarly for the other common names */
3222
3223 final_pos = strlen(save_input_locale) - 1;
3224 if (final_pos >= 3) {
3225 char *name = save_input_locale;
3226
3227 /* Find next 'U' or 'u' and look from there */
3228 while ((name += strcspn(name, "Uu") + 1)
3229 <= save_input_locale + final_pos - 2)
3230 {
c72493e0 3231 if ( isALPHA_FOLD_NE(*name, 't')
305b8651 3232 || isALPHA_FOLD_NE(*(name + 1), 'f'))
97f4de96
KW
3233 {
3234 continue;
3235 }
3236 name += 2;
3237 if (*(name) == '-') {
3238 if ((name > save_input_locale + final_pos - 1)) {
3239 break;
3240 }
3241 name++;
3242 }
3243 if (*(name) == '8') {
97f4de96
KW
3244 DEBUG_L(PerlIO_printf(Perl_debug_log,
3245 "Locale %s ends with UTF-8 in name\n",
3246 save_input_locale));
00c54b9c 3247 Safefree(save_input_locale);
97f4de96
KW
3248 return TRUE;
3249 }
3250 }
3251 DEBUG_L(PerlIO_printf(Perl_debug_log,
3252 "Locale %s doesn't end with UTF-8 in name\n",
3253 save_input_locale));
3254 }
3255
7d4bcc4a
KW
3256# endif
3257# ifdef WIN32
3258
97f4de96 3259 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
8a0832a1 3260 if (memENDs(save_input_locale, final_pos, "65001")) {
97f4de96 3261 DEBUG_L(PerlIO_printf(Perl_debug_log,
8a0832a1 3262 "Locale %s ends with 65001 in name, is UTF-8 locale\n",
97f4de96
KW
3263 save_input_locale));
3264 Safefree(save_input_locale);
3265 return TRUE;
3266 }
7d4bcc4a
KW
3267
3268# endif
97f4de96
KW
3269
3270 /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
3271 * since we are about to return FALSE anyway, there is no point in doing
3272 * this extra work */
7d4bcc4a
KW
3273
3274# if 0
97f4de96
KW
3275 if (instr(save_input_locale, "8859")) {
3276 DEBUG_L(PerlIO_printf(Perl_debug_log,
3277 "Locale %s has 8859 in name, not UTF-8 locale\n",
3278 save_input_locale));
3279 Safefree(save_input_locale);
3280 return FALSE;
3281 }
7d4bcc4a 3282# endif
97f4de96 3283
69014004
KW
3284 DEBUG_L(PerlIO_printf(Perl_debug_log,
3285 "Assuming locale %s is not a UTF-8 locale\n",
3286 save_input_locale));
fa9b773e 3287 Safefree(save_input_locale);
7d74bb61
KW
3288 return FALSE;
3289}
3290
8ef6e574 3291#endif
7d74bb61 3292
d6ded950
KW
3293
3294bool
3295Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
3296{
1a4f13e1 3297 dVAR;
d6ded950
KW
3298 /* Internal function which returns if we are in the scope of a pragma that
3299 * enables the locale category 'category'. 'compiling' should indicate if
3300 * this is during the compilation phase (TRUE) or not (FALSE). */
3301
3302 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
3303
3304 SV *categories = cop_hints_fetch_pvs(cop, "locale", 0);
3305 if (! categories || categories == &PL_sv_placeholder) {
3306 return FALSE;
3307 }
3308
3309 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
3310 * a valid unsigned */
3311 assert(category >= -1);
3312 return cBOOL(SvUV(categories) & (1U << (category + 1)));
3313}
3314
2c6ee1a7 3315char *
6ebbc862
KW
3316Perl_my_strerror(pTHX_ const int errnum)
3317{
3318 /* Returns a mortalized copy of the text of the error message associated
3319 * with 'errnum'. It uses the current locale's text unless the platform
3320 * doesn't have the LC_MESSAGES category or we are not being called from
3321 * within the scope of 'use locale'. In the former case, it uses whatever
3322 * strerror returns; in the latter case it uses the text from the C locale.
3323 *
3324 * The function just calls strerror(), but temporarily switches, if needed,
3325 * to the C locale */
3326
3327 char *errstr;
52770946 3328 dVAR;
6ebbc862 3329
52770946 3330#ifndef USE_LOCALE_MESSAGES
6ebbc862 3331
52770946
KW
3332 /* If platform doesn't have messages category, we don't do any switching to
3333 * the C locale; we just use whatever strerror() returns */
3334
3335 errstr = savepv(Strerror(errnum));
3336
3337#else /* Has locale messages */
3338
3339 const bool within_locale_scope = IN_LC(LC_MESSAGES);
2c6ee1a7 3340
7aaa36b1
KW
3341# if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
3342
3343 /* This function is trivial if we have strerror_l() */
3344
3345 if (within_locale_scope) {
4eb27fc5 3346 errstr = savepv(strerror(errnum));
7aaa36b1
KW
3347 }
3348 else {
4eb27fc5 3349 errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
7aaa36b1
KW
3350 }
3351
7aaa36b1
KW
3352# else /* Doesn't have strerror_l(). */
3353
3354# ifdef USE_POSIX_2008_LOCALE
3355
fcd0e682 3356 locale_t save_locale = NULL;
7aaa36b1
KW
3357
3358# else
3359
fcd0e682 3360 char * save_locale = NULL;
c9dda6da 3361 bool locale_is_C = FALSE;
2c6ee1a7 3362
6ebbc862
KW
3363 /* We have a critical section to prevent another thread from changing the
3364 * locale out from under us (or zapping the buffer returned from
3365 * setlocale() ) */
3366 LOCALE_LOCK;
3367
7aaa36b1 3368# endif
6ebbc862 3369
9c8a6dc2
KW
3370 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3371 "my_strerror called with errnum %d\n", errnum));
6ebbc862 3372 if (! within_locale_scope) {
c9dda6da 3373 errno = 0;
a0b53297 3374
f1d2176b 3375# ifdef USE_POSIX_2008_LOCALE /* Use the thread-safe locale functions */
6ebbc862 3376
9c8a6dc2
KW
3377 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3378 "Not within locale scope, about to call"
3379 " uselocale(0x%p)\n", PL_C_locale_obj));
6ebbc862 3380 save_locale = uselocale(PL_C_locale_obj);
c9dda6da
KW
3381 if (! save_locale) {
3382 DEBUG_L(PerlIO_printf(Perl_debug_log,
9c8a6dc2
KW
3383 "uselocale failed, errno=%d\n", errno));
3384 }
3385 else {
3386 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3387 "uselocale returned 0x%p\n", save_locale));
c9dda6da 3388 }
6ebbc862 3389
7aaa36b1 3390# else /* Not thread-safe build */
a0b53297 3391
837ce802 3392 save_locale = do_setlocale_c(LC_MESSAGES, NULL);
c9dda6da
KW
3393 if (! save_locale) {
3394 DEBUG_L(PerlIO_printf(Perl_debug_log,
3395 "setlocale failed, errno=%d\n", errno));
3396 }
3397 else {
3398 locale_is_C = isNAME_C_OR_POSIX(save_locale);
2c6ee1a7 3399
c9dda6da
KW
3400 /* Switch to the C locale if not already in it */
3401 if (! locale_is_C) {
2c6ee1a7 3402
c9dda6da
KW
3403 /* The setlocale() just below likely will zap 'save_locale', so
3404 * create a copy. */
3405 save_locale = savepv(save_locale);
837ce802 3406 do_setlocale_c(LC_MESSAGES, "C");
c9dda6da 3407 }
6ebbc862 3408 }
2c6ee1a7 3409
7aaa36b1 3410# endif
2c6ee1a7 3411
6ebbc862 3412 } /* end of ! within_locale_scope */
9c8a6dc2
KW
3413 else {
3414 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n",
3415 __FILE__, __LINE__));
3416 }
a0b53297 3417
9c8a6dc2
KW
3418 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3419 "Any locale change has been done; about to call Strerror\n"));
52770946 3420 errstr = savepv(Strerror(errnum));
6ebbc862
KW
3421
3422 if (! within_locale_scope) {
c9dda6da 3423 errno = 0;
a0b53297 3424
f1d2176b 3425# ifdef USE_POSIX_2008_LOCALE
6ebbc862 3426
9c8a6dc2
KW
3427 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3428 "%s: %d: not within locale scope, restoring the locale\n",
3429 __FILE__, __LINE__));
c9dda6da
KW
3430 if (save_locale && ! uselocale(save_locale)) {
3431 DEBUG_L(PerlIO_printf(Perl_debug_log,
3432 "uselocale restore failed, errno=%d\n", errno));
3433 }
2c6ee1a7 3434 }
6ebbc862 3435
7aaa36b1 3436# else
6ebbc862 3437
c9dda6da 3438 if (save_locale && ! locale_is_C) {
837ce802 3439 if (! do_setlocale_c(LC_MESSAGES, save_locale)) {
c9dda6da
KW
3440 DEBUG_L(PerlIO_printf(Perl_debug_log,
3441 "setlocale restore failed, errno=%d\n", errno));
3442 }
6ebbc862
KW
3443 Safefree(save_locale);
3444 }
3445 }
3446
3447 LOCALE_UNLOCK;
3448
7aaa36b1
KW
3449# endif
3450# endif /* End of doesn't have strerror_l */
52770946 3451#endif /* End of does have locale messages */
6affbbf0
KW
3452
3453#ifdef DEBUGGING
3454
3455 if (DEBUG_Lv_TEST) {
3456 PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '");
3457 print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);
3458 PerlIO_printf(Perl_debug_log, "'\n");
3459 }
3460
2c6ee1a7
KW
3461#endif
3462
52770946 3463 SAVEFREEPV(errstr);
6ebbc862 3464 return errstr;
2c6ee1a7
KW
3465}
3466
66610fdd 3467/*
747c467a 3468
747c467a
KW
3469=for apidoc sync_locale
3470
3471Changing the program's locale should be avoided by XS code. Nevertheless,
3472certain non-Perl libraries called from XS, such as C<Gtk> do so. When this
3473happens, Perl needs to be told that the locale has changed. Use this function
3474to do so, before returning to Perl.
3475
3476=cut
3477*/
3478
3479void
3480Perl_sync_locale(pTHX)
3481{
9f82ea3e 3482 char * newlocale;
747c467a
KW
3483
3484#ifdef USE_LOCALE_CTYPE
7d4bcc4a 3485
9f82ea3e
KW
3486 newlocale = do_setlocale_c(LC_CTYPE, NULL);
3487 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3488 "%s:%d: %s\n", __FILE__, __LINE__,
3489 setlocale_debug_string(LC_CTYPE, NULL, newlocale)));
3490 new_ctype(newlocale);
747c467a 3491
7d4bcc4a 3492#endif /* USE_LOCALE_CTYPE */
747c467a 3493#ifdef USE_LOCALE_COLLATE
7d4bcc4a 3494
9f82ea3e
KW
3495 newlocale = do_setlocale_c(LC_COLLATE, NULL);
3496 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3497 "%s:%d: %s\n", __FILE__, __LINE__,
3498 setlocale_debug_string(LC_COLLATE, NULL, newlocale)));
3499 new_collate(newlocale);
747c467a 3500
7d4bcc4a 3501#endif
747c467a 3502#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 3503
9f82ea3e
KW
3504 newlocale = do_setlocale_c(LC_NUMERIC, NULL);
3505 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3506 "%s:%d: %s\n", __FILE__, __LINE__,
3507 setlocale_debug_string(LC_NUMERIC, NULL, newlocale)));
3508 new_numeric(newlocale);
7d4bcc4a 3509
747c467a
KW
3510#endif /* USE_LOCALE_NUMERIC */
3511
3512}
3513
5d1187d1
KW
3514#if defined(DEBUGGING) && defined(USE_LOCALE)
3515
a4f00dcc
KW
3516STATIC char *
3517S_setlocale_debug_string(const int category, /* category number,
5d1187d1
KW
3518 like LC_ALL */
3519 const char* const locale, /* locale name */
3520
3521 /* return value from setlocale() when attempting to
3522 * set 'category' to 'locale' */
3523 const char* const retval)
3524{
3525 /* Returns a pointer to a NUL-terminated string in static storage with
3526 * added text about the info passed in. This is not thread safe and will
3527 * be overwritten by the next call, so this should be used just to
fa07b8e5 3528 * formulate a string to immediately print or savepv() on. */
5d1187d1 3529
398a990f
DM
3530 /* initialise to a non-null value to keep it out of BSS and so keep
3531 * -DPERL_GLOBAL_STRUCT_PRIVATE happy */
60b45a7d
KW
3532 static char ret[128] = "If you can read this, thank your buggy C"
3533 " library strlcpy(), and change your hints file"
3534 " to undef it";
e5f10d49 3535 unsigned int i;
7d4bcc4a 3536
e5f10d49 3537 const unsigned int highest_index = NOMINAL_LC_ALL_INDEX;
7d4bcc4a 3538
e5f10d49 3539 my_strlcpy(ret, "setlocale(", sizeof(ret));
7d4bcc4a 3540
e5f10d49
KW
3541 /* Look for category in our list, and if found, add its name */
3542 for (i = 0; i <= highest_index; i++) {
3543 if (category == categories[i]) {
3544 my_strlcat(ret, category_names[i], sizeof(ret));
3545 goto found_category;
3546 }
3547 }
7d4bcc4a 3548
e5f10d49
KW
3549 /* Unknown category to us */
3550 my_snprintf(ret, sizeof(ret), "%s? %d", ret, category);
7d4bcc4a 3551
e5f10d49 3552 found_category:
5d1187d1 3553
fa07b8e5 3554 my_strlcat(ret, ", ", sizeof(ret));
5d1187d1
KW
3555
3556 if (locale) {
fa07b8e5
KW
3557 my_strlcat(ret, "\"", sizeof(ret));
3558 my_strlcat(ret, locale, sizeof(ret));
3559 my_strlcat(ret, "\"", sizeof(ret));
5d1187d1
KW
3560 }
3561 else {
fa07b8e5 3562 my_strlcat(ret, "NULL", sizeof(ret));
5d1187d1
KW
3563 }
3564
fa07b8e5 3565 my_strlcat(ret, ") returned ", sizeof(ret));
5d1187d1
KW
3566
3567 if (retval) {
fa07b8e5
KW
3568 my_strlcat(ret, "\"", sizeof(ret));
3569 my_strlcat(ret, retval, sizeof(ret));
3570 my_strlcat(ret, "\"", sizeof(ret));
5d1187d1
KW
3571 }
3572 else {
fa07b8e5 3573 my_strlcat(ret, "NULL", sizeof(ret));
5d1187d1
KW
3574 }
3575
3576 assert(strlen(ret) < sizeof(ret));
3577
3578 return ret;
3579}
3580
3581#endif
747c467a
KW
3582
3583
3584/*
14d04a33 3585 * ex: set ts=8 sts=4 sw=4 et:
37442d52 3586 */