This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Merge branch 'BO' into blead
[perl5.git] / locale.c
CommitLineData
98994639
HS
1/* locale.c
2 *
1129b882
NC
3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
98994639
HS
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550 12 * A Elbereth Gilthoniel,
cdad3b53 13 * silivren penna míriel
4ac71550 14 * o menel aglar elenath!
cdad3b53 15 * Na-chaered palan-díriel
4ac71550
TC
16 * o galadhremmin ennorath,
17 * Fanuilos, le linnathon
18 * nef aear, si nef aearon!
19 *
20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
98994639
HS
21 */
22
166f8a29
DM
23/* utility functions for handling locale-specific stuff like what
24 * character represents the decimal point.
0d071d52
KW
25 *
26 * All C programs have an underlying locale. Perl generally doesn't pay any
27 * attention to it except within the scope of a 'use locale'. For most
28 * categories, it accomplishes this by just using different operations if it is
29 * in such scope than if not. However, various libc functions called by Perl
30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31 * are used to toggle between the current locale and the C locale depending on
a9ad02a8
KW
32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is
33 * switched to the C locale for outputting the message unless within the scope
34 * of 'use locale'.
166f8a29
DM
35 */
36
98994639
HS
37#include "EXTERN.h"
38#define PERL_IN_LOCALE_C
f7416781 39#include "perl_langinfo.h"
98994639
HS
40#include "perl.h"
41
a4af207c
JH
42#include "reentr.h"
43
2fcc0ca9
KW
44/* If the environment says to, we can output debugging information during
45 * initialization. This is done before option parsing, and before any thread
46 * creation, so can be a file-level static */
47#ifdef DEBUGGING
27cdc72e
DM
48# ifdef PERL_GLOBAL_STRUCT
49 /* no global syms allowed */
50# define debug_initialization 0
51# define DEBUG_INITIALIZATION_set(v)
52# else
2fcc0ca9 53static bool debug_initialization = FALSE;
27cdc72e
DM
54# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
55# endif
2fcc0ca9
KW
56#endif
57
8ef6e574
KW
58#ifdef USE_LOCALE
59
98994639 60/*
0d071d52
KW
61 * Standardize the locale name from a string returned by 'setlocale', possibly
62 * modifying that string.
98994639 63 *
0ef2a2b2 64 * The typical return value of setlocale() is either
98994639
HS
65 * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
66 * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
67 * (the space-separated values represent the various sublocales,
0ef2a2b2 68 * in some unspecified order). This is not handled by this function.
98994639
HS
69 *
70 * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
0ef2a2b2
KW
71 * which is harmful for further use of the string in setlocale(). This
72 * function removes the trailing new line and everything up through the '='
98994639
HS
73 *
74 */
75STATIC char *
76S_stdize_locale(pTHX_ char *locs)
77{
7452cf6a 78 const char * const s = strchr(locs, '=');
98994639
HS
79 bool okay = TRUE;
80
7918f24d
NC
81 PERL_ARGS_ASSERT_STDIZE_LOCALE;
82
8772537c
AL
83 if (s) {
84 const char * const t = strchr(s, '.');
98994639 85 okay = FALSE;
8772537c
AL
86 if (t) {
87 const char * const u = strchr(t, '\n');
88 if (u && (u[1] == 0)) {
89 const STRLEN len = u - s;
90 Move(s + 1, locs, len, char);
91 locs[len] = 0;
92 okay = TRUE;
98994639
HS
93 }
94 }
95 }
96
97 if (!okay)
98 Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
99
100 return locs;
101}
102
8ef6e574
KW
103#endif
104
a4f00dcc
KW
105STATIC void
106S_set_numeric_radix(pTHX)
98994639
HS
107{
108#ifdef USE_LOCALE_NUMERIC
109# ifdef HAS_LOCALECONV
7452cf6a 110 const struct lconv* const lc = localeconv();
98994639 111
98994639
HS
112 if (lc && lc->decimal_point) {
113 if (lc->decimal_point[0] == '.' && lc->decimal_point[1] == 0) {
114 SvREFCNT_dec(PL_numeric_radix_sv);
a0714e2c 115 PL_numeric_radix_sv = NULL;
98994639
HS
116 }
117 else {
118 if (PL_numeric_radix_sv)
119 sv_setpv(PL_numeric_radix_sv, lc->decimal_point);
120 else
121 PL_numeric_radix_sv = newSVpv(lc->decimal_point, 0);
c5f058df 122 if (! is_utf8_invariant_string((U8 *) lc->decimal_point, 0)
28acfe03 123 && is_utf8_string((U8 *) lc->decimal_point, 0)
c1284011 124 && _is_cur_LC_category_utf8(LC_NUMERIC))
28acfe03
KW
125 {
126 SvUTF8_on(PL_numeric_radix_sv);
127 }
98994639
HS
128 }
129 }
130 else
a0714e2c 131 PL_numeric_radix_sv = NULL;
69014004 132
2fcc0ca9
KW
133#ifdef DEBUGGING
134 if (DEBUG_L_TEST || debug_initialization) {
135 PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
69014004 136 (PL_numeric_radix_sv)
37b7e435
KW
137 ? SvPVX(PL_numeric_radix_sv)
138 : "NULL",
139 (PL_numeric_radix_sv)
39eb7305 140 ? cBOOL(SvUTF8(PL_numeric_radix_sv))
2fcc0ca9
KW
141 : 0);
142 }
143#endif
69014004 144
98994639
HS
145# endif /* HAS_LOCALECONV */
146#endif /* USE_LOCALE_NUMERIC */
147}
148
a39edc4c
KW
149/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
150 * return of setlocale(), then this is extremely likely to be the C or POSIX
151 * locale. However, the output of setlocale() is documented to be opaque, but
152 * the odds are extremely small that it would return these two strings for some
153 * other locale. Note that VMS in these two locales includes many non-ASCII
154 * characters as controls and punctuation (below are hex bytes):
155 * cntrl: 00-1F 7F 84-97 9B-9F
156 * punct: 21-2F 3A-40 5B-60 7B-7E A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
157 * Oddly, none there are listed as alphas, though some represent alphabetics
158 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
98b630b3
KW
159#define isNAME_C_OR_POSIX(name) ((name) != NULL \
160 && ((*(name) == 'C' && (*(name + 1)) == '\0') \
a39edc4c
KW
161 || strEQ((name), "POSIX")))
162
98994639 163void
8772537c 164Perl_new_numeric(pTHX_ const char *newnum)
98994639
HS
165{
166#ifdef USE_LOCALE_NUMERIC
0d071d52
KW
167
168 /* Called after all libc setlocale() calls affecting LC_NUMERIC, to tell
169 * core Perl this and that 'newnum' is the name of the new locale.
170 * It installs this locale as the current underlying default.
171 *
172 * The default locale and the C locale can be toggled between by use of the
173 * set_numeric_local() and set_numeric_standard() functions, which should
174 * probably not be called directly, but only via macros like
175 * SET_NUMERIC_STANDARD() in perl.h.
176 *
177 * The toggling is necessary mainly so that a non-dot radix decimal point
178 * character can be output, while allowing internal calculations to use a
179 * dot.
180 *
181 * This sets several interpreter-level variables:
bb304765 182 * PL_numeric_name The underlying locale's name: a copy of 'newnum'
0d071d52 183 * PL_numeric_local A boolean indicating if the toggled state is such
7738054c
KW
184 * that the current locale is the program's underlying
185 * locale
186 * PL_numeric_standard An int indicating if the toggled state is such
187 * that the current locale is the C locale. If non-zero,
188 * it is in C; if > 1, it means it may not be toggled away
189 * from C.
0d071d52
KW
190 * Note that both of the last two variables can be true at the same time,
191 * if the underlying locale is C. (Toggling is a no-op under these
192 * circumstances.)
193 *
194 * Any code changing the locale (outside this file) should use
195 * POSIX::setlocale, which calls this function. Therefore this function
196 * should be called directly only from this file and from
197 * POSIX::setlocale() */
198
b03f34cf 199 char *save_newnum;
98994639
HS
200
201 if (! newnum) {
43c5f42d
NC
202 Safefree(PL_numeric_name);
203 PL_numeric_name = NULL;
98994639
HS
204 PL_numeric_standard = TRUE;
205 PL_numeric_local = TRUE;
206 return;
207 }
208
b03f34cf 209 save_newnum = stdize_locale(savepv(newnum));
abe1abcf
KW
210
211 PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
212 PL_numeric_local = TRUE;
213
b03f34cf 214 if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
98994639 215 Safefree(PL_numeric_name);
b03f34cf 216 PL_numeric_name = save_newnum;
b03f34cf 217 }
abe1abcf
KW
218 else {
219 Safefree(save_newnum);
220 }
4c28b29c
KW
221
222 /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't
223 * have to worry about the radix being a non-dot. (Core operations that
224 * need the underlying locale change to it temporarily). */
225 set_numeric_standard();
226
e19f01cb 227 set_numeric_radix();
6959d69d 228
f2ce9e1c
JH
229#else
230 PERL_UNUSED_ARG(newnum);
98994639
HS
231#endif /* USE_LOCALE_NUMERIC */
232}
233
234void
235Perl_set_numeric_standard(pTHX)
236{
237#ifdef USE_LOCALE_NUMERIC
28c1bf33
KW
238 /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like
239 * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The
240 * macro avoids calling this routine if toggling isn't necessary according
241 * to our records (which could be wrong if some XS code has changed the
242 * locale behind our back) */
0d071d52 243
a9b8c0d8
KW
244 setlocale(LC_NUMERIC, "C");
245 PL_numeric_standard = TRUE;
246 PL_numeric_local = isNAME_C_OR_POSIX(PL_numeric_name);
247 set_numeric_radix();
2fcc0ca9
KW
248#ifdef DEBUGGING
249 if (DEBUG_L_TEST || debug_initialization) {
250 PerlIO_printf(Perl_debug_log,
251 "Underlying LC_NUMERIC locale now is C\n");
252 }
253#endif
98994639
HS
254
255#endif /* USE_LOCALE_NUMERIC */
256}
257
258void
259Perl_set_numeric_local(pTHX)
260{
261#ifdef USE_LOCALE_NUMERIC
28c1bf33
KW
262 /* Toggle the LC_NUMERIC locale to the current underlying default. Most
263 * code should use the macros like SET_NUMERIC_LOCAL() in perl.h instead of
264 * calling this directly. The macro avoids calling this routine if
265 * toggling isn't necessary according to our records (which could be wrong
266 * if some XS code has changed the locale behind our back) */
a9b8c0d8
KW
267
268 setlocale(LC_NUMERIC, PL_numeric_name);
269 PL_numeric_standard = isNAME_C_OR_POSIX(PL_numeric_name);
270 PL_numeric_local = TRUE;
271 set_numeric_radix();
2fcc0ca9
KW
272#ifdef DEBUGGING
273 if (DEBUG_L_TEST || debug_initialization) {
274 PerlIO_printf(Perl_debug_log,
69014004 275 "Underlying LC_NUMERIC locale now is %s\n",
2fcc0ca9
KW
276 PL_numeric_name);
277 }
278#endif
98994639
HS
279
280#endif /* USE_LOCALE_NUMERIC */
281}
282
283/*
284 * Set up for a new ctype locale.
285 */
a4f00dcc
KW
286STATIC void
287S_new_ctype(pTHX_ const char *newctype)
98994639
HS
288{
289#ifdef USE_LOCALE_CTYPE
0d071d52
KW
290
291 /* Called after all libc setlocale() calls affecting LC_CTYPE, to tell
292 * core Perl this and that 'newctype' is the name of the new locale.
293 *
294 * This function sets up the folding arrays for all 256 bytes, assuming
295 * that tofold() is tolc() since fold case is not a concept in POSIX,
296 *
297 * Any code changing the locale (outside this file) should use
298 * POSIX::setlocale, which calls this function. Therefore this function
299 * should be called directly only from this file and from
300 * POSIX::setlocale() */
301
27da23d5 302 dVAR;
68067e4e 303 UV i;
98994639 304
7918f24d
NC
305 PERL_ARGS_ASSERT_NEW_CTYPE;
306
215c5139
KW
307 /* We will replace any bad locale warning with 1) nothing if the new one is
308 * ok; or 2) a new warning for the bad new locale */
309 if (PL_warn_locale) {
310 SvREFCNT_dec_NN(PL_warn_locale);
311 PL_warn_locale = NULL;
312 }
313
c1284011 314 PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
31f05a37
KW
315
316 /* A UTF-8 locale gets standard rules. But note that code still has to
317 * handle this specially because of the three problematic code points */
318 if (PL_in_utf8_CTYPE_locale) {
319 Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
320 }
321 else {
8c6180a9
KW
322 /* Assume enough space for every character being bad. 4 spaces each
323 * for the 94 printable characters that are output like "'x' "; and 5
324 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
325 * NUL */
326 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ];
327
cc9eaeb0
KW
328 /* Don't check for problems if we are suppressing the warnings */
329 bool check_for_problems = ckWARN_d(WARN_LOCALE)
330 || UNLIKELY(DEBUG_L_TEST);
8c6180a9
KW
331 bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
332 to start */
333 unsigned int bad_count = 0; /* Count of bad characters */
334
baa60164
KW
335 for (i = 0; i < 256; i++) {
336 if (isUPPER_LC((U8) i))
337 PL_fold_locale[i] = (U8) toLOWER_LC((U8) i);
338 else if (isLOWER_LC((U8) i))
339 PL_fold_locale[i] = (U8) toUPPER_LC((U8) i);
340 else
341 PL_fold_locale[i] = (U8) i;
8c6180a9
KW
342
343 /* If checking for locale problems, see if the native ASCII-range
344 * printables plus \n and \t are in their expected categories in
345 * the new locale. If not, this could mean big trouble, upending
346 * Perl's and most programs' assumptions, like having a
347 * metacharacter with special meaning become a \w. Fortunately,
348 * it's very rare to find locales that aren't supersets of ASCII
349 * nowadays. It isn't a problem for most controls to be changed
350 * into something else; we check only \n and \t, though perhaps \r
351 * could be an issue as well. */
352 if (check_for_problems
353 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
354 {
355 if ((isALPHANUMERIC_A(i) && ! isALPHANUMERIC_LC(i))
356 || (isPUNCT_A(i) && ! isPUNCT_LC(i))
357 || (isBLANK_A(i) && ! isBLANK_LC(i))
358 || (i == '\n' && ! isCNTRL_LC(i)))
359 {
360 if (bad_count) { /* Separate multiple entries with a
361 blank */
362 bad_chars_list[bad_count++] = ' ';
363 }
364 bad_chars_list[bad_count++] = '\'';
365 if (isPRINT_A(i)) {
366 bad_chars_list[bad_count++] = (char) i;
367 }
368 else {
369 bad_chars_list[bad_count++] = '\\';
370 if (i == '\n') {
371 bad_chars_list[bad_count++] = 'n';
372 }
373 else {
374 assert(i == '\t');
375 bad_chars_list[bad_count++] = 't';
376 }
377 }
378 bad_chars_list[bad_count++] = '\'';
379 bad_chars_list[bad_count] = '\0';
380 }
381 }
382 }
383
384#ifdef MB_CUR_MAX
385 /* We only handle single-byte locales (outside of UTF-8 ones; so if
d35fca5f 386 * this locale requires more than one byte, there are going to be
8c6180a9 387 * problems. */
9c8a6dc2
KW
388 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
389 "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
390 __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
391
ba1a4362
KW
392 if (check_for_problems && MB_CUR_MAX > 1
393
394 /* Some platforms return MB_CUR_MAX > 1 for even the "C"
395 * locale. Just assume that the implementation for them (plus
396 * for POSIX) is correct and the > 1 value is spurious. (Since
397 * these are specially handled to never be considered UTF-8
398 * locales, as long as this is the only problem, everything
399 * should work fine */
400 && strNE(newctype, "C") && strNE(newctype, "POSIX"))
401 {
8c6180a9
KW
402 multi_byte_locale = TRUE;
403 }
404#endif
405
406 if (bad_count || multi_byte_locale) {
780fcc9f 407 PL_warn_locale = Perl_newSVpvf(aTHX_
8c6180a9 408 "Locale '%s' may not work well.%s%s%s\n",
780fcc9f 409 newctype,
8c6180a9
KW
410 (multi_byte_locale)
411 ? " Some characters in it are not recognized by"
412 " Perl."
413 : "",
414 (bad_count)
415 ? "\nThe following characters (and maybe others)"
416 " may not have the same meaning as the Perl"
417 " program expects:\n"
418 : "",
419 (bad_count)
420 ? bad_chars_list
421 : ""
422 );
cc9eaeb0 423 /* If we are actually in the scope of the locale or are debugging,
bddebb56
KW
424 * output the message now. If not in that scope, we save the
425 * message to be output at the first operation using this locale,
426 * if that actually happens. Most programs don't use locales, so
427 * they are immune to bad ones. */
cc9eaeb0 428 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
780fcc9f
KW
429
430 /* We have to save 'newctype' because the setlocale() just
431 * below may destroy it. The next setlocale() further down
432 * should restore it properly so that the intermediate change
433 * here is transparent to this function's caller */
434 const char * const badlocale = savepv(newctype);
435
436 setlocale(LC_CTYPE, "C");
437
438 /* The '0' below suppresses a bogus gcc compiler warning */
439 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
bddebb56 440
780fcc9f 441 setlocale(LC_CTYPE, badlocale);
c0f3a893 442 Safefree(badlocale);
bddebb56
KW
443
444 if (IN_LC(LC_CTYPE)) {
445 SvREFCNT_dec_NN(PL_warn_locale);
446 PL_warn_locale = NULL;
447 }
780fcc9f 448 }
baa60164 449 }
31f05a37 450 }
98994639
HS
451
452#endif /* USE_LOCALE_CTYPE */
7918f24d 453 PERL_ARGS_ASSERT_NEW_CTYPE;
8772537c 454 PERL_UNUSED_ARG(newctype);
96a5add6 455 PERL_UNUSED_CONTEXT;
98994639
HS
456}
457
98994639 458void
2726666d
KW
459Perl__warn_problematic_locale()
460{
2726666d
KW
461
462#ifdef USE_LOCALE_CTYPE
463
5f04a188
KW
464 dTHX;
465
466 /* Internal-to-core function that outputs the message in PL_warn_locale,
467 * and then NULLS it. Should be called only through the macro
468 * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */
469
2726666d
KW
470 if (PL_warn_locale) {
471 /*GCC_DIAG_IGNORE(-Wformat-security); Didn't work */
472 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
473 SvPVX(PL_warn_locale),
474 0 /* dummy to avoid compiler warning */ );
475 /* GCC_DIAG_RESTORE; */
476 SvREFCNT_dec_NN(PL_warn_locale);
477 PL_warn_locale = NULL;
478 }
479
480#endif
481
482}
483
a4f00dcc
KW
484STATIC void
485S_new_collate(pTHX_ const char *newcoll)
98994639
HS
486{
487#ifdef USE_LOCALE_COLLATE
0d071d52
KW
488
489 /* Called after all libc setlocale() calls affecting LC_COLLATE, to tell
490 * core Perl this and that 'newcoll' is the name of the new locale.
491 *
492 * Any code changing the locale (outside this file) should use
493 * POSIX::setlocale, which calls this function. Therefore this function
494 * should be called directly only from this file and from
d35fca5f
KW
495 * POSIX::setlocale().
496 *
497 * The design of locale collation is that every locale change is given an
498 * index 'PL_collation_ix'. The first time a string particpates in an
499 * operation that requires collation while locale collation is active, it
500 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
501 * magic includes the collation index, and the transformation of the string
502 * by strxfrm(), q.v. That transformation is used when doing comparisons,
503 * instead of the string itself. If a string changes, the magic is
504 * cleared. The next time the locale changes, the index is incremented,
505 * and so we know during a comparison that the transformation is not
506 * necessarily still valid, and so is recomputed. Note that if the locale
507 * changes enough times, the index could wrap (a U32), and it is possible
508 * that a transformation would improperly be considered valid, leading to
509 * an unlikely bug */
0d071d52 510
98994639
HS
511 if (! newcoll) {
512 if (PL_collation_name) {
513 ++PL_collation_ix;
514 Safefree(PL_collation_name);
515 PL_collation_name = NULL;
516 }
517 PL_collation_standard = TRUE;
00bf60ca 518 is_standard_collation:
98994639
HS
519 PL_collxfrm_base = 0;
520 PL_collxfrm_mult = 2;
165a1c52 521 PL_in_utf8_COLLATE_locale = FALSE;
f28f4d2a 522 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 523 PL_strxfrm_max_cp = 0;
98994639
HS
524 return;
525 }
526
d35fca5f 527 /* If this is not the same locale as currently, set the new one up */
98994639
HS
528 if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
529 ++PL_collation_ix;
530 Safefree(PL_collation_name);
531 PL_collation_name = stdize_locale(savepv(newcoll));
a39edc4c 532 PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
00bf60ca
KW
533 if (PL_collation_standard) {
534 goto is_standard_collation;
535 }
98994639 536
165a1c52 537 PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
f28f4d2a 538 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 539 PL_strxfrm_max_cp = 0;
165a1c52 540
59c018b9
KW
541 /* A locale collation definition includes primary, secondary, tertiary,
542 * etc. weights for each character. To sort, the primary weights are
543 * used, and only if they compare equal, then the secondary weights are
544 * used, and only if they compare equal, then the tertiary, etc.
545 *
546 * strxfrm() works by taking the input string, say ABC, and creating an
547 * output transformed string consisting of first the primary weights,
548 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
549 * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
550 * may not have weights at every level. In our example, let's say B
551 * doesn't have a tertiary weight, and A doesn't have a secondary
552 * weight. The constructed string is then going to be
553 * A¹B¹C¹ B²C² A³C³ ....
554 * This has the desired effect that strcmp() will look at the secondary
555 * or tertiary weights only if the strings compare equal at all higher
556 * priority weights. The spaces shown here, like in
c342d20e 557 * "A¹B¹C¹ A²B²C² "
59c018b9
KW
558 * are not just for readability. In the general case, these must
559 * actually be bytes, which we will call here 'separator weights'; and
560 * they must be smaller than any other weight value, but since these
561 * are C strings, only the terminating one can be a NUL (some
562 * implementations may include a non-NUL separator weight just before
563 * the NUL). Implementations tend to reserve 01 for the separator
564 * weights. They are needed so that a shorter string's secondary
565 * weights won't be misconstrued as primary weights of a longer string,
566 * etc. By making them smaller than any other weight, the shorter
567 * string will sort first. (Actually, if all secondary weights are
568 * smaller than all primary ones, there is no need for a separator
569 * weight between those two levels, etc.)
570 *
571 * The length of the transformed string is roughly a linear function of
572 * the input string. It's not exactly linear because some characters
573 * don't have weights at all levels. When we call strxfrm() we have to
574 * allocate some memory to hold the transformed string. The
575 * calculations below try to find coefficients 'm' and 'b' for this
576 * locale so that m*x + b equals how much space we need, given the size
577 * of the input string in 'x'. If we calculate too small, we increase
578 * the size as needed, and call strxfrm() again, but it is better to
579 * get it right the first time to avoid wasted expensive string
580 * transformations. */
581
98994639 582 {
79f120c8
KW
583 /* We use the string below to find how long the tranformation of it
584 * is. Almost all locales are supersets of ASCII, or at least the
585 * ASCII letters. We use all of them, half upper half lower,
586 * because if we used fewer, we might hit just the ones that are
587 * outliers in a particular locale. Most of the strings being
588 * collated will contain a preponderance of letters, and even if
589 * they are above-ASCII, they are likely to have the same number of
590 * weight levels as the ASCII ones. It turns out that digits tend
591 * to have fewer levels, and some punctuation has more, but those
592 * are relatively sparse in text, and khw believes this gives a
593 * reasonable result, but it could be changed if experience so
594 * dictates. */
595 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
596 char * x_longer; /* Transformed 'longer' */
597 Size_t x_len_longer; /* Length of 'x_longer' */
598
599 char * x_shorter; /* We also transform a substring of 'longer' */
600 Size_t x_len_shorter;
601
a4a439fb 602 /* _mem_collxfrm() is used get the transformation (though here we
79f120c8
KW
603 * are interested only in its length). It is used because it has
604 * the intelligence to handle all cases, but to work, it needs some
605 * values of 'm' and 'b' to get it started. For the purposes of
606 * this calculation we use a very conservative estimate of 'm' and
607 * 'b'. This assumes a weight can be multiple bytes, enough to
608 * hold any UV on the platform, and there are 5 levels, 4 weight
609 * bytes, and a trailing NUL. */
610 PL_collxfrm_base = 5;
611 PL_collxfrm_mult = 5 * sizeof(UV);
612
613 /* Find out how long the transformation really is */
a4a439fb
KW
614 x_longer = _mem_collxfrm(longer,
615 sizeof(longer) - 1,
616 &x_len_longer,
617
618 /* We avoid converting to UTF-8 in the
619 * called function by telling it the
620 * string is in UTF-8 if the locale is a
621 * UTF-8 one. Since the string passed
622 * here is invariant under UTF-8, we can
623 * claim it's UTF-8 even though it isn't.
624 * */
625 PL_in_utf8_COLLATE_locale);
79f120c8
KW
626 Safefree(x_longer);
627
628 /* Find out how long the transformation of a substring of 'longer'
629 * is. Together the lengths of these transformations are
630 * sufficient to calculate 'm' and 'b'. The substring is all of
631 * 'longer' except the first character. This minimizes the chances
632 * of being swayed by outliers */
a4a439fb 633 x_shorter = _mem_collxfrm(longer + 1,
79f120c8 634 sizeof(longer) - 2,
a4a439fb
KW
635 &x_len_shorter,
636 PL_in_utf8_COLLATE_locale);
79f120c8
KW
637 Safefree(x_shorter);
638
639 /* If the results are nonsensical for this simple test, the whole
640 * locale definition is suspect. Mark it so that locale collation
641 * is not active at all for it. XXX Should we warn? */
642 if ( x_len_shorter == 0
643 || x_len_longer == 0
644 || x_len_shorter >= x_len_longer)
645 {
646 PL_collxfrm_mult = 0;
647 PL_collxfrm_base = 0;
648 }
649 else {
650 SSize_t base; /* Temporary */
651
652 /* We have both: m * strlen(longer) + b = x_len_longer
653 * m * strlen(shorter) + b = x_len_shorter;
654 * subtracting yields:
655 * m * (strlen(longer) - strlen(shorter))
656 * = x_len_longer - x_len_shorter
657 * But we have set things up so that 'shorter' is 1 byte smaller
658 * than 'longer'. Hence:
659 * m = x_len_longer - x_len_shorter
660 *
661 * But if something went wrong, make sure the multiplier is at
662 * least 1.
663 */
664 if (x_len_longer > x_len_shorter) {
665 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
666 }
667 else {
668 PL_collxfrm_mult = 1;
669 }
670
671 /* mx + b = len
672 * so: b = len - mx
673 * but in case something has gone wrong, make sure it is
674 * non-negative */
675 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
676 if (base < 0) {
677 base = 0;
678 }
679
680 /* Add 1 for the trailing NUL */
681 PL_collxfrm_base = base + 1;
682 }
58eebef2
KW
683
684#ifdef DEBUGGING
685 if (DEBUG_L_TEST || debug_initialization) {
686 PerlIO_printf(Perl_debug_log,
b07929e4
KW
687 "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
688 "x_len_longer=%zu,"
689 " collate multipler=%zu, collate base=%zu\n",
58eebef2
KW
690 __FILE__, __LINE__,
691 PL_in_utf8_COLLATE_locale,
692 x_len_shorter, x_len_longer,
693 PL_collxfrm_mult, PL_collxfrm_base);
694 }
695#endif
98994639
HS
696 }
697 }
698
f2ce9e1c
JH
699#else
700 PERL_UNUSED_ARG(newcoll);
98994639
HS
701#endif /* USE_LOCALE_COLLATE */
702}
703
a4f00dcc 704#ifndef WIN32 /* No wrapper except on Windows */
b385bb4d 705
a4f00dcc
KW
706#define my_setlocale(a,b) setlocale(a,b)
707
708#else /* WIN32 */
709
710STATIC char *
711S_my_setlocale(pTHX_ int category, const char* locale)
b385bb4d
KW
712{
713 /* This, for Windows, emulates POSIX setlocale() behavior. There is no
714 * difference unless the input locale is "", which means on Windows to get
715 * the machine default, which is set via the computer's "Regional and
716 * Language Options" (or its current equivalent). In POSIX, it instead
717 * means to find the locale from the user's environment. This routine
718 * looks in the environment, and, if anything is found, uses that instead
719 * of going to the machine default. If there is no environment override,
720 * the machine default is used, as normal, by calling the real setlocale()
721 * with "". The POSIX behavior is to use the LC_ALL variable if set;
722 * otherwise to use the particular category's variable if set; otherwise to
723 * use the LANG variable. */
724
175c4cf9 725 bool override_LC_ALL = FALSE;
89f7b9aa
KW
726 char * result;
727
b385bb4d
KW
728 if (locale && strEQ(locale, "")) {
729# ifdef LC_ALL
730 locale = PerlEnv_getenv("LC_ALL");
731 if (! locale) {
732#endif
733 switch (category) {
734# ifdef LC_ALL
735 case LC_ALL:
481465ea 736 override_LC_ALL = TRUE;
b385bb4d
KW
737 break; /* We already know its variable isn't set */
738# endif
739# ifdef USE_LOCALE_TIME
740 case LC_TIME:
741 locale = PerlEnv_getenv("LC_TIME");
742 break;
743# endif
744# ifdef USE_LOCALE_CTYPE
745 case LC_CTYPE:
746 locale = PerlEnv_getenv("LC_CTYPE");
747 break;
748# endif
749# ifdef USE_LOCALE_COLLATE
750 case LC_COLLATE:
751 locale = PerlEnv_getenv("LC_COLLATE");
752 break;
753# endif
754# ifdef USE_LOCALE_MONETARY
755 case LC_MONETARY:
756 locale = PerlEnv_getenv("LC_MONETARY");
757 break;
758# endif
759# ifdef USE_LOCALE_NUMERIC
760 case LC_NUMERIC:
761 locale = PerlEnv_getenv("LC_NUMERIC");
762 break;
763# endif
764# ifdef USE_LOCALE_MESSAGES
765 case LC_MESSAGES:
766 locale = PerlEnv_getenv("LC_MESSAGES");
767 break;
768# endif
769 default:
770 /* This is a category, like PAPER_SIZE that we don't
771 * know about; and so can't provide a wrapper. */
772 break;
773 }
774 if (! locale) {
775 locale = PerlEnv_getenv("LANG");
481465ea 776 if (! locale) {
b385bb4d
KW
777 locale = "";
778 }
779 }
780# ifdef LC_ALL
781 }
782# endif
783 }
784
89f7b9aa 785 result = setlocale(category, locale);
bbc98134 786 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
a4f00dcc 787 setlocale_debug_string(category, locale, result)));
89f7b9aa 788
481465ea 789 if (! override_LC_ALL) {
89f7b9aa
KW
790 return result;
791 }
792
dfd77d7a 793 /* Here the input category was LC_ALL, and we have set it to what is in the
481465ea
KW
794 * LANG variable or the system default if there is no LANG. But these have
795 * lower priority than the other LC_foo variables, so override it for each
796 * one that is set. (If they are set to "", it means to use the same thing
797 * we just set LC_ALL to, so can skip) */
89f7b9aa
KW
798# ifdef USE_LOCALE_TIME
799 result = PerlEnv_getenv("LC_TIME");
730252b2 800 if (result && strNE(result, "")) {
89f7b9aa 801 setlocale(LC_TIME, result);
bbc98134
KW
802 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
803 __FILE__, __LINE__,
a4f00dcc 804 setlocale_debug_string(LC_TIME, result, "not captured")));
89f7b9aa
KW
805 }
806# endif
807# ifdef USE_LOCALE_CTYPE
808 result = PerlEnv_getenv("LC_CTYPE");
730252b2 809 if (result && strNE(result, "")) {
89f7b9aa 810 setlocale(LC_CTYPE, result);
bbc98134
KW
811 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
812 __FILE__, __LINE__,
a4f00dcc 813 setlocale_debug_string(LC_CTYPE, result, "not captured")));
89f7b9aa
KW
814 }
815# endif
816# ifdef USE_LOCALE_COLLATE
817 result = PerlEnv_getenv("LC_COLLATE");
730252b2 818 if (result && strNE(result, "")) {
89f7b9aa 819 setlocale(LC_COLLATE, result);
bbc98134
KW
820 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
821 __FILE__, __LINE__,
a4f00dcc 822 setlocale_debug_string(LC_COLLATE, result, "not captured")));
89f7b9aa
KW
823 }
824# endif
825# ifdef USE_LOCALE_MONETARY
826 result = PerlEnv_getenv("LC_MONETARY");
730252b2 827 if (result && strNE(result, "")) {
89f7b9aa 828 setlocale(LC_MONETARY, result);
bbc98134
KW
829 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
830 __FILE__, __LINE__,
a4f00dcc 831 setlocale_debug_string(LC_MONETARY, result, "not captured")));
89f7b9aa
KW
832 }
833# endif
834# ifdef USE_LOCALE_NUMERIC
835 result = PerlEnv_getenv("LC_NUMERIC");
730252b2 836 if (result && strNE(result, "")) {
89f7b9aa 837 setlocale(LC_NUMERIC, result);
bbc98134
KW
838 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
839 __FILE__, __LINE__,
a4f00dcc 840 setlocale_debug_string(LC_NUMERIC, result, "not captured")));
89f7b9aa
KW
841 }
842# endif
843# ifdef USE_LOCALE_MESSAGES
844 result = PerlEnv_getenv("LC_MESSAGES");
730252b2 845 if (result && strNE(result, "")) {
89f7b9aa 846 setlocale(LC_MESSAGES, result);
bbc98134
KW
847 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
848 __FILE__, __LINE__,
a4f00dcc 849 setlocale_debug_string(LC_MESSAGES, result, "not captured")));
89f7b9aa
KW
850 }
851# endif
852
bbc98134
KW
853 result = setlocale(LC_ALL, NULL);
854 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
855 __FILE__, __LINE__,
a4f00dcc 856 setlocale_debug_string(LC_ALL, NULL, result)));
89f7b9aa 857
bbc98134 858 return result;
b385bb4d
KW
859}
860
861#endif
862
a4f00dcc
KW
863char *
864Perl_setlocale(int category, const char * locale)
865{
866 /* This wraps POSIX::setlocale() */
867
868 char * retval;
869 dTHX;
870
871
872#ifdef USE_LOCALE_NUMERIC
873
874 /* A NULL locale means only query what the current one is. We
875 * have the LC_NUMERIC name saved, because we are normally switched
876 * into the C locale for it. Switch back so an LC_ALL query will yield
877 * the correct results; all other categories don't require special
878 * handling */
879 if (locale == NULL) {
880 if (category == LC_NUMERIC) {
881 return savepv(PL_numeric_name);
882 }
883
884# ifdef LC_ALL
885
886 else if (category == LC_ALL) {
887 SET_NUMERIC_UNDERLYING();
888 }
889
890# endif
891
892 }
893
894#endif
895
896 retval = my_setlocale(category, locale);
897
898 DEBUG_L(PerlIO_printf(Perl_debug_log,
899 "%s:%d: %s\n", __FILE__, __LINE__,
900 setlocale_debug_string(category, locale, retval)));
901 if (! retval) {
902 /* Should never happen that a query would return an error, but be
903 * sure and reset to C locale */
904 if (locale == 0) {
905 SET_NUMERIC_STANDARD();
906 }
907 return NULL;
908 }
909
910 /* Save retval since subsequent setlocale() calls may overwrite it. */
911 retval = savepv(retval);
912
913 /* If locale == NULL, we are just querying the state, but may have switched
914 * to NUMERIC_UNDERLYING. Switch back before returning. */
915 if (locale == NULL) {
916 SET_NUMERIC_STANDARD();
917 return retval;
918 }
919 else { /* Now that have switched locales, we have to update our records to
920 correspond */
921
922#ifdef USE_LOCALE_CTYPE
923
924 if ( category == LC_CTYPE
925
926# ifdef LC_ALL
927
928 || category == LC_ALL
929
930# endif
931
932 )
933 {
934 char *newctype;
935
936# ifdef LC_ALL
937
938 if (category == LC_ALL) {
939 newctype = setlocale(LC_CTYPE, NULL);
940 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
941 "%s:%d: %s\n", __FILE__, __LINE__,
942 setlocale_debug_string(LC_CTYPE, NULL, newctype)));
943 }
944 else
945
946# endif
947
948 newctype = retval;
949 new_ctype(newctype);
950 }
951
952#endif /* USE_LOCALE_CTYPE */
953
954#ifdef USE_LOCALE_COLLATE
955
956 if ( category == LC_COLLATE
957
958# ifdef LC_ALL
959
960 || category == LC_ALL
961
962# endif
963
964 )
965 {
966 char *newcoll;
967
968# ifdef LC_ALL
969
970 if (category == LC_ALL) {
971 newcoll = setlocale(LC_COLLATE, NULL);
972 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
973 "%s:%d: %s\n", __FILE__, __LINE__,
974 setlocale_debug_string(LC_COLLATE, NULL, newcoll)));
975 }
976 else
977
978# endif
979
980 newcoll = retval;
981 new_collate(newcoll);
982 }
983
984#endif /* USE_LOCALE_COLLATE */
985
986#ifdef USE_LOCALE_NUMERIC
987
988 if ( category == LC_NUMERIC
989
990# ifdef LC_ALL
991
992 || category == LC_ALL
993
994# endif
995
996 )
997 {
998 char *newnum;
999
1000# ifdef LC_ALL
1001
1002 if (category == LC_ALL) {
1003 newnum = setlocale(LC_NUMERIC, NULL);
1004 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1005 "%s:%d: %s\n", __FILE__, __LINE__,
1006 setlocale_debug_string(LC_NUMERIC, NULL, newnum)));
1007 }
1008 else
1009
1010# endif
1011
1012 newnum = retval;
1013 new_numeric(newnum);
1014 }
1015
1016#endif /* USE_LOCALE_NUMERIC */
1017
1018 }
1019
1020 return retval;
1021
f7416781
KW
1022
1023}
1024
1025PERL_STATIC_INLINE const char *
1026S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset)
1027{
1028 /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size',
1029 * growing it if necessary */
1030
1031 const Size_t string_size = strlen(string) + offset + 1;
1032
1033 PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
1034
1035 if (*buf_size == 0) {
1036 Newx(*buf, string_size, char);
1037 *buf_size = string_size;
1038 }
1039 else if (string_size > *buf_size) {
1040 Renew(*buf, string_size, char);
1041 *buf_size = string_size;
1042 }
1043
1044 Copy(string, *buf + offset, string_size - offset, char);
1045 return *buf;
1046}
1047
1048/*
1049
1050=head1 Locale-related functions and macros
1051
1052=for apidoc Perl_langinfo
1053
1054This is an (almostª) drop-in replacement for the system C<L<nl_langinfo(3)>>,
1055taking the same C<item> parameter values, and returning the same information.
1056But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
1057of Perl's locale handling from your code, and can be used on systems that lack
1058a native C<nl_langinfo>.
1059
1060Expanding on these:
1061
1062=over
1063
1064=item *
1065
1066It delivers the correct results for the C<RADIXCHAR> and C<THOUSESEP> items,
1067without you having to write extra code. The reason for the extra code would be
1068because these are from the C<LC_NUMERIC> locale category, which is normally
1069kept set to the C locale by Perl, no matter what the underlying locale is
1070supposed to be, and so to get the expected results, you have to temporarily
1071toggle into the underlying locale, and later toggle back. (You could use
1072plain C<nl_langinfo> and C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this
1073but then you wouldn't get the other advantages of C<Perl_langinfo()>; not
1074keeping C<LC_NUMERIC> in the C locale would break a lot of CPAN, which is
1075expecting the radix (decimal point) character to be a dot.)
1076
1077=item *
1078
1079Depending on C<item>, it works on systems that don't have C<nl_langinfo>, hence
1080makes your code more portable. Of the fifty-some possible items specified by
1081the POSIX 2008 standard,
1082L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
1083only two are completely unimplemented. It uses various techniques to recover
1084the other items, including calling C<L<localeconv(3)>>, and C<L<strftime(3)>>,
1085both of which are specified in C89, so should be always be available. Later
1086C<strftime()> versions have additional capabilities; C<""> is returned for
1087those not available on your system.
1088
1089The details for those items which may differ from what this emulation returns
1090and what a native C<nl_langinfo()> would return are:
1091
1092=over
1093
1094=item C<CODESET>
1095
1096=item C<ERA>
1097
1098Unimplemented, so returns C<"">.
1099
1100=item C<YESEXPR>
1101
1102=item C<NOEXPR>
1103
1104Only the values for English are returned. Earlier POSIX standards also
1105specified C<YESSTR> and C<NOSTR>, but these have been removed from POSIX 2008,
1106and aren't supported by C<Perl_langinfo>.
1107
1108=item C<D_FMT>
1109
1110Always evaluates to C<%x>, the locale's appropriate date representation.
1111
1112=item C<T_FMT>
1113
1114Always evaluates to C<%X>, the locale's appropriate time representation.
1115
1116=item C<D_T_FMT>
1117
1118Always evaluates to C<%c>, the locale's appropriate date and time
1119representation.
1120
1121=item C<CRNCYSTR>
1122
1123The return may be incorrect for those rare locales where the currency symbol
1124replaces the radix character.
1125Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1126to work differently.
1127
1128=item C<ALT_DIGITS>
1129
1130Currently this gives the same results as Linux does.
1131Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1132to work differently.
1133
1134=item C<ERA_D_FMT>
1135
1136=item C<ERA_T_FMT>
1137
1138=item C<ERA_D_T_FMT>
1139
1140=item C<T_FMT_AMPM>
1141
1142These are derived by using C<strftime()>, and not all versions of that function
1143know about them. C<""> is returned for these on such systems.
1144
1145=back
1146
1147When using C<Perl_langinfo> on systems that don't have a native
1148C<nl_langinfo()>, you must
1149
1150 #include "perl_langinfo.h"
1151
1152before the C<perl.h> C<#include>. You can replace your C<langinfo.h>
1153C<#include> with this one. (Doing it this way keeps out the symbols that plain
1154C<langinfo.h> imports into the namespace for code that doesn't need it.)
1155
1156You also should not use the bare C<langinfo.h> item names, but should preface
1157them with C<PERL_>, so use C<PERL_RADIXCHAR> instead of plain C<RADIXCHAR>.
1158The C<PERL_I<foo>> versions will also work for this function on systems that do
1159have a native C<nl_langinfo>.
1160
1161=item *
1162
1163It is thread-friendly, returning its result in a buffer that won't be
1164overwritten by another thread, so you don't have to code for that possibility.
1165The buffer can be overwritten by the next call to C<nl_langinfo> or
1166C<Perl_langinfo> in the same thread.
1167
1168=item *
1169
1170ªIt returns S<C<const char *>>, whereas plain C<nl_langinfo()> returns S<C<char
1171*>>, but you are (only by documentation) forbidden to write into the buffer.
1172By declaring this C<const>, the compiler enforces this restriction. The extra
1173C<const> is why this isn't an unequivocal drop-in replacement for
1174C<nl_langinfo>.
1175
1176=back
1177
1178The original impetus for C<Perl_langinfo()> was so that code that needs to
1179find out the current currency symbol, floating point radix character, or digit
1180grouping separator can use, on all systems, the simpler and more
1181thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
1182pain to make thread-friendly. For other fields returned by C<localeconv>, it
1183is better to use the methods given in L<perlcall> to call
1184L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
1185
1186=cut
1187
1188*/
1189
1190const char *
1191#ifdef HAS_NL_LANGINFO
1192Perl_langinfo(const nl_item item)
1193#else
1194Perl_langinfo(const int item)
1195#endif
1196{
1197 bool toggle = TRUE;
ae74815b 1198 dTHX;
f7416781 1199
ab340fff
KW
1200#if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
1201#if ! defined(HAS_POSIX_2008_LOCALE)
f7416781 1202
ab340fff 1203 /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
ae74815b
KW
1204 * for those items dependent on it. This must be copied to a buffer before
1205 * switching back, as some systems destroy the buffer when setlocale() is
1206 * called */
f7416781
KW
1207
1208 LOCALE_LOCK;
1209
1210 if (toggle) {
1211 if (item == PERL_RADIXCHAR || item == PERL_THOUSEP) {
1212 setlocale(LC_NUMERIC, PL_numeric_name);
1213 }
1214 else {
1215 toggle = FALSE;
1216 }
1217 }
1218
1219 save_to_buffer(nl_langinfo(item), &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1220
1221 if (toggle) {
1222 setlocale(LC_NUMERIC, "C");
1223 }
1224
1225 LOCALE_UNLOCK;
1226
1227 return PL_langinfo_buf;
1228
ab340fff
KW
1229# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */
1230
1231 bool do_free = FALSE;
1232 locale_t cur = uselocale((locale_t) 0);
1233
1234 if (cur == LC_GLOBAL_LOCALE) {
1235 cur = duplocale(LC_GLOBAL_LOCALE);
1236 do_free = TRUE;
1237 }
1238
1239 if ( toggle
1240 && (item == PERL_RADIXCHAR || item == PERL_THOUSEP))
1241 {
1242 cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
1243 do_free = TRUE;
1244 }
1245
1246 save_to_buffer(nl_langinfo_l(item, cur),
1247 &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1248 if (do_free) {
1249 freelocale(cur);
1250 }
1251
1252 return PL_langinfo_buf;
1253
1254# endif
f7416781 1255#else /* Below, emulate nl_langinfo as best we can */
f7416781
KW
1256# ifdef HAS_LOCALECONV
1257
1258 const struct lconv* lc;
1259
1260# endif
1261# ifdef HAS_STRFTIME
1262
1263 struct tm tm;
1264 bool return_format = FALSE; /* Return the %format, not the value */
1265 const char * format;
1266
1267# endif
1268
1269 /* We copy the results to a per-thread buffer, even if not multi-threaded.
1270 * This is in part to simplify this code, and partly because we need a
1271 * buffer anyway for strftime(), and partly because a call of localeconv()
1272 * could otherwise wipe out the buffer, and the programmer would not be
1273 * expecting this, as this is a nl_langinfo() substitute after all, so s/he
1274 * might be thinking their localeconv() is safe until another localeconv()
1275 * call. */
1276
1277 switch (item) {
1278 Size_t len;
1279 const char * retval;
1280
1281 /* These 2 are unimplemented */
1282 case PERL_CODESET:
1283 case PERL_ERA: /* For use with strftime() %E modifier */
1284
1285 default:
1286 return "";
1287
1288 /* We use only an English set, since we don't know any more */
1289 case PERL_YESEXPR: return "^[+1yY]";
1290 case PERL_NOEXPR: return "^[-0nN]";
1291
1292# ifdef HAS_LOCALECONV
1293
1294 case PERL_CRNCYSTR:
1295
1296 LOCALE_LOCK;
1297
1298 lc = localeconv();
1299 if (! lc || ! lc->currency_symbol || strEQ("", lc->currency_symbol))
1300 {
1301 LOCALE_UNLOCK;
1302 return "";
1303 }
1304
1305 /* Leave the first spot empty to be filled in below */
1306 save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
1307 &PL_langinfo_bufsize, 1);
1308 if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
1309 { /* khw couldn't figure out how the localedef specifications
1310 would show that the $ should replace the radix; this is
1311 just a guess as to how it might work.*/
1312 *PL_langinfo_buf = '.';
1313 }
1314 else if (lc->p_cs_precedes) {
1315 *PL_langinfo_buf = '-';
1316 }
1317 else {
1318 *PL_langinfo_buf = '+';
1319 }
1320
1321 LOCALE_UNLOCK;
1322 break;
1323
1324 case PERL_RADIXCHAR:
1325 case PERL_THOUSEP:
1326
1327 LOCALE_LOCK;
1328
1329 if (toggle) {
1330 setlocale(LC_NUMERIC, PL_numeric_name);
1331 }
1332
1333 lc = localeconv();
1334 if (! lc) {
1335 retval = "";
1336 }
1337 else switch (item) {
1338 case PERL_RADIXCHAR:
1339 if (! lc->decimal_point) {
1340 retval = "";
1341 }
1342 else {
1343 retval = lc->decimal_point;
1344 }
1345 break;
1346
1347 case PERL_THOUSEP:
1348 if (! lc->thousands_sep || strEQ("", lc->thousands_sep)) {
1349 retval = "";
1350 }
1351 else {
1352 retval = lc->thousands_sep;
1353 }
1354 break;
1355
1356 default:
1357 LOCALE_UNLOCK;
1358 Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
1359 __FILE__, __LINE__, item);
1360 }
1361
1362 save_to_buffer(retval, &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1363
1364 if (toggle) {
1365 setlocale(LC_NUMERIC, "C");
1366 }
1367
1368 LOCALE_UNLOCK;
1369
1370 break;
1371
1372# endif
1373# ifdef HAS_STRFTIME
1374
1375 /* These are defined by C89, so we assume that strftime supports them,
1376 * and so are returned unconditionally; they may not be what the locale
1377 * actually says, but should give good enough results for someone using
1378 * them as formats (as opposed to trying to parse them to figure out
1379 * what the locale says). The other format ones are actually tested to
1380 * verify they work on the platform */
1381 case PERL_D_FMT: return "%x";
1382 case PERL_T_FMT: return "%X";
1383 case PERL_D_T_FMT: return "%c";
1384
1385 /* These formats are only available in later strfmtime's */
1386 case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT:
1387 case PERL_T_FMT_AMPM:
1388
1389 /* The rest can be gotten from most versions of strftime(). */
1390 case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3:
1391 case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6:
1392 case PERL_ABDAY_7:
1393 case PERL_ALT_DIGITS:
1394 case PERL_AM_STR: case PERL_PM_STR:
1395 case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3:
1396 case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6:
1397 case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9:
1398 case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12:
1399 case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4:
1400 case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7:
1401 case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4:
1402 case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8:
1403 case PERL_MON_9: case PERL_MON_10: case PERL_MON_11: case PERL_MON_12:
1404
1405 LOCALE_LOCK;
1406
1407 init_tm(&tm); /* Precaution against core dumps */
1408 tm.tm_sec = 30;
1409 tm.tm_min = 30;
1410 tm.tm_hour = 6;
1411 tm.tm_year = 2017 - 1900;
1412 tm.tm_wday = 0;
1413 tm.tm_mon = 0;
1414 switch (item) {
1415 default:
1416 LOCALE_UNLOCK;
1417 Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
1418 __FILE__, __LINE__, item);
1419 NOT_REACHED; /* NOTREACHED */
1420
1421 case PERL_PM_STR: tm.tm_hour = 18;
1422 case PERL_AM_STR:
1423 format = "%p";
1424 break;
1425
1426 case PERL_ABDAY_7: tm.tm_wday++;
1427 case PERL_ABDAY_6: tm.tm_wday++;
1428 case PERL_ABDAY_5: tm.tm_wday++;
1429 case PERL_ABDAY_4: tm.tm_wday++;
1430 case PERL_ABDAY_3: tm.tm_wday++;
1431 case PERL_ABDAY_2: tm.tm_wday++;
1432 case PERL_ABDAY_1:
1433 format = "%a";
1434 break;
1435
1436 case PERL_DAY_7: tm.tm_wday++;
1437 case PERL_DAY_6: tm.tm_wday++;
1438 case PERL_DAY_5: tm.tm_wday++;
1439 case PERL_DAY_4: tm.tm_wday++;
1440 case PERL_DAY_3: tm.tm_wday++;
1441 case PERL_DAY_2: tm.tm_wday++;
1442 case PERL_DAY_1:
1443 format = "%A";
1444 break;
1445
1446 case PERL_ABMON_12: tm.tm_mon++;
1447 case PERL_ABMON_11: tm.tm_mon++;
1448 case PERL_ABMON_10: tm.tm_mon++;
1449 case PERL_ABMON_9: tm.tm_mon++;
1450 case PERL_ABMON_8: tm.tm_mon++;
1451 case PERL_ABMON_7: tm.tm_mon++;
1452 case PERL_ABMON_6: tm.tm_mon++;
1453 case PERL_ABMON_5: tm.tm_mon++;
1454 case PERL_ABMON_4: tm.tm_mon++;
1455 case PERL_ABMON_3: tm.tm_mon++;
1456 case PERL_ABMON_2: tm.tm_mon++;
1457 case PERL_ABMON_1:
1458 format = "%b";
1459 break;
1460
1461 case PERL_MON_12: tm.tm_mon++;
1462 case PERL_MON_11: tm.tm_mon++;
1463 case PERL_MON_10: tm.tm_mon++;
1464 case PERL_MON_9: tm.tm_mon++;
1465 case PERL_MON_8: tm.tm_mon++;
1466 case PERL_MON_7: tm.tm_mon++;
1467 case PERL_MON_6: tm.tm_mon++;
1468 case PERL_MON_5: tm.tm_mon++;
1469 case PERL_MON_4: tm.tm_mon++;
1470 case PERL_MON_3: tm.tm_mon++;
1471 case PERL_MON_2: tm.tm_mon++;
1472 case PERL_MON_1:
1473 format = "%B";
1474 break;
1475
1476 case PERL_T_FMT_AMPM:
1477 format = "%r";
1478 return_format = TRUE;
1479 break;
1480
1481 case PERL_ERA_D_FMT:
1482 format = "%Ex";
1483 return_format = TRUE;
1484 break;
1485
1486 case PERL_ERA_T_FMT:
1487 format = "%EX";
1488 return_format = TRUE;
1489 break;
1490
1491 case PERL_ERA_D_T_FMT:
1492 format = "%Ec";
1493 return_format = TRUE;
1494 break;
1495
1496 case PERL_ALT_DIGITS:
1497 tm.tm_wday = 0;
1498 format = "%Ow"; /* Find the alternate digit for 0 */
1499 break;
1500 }
1501
1502 /* We can't use my_strftime() because it doesn't look at tm_wday */
1503 while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
1504 format, &tm))
1505 {
1506 /* A zero return means one of:
1507 * a) there wasn't enough space in PL_langinfo_buf
1508 * b) the format, like a plain %p, returns empty
1509 * c) it was an illegal format, though some implementations of
1510 * strftime will just return the illegal format as a plain
1511 * character sequence.
1512 *
1513 * To quickly test for case 'b)', try again but precede the
1514 * format with a plain character. If that result is still
1515 * empty, the problem is either 'a)' or 'c)' */
1516
1517 Size_t format_size = strlen(format) + 1;
1518 Size_t mod_size = format_size + 1;
1519 char * mod_format;
1520 char * temp_result;
1521
1522 Newx(mod_format, mod_size, char);
1523 Newx(temp_result, PL_langinfo_bufsize, char);
1524 *mod_format = '\a';
1525 my_strlcpy(mod_format + 1, format, mod_size);
1526 len = strftime(temp_result,
1527 PL_langinfo_bufsize,
1528 mod_format, &tm);
1529 Safefree(mod_format);
1530 Safefree(temp_result);
1531
1532 /* If 'len' is non-zero, it means that we had a case like %p
1533 * which means the current locale doesn't use a.m. or p.m., and
1534 * that is valid */
1535 if (len == 0) {
1536
1537 /* Here, still didn't work. If we get well beyond a
1538 * reasonable size, bail out to prevent an infinite loop. */
1539
1540 if (PL_langinfo_bufsize > 100 * format_size) {
1541 *PL_langinfo_buf = '\0';
1542 }
1543 else { /* Double the buffer size to retry; Add 1 in case
1544 original was 0, so we aren't stuck at 0. */
1545 PL_langinfo_bufsize *= 2;
1546 PL_langinfo_bufsize++;
1547 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
1548 continue;
1549 }
1550 }
1551
1552 break;
1553 }
1554
1555 /* Here, we got a result.
1556 *
1557 * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
1558 * alternate format for wday 0. If the value is the same as the
1559 * normal 0, there isn't an alternate, so clear the buffer. */
1560 if ( item == PERL_ALT_DIGITS
1561 && strEQ(PL_langinfo_buf, "0"))
1562 {
1563 *PL_langinfo_buf = '\0';
1564 }
1565
1566 /* ALT_DIGITS is problematic. Experiments on it showed that
1567 * strftime() did not always work properly when going from alt-9 to
1568 * alt-10. Only a few locales have this item defined, and in all
1569 * of them on Linux that khw was able to find, nl_langinfo() merely
1570 * returned the alt-0 character, possibly doubled. Most Unicode
1571 * digits are in blocks of 10 consecutive code points, so that is
1572 * sufficient information for those scripts, as we can infer alt-1,
1573 * alt-2, .... But for a Japanese locale, a CJK ideographic 0 is
1574 * returned, and the CJK digits are not in code point order, so you
1575 * can't really infer anything. The localedef for this locale did
1576 * specify the succeeding digits, so that strftime() works properly
1577 * on them, without needing to infer anything. But the
1578 * nl_langinfo() return did not give sufficient information for the
1579 * caller to understand what's going on. So until there is
1580 * evidence that it should work differently, this returns the alt-0
1581 * string for ALT_DIGITS.
1582 *
1583 * wday was chosen because its range is all a single digit. Things
1584 * like tm_sec have two digits as the minimum: '00' */
1585
1586 LOCALE_UNLOCK;
1587
1588 /* If to return the format, not the value, overwrite the buffer
1589 * with it. But some strftime()s will keep the original format if
1590 * illegal, so change those to "" */
1591 if (return_format) {
1592 if (strEQ(PL_langinfo_buf, format)) {
1593 *PL_langinfo_buf = '\0';
1594 }
1595 else {
1596 save_to_buffer(format, &PL_langinfo_buf,
1597 &PL_langinfo_bufsize, 0);
1598 }
1599 }
1600
1601 break;
1602
1603# endif
1604
1605 }
1606
1607 return PL_langinfo_buf;
1608
1609#endif
1610
a4f00dcc 1611}
b385bb4d 1612
98994639
HS
1613/*
1614 * Initialize locale awareness.
1615 */
1616int
1617Perl_init_i18nl10n(pTHX_ int printwarn)
1618{
0e92a118
KW
1619 /* printwarn is
1620 *
1621 * 0 if not to output warning when setup locale is bad
1622 * 1 if to output warning based on value of PERL_BADLANG
1623 * >1 if to output regardless of PERL_BADLANG
1624 *
1625 * returns
98994639 1626 * 1 = set ok or not applicable,
0e92a118
KW
1627 * 0 = fallback to a locale of lower priority
1628 * -1 = fallback to all locales failed, not even to the C locale
6b058d42
KW
1629 *
1630 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
1631 * set, debugging information is output.
1632 *
1633 * This looks more complicated than it is, mainly due to the #ifdefs.
1634 *
1635 * We try to set LC_ALL to the value determined by the environment. If
1636 * there is no LC_ALL on this platform, we try the individual categories we
1637 * know about. If this works, we are done.
1638 *
1639 * But if it doesn't work, we have to do something else. We search the
1640 * environment variables ourselves instead of relying on the system to do
1641 * it. We look at, in order, LC_ALL, LANG, a system default locale (if we
1642 * think there is one), and the ultimate fallback "C". This is all done in
1643 * the same loop as above to avoid duplicating code, but it makes things
1644 * more complex. After the original failure, we add the fallback
1645 * possibilities to the list of locales to try, and iterate the loop
1646 * through them all until one succeeds.
1647 *
1648 * On Ultrix, the locale MUST come from the environment, so there is
1649 * preliminary code to set it. I (khw) am not sure that it is necessary,
1650 * and that this couldn't be folded into the loop, but barring any real
1651 * platforms to test on, it's staying as-is
1652 *
1653 * A slight complication is that in embedded Perls, the locale may already
1654 * be set-up, and we don't want to get it from the normal environment
1655 * variables. This is handled by having a special environment variable
1656 * indicate we're in this situation. We simply set setlocale's 2nd
1657 * parameter to be a NULL instead of "". That indicates to setlocale that
1658 * it is not to change anything, but to return the current value,
1659 * effectively initializing perl's db to what the locale already is.
1660 *
1661 * We play the same trick with NULL if a LC_ALL succeeds. We call
1662 * setlocale() on the individual categores with NULL to get their existing
1663 * values for our db, instead of trying to change them.
1664 * */
98994639 1665
0e92a118
KW
1666 int ok = 1;
1667
98994639 1668#if defined(USE_LOCALE)
98994639
HS
1669#ifdef USE_LOCALE_CTYPE
1670 char *curctype = NULL;
1671#endif /* USE_LOCALE_CTYPE */
1672#ifdef USE_LOCALE_COLLATE
1673 char *curcoll = NULL;
1674#endif /* USE_LOCALE_COLLATE */
1675#ifdef USE_LOCALE_NUMERIC
1676 char *curnum = NULL;
1677#endif /* USE_LOCALE_NUMERIC */
1678#ifdef __GLIBC__
175c4cf9 1679 const char * const language = savepv(PerlEnv_getenv("LANGUAGE"));
98994639 1680#endif
65ebb059 1681
ccd65d51
KW
1682 /* NULL uses the existing already set up locale */
1683 const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
1684 ? NULL
1685 : "";
c3fcd832
KW
1686 const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */
1687 unsigned int trial_locales_count;
175c4cf9
KW
1688 const char * const lc_all = savepv(PerlEnv_getenv("LC_ALL"));
1689 const char * const lang = savepv(PerlEnv_getenv("LANG"));
98994639 1690 bool setlocale_failure = FALSE;
65ebb059 1691 unsigned int i;
175c4cf9
KW
1692
1693 /* A later getenv() could zap this, so only use here */
1694 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
1695
1696 const bool locwarn = (printwarn > 1
1697 || (printwarn
1698 && (! bad_lang_use_once
22ff3130
HS
1699 || (
1700 /* disallow with "" or "0" */
1701 *bad_lang_use_once
1702 && strNE("0", bad_lang_use_once)))));
0e92a118 1703 bool done = FALSE;
5d1187d1
KW
1704 char * sl_result; /* return from setlocale() */
1705 char * locale_param;
6bce99ee
JH
1706#ifdef WIN32
1707 /* In some systems you can find out the system default locale
1708 * and use that as the fallback locale. */
1709# define SYSTEM_DEFAULT_LOCALE
1710#endif
1711#ifdef SYSTEM_DEFAULT_LOCALE
65ebb059 1712 const char *system_default_locale = NULL;
6bce99ee 1713#endif
98994639 1714
2fcc0ca9 1715#ifdef DEBUGGING
8298454c 1716 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
2fcc0ca9
KW
1717# define DEBUG_LOCALE_INIT(category, locale, result) \
1718 STMT_START { \
1719 if (debug_initialization) { \
1720 PerlIO_printf(Perl_debug_log, \
1721 "%s:%d: %s\n", \
1722 __FILE__, __LINE__, \
a4f00dcc 1723 setlocale_debug_string(category, \
2fcc0ca9
KW
1724 locale, \
1725 result)); \
1726 } \
1727 } STMT_END
1728#else
1729# define DEBUG_LOCALE_INIT(a,b,c)
1730#endif
1731
0e92a118
KW
1732#ifndef LOCALE_ENVIRON_REQUIRED
1733 PERL_UNUSED_VAR(done);
5d1187d1 1734 PERL_UNUSED_VAR(locale_param);
0e92a118 1735#else
98994639
HS
1736
1737 /*
1738 * Ultrix setlocale(..., "") fails if there are no environment
1739 * variables from which to get a locale name.
1740 */
1741
b3e384bf 1742# ifdef LC_ALL
98994639 1743 if (lang) {
5d1187d1
KW
1744 sl_result = my_setlocale(LC_ALL, setlocale_init);
1745 DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result);
1746 if (sl_result)
98994639
HS
1747 done = TRUE;
1748 else
1749 setlocale_failure = TRUE;
1750 }
5d1187d1 1751 if (! setlocale_failure) {
b3e384bf 1752# ifdef USE_LOCALE_CTYPE
5d1187d1
KW
1753 locale_param = (! done && (lang || PerlEnv_getenv("LC_CTYPE")))
1754 ? setlocale_init
1755 : NULL;
1756 curctype = my_setlocale(LC_CTYPE, locale_param);
1757 DEBUG_LOCALE_INIT(LC_CTYPE, locale_param, sl_result);
1758 if (! curctype)
98994639
HS
1759 setlocale_failure = TRUE;
1760 else
1761 curctype = savepv(curctype);
b3e384bf
KW
1762# endif /* USE_LOCALE_CTYPE */
1763# ifdef USE_LOCALE_COLLATE
5d1187d1
KW
1764 locale_param = (! done && (lang || PerlEnv_getenv("LC_COLLATE")))
1765 ? setlocale_init
1766 : NULL;
1767 curcoll = my_setlocale(LC_COLLATE, locale_param);
1768 DEBUG_LOCALE_INIT(LC_COLLATE, locale_param, sl_result);
1769 if (! curcoll)
98994639
HS
1770 setlocale_failure = TRUE;
1771 else
1772 curcoll = savepv(curcoll);
b3e384bf
KW
1773# endif /* USE_LOCALE_COLLATE */
1774# ifdef USE_LOCALE_NUMERIC
5d1187d1
KW
1775 locale_param = (! done && (lang || PerlEnv_getenv("LC_NUMERIC")))
1776 ? setlocale_init
1777 : NULL;
1778 curnum = my_setlocale(LC_NUMERIC, locale_param);
1779 DEBUG_LOCALE_INIT(LC_NUMERIC, locale_param, sl_result);
1780 if (! curnum)
98994639
HS
1781 setlocale_failure = TRUE;
1782 else
1783 curnum = savepv(curnum);
b3e384bf 1784# endif /* USE_LOCALE_NUMERIC */
a782673d 1785# ifdef USE_LOCALE_MESSAGES
5d1187d1
KW
1786 locale_param = (! done && (lang || PerlEnv_getenv("LC_MESSAGES")))
1787 ? setlocale_init
1788 : NULL;
1789 sl_result = my_setlocale(LC_MESSAGES, locale_param);
1790 DEBUG_LOCALE_INIT(LC_MESSAGES, locale_param, sl_result);
9f42613c 1791 if (! sl_result) {
a782673d
KW
1792 setlocale_failure = TRUE;
1793 }
1794# endif /* USE_LOCALE_MESSAGES */
c835d6be 1795# ifdef USE_LOCALE_MONETARY
5d1187d1
KW
1796 locale_param = (! done && (lang || PerlEnv_getenv("LC_MONETARY")))
1797 ? setlocale_init
1798 : NULL;
1799 sl_result = my_setlocale(LC_MONETARY, locale_param);
1800 DEBUG_LOCALE_INIT(LC_MONETARY, locale_param, sl_result);
1801 if (! sl_result) {
c835d6be
KW
1802 setlocale_failure = TRUE;
1803 }
1804# endif /* USE_LOCALE_MONETARY */
98994639
HS
1805 }
1806
b3e384bf 1807# endif /* LC_ALL */
98994639
HS
1808
1809#endif /* !LOCALE_ENVIRON_REQUIRED */
1810
65ebb059 1811 /* We try each locale in the list until we get one that works, or exhaust
20a240df
KW
1812 * the list. Normally the loop is executed just once. But if setting the
1813 * locale fails, inside the loop we add fallback trials to the array and so
1814 * will execute the loop multiple times */
c3fcd832
KW
1815 trial_locales[0] = setlocale_init;
1816 trial_locales_count = 1;
65ebb059
KW
1817 for (i= 0; i < trial_locales_count; i++) {
1818 const char * trial_locale = trial_locales[i];
1819
1820 if (i > 0) {
1821
1822 /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
1823 * when i==0, but I (khw) don't think that behavior makes much
1824 * sense */
1825 setlocale_failure = FALSE;
1826
6bce99ee
JH
1827#ifdef SYSTEM_DEFAULT_LOCALE
1828# ifdef WIN32
65ebb059
KW
1829 /* On Windows machines, an entry of "" after the 0th means to use
1830 * the system default locale, which we now proceed to get. */
1831 if (strEQ(trial_locale, "")) {
1832 unsigned int j;
1833
1834 /* Note that this may change the locale, but we are going to do
1835 * that anyway just below */
1836 system_default_locale = setlocale(LC_ALL, "");
5d1187d1 1837 DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
65ebb059
KW
1838
1839 /* Skip if invalid or it's already on the list of locales to
1840 * try */
1841 if (! system_default_locale) {
1842 goto next_iteration;
1843 }
1844 for (j = 0; j < trial_locales_count; j++) {
1845 if (strEQ(system_default_locale, trial_locales[j])) {
1846 goto next_iteration;
1847 }
1848 }
1849
1850 trial_locale = system_default_locale;
1851 }
6bce99ee
JH
1852# endif /* WIN32 */
1853#endif /* SYSTEM_DEFAULT_LOCALE */
65ebb059
KW
1854 }
1855
98994639 1856#ifdef LC_ALL
5d1187d1
KW
1857 sl_result = my_setlocale(LC_ALL, trial_locale);
1858 DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result);
1859 if (! sl_result) {
49c85077 1860 setlocale_failure = TRUE;
7cd8b568
KW
1861 }
1862 else {
1863 /* Since LC_ALL succeeded, it should have changed all the other
1864 * categories it can to its value; so we massage things so that the
1865 * setlocales below just return their category's current values.
1866 * This adequately handles the case in NetBSD where LC_COLLATE may
1867 * not be defined for a locale, and setting it individually will
1868 * fail, whereas setting LC_ALL suceeds, leaving LC_COLLATE set to
1869 * the POSIX locale. */
1870 trial_locale = NULL;
1871 }
98994639
HS
1872#endif /* LC_ALL */
1873
49c85077 1874 if (!setlocale_failure) {
98994639 1875#ifdef USE_LOCALE_CTYPE
49c85077 1876 Safefree(curctype);
5d1187d1
KW
1877 curctype = my_setlocale(LC_CTYPE, trial_locale);
1878 DEBUG_LOCALE_INIT(LC_CTYPE, trial_locale, curctype);
1879 if (! curctype)
49c85077
KW
1880 setlocale_failure = TRUE;
1881 else
1882 curctype = savepv(curctype);
98994639
HS
1883#endif /* USE_LOCALE_CTYPE */
1884#ifdef USE_LOCALE_COLLATE
49c85077 1885 Safefree(curcoll);
5d1187d1
KW
1886 curcoll = my_setlocale(LC_COLLATE, trial_locale);
1887 DEBUG_LOCALE_INIT(LC_COLLATE, trial_locale, curcoll);
1888 if (! curcoll)
49c85077
KW
1889 setlocale_failure = TRUE;
1890 else
1891 curcoll = savepv(curcoll);
98994639
HS
1892#endif /* USE_LOCALE_COLLATE */
1893#ifdef USE_LOCALE_NUMERIC
49c85077 1894 Safefree(curnum);
5d1187d1
KW
1895 curnum = my_setlocale(LC_NUMERIC, trial_locale);
1896 DEBUG_LOCALE_INIT(LC_NUMERIC, trial_locale, curnum);
1897 if (! curnum)
49c85077
KW
1898 setlocale_failure = TRUE;
1899 else
1900 curnum = savepv(curnum);
98994639 1901#endif /* USE_LOCALE_NUMERIC */
a782673d 1902#ifdef USE_LOCALE_MESSAGES
5d1187d1
KW
1903 sl_result = my_setlocale(LC_MESSAGES, trial_locale);
1904 DEBUG_LOCALE_INIT(LC_MESSAGES, trial_locale, sl_result);
1905 if (! (sl_result))
a782673d
KW
1906 setlocale_failure = TRUE;
1907#endif /* USE_LOCALE_MESSAGES */
c835d6be 1908#ifdef USE_LOCALE_MONETARY
5d1187d1
KW
1909 sl_result = my_setlocale(LC_MONETARY, trial_locale);
1910 DEBUG_LOCALE_INIT(LC_MONETARY, trial_locale, sl_result);
1911 if (! (sl_result))
c835d6be
KW
1912 setlocale_failure = TRUE;
1913#endif /* USE_LOCALE_MONETARY */
1914
49c85077
KW
1915 if (! setlocale_failure) { /* Success */
1916 break;
1917 }
65ebb059 1918 }
98994639 1919
49c85077
KW
1920 /* Here, something failed; will need to try a fallback. */
1921 ok = 0;
65ebb059 1922
49c85077
KW
1923 if (i == 0) {
1924 unsigned int j;
98994639 1925
65ebb059 1926 if (locwarn) { /* Output failure info only on the first one */
98994639
HS
1927#ifdef LC_ALL
1928
49c85077
KW
1929 PerlIO_printf(Perl_error_log,
1930 "perl: warning: Setting locale failed.\n");
98994639
HS
1931
1932#else /* !LC_ALL */
1933
49c85077
KW
1934 PerlIO_printf(Perl_error_log,
1935 "perl: warning: Setting locale failed for the categories:\n\t");
20a240df 1936# ifdef USE_LOCALE_CTYPE
49c85077
KW
1937 if (! curctype)
1938 PerlIO_printf(Perl_error_log, "LC_CTYPE ");
20a240df
KW
1939# endif /* USE_LOCALE_CTYPE */
1940# ifdef USE_LOCALE_COLLATE
49c85077
KW
1941 if (! curcoll)
1942 PerlIO_printf(Perl_error_log, "LC_COLLATE ");
20a240df
KW
1943# endif /* USE_LOCALE_COLLATE */
1944# ifdef USE_LOCALE_NUMERIC
49c85077
KW
1945 if (! curnum)
1946 PerlIO_printf(Perl_error_log, "LC_NUMERIC ");
20a240df 1947# endif /* USE_LOCALE_NUMERIC */
a782673d 1948 PerlIO_printf(Perl_error_log, "and possibly others\n");
98994639
HS
1949
1950#endif /* LC_ALL */
1951
49c85077
KW
1952 PerlIO_printf(Perl_error_log,
1953 "perl: warning: Please check that your locale settings:\n");
98994639
HS
1954
1955#ifdef __GLIBC__
49c85077
KW
1956 PerlIO_printf(Perl_error_log,
1957 "\tLANGUAGE = %c%s%c,\n",
1958 language ? '"' : '(',
1959 language ? language : "unset",
1960 language ? '"' : ')');
98994639
HS
1961#endif
1962
49c85077
KW
1963 PerlIO_printf(Perl_error_log,
1964 "\tLC_ALL = %c%s%c,\n",
1965 lc_all ? '"' : '(',
1966 lc_all ? lc_all : "unset",
1967 lc_all ? '"' : ')');
98994639
HS
1968
1969#if defined(USE_ENVIRON_ARRAY)
49c85077 1970 {
cd999af9 1971 char **e;
d5e32b93
KW
1972
1973 /* Look through the environment for any variables of the
1974 * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
1975 * already handled above. These are assumed to be locale
1976 * settings. Output them and their values. */
cd999af9 1977 for (e = environ; *e; e++) {
d5e32b93
KW
1978 const STRLEN prefix_len = sizeof("LC_") - 1;
1979 STRLEN uppers_len;
1980
cd999af9 1981 if ( strBEGINs(*e, "LC_")
c8b388b0 1982 && ! strBEGINs(*e, "LC_ALL=")
d5e32b93
KW
1983 && (uppers_len = strspn(*e + prefix_len,
1984 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
1985 && ((*e)[prefix_len + uppers_len] == '='))
cd999af9
KW
1986 {
1987 PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
d5e32b93
KW
1988 (int) (prefix_len + uppers_len), *e,
1989 *e + prefix_len + uppers_len + 1);
cd999af9
KW
1990 }
1991 }
49c85077 1992 }
98994639 1993#else
49c85077
KW
1994 PerlIO_printf(Perl_error_log,
1995 "\t(possibly more locale environment variables)\n");
98994639
HS
1996#endif
1997
49c85077
KW
1998 PerlIO_printf(Perl_error_log,
1999 "\tLANG = %c%s%c\n",
2000 lang ? '"' : '(',
2001 lang ? lang : "unset",
2002 lang ? '"' : ')');
98994639 2003
49c85077
KW
2004 PerlIO_printf(Perl_error_log,
2005 " are supported and installed on your system.\n");
2006 }
98994639 2007
65ebb059 2008 /* Calculate what fallback locales to try. We have avoided this
f6bab5f6 2009 * until we have to, because failure is quite unlikely. This will
65ebb059
KW
2010 * usually change the upper bound of the loop we are in.
2011 *
2012 * Since the system's default way of setting the locale has not
2013 * found one that works, We use Perl's defined ordering: LC_ALL,
2014 * LANG, and the C locale. We don't try the same locale twice, so
2015 * don't add to the list if already there. (On POSIX systems, the
2016 * LC_ALL element will likely be a repeat of the 0th element "",
6b058d42
KW
2017 * but there's no harm done by doing it explicitly.
2018 *
2019 * Note that this tries the LC_ALL environment variable even on
2020 * systems which have no LC_ALL locale setting. This may or may
2021 * not have been originally intentional, but there's no real need
2022 * to change the behavior. */
65ebb059
KW
2023 if (lc_all) {
2024 for (j = 0; j < trial_locales_count; j++) {
2025 if (strEQ(lc_all, trial_locales[j])) {
2026 goto done_lc_all;
2027 }
2028 }
2029 trial_locales[trial_locales_count++] = lc_all;
2030 }
2031 done_lc_all:
98994639 2032
65ebb059
KW
2033 if (lang) {
2034 for (j = 0; j < trial_locales_count; j++) {
2035 if (strEQ(lang, trial_locales[j])) {
2036 goto done_lang;
2037 }
2038 }
2039 trial_locales[trial_locales_count++] = lang;
2040 }
2041 done_lang:
2042
2043#if defined(WIN32) && defined(LC_ALL)
2044 /* For Windows, we also try the system default locale before "C".
2045 * (If there exists a Windows without LC_ALL we skip this because
2046 * it gets too complicated. For those, the "C" is the next
2047 * fallback possibility). The "" is the same as the 0th element of
2048 * the array, but the code at the loop above knows to treat it
2049 * differently when not the 0th */
2050 trial_locales[trial_locales_count++] = "";
2051#endif
2052
2053 for (j = 0; j < trial_locales_count; j++) {
2054 if (strEQ("C", trial_locales[j])) {
2055 goto done_C;
2056 }
2057 }
2058 trial_locales[trial_locales_count++] = "C";
98994639 2059
65ebb059
KW
2060 done_C: ;
2061 } /* end of first time through the loop */
98994639 2062
65ebb059
KW
2063#ifdef WIN32
2064 next_iteration: ;
2065#endif
2066
2067 } /* end of looping through the trial locales */
2068
2069 if (ok < 1) { /* If we tried to fallback */
2070 const char* msg;
2071 if (! setlocale_failure) { /* fallback succeeded */
2072 msg = "Falling back to";
2073 }
2074 else { /* fallback failed */
98994639 2075
65ebb059
KW
2076 /* We dropped off the end of the loop, so have to decrement i to
2077 * get back to the value the last time through */
2078 i--;
98994639 2079
65ebb059
KW
2080 ok = -1;
2081 msg = "Failed to fall back to";
2082
2083 /* To continue, we should use whatever values we've got */
98994639 2084#ifdef USE_LOCALE_CTYPE
49c85077
KW
2085 Safefree(curctype);
2086 curctype = savepv(setlocale(LC_CTYPE, NULL));
5d1187d1 2087 DEBUG_LOCALE_INIT(LC_CTYPE, NULL, curctype);
98994639
HS
2088#endif /* USE_LOCALE_CTYPE */
2089#ifdef USE_LOCALE_COLLATE
49c85077
KW
2090 Safefree(curcoll);
2091 curcoll = savepv(setlocale(LC_COLLATE, NULL));
5d1187d1 2092 DEBUG_LOCALE_INIT(LC_COLLATE, NULL, curcoll);
98994639
HS
2093#endif /* USE_LOCALE_COLLATE */
2094#ifdef USE_LOCALE_NUMERIC
49c85077
KW
2095 Safefree(curnum);
2096 curnum = savepv(setlocale(LC_NUMERIC, NULL));
5d1187d1 2097 DEBUG_LOCALE_INIT(LC_NUMERIC, NULL, curnum);
98994639 2098#endif /* USE_LOCALE_NUMERIC */
65ebb059
KW
2099 }
2100
2101 if (locwarn) {
2102 const char * description;
2103 const char * name = "";
2104 if (strEQ(trial_locales[i], "C")) {
2105 description = "the standard locale";
2106 name = "C";
2107 }
6bce99ee 2108#ifdef SYSTEM_DEFAULT_LOCALE
65ebb059
KW
2109 else if (strEQ(trial_locales[i], "")) {
2110 description = "the system default locale";
2111 if (system_default_locale) {
2112 name = system_default_locale;
2113 }
2114 }
6bce99ee 2115#endif /* SYSTEM_DEFAULT_LOCALE */
65ebb059
KW
2116 else {
2117 description = "a fallback locale";
2118 name = trial_locales[i];
2119 }
2120 if (name && strNE(name, "")) {
2121 PerlIO_printf(Perl_error_log,
2122 "perl: warning: %s %s (\"%s\").\n", msg, description, name);
2123 }
2124 else {
2125 PerlIO_printf(Perl_error_log,
2126 "perl: warning: %s %s.\n", msg, description);
2127 }
2128 }
2129 } /* End of tried to fallback */
98994639
HS
2130
2131#ifdef USE_LOCALE_CTYPE
2132 new_ctype(curctype);
2133#endif /* USE_LOCALE_CTYPE */
2134
2135#ifdef USE_LOCALE_COLLATE
2136 new_collate(curcoll);
2137#endif /* USE_LOCALE_COLLATE */
2138
2139#ifdef USE_LOCALE_NUMERIC
2140 new_numeric(curnum);
2141#endif /* USE_LOCALE_NUMERIC */
b310b053 2142
8ef6e574 2143#if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
49c85077
KW
2144 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
2145 * locale is UTF-8. If PL_utf8locale and PL_unicode (set by -C or by
2146 * $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the
2147 * PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open
2148 * discipline. */
c1284011 2149 PL_utf8locale = _is_cur_LC_category_utf8(LC_CTYPE);
49c85077 2150
a05d7ebb 2151 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
fde18df1
JH
2152 This is an alternative to using the -C command line switch
2153 (the -C if present will override this). */
2154 {
dd374669 2155 const char *p = PerlEnv_getenv("PERL_UNICODE");
a05d7ebb 2156 PL_unicode = p ? parse_unicode_opts(&p) : 0;
5a22a2bb
NC
2157 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
2158 PL_utf8cache = -1;
b310b053 2159 }
ec71e770 2160#endif
b310b053 2161
98994639 2162#ifdef USE_LOCALE_CTYPE
43c5f42d 2163 Safefree(curctype);
98994639
HS
2164#endif /* USE_LOCALE_CTYPE */
2165#ifdef USE_LOCALE_COLLATE
43c5f42d 2166 Safefree(curcoll);
98994639
HS
2167#endif /* USE_LOCALE_COLLATE */
2168#ifdef USE_LOCALE_NUMERIC
43c5f42d 2169 Safefree(curnum);
98994639 2170#endif /* USE_LOCALE_NUMERIC */
8ef6e574 2171
175c4cf9
KW
2172#ifdef __GLIBC__
2173 Safefree(language);
2174#endif
2175
2176 Safefree(lc_all);
2177 Safefree(lang);
2178
e3305790
KW
2179#else /* !USE_LOCALE */
2180 PERL_UNUSED_ARG(printwarn);
2181#endif /* USE_LOCALE */
2182
2fcc0ca9
KW
2183#ifdef DEBUGGING
2184 /* So won't continue to output stuff */
27cdc72e 2185 DEBUG_INITIALIZATION_set(FALSE);
2fcc0ca9
KW
2186#endif
2187
98994639
HS
2188 return ok;
2189}
2190
98994639
HS
2191#ifdef USE_LOCALE_COLLATE
2192
a4a439fb 2193char *
a4a439fb
KW
2194Perl__mem_collxfrm(pTHX_ const char *input_string,
2195 STRLEN len, /* Length of 'input_string' */
2196 STRLEN *xlen, /* Set to length of returned string
2197 (not including the collation index
2198 prefix) */
2199 bool utf8 /* Is the input in UTF-8? */
6696cfa7 2200 )
98994639 2201{
a4a439fb
KW
2202
2203 /* _mem_collxfrm() is a bit like strxfrm() but with two important
2204 * differences. First, it handles embedded NULs. Second, it allocates a bit
2205 * more memory than needed for the transformed data itself. The real
55e5378d 2206 * transformed data begins at offset COLLXFRM_HDR_LEN. *xlen is set to
a4a439fb
KW
2207 * the length of that, and doesn't include the collation index size.
2208 * Please see sv_collxfrm() to see how this is used. */
2209
55e5378d
KW
2210#define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
2211
6696cfa7
KW
2212 char * s = (char *) input_string;
2213 STRLEN s_strlen = strlen(input_string);
79f120c8 2214 char *xbuf = NULL;
55e5378d 2215 STRLEN xAlloc; /* xalloc is a reserved word in VC */
17f41037 2216 STRLEN length_in_chars;
c664130f 2217 bool first_time = TRUE; /* Cleared after first loop iteration */
98994639 2218
a4a439fb
KW
2219 PERL_ARGS_ASSERT__MEM_COLLXFRM;
2220
2221 /* Must be NUL-terminated */
2222 assert(*(input_string + len) == '\0');
7918f24d 2223
79f120c8
KW
2224 /* If this locale has defective collation, skip */
2225 if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
c7202dee
KW
2226 DEBUG_L(PerlIO_printf(Perl_debug_log,
2227 "_mem_collxfrm: locale's collation is defective\n"));
79f120c8
KW
2228 goto bad;
2229 }
2230
6696cfa7
KW
2231 /* Replace any embedded NULs with the control that sorts before any others.
2232 * This will give as good as possible results on strings that don't
2233 * otherwise contain that character, but otherwise there may be
2234 * less-than-perfect results with that character and NUL. This is
fdc080f3 2235 * unavoidable unless we replace strxfrm with our own implementation. */
fd43f63c
KW
2236 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
2237 NUL */
6696cfa7
KW
2238 char * e = s + len;
2239 char * sans_nuls;
fdc080f3 2240 STRLEN sans_nuls_len;
94762aa0 2241 int try_non_controls;
afc4976f
KW
2242 char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
2243 making sure 2nd byte is NUL.
2244 */
2245 STRLEN this_replacement_len;
2246
1e4c9676
KW
2247 /* If we don't know what non-NUL control character sorts lowest for
2248 * this locale, find it */
f28f4d2a 2249 if (PL_strxfrm_NUL_replacement == '\0') {
6696cfa7 2250 int j;
afc4976f 2251 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
6696cfa7
KW
2252 includes the collation index
2253 prefixed. */
2254
91c0e2e0 2255 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
94762aa0
KW
2256
2257 /* Unlikely, but it may be that no control will work to replace
1e4c9676
KW
2258 * NUL, in which case we instead look for any character. Controls
2259 * are preferred because collation order is, in general, context
2260 * sensitive, with adjoining characters affecting the order, and
2261 * controls are less likely to have such interactions, allowing the
2262 * NUL-replacement to stand on its own. (Another way to look at it
2263 * is to imagine what would happen if the NUL were replaced by a
2264 * combining character; it wouldn't work out all that well.) */
94762aa0
KW
2265 for (try_non_controls = 0;
2266 try_non_controls < 2;
2267 try_non_controls++)
2268 {
d4ff9586
KW
2269 /* Look through all legal code points (NUL isn't) */
2270 for (j = 1; j < 256; j++) {
2271 char * x; /* j's xfrm plus collation index */
2272 STRLEN x_len; /* length of 'x' */
2273 STRLEN trial_len = 1;
736a4fed 2274 char cur_source[] = { '\0', '\0' };
d4ff9586 2275
736a4fed
KW
2276 /* Skip non-controls the first time through the loop. The
2277 * controls in a UTF-8 locale are the L1 ones */
afc4976f
KW
2278 if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
2279 ? ! isCNTRL_L1(j)
2280 : ! isCNTRL_LC(j))
2281 {
d4ff9586 2282 continue;
6696cfa7 2283 }
6696cfa7 2284
736a4fed
KW
2285 /* Create a 1-char string of the current code point */
2286 cur_source[0] = (char) j;
2287
d4ff9586
KW
2288 /* Then transform it */
2289 x = _mem_collxfrm(cur_source, trial_len, &x_len,
afc4976f 2290 0 /* The string is not in UTF-8 */);
6696cfa7 2291
1e4c9676 2292 /* Ignore any character that didn't successfully transform.
d4ff9586
KW
2293 * */
2294 if (! x) {
2295 continue;
2296 }
6696cfa7 2297
d4ff9586
KW
2298 /* If this character's transformation is lower than
2299 * the current lowest, this one becomes the lowest */
2300 if ( cur_min_x == NULL
2301 || strLT(x + COLLXFRM_HDR_LEN,
2302 cur_min_x + COLLXFRM_HDR_LEN))
2303 {
f28f4d2a 2304 PL_strxfrm_NUL_replacement = j;
d4ff9586 2305 cur_min_x = x;
d4ff9586
KW
2306 }
2307 else {
2308 Safefree(x);
2309 }
1e4c9676 2310 } /* end of loop through all 255 characters */
6696cfa7 2311
1e4c9676 2312 /* Stop looking if found */
94762aa0
KW
2313 if (cur_min_x) {
2314 break;
2315 }
2316
2317 /* Unlikely, but possible, if there aren't any controls that
2318 * work in the locale, repeat the loop, looking for any
2319 * character that works */
2320 DEBUG_L(PerlIO_printf(Perl_debug_log,
2321 "_mem_collxfrm: No control worked. Trying non-controls\n"));
1e4c9676 2322 } /* End of loop to try first the controls, then any char */
6696cfa7 2323
94762aa0
KW
2324 if (! cur_min_x) {
2325 DEBUG_L(PerlIO_printf(Perl_debug_log,
2326 "_mem_collxfrm: Couldn't find any character to replace"
2327 " embedded NULs in locale %s with", PL_collation_name));
2328 goto bad;
58eebef2
KW
2329 }
2330
94762aa0
KW
2331 DEBUG_L(PerlIO_printf(Perl_debug_log,
2332 "_mem_collxfrm: Replacing embedded NULs in locale %s with "
f28f4d2a 2333 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
94762aa0 2334
6696cfa7 2335 Safefree(cur_min_x);
1e4c9676 2336 } /* End of determining the character that is to replace NULs */
afc4976f
KW
2337
2338 /* If the replacement is variant under UTF-8, it must match the
2339 * UTF8-ness as the original */
f28f4d2a
KW
2340 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
2341 this_replacement_char[0] =
2342 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
2343 this_replacement_char[1] =
2344 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
afc4976f
KW
2345 this_replacement_len = 2;
2346 }
2347 else {
f28f4d2a 2348 this_replacement_char[0] = PL_strxfrm_NUL_replacement;
afc4976f
KW
2349 /* this_replacement_char[1] = '\0' was done at initialization */
2350 this_replacement_len = 1;
6696cfa7
KW
2351 }
2352
2353 /* The worst case length for the replaced string would be if every
2354 * character in it is NUL. Multiply that by the length of each
2355 * replacement, and allow for a trailing NUL */
afc4976f 2356 sans_nuls_len = (len * this_replacement_len) + 1;
fdc080f3 2357 Newx(sans_nuls, sans_nuls_len, char);
6696cfa7
KW
2358 *sans_nuls = '\0';
2359
6696cfa7
KW
2360 /* Replace each NUL with the lowest collating control. Loop until have
2361 * exhausted all the NULs */
2362 while (s + s_strlen < e) {
6069d6c5 2363 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
2364
2365 /* Do the actual replacement */
6069d6c5 2366 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
6696cfa7
KW
2367
2368 /* Move past the input NUL */
2369 s += s_strlen + 1;
2370 s_strlen = strlen(s);
2371 }
2372
2373 /* And add anything that trails the final NUL */
6069d6c5 2374 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
2375
2376 /* Switch so below we transform this modified string */
2377 s = sans_nuls;
2378 len = strlen(s);
1e4c9676 2379 } /* End of replacing NULs */
6696cfa7 2380
a4a439fb
KW
2381 /* Make sure the UTF8ness of the string and locale match */
2382 if (utf8 != PL_in_utf8_COLLATE_locale) {
2383 const char * const t = s; /* Temporary so we can later find where the
2384 input was */
2385
2386 /* Here they don't match. Change the string's to be what the locale is
2387 * expecting */
2388
2389 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
2390 s = (char *) bytes_to_utf8((const U8 *) s, &len);
2391 utf8 = TRUE;
2392 }
2393 else { /* locale is not UTF-8; but input is; downgrade the input */
2394
2395 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
2396
2397 /* If the downgrade was successful we are done, but if the input
2398 * contains things that require UTF-8 to represent, have to do
2399 * damage control ... */
2400 if (UNLIKELY(utf8)) {
2401
2402 /* What we do is construct a non-UTF-8 string with
2403 * 1) the characters representable by a single byte converted
2404 * to be so (if necessary);
2405 * 2) and the rest converted to collate the same as the
2406 * highest collating representable character. That makes
2407 * them collate at the end. This is similar to how we
2408 * handle embedded NULs, but we use the highest collating
2409 * code point instead of the smallest. Like the NUL case,
2410 * this isn't perfect, but is the best we can reasonably
2411 * do. Every above-255 code point will sort the same as
2412 * the highest-sorting 0-255 code point. If that code
2413 * point can combine in a sequence with some other code
2414 * points for weight calculations, us changing something to
2415 * be it can adversely affect the results. But in most
2416 * cases, it should work reasonably. And note that this is
2417 * really an illegal situation: using code points above 255
2418 * on a locale where only 0-255 are valid. If two strings
2419 * sort entirely equal, then the sort order for the
2420 * above-255 code points will be in code point order. */
2421
2422 utf8 = FALSE;
2423
2424 /* If we haven't calculated the code point with the maximum
2425 * collating order for this locale, do so now */
2426 if (! PL_strxfrm_max_cp) {
2427 int j;
2428
2429 /* The current transformed string that collates the
2430 * highest (except it also includes the prefixed collation
2431 * index. */
2432 char * cur_max_x = NULL;
2433
2434 /* Look through all legal code points (NUL isn't) */
2435 for (j = 1; j < 256; j++) {
2436 char * x;
2437 STRLEN x_len;
736a4fed 2438 char cur_source[] = { '\0', '\0' };
a4a439fb 2439
736a4fed
KW
2440 /* Create a 1-char string of the current code point */
2441 cur_source[0] = (char) j;
a4a439fb
KW
2442
2443 /* Then transform it */
2444 x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
2445
2446 /* If something went wrong (which it shouldn't), just
2447 * ignore this code point */
94762aa0 2448 if (! x) {
a4a439fb
KW
2449 continue;
2450 }
2451
2452 /* If this character's transformation is higher than
2453 * the current highest, this one becomes the highest */
2454 if ( cur_max_x == NULL
55e5378d
KW
2455 || strGT(x + COLLXFRM_HDR_LEN,
2456 cur_max_x + COLLXFRM_HDR_LEN))
a4a439fb
KW
2457 {
2458 PL_strxfrm_max_cp = j;
2459 cur_max_x = x;
2460 }
2461 else {
2462 Safefree(x);
2463 }
2464 }
2465
94762aa0
KW
2466 if (! cur_max_x) {
2467 DEBUG_L(PerlIO_printf(Perl_debug_log,
2468 "_mem_collxfrm: Couldn't find any character to"
2469 " replace above-Latin1 chars in locale %s with",
2470 PL_collation_name));
2471 goto bad;
2472 }
2473
58eebef2
KW
2474 DEBUG_L(PerlIO_printf(Perl_debug_log,
2475 "_mem_collxfrm: highest 1-byte collating character"
2476 " in locale %s is 0x%02X\n",
2477 PL_collation_name,
2478 PL_strxfrm_max_cp));
58eebef2 2479
a4a439fb
KW
2480 Safefree(cur_max_x);
2481 }
2482
2483 /* Here we know which legal code point collates the highest.
2484 * We are ready to construct the non-UTF-8 string. The length
2485 * will be at least 1 byte smaller than the input string
2486 * (because we changed at least one 2-byte character into a
2487 * single byte), but that is eaten up by the trailing NUL */
2488 Newx(s, len, char);
2489
2490 {
2491 STRLEN i;
2492 STRLEN d= 0;
042d9e50 2493 char * e = (char *) t + len;
a4a439fb
KW
2494
2495 for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
2496 U8 cur_char = t[i];
2497 if (UTF8_IS_INVARIANT(cur_char)) {
2498 s[d++] = cur_char;
2499 }
042d9e50 2500 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
a4a439fb
KW
2501 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
2502 }
2503 else { /* Replace illegal cp with highest collating
2504 one */
2505 s[d++] = PL_strxfrm_max_cp;
2506 }
2507 }
2508 s[d++] = '\0';
2509 Renew(s, d, char); /* Free up unused space */
2510 }
2511 }
2512 }
2513
2514 /* Here, we have constructed a modified version of the input. It could
2515 * be that we already had a modified copy before we did this version.
2516 * If so, that copy is no longer needed */
2517 if (t != input_string) {
2518 Safefree(t);
2519 }
2520 }
2521
17f41037
KW
2522 length_in_chars = (utf8)
2523 ? utf8_length((U8 *) s, (U8 *) s + len)
2524 : len;
2525
59c018b9
KW
2526 /* The first element in the output is the collation id, used by
2527 * sv_collxfrm(); then comes the space for the transformed string. The
2528 * equation should give us a good estimate as to how much is needed */
55e5378d 2529 xAlloc = COLLXFRM_HDR_LEN
a4a439fb 2530 + PL_collxfrm_base
17f41037 2531 + (PL_collxfrm_mult * length_in_chars);
a02a5408 2532 Newx(xbuf, xAlloc, char);
c7202dee
KW
2533 if (UNLIKELY(! xbuf)) {
2534 DEBUG_L(PerlIO_printf(Perl_debug_log,
2535 "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
98994639 2536 goto bad;
c7202dee 2537 }
98994639 2538
d35fca5f 2539 /* Store the collation id */
98994639 2540 *(U32*)xbuf = PL_collation_ix;
d35fca5f
KW
2541
2542 /* Then the transformation of the input. We loop until successful, or we
2543 * give up */
4ebeff16 2544 for (;;) {
1adab0a7 2545
55e5378d 2546 *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
4ebeff16
KW
2547
2548 /* If the transformed string occupies less space than we told strxfrm()
2549 * was available, it means it successfully transformed the whole
2550 * string. */
55e5378d 2551 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
17f41037 2552
1adab0a7
KW
2553 /* Some systems include a trailing NUL in the returned length.
2554 * Ignore it, using a loop in case multiple trailing NULs are
2555 * returned. */
2556 while ( (*xlen) > 0
2557 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
2558 {
2559 (*xlen)--;
2560 }
2561
17f41037
KW
2562 /* If the first try didn't get it, it means our prediction was low.
2563 * Modify the coefficients so that we predict a larger value in any
2564 * future transformations */
2565 if (! first_time) {
2566 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */
2567 STRLEN computed_guess = PL_collxfrm_base
2568 + (PL_collxfrm_mult * length_in_chars);
e1c30f0c
KW
2569
2570 /* On zero-length input, just keep current slope instead of
2571 * dividing by 0 */
2572 const STRLEN new_m = (length_in_chars != 0)
2573 ? needed / length_in_chars
2574 : PL_collxfrm_mult;
17f41037
KW
2575
2576 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
2577 "%s: %d: initial size of %zu bytes for a length "
2578 "%zu string was insufficient, %zu needed\n",
17f41037 2579 __FILE__, __LINE__,
b07929e4 2580 computed_guess, length_in_chars, needed));
17f41037
KW
2581
2582 /* If slope increased, use it, but discard this result for
2583 * length 1 strings, as we can't be sure that it's a real slope
2584 * change */
2585 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
2586#ifdef DEBUGGING
2587 STRLEN old_m = PL_collxfrm_mult;
2588 STRLEN old_b = PL_collxfrm_base;
2589#endif
2590 PL_collxfrm_mult = new_m;
2591 PL_collxfrm_base = 1; /* +1 For trailing NUL */
2592 computed_guess = PL_collxfrm_base
2593 + (PL_collxfrm_mult * length_in_chars);
2594 if (computed_guess < needed) {
2595 PL_collxfrm_base += needed - computed_guess;
2596 }
2597
2598 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
2599 "%s: %d: slope is now %zu; was %zu, base "
2600 "is now %zu; was %zu\n",
17f41037 2601 __FILE__, __LINE__,
b07929e4
KW
2602 PL_collxfrm_mult, old_m,
2603 PL_collxfrm_base, old_b));
17f41037
KW
2604 }
2605 else { /* Slope didn't change, but 'b' did */
2606 const STRLEN new_b = needed
2607 - computed_guess
2608 + PL_collxfrm_base;
2609 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4 2610 "%s: %d: base is now %zu; was %zu\n",
17f41037 2611 __FILE__, __LINE__,
b07929e4 2612 new_b, PL_collxfrm_base));
17f41037
KW
2613 PL_collxfrm_base = new_b;
2614 }
2615 }
2616
4ebeff16
KW
2617 break;
2618 }
bb0f664e 2619
c7202dee
KW
2620 if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
2621 DEBUG_L(PerlIO_printf(Perl_debug_log,
2622 "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
2623 *xlen, PERL_INT_MAX));
4ebeff16 2624 goto bad;
c7202dee 2625 }
d35fca5f 2626
c664130f 2627 /* A well-behaved strxfrm() returns exactly how much space it needs
1adab0a7
KW
2628 * (usually not including the trailing NUL) when it fails due to not
2629 * enough space being provided. Assume that this is the case unless
2630 * it's been proven otherwise */
c664130f 2631 if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
55e5378d 2632 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
c664130f
KW
2633 }
2634 else { /* Here, either:
2635 * 1) The strxfrm() has previously shown bad behavior; or
2636 * 2) It isn't the first time through the loop, which means
2637 * that the strxfrm() is now showing bad behavior, because
2638 * we gave it what it said was needed in the previous
2639 * iteration, and it came back saying it needed still more.
2640 * (Many versions of cygwin fit this. When the buffer size
2641 * isn't sufficient, they return the input size instead of
2642 * how much is needed.)
d4ff9586
KW
2643 * Increase the buffer size by a fixed percentage and try again.
2644 * */
6ddd902c 2645 xAlloc += (xAlloc / 4) + 1;
c664130f 2646 PL_strxfrm_is_behaved = FALSE;
c664130f 2647
58eebef2
KW
2648#ifdef DEBUGGING
2649 if (DEBUG_Lv_TEST || debug_initialization) {
2650 PerlIO_printf(Perl_debug_log,
2651 "_mem_collxfrm required more space than previously calculated"
b07929e4 2652 " for locale %s, trying again with new guess=%d+%zu\n",
58eebef2 2653 PL_collation_name, (int) COLLXFRM_HDR_LEN,
b07929e4 2654 xAlloc - COLLXFRM_HDR_LEN);
58eebef2
KW
2655 }
2656#endif
2657 }
c664130f 2658
4ebeff16 2659 Renew(xbuf, xAlloc, char);
c7202dee
KW
2660 if (UNLIKELY(! xbuf)) {
2661 DEBUG_L(PerlIO_printf(Perl_debug_log,
2662 "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
4ebeff16 2663 goto bad;
c7202dee 2664 }
c664130f
KW
2665
2666 first_time = FALSE;
4ebeff16 2667 }
98994639 2668
6696cfa7 2669
58eebef2
KW
2670#ifdef DEBUGGING
2671 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee
KW
2672
2673 print_collxfrm_input_and_return(s, s + len, xlen, utf8);
2674 PerlIO_printf(Perl_debug_log, "Its xfrm is:");
7e2f38b2
KW
2675 PerlIO_printf(Perl_debug_log, "%s\n",
2676 _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
2677 *xlen, 1));
58eebef2
KW
2678 }
2679#endif
2680
3c5f993e 2681 /* Free up unneeded space; retain ehough for trailing NUL */
55e5378d 2682 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
98994639 2683
6696cfa7
KW
2684 if (s != input_string) {
2685 Safefree(s);
98994639
HS
2686 }
2687
98994639
HS
2688 return xbuf;
2689
2690 bad:
2691 Safefree(xbuf);
6696cfa7
KW
2692 if (s != input_string) {
2693 Safefree(s);
2694 }
98994639 2695 *xlen = 0;
58eebef2
KW
2696#ifdef DEBUGGING
2697 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee 2698 print_collxfrm_input_and_return(s, s + len, NULL, utf8);
58eebef2
KW
2699 }
2700#endif
98994639
HS
2701 return NULL;
2702}
2703
c7202dee
KW
2704#ifdef DEBUGGING
2705
4cbaac56 2706STATIC void
c7202dee
KW
2707S_print_collxfrm_input_and_return(pTHX_
2708 const char * const s,
2709 const char * const e,
2710 const STRLEN * const xlen,
2711 const bool is_utf8)
2712{
c7202dee
KW
2713
2714 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
2715
511e4ff7
DM
2716 PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
2717 (UV)PL_collation_ix);
c7202dee 2718 if (xlen) {
08b6dc1d 2719 PerlIO_printf(Perl_debug_log, "%zu", *xlen);
c7202dee
KW
2720 }
2721 else {
2722 PerlIO_printf(Perl_debug_log, "NULL");
2723 }
2724 PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
2725 PL_collation_name);
9c8a6dc2
KW
2726 print_bytes_for_locale(s, e, is_utf8);
2727
2728 PerlIO_printf(Perl_debug_log, "'\n");
2729}
2730
2731STATIC void
2732S_print_bytes_for_locale(pTHX_
2733 const char * const s,
2734 const char * const e,
2735 const bool is_utf8)
2736{
2737 const char * t = s;
2738 bool prev_was_printable = TRUE;
2739 bool first_time = TRUE;
2740
2741 PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
c7202dee
KW
2742
2743 while (t < e) {
2744 UV cp = (is_utf8)
2745 ? utf8_to_uvchr_buf((U8 *) t, e, NULL)
2746 : * (U8 *) t;
2747 if (isPRINT(cp)) {
2748 if (! prev_was_printable) {
2749 PerlIO_printf(Perl_debug_log, " ");
2750 }
2751 PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
2752 prev_was_printable = TRUE;
2753 }
2754 else {
2755 if (! first_time) {
2756 PerlIO_printf(Perl_debug_log, " ");
2757 }
147e3846 2758 PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
c7202dee
KW
2759 prev_was_printable = FALSE;
2760 }
2761 t += (is_utf8) ? UTF8SKIP(t) : 1;
2762 first_time = FALSE;
2763 }
c7202dee
KW
2764}
2765
2766#endif /* #ifdef DEBUGGING */
2767
98994639 2768#endif /* USE_LOCALE_COLLATE */
58eebef2 2769
8ef6e574
KW
2770#ifdef USE_LOCALE
2771
c1284011
KW
2772bool
2773Perl__is_cur_LC_category_utf8(pTHX_ int category)
7d74bb61
KW
2774{
2775 /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
2776 * otherwise. 'category' may not be LC_ALL. If the platform doesn't have
119ee68b 2777 * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
609548d2
KW
2778 * could give the wrong result. The result will very likely be correct for
2779 * languages that have commonly used non-ASCII characters, but for notably
2780 * English, it comes down to if the locale's name ends in something like
2781 * "UTF-8". It errs on the side of not being a UTF-8 locale. */
7d74bb61
KW
2782
2783 char *save_input_locale = NULL;
7d74bb61
KW
2784 STRLEN final_pos;
2785
8ef6e574 2786#ifdef LC_ALL
7d74bb61 2787 assert(category != LC_ALL);
8ef6e574 2788#endif
7d74bb61
KW
2789
2790 /* First dispose of the trivial cases */
b07fffd1 2791 save_input_locale = setlocale(category, NULL);
7d74bb61 2792 if (! save_input_locale) {
69014004
KW
2793 DEBUG_L(PerlIO_printf(Perl_debug_log,
2794 "Could not find current locale for category %d\n",
2795 category));
7d74bb61
KW
2796 return FALSE; /* XXX maybe should croak */
2797 }
b07fffd1 2798 save_input_locale = stdize_locale(savepv(save_input_locale));
a39edc4c 2799 if (isNAME_C_OR_POSIX(save_input_locale)) {
69014004
KW
2800 DEBUG_L(PerlIO_printf(Perl_debug_log,
2801 "Current locale for category %d is %s\n",
2802 category, save_input_locale));
b07fffd1 2803 Safefree(save_input_locale);
7d74bb61
KW
2804 return FALSE;
2805 }
2806
1d958db2
KW
2807#if defined(USE_LOCALE_CTYPE) \
2808 && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET)))
7d74bb61 2809
1d958db2 2810 { /* Next try nl_langinfo or MB_CUR_MAX if available */
7d74bb61
KW
2811
2812 char *save_ctype_locale = NULL;
119ee68b 2813 bool is_utf8;
7d74bb61 2814
119ee68b 2815 if (category != LC_CTYPE) { /* These work only on LC_CTYPE */
7d74bb61
KW
2816
2817 /* Get the current LC_CTYPE locale */
4f72bb37 2818 save_ctype_locale = setlocale(LC_CTYPE, NULL);
7d74bb61 2819 if (! save_ctype_locale) {
69014004
KW
2820 DEBUG_L(PerlIO_printf(Perl_debug_log,
2821 "Could not find current locale for LC_CTYPE\n"));
7d74bb61
KW
2822 goto cant_use_nllanginfo;
2823 }
4f72bb37 2824 save_ctype_locale = stdize_locale(savepv(save_ctype_locale));
7d74bb61
KW
2825
2826 /* If LC_CTYPE and the desired category use the same locale, this
2827 * means that finding the value for LC_CTYPE is the same as finding
2828 * the value for the desired category. Otherwise, switch LC_CTYPE
2829 * to the desired category's locale */
2830 if (strEQ(save_ctype_locale, save_input_locale)) {
2831 Safefree(save_ctype_locale);
2832 save_ctype_locale = NULL;
2833 }
2834 else if (! setlocale(LC_CTYPE, save_input_locale)) {
69014004
KW
2835 DEBUG_L(PerlIO_printf(Perl_debug_log,
2836 "Could not change LC_CTYPE locale to %s\n",
2837 save_input_locale));
7d74bb61
KW
2838 Safefree(save_ctype_locale);
2839 goto cant_use_nllanginfo;
2840 }
2841 }
2842
69014004
KW
2843 DEBUG_L(PerlIO_printf(Perl_debug_log, "Current LC_CTYPE locale=%s\n",
2844 save_input_locale));
2845
7d74bb61 2846 /* Here the current LC_CTYPE is set to the locale of the category whose
1d958db2
KW
2847 * information is desired. This means that nl_langinfo() and MB_CUR_MAX
2848 * should give the correct results */
119ee68b 2849
1d958db2
KW
2850# if defined(HAS_NL_LANGINFO) && defined(CODESET)
2851 {
4f72bb37 2852 char *codeset = nl_langinfo(CODESET);
1d958db2 2853 if (codeset && strNE(codeset, "")) {
4f72bb37 2854 codeset = savepv(codeset);
119ee68b 2855
1d958db2
KW
2856 /* If we switched LC_CTYPE, switch back */
2857 if (save_ctype_locale) {
2858 setlocale(LC_CTYPE, save_ctype_locale);
2859 Safefree(save_ctype_locale);
2860 }
2861
2862 is_utf8 = foldEQ(codeset, STR_WITH_LEN("UTF-8"))
2863 || foldEQ(codeset, STR_WITH_LEN("UTF8"));
2864
69014004
KW
2865 DEBUG_L(PerlIO_printf(Perl_debug_log,
2866 "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
2867 codeset, is_utf8));
1d958db2
KW
2868 Safefree(codeset);
2869 Safefree(save_input_locale);
2870 return is_utf8;
2871 }
119ee68b
KW
2872 }
2873
1d958db2
KW
2874# endif
2875# ifdef MB_CUR_MAX
2876
2877 /* Here, either we don't have nl_langinfo, or it didn't return a
2878 * codeset. Try MB_CUR_MAX */
2879
119ee68b
KW
2880 /* Standard UTF-8 needs at least 4 bytes to represent the maximum
2881 * Unicode code point. Since UTF-8 is the only non-single byte
2882 * encoding we handle, we just say any such encoding is UTF-8, and if
2883 * turns out to be wrong, other things will fail */
2884 is_utf8 = MB_CUR_MAX >= 4;
2885
69014004
KW
2886 DEBUG_L(PerlIO_printf(Perl_debug_log,
2887 "\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n",
2888 (int) MB_CUR_MAX, is_utf8));
2889
119ee68b
KW
2890 Safefree(save_input_locale);
2891
2892# ifdef HAS_MBTOWC
2893
2894 /* ... But, most system that have MB_CUR_MAX will also have mbtowc(),
2895 * since they are both in the C99 standard. We can feed a known byte
2896 * string to the latter function, and check that it gives the expected
2897 * result */
2898 if (is_utf8) {
2899 wchar_t wc;
856b881c 2900 PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
69014004 2901 errno = 0;
f019f68f 2902 if ((size_t)mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8))
119ee68b
KW
2903 != strlen(HYPHEN_UTF8)
2904 || wc != (wchar_t) 0x2010)
2905 {
2906 is_utf8 = FALSE;
abdcbdb8 2907 DEBUG_L(PerlIO_printf(Perl_debug_log, "\thyphen=U+%x\n", (unsigned int)wc));
69014004
KW
2908 DEBUG_L(PerlIO_printf(Perl_debug_log,
2909 "\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n",
2910 mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8)), errno));
119ee68b
KW
2911 }
2912 }
119ee68b
KW
2913# endif
2914
1d958db2
KW
2915 /* If we switched LC_CTYPE, switch back */
2916 if (save_ctype_locale) {
2917 setlocale(LC_CTYPE, save_ctype_locale);
2918 Safefree(save_ctype_locale);
119ee68b 2919 }
7d74bb61 2920
1d958db2 2921 return is_utf8;
119ee68b 2922# endif
7d74bb61 2923 }
119ee68b 2924
7d74bb61
KW
2925 cant_use_nllanginfo:
2926
0080c90a
KW
2927#else /* nl_langinfo should work if available, so don't bother compiling this
2928 fallback code. The final fallback of looking at the name is
2929 compiled, and will be executed if nl_langinfo fails */
7d74bb61 2930
97f4de96
KW
2931 /* nl_langinfo not available or failed somehow. Next try looking at the
2932 * currency symbol to see if it disambiguates things. Often that will be
2933 * in the native script, and if the symbol isn't in UTF-8, we know that the
2934 * locale isn't. If it is non-ASCII UTF-8, we infer that the locale is
609548d2
KW
2935 * too, as the odds of a non-UTF8 string being valid UTF-8 are quite small
2936 * */
fa9b773e
KW
2937
2938#ifdef HAS_LOCALECONV
fa9b773e 2939# ifdef USE_LOCALE_MONETARY
fa9b773e
KW
2940 {
2941 char *save_monetary_locale = NULL;
fa9b773e 2942 bool only_ascii = FALSE;
13542a67
KW
2943 bool is_utf8 = FALSE;
2944 struct lconv* lc;
fa9b773e 2945
97f4de96
KW
2946 /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of
2947 * the desired category, if it isn't that locale already */
2948
fa9b773e
KW
2949 if (category != LC_MONETARY) {
2950
4f72bb37 2951 save_monetary_locale = setlocale(LC_MONETARY, NULL);
fa9b773e 2952 if (! save_monetary_locale) {
69014004
KW
2953 DEBUG_L(PerlIO_printf(Perl_debug_log,
2954 "Could not find current locale for LC_MONETARY\n"));
fa9b773e
KW
2955 goto cant_use_monetary;
2956 }
4f72bb37 2957 save_monetary_locale = stdize_locale(savepv(save_monetary_locale));
fa9b773e 2958
13542a67
KW
2959 if (strEQ(save_monetary_locale, save_input_locale)) {
2960 Safefree(save_monetary_locale);
2961 save_monetary_locale = NULL;
2962 }
2963 else if (! setlocale(LC_MONETARY, save_input_locale)) {
59c234b4
KW
2964 DEBUG_L(PerlIO_printf(Perl_debug_log,
2965 "Could not change LC_MONETARY locale to %s\n",
2966 save_input_locale));
2967 Safefree(save_monetary_locale);
2968 goto cant_use_monetary;
fa9b773e
KW
2969 }
2970 }
2971
2972 /* Here the current LC_MONETARY is set to the locale of the category
2973 * whose information is desired. */
2974
13542a67
KW
2975 lc = localeconv();
2976 if (! lc
2977 || ! lc->currency_symbol
c5f058df 2978 || is_utf8_invariant_string((U8 *) lc->currency_symbol, 0))
13542a67
KW
2979 {
2980 DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
2981 only_ascii = TRUE;
2982 }
2983 else {
2984 is_utf8 = is_utf8_string((U8 *) lc->currency_symbol, 0);
fa9b773e
KW
2985 }
2986
2987 /* If we changed it, restore LC_MONETARY to its original locale */
2988 if (save_monetary_locale) {
2989 setlocale(LC_MONETARY, save_monetary_locale);
2990 Safefree(save_monetary_locale);
2991 }
2992
13542a67 2993 if (! only_ascii) {
fa9b773e 2994
59c234b4
KW
2995 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
2996 * otherwise assume the locale is UTF-8 if and only if the symbol
2997 * is non-ascii UTF-8. */
2998 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
2999 save_input_locale, is_utf8));
3000 Safefree(save_input_locale);
3001 return is_utf8;
13542a67 3002 }
fa9b773e
KW
3003 }
3004 cant_use_monetary:
3005
3006# endif /* USE_LOCALE_MONETARY */
3007#endif /* HAS_LOCALECONV */
3008
15f7e74e
KW
3009#if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
3010
3011/* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
3012 * the names of the months and weekdays, timezone, and am/pm indicator */
3013 {
3014 char *save_time_locale = NULL;
3015 int hour = 10;
3016 bool is_dst = FALSE;
3017 int dom = 1;
3018 int month = 0;
3019 int i;
3020 char * formatted_time;
3021
3022
3023 /* Like above for LC_MONETARY, we set LC_TIME to the locale of the
3024 * desired category, if it isn't that locale already */
3025
3026 if (category != LC_TIME) {
3027
3028 save_time_locale = setlocale(LC_TIME, NULL);
3029 if (! save_time_locale) {
3030 DEBUG_L(PerlIO_printf(Perl_debug_log,
3031 "Could not find current locale for LC_TIME\n"));
3032 goto cant_use_time;
3033 }
3034 save_time_locale = stdize_locale(savepv(save_time_locale));
3035
3036 if (strEQ(save_time_locale, save_input_locale)) {
3037 Safefree(save_time_locale);
3038 save_time_locale = NULL;
3039 }
3040 else if (! setlocale(LC_TIME, save_input_locale)) {
3041 DEBUG_L(PerlIO_printf(Perl_debug_log,
3042 "Could not change LC_TIME locale to %s\n",
3043 save_input_locale));
3044 Safefree(save_time_locale);
3045 goto cant_use_time;
3046 }
3047 }
3048
3049 /* Here the current LC_TIME is set to the locale of the category
3050 * whose information is desired. Look at all the days of the week and
9f10db87 3051 * month names, and the timezone and am/pm indicator for UTF-8 variant
15f7e74e
KW
3052 * characters. The first such a one found will tell us if the locale
3053 * is UTF-8 or not */
3054
3055 for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
3056 formatted_time = my_strftime("%A %B %Z %p",
3ae5cd07 3057 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
c5f058df
KW
3058 if ( ! formatted_time
3059 || is_utf8_invariant_string((U8 *) formatted_time, 0))
3060 {
15f7e74e
KW
3061
3062 /* Here, we didn't find a non-ASCII. Try the next time through
3063 * with the complemented dst and am/pm, and try with the next
3064 * weekday. After we have gotten all weekdays, try the next
3065 * month */
3066 is_dst = ! is_dst;
3067 hour = (hour + 12) % 24;
3068 dom++;
3069 if (i > 6) {
3070 month++;
3071 }
3072 continue;
3073 }
3074
3075 /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
3076 * false otherwise. But first, restore LC_TIME to its original
3077 * locale if we changed it */
3078 if (save_time_locale) {
3079 setlocale(LC_TIME, save_time_locale);
3080 Safefree(save_time_locale);
3081 }
3082
3083 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
3084 save_input_locale,
3085 is_utf8_string((U8 *) formatted_time, 0)));
3086 Safefree(save_input_locale);
3087 return is_utf8_string((U8 *) formatted_time, 0);
3088 }
3089
3090 /* Falling off the end of the loop indicates all the names were just
3091 * ASCII. Go on to the next test. If we changed it, restore LC_TIME
3092 * to its original locale */
3093 if (save_time_locale) {
3094 setlocale(LC_TIME, save_time_locale);
3095 Safefree(save_time_locale);
3096 }
3097 DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3098 }
3099 cant_use_time:
3100
3101#endif
3102
3103#if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
855aeb93
JH
3104
3105/* This code is ifdefd out because it was found to not be necessary in testing
5857e934
KW
3106 * on our dromedary test machine, which has over 700 locales. There, this
3107 * added no value to looking at the currency symbol and the time strings. I
3108 * left it in so as to avoid rewriting it if real-world experience indicates
3109 * that dromedary is an outlier. Essentially, instead of returning abpve if we
855aeb93
JH
3110 * haven't found illegal utf8, we continue on and examine all the strerror()
3111 * messages on the platform for utf8ness. If all are ASCII, we still don't
3112 * know the answer; but otherwise we have a pretty good indication of the
5857e934
KW
3113 * utf8ness. The reason this doesn't help much is that the messages may not
3114 * have been translated into the locale. The currency symbol and time strings
3115 * are much more likely to have been translated. */
3116 {
855aeb93 3117 int e;
5857e934
KW
3118 bool is_utf8 = FALSE;
3119 bool non_ascii = FALSE;
855aeb93 3120 char *save_messages_locale = NULL;
5857e934 3121 const char * errmsg = NULL;
855aeb93 3122
5857e934
KW
3123 /* Like above, we set LC_MESSAGES to the locale of the desired
3124 * category, if it isn't that locale already */
855aeb93
JH
3125
3126 if (category != LC_MESSAGES) {
3127
5857e934 3128 save_messages_locale = setlocale(LC_MESSAGES, NULL);
855aeb93 3129 if (! save_messages_locale) {
5857e934
KW
3130 DEBUG_L(PerlIO_printf(Perl_debug_log,
3131 "Could not find current locale for LC_MESSAGES\n"));
855aeb93
JH
3132 goto cant_use_messages;
3133 }
5857e934 3134 save_messages_locale = stdize_locale(savepv(save_messages_locale));
855aeb93
JH
3135
3136 if (strEQ(save_messages_locale, save_input_locale)) {
5857e934
KW
3137 Safefree(save_messages_locale);
3138 save_messages_locale = NULL;
855aeb93
JH
3139 }
3140 else if (! setlocale(LC_MESSAGES, save_input_locale)) {
5857e934
KW
3141 DEBUG_L(PerlIO_printf(Perl_debug_log,
3142 "Could not change LC_MESSAGES locale to %s\n",
3143 save_input_locale));
855aeb93
JH
3144 Safefree(save_messages_locale);
3145 goto cant_use_messages;
3146 }
3147 }
3148
3149 /* Here the current LC_MESSAGES is set to the locale of the category
5857e934
KW
3150 * whose information is desired. Look through all the messages. We
3151 * can't use Strerror() here because it may expand to code that
3152 * segfaults in miniperl */
855aeb93 3153
5857e934
KW
3154 for (e = 0; e <= sys_nerr; e++) {
3155 errno = 0;
3156 errmsg = sys_errlist[e];
3157 if (errno || !errmsg) {
855aeb93
JH
3158 break;
3159 }
5857e934 3160 errmsg = savepv(errmsg);
c5f058df 3161 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
5857e934
KW
3162 non_ascii = TRUE;
3163 is_utf8 = is_utf8_string((U8 *) errmsg, 0);
3164 break;
855aeb93
JH
3165 }
3166 }
5857e934 3167 Safefree(errmsg);
855aeb93
JH
3168
3169 /* And, if we changed it, restore LC_MESSAGES to its original locale */
3170 if (save_messages_locale) {
3171 setlocale(LC_MESSAGES, save_messages_locale);
3172 Safefree(save_messages_locale);
3173 }
3174
5857e934
KW
3175 if (non_ascii) {
3176
3177 /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
3178 * any non-ascii means it is one; otherwise we assume it isn't */
3179 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
3180 save_input_locale,
3181 is_utf8));
3182 Safefree(save_input_locale);
3183 return is_utf8;
3184 }
855aeb93 3185
5857e934 3186 DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
855aeb93
JH
3187 }
3188 cant_use_messages:
3189
3190#endif
fa9b773e 3191
0080c90a
KW
3192#endif /* the code that is compiled when no nl_langinfo */
3193
92c0a900
KW
3194#ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
3195 UTF-8 locale */
97f4de96
KW
3196 /* As a last resort, look at the locale name to see if it matches
3197 * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
3198 * return of setlocale(), is actually defined to be opaque, so we can't
3199 * really rely on the absence of various substrings in the name to indicate
3200 * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
3201 * be a UTF-8 locale. Similarly for the other common names */
3202
3203 final_pos = strlen(save_input_locale) - 1;
3204 if (final_pos >= 3) {
3205 char *name = save_input_locale;
3206
3207 /* Find next 'U' or 'u' and look from there */
3208 while ((name += strcspn(name, "Uu") + 1)
3209 <= save_input_locale + final_pos - 2)
3210 {
305b8651
KW
3211 if (!isALPHA_FOLD_NE(*name, 't')
3212 || isALPHA_FOLD_NE(*(name + 1), 'f'))
97f4de96
KW
3213 {
3214 continue;
3215 }
3216 name += 2;
3217 if (*(name) == '-') {
3218 if ((name > save_input_locale + final_pos - 1)) {
3219 break;
3220 }
3221 name++;
3222 }
3223 if (*(name) == '8') {
97f4de96
KW
3224 DEBUG_L(PerlIO_printf(Perl_debug_log,
3225 "Locale %s ends with UTF-8 in name\n",
3226 save_input_locale));
00c54b9c 3227 Safefree(save_input_locale);
97f4de96
KW
3228 return TRUE;
3229 }
3230 }
3231 DEBUG_L(PerlIO_printf(Perl_debug_log,
3232 "Locale %s doesn't end with UTF-8 in name\n",
3233 save_input_locale));
3234 }
92c0a900 3235#endif
97f4de96
KW
3236
3237#ifdef WIN32
3238 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
8a0832a1 3239 if (memENDs(save_input_locale, final_pos, "65001")) {
97f4de96 3240 DEBUG_L(PerlIO_printf(Perl_debug_log,
8a0832a1 3241 "Locale %s ends with 65001 in name, is UTF-8 locale\n",
97f4de96
KW
3242 save_input_locale));
3243 Safefree(save_input_locale);
3244 return TRUE;
3245 }
3246#endif
3247
3248 /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
3249 * since we are about to return FALSE anyway, there is no point in doing
3250 * this extra work */
3251#if 0
3252 if (instr(save_input_locale, "8859")) {
3253 DEBUG_L(PerlIO_printf(Perl_debug_log,
3254 "Locale %s has 8859 in name, not UTF-8 locale\n",
3255 save_input_locale));
3256 Safefree(save_input_locale);
3257 return FALSE;
3258 }
3259#endif
3260
69014004
KW
3261 DEBUG_L(PerlIO_printf(Perl_debug_log,
3262 "Assuming locale %s is not a UTF-8 locale\n",
3263 save_input_locale));
fa9b773e 3264 Safefree(save_input_locale);
7d74bb61
KW
3265 return FALSE;
3266}
3267
8ef6e574 3268#endif
7d74bb61 3269
d6ded950
KW
3270
3271bool
3272Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
3273{
1a4f13e1 3274 dVAR;
d6ded950
KW
3275 /* Internal function which returns if we are in the scope of a pragma that
3276 * enables the locale category 'category'. 'compiling' should indicate if
3277 * this is during the compilation phase (TRUE) or not (FALSE). */
3278
3279 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
3280
3281 SV *categories = cop_hints_fetch_pvs(cop, "locale", 0);
3282 if (! categories || categories == &PL_sv_placeholder) {
3283 return FALSE;
3284 }
3285
3286 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
3287 * a valid unsigned */
3288 assert(category >= -1);
3289 return cBOOL(SvUV(categories) & (1U << (category + 1)));
3290}
3291
2c6ee1a7 3292char *
6ebbc862
KW
3293Perl_my_strerror(pTHX_ const int errnum)
3294{
3295 /* Returns a mortalized copy of the text of the error message associated
3296 * with 'errnum'. It uses the current locale's text unless the platform
3297 * doesn't have the LC_MESSAGES category or we are not being called from
3298 * within the scope of 'use locale'. In the former case, it uses whatever
3299 * strerror returns; in the latter case it uses the text from the C locale.
3300 *
3301 * The function just calls strerror(), but temporarily switches, if needed,
3302 * to the C locale */
3303
3304 char *errstr;
52770946 3305 dVAR;
6ebbc862 3306
52770946 3307#ifndef USE_LOCALE_MESSAGES
6ebbc862 3308
52770946
KW
3309 /* If platform doesn't have messages category, we don't do any switching to
3310 * the C locale; we just use whatever strerror() returns */
3311
3312 errstr = savepv(Strerror(errnum));
3313
3314#else /* Has locale messages */
3315
3316 const bool within_locale_scope = IN_LC(LC_MESSAGES);
2c6ee1a7 3317
7aaa36b1
KW
3318# if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
3319
3320 /* This function is trivial if we have strerror_l() */
3321
3322 if (within_locale_scope) {
3323 errstr = strerror(errnum);
3324 }
3325 else {
3326 errstr = strerror_l(errnum, PL_C_locale_obj);
3327 }
3328
3329 errstr = savepv(errstr);
3330
3331# else /* Doesn't have strerror_l(). */
3332
3333# ifdef USE_POSIX_2008_LOCALE
3334
fcd0e682 3335 locale_t save_locale = NULL;
7aaa36b1
KW
3336
3337# else
3338
fcd0e682 3339 char * save_locale = NULL;
c9dda6da 3340 bool locale_is_C = FALSE;
2c6ee1a7 3341
6ebbc862
KW
3342 /* We have a critical section to prevent another thread from changing the
3343 * locale out from under us (or zapping the buffer returned from
3344 * setlocale() ) */
3345 LOCALE_LOCK;
3346
7aaa36b1 3347# endif
6ebbc862 3348
9c8a6dc2
KW
3349 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3350 "my_strerror called with errnum %d\n", errnum));
6ebbc862 3351 if (! within_locale_scope) {
c9dda6da 3352 errno = 0;
a0b53297 3353
f1d2176b 3354# ifdef USE_POSIX_2008_LOCALE /* Use the thread-safe locale functions */
6ebbc862 3355
9c8a6dc2
KW
3356 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3357 "Not within locale scope, about to call"
3358 " uselocale(0x%p)\n", PL_C_locale_obj));
6ebbc862 3359 save_locale = uselocale(PL_C_locale_obj);
c9dda6da
KW
3360 if (! save_locale) {
3361 DEBUG_L(PerlIO_printf(Perl_debug_log,
9c8a6dc2
KW
3362 "uselocale failed, errno=%d\n", errno));
3363 }
3364 else {
3365 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3366 "uselocale returned 0x%p\n", save_locale));
c9dda6da 3367 }
6ebbc862 3368
7aaa36b1 3369# else /* Not thread-safe build */
a0b53297
KW
3370
3371 save_locale = setlocale(LC_MESSAGES, NULL);
c9dda6da
KW
3372 if (! save_locale) {
3373 DEBUG_L(PerlIO_printf(Perl_debug_log,
3374 "setlocale failed, errno=%d\n", errno));
3375 }
3376 else {
3377 locale_is_C = isNAME_C_OR_POSIX(save_locale);
2c6ee1a7 3378
c9dda6da
KW
3379 /* Switch to the C locale if not already in it */
3380 if (! locale_is_C) {
2c6ee1a7 3381
c9dda6da
KW
3382 /* The setlocale() just below likely will zap 'save_locale', so
3383 * create a copy. */
3384 save_locale = savepv(save_locale);
3385 setlocale(LC_MESSAGES, "C");
3386 }
6ebbc862 3387 }
2c6ee1a7 3388
7aaa36b1 3389# endif
2c6ee1a7 3390
6ebbc862 3391 } /* end of ! within_locale_scope */
9c8a6dc2
KW
3392 else {
3393 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n",
3394 __FILE__, __LINE__));
3395 }
a0b53297 3396
9c8a6dc2
KW
3397 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3398 "Any locale change has been done; about to call Strerror\n"));
52770946 3399 errstr = savepv(Strerror(errnum));
6ebbc862
KW
3400
3401 if (! within_locale_scope) {
c9dda6da 3402 errno = 0;
a0b53297 3403
f1d2176b 3404# ifdef USE_POSIX_2008_LOCALE
6ebbc862 3405
9c8a6dc2
KW
3406 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3407 "%s: %d: not within locale scope, restoring the locale\n",
3408 __FILE__, __LINE__));
c9dda6da
KW
3409 if (save_locale && ! uselocale(save_locale)) {
3410 DEBUG_L(PerlIO_printf(Perl_debug_log,
3411 "uselocale restore failed, errno=%d\n", errno));
3412 }
2c6ee1a7 3413 }
6ebbc862 3414
7aaa36b1 3415# else
6ebbc862 3416
c9dda6da
KW
3417 if (save_locale && ! locale_is_C) {
3418 if (! setlocale(LC_MESSAGES, save_locale)) {
3419 DEBUG_L(PerlIO_printf(Perl_debug_log,
3420 "setlocale restore failed, errno=%d\n", errno));
3421 }
6ebbc862
KW
3422 Safefree(save_locale);
3423 }
3424 }
3425
3426 LOCALE_UNLOCK;
3427
7aaa36b1
KW
3428# endif
3429# endif /* End of doesn't have strerror_l */
52770946 3430#endif /* End of does have locale messages */
6affbbf0
KW
3431
3432#ifdef DEBUGGING
3433
3434 if (DEBUG_Lv_TEST) {
3435 PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '");
3436 print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);
3437 PerlIO_printf(Perl_debug_log, "'\n");
3438 }
3439
2c6ee1a7
KW
3440#endif
3441
52770946 3442 SAVEFREEPV(errstr);
6ebbc862 3443 return errstr;
2c6ee1a7
KW
3444}
3445
66610fdd 3446/*
747c467a 3447
747c467a
KW
3448=for apidoc sync_locale
3449
3450Changing the program's locale should be avoided by XS code. Nevertheless,
3451certain non-Perl libraries called from XS, such as C<Gtk> do so. When this
3452happens, Perl needs to be told that the locale has changed. Use this function
3453to do so, before returning to Perl.
3454
3455=cut
3456*/
3457
3458void
3459Perl_sync_locale(pTHX)
3460{
3461
3462#ifdef USE_LOCALE_CTYPE
3463 new_ctype(setlocale(LC_CTYPE, NULL));
3464#endif /* USE_LOCALE_CTYPE */
3465
3466#ifdef USE_LOCALE_COLLATE
3467 new_collate(setlocale(LC_COLLATE, NULL));
3468#endif
3469
3470#ifdef USE_LOCALE_NUMERIC
3471 set_numeric_local(); /* Switch from "C" to underlying LC_NUMERIC */
3472 new_numeric(setlocale(LC_NUMERIC, NULL));
3473#endif /* USE_LOCALE_NUMERIC */
3474
3475}
3476
5d1187d1
KW
3477#if defined(DEBUGGING) && defined(USE_LOCALE)
3478
a4f00dcc
KW
3479STATIC char *
3480S_setlocale_debug_string(const int category, /* category number,
5d1187d1
KW
3481 like LC_ALL */
3482 const char* const locale, /* locale name */
3483
3484 /* return value from setlocale() when attempting to
3485 * set 'category' to 'locale' */
3486 const char* const retval)
3487{
3488 /* Returns a pointer to a NUL-terminated string in static storage with
3489 * added text about the info passed in. This is not thread safe and will
3490 * be overwritten by the next call, so this should be used just to
fa07b8e5 3491 * formulate a string to immediately print or savepv() on. */
5d1187d1 3492
398a990f
DM
3493 /* initialise to a non-null value to keep it out of BSS and so keep
3494 * -DPERL_GLOBAL_STRUCT_PRIVATE happy */
60b45a7d
KW
3495 static char ret[128] = "If you can read this, thank your buggy C"
3496 " library strlcpy(), and change your hints file"
3497 " to undef it";
fa07b8e5 3498 my_strlcpy(ret, "setlocale(", sizeof(ret));
5d1187d1
KW
3499
3500 switch (category) {
3501 default:
fa07b8e5 3502 my_snprintf(ret, sizeof(ret), "%s? %d", ret, category);
5d1187d1
KW
3503 break;
3504# ifdef LC_ALL
3505 case LC_ALL:
fa07b8e5 3506 my_strlcat(ret, "LC_ALL", sizeof(ret));
5d1187d1
KW
3507 break;
3508# endif
3509# ifdef LC_CTYPE
3510 case LC_CTYPE:
fa07b8e5 3511 my_strlcat(ret, "LC_CTYPE", sizeof(ret));
5d1187d1
KW
3512 break;
3513# endif
3514# ifdef LC_NUMERIC
3515 case LC_NUMERIC:
fa07b8e5 3516 my_strlcat(ret, "LC_NUMERIC", sizeof(ret));
5d1187d1
KW
3517 break;
3518# endif
3519# ifdef LC_COLLATE
3520 case LC_COLLATE:
fa07b8e5 3521 my_strlcat(ret, "LC_COLLATE", sizeof(ret));
5d1187d1
KW
3522 break;
3523# endif
3524# ifdef LC_TIME
3525 case LC_TIME:
fa07b8e5 3526 my_strlcat(ret, "LC_TIME", sizeof(ret));
5d1187d1
KW
3527 break;
3528# endif
3529# ifdef LC_MONETARY
3530 case LC_MONETARY:
fa07b8e5 3531 my_strlcat(ret, "LC_MONETARY", sizeof(ret));
5d1187d1
KW
3532 break;
3533# endif
3534# ifdef LC_MESSAGES
3535 case LC_MESSAGES:
fa07b8e5 3536 my_strlcat(ret, "LC_MESSAGES", sizeof(ret));
5d1187d1
KW
3537 break;
3538# endif
3539 }
3540
fa07b8e5 3541 my_strlcat(ret, ", ", sizeof(ret));
5d1187d1
KW
3542
3543 if (locale) {
fa07b8e5
KW
3544 my_strlcat(ret, "\"", sizeof(ret));
3545 my_strlcat(ret, locale, sizeof(ret));
3546 my_strlcat(ret, "\"", sizeof(ret));
5d1187d1
KW
3547 }
3548 else {
fa07b8e5 3549 my_strlcat(ret, "NULL", sizeof(ret));
5d1187d1
KW
3550 }
3551
fa07b8e5 3552 my_strlcat(ret, ") returned ", sizeof(ret));
5d1187d1
KW
3553
3554 if (retval) {
fa07b8e5
KW
3555 my_strlcat(ret, "\"", sizeof(ret));
3556 my_strlcat(ret, retval, sizeof(ret));
3557 my_strlcat(ret, "\"", sizeof(ret));
5d1187d1
KW
3558 }
3559 else {
fa07b8e5 3560 my_strlcat(ret, "NULL", sizeof(ret));
5d1187d1
KW
3561 }
3562
3563 assert(strlen(ret) < sizeof(ret));
3564
3565 return ret;
3566}
3567
3568#endif
747c467a
KW
3569
3570
3571/*
14d04a33 3572 * ex: set ts=8 sts=4 sw=4 et:
37442d52 3573 */