This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
locale.c:sync_locale(): Add debugging info
[perl5.git] / locale.c
CommitLineData
98994639
HS
1/* locale.c
2 *
1129b882
NC
3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
98994639
HS
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550 12 * A Elbereth Gilthoniel,
cdad3b53 13 * silivren penna míriel
4ac71550 14 * o menel aglar elenath!
cdad3b53 15 * Na-chaered palan-díriel
4ac71550
TC
16 * o galadhremmin ennorath,
17 * Fanuilos, le linnathon
18 * nef aear, si nef aearon!
19 *
20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
98994639
HS
21 */
22
166f8a29
DM
23/* utility functions for handling locale-specific stuff like what
24 * character represents the decimal point.
0d071d52 25 *
7d4bcc4a
KW
26 * All C programs have an underlying locale. Perl code generally doesn't pay
27 * any attention to it except within the scope of a 'use locale'. For most
0d071d52
KW
28 * categories, it accomplishes this by just using different operations if it is
29 * in such scope than if not. However, various libc functions called by Perl
30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31 * are used to toggle between the current locale and the C locale depending on
a9ad02a8
KW
32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is
33 * switched to the C locale for outputting the message unless within the scope
34 * of 'use locale'.
166f8a29
DM
35 */
36
98994639
HS
37#include "EXTERN.h"
38#define PERL_IN_LOCALE_C
f7416781 39#include "perl_langinfo.h"
98994639
HS
40#include "perl.h"
41
a4af207c
JH
42#include "reentr.h"
43
2fcc0ca9
KW
44/* If the environment says to, we can output debugging information during
45 * initialization. This is done before option parsing, and before any thread
46 * creation, so can be a file-level static */
47#ifdef DEBUGGING
7d4bcc4a 48# ifdef PERL_GLOBAL_STRUCT
27cdc72e 49 /* no global syms allowed */
7d4bcc4a
KW
50# define debug_initialization 0
51# define DEBUG_INITIALIZATION_set(v)
52# else
2fcc0ca9 53static bool debug_initialization = FALSE;
7d4bcc4a
KW
54# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
55# endif
2fcc0ca9
KW
56#endif
57
8ef6e574
KW
58#ifdef USE_LOCALE
59
98994639 60/*
0d071d52
KW
61 * Standardize the locale name from a string returned by 'setlocale', possibly
62 * modifying that string.
98994639 63 *
0ef2a2b2 64 * The typical return value of setlocale() is either
98994639
HS
65 * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
66 * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
67 * (the space-separated values represent the various sublocales,
0ef2a2b2 68 * in some unspecified order). This is not handled by this function.
98994639
HS
69 *
70 * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
0ef2a2b2
KW
71 * which is harmful for further use of the string in setlocale(). This
72 * function removes the trailing new line and everything up through the '='
98994639
HS
73 *
74 */
75STATIC char *
76S_stdize_locale(pTHX_ char *locs)
77{
7452cf6a 78 const char * const s = strchr(locs, '=');
98994639
HS
79 bool okay = TRUE;
80
7918f24d
NC
81 PERL_ARGS_ASSERT_STDIZE_LOCALE;
82
8772537c
AL
83 if (s) {
84 const char * const t = strchr(s, '.');
98994639 85 okay = FALSE;
8772537c
AL
86 if (t) {
87 const char * const u = strchr(t, '\n');
88 if (u && (u[1] == 0)) {
89 const STRLEN len = u - s;
90 Move(s + 1, locs, len, char);
91 locs[len] = 0;
92 okay = TRUE;
98994639
HS
93 }
94 }
95 }
96
97 if (!okay)
98 Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
99
100 return locs;
101}
102
8ef6e574
KW
103#endif
104
d2b24094
KW
105/* Windows requres a customized base-level setlocale() */
106# ifdef WIN32
107# define my_setlocale(cat, locale) win32_setlocale(cat, locale)
108# else
109# define my_setlocale(cat, locale) setlocale(cat, locale)
110# endif
111
837ce802
KW
112/* Just placeholders for now. "_c" is intended to be called when the category
113 * is a constant known at compile time; "_r", not known until run time */
d2b24094
KW
114# define do_setlocale_c(category, locale) my_setlocale(category, locale)
115# define do_setlocale_r(category, locale) my_setlocale(category, locale)
837ce802 116
a4f00dcc 117STATIC void
86799d2d 118S_set_numeric_radix(pTHX_ const bool use_locale)
98994639 119{
86799d2d
KW
120 /* If 'use_locale' is FALSE, set to use a dot for the radix character. If
121 * TRUE, use the radix character derived from the current locale */
7d4bcc4a 122
86799d2d
KW
123#if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \
124 || defined(HAS_NL_LANGINFO))
98994639 125
86799d2d
KW
126 /* We only set up the radix SV if we are to use a locale radix ... */
127 if (use_locale) {
128 const char * radix = my_nl_langinfo(PERL_RADIXCHAR, FALSE);
129 /* FALSE => already in dest locale */
130
131 /* ... and the character being used isn't a dot */
132 if (strNE(radix, ".")) {
133 if (PL_numeric_radix_sv) {
134 sv_setpv(PL_numeric_radix_sv, radix);
135 }
136 else {
137 PL_numeric_radix_sv = newSVpv(radix, 0);
138 }
139
140 if ( ! is_utf8_invariant_string(
141 (U8 *) SvPVX(PL_numeric_radix_sv), SvCUR(PL_numeric_radix_sv))
142 && is_utf8_string(
143 (U8 *) SvPVX(PL_numeric_radix_sv), SvCUR(PL_numeric_radix_sv))
c1284011 144 && _is_cur_LC_category_utf8(LC_NUMERIC))
28acfe03 145 {
86799d2d 146 SvUTF8_on(PL_numeric_radix_sv);
28acfe03 147 }
86799d2d
KW
148 goto done;
149 }
98994639 150 }
69014004 151
86799d2d
KW
152 SvREFCNT_dec(PL_numeric_radix_sv);
153 PL_numeric_radix_sv = NULL;
154
155 done: ;
156
157# ifdef DEBUGGING
7d4bcc4a 158
2fcc0ca9
KW
159 if (DEBUG_L_TEST || debug_initialization) {
160 PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
69014004 161 (PL_numeric_radix_sv)
37b7e435
KW
162 ? SvPVX(PL_numeric_radix_sv)
163 : "NULL",
164 (PL_numeric_radix_sv)
39eb7305 165 ? cBOOL(SvUTF8(PL_numeric_radix_sv))
2fcc0ca9
KW
166 : 0);
167 }
69014004 168
86799d2d
KW
169# endif
170#endif /* USE_LOCALE_NUMERIC and can find the radix char */
7d4bcc4a 171
98994639
HS
172}
173
a39edc4c
KW
174/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
175 * return of setlocale(), then this is extremely likely to be the C or POSIX
176 * locale. However, the output of setlocale() is documented to be opaque, but
177 * the odds are extremely small that it would return these two strings for some
178 * other locale. Note that VMS in these two locales includes many non-ASCII
179 * characters as controls and punctuation (below are hex bytes):
180 * cntrl: 00-1F 7F 84-97 9B-9F
181 * punct: 21-2F 3A-40 5B-60 7B-7E A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
182 * Oddly, none there are listed as alphas, though some represent alphabetics
183 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
7d4bcc4a
KW
184#define isNAME_C_OR_POSIX(name) \
185 ( (name) != NULL \
186 && (( *(name) == 'C' && (*(name + 1)) == '\0') \
187 || strEQ((name), "POSIX")))
a39edc4c 188
98994639 189void
8772537c 190Perl_new_numeric(pTHX_ const char *newnum)
98994639 191{
7d4bcc4a
KW
192
193#ifndef USE_LOCALE_NUMERIC
194
195 PERL_UNUSED_ARG(newnum);
196
197#else
0d071d52
KW
198
199 /* Called after all libc setlocale() calls affecting LC_NUMERIC, to tell
200 * core Perl this and that 'newnum' is the name of the new locale.
201 * It installs this locale as the current underlying default.
202 *
203 * The default locale and the C locale can be toggled between by use of the
204 * set_numeric_local() and set_numeric_standard() functions, which should
205 * probably not be called directly, but only via macros like
206 * SET_NUMERIC_STANDARD() in perl.h.
207 *
208 * The toggling is necessary mainly so that a non-dot radix decimal point
209 * character can be output, while allowing internal calculations to use a
210 * dot.
211 *
212 * This sets several interpreter-level variables:
bb304765 213 * PL_numeric_name The underlying locale's name: a copy of 'newnum'
0d071d52 214 * PL_numeric_local A boolean indicating if the toggled state is such
7738054c
KW
215 * that the current locale is the program's underlying
216 * locale
217 * PL_numeric_standard An int indicating if the toggled state is such
218 * that the current locale is the C locale. If non-zero,
219 * it is in C; if > 1, it means it may not be toggled away
220 * from C.
0d071d52
KW
221 * Note that both of the last two variables can be true at the same time,
222 * if the underlying locale is C. (Toggling is a no-op under these
223 * circumstances.)
224 *
225 * Any code changing the locale (outside this file) should use
226 * POSIX::setlocale, which calls this function. Therefore this function
227 * should be called directly only from this file and from
228 * POSIX::setlocale() */
229
b03f34cf 230 char *save_newnum;
98994639
HS
231
232 if (! newnum) {
43c5f42d
NC
233 Safefree(PL_numeric_name);
234 PL_numeric_name = NULL;
98994639
HS
235 PL_numeric_standard = TRUE;
236 PL_numeric_local = TRUE;
237 return;
238 }
239
b03f34cf 240 save_newnum = stdize_locale(savepv(newnum));
abe1abcf
KW
241
242 PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
243 PL_numeric_local = TRUE;
244
b03f34cf 245 if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
98994639 246 Safefree(PL_numeric_name);
b03f34cf 247 PL_numeric_name = save_newnum;
b03f34cf 248 }
abe1abcf
KW
249 else {
250 Safefree(save_newnum);
251 }
4c28b29c
KW
252
253 /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't
254 * have to worry about the radix being a non-dot. (Core operations that
255 * need the underlying locale change to it temporarily). */
256 set_numeric_standard();
257
98994639 258#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 259
98994639
HS
260}
261
262void
263Perl_set_numeric_standard(pTHX)
264{
7d4bcc4a 265
98994639 266#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 267
28c1bf33
KW
268 /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like
269 * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The
270 * macro avoids calling this routine if toggling isn't necessary according
271 * to our records (which could be wrong if some XS code has changed the
272 * locale behind our back) */
0d071d52 273
837ce802 274 do_setlocale_c(LC_NUMERIC, "C");
a9b8c0d8
KW
275 PL_numeric_standard = TRUE;
276 PL_numeric_local = isNAME_C_OR_POSIX(PL_numeric_name);
86799d2d 277 set_numeric_radix(0);
7d4bcc4a
KW
278
279# ifdef DEBUGGING
280
2fcc0ca9
KW
281 if (DEBUG_L_TEST || debug_initialization) {
282 PerlIO_printf(Perl_debug_log,
283 "Underlying LC_NUMERIC locale now is C\n");
284 }
98994639 285
7d4bcc4a 286# endif
98994639 287#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 288
98994639
HS
289}
290
291void
292Perl_set_numeric_local(pTHX)
293{
7d4bcc4a 294
98994639 295#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 296
28c1bf33 297 /* Toggle the LC_NUMERIC locale to the current underlying default. Most
7d4bcc4a
KW
298 * code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h
299 * instead of calling this directly. The macro avoids calling this routine
300 * if toggling isn't necessary according to our records (which could be
301 * wrong if some XS code has changed the locale behind our back) */
a9b8c0d8 302
837ce802 303 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
a9b8c0d8
KW
304 PL_numeric_standard = isNAME_C_OR_POSIX(PL_numeric_name);
305 PL_numeric_local = TRUE;
86799d2d 306 set_numeric_radix(1);
7d4bcc4a
KW
307
308# ifdef DEBUGGING
309
2fcc0ca9
KW
310 if (DEBUG_L_TEST || debug_initialization) {
311 PerlIO_printf(Perl_debug_log,
69014004 312 "Underlying LC_NUMERIC locale now is %s\n",
2fcc0ca9
KW
313 PL_numeric_name);
314 }
98994639 315
7d4bcc4a 316# endif
98994639 317#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 318
98994639
HS
319}
320
321/*
322 * Set up for a new ctype locale.
323 */
a4f00dcc
KW
324STATIC void
325S_new_ctype(pTHX_ const char *newctype)
98994639 326{
7d4bcc4a
KW
327
328#ifndef USE_LOCALE_CTYPE
329
330 PERL_ARGS_ASSERT_NEW_CTYPE;
331 PERL_UNUSED_ARG(newctype);
332 PERL_UNUSED_CONTEXT;
333
334#else
0d071d52
KW
335
336 /* Called after all libc setlocale() calls affecting LC_CTYPE, to tell
337 * core Perl this and that 'newctype' is the name of the new locale.
338 *
339 * This function sets up the folding arrays for all 256 bytes, assuming
340 * that tofold() is tolc() since fold case is not a concept in POSIX,
341 *
342 * Any code changing the locale (outside this file) should use
343 * POSIX::setlocale, which calls this function. Therefore this function
344 * should be called directly only from this file and from
345 * POSIX::setlocale() */
346
27da23d5 347 dVAR;
68067e4e 348 UV i;
98994639 349
7918f24d
NC
350 PERL_ARGS_ASSERT_NEW_CTYPE;
351
215c5139
KW
352 /* We will replace any bad locale warning with 1) nothing if the new one is
353 * ok; or 2) a new warning for the bad new locale */
354 if (PL_warn_locale) {
355 SvREFCNT_dec_NN(PL_warn_locale);
356 PL_warn_locale = NULL;
357 }
358
c1284011 359 PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
31f05a37
KW
360
361 /* A UTF-8 locale gets standard rules. But note that code still has to
362 * handle this specially because of the three problematic code points */
363 if (PL_in_utf8_CTYPE_locale) {
364 Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
365 }
366 else {
8c6180a9
KW
367 /* Assume enough space for every character being bad. 4 spaces each
368 * for the 94 printable characters that are output like "'x' "; and 5
369 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
370 * NUL */
371 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ];
372
cc9eaeb0
KW
373 /* Don't check for problems if we are suppressing the warnings */
374 bool check_for_problems = ckWARN_d(WARN_LOCALE)
375 || UNLIKELY(DEBUG_L_TEST);
8c6180a9
KW
376 bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
377 to start */
378 unsigned int bad_count = 0; /* Count of bad characters */
379
baa60164
KW
380 for (i = 0; i < 256; i++) {
381 if (isUPPER_LC((U8) i))
382 PL_fold_locale[i] = (U8) toLOWER_LC((U8) i);
383 else if (isLOWER_LC((U8) i))
384 PL_fold_locale[i] = (U8) toUPPER_LC((U8) i);
385 else
386 PL_fold_locale[i] = (U8) i;
8c6180a9
KW
387
388 /* If checking for locale problems, see if the native ASCII-range
389 * printables plus \n and \t are in their expected categories in
390 * the new locale. If not, this could mean big trouble, upending
391 * Perl's and most programs' assumptions, like having a
392 * metacharacter with special meaning become a \w. Fortunately,
393 * it's very rare to find locales that aren't supersets of ASCII
394 * nowadays. It isn't a problem for most controls to be changed
395 * into something else; we check only \n and \t, though perhaps \r
396 * could be an issue as well. */
7d4bcc4a 397 if ( check_for_problems
8c6180a9
KW
398 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
399 {
7d4bcc4a 400 if (( isALPHANUMERIC_A(i) && ! isALPHANUMERIC_LC(i))
8c6180a9
KW
401 || (isPUNCT_A(i) && ! isPUNCT_LC(i))
402 || (isBLANK_A(i) && ! isBLANK_LC(i))
403 || (i == '\n' && ! isCNTRL_LC(i)))
404 {
405 if (bad_count) { /* Separate multiple entries with a
406 blank */
407 bad_chars_list[bad_count++] = ' ';
408 }
409 bad_chars_list[bad_count++] = '\'';
410 if (isPRINT_A(i)) {
411 bad_chars_list[bad_count++] = (char) i;
412 }
413 else {
414 bad_chars_list[bad_count++] = '\\';
415 if (i == '\n') {
416 bad_chars_list[bad_count++] = 'n';
417 }
418 else {
419 assert(i == '\t');
420 bad_chars_list[bad_count++] = 't';
421 }
422 }
423 bad_chars_list[bad_count++] = '\'';
424 bad_chars_list[bad_count] = '\0';
425 }
426 }
427 }
428
7d4bcc4a
KW
429# ifdef MB_CUR_MAX
430
8c6180a9 431 /* We only handle single-byte locales (outside of UTF-8 ones; so if
d35fca5f 432 * this locale requires more than one byte, there are going to be
8c6180a9 433 * problems. */
9c8a6dc2
KW
434 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
435 "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
436 __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
437
ba1a4362
KW
438 if (check_for_problems && MB_CUR_MAX > 1
439
440 /* Some platforms return MB_CUR_MAX > 1 for even the "C"
441 * locale. Just assume that the implementation for them (plus
442 * for POSIX) is correct and the > 1 value is spurious. (Since
443 * these are specially handled to never be considered UTF-8
444 * locales, as long as this is the only problem, everything
445 * should work fine */
446 && strNE(newctype, "C") && strNE(newctype, "POSIX"))
447 {
8c6180a9
KW
448 multi_byte_locale = TRUE;
449 }
7d4bcc4a
KW
450
451# endif
8c6180a9
KW
452
453 if (bad_count || multi_byte_locale) {
780fcc9f 454 PL_warn_locale = Perl_newSVpvf(aTHX_
8c6180a9 455 "Locale '%s' may not work well.%s%s%s\n",
780fcc9f 456 newctype,
8c6180a9
KW
457 (multi_byte_locale)
458 ? " Some characters in it are not recognized by"
459 " Perl."
460 : "",
461 (bad_count)
462 ? "\nThe following characters (and maybe others)"
463 " may not have the same meaning as the Perl"
464 " program expects:\n"
465 : "",
466 (bad_count)
467 ? bad_chars_list
468 : ""
469 );
cc9eaeb0 470 /* If we are actually in the scope of the locale or are debugging,
bddebb56
KW
471 * output the message now. If not in that scope, we save the
472 * message to be output at the first operation using this locale,
473 * if that actually happens. Most programs don't use locales, so
474 * they are immune to bad ones. */
cc9eaeb0 475 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
780fcc9f
KW
476
477 /* We have to save 'newctype' because the setlocale() just
478 * below may destroy it. The next setlocale() further down
479 * should restore it properly so that the intermediate change
480 * here is transparent to this function's caller */
481 const char * const badlocale = savepv(newctype);
482
837ce802 483 do_setlocale_c(LC_CTYPE, "C");
780fcc9f
KW
484
485 /* The '0' below suppresses a bogus gcc compiler warning */
486 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
bddebb56 487
837ce802 488 do_setlocale_c(LC_CTYPE, badlocale);
c0f3a893 489 Safefree(badlocale);
bddebb56
KW
490
491 if (IN_LC(LC_CTYPE)) {
492 SvREFCNT_dec_NN(PL_warn_locale);
493 PL_warn_locale = NULL;
494 }
780fcc9f 495 }
baa60164 496 }
31f05a37 497 }
98994639
HS
498
499#endif /* USE_LOCALE_CTYPE */
7d4bcc4a 500
98994639
HS
501}
502
98994639 503void
2726666d
KW
504Perl__warn_problematic_locale()
505{
2726666d
KW
506
507#ifdef USE_LOCALE_CTYPE
508
5f04a188
KW
509 dTHX;
510
511 /* Internal-to-core function that outputs the message in PL_warn_locale,
512 * and then NULLS it. Should be called only through the macro
513 * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */
514
2726666d
KW
515 if (PL_warn_locale) {
516 /*GCC_DIAG_IGNORE(-Wformat-security); Didn't work */
517 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
518 SvPVX(PL_warn_locale),
519 0 /* dummy to avoid compiler warning */ );
520 /* GCC_DIAG_RESTORE; */
521 SvREFCNT_dec_NN(PL_warn_locale);
522 PL_warn_locale = NULL;
523 }
524
525#endif
526
527}
528
a4f00dcc
KW
529STATIC void
530S_new_collate(pTHX_ const char *newcoll)
98994639 531{
7d4bcc4a
KW
532
533#ifndef USE_LOCALE_COLLATE
534
535 PERL_UNUSED_ARG(newcoll);
536 PERL_UNUSED_CONTEXT;
537
538#else
0d071d52
KW
539
540 /* Called after all libc setlocale() calls affecting LC_COLLATE, to tell
541 * core Perl this and that 'newcoll' is the name of the new locale.
542 *
d35fca5f
KW
543 * The design of locale collation is that every locale change is given an
544 * index 'PL_collation_ix'. The first time a string particpates in an
545 * operation that requires collation while locale collation is active, it
546 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
547 * magic includes the collation index, and the transformation of the string
548 * by strxfrm(), q.v. That transformation is used when doing comparisons,
549 * instead of the string itself. If a string changes, the magic is
550 * cleared. The next time the locale changes, the index is incremented,
551 * and so we know during a comparison that the transformation is not
552 * necessarily still valid, and so is recomputed. Note that if the locale
553 * changes enough times, the index could wrap (a U32), and it is possible
554 * that a transformation would improperly be considered valid, leading to
555 * an unlikely bug */
0d071d52 556
98994639
HS
557 if (! newcoll) {
558 if (PL_collation_name) {
559 ++PL_collation_ix;
560 Safefree(PL_collation_name);
561 PL_collation_name = NULL;
562 }
563 PL_collation_standard = TRUE;
00bf60ca 564 is_standard_collation:
98994639
HS
565 PL_collxfrm_base = 0;
566 PL_collxfrm_mult = 2;
165a1c52 567 PL_in_utf8_COLLATE_locale = FALSE;
f28f4d2a 568 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 569 PL_strxfrm_max_cp = 0;
98994639
HS
570 return;
571 }
572
d35fca5f 573 /* If this is not the same locale as currently, set the new one up */
98994639
HS
574 if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
575 ++PL_collation_ix;
576 Safefree(PL_collation_name);
577 PL_collation_name = stdize_locale(savepv(newcoll));
a39edc4c 578 PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
00bf60ca
KW
579 if (PL_collation_standard) {
580 goto is_standard_collation;
581 }
98994639 582
165a1c52 583 PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
f28f4d2a 584 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 585 PL_strxfrm_max_cp = 0;
165a1c52 586
59c018b9
KW
587 /* A locale collation definition includes primary, secondary, tertiary,
588 * etc. weights for each character. To sort, the primary weights are
589 * used, and only if they compare equal, then the secondary weights are
590 * used, and only if they compare equal, then the tertiary, etc.
591 *
592 * strxfrm() works by taking the input string, say ABC, and creating an
593 * output transformed string consisting of first the primary weights,
594 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
595 * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
596 * may not have weights at every level. In our example, let's say B
597 * doesn't have a tertiary weight, and A doesn't have a secondary
598 * weight. The constructed string is then going to be
599 * A¹B¹C¹ B²C² A³C³ ....
600 * This has the desired effect that strcmp() will look at the secondary
601 * or tertiary weights only if the strings compare equal at all higher
602 * priority weights. The spaces shown here, like in
c342d20e 603 * "A¹B¹C¹ A²B²C² "
59c018b9
KW
604 * are not just for readability. In the general case, these must
605 * actually be bytes, which we will call here 'separator weights'; and
606 * they must be smaller than any other weight value, but since these
607 * are C strings, only the terminating one can be a NUL (some
608 * implementations may include a non-NUL separator weight just before
609 * the NUL). Implementations tend to reserve 01 for the separator
610 * weights. They are needed so that a shorter string's secondary
611 * weights won't be misconstrued as primary weights of a longer string,
612 * etc. By making them smaller than any other weight, the shorter
613 * string will sort first. (Actually, if all secondary weights are
614 * smaller than all primary ones, there is no need for a separator
615 * weight between those two levels, etc.)
616 *
617 * The length of the transformed string is roughly a linear function of
618 * the input string. It's not exactly linear because some characters
619 * don't have weights at all levels. When we call strxfrm() we have to
620 * allocate some memory to hold the transformed string. The
621 * calculations below try to find coefficients 'm' and 'b' for this
622 * locale so that m*x + b equals how much space we need, given the size
623 * of the input string in 'x'. If we calculate too small, we increase
624 * the size as needed, and call strxfrm() again, but it is better to
625 * get it right the first time to avoid wasted expensive string
626 * transformations. */
627
98994639 628 {
79f120c8
KW
629 /* We use the string below to find how long the tranformation of it
630 * is. Almost all locales are supersets of ASCII, or at least the
631 * ASCII letters. We use all of them, half upper half lower,
632 * because if we used fewer, we might hit just the ones that are
633 * outliers in a particular locale. Most of the strings being
634 * collated will contain a preponderance of letters, and even if
635 * they are above-ASCII, they are likely to have the same number of
636 * weight levels as the ASCII ones. It turns out that digits tend
637 * to have fewer levels, and some punctuation has more, but those
638 * are relatively sparse in text, and khw believes this gives a
639 * reasonable result, but it could be changed if experience so
640 * dictates. */
641 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
642 char * x_longer; /* Transformed 'longer' */
643 Size_t x_len_longer; /* Length of 'x_longer' */
644
645 char * x_shorter; /* We also transform a substring of 'longer' */
646 Size_t x_len_shorter;
647
a4a439fb 648 /* _mem_collxfrm() is used get the transformation (though here we
79f120c8
KW
649 * are interested only in its length). It is used because it has
650 * the intelligence to handle all cases, but to work, it needs some
651 * values of 'm' and 'b' to get it started. For the purposes of
652 * this calculation we use a very conservative estimate of 'm' and
653 * 'b'. This assumes a weight can be multiple bytes, enough to
654 * hold any UV on the platform, and there are 5 levels, 4 weight
655 * bytes, and a trailing NUL. */
656 PL_collxfrm_base = 5;
657 PL_collxfrm_mult = 5 * sizeof(UV);
658
659 /* Find out how long the transformation really is */
a4a439fb
KW
660 x_longer = _mem_collxfrm(longer,
661 sizeof(longer) - 1,
662 &x_len_longer,
663
664 /* We avoid converting to UTF-8 in the
665 * called function by telling it the
666 * string is in UTF-8 if the locale is a
667 * UTF-8 one. Since the string passed
668 * here is invariant under UTF-8, we can
669 * claim it's UTF-8 even though it isn't.
670 * */
671 PL_in_utf8_COLLATE_locale);
79f120c8
KW
672 Safefree(x_longer);
673
674 /* Find out how long the transformation of a substring of 'longer'
675 * is. Together the lengths of these transformations are
676 * sufficient to calculate 'm' and 'b'. The substring is all of
677 * 'longer' except the first character. This minimizes the chances
678 * of being swayed by outliers */
a4a439fb 679 x_shorter = _mem_collxfrm(longer + 1,
79f120c8 680 sizeof(longer) - 2,
a4a439fb
KW
681 &x_len_shorter,
682 PL_in_utf8_COLLATE_locale);
79f120c8
KW
683 Safefree(x_shorter);
684
685 /* If the results are nonsensical for this simple test, the whole
686 * locale definition is suspect. Mark it so that locale collation
687 * is not active at all for it. XXX Should we warn? */
688 if ( x_len_shorter == 0
689 || x_len_longer == 0
690 || x_len_shorter >= x_len_longer)
691 {
692 PL_collxfrm_mult = 0;
693 PL_collxfrm_base = 0;
694 }
695 else {
696 SSize_t base; /* Temporary */
697
698 /* We have both: m * strlen(longer) + b = x_len_longer
699 * m * strlen(shorter) + b = x_len_shorter;
700 * subtracting yields:
701 * m * (strlen(longer) - strlen(shorter))
702 * = x_len_longer - x_len_shorter
703 * But we have set things up so that 'shorter' is 1 byte smaller
704 * than 'longer'. Hence:
705 * m = x_len_longer - x_len_shorter
706 *
707 * But if something went wrong, make sure the multiplier is at
708 * least 1.
709 */
710 if (x_len_longer > x_len_shorter) {
711 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
712 }
713 else {
714 PL_collxfrm_mult = 1;
715 }
716
717 /* mx + b = len
718 * so: b = len - mx
719 * but in case something has gone wrong, make sure it is
720 * non-negative */
721 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
722 if (base < 0) {
723 base = 0;
724 }
725
726 /* Add 1 for the trailing NUL */
727 PL_collxfrm_base = base + 1;
728 }
58eebef2 729
7d4bcc4a
KW
730# ifdef DEBUGGING
731
58eebef2
KW
732 if (DEBUG_L_TEST || debug_initialization) {
733 PerlIO_printf(Perl_debug_log,
b07929e4
KW
734 "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
735 "x_len_longer=%zu,"
736 " collate multipler=%zu, collate base=%zu\n",
58eebef2
KW
737 __FILE__, __LINE__,
738 PL_in_utf8_COLLATE_locale,
739 x_len_shorter, x_len_longer,
740 PL_collxfrm_mult, PL_collxfrm_base);
741 }
7d4bcc4a
KW
742# endif
743
98994639
HS
744 }
745 }
746
747#endif /* USE_LOCALE_COLLATE */
7d4bcc4a 748
98994639
HS
749}
750
d2b24094 751#ifdef WIN32
b8cc575c 752
a4f00dcc 753STATIC char *
b8cc575c 754S_win32_setlocale(pTHX_ int category, const char* locale)
b385bb4d
KW
755{
756 /* This, for Windows, emulates POSIX setlocale() behavior. There is no
7d4bcc4a
KW
757 * difference between the two unless the input locale is "", which normally
758 * means on Windows to get the machine default, which is set via the
759 * computer's "Regional and Language Options" (or its current equivalent).
760 * In POSIX, it instead means to find the locale from the user's
761 * environment. This routine changes the Windows behavior to first look in
762 * the environment, and, if anything is found, use that instead of going to
763 * the machine default. If there is no environment override, the machine
764 * default is used, by calling the real setlocale() with "".
765 *
766 * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
767 * use the particular category's variable if set; otherwise to use the LANG
768 * variable. */
b385bb4d 769
175c4cf9 770 bool override_LC_ALL = FALSE;
89f7b9aa
KW
771 char * result;
772
b385bb4d 773 if (locale && strEQ(locale, "")) {
7d4bcc4a
KW
774
775# ifdef LC_ALL
776
b385bb4d
KW
777 locale = PerlEnv_getenv("LC_ALL");
778 if (! locale) {
7d4bcc4a
KW
779
780# endif
b385bb4d 781 switch (category) {
7d4bcc4a
KW
782
783# ifdef LC_ALL
b385bb4d 784 case LC_ALL:
481465ea 785 override_LC_ALL = TRUE;
b385bb4d 786 break; /* We already know its variable isn't set */
7d4bcc4a
KW
787
788# endif
789# ifdef USE_LOCALE_TIME
790
b385bb4d
KW
791 case LC_TIME:
792 locale = PerlEnv_getenv("LC_TIME");
793 break;
7d4bcc4a
KW
794
795# endif
796# ifdef USE_LOCALE_CTYPE
797
b385bb4d
KW
798 case LC_CTYPE:
799 locale = PerlEnv_getenv("LC_CTYPE");
800 break;
7d4bcc4a
KW
801
802# endif
803# ifdef USE_LOCALE_COLLATE
804
b385bb4d
KW
805 case LC_COLLATE:
806 locale = PerlEnv_getenv("LC_COLLATE");
807 break;
7d4bcc4a
KW
808
809# endif
810# ifdef USE_LOCALE_MONETARY
811
b385bb4d
KW
812 case LC_MONETARY:
813 locale = PerlEnv_getenv("LC_MONETARY");
814 break;
7d4bcc4a
KW
815
816# endif
817# ifdef USE_LOCALE_NUMERIC
818
b385bb4d
KW
819 case LC_NUMERIC:
820 locale = PerlEnv_getenv("LC_NUMERIC");
821 break;
7d4bcc4a
KW
822
823# endif
824# ifdef USE_LOCALE_MESSAGES
825
b385bb4d
KW
826 case LC_MESSAGES:
827 locale = PerlEnv_getenv("LC_MESSAGES");
828 break;
7d4bcc4a
KW
829
830# endif
831
b385bb4d
KW
832 default:
833 /* This is a category, like PAPER_SIZE that we don't
834 * know about; and so can't provide a wrapper. */
835 break;
836 }
837 if (! locale) {
838 locale = PerlEnv_getenv("LANG");
481465ea 839 if (! locale) {
b385bb4d
KW
840 locale = "";
841 }
842 }
7d4bcc4a
KW
843
844# ifdef LC_ALL
845
b385bb4d 846 }
7d4bcc4a
KW
847
848# endif
849
b385bb4d
KW
850 }
851
89f7b9aa 852 result = setlocale(category, locale);
bbc98134 853 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
a4f00dcc 854 setlocale_debug_string(category, locale, result)));
89f7b9aa 855
481465ea 856 if (! override_LC_ALL) {
89f7b9aa
KW
857 return result;
858 }
859
dfd77d7a 860 /* Here the input category was LC_ALL, and we have set it to what is in the
481465ea
KW
861 * LANG variable or the system default if there is no LANG. But these have
862 * lower priority than the other LC_foo variables, so override it for each
863 * one that is set. (If they are set to "", it means to use the same thing
864 * we just set LC_ALL to, so can skip) */
7d4bcc4a
KW
865
866# ifdef USE_LOCALE_TIME
867
89f7b9aa 868 result = PerlEnv_getenv("LC_TIME");
730252b2 869 if (result && strNE(result, "")) {
89f7b9aa 870 setlocale(LC_TIME, result);
bbc98134
KW
871 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
872 __FILE__, __LINE__,
a4f00dcc 873 setlocale_debug_string(LC_TIME, result, "not captured")));
89f7b9aa 874 }
7d4bcc4a
KW
875
876# endif
877# ifdef USE_LOCALE_CTYPE
878
89f7b9aa 879 result = PerlEnv_getenv("LC_CTYPE");
730252b2 880 if (result && strNE(result, "")) {
89f7b9aa 881 setlocale(LC_CTYPE, result);
bbc98134
KW
882 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
883 __FILE__, __LINE__,
a4f00dcc 884 setlocale_debug_string(LC_CTYPE, result, "not captured")));
89f7b9aa 885 }
7d4bcc4a
KW
886
887# endif
888# ifdef USE_LOCALE_COLLATE
889
89f7b9aa 890 result = PerlEnv_getenv("LC_COLLATE");
730252b2 891 if (result && strNE(result, "")) {
89f7b9aa 892 setlocale(LC_COLLATE, result);
bbc98134
KW
893 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
894 __FILE__, __LINE__,
a4f00dcc 895 setlocale_debug_string(LC_COLLATE, result, "not captured")));
89f7b9aa 896 }
7d4bcc4a
KW
897
898# endif
899# ifdef USE_LOCALE_MONETARY
900
89f7b9aa 901 result = PerlEnv_getenv("LC_MONETARY");
730252b2 902 if (result && strNE(result, "")) {
89f7b9aa 903 setlocale(LC_MONETARY, result);
bbc98134
KW
904 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
905 __FILE__, __LINE__,
a4f00dcc 906 setlocale_debug_string(LC_MONETARY, result, "not captured")));
89f7b9aa 907 }
7d4bcc4a
KW
908
909# endif
910# ifdef USE_LOCALE_NUMERIC
911
89f7b9aa 912 result = PerlEnv_getenv("LC_NUMERIC");
730252b2 913 if (result && strNE(result, "")) {
89f7b9aa 914 setlocale(LC_NUMERIC, result);
bbc98134
KW
915 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
916 __FILE__, __LINE__,
a4f00dcc 917 setlocale_debug_string(LC_NUMERIC, result, "not captured")));
89f7b9aa 918 }
7d4bcc4a
KW
919
920# endif
921# ifdef USE_LOCALE_MESSAGES
922
89f7b9aa 923 result = PerlEnv_getenv("LC_MESSAGES");
730252b2 924 if (result && strNE(result, "")) {
89f7b9aa 925 setlocale(LC_MESSAGES, result);
bbc98134
KW
926 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
927 __FILE__, __LINE__,
a4f00dcc 928 setlocale_debug_string(LC_MESSAGES, result, "not captured")));
89f7b9aa 929 }
7d4bcc4a
KW
930
931# endif
89f7b9aa 932
bbc98134
KW
933 result = setlocale(LC_ALL, NULL);
934 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
935 __FILE__, __LINE__,
a4f00dcc 936 setlocale_debug_string(LC_ALL, NULL, result)));
89f7b9aa 937
bbc98134 938 return result;
b385bb4d
KW
939}
940
941#endif
942
a4f00dcc
KW
943char *
944Perl_setlocale(int category, const char * locale)
945{
946 /* This wraps POSIX::setlocale() */
947
948 char * retval;
949 dTHX;
950
a4f00dcc
KW
951#ifdef USE_LOCALE_NUMERIC
952
953 /* A NULL locale means only query what the current one is. We
954 * have the LC_NUMERIC name saved, because we are normally switched
955 * into the C locale for it. Switch back so an LC_ALL query will yield
956 * the correct results; all other categories don't require special
957 * handling */
958 if (locale == NULL) {
959 if (category == LC_NUMERIC) {
960 return savepv(PL_numeric_name);
961 }
962
7d4bcc4a 963# ifdef LC_ALL
a4f00dcc
KW
964
965 else if (category == LC_ALL) {
966 SET_NUMERIC_UNDERLYING();
967 }
968
7d4bcc4a 969# endif
a4f00dcc
KW
970
971 }
972
973#endif
974
d2b24094 975 retval = do_setlocale_r(category, locale);
a4f00dcc
KW
976
977 DEBUG_L(PerlIO_printf(Perl_debug_log,
978 "%s:%d: %s\n", __FILE__, __LINE__,
979 setlocale_debug_string(category, locale, retval)));
980 if (! retval) {
981 /* Should never happen that a query would return an error, but be
982 * sure and reset to C locale */
983 if (locale == 0) {
984 SET_NUMERIC_STANDARD();
985 }
7d4bcc4a 986
a4f00dcc
KW
987 return NULL;
988 }
989
990 /* Save retval since subsequent setlocale() calls may overwrite it. */
991 retval = savepv(retval);
992
993 /* If locale == NULL, we are just querying the state, but may have switched
994 * to NUMERIC_UNDERLYING. Switch back before returning. */
995 if (locale == NULL) {
996 SET_NUMERIC_STANDARD();
997 return retval;
998 }
999 else { /* Now that have switched locales, we have to update our records to
1000 correspond */
1001
1002#ifdef USE_LOCALE_CTYPE
1003
1004 if ( category == LC_CTYPE
1005
7d4bcc4a 1006# ifdef LC_ALL
a4f00dcc
KW
1007
1008 || category == LC_ALL
1009
7d4bcc4a 1010# endif
a4f00dcc
KW
1011
1012 )
1013 {
1014 char *newctype;
1015
7d4bcc4a 1016# ifdef LC_ALL
a4f00dcc
KW
1017
1018 if (category == LC_ALL) {
837ce802 1019 newctype = do_setlocale_c(LC_CTYPE, NULL);
a4f00dcc
KW
1020 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1021 "%s:%d: %s\n", __FILE__, __LINE__,
1022 setlocale_debug_string(LC_CTYPE, NULL, newctype)));
1023 }
1024 else
1025
7d4bcc4a 1026# endif
a4f00dcc
KW
1027
1028 newctype = retval;
1029 new_ctype(newctype);
1030 }
1031
1032#endif /* USE_LOCALE_CTYPE */
a4f00dcc
KW
1033#ifdef USE_LOCALE_COLLATE
1034
1035 if ( category == LC_COLLATE
1036
7d4bcc4a 1037# ifdef LC_ALL
a4f00dcc
KW
1038
1039 || category == LC_ALL
1040
7d4bcc4a 1041# endif
a4f00dcc
KW
1042
1043 )
1044 {
1045 char *newcoll;
1046
7d4bcc4a 1047# ifdef LC_ALL
a4f00dcc
KW
1048
1049 if (category == LC_ALL) {
837ce802 1050 newcoll = do_setlocale_c(LC_COLLATE, NULL);
a4f00dcc
KW
1051 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1052 "%s:%d: %s\n", __FILE__, __LINE__,
1053 setlocale_debug_string(LC_COLLATE, NULL, newcoll)));
1054 }
1055 else
1056
7d4bcc4a 1057# endif
a4f00dcc
KW
1058
1059 newcoll = retval;
1060 new_collate(newcoll);
1061 }
1062
1063#endif /* USE_LOCALE_COLLATE */
a4f00dcc
KW
1064#ifdef USE_LOCALE_NUMERIC
1065
1066 if ( category == LC_NUMERIC
1067
7d4bcc4a 1068# ifdef LC_ALL
a4f00dcc
KW
1069
1070 || category == LC_ALL
1071
7d4bcc4a 1072# endif
a4f00dcc
KW
1073
1074 )
1075 {
1076 char *newnum;
1077
7d4bcc4a 1078# ifdef LC_ALL
a4f00dcc
KW
1079
1080 if (category == LC_ALL) {
837ce802 1081 newnum = do_setlocale_c(LC_NUMERIC, NULL);
a4f00dcc
KW
1082 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1083 "%s:%d: %s\n", __FILE__, __LINE__,
1084 setlocale_debug_string(LC_NUMERIC, NULL, newnum)));
1085 }
1086 else
1087
7d4bcc4a 1088# endif
a4f00dcc
KW
1089
1090 newnum = retval;
1091 new_numeric(newnum);
1092 }
1093
1094#endif /* USE_LOCALE_NUMERIC */
1095
1096 }
1097
1098 return retval;
1099
f7416781
KW
1100
1101}
1102
1103PERL_STATIC_INLINE const char *
1104S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset)
1105{
1106 /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size',
1107 * growing it if necessary */
1108
1109 const Size_t string_size = strlen(string) + offset + 1;
1110
1111 PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
1112
1113 if (*buf_size == 0) {
1114 Newx(*buf, string_size, char);
1115 *buf_size = string_size;
1116 }
1117 else if (string_size > *buf_size) {
1118 Renew(*buf, string_size, char);
1119 *buf_size = string_size;
1120 }
1121
1122 Copy(string, *buf + offset, string_size - offset, char);
1123 return *buf;
1124}
1125
1126/*
1127
1128=head1 Locale-related functions and macros
1129
1130=for apidoc Perl_langinfo
1131
7d4bcc4a 1132This is an (almost ª) drop-in replacement for the system C<L<nl_langinfo(3)>>,
f7416781
KW
1133taking the same C<item> parameter values, and returning the same information.
1134But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
1135of Perl's locale handling from your code, and can be used on systems that lack
1136a native C<nl_langinfo>.
1137
1138Expanding on these:
1139
1140=over
1141
1142=item *
1143
1144It delivers the correct results for the C<RADIXCHAR> and C<THOUSESEP> items,
1145without you having to write extra code. The reason for the extra code would be
1146because these are from the C<LC_NUMERIC> locale category, which is normally
1147kept set to the C locale by Perl, no matter what the underlying locale is
1148supposed to be, and so to get the expected results, you have to temporarily
1149toggle into the underlying locale, and later toggle back. (You could use
1150plain C<nl_langinfo> and C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this
1151but then you wouldn't get the other advantages of C<Perl_langinfo()>; not
1152keeping C<LC_NUMERIC> in the C locale would break a lot of CPAN, which is
1153expecting the radix (decimal point) character to be a dot.)
1154
1155=item *
1156
1157Depending on C<item>, it works on systems that don't have C<nl_langinfo>, hence
1158makes your code more portable. Of the fifty-some possible items specified by
1159the POSIX 2008 standard,
1160L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
1161only two are completely unimplemented. It uses various techniques to recover
1162the other items, including calling C<L<localeconv(3)>>, and C<L<strftime(3)>>,
1163both of which are specified in C89, so should be always be available. Later
1164C<strftime()> versions have additional capabilities; C<""> is returned for
1165those not available on your system.
1166
1167The details for those items which may differ from what this emulation returns
1168and what a native C<nl_langinfo()> would return are:
1169
1170=over
1171
1172=item C<CODESET>
1173
1174=item C<ERA>
1175
1176Unimplemented, so returns C<"">.
1177
1178=item C<YESEXPR>
1179
1180=item C<NOEXPR>
1181
1182Only the values for English are returned. Earlier POSIX standards also
1183specified C<YESSTR> and C<NOSTR>, but these have been removed from POSIX 2008,
1184and aren't supported by C<Perl_langinfo>.
1185
1186=item C<D_FMT>
1187
1188Always evaluates to C<%x>, the locale's appropriate date representation.
1189
1190=item C<T_FMT>
1191
1192Always evaluates to C<%X>, the locale's appropriate time representation.
1193
1194=item C<D_T_FMT>
1195
1196Always evaluates to C<%c>, the locale's appropriate date and time
1197representation.
1198
1199=item C<CRNCYSTR>
1200
1201The return may be incorrect for those rare locales where the currency symbol
1202replaces the radix character.
1203Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1204to work differently.
1205
1206=item C<ALT_DIGITS>
1207
1208Currently this gives the same results as Linux does.
1209Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1210to work differently.
1211
1212=item C<ERA_D_FMT>
1213
1214=item C<ERA_T_FMT>
1215
1216=item C<ERA_D_T_FMT>
1217
1218=item C<T_FMT_AMPM>
1219
1220These are derived by using C<strftime()>, and not all versions of that function
1221know about them. C<""> is returned for these on such systems.
1222
1223=back
1224
1225When using C<Perl_langinfo> on systems that don't have a native
1226C<nl_langinfo()>, you must
1227
1228 #include "perl_langinfo.h"
1229
1230before the C<perl.h> C<#include>. You can replace your C<langinfo.h>
1231C<#include> with this one. (Doing it this way keeps out the symbols that plain
1232C<langinfo.h> imports into the namespace for code that doesn't need it.)
1233
1234You also should not use the bare C<langinfo.h> item names, but should preface
1235them with C<PERL_>, so use C<PERL_RADIXCHAR> instead of plain C<RADIXCHAR>.
1236The C<PERL_I<foo>> versions will also work for this function on systems that do
1237have a native C<nl_langinfo>.
1238
1239=item *
1240
1241It is thread-friendly, returning its result in a buffer that won't be
1242overwritten by another thread, so you don't have to code for that possibility.
1243The buffer can be overwritten by the next call to C<nl_langinfo> or
1244C<Perl_langinfo> in the same thread.
1245
1246=item *
1247
7d4bcc4a 1248ª It returns S<C<const char *>>, whereas plain C<nl_langinfo()> returns S<C<char
f7416781
KW
1249*>>, but you are (only by documentation) forbidden to write into the buffer.
1250By declaring this C<const>, the compiler enforces this restriction. The extra
1251C<const> is why this isn't an unequivocal drop-in replacement for
1252C<nl_langinfo>.
1253
1254=back
1255
1256The original impetus for C<Perl_langinfo()> was so that code that needs to
1257find out the current currency symbol, floating point radix character, or digit
1258grouping separator can use, on all systems, the simpler and more
1259thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
1260pain to make thread-friendly. For other fields returned by C<localeconv>, it
1261is better to use the methods given in L<perlcall> to call
1262L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
1263
1264=cut
1265
1266*/
1267
1268const char *
1269#ifdef HAS_NL_LANGINFO
1270Perl_langinfo(const nl_item item)
1271#else
1272Perl_langinfo(const int item)
1273#endif
1274{
f61748ac
KW
1275 return my_nl_langinfo(item, TRUE);
1276}
1277
1278const char *
1279#ifdef HAS_NL_LANGINFO
1280S_my_nl_langinfo(const nl_item item, bool toggle)
1281#else
1282S_my_nl_langinfo(const int item, bool toggle)
1283#endif
1284{
ae74815b 1285 dTHX;
f7416781 1286
ab340fff
KW
1287#if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
1288#if ! defined(HAS_POSIX_2008_LOCALE)
f7416781 1289
ab340fff 1290 /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
ae74815b
KW
1291 * for those items dependent on it. This must be copied to a buffer before
1292 * switching back, as some systems destroy the buffer when setlocale() is
1293 * called */
f7416781
KW
1294
1295 LOCALE_LOCK;
1296
1297 if (toggle) {
1298 if (item == PERL_RADIXCHAR || item == PERL_THOUSEP) {
837ce802 1299 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
f7416781
KW
1300 }
1301 else {
1302 toggle = FALSE;
1303 }
1304 }
1305
1306 save_to_buffer(nl_langinfo(item), &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1307
1308 if (toggle) {
837ce802 1309 do_setlocale_c(LC_NUMERIC, "C");
f7416781
KW
1310 }
1311
1312 LOCALE_UNLOCK;
1313
1314 return PL_langinfo_buf;
1315
ab340fff
KW
1316# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */
1317
1318 bool do_free = FALSE;
1319 locale_t cur = uselocale((locale_t) 0);
1320
1321 if (cur == LC_GLOBAL_LOCALE) {
1322 cur = duplocale(LC_GLOBAL_LOCALE);
1323 do_free = TRUE;
1324 }
1325
1326 if ( toggle
1327 && (item == PERL_RADIXCHAR || item == PERL_THOUSEP))
1328 {
1329 cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
1330 do_free = TRUE;
1331 }
1332
1333 save_to_buffer(nl_langinfo_l(item, cur),
1334 &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1335 if (do_free) {
1336 freelocale(cur);
1337 }
1338
1339 return PL_langinfo_buf;
1340
1341# endif
f7416781 1342#else /* Below, emulate nl_langinfo as best we can */
f7416781
KW
1343# ifdef HAS_LOCALECONV
1344
1345 const struct lconv* lc;
1346
1347# endif
1348# ifdef HAS_STRFTIME
1349
1350 struct tm tm;
1351 bool return_format = FALSE; /* Return the %format, not the value */
1352 const char * format;
1353
1354# endif
1355
1356 /* We copy the results to a per-thread buffer, even if not multi-threaded.
1357 * This is in part to simplify this code, and partly because we need a
1358 * buffer anyway for strftime(), and partly because a call of localeconv()
1359 * could otherwise wipe out the buffer, and the programmer would not be
1360 * expecting this, as this is a nl_langinfo() substitute after all, so s/he
1361 * might be thinking their localeconv() is safe until another localeconv()
1362 * call. */
1363
1364 switch (item) {
1365 Size_t len;
1366 const char * retval;
1367
1368 /* These 2 are unimplemented */
1369 case PERL_CODESET:
1370 case PERL_ERA: /* For use with strftime() %E modifier */
1371
1372 default:
1373 return "";
1374
1375 /* We use only an English set, since we don't know any more */
1376 case PERL_YESEXPR: return "^[+1yY]";
1377 case PERL_NOEXPR: return "^[-0nN]";
1378
1379# ifdef HAS_LOCALECONV
1380
1381 case PERL_CRNCYSTR:
1382
1383 LOCALE_LOCK;
1384
1385 lc = localeconv();
1386 if (! lc || ! lc->currency_symbol || strEQ("", lc->currency_symbol))
1387 {
1388 LOCALE_UNLOCK;
1389 return "";
1390 }
1391
1392 /* Leave the first spot empty to be filled in below */
1393 save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
1394 &PL_langinfo_bufsize, 1);
1395 if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
1396 { /* khw couldn't figure out how the localedef specifications
1397 would show that the $ should replace the radix; this is
1398 just a guess as to how it might work.*/
1399 *PL_langinfo_buf = '.';
1400 }
1401 else if (lc->p_cs_precedes) {
1402 *PL_langinfo_buf = '-';
1403 }
1404 else {
1405 *PL_langinfo_buf = '+';
1406 }
1407
1408 LOCALE_UNLOCK;
1409 break;
1410
1411 case PERL_RADIXCHAR:
1412 case PERL_THOUSEP:
1413
1414 LOCALE_LOCK;
1415
1416 if (toggle) {
837ce802 1417 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
f7416781
KW
1418 }
1419
1420 lc = localeconv();
1421 if (! lc) {
1422 retval = "";
1423 }
1424 else switch (item) {
1425 case PERL_RADIXCHAR:
1426 if (! lc->decimal_point) {
1427 retval = "";
1428 }
1429 else {
1430 retval = lc->decimal_point;
1431 }
1432 break;
1433
1434 case PERL_THOUSEP:
1435 if (! lc->thousands_sep || strEQ("", lc->thousands_sep)) {
1436 retval = "";
1437 }
1438 else {
1439 retval = lc->thousands_sep;
1440 }
1441 break;
1442
1443 default:
1444 LOCALE_UNLOCK;
1445 Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
1446 __FILE__, __LINE__, item);
1447 }
1448
1449 save_to_buffer(retval, &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1450
1451 if (toggle) {
837ce802 1452 do_setlocale_c(LC_NUMERIC, "C");
f7416781
KW
1453 }
1454
1455 LOCALE_UNLOCK;
1456
1457 break;
1458
1459# endif
1460# ifdef HAS_STRFTIME
1461
1462 /* These are defined by C89, so we assume that strftime supports them,
1463 * and so are returned unconditionally; they may not be what the locale
1464 * actually says, but should give good enough results for someone using
1465 * them as formats (as opposed to trying to parse them to figure out
7d4bcc4a 1466 * what the locale says). The other format items are actually tested to
f7416781
KW
1467 * verify they work on the platform */
1468 case PERL_D_FMT: return "%x";
1469 case PERL_T_FMT: return "%X";
1470 case PERL_D_T_FMT: return "%c";
1471
1472 /* These formats are only available in later strfmtime's */
1473 case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT:
1474 case PERL_T_FMT_AMPM:
1475
1476 /* The rest can be gotten from most versions of strftime(). */
1477 case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3:
1478 case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6:
1479 case PERL_ABDAY_7:
1480 case PERL_ALT_DIGITS:
1481 case PERL_AM_STR: case PERL_PM_STR:
1482 case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3:
1483 case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6:
1484 case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9:
1485 case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12:
1486 case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4:
1487 case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7:
1488 case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4:
1489 case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8:
1490 case PERL_MON_9: case PERL_MON_10: case PERL_MON_11: case PERL_MON_12:
1491
1492 LOCALE_LOCK;
1493
1494 init_tm(&tm); /* Precaution against core dumps */
1495 tm.tm_sec = 30;
1496 tm.tm_min = 30;
1497 tm.tm_hour = 6;
1498 tm.tm_year = 2017 - 1900;
1499 tm.tm_wday = 0;
1500 tm.tm_mon = 0;
1501 switch (item) {
1502 default:
1503 LOCALE_UNLOCK;
1504 Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
1505 __FILE__, __LINE__, item);
1506 NOT_REACHED; /* NOTREACHED */
1507
1508 case PERL_PM_STR: tm.tm_hour = 18;
1509 case PERL_AM_STR:
1510 format = "%p";
1511 break;
1512
1513 case PERL_ABDAY_7: tm.tm_wday++;
1514 case PERL_ABDAY_6: tm.tm_wday++;
1515 case PERL_ABDAY_5: tm.tm_wday++;
1516 case PERL_ABDAY_4: tm.tm_wday++;
1517 case PERL_ABDAY_3: tm.tm_wday++;
1518 case PERL_ABDAY_2: tm.tm_wday++;
1519 case PERL_ABDAY_1:
1520 format = "%a";
1521 break;
1522
1523 case PERL_DAY_7: tm.tm_wday++;
1524 case PERL_DAY_6: tm.tm_wday++;
1525 case PERL_DAY_5: tm.tm_wday++;
1526 case PERL_DAY_4: tm.tm_wday++;
1527 case PERL_DAY_3: tm.tm_wday++;
1528 case PERL_DAY_2: tm.tm_wday++;
1529 case PERL_DAY_1:
1530 format = "%A";
1531 break;
1532
1533 case PERL_ABMON_12: tm.tm_mon++;
1534 case PERL_ABMON_11: tm.tm_mon++;
1535 case PERL_ABMON_10: tm.tm_mon++;
1536 case PERL_ABMON_9: tm.tm_mon++;
1537 case PERL_ABMON_8: tm.tm_mon++;
1538 case PERL_ABMON_7: tm.tm_mon++;
1539 case PERL_ABMON_6: tm.tm_mon++;
1540 case PERL_ABMON_5: tm.tm_mon++;
1541 case PERL_ABMON_4: tm.tm_mon++;
1542 case PERL_ABMON_3: tm.tm_mon++;
1543 case PERL_ABMON_2: tm.tm_mon++;
1544 case PERL_ABMON_1:
1545 format = "%b";
1546 break;
1547
1548 case PERL_MON_12: tm.tm_mon++;
1549 case PERL_MON_11: tm.tm_mon++;
1550 case PERL_MON_10: tm.tm_mon++;
1551 case PERL_MON_9: tm.tm_mon++;
1552 case PERL_MON_8: tm.tm_mon++;
1553 case PERL_MON_7: tm.tm_mon++;
1554 case PERL_MON_6: tm.tm_mon++;
1555 case PERL_MON_5: tm.tm_mon++;
1556 case PERL_MON_4: tm.tm_mon++;
1557 case PERL_MON_3: tm.tm_mon++;
1558 case PERL_MON_2: tm.tm_mon++;
1559 case PERL_MON_1:
1560 format = "%B";
1561 break;
1562
1563 case PERL_T_FMT_AMPM:
1564 format = "%r";
1565 return_format = TRUE;
1566 break;
1567
1568 case PERL_ERA_D_FMT:
1569 format = "%Ex";
1570 return_format = TRUE;
1571 break;
1572
1573 case PERL_ERA_T_FMT:
1574 format = "%EX";
1575 return_format = TRUE;
1576 break;
1577
1578 case PERL_ERA_D_T_FMT:
1579 format = "%Ec";
1580 return_format = TRUE;
1581 break;
1582
1583 case PERL_ALT_DIGITS:
1584 tm.tm_wday = 0;
1585 format = "%Ow"; /* Find the alternate digit for 0 */
1586 break;
1587 }
1588
1589 /* We can't use my_strftime() because it doesn't look at tm_wday */
1590 while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
1591 format, &tm))
1592 {
1593 /* A zero return means one of:
1594 * a) there wasn't enough space in PL_langinfo_buf
1595 * b) the format, like a plain %p, returns empty
1596 * c) it was an illegal format, though some implementations of
1597 * strftime will just return the illegal format as a plain
1598 * character sequence.
1599 *
1600 * To quickly test for case 'b)', try again but precede the
1601 * format with a plain character. If that result is still
1602 * empty, the problem is either 'a)' or 'c)' */
1603
1604 Size_t format_size = strlen(format) + 1;
1605 Size_t mod_size = format_size + 1;
1606 char * mod_format;
1607 char * temp_result;
1608
1609 Newx(mod_format, mod_size, char);
1610 Newx(temp_result, PL_langinfo_bufsize, char);
1611 *mod_format = '\a';
1612 my_strlcpy(mod_format + 1, format, mod_size);
1613 len = strftime(temp_result,
1614 PL_langinfo_bufsize,
1615 mod_format, &tm);
1616 Safefree(mod_format);
1617 Safefree(temp_result);
1618
1619 /* If 'len' is non-zero, it means that we had a case like %p
1620 * which means the current locale doesn't use a.m. or p.m., and
1621 * that is valid */
1622 if (len == 0) {
1623
1624 /* Here, still didn't work. If we get well beyond a
1625 * reasonable size, bail out to prevent an infinite loop. */
1626
1627 if (PL_langinfo_bufsize > 100 * format_size) {
1628 *PL_langinfo_buf = '\0';
1629 }
1630 else { /* Double the buffer size to retry; Add 1 in case
1631 original was 0, so we aren't stuck at 0. */
1632 PL_langinfo_bufsize *= 2;
1633 PL_langinfo_bufsize++;
1634 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
1635 continue;
1636 }
1637 }
1638
1639 break;
1640 }
1641
1642 /* Here, we got a result.
1643 *
1644 * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
1645 * alternate format for wday 0. If the value is the same as the
1646 * normal 0, there isn't an alternate, so clear the buffer. */
1647 if ( item == PERL_ALT_DIGITS
1648 && strEQ(PL_langinfo_buf, "0"))
1649 {
1650 *PL_langinfo_buf = '\0';
1651 }
1652
1653 /* ALT_DIGITS is problematic. Experiments on it showed that
1654 * strftime() did not always work properly when going from alt-9 to
1655 * alt-10. Only a few locales have this item defined, and in all
1656 * of them on Linux that khw was able to find, nl_langinfo() merely
1657 * returned the alt-0 character, possibly doubled. Most Unicode
1658 * digits are in blocks of 10 consecutive code points, so that is
1659 * sufficient information for those scripts, as we can infer alt-1,
1660 * alt-2, .... But for a Japanese locale, a CJK ideographic 0 is
1661 * returned, and the CJK digits are not in code point order, so you
1662 * can't really infer anything. The localedef for this locale did
1663 * specify the succeeding digits, so that strftime() works properly
1664 * on them, without needing to infer anything. But the
1665 * nl_langinfo() return did not give sufficient information for the
1666 * caller to understand what's going on. So until there is
1667 * evidence that it should work differently, this returns the alt-0
1668 * string for ALT_DIGITS.
1669 *
1670 * wday was chosen because its range is all a single digit. Things
1671 * like tm_sec have two digits as the minimum: '00' */
1672
1673 LOCALE_UNLOCK;
1674
1675 /* If to return the format, not the value, overwrite the buffer
1676 * with it. But some strftime()s will keep the original format if
1677 * illegal, so change those to "" */
1678 if (return_format) {
1679 if (strEQ(PL_langinfo_buf, format)) {
1680 *PL_langinfo_buf = '\0';
1681 }
1682 else {
1683 save_to_buffer(format, &PL_langinfo_buf,
1684 &PL_langinfo_bufsize, 0);
1685 }
1686 }
1687
1688 break;
1689
1690# endif
1691
1692 }
1693
1694 return PL_langinfo_buf;
1695
1696#endif
1697
a4f00dcc 1698}
b385bb4d 1699
98994639
HS
1700/*
1701 * Initialize locale awareness.
1702 */
1703int
1704Perl_init_i18nl10n(pTHX_ int printwarn)
1705{
0e92a118
KW
1706 /* printwarn is
1707 *
1708 * 0 if not to output warning when setup locale is bad
1709 * 1 if to output warning based on value of PERL_BADLANG
1710 * >1 if to output regardless of PERL_BADLANG
1711 *
1712 * returns
98994639 1713 * 1 = set ok or not applicable,
0e92a118
KW
1714 * 0 = fallback to a locale of lower priority
1715 * -1 = fallback to all locales failed, not even to the C locale
6b058d42
KW
1716 *
1717 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
1718 * set, debugging information is output.
1719 *
1720 * This looks more complicated than it is, mainly due to the #ifdefs.
1721 *
1722 * We try to set LC_ALL to the value determined by the environment. If
1723 * there is no LC_ALL on this platform, we try the individual categories we
1724 * know about. If this works, we are done.
1725 *
1726 * But if it doesn't work, we have to do something else. We search the
1727 * environment variables ourselves instead of relying on the system to do
1728 * it. We look at, in order, LC_ALL, LANG, a system default locale (if we
1729 * think there is one), and the ultimate fallback "C". This is all done in
1730 * the same loop as above to avoid duplicating code, but it makes things
7d4bcc4a
KW
1731 * more complex. The 'trial_locales' array is initialized with just one
1732 * element; it causes the behavior described in the paragraph above this to
1733 * happen. If that fails, we add elements to 'trial_locales', and do extra
1734 * loop iterations to cause the behavior described in this paragraph.
6b058d42
KW
1735 *
1736 * On Ultrix, the locale MUST come from the environment, so there is
1737 * preliminary code to set it. I (khw) am not sure that it is necessary,
1738 * and that this couldn't be folded into the loop, but barring any real
1739 * platforms to test on, it's staying as-is
1740 *
1741 * A slight complication is that in embedded Perls, the locale may already
1742 * be set-up, and we don't want to get it from the normal environment
1743 * variables. This is handled by having a special environment variable
1744 * indicate we're in this situation. We simply set setlocale's 2nd
1745 * parameter to be a NULL instead of "". That indicates to setlocale that
1746 * it is not to change anything, but to return the current value,
1747 * effectively initializing perl's db to what the locale already is.
1748 *
1749 * We play the same trick with NULL if a LC_ALL succeeds. We call
1750 * setlocale() on the individual categores with NULL to get their existing
1751 * values for our db, instead of trying to change them.
1752 * */
98994639 1753
0e92a118
KW
1754 int ok = 1;
1755
7d4bcc4a
KW
1756#ifndef USE_LOCALE
1757
1758 PERL_UNUSED_ARG(printwarn);
1759
1760#else /* USE_LOCALE */
1761# ifdef USE_LOCALE_CTYPE
1762
98994639 1763 char *curctype = NULL;
7d4bcc4a
KW
1764
1765# endif /* USE_LOCALE_CTYPE */
1766# ifdef USE_LOCALE_COLLATE
1767
98994639 1768 char *curcoll = NULL;
7d4bcc4a
KW
1769
1770# endif /* USE_LOCALE_COLLATE */
1771# ifdef USE_LOCALE_NUMERIC
1772
98994639 1773 char *curnum = NULL;
7d4bcc4a
KW
1774
1775# endif /* USE_LOCALE_NUMERIC */
1776# ifdef __GLIBC__
1777
175c4cf9 1778 const char * const language = savepv(PerlEnv_getenv("LANGUAGE"));
7d4bcc4a
KW
1779
1780# endif
65ebb059 1781
ccd65d51
KW
1782 /* NULL uses the existing already set up locale */
1783 const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
1784 ? NULL
1785 : "";
c3fcd832
KW
1786 const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */
1787 unsigned int trial_locales_count;
175c4cf9
KW
1788 const char * const lc_all = savepv(PerlEnv_getenv("LC_ALL"));
1789 const char * const lang = savepv(PerlEnv_getenv("LANG"));
98994639 1790 bool setlocale_failure = FALSE;
65ebb059 1791 unsigned int i;
175c4cf9
KW
1792
1793 /* A later getenv() could zap this, so only use here */
1794 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
1795
1796 const bool locwarn = (printwarn > 1
1797 || (printwarn
1798 && (! bad_lang_use_once
22ff3130
HS
1799 || (
1800 /* disallow with "" or "0" */
1801 *bad_lang_use_once
1802 && strNE("0", bad_lang_use_once)))));
0e92a118 1803 bool done = FALSE;
5d1187d1
KW
1804 char * sl_result; /* return from setlocale() */
1805 char * locale_param;
7d4bcc4a
KW
1806
1807# ifdef WIN32
1808
6bce99ee
JH
1809 /* In some systems you can find out the system default locale
1810 * and use that as the fallback locale. */
7d4bcc4a
KW
1811# define SYSTEM_DEFAULT_LOCALE
1812# endif
1813# ifdef SYSTEM_DEFAULT_LOCALE
1814
65ebb059 1815 const char *system_default_locale = NULL;
98994639 1816
7d4bcc4a
KW
1817# endif
1818# ifdef DEBUGGING
1819
8298454c 1820 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
7d4bcc4a
KW
1821
1822# define DEBUG_LOCALE_INIT(category, locale, result) \
2fcc0ca9
KW
1823 STMT_START { \
1824 if (debug_initialization) { \
1825 PerlIO_printf(Perl_debug_log, \
1826 "%s:%d: %s\n", \
1827 __FILE__, __LINE__, \
a4f00dcc 1828 setlocale_debug_string(category, \
2fcc0ca9
KW
1829 locale, \
1830 result)); \
1831 } \
1832 } STMT_END
2fcc0ca9 1833
7d4bcc4a
KW
1834# else
1835# define DEBUG_LOCALE_INIT(a,b,c)
1836# endif
1837
1838# ifndef LOCALE_ENVIRON_REQUIRED
1839
0e92a118 1840 PERL_UNUSED_VAR(done);
5d1187d1 1841 PERL_UNUSED_VAR(locale_param);
7d4bcc4a
KW
1842
1843# else
98994639
HS
1844
1845 /*
1846 * Ultrix setlocale(..., "") fails if there are no environment
1847 * variables from which to get a locale name.
1848 */
1849
7d4bcc4a
KW
1850# ifdef LC_ALL
1851
98994639 1852 if (lang) {
d2b24094 1853 sl_result = do_setlocale_c(LC_ALL, setlocale_init);
5d1187d1
KW
1854 DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result);
1855 if (sl_result)
98994639
HS
1856 done = TRUE;
1857 else
1858 setlocale_failure = TRUE;
1859 }
5d1187d1 1860 if (! setlocale_failure) {
7d4bcc4a
KW
1861
1862# ifdef USE_LOCALE_CTYPE
1863
5d1187d1
KW
1864 locale_param = (! done && (lang || PerlEnv_getenv("LC_CTYPE")))
1865 ? setlocale_init
1866 : NULL;
d2b24094 1867 curctype = do_setlocale_c(LC_CTYPE, locale_param);
5d1187d1
KW
1868 DEBUG_LOCALE_INIT(LC_CTYPE, locale_param, sl_result);
1869 if (! curctype)
98994639
HS
1870 setlocale_failure = TRUE;
1871 else
1872 curctype = savepv(curctype);
7d4bcc4a
KW
1873
1874# endif /* USE_LOCALE_CTYPE */
1875# ifdef USE_LOCALE_COLLATE
1876
5d1187d1
KW
1877 locale_param = (! done && (lang || PerlEnv_getenv("LC_COLLATE")))
1878 ? setlocale_init
1879 : NULL;
d2b24094 1880 curcoll = do_setlocale_c(LC_COLLATE, locale_param);
5d1187d1
KW
1881 DEBUG_LOCALE_INIT(LC_COLLATE, locale_param, sl_result);
1882 if (! curcoll)
98994639
HS
1883 setlocale_failure = TRUE;
1884 else
1885 curcoll = savepv(curcoll);
7d4bcc4a
KW
1886
1887# endif /* USE_LOCALE_COLLATE */
1888# ifdef USE_LOCALE_NUMERIC
1889
5d1187d1
KW
1890 locale_param = (! done && (lang || PerlEnv_getenv("LC_NUMERIC")))
1891 ? setlocale_init
1892 : NULL;
d2b24094 1893 curnum = do_setlocale_c(LC_NUMERIC, locale_param);
5d1187d1
KW
1894 DEBUG_LOCALE_INIT(LC_NUMERIC, locale_param, sl_result);
1895 if (! curnum)
98994639
HS
1896 setlocale_failure = TRUE;
1897 else
1898 curnum = savepv(curnum);
7d4bcc4a
KW
1899
1900# endif /* USE_LOCALE_NUMERIC */
1901# ifdef USE_LOCALE_MESSAGES
1902
5d1187d1
KW
1903 locale_param = (! done && (lang || PerlEnv_getenv("LC_MESSAGES")))
1904 ? setlocale_init
1905 : NULL;
d2b24094 1906 sl_result = do_setlocale_c(LC_MESSAGES, locale_param);
5d1187d1 1907 DEBUG_LOCALE_INIT(LC_MESSAGES, locale_param, sl_result);
9f42613c 1908 if (! sl_result) {
a782673d
KW
1909 setlocale_failure = TRUE;
1910 }
7d4bcc4a
KW
1911
1912# endif /* USE_LOCALE_MESSAGES */
1913# ifdef USE_LOCALE_MONETARY
1914
5d1187d1
KW
1915 locale_param = (! done && (lang || PerlEnv_getenv("LC_MONETARY")))
1916 ? setlocale_init
1917 : NULL;
d2b24094 1918 sl_result = do_setlocale_c(LC_MONETARY, locale_param);
5d1187d1
KW
1919 DEBUG_LOCALE_INIT(LC_MONETARY, locale_param, sl_result);
1920 if (! sl_result) {
c835d6be
KW
1921 setlocale_failure = TRUE;
1922 }
98994639 1923
7d4bcc4a 1924# endif /* USE_LOCALE_MONETARY */
98994639 1925
7d4bcc4a
KW
1926 }
1927
1928# endif /* LC_ALL */
1929# endif /* !LOCALE_ENVIRON_REQUIRED */
98994639 1930
65ebb059 1931 /* We try each locale in the list until we get one that works, or exhaust
20a240df
KW
1932 * the list. Normally the loop is executed just once. But if setting the
1933 * locale fails, inside the loop we add fallback trials to the array and so
1934 * will execute the loop multiple times */
c3fcd832
KW
1935 trial_locales[0] = setlocale_init;
1936 trial_locales_count = 1;
7d4bcc4a 1937
65ebb059
KW
1938 for (i= 0; i < trial_locales_count; i++) {
1939 const char * trial_locale = trial_locales[i];
1940
1941 if (i > 0) {
1942
1943 /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
1944 * when i==0, but I (khw) don't think that behavior makes much
1945 * sense */
1946 setlocale_failure = FALSE;
1947
7d4bcc4a
KW
1948# ifdef SYSTEM_DEFAULT_LOCALE
1949# ifdef WIN32
1950
65ebb059
KW
1951 /* On Windows machines, an entry of "" after the 0th means to use
1952 * the system default locale, which we now proceed to get. */
1953 if (strEQ(trial_locale, "")) {
1954 unsigned int j;
1955
1956 /* Note that this may change the locale, but we are going to do
1957 * that anyway just below */
837ce802 1958 system_default_locale = do_setlocale_c(LC_ALL, "");
5d1187d1 1959 DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
65ebb059 1960
7d4bcc4a 1961 /* Skip if invalid or if it's already on the list of locales to
65ebb059
KW
1962 * try */
1963 if (! system_default_locale) {
1964 goto next_iteration;
1965 }
1966 for (j = 0; j < trial_locales_count; j++) {
1967 if (strEQ(system_default_locale, trial_locales[j])) {
1968 goto next_iteration;
1969 }
1970 }
1971
1972 trial_locale = system_default_locale;
1973 }
7d4bcc4a
KW
1974# endif /* WIN32 */
1975# endif /* SYSTEM_DEFAULT_LOCALE */
65ebb059
KW
1976 }
1977
7d4bcc4a
KW
1978# ifdef LC_ALL
1979
d2b24094 1980 sl_result = do_setlocale_c(LC_ALL, trial_locale);
5d1187d1
KW
1981 DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result);
1982 if (! sl_result) {
49c85077 1983 setlocale_failure = TRUE;
7cd8b568
KW
1984 }
1985 else {
1986 /* Since LC_ALL succeeded, it should have changed all the other
1987 * categories it can to its value; so we massage things so that the
1988 * setlocales below just return their category's current values.
1989 * This adequately handles the case in NetBSD where LC_COLLATE may
1990 * not be defined for a locale, and setting it individually will
7d4bcc4a 1991 * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
7cd8b568
KW
1992 * the POSIX locale. */
1993 trial_locale = NULL;
1994 }
7d4bcc4a
KW
1995
1996# endif /* LC_ALL */
98994639 1997
49c85077 1998 if (!setlocale_failure) {
7d4bcc4a
KW
1999
2000# ifdef USE_LOCALE_CTYPE
2001
49c85077 2002 Safefree(curctype);
d2b24094 2003 curctype = do_setlocale_c(LC_CTYPE, trial_locale);
5d1187d1
KW
2004 DEBUG_LOCALE_INIT(LC_CTYPE, trial_locale, curctype);
2005 if (! curctype)
49c85077
KW
2006 setlocale_failure = TRUE;
2007 else
2008 curctype = savepv(curctype);
7d4bcc4a
KW
2009
2010# endif /* USE_LOCALE_CTYPE */
2011# ifdef USE_LOCALE_COLLATE
2012
49c85077 2013 Safefree(curcoll);
d2b24094 2014 curcoll = do_setlocale_c(LC_COLLATE, trial_locale);
5d1187d1
KW
2015 DEBUG_LOCALE_INIT(LC_COLLATE, trial_locale, curcoll);
2016 if (! curcoll)
49c85077
KW
2017 setlocale_failure = TRUE;
2018 else
2019 curcoll = savepv(curcoll);
7d4bcc4a
KW
2020
2021# endif /* USE_LOCALE_COLLATE */
2022# ifdef USE_LOCALE_NUMERIC
2023
49c85077 2024 Safefree(curnum);
d2b24094 2025 curnum = do_setlocale_c(LC_NUMERIC, trial_locale);
5d1187d1
KW
2026 DEBUG_LOCALE_INIT(LC_NUMERIC, trial_locale, curnum);
2027 if (! curnum)
49c85077
KW
2028 setlocale_failure = TRUE;
2029 else
2030 curnum = savepv(curnum);
7d4bcc4a
KW
2031
2032# endif /* USE_LOCALE_NUMERIC */
2033# ifdef USE_LOCALE_MESSAGES
2034
d2b24094 2035 sl_result = do_setlocale_c(LC_MESSAGES, trial_locale);
5d1187d1
KW
2036 DEBUG_LOCALE_INIT(LC_MESSAGES, trial_locale, sl_result);
2037 if (! (sl_result))
a782673d 2038 setlocale_failure = TRUE;
7d4bcc4a
KW
2039
2040# endif /* USE_LOCALE_MESSAGES */
2041# ifdef USE_LOCALE_MONETARY
2042
d2b24094 2043 sl_result = do_setlocale_c(LC_MONETARY, trial_locale);
5d1187d1
KW
2044 DEBUG_LOCALE_INIT(LC_MONETARY, trial_locale, sl_result);
2045 if (! (sl_result))
c835d6be 2046 setlocale_failure = TRUE;
7d4bcc4a
KW
2047
2048# endif /* USE_LOCALE_MONETARY */
c835d6be 2049
49c85077
KW
2050 if (! setlocale_failure) { /* Success */
2051 break;
2052 }
65ebb059 2053 }
98994639 2054
49c85077
KW
2055 /* Here, something failed; will need to try a fallback. */
2056 ok = 0;
65ebb059 2057
49c85077
KW
2058 if (i == 0) {
2059 unsigned int j;
98994639 2060
65ebb059 2061 if (locwarn) { /* Output failure info only on the first one */
7d4bcc4a
KW
2062
2063# ifdef LC_ALL
98994639 2064
49c85077
KW
2065 PerlIO_printf(Perl_error_log,
2066 "perl: warning: Setting locale failed.\n");
98994639 2067
7d4bcc4a 2068# else /* !LC_ALL */
98994639 2069
49c85077
KW
2070 PerlIO_printf(Perl_error_log,
2071 "perl: warning: Setting locale failed for the categories:\n\t");
7d4bcc4a
KW
2072
2073# ifdef USE_LOCALE_CTYPE
2074
49c85077
KW
2075 if (! curctype)
2076 PerlIO_printf(Perl_error_log, "LC_CTYPE ");
7d4bcc4a
KW
2077
2078# endif /* USE_LOCALE_CTYPE */
2079# ifdef USE_LOCALE_COLLATE
49c85077
KW
2080 if (! curcoll)
2081 PerlIO_printf(Perl_error_log, "LC_COLLATE ");
7d4bcc4a
KW
2082
2083# endif /* USE_LOCALE_COLLATE */
2084# ifdef USE_LOCALE_NUMERIC
2085
49c85077
KW
2086 if (! curnum)
2087 PerlIO_printf(Perl_error_log, "LC_NUMERIC ");
7d4bcc4a
KW
2088
2089# endif /* USE_LOCALE_NUMERIC */
2090
a782673d 2091 PerlIO_printf(Perl_error_log, "and possibly others\n");
98994639 2092
7d4bcc4a 2093# endif /* LC_ALL */
98994639 2094
49c85077
KW
2095 PerlIO_printf(Perl_error_log,
2096 "perl: warning: Please check that your locale settings:\n");
98994639 2097
7d4bcc4a
KW
2098# ifdef __GLIBC__
2099
49c85077
KW
2100 PerlIO_printf(Perl_error_log,
2101 "\tLANGUAGE = %c%s%c,\n",
2102 language ? '"' : '(',
2103 language ? language : "unset",
2104 language ? '"' : ')');
7d4bcc4a 2105# endif
98994639 2106
49c85077
KW
2107 PerlIO_printf(Perl_error_log,
2108 "\tLC_ALL = %c%s%c,\n",
2109 lc_all ? '"' : '(',
2110 lc_all ? lc_all : "unset",
2111 lc_all ? '"' : ')');
98994639 2112
7d4bcc4a
KW
2113# if defined(USE_ENVIRON_ARRAY)
2114
49c85077 2115 {
cd999af9 2116 char **e;
d5e32b93
KW
2117
2118 /* Look through the environment for any variables of the
2119 * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
2120 * already handled above. These are assumed to be locale
2121 * settings. Output them and their values. */
cd999af9 2122 for (e = environ; *e; e++) {
d5e32b93
KW
2123 const STRLEN prefix_len = sizeof("LC_") - 1;
2124 STRLEN uppers_len;
2125
cd999af9 2126 if ( strBEGINs(*e, "LC_")
c8b388b0 2127 && ! strBEGINs(*e, "LC_ALL=")
d5e32b93
KW
2128 && (uppers_len = strspn(*e + prefix_len,
2129 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
2130 && ((*e)[prefix_len + uppers_len] == '='))
cd999af9
KW
2131 {
2132 PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
d5e32b93
KW
2133 (int) (prefix_len + uppers_len), *e,
2134 *e + prefix_len + uppers_len + 1);
cd999af9
KW
2135 }
2136 }
49c85077 2137 }
7d4bcc4a
KW
2138
2139# else
2140
49c85077
KW
2141 PerlIO_printf(Perl_error_log,
2142 "\t(possibly more locale environment variables)\n");
7d4bcc4a
KW
2143
2144# endif
98994639 2145
49c85077
KW
2146 PerlIO_printf(Perl_error_log,
2147 "\tLANG = %c%s%c\n",
2148 lang ? '"' : '(',
2149 lang ? lang : "unset",
2150 lang ? '"' : ')');
98994639 2151
49c85077
KW
2152 PerlIO_printf(Perl_error_log,
2153 " are supported and installed on your system.\n");
2154 }
98994639 2155
65ebb059 2156 /* Calculate what fallback locales to try. We have avoided this
f6bab5f6 2157 * until we have to, because failure is quite unlikely. This will
65ebb059
KW
2158 * usually change the upper bound of the loop we are in.
2159 *
2160 * Since the system's default way of setting the locale has not
2161 * found one that works, We use Perl's defined ordering: LC_ALL,
2162 * LANG, and the C locale. We don't try the same locale twice, so
2163 * don't add to the list if already there. (On POSIX systems, the
2164 * LC_ALL element will likely be a repeat of the 0th element "",
6b058d42
KW
2165 * but there's no harm done by doing it explicitly.
2166 *
2167 * Note that this tries the LC_ALL environment variable even on
2168 * systems which have no LC_ALL locale setting. This may or may
2169 * not have been originally intentional, but there's no real need
2170 * to change the behavior. */
65ebb059
KW
2171 if (lc_all) {
2172 for (j = 0; j < trial_locales_count; j++) {
2173 if (strEQ(lc_all, trial_locales[j])) {
2174 goto done_lc_all;
2175 }
2176 }
2177 trial_locales[trial_locales_count++] = lc_all;
2178 }
2179 done_lc_all:
98994639 2180
65ebb059
KW
2181 if (lang) {
2182 for (j = 0; j < trial_locales_count; j++) {
2183 if (strEQ(lang, trial_locales[j])) {
2184 goto done_lang;
2185 }
2186 }
2187 trial_locales[trial_locales_count++] = lang;
2188 }
2189 done_lang:
2190
7d4bcc4a
KW
2191# if defined(WIN32) && defined(LC_ALL)
2192
65ebb059
KW
2193 /* For Windows, we also try the system default locale before "C".
2194 * (If there exists a Windows without LC_ALL we skip this because
2195 * it gets too complicated. For those, the "C" is the next
2196 * fallback possibility). The "" is the same as the 0th element of
2197 * the array, but the code at the loop above knows to treat it
2198 * differently when not the 0th */
2199 trial_locales[trial_locales_count++] = "";
7d4bcc4a
KW
2200
2201# endif
65ebb059
KW
2202
2203 for (j = 0; j < trial_locales_count; j++) {
2204 if (strEQ("C", trial_locales[j])) {
2205 goto done_C;
2206 }
2207 }
2208 trial_locales[trial_locales_count++] = "C";
98994639 2209
65ebb059
KW
2210 done_C: ;
2211 } /* end of first time through the loop */
98994639 2212
7d4bcc4a
KW
2213# ifdef WIN32
2214
65ebb059 2215 next_iteration: ;
7d4bcc4a
KW
2216
2217# endif
65ebb059
KW
2218
2219 } /* end of looping through the trial locales */
2220
2221 if (ok < 1) { /* If we tried to fallback */
2222 const char* msg;
2223 if (! setlocale_failure) { /* fallback succeeded */
2224 msg = "Falling back to";
2225 }
2226 else { /* fallback failed */
98994639 2227
65ebb059
KW
2228 /* We dropped off the end of the loop, so have to decrement i to
2229 * get back to the value the last time through */
2230 i--;
98994639 2231
65ebb059
KW
2232 ok = -1;
2233 msg = "Failed to fall back to";
2234
2235 /* To continue, we should use whatever values we've got */
7d4bcc4a
KW
2236
2237# ifdef USE_LOCALE_CTYPE
2238
49c85077 2239 Safefree(curctype);
837ce802 2240 curctype = savepv(do_setlocale_c(LC_CTYPE, NULL));
5d1187d1 2241 DEBUG_LOCALE_INIT(LC_CTYPE, NULL, curctype);
7d4bcc4a
KW
2242
2243# endif /* USE_LOCALE_CTYPE */
2244# ifdef USE_LOCALE_COLLATE
2245
49c85077 2246 Safefree(curcoll);
837ce802 2247 curcoll = savepv(do_setlocale_c(LC_COLLATE, NULL));
5d1187d1 2248 DEBUG_LOCALE_INIT(LC_COLLATE, NULL, curcoll);
7d4bcc4a
KW
2249
2250# endif /* USE_LOCALE_COLLATE */
2251# ifdef USE_LOCALE_NUMERIC
2252
49c85077 2253 Safefree(curnum);
837ce802 2254 curnum = savepv(do_setlocale_c(LC_NUMERIC, NULL));
5d1187d1 2255 DEBUG_LOCALE_INIT(LC_NUMERIC, NULL, curnum);
7d4bcc4a
KW
2256
2257# endif /* USE_LOCALE_NUMERIC */
2258
65ebb059
KW
2259 }
2260
2261 if (locwarn) {
2262 const char * description;
2263 const char * name = "";
2264 if (strEQ(trial_locales[i], "C")) {
2265 description = "the standard locale";
2266 name = "C";
2267 }
7d4bcc4a
KW
2268
2269# ifdef SYSTEM_DEFAULT_LOCALE
2270
65ebb059
KW
2271 else if (strEQ(trial_locales[i], "")) {
2272 description = "the system default locale";
2273 if (system_default_locale) {
2274 name = system_default_locale;
2275 }
2276 }
7d4bcc4a
KW
2277
2278# endif /* SYSTEM_DEFAULT_LOCALE */
2279
65ebb059
KW
2280 else {
2281 description = "a fallback locale";
2282 name = trial_locales[i];
2283 }
2284 if (name && strNE(name, "")) {
2285 PerlIO_printf(Perl_error_log,
2286 "perl: warning: %s %s (\"%s\").\n", msg, description, name);
2287 }
2288 else {
2289 PerlIO_printf(Perl_error_log,
2290 "perl: warning: %s %s.\n", msg, description);
2291 }
2292 }
2293 } /* End of tried to fallback */
98994639 2294
7d4bcc4a
KW
2295# ifdef USE_LOCALE_CTYPE
2296
98994639 2297 new_ctype(curctype);
98994639 2298
7d4bcc4a
KW
2299# endif /* USE_LOCALE_CTYPE */
2300# ifdef USE_LOCALE_COLLATE
2301
98994639 2302 new_collate(curcoll);
98994639 2303
7d4bcc4a
KW
2304# endif /* USE_LOCALE_COLLATE */
2305# ifdef USE_LOCALE_NUMERIC
2306
98994639 2307 new_numeric(curnum);
b310b053 2308
7d4bcc4a
KW
2309# endif /* USE_LOCALE_NUMERIC */
2310# if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
2311
49c85077
KW
2312 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
2313 * locale is UTF-8. If PL_utf8locale and PL_unicode (set by -C or by
2314 * $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the
2315 * PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open
2316 * discipline. */
c1284011 2317 PL_utf8locale = _is_cur_LC_category_utf8(LC_CTYPE);
49c85077 2318
a05d7ebb 2319 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
fde18df1
JH
2320 This is an alternative to using the -C command line switch
2321 (the -C if present will override this). */
2322 {
dd374669 2323 const char *p = PerlEnv_getenv("PERL_UNICODE");
a05d7ebb 2324 PL_unicode = p ? parse_unicode_opts(&p) : 0;
5a22a2bb
NC
2325 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
2326 PL_utf8cache = -1;
b310b053
JH
2327 }
2328
7d4bcc4a
KW
2329# endif
2330# ifdef USE_LOCALE_CTYPE
2331
43c5f42d 2332 Safefree(curctype);
7d4bcc4a
KW
2333
2334# endif /* USE_LOCALE_CTYPE */
2335# ifdef USE_LOCALE_COLLATE
2336
43c5f42d 2337 Safefree(curcoll);
7d4bcc4a
KW
2338
2339# endif /* USE_LOCALE_COLLATE */
2340# ifdef USE_LOCALE_NUMERIC
2341
43c5f42d 2342 Safefree(curnum);
8ef6e574 2343
7d4bcc4a
KW
2344# endif /* USE_LOCALE_NUMERIC */
2345
2346# ifdef __GLIBC__
2347
175c4cf9 2348 Safefree(language);
7d4bcc4a
KW
2349
2350# endif
175c4cf9
KW
2351
2352 Safefree(lc_all);
2353 Safefree(lang);
2354
e3305790 2355#endif /* USE_LOCALE */
2fcc0ca9 2356#ifdef DEBUGGING
7d4bcc4a 2357
2fcc0ca9 2358 /* So won't continue to output stuff */
27cdc72e 2359 DEBUG_INITIALIZATION_set(FALSE);
7d4bcc4a 2360
2fcc0ca9
KW
2361#endif
2362
98994639
HS
2363 return ok;
2364}
2365
98994639
HS
2366#ifdef USE_LOCALE_COLLATE
2367
a4a439fb 2368char *
a4a439fb
KW
2369Perl__mem_collxfrm(pTHX_ const char *input_string,
2370 STRLEN len, /* Length of 'input_string' */
2371 STRLEN *xlen, /* Set to length of returned string
2372 (not including the collation index
2373 prefix) */
2374 bool utf8 /* Is the input in UTF-8? */
6696cfa7 2375 )
98994639 2376{
a4a439fb
KW
2377
2378 /* _mem_collxfrm() is a bit like strxfrm() but with two important
2379 * differences. First, it handles embedded NULs. Second, it allocates a bit
2380 * more memory than needed for the transformed data itself. The real
55e5378d 2381 * transformed data begins at offset COLLXFRM_HDR_LEN. *xlen is set to
a4a439fb
KW
2382 * the length of that, and doesn't include the collation index size.
2383 * Please see sv_collxfrm() to see how this is used. */
2384
55e5378d
KW
2385#define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
2386
6696cfa7
KW
2387 char * s = (char *) input_string;
2388 STRLEN s_strlen = strlen(input_string);
79f120c8 2389 char *xbuf = NULL;
55e5378d 2390 STRLEN xAlloc; /* xalloc is a reserved word in VC */
17f41037 2391 STRLEN length_in_chars;
c664130f 2392 bool first_time = TRUE; /* Cleared after first loop iteration */
98994639 2393
a4a439fb
KW
2394 PERL_ARGS_ASSERT__MEM_COLLXFRM;
2395
2396 /* Must be NUL-terminated */
2397 assert(*(input_string + len) == '\0');
7918f24d 2398
79f120c8
KW
2399 /* If this locale has defective collation, skip */
2400 if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
c7202dee
KW
2401 DEBUG_L(PerlIO_printf(Perl_debug_log,
2402 "_mem_collxfrm: locale's collation is defective\n"));
79f120c8
KW
2403 goto bad;
2404 }
2405
6696cfa7
KW
2406 /* Replace any embedded NULs with the control that sorts before any others.
2407 * This will give as good as possible results on strings that don't
2408 * otherwise contain that character, but otherwise there may be
2409 * less-than-perfect results with that character and NUL. This is
fdc080f3 2410 * unavoidable unless we replace strxfrm with our own implementation. */
fd43f63c
KW
2411 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
2412 NUL */
6696cfa7
KW
2413 char * e = s + len;
2414 char * sans_nuls;
fdc080f3 2415 STRLEN sans_nuls_len;
94762aa0 2416 int try_non_controls;
afc4976f
KW
2417 char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
2418 making sure 2nd byte is NUL.
2419 */
2420 STRLEN this_replacement_len;
2421
1e4c9676
KW
2422 /* If we don't know what non-NUL control character sorts lowest for
2423 * this locale, find it */
f28f4d2a 2424 if (PL_strxfrm_NUL_replacement == '\0') {
6696cfa7 2425 int j;
afc4976f 2426 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
6696cfa7
KW
2427 includes the collation index
2428 prefixed. */
2429
91c0e2e0 2430 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
94762aa0
KW
2431
2432 /* Unlikely, but it may be that no control will work to replace
1e4c9676
KW
2433 * NUL, in which case we instead look for any character. Controls
2434 * are preferred because collation order is, in general, context
2435 * sensitive, with adjoining characters affecting the order, and
2436 * controls are less likely to have such interactions, allowing the
2437 * NUL-replacement to stand on its own. (Another way to look at it
2438 * is to imagine what would happen if the NUL were replaced by a
2439 * combining character; it wouldn't work out all that well.) */
94762aa0
KW
2440 for (try_non_controls = 0;
2441 try_non_controls < 2;
2442 try_non_controls++)
2443 {
d4ff9586
KW
2444 /* Look through all legal code points (NUL isn't) */
2445 for (j = 1; j < 256; j++) {
2446 char * x; /* j's xfrm plus collation index */
2447 STRLEN x_len; /* length of 'x' */
2448 STRLEN trial_len = 1;
736a4fed 2449 char cur_source[] = { '\0', '\0' };
d4ff9586 2450
736a4fed
KW
2451 /* Skip non-controls the first time through the loop. The
2452 * controls in a UTF-8 locale are the L1 ones */
afc4976f
KW
2453 if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
2454 ? ! isCNTRL_L1(j)
2455 : ! isCNTRL_LC(j))
2456 {
d4ff9586 2457 continue;
6696cfa7 2458 }
6696cfa7 2459
736a4fed
KW
2460 /* Create a 1-char string of the current code point */
2461 cur_source[0] = (char) j;
2462
d4ff9586
KW
2463 /* Then transform it */
2464 x = _mem_collxfrm(cur_source, trial_len, &x_len,
afc4976f 2465 0 /* The string is not in UTF-8 */);
6696cfa7 2466
1e4c9676 2467 /* Ignore any character that didn't successfully transform.
d4ff9586
KW
2468 * */
2469 if (! x) {
2470 continue;
2471 }
6696cfa7 2472
d4ff9586
KW
2473 /* If this character's transformation is lower than
2474 * the current lowest, this one becomes the lowest */
2475 if ( cur_min_x == NULL
2476 || strLT(x + COLLXFRM_HDR_LEN,
2477 cur_min_x + COLLXFRM_HDR_LEN))
2478 {
f28f4d2a 2479 PL_strxfrm_NUL_replacement = j;
d4ff9586 2480 cur_min_x = x;
d4ff9586
KW
2481 }
2482 else {
2483 Safefree(x);
2484 }
1e4c9676 2485 } /* end of loop through all 255 characters */
6696cfa7 2486
1e4c9676 2487 /* Stop looking if found */
94762aa0
KW
2488 if (cur_min_x) {
2489 break;
2490 }
2491
2492 /* Unlikely, but possible, if there aren't any controls that
2493 * work in the locale, repeat the loop, looking for any
2494 * character that works */
2495 DEBUG_L(PerlIO_printf(Perl_debug_log,
2496 "_mem_collxfrm: No control worked. Trying non-controls\n"));
1e4c9676 2497 } /* End of loop to try first the controls, then any char */
6696cfa7 2498
94762aa0
KW
2499 if (! cur_min_x) {
2500 DEBUG_L(PerlIO_printf(Perl_debug_log,
2501 "_mem_collxfrm: Couldn't find any character to replace"
2502 " embedded NULs in locale %s with", PL_collation_name));
2503 goto bad;
58eebef2
KW
2504 }
2505
94762aa0
KW
2506 DEBUG_L(PerlIO_printf(Perl_debug_log,
2507 "_mem_collxfrm: Replacing embedded NULs in locale %s with "
f28f4d2a 2508 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
94762aa0 2509
6696cfa7 2510 Safefree(cur_min_x);
1e4c9676 2511 } /* End of determining the character that is to replace NULs */
afc4976f
KW
2512
2513 /* If the replacement is variant under UTF-8, it must match the
2514 * UTF8-ness as the original */
f28f4d2a
KW
2515 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
2516 this_replacement_char[0] =
2517 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
2518 this_replacement_char[1] =
2519 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
afc4976f
KW
2520 this_replacement_len = 2;
2521 }
2522 else {
f28f4d2a 2523 this_replacement_char[0] = PL_strxfrm_NUL_replacement;
afc4976f
KW
2524 /* this_replacement_char[1] = '\0' was done at initialization */
2525 this_replacement_len = 1;
6696cfa7
KW
2526 }
2527
2528 /* The worst case length for the replaced string would be if every
2529 * character in it is NUL. Multiply that by the length of each
2530 * replacement, and allow for a trailing NUL */
afc4976f 2531 sans_nuls_len = (len * this_replacement_len) + 1;
fdc080f3 2532 Newx(sans_nuls, sans_nuls_len, char);
6696cfa7
KW
2533 *sans_nuls = '\0';
2534
6696cfa7
KW
2535 /* Replace each NUL with the lowest collating control. Loop until have
2536 * exhausted all the NULs */
2537 while (s + s_strlen < e) {
6069d6c5 2538 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
2539
2540 /* Do the actual replacement */
6069d6c5 2541 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
6696cfa7
KW
2542
2543 /* Move past the input NUL */
2544 s += s_strlen + 1;
2545 s_strlen = strlen(s);
2546 }
2547
2548 /* And add anything that trails the final NUL */
6069d6c5 2549 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
2550
2551 /* Switch so below we transform this modified string */
2552 s = sans_nuls;
2553 len = strlen(s);
1e4c9676 2554 } /* End of replacing NULs */
6696cfa7 2555
a4a439fb
KW
2556 /* Make sure the UTF8ness of the string and locale match */
2557 if (utf8 != PL_in_utf8_COLLATE_locale) {
2558 const char * const t = s; /* Temporary so we can later find where the
2559 input was */
2560
2561 /* Here they don't match. Change the string's to be what the locale is
2562 * expecting */
2563
2564 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
2565 s = (char *) bytes_to_utf8((const U8 *) s, &len);
2566 utf8 = TRUE;
2567 }
2568 else { /* locale is not UTF-8; but input is; downgrade the input */
2569
2570 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
2571
2572 /* If the downgrade was successful we are done, but if the input
2573 * contains things that require UTF-8 to represent, have to do
2574 * damage control ... */
2575 if (UNLIKELY(utf8)) {
2576
2577 /* What we do is construct a non-UTF-8 string with
2578 * 1) the characters representable by a single byte converted
2579 * to be so (if necessary);
2580 * 2) and the rest converted to collate the same as the
2581 * highest collating representable character. That makes
2582 * them collate at the end. This is similar to how we
2583 * handle embedded NULs, but we use the highest collating
2584 * code point instead of the smallest. Like the NUL case,
2585 * this isn't perfect, but is the best we can reasonably
2586 * do. Every above-255 code point will sort the same as
2587 * the highest-sorting 0-255 code point. If that code
2588 * point can combine in a sequence with some other code
2589 * points for weight calculations, us changing something to
2590 * be it can adversely affect the results. But in most
2591 * cases, it should work reasonably. And note that this is
2592 * really an illegal situation: using code points above 255
2593 * on a locale where only 0-255 are valid. If two strings
2594 * sort entirely equal, then the sort order for the
2595 * above-255 code points will be in code point order. */
2596
2597 utf8 = FALSE;
2598
2599 /* If we haven't calculated the code point with the maximum
2600 * collating order for this locale, do so now */
2601 if (! PL_strxfrm_max_cp) {
2602 int j;
2603
2604 /* The current transformed string that collates the
2605 * highest (except it also includes the prefixed collation
2606 * index. */
2607 char * cur_max_x = NULL;
2608
2609 /* Look through all legal code points (NUL isn't) */
2610 for (j = 1; j < 256; j++) {
2611 char * x;
2612 STRLEN x_len;
736a4fed 2613 char cur_source[] = { '\0', '\0' };
a4a439fb 2614
736a4fed
KW
2615 /* Create a 1-char string of the current code point */
2616 cur_source[0] = (char) j;
a4a439fb
KW
2617
2618 /* Then transform it */
2619 x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
2620
2621 /* If something went wrong (which it shouldn't), just
2622 * ignore this code point */
94762aa0 2623 if (! x) {
a4a439fb
KW
2624 continue;
2625 }
2626
2627 /* If this character's transformation is higher than
2628 * the current highest, this one becomes the highest */
2629 if ( cur_max_x == NULL
55e5378d
KW
2630 || strGT(x + COLLXFRM_HDR_LEN,
2631 cur_max_x + COLLXFRM_HDR_LEN))
a4a439fb
KW
2632 {
2633 PL_strxfrm_max_cp = j;
2634 cur_max_x = x;
2635 }
2636 else {
2637 Safefree(x);
2638 }
2639 }
2640
94762aa0
KW
2641 if (! cur_max_x) {
2642 DEBUG_L(PerlIO_printf(Perl_debug_log,
2643 "_mem_collxfrm: Couldn't find any character to"
2644 " replace above-Latin1 chars in locale %s with",
2645 PL_collation_name));
2646 goto bad;
2647 }
2648
58eebef2
KW
2649 DEBUG_L(PerlIO_printf(Perl_debug_log,
2650 "_mem_collxfrm: highest 1-byte collating character"
2651 " in locale %s is 0x%02X\n",
2652 PL_collation_name,
2653 PL_strxfrm_max_cp));
58eebef2 2654
a4a439fb
KW
2655 Safefree(cur_max_x);
2656 }
2657
2658 /* Here we know which legal code point collates the highest.
2659 * We are ready to construct the non-UTF-8 string. The length
2660 * will be at least 1 byte smaller than the input string
2661 * (because we changed at least one 2-byte character into a
2662 * single byte), but that is eaten up by the trailing NUL */
2663 Newx(s, len, char);
2664
2665 {
2666 STRLEN i;
2667 STRLEN d= 0;
042d9e50 2668 char * e = (char *) t + len;
a4a439fb
KW
2669
2670 for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
2671 U8 cur_char = t[i];
2672 if (UTF8_IS_INVARIANT(cur_char)) {
2673 s[d++] = cur_char;
2674 }
042d9e50 2675 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
a4a439fb
KW
2676 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
2677 }
2678 else { /* Replace illegal cp with highest collating
2679 one */
2680 s[d++] = PL_strxfrm_max_cp;
2681 }
2682 }
2683 s[d++] = '\0';
2684 Renew(s, d, char); /* Free up unused space */
2685 }
2686 }
2687 }
2688
2689 /* Here, we have constructed a modified version of the input. It could
2690 * be that we already had a modified copy before we did this version.
2691 * If so, that copy is no longer needed */
2692 if (t != input_string) {
2693 Safefree(t);
2694 }
2695 }
2696
17f41037
KW
2697 length_in_chars = (utf8)
2698 ? utf8_length((U8 *) s, (U8 *) s + len)
2699 : len;
2700
59c018b9
KW
2701 /* The first element in the output is the collation id, used by
2702 * sv_collxfrm(); then comes the space for the transformed string. The
2703 * equation should give us a good estimate as to how much is needed */
55e5378d 2704 xAlloc = COLLXFRM_HDR_LEN
a4a439fb 2705 + PL_collxfrm_base
17f41037 2706 + (PL_collxfrm_mult * length_in_chars);
a02a5408 2707 Newx(xbuf, xAlloc, char);
c7202dee
KW
2708 if (UNLIKELY(! xbuf)) {
2709 DEBUG_L(PerlIO_printf(Perl_debug_log,
2710 "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
98994639 2711 goto bad;
c7202dee 2712 }
98994639 2713
d35fca5f 2714 /* Store the collation id */
98994639 2715 *(U32*)xbuf = PL_collation_ix;
d35fca5f
KW
2716
2717 /* Then the transformation of the input. We loop until successful, or we
2718 * give up */
4ebeff16 2719 for (;;) {
1adab0a7 2720
55e5378d 2721 *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
4ebeff16
KW
2722
2723 /* If the transformed string occupies less space than we told strxfrm()
2724 * was available, it means it successfully transformed the whole
2725 * string. */
55e5378d 2726 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
17f41037 2727
1adab0a7
KW
2728 /* Some systems include a trailing NUL in the returned length.
2729 * Ignore it, using a loop in case multiple trailing NULs are
2730 * returned. */
2731 while ( (*xlen) > 0
2732 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
2733 {
2734 (*xlen)--;
2735 }
2736
17f41037
KW
2737 /* If the first try didn't get it, it means our prediction was low.
2738 * Modify the coefficients so that we predict a larger value in any
2739 * future transformations */
2740 if (! first_time) {
2741 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */
2742 STRLEN computed_guess = PL_collxfrm_base
2743 + (PL_collxfrm_mult * length_in_chars);
e1c30f0c
KW
2744
2745 /* On zero-length input, just keep current slope instead of
2746 * dividing by 0 */
2747 const STRLEN new_m = (length_in_chars != 0)
2748 ? needed / length_in_chars
2749 : PL_collxfrm_mult;
17f41037
KW
2750
2751 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
2752 "%s: %d: initial size of %zu bytes for a length "
2753 "%zu string was insufficient, %zu needed\n",
17f41037 2754 __FILE__, __LINE__,
b07929e4 2755 computed_guess, length_in_chars, needed));
17f41037
KW
2756
2757 /* If slope increased, use it, but discard this result for
2758 * length 1 strings, as we can't be sure that it's a real slope
2759 * change */
2760 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
7d4bcc4a
KW
2761
2762# ifdef DEBUGGING
2763
17f41037
KW
2764 STRLEN old_m = PL_collxfrm_mult;
2765 STRLEN old_b = PL_collxfrm_base;
7d4bcc4a
KW
2766
2767# endif
2768
17f41037
KW
2769 PL_collxfrm_mult = new_m;
2770 PL_collxfrm_base = 1; /* +1 For trailing NUL */
2771 computed_guess = PL_collxfrm_base
2772 + (PL_collxfrm_mult * length_in_chars);
2773 if (computed_guess < needed) {
2774 PL_collxfrm_base += needed - computed_guess;
2775 }
2776
2777 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
2778 "%s: %d: slope is now %zu; was %zu, base "
2779 "is now %zu; was %zu\n",
17f41037 2780 __FILE__, __LINE__,
b07929e4
KW
2781 PL_collxfrm_mult, old_m,
2782 PL_collxfrm_base, old_b));
17f41037
KW
2783 }
2784 else { /* Slope didn't change, but 'b' did */
2785 const STRLEN new_b = needed
2786 - computed_guess
2787 + PL_collxfrm_base;
2788 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4 2789 "%s: %d: base is now %zu; was %zu\n",
17f41037 2790 __FILE__, __LINE__,
b07929e4 2791 new_b, PL_collxfrm_base));
17f41037
KW
2792 PL_collxfrm_base = new_b;
2793 }
2794 }
2795
4ebeff16
KW
2796 break;
2797 }
bb0f664e 2798
c7202dee
KW
2799 if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
2800 DEBUG_L(PerlIO_printf(Perl_debug_log,
2801 "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
2802 *xlen, PERL_INT_MAX));
4ebeff16 2803 goto bad;
c7202dee 2804 }
d35fca5f 2805
c664130f 2806 /* A well-behaved strxfrm() returns exactly how much space it needs
1adab0a7
KW
2807 * (usually not including the trailing NUL) when it fails due to not
2808 * enough space being provided. Assume that this is the case unless
2809 * it's been proven otherwise */
c664130f 2810 if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
55e5378d 2811 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
c664130f
KW
2812 }
2813 else { /* Here, either:
2814 * 1) The strxfrm() has previously shown bad behavior; or
2815 * 2) It isn't the first time through the loop, which means
2816 * that the strxfrm() is now showing bad behavior, because
2817 * we gave it what it said was needed in the previous
2818 * iteration, and it came back saying it needed still more.
2819 * (Many versions of cygwin fit this. When the buffer size
2820 * isn't sufficient, they return the input size instead of
2821 * how much is needed.)
d4ff9586
KW
2822 * Increase the buffer size by a fixed percentage and try again.
2823 * */
6ddd902c 2824 xAlloc += (xAlloc / 4) + 1;
c664130f 2825 PL_strxfrm_is_behaved = FALSE;
c664130f 2826
7d4bcc4a
KW
2827# ifdef DEBUGGING
2828
58eebef2
KW
2829 if (DEBUG_Lv_TEST || debug_initialization) {
2830 PerlIO_printf(Perl_debug_log,
2831 "_mem_collxfrm required more space than previously calculated"
b07929e4 2832 " for locale %s, trying again with new guess=%d+%zu\n",
58eebef2 2833 PL_collation_name, (int) COLLXFRM_HDR_LEN,
b07929e4 2834 xAlloc - COLLXFRM_HDR_LEN);
58eebef2 2835 }
7d4bcc4a
KW
2836
2837# endif
2838
58eebef2 2839 }
c664130f 2840
4ebeff16 2841 Renew(xbuf, xAlloc, char);
c7202dee
KW
2842 if (UNLIKELY(! xbuf)) {
2843 DEBUG_L(PerlIO_printf(Perl_debug_log,
2844 "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
4ebeff16 2845 goto bad;
c7202dee 2846 }
c664130f
KW
2847
2848 first_time = FALSE;
4ebeff16 2849 }
98994639 2850
6696cfa7 2851
7d4bcc4a
KW
2852# ifdef DEBUGGING
2853
58eebef2 2854 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee
KW
2855
2856 print_collxfrm_input_and_return(s, s + len, xlen, utf8);
2857 PerlIO_printf(Perl_debug_log, "Its xfrm is:");
7e2f38b2
KW
2858 PerlIO_printf(Perl_debug_log, "%s\n",
2859 _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
2860 *xlen, 1));
58eebef2 2861 }
7d4bcc4a
KW
2862
2863# endif
58eebef2 2864
3c5f993e 2865 /* Free up unneeded space; retain ehough for trailing NUL */
55e5378d 2866 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
98994639 2867
6696cfa7
KW
2868 if (s != input_string) {
2869 Safefree(s);
98994639
HS
2870 }
2871
98994639
HS
2872 return xbuf;
2873
2874 bad:
2875 Safefree(xbuf);
6696cfa7
KW
2876 if (s != input_string) {
2877 Safefree(s);
2878 }
98994639 2879 *xlen = 0;
7d4bcc4a
KW
2880
2881# ifdef DEBUGGING
2882
58eebef2 2883 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee 2884 print_collxfrm_input_and_return(s, s + len, NULL, utf8);
58eebef2 2885 }
7d4bcc4a
KW
2886
2887# endif
2888
98994639
HS
2889 return NULL;
2890}
2891
7d4bcc4a 2892# ifdef DEBUGGING
c7202dee 2893
4cbaac56 2894STATIC void
c7202dee
KW
2895S_print_collxfrm_input_and_return(pTHX_
2896 const char * const s,
2897 const char * const e,
2898 const STRLEN * const xlen,
2899 const bool is_utf8)
2900{
c7202dee
KW
2901
2902 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
2903
511e4ff7
DM
2904 PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
2905 (UV)PL_collation_ix);
c7202dee 2906 if (xlen) {
08b6dc1d 2907 PerlIO_printf(Perl_debug_log, "%zu", *xlen);
c7202dee
KW
2908 }
2909 else {
2910 PerlIO_printf(Perl_debug_log, "NULL");
2911 }
2912 PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
2913 PL_collation_name);
9c8a6dc2
KW
2914 print_bytes_for_locale(s, e, is_utf8);
2915
2916 PerlIO_printf(Perl_debug_log, "'\n");
2917}
2918
2919STATIC void
2920S_print_bytes_for_locale(pTHX_
2921 const char * const s,
2922 const char * const e,
2923 const bool is_utf8)
2924{
2925 const char * t = s;
2926 bool prev_was_printable = TRUE;
2927 bool first_time = TRUE;
2928
2929 PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
c7202dee
KW
2930
2931 while (t < e) {
2932 UV cp = (is_utf8)
2933 ? utf8_to_uvchr_buf((U8 *) t, e, NULL)
2934 : * (U8 *) t;
2935 if (isPRINT(cp)) {
2936 if (! prev_was_printable) {
2937 PerlIO_printf(Perl_debug_log, " ");
2938 }
2939 PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
2940 prev_was_printable = TRUE;
2941 }
2942 else {
2943 if (! first_time) {
2944 PerlIO_printf(Perl_debug_log, " ");
2945 }
147e3846 2946 PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
c7202dee
KW
2947 prev_was_printable = FALSE;
2948 }
2949 t += (is_utf8) ? UTF8SKIP(t) : 1;
2950 first_time = FALSE;
2951 }
c7202dee
KW
2952}
2953
7d4bcc4a 2954# endif /* #ifdef DEBUGGING */
98994639 2955#endif /* USE_LOCALE_COLLATE */
58eebef2 2956
8ef6e574
KW
2957#ifdef USE_LOCALE
2958
c1284011
KW
2959bool
2960Perl__is_cur_LC_category_utf8(pTHX_ int category)
7d74bb61
KW
2961{
2962 /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
2963 * otherwise. 'category' may not be LC_ALL. If the platform doesn't have
119ee68b 2964 * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
609548d2
KW
2965 * could give the wrong result. The result will very likely be correct for
2966 * languages that have commonly used non-ASCII characters, but for notably
2967 * English, it comes down to if the locale's name ends in something like
2968 * "UTF-8". It errs on the side of not being a UTF-8 locale. */
7d74bb61
KW
2969
2970 char *save_input_locale = NULL;
7d74bb61
KW
2971 STRLEN final_pos;
2972
7d4bcc4a
KW
2973# ifdef LC_ALL
2974
7d74bb61 2975 assert(category != LC_ALL);
7d4bcc4a
KW
2976
2977# endif
7d74bb61
KW
2978
2979 /* First dispose of the trivial cases */
837ce802 2980 save_input_locale = do_setlocale_r(category, NULL);
7d74bb61 2981 if (! save_input_locale) {
69014004
KW
2982 DEBUG_L(PerlIO_printf(Perl_debug_log,
2983 "Could not find current locale for category %d\n",
2984 category));
7d74bb61
KW
2985 return FALSE; /* XXX maybe should croak */
2986 }
b07fffd1 2987 save_input_locale = stdize_locale(savepv(save_input_locale));
a39edc4c 2988 if (isNAME_C_OR_POSIX(save_input_locale)) {
69014004
KW
2989 DEBUG_L(PerlIO_printf(Perl_debug_log,
2990 "Current locale for category %d is %s\n",
2991 category, save_input_locale));
b07fffd1 2992 Safefree(save_input_locale);
7d74bb61
KW
2993 return FALSE;
2994 }
2995
7d4bcc4a 2996# if defined(USE_LOCALE_CTYPE) \
1d958db2 2997 && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET)))
7d74bb61 2998
1d958db2 2999 { /* Next try nl_langinfo or MB_CUR_MAX if available */
7d74bb61
KW
3000
3001 char *save_ctype_locale = NULL;
119ee68b 3002 bool is_utf8;
7d74bb61 3003
119ee68b 3004 if (category != LC_CTYPE) { /* These work only on LC_CTYPE */
7d74bb61
KW
3005
3006 /* Get the current LC_CTYPE locale */
837ce802 3007 save_ctype_locale = do_setlocale_c(LC_CTYPE, NULL);
7d74bb61 3008 if (! save_ctype_locale) {
69014004
KW
3009 DEBUG_L(PerlIO_printf(Perl_debug_log,
3010 "Could not find current locale for LC_CTYPE\n"));
7d74bb61
KW
3011 goto cant_use_nllanginfo;
3012 }
4f72bb37 3013 save_ctype_locale = stdize_locale(savepv(save_ctype_locale));
7d74bb61
KW
3014
3015 /* If LC_CTYPE and the desired category use the same locale, this
3016 * means that finding the value for LC_CTYPE is the same as finding
3017 * the value for the desired category. Otherwise, switch LC_CTYPE
3018 * to the desired category's locale */
3019 if (strEQ(save_ctype_locale, save_input_locale)) {
3020 Safefree(save_ctype_locale);
3021 save_ctype_locale = NULL;
3022 }
837ce802 3023 else if (! do_setlocale_c(LC_CTYPE, save_input_locale)) {
69014004
KW
3024 DEBUG_L(PerlIO_printf(Perl_debug_log,
3025 "Could not change LC_CTYPE locale to %s\n",
3026 save_input_locale));
7d74bb61
KW
3027 Safefree(save_ctype_locale);
3028 goto cant_use_nllanginfo;
3029 }
3030 }
3031
69014004
KW
3032 DEBUG_L(PerlIO_printf(Perl_debug_log, "Current LC_CTYPE locale=%s\n",
3033 save_input_locale));
3034
7d74bb61 3035 /* Here the current LC_CTYPE is set to the locale of the category whose
1d958db2
KW
3036 * information is desired. This means that nl_langinfo() and MB_CUR_MAX
3037 * should give the correct results */
119ee68b 3038
7d4bcc4a 3039# if defined(HAS_NL_LANGINFO) && defined(CODESET)
c70a3e68 3040 /* The task is easiest if has this POSIX 2001 function */
7d4bcc4a 3041
1d958db2 3042 {
c70a3e68
KW
3043 const char *codeset = my_nl_langinfo(PERL_CODESET, FALSE);
3044 /* FALSE => already in dest locale */
119ee68b 3045
c70a3e68
KW
3046 DEBUG_L(PerlIO_printf(Perl_debug_log,
3047 "\tnllanginfo returned CODESET '%s'\n", codeset));
3048
3049 if (codeset && strNE(codeset, "")) {
1d958db2
KW
3050 /* If we switched LC_CTYPE, switch back */
3051 if (save_ctype_locale) {
837ce802 3052 do_setlocale_c(LC_CTYPE, save_ctype_locale);
1d958db2
KW
3053 Safefree(save_ctype_locale);
3054 }
3055
3056 is_utf8 = foldEQ(codeset, STR_WITH_LEN("UTF-8"))
3057 || foldEQ(codeset, STR_WITH_LEN("UTF8"));
3058
69014004
KW
3059 DEBUG_L(PerlIO_printf(Perl_debug_log,
3060 "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
3061 codeset, is_utf8));
1d958db2
KW
3062 Safefree(save_input_locale);
3063 return is_utf8;
3064 }
119ee68b
KW
3065 }
3066
7d4bcc4a
KW
3067# endif
3068# ifdef MB_CUR_MAX
1d958db2
KW
3069
3070 /* Here, either we don't have nl_langinfo, or it didn't return a
3071 * codeset. Try MB_CUR_MAX */
3072
119ee68b
KW
3073 /* Standard UTF-8 needs at least 4 bytes to represent the maximum
3074 * Unicode code point. Since UTF-8 is the only non-single byte
3075 * encoding we handle, we just say any such encoding is UTF-8, and if
3076 * turns out to be wrong, other things will fail */
3077 is_utf8 = MB_CUR_MAX >= 4;
3078
69014004
KW
3079 DEBUG_L(PerlIO_printf(Perl_debug_log,
3080 "\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n",
3081 (int) MB_CUR_MAX, is_utf8));
3082
119ee68b
KW
3083 Safefree(save_input_locale);
3084
7d4bcc4a 3085# ifdef HAS_MBTOWC
119ee68b
KW
3086
3087 /* ... But, most system that have MB_CUR_MAX will also have mbtowc(),
3088 * since they are both in the C99 standard. We can feed a known byte
3089 * string to the latter function, and check that it gives the expected
3090 * result */
3091 if (is_utf8) {
3092 wchar_t wc;
856b881c 3093 PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
69014004 3094 errno = 0;
f019f68f 3095 if ((size_t)mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8))
119ee68b
KW
3096 != strlen(HYPHEN_UTF8)
3097 || wc != (wchar_t) 0x2010)
3098 {
3099 is_utf8 = FALSE;
abdcbdb8 3100 DEBUG_L(PerlIO_printf(Perl_debug_log, "\thyphen=U+%x\n", (unsigned int)wc));
69014004
KW
3101 DEBUG_L(PerlIO_printf(Perl_debug_log,
3102 "\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n",
3103 mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8)), errno));
119ee68b
KW
3104 }
3105 }
7d4bcc4a
KW
3106
3107# endif
119ee68b 3108
1d958db2
KW
3109 /* If we switched LC_CTYPE, switch back */
3110 if (save_ctype_locale) {
837ce802 3111 do_setlocale_c(LC_CTYPE, save_ctype_locale);
1d958db2 3112 Safefree(save_ctype_locale);
119ee68b 3113 }
7d74bb61 3114
1d958db2 3115 return is_utf8;
7d4bcc4a
KW
3116
3117# endif
3118
7d74bb61 3119 }
119ee68b 3120
7d74bb61
KW
3121 cant_use_nllanginfo:
3122
7d4bcc4a 3123# else /* nl_langinfo should work if available, so don't bother compiling this
0080c90a
KW
3124 fallback code. The final fallback of looking at the name is
3125 compiled, and will be executed if nl_langinfo fails */
7d74bb61 3126
97f4de96
KW
3127 /* nl_langinfo not available or failed somehow. Next try looking at the
3128 * currency symbol to see if it disambiguates things. Often that will be
3129 * in the native script, and if the symbol isn't in UTF-8, we know that the
3130 * locale isn't. If it is non-ASCII UTF-8, we infer that the locale is
609548d2
KW
3131 * too, as the odds of a non-UTF8 string being valid UTF-8 are quite small
3132 * */
fa9b773e 3133
7d4bcc4a
KW
3134# ifdef HAS_LOCALECONV
3135# ifdef USE_LOCALE_MONETARY
3136
fa9b773e
KW
3137 {
3138 char *save_monetary_locale = NULL;
fa9b773e 3139 bool only_ascii = FALSE;
13542a67
KW
3140 bool is_utf8 = FALSE;
3141 struct lconv* lc;
fa9b773e 3142
97f4de96
KW
3143 /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of
3144 * the desired category, if it isn't that locale already */
3145
fa9b773e
KW
3146 if (category != LC_MONETARY) {
3147
837ce802 3148 save_monetary_locale = do_setlocale_c(LC_MONETARY, NULL);
fa9b773e 3149 if (! save_monetary_locale) {
69014004
KW
3150 DEBUG_L(PerlIO_printf(Perl_debug_log,
3151 "Could not find current locale for LC_MONETARY\n"));
fa9b773e
KW
3152 goto cant_use_monetary;
3153 }
4f72bb37 3154 save_monetary_locale = stdize_locale(savepv(save_monetary_locale));
fa9b773e 3155
13542a67
KW
3156 if (strEQ(save_monetary_locale, save_input_locale)) {
3157 Safefree(save_monetary_locale);
3158 save_monetary_locale = NULL;
3159 }
837ce802 3160 else if (! do_setlocale_c(LC_MONETARY, save_input_locale)) {
59c234b4
KW
3161 DEBUG_L(PerlIO_printf(Perl_debug_log,
3162 "Could not change LC_MONETARY locale to %s\n",
3163 save_input_locale));
3164 Safefree(save_monetary_locale);
3165 goto cant_use_monetary;
fa9b773e
KW
3166 }
3167 }
3168
3169 /* Here the current LC_MONETARY is set to the locale of the category
3170 * whose information is desired. */
3171
13542a67
KW
3172 lc = localeconv();
3173 if (! lc
3174 || ! lc->currency_symbol
c5f058df 3175 || is_utf8_invariant_string((U8 *) lc->currency_symbol, 0))
13542a67
KW
3176 {
3177 DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3178 only_ascii = TRUE;
3179 }
3180 else {
3181 is_utf8 = is_utf8_string((U8 *) lc->currency_symbol, 0);
fa9b773e
KW
3182 }
3183
3184 /* If we changed it, restore LC_MONETARY to its original locale */
3185 if (save_monetary_locale) {
837ce802 3186 do_setlocale_c(LC_MONETARY, save_monetary_locale);
fa9b773e
KW
3187 Safefree(save_monetary_locale);
3188 }
3189
13542a67 3190 if (! only_ascii) {
fa9b773e 3191
59c234b4
KW
3192 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
3193 * otherwise assume the locale is UTF-8 if and only if the symbol
3194 * is non-ascii UTF-8. */
3195 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
3196 save_input_locale, is_utf8));
3197 Safefree(save_input_locale);
3198 return is_utf8;
13542a67 3199 }
fa9b773e
KW
3200 }
3201 cant_use_monetary:
3202
7d4bcc4a
KW
3203# endif /* USE_LOCALE_MONETARY */
3204# endif /* HAS_LOCALECONV */
fa9b773e 3205
7d4bcc4a 3206# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
15f7e74e
KW
3207
3208/* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
3209 * the names of the months and weekdays, timezone, and am/pm indicator */
3210 {
3211 char *save_time_locale = NULL;
3212 int hour = 10;
3213 bool is_dst = FALSE;
3214 int dom = 1;
3215 int month = 0;
3216 int i;
3217 char * formatted_time;
3218
3219
3220 /* Like above for LC_MONETARY, we set LC_TIME to the locale of the
3221 * desired category, if it isn't that locale already */
3222
3223 if (category != LC_TIME) {
3224
837ce802 3225 save_time_locale = do_setlocale_c(LC_TIME, NULL);
15f7e74e
KW
3226 if (! save_time_locale) {
3227 DEBUG_L(PerlIO_printf(Perl_debug_log,
3228 "Could not find current locale for LC_TIME\n"));
3229 goto cant_use_time;
3230 }
3231 save_time_locale = stdize_locale(savepv(save_time_locale));
3232
3233 if (strEQ(save_time_locale, save_input_locale)) {
3234 Safefree(save_time_locale);
3235 save_time_locale = NULL;
3236 }
837ce802 3237 else if (! do_setlocale_c(LC_TIME, save_input_locale)) {
15f7e74e
KW
3238 DEBUG_L(PerlIO_printf(Perl_debug_log,
3239 "Could not change LC_TIME locale to %s\n",
3240 save_input_locale));
3241 Safefree(save_time_locale);
3242 goto cant_use_time;
3243 }
3244 }
3245
3246 /* Here the current LC_TIME is set to the locale of the category
3247 * whose information is desired. Look at all the days of the week and
9f10db87 3248 * month names, and the timezone and am/pm indicator for UTF-8 variant
15f7e74e
KW
3249 * characters. The first such a one found will tell us if the locale
3250 * is UTF-8 or not */
3251
3252 for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
3253 formatted_time = my_strftime("%A %B %Z %p",
3ae5cd07 3254 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
c5f058df
KW
3255 if ( ! formatted_time
3256 || is_utf8_invariant_string((U8 *) formatted_time, 0))
3257 {
15f7e74e
KW
3258
3259 /* Here, we didn't find a non-ASCII. Try the next time through
3260 * with the complemented dst and am/pm, and try with the next
3261 * weekday. After we have gotten all weekdays, try the next
3262 * month */
3263 is_dst = ! is_dst;
3264 hour = (hour + 12) % 24;
3265 dom++;
3266 if (i > 6) {
3267 month++;
3268 }
3269 continue;
3270 }
3271
3272 /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
3273 * false otherwise. But first, restore LC_TIME to its original
3274 * locale if we changed it */
3275 if (save_time_locale) {
837ce802 3276 do_setlocale_c(LC_TIME, save_time_locale);
15f7e74e
KW
3277 Safefree(save_time_locale);
3278 }
3279
3280 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
3281 save_input_locale,
3282 is_utf8_string((U8 *) formatted_time, 0)));
3283 Safefree(save_input_locale);
3284 return is_utf8_string((U8 *) formatted_time, 0);
3285 }
3286
3287 /* Falling off the end of the loop indicates all the names were just
3288 * ASCII. Go on to the next test. If we changed it, restore LC_TIME
3289 * to its original locale */
3290 if (save_time_locale) {
837ce802 3291 do_setlocale_c(LC_TIME, save_time_locale);
15f7e74e
KW
3292 Safefree(save_time_locale);
3293 }
3294 DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3295 }
3296 cant_use_time:
3297
7d4bcc4a 3298# endif
15f7e74e 3299
7d4bcc4a 3300# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
855aeb93
JH
3301
3302/* This code is ifdefd out because it was found to not be necessary in testing
5857e934
KW
3303 * on our dromedary test machine, which has over 700 locales. There, this
3304 * added no value to looking at the currency symbol and the time strings. I
3305 * left it in so as to avoid rewriting it if real-world experience indicates
3306 * that dromedary is an outlier. Essentially, instead of returning abpve if we
855aeb93
JH
3307 * haven't found illegal utf8, we continue on and examine all the strerror()
3308 * messages on the platform for utf8ness. If all are ASCII, we still don't
3309 * know the answer; but otherwise we have a pretty good indication of the
5857e934
KW
3310 * utf8ness. The reason this doesn't help much is that the messages may not
3311 * have been translated into the locale. The currency symbol and time strings
3312 * are much more likely to have been translated. */
3313 {
855aeb93 3314 int e;
5857e934
KW
3315 bool is_utf8 = FALSE;
3316 bool non_ascii = FALSE;
855aeb93 3317 char *save_messages_locale = NULL;
5857e934 3318 const char * errmsg = NULL;
855aeb93 3319
5857e934
KW
3320 /* Like above, we set LC_MESSAGES to the locale of the desired
3321 * category, if it isn't that locale already */
855aeb93
JH
3322
3323 if (category != LC_MESSAGES) {
3324
837ce802 3325 save_messages_locale = do_setlocale_c(LC_MESSAGES, NULL);
855aeb93 3326 if (! save_messages_locale) {
5857e934
KW
3327 DEBUG_L(PerlIO_printf(Perl_debug_log,
3328 "Could not find current locale for LC_MESSAGES\n"));
855aeb93
JH
3329 goto cant_use_messages;
3330 }
5857e934 3331 save_messages_locale = stdize_locale(savepv(save_messages_locale));
855aeb93
JH
3332
3333 if (strEQ(save_messages_locale, save_input_locale)) {
5857e934
KW
3334 Safefree(save_messages_locale);
3335 save_messages_locale = NULL;
855aeb93 3336 }
837ce802 3337 else if (! do_setlocale_c(LC_MESSAGES, save_input_locale)) {
5857e934
KW
3338 DEBUG_L(PerlIO_printf(Perl_debug_log,
3339 "Could not change LC_MESSAGES locale to %s\n",
3340 save_input_locale));
855aeb93
JH
3341 Safefree(save_messages_locale);
3342 goto cant_use_messages;
3343 }
3344 }
3345
3346 /* Here the current LC_MESSAGES is set to the locale of the category
5857e934
KW
3347 * whose information is desired. Look through all the messages. We
3348 * can't use Strerror() here because it may expand to code that
3349 * segfaults in miniperl */
855aeb93 3350
5857e934
KW
3351 for (e = 0; e <= sys_nerr; e++) {
3352 errno = 0;
3353 errmsg = sys_errlist[e];
3354 if (errno || !errmsg) {
855aeb93
JH
3355 break;
3356 }
5857e934 3357 errmsg = savepv(errmsg);
c5f058df 3358 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
5857e934
KW
3359 non_ascii = TRUE;
3360 is_utf8 = is_utf8_string((U8 *) errmsg, 0);
3361 break;
855aeb93
JH
3362 }
3363 }
5857e934 3364 Safefree(errmsg);
855aeb93
JH
3365
3366 /* And, if we changed it, restore LC_MESSAGES to its original locale */
3367 if (save_messages_locale) {
837ce802 3368 do_setlocale_c(LC_MESSAGES, save_messages_locale);
855aeb93
JH
3369 Safefree(save_messages_locale);
3370 }
3371
5857e934
KW
3372 if (non_ascii) {
3373
3374 /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
3375 * any non-ascii means it is one; otherwise we assume it isn't */
3376 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
3377 save_input_locale,
3378 is_utf8));
3379 Safefree(save_input_locale);
3380 return is_utf8;
3381 }
855aeb93 3382
5857e934 3383 DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
855aeb93
JH
3384 }
3385 cant_use_messages:
3386
7d4bcc4a
KW
3387# endif
3388# endif /* the code that is compiled when no nl_langinfo */
0080c90a 3389
7d4bcc4a 3390# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
92c0a900 3391 UTF-8 locale */
7d4bcc4a 3392
97f4de96
KW
3393 /* As a last resort, look at the locale name to see if it matches
3394 * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
3395 * return of setlocale(), is actually defined to be opaque, so we can't
3396 * really rely on the absence of various substrings in the name to indicate
3397 * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
3398 * be a UTF-8 locale. Similarly for the other common names */
3399
3400 final_pos = strlen(save_input_locale) - 1;
3401 if (final_pos >= 3) {
3402 char *name = save_input_locale;
3403
3404 /* Find next 'U' or 'u' and look from there */
3405 while ((name += strcspn(name, "Uu") + 1)
3406 <= save_input_locale + final_pos - 2)
3407 {
c72493e0 3408 if ( isALPHA_FOLD_NE(*name, 't')
305b8651 3409 || isALPHA_FOLD_NE(*(name + 1), 'f'))
97f4de96
KW
3410 {
3411 continue;
3412 }
3413 name += 2;
3414 if (*(name) == '-') {
3415 if ((name > save_input_locale + final_pos - 1)) {
3416 break;
3417 }
3418 name++;
3419 }
3420 if (*(name) == '8') {
97f4de96
KW
3421 DEBUG_L(PerlIO_printf(Perl_debug_log,
3422 "Locale %s ends with UTF-8 in name\n",
3423 save_input_locale));
00c54b9c 3424 Safefree(save_input_locale);
97f4de96
KW
3425 return TRUE;
3426 }
3427 }
3428 DEBUG_L(PerlIO_printf(Perl_debug_log,
3429 "Locale %s doesn't end with UTF-8 in name\n",
3430 save_input_locale));
3431 }
3432
7d4bcc4a
KW
3433# endif
3434# ifdef WIN32
3435
97f4de96 3436 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
8a0832a1 3437 if (memENDs(save_input_locale, final_pos, "65001")) {
97f4de96 3438 DEBUG_L(PerlIO_printf(Perl_debug_log,
8a0832a1 3439 "Locale %s ends with 65001 in name, is UTF-8 locale\n",
97f4de96
KW
3440 save_input_locale));
3441 Safefree(save_input_locale);
3442 return TRUE;
3443 }
7d4bcc4a
KW
3444
3445# endif
97f4de96
KW
3446
3447 /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
3448 * since we are about to return FALSE anyway, there is no point in doing
3449 * this extra work */
7d4bcc4a
KW
3450
3451# if 0
97f4de96
KW
3452 if (instr(save_input_locale, "8859")) {
3453 DEBUG_L(PerlIO_printf(Perl_debug_log,
3454 "Locale %s has 8859 in name, not UTF-8 locale\n",
3455 save_input_locale));
3456 Safefree(save_input_locale);
3457 return FALSE;
3458 }
7d4bcc4a 3459# endif
97f4de96 3460
69014004
KW
3461 DEBUG_L(PerlIO_printf(Perl_debug_log,
3462 "Assuming locale %s is not a UTF-8 locale\n",
3463 save_input_locale));
fa9b773e 3464 Safefree(save_input_locale);
7d74bb61
KW
3465 return FALSE;
3466}
3467
8ef6e574 3468#endif
7d74bb61 3469
d6ded950
KW
3470
3471bool
3472Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
3473{
1a4f13e1 3474 dVAR;
d6ded950
KW
3475 /* Internal function which returns if we are in the scope of a pragma that
3476 * enables the locale category 'category'. 'compiling' should indicate if
3477 * this is during the compilation phase (TRUE) or not (FALSE). */
3478
3479 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
3480
3481 SV *categories = cop_hints_fetch_pvs(cop, "locale", 0);
3482 if (! categories || categories == &PL_sv_placeholder) {
3483 return FALSE;
3484 }
3485
3486 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
3487 * a valid unsigned */
3488 assert(category >= -1);
3489 return cBOOL(SvUV(categories) & (1U << (category + 1)));
3490}
3491
2c6ee1a7 3492char *
6ebbc862
KW
3493Perl_my_strerror(pTHX_ const int errnum)
3494{
3495 /* Returns a mortalized copy of the text of the error message associated
3496 * with 'errnum'. It uses the current locale's text unless the platform
3497 * doesn't have the LC_MESSAGES category or we are not being called from
3498 * within the scope of 'use locale'. In the former case, it uses whatever
3499 * strerror returns; in the latter case it uses the text from the C locale.
3500 *
3501 * The function just calls strerror(), but temporarily switches, if needed,
3502 * to the C locale */
3503
3504 char *errstr;
52770946 3505 dVAR;
6ebbc862 3506
52770946 3507#ifndef USE_LOCALE_MESSAGES
6ebbc862 3508
52770946
KW
3509 /* If platform doesn't have messages category, we don't do any switching to
3510 * the C locale; we just use whatever strerror() returns */
3511
3512 errstr = savepv(Strerror(errnum));
3513
3514#else /* Has locale messages */
3515
3516 const bool within_locale_scope = IN_LC(LC_MESSAGES);
2c6ee1a7 3517
7aaa36b1
KW
3518# if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
3519
3520 /* This function is trivial if we have strerror_l() */
3521
3522 if (within_locale_scope) {
3523 errstr = strerror(errnum);
3524 }
3525 else {
3526 errstr = strerror_l(errnum, PL_C_locale_obj);
3527 }
3528
3529 errstr = savepv(errstr);
3530
3531# else /* Doesn't have strerror_l(). */
3532
3533# ifdef USE_POSIX_2008_LOCALE
3534
fcd0e682 3535 locale_t save_locale = NULL;
7aaa36b1
KW
3536
3537# else
3538
fcd0e682 3539 char * save_locale = NULL;
c9dda6da 3540 bool locale_is_C = FALSE;
2c6ee1a7 3541
6ebbc862
KW
3542 /* We have a critical section to prevent another thread from changing the
3543 * locale out from under us (or zapping the buffer returned from
3544 * setlocale() ) */
3545 LOCALE_LOCK;
3546
7aaa36b1 3547# endif
6ebbc862 3548
9c8a6dc2
KW
3549 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3550 "my_strerror called with errnum %d\n", errnum));
6ebbc862 3551 if (! within_locale_scope) {
c9dda6da 3552 errno = 0;
a0b53297 3553
f1d2176b 3554# ifdef USE_POSIX_2008_LOCALE /* Use the thread-safe locale functions */
6ebbc862 3555
9c8a6dc2
KW
3556 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3557 "Not within locale scope, about to call"
3558 " uselocale(0x%p)\n", PL_C_locale_obj));
6ebbc862 3559 save_locale = uselocale(PL_C_locale_obj);
c9dda6da
KW
3560 if (! save_locale) {
3561 DEBUG_L(PerlIO_printf(Perl_debug_log,
9c8a6dc2
KW
3562 "uselocale failed, errno=%d\n", errno));
3563 }
3564 else {
3565 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3566 "uselocale returned 0x%p\n", save_locale));
c9dda6da 3567 }
6ebbc862 3568
7aaa36b1 3569# else /* Not thread-safe build */
a0b53297 3570
837ce802 3571 save_locale = do_setlocale_c(LC_MESSAGES, NULL);
c9dda6da
KW
3572 if (! save_locale) {
3573 DEBUG_L(PerlIO_printf(Perl_debug_log,
3574 "setlocale failed, errno=%d\n", errno));
3575 }
3576 else {
3577 locale_is_C = isNAME_C_OR_POSIX(save_locale);
2c6ee1a7 3578
c9dda6da
KW
3579 /* Switch to the C locale if not already in it */
3580 if (! locale_is_C) {
2c6ee1a7 3581
c9dda6da
KW
3582 /* The setlocale() just below likely will zap 'save_locale', so
3583 * create a copy. */
3584 save_locale = savepv(save_locale);
837ce802 3585 do_setlocale_c(LC_MESSAGES, "C");
c9dda6da 3586 }
6ebbc862 3587 }
2c6ee1a7 3588
7aaa36b1 3589# endif
2c6ee1a7 3590
6ebbc862 3591 } /* end of ! within_locale_scope */
9c8a6dc2
KW
3592 else {
3593 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n",
3594 __FILE__, __LINE__));
3595 }
a0b53297 3596
9c8a6dc2
KW
3597 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3598 "Any locale change has been done; about to call Strerror\n"));
52770946 3599 errstr = savepv(Strerror(errnum));
6ebbc862
KW
3600
3601 if (! within_locale_scope) {
c9dda6da 3602 errno = 0;
a0b53297 3603
f1d2176b 3604# ifdef USE_POSIX_2008_LOCALE
6ebbc862 3605
9c8a6dc2
KW
3606 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3607 "%s: %d: not within locale scope, restoring the locale\n",
3608 __FILE__, __LINE__));
c9dda6da
KW
3609 if (save_locale && ! uselocale(save_locale)) {
3610 DEBUG_L(PerlIO_printf(Perl_debug_log,
3611 "uselocale restore failed, errno=%d\n", errno));
3612 }
2c6ee1a7 3613 }
6ebbc862 3614
7aaa36b1 3615# else
6ebbc862 3616
c9dda6da 3617 if (save_locale && ! locale_is_C) {
837ce802 3618 if (! do_setlocale_c(LC_MESSAGES, save_locale)) {
c9dda6da
KW
3619 DEBUG_L(PerlIO_printf(Perl_debug_log,
3620 "setlocale restore failed, errno=%d\n", errno));
3621 }
6ebbc862
KW
3622 Safefree(save_locale);
3623 }
3624 }
3625
3626 LOCALE_UNLOCK;
3627
7aaa36b1
KW
3628# endif
3629# endif /* End of doesn't have strerror_l */
52770946 3630#endif /* End of does have locale messages */
6affbbf0
KW
3631
3632#ifdef DEBUGGING
3633
3634 if (DEBUG_Lv_TEST) {
3635 PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '");
3636 print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);
3637 PerlIO_printf(Perl_debug_log, "'\n");
3638 }
3639
2c6ee1a7
KW
3640#endif
3641
52770946 3642 SAVEFREEPV(errstr);
6ebbc862 3643 return errstr;
2c6ee1a7
KW
3644}
3645
66610fdd 3646/*
747c467a 3647
747c467a
KW
3648=for apidoc sync_locale
3649
3650Changing the program's locale should be avoided by XS code. Nevertheless,
3651certain non-Perl libraries called from XS, such as C<Gtk> do so. When this
3652happens, Perl needs to be told that the locale has changed. Use this function
3653to do so, before returning to Perl.
3654
3655=cut
3656*/
3657
3658void
3659Perl_sync_locale(pTHX)
3660{
9f82ea3e 3661 char * newlocale;
747c467a
KW
3662
3663#ifdef USE_LOCALE_CTYPE
7d4bcc4a 3664
9f82ea3e
KW
3665 newlocale = do_setlocale_c(LC_CTYPE, NULL);
3666 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3667 "%s:%d: %s\n", __FILE__, __LINE__,
3668 setlocale_debug_string(LC_CTYPE, NULL, newlocale)));
3669 new_ctype(newlocale);
747c467a 3670
7d4bcc4a 3671#endif /* USE_LOCALE_CTYPE */
747c467a 3672#ifdef USE_LOCALE_COLLATE
7d4bcc4a 3673
9f82ea3e
KW
3674 newlocale = do_setlocale_c(LC_COLLATE, NULL);
3675 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3676 "%s:%d: %s\n", __FILE__, __LINE__,
3677 setlocale_debug_string(LC_COLLATE, NULL, newlocale)));
3678 new_collate(newlocale);
747c467a 3679
7d4bcc4a 3680#endif
747c467a 3681#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 3682
9f82ea3e
KW
3683 newlocale = do_setlocale_c(LC_NUMERIC, NULL);
3684 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3685 "%s:%d: %s\n", __FILE__, __LINE__,
3686 setlocale_debug_string(LC_NUMERIC, NULL, newlocale)));
3687 new_numeric(newlocale);
7d4bcc4a 3688
747c467a
KW
3689#endif /* USE_LOCALE_NUMERIC */
3690
3691}
3692
5d1187d1
KW
3693#if defined(DEBUGGING) && defined(USE_LOCALE)
3694
a4f00dcc
KW
3695STATIC char *
3696S_setlocale_debug_string(const int category, /* category number,
5d1187d1
KW
3697 like LC_ALL */
3698 const char* const locale, /* locale name */
3699
3700 /* return value from setlocale() when attempting to
3701 * set 'category' to 'locale' */
3702 const char* const retval)
3703{
3704 /* Returns a pointer to a NUL-terminated string in static storage with
3705 * added text about the info passed in. This is not thread safe and will
3706 * be overwritten by the next call, so this should be used just to
fa07b8e5 3707 * formulate a string to immediately print or savepv() on. */
5d1187d1 3708
398a990f
DM
3709 /* initialise to a non-null value to keep it out of BSS and so keep
3710 * -DPERL_GLOBAL_STRUCT_PRIVATE happy */
60b45a7d
KW
3711 static char ret[128] = "If you can read this, thank your buggy C"
3712 " library strlcpy(), and change your hints file"
3713 " to undef it";
fa07b8e5 3714 my_strlcpy(ret, "setlocale(", sizeof(ret));
5d1187d1
KW
3715
3716 switch (category) {
3717 default:
fa07b8e5 3718 my_snprintf(ret, sizeof(ret), "%s? %d", ret, category);
5d1187d1 3719 break;
7d4bcc4a
KW
3720
3721# ifdef LC_ALL
3722
5d1187d1 3723 case LC_ALL:
fa07b8e5 3724 my_strlcat(ret, "LC_ALL", sizeof(ret));
5d1187d1 3725 break;
7d4bcc4a
KW
3726
3727# endif
3728# ifdef LC_CTYPE
3729
5d1187d1