This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Change name of internal function
[perl5.git] / locale.c
CommitLineData
98994639
HS
1/* locale.c
2 *
1129b882
NC
3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
98994639
HS
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550 12 * A Elbereth Gilthoniel,
cdad3b53 13 * silivren penna míriel
4ac71550 14 * o menel aglar elenath!
cdad3b53 15 * Na-chaered palan-díriel
4ac71550
TC
16 * o galadhremmin ennorath,
17 * Fanuilos, le linnathon
18 * nef aear, si nef aearon!
19 *
20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
98994639
HS
21 */
22
166f8a29
DM
23/* utility functions for handling locale-specific stuff like what
24 * character represents the decimal point.
0d071d52 25 *
7d4bcc4a
KW
26 * All C programs have an underlying locale. Perl code generally doesn't pay
27 * any attention to it except within the scope of a 'use locale'. For most
0d071d52
KW
28 * categories, it accomplishes this by just using different operations if it is
29 * in such scope than if not. However, various libc functions called by Perl
30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31 * are used to toggle between the current locale and the C locale depending on
a9ad02a8
KW
32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is
33 * switched to the C locale for outputting the message unless within the scope
34 * of 'use locale'.
166f8a29
DM
35 */
36
98994639
HS
37#include "EXTERN.h"
38#define PERL_IN_LOCALE_C
f7416781 39#include "perl_langinfo.h"
98994639
HS
40#include "perl.h"
41
a4af207c
JH
42#include "reentr.h"
43
2fcc0ca9
KW
44/* If the environment says to, we can output debugging information during
45 * initialization. This is done before option parsing, and before any thread
46 * creation, so can be a file-level static */
47#ifdef DEBUGGING
7d4bcc4a 48# ifdef PERL_GLOBAL_STRUCT
27cdc72e 49 /* no global syms allowed */
7d4bcc4a
KW
50# define debug_initialization 0
51# define DEBUG_INITIALIZATION_set(v)
52# else
2fcc0ca9 53static bool debug_initialization = FALSE;
7d4bcc4a
KW
54# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
55# endif
2fcc0ca9
KW
56#endif
57
8ef6e574
KW
58#ifdef USE_LOCALE
59
98994639 60/*
0d071d52
KW
61 * Standardize the locale name from a string returned by 'setlocale', possibly
62 * modifying that string.
98994639 63 *
0ef2a2b2 64 * The typical return value of setlocale() is either
98994639
HS
65 * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
66 * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
67 * (the space-separated values represent the various sublocales,
0ef2a2b2 68 * in some unspecified order). This is not handled by this function.
98994639
HS
69 *
70 * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
0ef2a2b2
KW
71 * which is harmful for further use of the string in setlocale(). This
72 * function removes the trailing new line and everything up through the '='
98994639
HS
73 *
74 */
75STATIC char *
76S_stdize_locale(pTHX_ char *locs)
77{
7452cf6a 78 const char * const s = strchr(locs, '=');
98994639
HS
79 bool okay = TRUE;
80
7918f24d
NC
81 PERL_ARGS_ASSERT_STDIZE_LOCALE;
82
8772537c
AL
83 if (s) {
84 const char * const t = strchr(s, '.');
98994639 85 okay = FALSE;
8772537c
AL
86 if (t) {
87 const char * const u = strchr(t, '\n');
88 if (u && (u[1] == 0)) {
89 const STRLEN len = u - s;
90 Move(s + 1, locs, len, char);
91 locs[len] = 0;
92 okay = TRUE;
98994639
HS
93 }
94 }
95 }
96
97 if (!okay)
98 Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
99
100 return locs;
101}
102
8ef6e574
KW
103#endif
104
d2b24094
KW
105/* Windows requres a customized base-level setlocale() */
106# ifdef WIN32
107# define my_setlocale(cat, locale) win32_setlocale(cat, locale)
108# else
109# define my_setlocale(cat, locale) setlocale(cat, locale)
110# endif
111
837ce802
KW
112/* Just placeholders for now. "_c" is intended to be called when the category
113 * is a constant known at compile time; "_r", not known until run time */
d2b24094
KW
114# define do_setlocale_c(category, locale) my_setlocale(category, locale)
115# define do_setlocale_r(category, locale) my_setlocale(category, locale)
837ce802 116
a4f00dcc 117STATIC void
86799d2d 118S_set_numeric_radix(pTHX_ const bool use_locale)
98994639 119{
86799d2d
KW
120 /* If 'use_locale' is FALSE, set to use a dot for the radix character. If
121 * TRUE, use the radix character derived from the current locale */
7d4bcc4a 122
86799d2d
KW
123#if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \
124 || defined(HAS_NL_LANGINFO))
98994639 125
86799d2d
KW
126 /* We only set up the radix SV if we are to use a locale radix ... */
127 if (use_locale) {
128 const char * radix = my_nl_langinfo(PERL_RADIXCHAR, FALSE);
129 /* FALSE => already in dest locale */
130
131 /* ... and the character being used isn't a dot */
132 if (strNE(radix, ".")) {
133 if (PL_numeric_radix_sv) {
134 sv_setpv(PL_numeric_radix_sv, radix);
135 }
136 else {
137 PL_numeric_radix_sv = newSVpv(radix, 0);
138 }
139
140 if ( ! is_utf8_invariant_string(
141 (U8 *) SvPVX(PL_numeric_radix_sv), SvCUR(PL_numeric_radix_sv))
142 && is_utf8_string(
143 (U8 *) SvPVX(PL_numeric_radix_sv), SvCUR(PL_numeric_radix_sv))
c1284011 144 && _is_cur_LC_category_utf8(LC_NUMERIC))
28acfe03 145 {
86799d2d 146 SvUTF8_on(PL_numeric_radix_sv);
28acfe03 147 }
86799d2d
KW
148 goto done;
149 }
98994639 150 }
69014004 151
86799d2d
KW
152 SvREFCNT_dec(PL_numeric_radix_sv);
153 PL_numeric_radix_sv = NULL;
154
155 done: ;
156
157# ifdef DEBUGGING
7d4bcc4a 158
2fcc0ca9
KW
159 if (DEBUG_L_TEST || debug_initialization) {
160 PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
69014004 161 (PL_numeric_radix_sv)
37b7e435
KW
162 ? SvPVX(PL_numeric_radix_sv)
163 : "NULL",
164 (PL_numeric_radix_sv)
39eb7305 165 ? cBOOL(SvUTF8(PL_numeric_radix_sv))
2fcc0ca9
KW
166 : 0);
167 }
69014004 168
86799d2d
KW
169# endif
170#endif /* USE_LOCALE_NUMERIC and can find the radix char */
7d4bcc4a 171
98994639
HS
172}
173
a39edc4c
KW
174/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
175 * return of setlocale(), then this is extremely likely to be the C or POSIX
176 * locale. However, the output of setlocale() is documented to be opaque, but
177 * the odds are extremely small that it would return these two strings for some
178 * other locale. Note that VMS in these two locales includes many non-ASCII
179 * characters as controls and punctuation (below are hex bytes):
180 * cntrl: 00-1F 7F 84-97 9B-9F
181 * punct: 21-2F 3A-40 5B-60 7B-7E A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
182 * Oddly, none there are listed as alphas, though some represent alphabetics
183 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
7d4bcc4a
KW
184#define isNAME_C_OR_POSIX(name) \
185 ( (name) != NULL \
186 && (( *(name) == 'C' && (*(name + 1)) == '\0') \
187 || strEQ((name), "POSIX")))
a39edc4c 188
98994639 189void
8772537c 190Perl_new_numeric(pTHX_ const char *newnum)
98994639 191{
7d4bcc4a
KW
192
193#ifndef USE_LOCALE_NUMERIC
194
195 PERL_UNUSED_ARG(newnum);
196
197#else
0d071d52
KW
198
199 /* Called after all libc setlocale() calls affecting LC_NUMERIC, to tell
200 * core Perl this and that 'newnum' is the name of the new locale.
201 * It installs this locale as the current underlying default.
202 *
203 * The default locale and the C locale can be toggled between by use of the
5792c642
KW
204 * set_numeric_underlying() and set_numeric_standard() functions, which
205 * should probably not be called directly, but only via macros like
0d071d52
KW
206 * SET_NUMERIC_STANDARD() in perl.h.
207 *
208 * The toggling is necessary mainly so that a non-dot radix decimal point
209 * character can be output, while allowing internal calculations to use a
210 * dot.
211 *
212 * This sets several interpreter-level variables:
bb304765 213 * PL_numeric_name The underlying locale's name: a copy of 'newnum'
892e6465 214 * PL_numeric_underlying A boolean indicating if the toggled state is such
7738054c
KW
215 * that the current locale is the program's underlying
216 * locale
217 * PL_numeric_standard An int indicating if the toggled state is such
218 * that the current locale is the C locale. If non-zero,
219 * it is in C; if > 1, it means it may not be toggled away
220 * from C.
0d071d52
KW
221 * Note that both of the last two variables can be true at the same time,
222 * if the underlying locale is C. (Toggling is a no-op under these
223 * circumstances.)
224 *
225 * Any code changing the locale (outside this file) should use
226 * POSIX::setlocale, which calls this function. Therefore this function
227 * should be called directly only from this file and from
228 * POSIX::setlocale() */
229
b03f34cf 230 char *save_newnum;
98994639
HS
231
232 if (! newnum) {
43c5f42d
NC
233 Safefree(PL_numeric_name);
234 PL_numeric_name = NULL;
98994639 235 PL_numeric_standard = TRUE;
892e6465 236 PL_numeric_underlying = TRUE;
98994639
HS
237 return;
238 }
239
b03f34cf 240 save_newnum = stdize_locale(savepv(newnum));
abe1abcf
KW
241
242 PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
892e6465 243 PL_numeric_underlying = TRUE;
abe1abcf 244
b03f34cf 245 if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
98994639 246 Safefree(PL_numeric_name);
b03f34cf 247 PL_numeric_name = save_newnum;
b03f34cf 248 }
abe1abcf
KW
249 else {
250 Safefree(save_newnum);
251 }
4c28b29c
KW
252
253 /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't
254 * have to worry about the radix being a non-dot. (Core operations that
255 * need the underlying locale change to it temporarily). */
256 set_numeric_standard();
257
98994639 258#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 259
98994639
HS
260}
261
262void
263Perl_set_numeric_standard(pTHX)
264{
7d4bcc4a 265
98994639 266#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 267
28c1bf33
KW
268 /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like
269 * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The
270 * macro avoids calling this routine if toggling isn't necessary according
271 * to our records (which could be wrong if some XS code has changed the
272 * locale behind our back) */
0d071d52 273
837ce802 274 do_setlocale_c(LC_NUMERIC, "C");
a9b8c0d8 275 PL_numeric_standard = TRUE;
892e6465 276 PL_numeric_underlying = isNAME_C_OR_POSIX(PL_numeric_name);
86799d2d 277 set_numeric_radix(0);
7d4bcc4a
KW
278
279# ifdef DEBUGGING
280
2fcc0ca9
KW
281 if (DEBUG_L_TEST || debug_initialization) {
282 PerlIO_printf(Perl_debug_log,
283 "Underlying LC_NUMERIC locale now is C\n");
284 }
98994639 285
7d4bcc4a 286# endif
98994639 287#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 288
98994639
HS
289}
290
291void
5792c642 292Perl_set_numeric_underlying(pTHX)
98994639 293{
7d4bcc4a 294
98994639 295#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 296
28c1bf33 297 /* Toggle the LC_NUMERIC locale to the current underlying default. Most
7d4bcc4a
KW
298 * code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h
299 * instead of calling this directly. The macro avoids calling this routine
300 * if toggling isn't necessary according to our records (which could be
301 * wrong if some XS code has changed the locale behind our back) */
a9b8c0d8 302
837ce802 303 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
a9b8c0d8 304 PL_numeric_standard = isNAME_C_OR_POSIX(PL_numeric_name);
892e6465 305 PL_numeric_underlying = TRUE;
86799d2d 306 set_numeric_radix(1);
7d4bcc4a
KW
307
308# ifdef DEBUGGING
309
2fcc0ca9
KW
310 if (DEBUG_L_TEST || debug_initialization) {
311 PerlIO_printf(Perl_debug_log,
69014004 312 "Underlying LC_NUMERIC locale now is %s\n",
2fcc0ca9
KW
313 PL_numeric_name);
314 }
98994639 315
7d4bcc4a 316# endif
98994639 317#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 318
98994639
HS
319}
320
321/*
322 * Set up for a new ctype locale.
323 */
a4f00dcc
KW
324STATIC void
325S_new_ctype(pTHX_ const char *newctype)
98994639 326{
7d4bcc4a
KW
327
328#ifndef USE_LOCALE_CTYPE
329
330 PERL_ARGS_ASSERT_NEW_CTYPE;
331 PERL_UNUSED_ARG(newctype);
332 PERL_UNUSED_CONTEXT;
333
334#else
0d071d52
KW
335
336 /* Called after all libc setlocale() calls affecting LC_CTYPE, to tell
337 * core Perl this and that 'newctype' is the name of the new locale.
338 *
339 * This function sets up the folding arrays for all 256 bytes, assuming
340 * that tofold() is tolc() since fold case is not a concept in POSIX,
341 *
342 * Any code changing the locale (outside this file) should use
343 * POSIX::setlocale, which calls this function. Therefore this function
344 * should be called directly only from this file and from
345 * POSIX::setlocale() */
346
27da23d5 347 dVAR;
68067e4e 348 UV i;
98994639 349
7918f24d
NC
350 PERL_ARGS_ASSERT_NEW_CTYPE;
351
215c5139
KW
352 /* We will replace any bad locale warning with 1) nothing if the new one is
353 * ok; or 2) a new warning for the bad new locale */
354 if (PL_warn_locale) {
355 SvREFCNT_dec_NN(PL_warn_locale);
356 PL_warn_locale = NULL;
357 }
358
c1284011 359 PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
31f05a37
KW
360
361 /* A UTF-8 locale gets standard rules. But note that code still has to
362 * handle this specially because of the three problematic code points */
363 if (PL_in_utf8_CTYPE_locale) {
364 Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
365 }
366 else {
8c6180a9
KW
367 /* Assume enough space for every character being bad. 4 spaces each
368 * for the 94 printable characters that are output like "'x' "; and 5
369 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
370 * NUL */
371 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ];
372
cc9eaeb0
KW
373 /* Don't check for problems if we are suppressing the warnings */
374 bool check_for_problems = ckWARN_d(WARN_LOCALE)
375 || UNLIKELY(DEBUG_L_TEST);
8c6180a9
KW
376 bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
377 to start */
378 unsigned int bad_count = 0; /* Count of bad characters */
379
baa60164
KW
380 for (i = 0; i < 256; i++) {
381 if (isUPPER_LC((U8) i))
382 PL_fold_locale[i] = (U8) toLOWER_LC((U8) i);
383 else if (isLOWER_LC((U8) i))
384 PL_fold_locale[i] = (U8) toUPPER_LC((U8) i);
385 else
386 PL_fold_locale[i] = (U8) i;
8c6180a9
KW
387
388 /* If checking for locale problems, see if the native ASCII-range
389 * printables plus \n and \t are in their expected categories in
390 * the new locale. If not, this could mean big trouble, upending
391 * Perl's and most programs' assumptions, like having a
392 * metacharacter with special meaning become a \w. Fortunately,
393 * it's very rare to find locales that aren't supersets of ASCII
394 * nowadays. It isn't a problem for most controls to be changed
395 * into something else; we check only \n and \t, though perhaps \r
396 * could be an issue as well. */
7d4bcc4a 397 if ( check_for_problems
8c6180a9
KW
398 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
399 {
7d4bcc4a 400 if (( isALPHANUMERIC_A(i) && ! isALPHANUMERIC_LC(i))
8c6180a9
KW
401 || (isPUNCT_A(i) && ! isPUNCT_LC(i))
402 || (isBLANK_A(i) && ! isBLANK_LC(i))
403 || (i == '\n' && ! isCNTRL_LC(i)))
404 {
405 if (bad_count) { /* Separate multiple entries with a
406 blank */
407 bad_chars_list[bad_count++] = ' ';
408 }
409 bad_chars_list[bad_count++] = '\'';
410 if (isPRINT_A(i)) {
411 bad_chars_list[bad_count++] = (char) i;
412 }
413 else {
414 bad_chars_list[bad_count++] = '\\';
415 if (i == '\n') {
416 bad_chars_list[bad_count++] = 'n';
417 }
418 else {
419 assert(i == '\t');
420 bad_chars_list[bad_count++] = 't';
421 }
422 }
423 bad_chars_list[bad_count++] = '\'';
424 bad_chars_list[bad_count] = '\0';
425 }
426 }
427 }
428
7d4bcc4a
KW
429# ifdef MB_CUR_MAX
430
8c6180a9 431 /* We only handle single-byte locales (outside of UTF-8 ones; so if
d35fca5f 432 * this locale requires more than one byte, there are going to be
8c6180a9 433 * problems. */
9c8a6dc2
KW
434 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
435 "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
436 __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
437
ba1a4362
KW
438 if (check_for_problems && MB_CUR_MAX > 1
439
440 /* Some platforms return MB_CUR_MAX > 1 for even the "C"
441 * locale. Just assume that the implementation for them (plus
442 * for POSIX) is correct and the > 1 value is spurious. (Since
443 * these are specially handled to never be considered UTF-8
444 * locales, as long as this is the only problem, everything
445 * should work fine */
446 && strNE(newctype, "C") && strNE(newctype, "POSIX"))
447 {
8c6180a9
KW
448 multi_byte_locale = TRUE;
449 }
7d4bcc4a
KW
450
451# endif
8c6180a9
KW
452
453 if (bad_count || multi_byte_locale) {
780fcc9f 454 PL_warn_locale = Perl_newSVpvf(aTHX_
8c6180a9 455 "Locale '%s' may not work well.%s%s%s\n",
780fcc9f 456 newctype,
8c6180a9
KW
457 (multi_byte_locale)
458 ? " Some characters in it are not recognized by"
459 " Perl."
460 : "",
461 (bad_count)
462 ? "\nThe following characters (and maybe others)"
463 " may not have the same meaning as the Perl"
464 " program expects:\n"
465 : "",
466 (bad_count)
467 ? bad_chars_list
468 : ""
469 );
cc9eaeb0 470 /* If we are actually in the scope of the locale or are debugging,
bddebb56
KW
471 * output the message now. If not in that scope, we save the
472 * message to be output at the first operation using this locale,
473 * if that actually happens. Most programs don't use locales, so
474 * they are immune to bad ones. */
cc9eaeb0 475 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
780fcc9f
KW
476
477 /* We have to save 'newctype' because the setlocale() just
478 * below may destroy it. The next setlocale() further down
479 * should restore it properly so that the intermediate change
480 * here is transparent to this function's caller */
481 const char * const badlocale = savepv(newctype);
482
837ce802 483 do_setlocale_c(LC_CTYPE, "C");
780fcc9f
KW
484
485 /* The '0' below suppresses a bogus gcc compiler warning */
486 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
bddebb56 487
837ce802 488 do_setlocale_c(LC_CTYPE, badlocale);
c0f3a893 489 Safefree(badlocale);
bddebb56
KW
490
491 if (IN_LC(LC_CTYPE)) {
492 SvREFCNT_dec_NN(PL_warn_locale);
493 PL_warn_locale = NULL;
494 }
780fcc9f 495 }
baa60164 496 }
31f05a37 497 }
98994639
HS
498
499#endif /* USE_LOCALE_CTYPE */
7d4bcc4a 500
98994639
HS
501}
502
98994639 503void
2726666d
KW
504Perl__warn_problematic_locale()
505{
2726666d
KW
506
507#ifdef USE_LOCALE_CTYPE
508
5f04a188
KW
509 dTHX;
510
511 /* Internal-to-core function that outputs the message in PL_warn_locale,
512 * and then NULLS it. Should be called only through the macro
513 * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */
514
2726666d
KW
515 if (PL_warn_locale) {
516 /*GCC_DIAG_IGNORE(-Wformat-security); Didn't work */
517 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
518 SvPVX(PL_warn_locale),
519 0 /* dummy to avoid compiler warning */ );
520 /* GCC_DIAG_RESTORE; */
521 SvREFCNT_dec_NN(PL_warn_locale);
522 PL_warn_locale = NULL;
523 }
524
525#endif
526
527}
528
a4f00dcc
KW
529STATIC void
530S_new_collate(pTHX_ const char *newcoll)
98994639 531{
7d4bcc4a
KW
532
533#ifndef USE_LOCALE_COLLATE
534
535 PERL_UNUSED_ARG(newcoll);
536 PERL_UNUSED_CONTEXT;
537
538#else
0d071d52
KW
539
540 /* Called after all libc setlocale() calls affecting LC_COLLATE, to tell
541 * core Perl this and that 'newcoll' is the name of the new locale.
542 *
d35fca5f
KW
543 * The design of locale collation is that every locale change is given an
544 * index 'PL_collation_ix'. The first time a string particpates in an
545 * operation that requires collation while locale collation is active, it
546 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
547 * magic includes the collation index, and the transformation of the string
548 * by strxfrm(), q.v. That transformation is used when doing comparisons,
549 * instead of the string itself. If a string changes, the magic is
550 * cleared. The next time the locale changes, the index is incremented,
551 * and so we know during a comparison that the transformation is not
552 * necessarily still valid, and so is recomputed. Note that if the locale
553 * changes enough times, the index could wrap (a U32), and it is possible
554 * that a transformation would improperly be considered valid, leading to
555 * an unlikely bug */
0d071d52 556
98994639
HS
557 if (! newcoll) {
558 if (PL_collation_name) {
559 ++PL_collation_ix;
560 Safefree(PL_collation_name);
561 PL_collation_name = NULL;
562 }
563 PL_collation_standard = TRUE;
00bf60ca 564 is_standard_collation:
98994639
HS
565 PL_collxfrm_base = 0;
566 PL_collxfrm_mult = 2;
165a1c52 567 PL_in_utf8_COLLATE_locale = FALSE;
f28f4d2a 568 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 569 PL_strxfrm_max_cp = 0;
98994639
HS
570 return;
571 }
572
d35fca5f 573 /* If this is not the same locale as currently, set the new one up */
98994639
HS
574 if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
575 ++PL_collation_ix;
576 Safefree(PL_collation_name);
577 PL_collation_name = stdize_locale(savepv(newcoll));
a39edc4c 578 PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
00bf60ca
KW
579 if (PL_collation_standard) {
580 goto is_standard_collation;
581 }
98994639 582
165a1c52 583 PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
f28f4d2a 584 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 585 PL_strxfrm_max_cp = 0;
165a1c52 586
59c018b9
KW
587 /* A locale collation definition includes primary, secondary, tertiary,
588 * etc. weights for each character. To sort, the primary weights are
589 * used, and only if they compare equal, then the secondary weights are
590 * used, and only if they compare equal, then the tertiary, etc.
591 *
592 * strxfrm() works by taking the input string, say ABC, and creating an
593 * output transformed string consisting of first the primary weights,
594 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
595 * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
596 * may not have weights at every level. In our example, let's say B
597 * doesn't have a tertiary weight, and A doesn't have a secondary
598 * weight. The constructed string is then going to be
599 * A¹B¹C¹ B²C² A³C³ ....
600 * This has the desired effect that strcmp() will look at the secondary
601 * or tertiary weights only if the strings compare equal at all higher
602 * priority weights. The spaces shown here, like in
c342d20e 603 * "A¹B¹C¹ A²B²C² "
59c018b9
KW
604 * are not just for readability. In the general case, these must
605 * actually be bytes, which we will call here 'separator weights'; and
606 * they must be smaller than any other weight value, but since these
607 * are C strings, only the terminating one can be a NUL (some
608 * implementations may include a non-NUL separator weight just before
609 * the NUL). Implementations tend to reserve 01 for the separator
610 * weights. They are needed so that a shorter string's secondary
611 * weights won't be misconstrued as primary weights of a longer string,
612 * etc. By making them smaller than any other weight, the shorter
613 * string will sort first. (Actually, if all secondary weights are
614 * smaller than all primary ones, there is no need for a separator
615 * weight between those two levels, etc.)
616 *
617 * The length of the transformed string is roughly a linear function of
618 * the input string. It's not exactly linear because some characters
619 * don't have weights at all levels. When we call strxfrm() we have to
620 * allocate some memory to hold the transformed string. The
621 * calculations below try to find coefficients 'm' and 'b' for this
622 * locale so that m*x + b equals how much space we need, given the size
623 * of the input string in 'x'. If we calculate too small, we increase
624 * the size as needed, and call strxfrm() again, but it is better to
625 * get it right the first time to avoid wasted expensive string
626 * transformations. */
627
98994639 628 {
79f120c8
KW
629 /* We use the string below to find how long the tranformation of it
630 * is. Almost all locales are supersets of ASCII, or at least the
631 * ASCII letters. We use all of them, half upper half lower,
632 * because if we used fewer, we might hit just the ones that are
633 * outliers in a particular locale. Most of the strings being
634 * collated will contain a preponderance of letters, and even if
635 * they are above-ASCII, they are likely to have the same number of
636 * weight levels as the ASCII ones. It turns out that digits tend
637 * to have fewer levels, and some punctuation has more, but those
638 * are relatively sparse in text, and khw believes this gives a
639 * reasonable result, but it could be changed if experience so
640 * dictates. */
641 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
642 char * x_longer; /* Transformed 'longer' */
643 Size_t x_len_longer; /* Length of 'x_longer' */
644
645 char * x_shorter; /* We also transform a substring of 'longer' */
646 Size_t x_len_shorter;
647
a4a439fb 648 /* _mem_collxfrm() is used get the transformation (though here we
79f120c8
KW
649 * are interested only in its length). It is used because it has
650 * the intelligence to handle all cases, but to work, it needs some
651 * values of 'm' and 'b' to get it started. For the purposes of
652 * this calculation we use a very conservative estimate of 'm' and
653 * 'b'. This assumes a weight can be multiple bytes, enough to
654 * hold any UV on the platform, and there are 5 levels, 4 weight
655 * bytes, and a trailing NUL. */
656 PL_collxfrm_base = 5;
657 PL_collxfrm_mult = 5 * sizeof(UV);
658
659 /* Find out how long the transformation really is */
a4a439fb
KW
660 x_longer = _mem_collxfrm(longer,
661 sizeof(longer) - 1,
662 &x_len_longer,
663
664 /* We avoid converting to UTF-8 in the
665 * called function by telling it the
666 * string is in UTF-8 if the locale is a
667 * UTF-8 one. Since the string passed
668 * here is invariant under UTF-8, we can
669 * claim it's UTF-8 even though it isn't.
670 * */
671 PL_in_utf8_COLLATE_locale);
79f120c8
KW
672 Safefree(x_longer);
673
674 /* Find out how long the transformation of a substring of 'longer'
675 * is. Together the lengths of these transformations are
676 * sufficient to calculate 'm' and 'b'. The substring is all of
677 * 'longer' except the first character. This minimizes the chances
678 * of being swayed by outliers */
a4a439fb 679 x_shorter = _mem_collxfrm(longer + 1,
79f120c8 680 sizeof(longer) - 2,
a4a439fb
KW
681 &x_len_shorter,
682 PL_in_utf8_COLLATE_locale);
79f120c8
KW
683 Safefree(x_shorter);
684
685 /* If the results are nonsensical for this simple test, the whole
686 * locale definition is suspect. Mark it so that locale collation
687 * is not active at all for it. XXX Should we warn? */
688 if ( x_len_shorter == 0
689 || x_len_longer == 0
690 || x_len_shorter >= x_len_longer)
691 {
692 PL_collxfrm_mult = 0;
693 PL_collxfrm_base = 0;
694 }
695 else {
696 SSize_t base; /* Temporary */
697
698 /* We have both: m * strlen(longer) + b = x_len_longer
699 * m * strlen(shorter) + b = x_len_shorter;
700 * subtracting yields:
701 * m * (strlen(longer) - strlen(shorter))
702 * = x_len_longer - x_len_shorter
703 * But we have set things up so that 'shorter' is 1 byte smaller
704 * than 'longer'. Hence:
705 * m = x_len_longer - x_len_shorter
706 *
707 * But if something went wrong, make sure the multiplier is at
708 * least 1.
709 */
710 if (x_len_longer > x_len_shorter) {
711 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
712 }
713 else {
714 PL_collxfrm_mult = 1;
715 }
716
717 /* mx + b = len
718 * so: b = len - mx
719 * but in case something has gone wrong, make sure it is
720 * non-negative */
721 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
722 if (base < 0) {
723 base = 0;
724 }
725
726 /* Add 1 for the trailing NUL */
727 PL_collxfrm_base = base + 1;
728 }
58eebef2 729
7d4bcc4a
KW
730# ifdef DEBUGGING
731
58eebef2
KW
732 if (DEBUG_L_TEST || debug_initialization) {
733 PerlIO_printf(Perl_debug_log,
b07929e4
KW
734 "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
735 "x_len_longer=%zu,"
736 " collate multipler=%zu, collate base=%zu\n",
58eebef2
KW
737 __FILE__, __LINE__,
738 PL_in_utf8_COLLATE_locale,
739 x_len_shorter, x_len_longer,
740 PL_collxfrm_mult, PL_collxfrm_base);
741 }
7d4bcc4a
KW
742# endif
743
98994639
HS
744 }
745 }
746
747#endif /* USE_LOCALE_COLLATE */
7d4bcc4a 748
98994639
HS
749}
750
d2b24094 751#ifdef WIN32
b8cc575c 752
a4f00dcc 753STATIC char *
b8cc575c 754S_win32_setlocale(pTHX_ int category, const char* locale)
b385bb4d
KW
755{
756 /* This, for Windows, emulates POSIX setlocale() behavior. There is no
7d4bcc4a
KW
757 * difference between the two unless the input locale is "", which normally
758 * means on Windows to get the machine default, which is set via the
759 * computer's "Regional and Language Options" (or its current equivalent).
760 * In POSIX, it instead means to find the locale from the user's
761 * environment. This routine changes the Windows behavior to first look in
762 * the environment, and, if anything is found, use that instead of going to
763 * the machine default. If there is no environment override, the machine
764 * default is used, by calling the real setlocale() with "".
765 *
766 * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
767 * use the particular category's variable if set; otherwise to use the LANG
768 * variable. */
b385bb4d 769
175c4cf9 770 bool override_LC_ALL = FALSE;
89f7b9aa
KW
771 char * result;
772
b385bb4d 773 if (locale && strEQ(locale, "")) {
7d4bcc4a
KW
774
775# ifdef LC_ALL
776
b385bb4d
KW
777 locale = PerlEnv_getenv("LC_ALL");
778 if (! locale) {
7d4bcc4a
KW
779
780# endif
b385bb4d 781 switch (category) {
7d4bcc4a
KW
782
783# ifdef LC_ALL
b385bb4d 784 case LC_ALL:
481465ea 785 override_LC_ALL = TRUE;
b385bb4d 786 break; /* We already know its variable isn't set */
7d4bcc4a
KW
787
788# endif
789# ifdef USE_LOCALE_TIME
790
b385bb4d
KW
791 case LC_TIME:
792 locale = PerlEnv_getenv("LC_TIME");
793 break;
7d4bcc4a
KW
794
795# endif
796# ifdef USE_LOCALE_CTYPE
797
b385bb4d
KW
798 case LC_CTYPE:
799 locale = PerlEnv_getenv("LC_CTYPE");
800 break;
7d4bcc4a
KW
801
802# endif
803# ifdef USE_LOCALE_COLLATE
804
b385bb4d
KW
805 case LC_COLLATE:
806 locale = PerlEnv_getenv("LC_COLLATE");
807 break;
7d4bcc4a
KW
808
809# endif
810# ifdef USE_LOCALE_MONETARY
811
b385bb4d
KW
812 case LC_MONETARY:
813 locale = PerlEnv_getenv("LC_MONETARY");
814 break;
7d4bcc4a
KW
815
816# endif
817# ifdef USE_LOCALE_NUMERIC
818
b385bb4d
KW
819 case LC_NUMERIC:
820 locale = PerlEnv_getenv("LC_NUMERIC");
821 break;
7d4bcc4a
KW
822
823# endif
824# ifdef USE_LOCALE_MESSAGES
825
b385bb4d
KW
826 case LC_MESSAGES:
827 locale = PerlEnv_getenv("LC_MESSAGES");
828 break;
7d4bcc4a
KW
829
830# endif
831
b385bb4d
KW
832 default:
833 /* This is a category, like PAPER_SIZE that we don't
834 * know about; and so can't provide a wrapper. */
835 break;
836 }
837 if (! locale) {
838 locale = PerlEnv_getenv("LANG");
481465ea 839 if (! locale) {
b385bb4d
KW
840 locale = "";
841 }
842 }
7d4bcc4a
KW
843
844# ifdef LC_ALL
845
b385bb4d 846 }
7d4bcc4a
KW
847
848# endif
849
b385bb4d
KW
850 }
851
89f7b9aa 852 result = setlocale(category, locale);
bbc98134 853 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
a4f00dcc 854 setlocale_debug_string(category, locale, result)));
89f7b9aa 855
481465ea 856 if (! override_LC_ALL) {
89f7b9aa
KW
857 return result;
858 }
859
dfd77d7a 860 /* Here the input category was LC_ALL, and we have set it to what is in the
481465ea
KW
861 * LANG variable or the system default if there is no LANG. But these have
862 * lower priority than the other LC_foo variables, so override it for each
863 * one that is set. (If they are set to "", it means to use the same thing
864 * we just set LC_ALL to, so can skip) */
7d4bcc4a
KW
865
866# ifdef USE_LOCALE_TIME
867
89f7b9aa 868 result = PerlEnv_getenv("LC_TIME");
730252b2 869 if (result && strNE(result, "")) {
89f7b9aa 870 setlocale(LC_TIME, result);
bbc98134
KW
871 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
872 __FILE__, __LINE__,
a4f00dcc 873 setlocale_debug_string(LC_TIME, result, "not captured")));
89f7b9aa 874 }
7d4bcc4a
KW
875
876# endif
877# ifdef USE_LOCALE_CTYPE
878
89f7b9aa 879 result = PerlEnv_getenv("LC_CTYPE");
730252b2 880 if (result && strNE(result, "")) {
89f7b9aa 881 setlocale(LC_CTYPE, result);
bbc98134
KW
882 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
883 __FILE__, __LINE__,
a4f00dcc 884 setlocale_debug_string(LC_CTYPE, result, "not captured")));
89f7b9aa 885 }
7d4bcc4a
KW
886
887# endif
888# ifdef USE_LOCALE_COLLATE
889
89f7b9aa 890 result = PerlEnv_getenv("LC_COLLATE");
730252b2 891 if (result && strNE(result, "")) {
89f7b9aa 892 setlocale(LC_COLLATE, result);
bbc98134
KW
893 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
894 __FILE__, __LINE__,
a4f00dcc 895 setlocale_debug_string(LC_COLLATE, result, "not captured")));
89f7b9aa 896 }
7d4bcc4a
KW
897
898# endif
899# ifdef USE_LOCALE_MONETARY
900
89f7b9aa 901 result = PerlEnv_getenv("LC_MONETARY");
730252b2 902 if (result && strNE(result, "")) {
89f7b9aa 903 setlocale(LC_MONETARY, result);
bbc98134
KW
904 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
905 __FILE__, __LINE__,
a4f00dcc 906 setlocale_debug_string(LC_MONETARY, result, "not captured")));
89f7b9aa 907 }
7d4bcc4a
KW
908
909# endif
910# ifdef USE_LOCALE_NUMERIC
911
89f7b9aa 912 result = PerlEnv_getenv("LC_NUMERIC");
730252b2 913 if (result && strNE(result, "")) {
89f7b9aa 914 setlocale(LC_NUMERIC, result);
bbc98134
KW
915 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
916 __FILE__, __LINE__,
a4f00dcc 917 setlocale_debug_string(LC_NUMERIC, result, "not captured")));
89f7b9aa 918 }
7d4bcc4a
KW
919
920# endif
921# ifdef USE_LOCALE_MESSAGES
922
89f7b9aa 923 result = PerlEnv_getenv("LC_MESSAGES");
730252b2 924 if (result && strNE(result, "")) {
89f7b9aa 925 setlocale(LC_MESSAGES, result);
bbc98134
KW
926 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
927 __FILE__, __LINE__,
a4f00dcc 928 setlocale_debug_string(LC_MESSAGES, result, "not captured")));
89f7b9aa 929 }
7d4bcc4a
KW
930
931# endif
89f7b9aa 932
bbc98134
KW
933 result = setlocale(LC_ALL, NULL);
934 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
935 __FILE__, __LINE__,
a4f00dcc 936 setlocale_debug_string(LC_ALL, NULL, result)));
89f7b9aa 937
bbc98134 938 return result;
b385bb4d
KW
939}
940
941#endif
942
a4f00dcc
KW
943char *
944Perl_setlocale(int category, const char * locale)
945{
946 /* This wraps POSIX::setlocale() */
947
948 char * retval;
1159483a 949 char * newlocale;
a4f00dcc
KW
950 dTHX;
951
a4f00dcc
KW
952#ifdef USE_LOCALE_NUMERIC
953
954 /* A NULL locale means only query what the current one is. We
955 * have the LC_NUMERIC name saved, because we are normally switched
956 * into the C locale for it. Switch back so an LC_ALL query will yield
957 * the correct results; all other categories don't require special
958 * handling */
959 if (locale == NULL) {
960 if (category == LC_NUMERIC) {
961 return savepv(PL_numeric_name);
962 }
963
7d4bcc4a 964# ifdef LC_ALL
a4f00dcc
KW
965
966 else if (category == LC_ALL) {
967 SET_NUMERIC_UNDERLYING();
968 }
969
7d4bcc4a 970# endif
a4f00dcc
KW
971
972 }
973
974#endif
975
d2b24094 976 retval = do_setlocale_r(category, locale);
a4f00dcc
KW
977
978 DEBUG_L(PerlIO_printf(Perl_debug_log,
979 "%s:%d: %s\n", __FILE__, __LINE__,
980 setlocale_debug_string(category, locale, retval)));
981 if (! retval) {
982 /* Should never happen that a query would return an error, but be
983 * sure and reset to C locale */
984 if (locale == 0) {
985 SET_NUMERIC_STANDARD();
986 }
7d4bcc4a 987
a4f00dcc
KW
988 return NULL;
989 }
990
991 /* Save retval since subsequent setlocale() calls may overwrite it. */
992 retval = savepv(retval);
993
994 /* If locale == NULL, we are just querying the state, but may have switched
995 * to NUMERIC_UNDERLYING. Switch back before returning. */
996 if (locale == NULL) {
997 SET_NUMERIC_STANDARD();
998 return retval;
999 }
a4f00dcc 1000
1159483a
KW
1001 /* Now that have switched locales, we have to update our records to
1002 * correspond. */
a4f00dcc 1003
1159483a 1004 switch (category) {
a4f00dcc 1005
1159483a 1006#ifdef USE_LOCALE_CTYPE
a4f00dcc 1007
1159483a
KW
1008 case LC_CTYPE:
1009 new_ctype(retval);
1010 break;
a4f00dcc 1011
1159483a 1012#endif
a4f00dcc
KW
1013#ifdef USE_LOCALE_COLLATE
1014
1159483a
KW
1015 case LC_COLLATE:
1016 new_collate(retval);
1017 break;
a4f00dcc 1018
1159483a 1019#endif
a4f00dcc
KW
1020#ifdef USE_LOCALE_NUMERIC
1021
1159483a
KW
1022 case LC_NUMERIC:
1023 new_numeric(retval);
1024 break;
a4f00dcc 1025
1159483a
KW
1026#endif
1027#ifdef LC_ALL
a4f00dcc 1028
1159483a 1029 case LC_ALL:
a4f00dcc 1030
1159483a
KW
1031 /* LC_ALL updates all the things we care about. The values may not
1032 * be the same as 'retval', as the locale "" may have set things
1033 * individually */
a4f00dcc 1034
1159483a 1035# ifdef USE_LOCALE_CTYPE
a4f00dcc 1036
1159483a
KW
1037 newlocale = do_setlocale_c(LC_CTYPE, NULL);
1038 new_ctype(newlocale);
a4f00dcc 1039
1159483a
KW
1040# endif /* USE_LOCALE_CTYPE */
1041# ifdef USE_LOCALE_COLLATE
1042
1043 newlocale = do_setlocale_c(LC_COLLATE, NULL);
1044 new_collate(newlocale);
a4f00dcc 1045
7d4bcc4a 1046# endif
1159483a 1047# ifdef USE_LOCALE_NUMERIC
a4f00dcc 1048
1159483a
KW
1049 newlocale = do_setlocale_c(LC_NUMERIC, NULL);
1050 new_numeric(newlocale);
a4f00dcc 1051
1159483a
KW
1052# endif /* USE_LOCALE_NUMERIC */
1053#endif /* LC_ALL */
a4f00dcc 1054
1159483a
KW
1055 default:
1056 break;
a4f00dcc
KW
1057 }
1058
1059 return retval;
1060
f7416781
KW
1061
1062}
1063
1064PERL_STATIC_INLINE const char *
1065S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset)
1066{
1067 /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size',
1068 * growing it if necessary */
1069
1070 const Size_t string_size = strlen(string) + offset + 1;
1071
1072 PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
1073
1074 if (*buf_size == 0) {
1075 Newx(*buf, string_size, char);
1076 *buf_size = string_size;
1077 }
1078 else if (string_size > *buf_size) {
1079 Renew(*buf, string_size, char);
1080 *buf_size = string_size;
1081 }
1082
1083 Copy(string, *buf + offset, string_size - offset, char);
1084 return *buf;
1085}
1086
1087/*
1088
1089=head1 Locale-related functions and macros
1090
1091=for apidoc Perl_langinfo
1092
7d4bcc4a 1093This is an (almost ª) drop-in replacement for the system C<L<nl_langinfo(3)>>,
f7416781
KW
1094taking the same C<item> parameter values, and returning the same information.
1095But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
1096of Perl's locale handling from your code, and can be used on systems that lack
1097a native C<nl_langinfo>.
1098
1099Expanding on these:
1100
1101=over
1102
1103=item *
1104
1105It delivers the correct results for the C<RADIXCHAR> and C<THOUSESEP> items,
1106without you having to write extra code. The reason for the extra code would be
1107because these are from the C<LC_NUMERIC> locale category, which is normally
1108kept set to the C locale by Perl, no matter what the underlying locale is
1109supposed to be, and so to get the expected results, you have to temporarily
1110toggle into the underlying locale, and later toggle back. (You could use
1111plain C<nl_langinfo> and C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this
1112but then you wouldn't get the other advantages of C<Perl_langinfo()>; not
1113keeping C<LC_NUMERIC> in the C locale would break a lot of CPAN, which is
1114expecting the radix (decimal point) character to be a dot.)
1115
1116=item *
1117
1118Depending on C<item>, it works on systems that don't have C<nl_langinfo>, hence
1119makes your code more portable. Of the fifty-some possible items specified by
1120the POSIX 2008 standard,
1121L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
1122only two are completely unimplemented. It uses various techniques to recover
1123the other items, including calling C<L<localeconv(3)>>, and C<L<strftime(3)>>,
1124both of which are specified in C89, so should be always be available. Later
1125C<strftime()> versions have additional capabilities; C<""> is returned for
1126those not available on your system.
1127
1128The details for those items which may differ from what this emulation returns
1129and what a native C<nl_langinfo()> would return are:
1130
1131=over
1132
1133=item C<CODESET>
1134
1135=item C<ERA>
1136
1137Unimplemented, so returns C<"">.
1138
1139=item C<YESEXPR>
1140
1141=item C<NOEXPR>
1142
1143Only the values for English are returned. Earlier POSIX standards also
1144specified C<YESSTR> and C<NOSTR>, but these have been removed from POSIX 2008,
1145and aren't supported by C<Perl_langinfo>.
1146
1147=item C<D_FMT>
1148
1149Always evaluates to C<%x>, the locale's appropriate date representation.
1150
1151=item C<T_FMT>
1152
1153Always evaluates to C<%X>, the locale's appropriate time representation.
1154
1155=item C<D_T_FMT>
1156
1157Always evaluates to C<%c>, the locale's appropriate date and time
1158representation.
1159
1160=item C<CRNCYSTR>
1161
1162The return may be incorrect for those rare locales where the currency symbol
1163replaces the radix character.
1164Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1165to work differently.
1166
1167=item C<ALT_DIGITS>
1168
1169Currently this gives the same results as Linux does.
1170Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1171to work differently.
1172
1173=item C<ERA_D_FMT>
1174
1175=item C<ERA_T_FMT>
1176
1177=item C<ERA_D_T_FMT>
1178
1179=item C<T_FMT_AMPM>
1180
1181These are derived by using C<strftime()>, and not all versions of that function
1182know about them. C<""> is returned for these on such systems.
1183
1184=back
1185
1186When using C<Perl_langinfo> on systems that don't have a native
1187C<nl_langinfo()>, you must
1188
1189 #include "perl_langinfo.h"
1190
1191before the C<perl.h> C<#include>. You can replace your C<langinfo.h>
1192C<#include> with this one. (Doing it this way keeps out the symbols that plain
1193C<langinfo.h> imports into the namespace for code that doesn't need it.)
1194
1195You also should not use the bare C<langinfo.h> item names, but should preface
1196them with C<PERL_>, so use C<PERL_RADIXCHAR> instead of plain C<RADIXCHAR>.
1197The C<PERL_I<foo>> versions will also work for this function on systems that do
1198have a native C<nl_langinfo>.
1199
1200=item *
1201
1202It is thread-friendly, returning its result in a buffer that won't be
1203overwritten by another thread, so you don't have to code for that possibility.
1204The buffer can be overwritten by the next call to C<nl_langinfo> or
1205C<Perl_langinfo> in the same thread.
1206
1207=item *
1208
7d4bcc4a 1209ª It returns S<C<const char *>>, whereas plain C<nl_langinfo()> returns S<C<char
f7416781
KW
1210*>>, but you are (only by documentation) forbidden to write into the buffer.
1211By declaring this C<const>, the compiler enforces this restriction. The extra
1212C<const> is why this isn't an unequivocal drop-in replacement for
1213C<nl_langinfo>.
1214
1215=back
1216
1217The original impetus for C<Perl_langinfo()> was so that code that needs to
1218find out the current currency symbol, floating point radix character, or digit
1219grouping separator can use, on all systems, the simpler and more
1220thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
1221pain to make thread-friendly. For other fields returned by C<localeconv>, it
1222is better to use the methods given in L<perlcall> to call
1223L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
1224
1225=cut
1226
1227*/
1228
1229const char *
1230#ifdef HAS_NL_LANGINFO
1231Perl_langinfo(const nl_item item)
1232#else
1233Perl_langinfo(const int item)
1234#endif
1235{
f61748ac
KW
1236 return my_nl_langinfo(item, TRUE);
1237}
1238
1239const char *
1240#ifdef HAS_NL_LANGINFO
1241S_my_nl_langinfo(const nl_item item, bool toggle)
1242#else
1243S_my_nl_langinfo(const int item, bool toggle)
1244#endif
1245{
ae74815b 1246 dTHX;
f7416781 1247
ab340fff
KW
1248#if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
1249#if ! defined(HAS_POSIX_2008_LOCALE)
f7416781 1250
ab340fff 1251 /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
ae74815b
KW
1252 * for those items dependent on it. This must be copied to a buffer before
1253 * switching back, as some systems destroy the buffer when setlocale() is
1254 * called */
f7416781
KW
1255
1256 LOCALE_LOCK;
1257
1258 if (toggle) {
1259 if (item == PERL_RADIXCHAR || item == PERL_THOUSEP) {
837ce802 1260 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
f7416781
KW
1261 }
1262 else {
1263 toggle = FALSE;
1264 }
1265 }
1266
1267 save_to_buffer(nl_langinfo(item), &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1268
1269 if (toggle) {
837ce802 1270 do_setlocale_c(LC_NUMERIC, "C");
f7416781
KW
1271 }
1272
1273 LOCALE_UNLOCK;
1274
1275 return PL_langinfo_buf;
1276
ab340fff
KW
1277# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */
1278
1279 bool do_free = FALSE;
1280 locale_t cur = uselocale((locale_t) 0);
1281
1282 if (cur == LC_GLOBAL_LOCALE) {
1283 cur = duplocale(LC_GLOBAL_LOCALE);
1284 do_free = TRUE;
1285 }
1286
1287 if ( toggle
1288 && (item == PERL_RADIXCHAR || item == PERL_THOUSEP))
1289 {
1290 cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
1291 do_free = TRUE;
1292 }
1293
1294 save_to_buffer(nl_langinfo_l(item, cur),
1295 &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1296 if (do_free) {
1297 freelocale(cur);
1298 }
1299
1300 return PL_langinfo_buf;
1301
1302# endif
f7416781 1303#else /* Below, emulate nl_langinfo as best we can */
f7416781
KW
1304# ifdef HAS_LOCALECONV
1305
1306 const struct lconv* lc;
1307
1308# endif
1309# ifdef HAS_STRFTIME
1310
1311 struct tm tm;
1312 bool return_format = FALSE; /* Return the %format, not the value */
1313 const char * format;
1314
1315# endif
1316
1317 /* We copy the results to a per-thread buffer, even if not multi-threaded.
1318 * This is in part to simplify this code, and partly because we need a
1319 * buffer anyway for strftime(), and partly because a call of localeconv()
1320 * could otherwise wipe out the buffer, and the programmer would not be
1321 * expecting this, as this is a nl_langinfo() substitute after all, so s/he
1322 * might be thinking their localeconv() is safe until another localeconv()
1323 * call. */
1324
1325 switch (item) {
1326 Size_t len;
1327 const char * retval;
1328
1329 /* These 2 are unimplemented */
1330 case PERL_CODESET:
1331 case PERL_ERA: /* For use with strftime() %E modifier */
1332
1333 default:
1334 return "";
1335
1336 /* We use only an English set, since we don't know any more */
1337 case PERL_YESEXPR: return "^[+1yY]";
1338 case PERL_NOEXPR: return "^[-0nN]";
1339
1340# ifdef HAS_LOCALECONV
1341
1342 case PERL_CRNCYSTR:
1343
1344 LOCALE_LOCK;
1345
1346 lc = localeconv();
1347 if (! lc || ! lc->currency_symbol || strEQ("", lc->currency_symbol))
1348 {
1349 LOCALE_UNLOCK;
1350 return "";
1351 }
1352
1353 /* Leave the first spot empty to be filled in below */
1354 save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
1355 &PL_langinfo_bufsize, 1);
1356 if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
1357 { /* khw couldn't figure out how the localedef specifications
1358 would show that the $ should replace the radix; this is
1359 just a guess as to how it might work.*/
1360 *PL_langinfo_buf = '.';
1361 }
1362 else if (lc->p_cs_precedes) {
1363 *PL_langinfo_buf = '-';
1364 }
1365 else {
1366 *PL_langinfo_buf = '+';
1367 }
1368
1369 LOCALE_UNLOCK;
1370 break;
1371
1372 case PERL_RADIXCHAR:
1373 case PERL_THOUSEP:
1374
1375 LOCALE_LOCK;
1376
1377 if (toggle) {
837ce802 1378 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
f7416781
KW
1379 }
1380
1381 lc = localeconv();
1382 if (! lc) {
1383 retval = "";
1384 }
1385 else switch (item) {
1386 case PERL_RADIXCHAR:
1387 if (! lc->decimal_point) {
1388 retval = "";
1389 }
1390 else {
1391 retval = lc->decimal_point;
1392 }
1393 break;
1394
1395 case PERL_THOUSEP:
1396 if (! lc->thousands_sep || strEQ("", lc->thousands_sep)) {
1397 retval = "";
1398 }
1399 else {
1400 retval = lc->thousands_sep;
1401 }
1402 break;
1403
1404 default:
1405 LOCALE_UNLOCK;
1406 Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
1407 __FILE__, __LINE__, item);
1408 }
1409
1410 save_to_buffer(retval, &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1411
1412 if (toggle) {
837ce802 1413 do_setlocale_c(LC_NUMERIC, "C");
f7416781
KW
1414 }
1415
1416 LOCALE_UNLOCK;
1417
1418 break;
1419
1420# endif
1421# ifdef HAS_STRFTIME
1422
1423 /* These are defined by C89, so we assume that strftime supports them,
1424 * and so are returned unconditionally; they may not be what the locale
1425 * actually says, but should give good enough results for someone using
1426 * them as formats (as opposed to trying to parse them to figure out
7d4bcc4a 1427 * what the locale says). The other format items are actually tested to
f7416781
KW
1428 * verify they work on the platform */
1429 case PERL_D_FMT: return "%x";
1430 case PERL_T_FMT: return "%X";
1431 case PERL_D_T_FMT: return "%c";
1432
1433 /* These formats are only available in later strfmtime's */
1434 case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT:
1435 case PERL_T_FMT_AMPM:
1436
1437 /* The rest can be gotten from most versions of strftime(). */
1438 case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3:
1439 case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6:
1440 case PERL_ABDAY_7:
1441 case PERL_ALT_DIGITS:
1442 case PERL_AM_STR: case PERL_PM_STR:
1443 case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3:
1444 case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6:
1445 case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9:
1446 case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12:
1447 case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4:
1448 case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7:
1449 case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4:
1450 case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8:
1451 case PERL_MON_9: case PERL_MON_10: case PERL_MON_11: case PERL_MON_12:
1452
1453 LOCALE_LOCK;
1454
1455 init_tm(&tm); /* Precaution against core dumps */
1456 tm.tm_sec = 30;
1457 tm.tm_min = 30;
1458 tm.tm_hour = 6;
1459 tm.tm_year = 2017 - 1900;
1460 tm.tm_wday = 0;
1461 tm.tm_mon = 0;
1462 switch (item) {
1463 default:
1464 LOCALE_UNLOCK;
1465 Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
1466 __FILE__, __LINE__, item);
1467 NOT_REACHED; /* NOTREACHED */
1468
1469 case PERL_PM_STR: tm.tm_hour = 18;
1470 case PERL_AM_STR:
1471 format = "%p";
1472 break;
1473
1474 case PERL_ABDAY_7: tm.tm_wday++;
1475 case PERL_ABDAY_6: tm.tm_wday++;
1476 case PERL_ABDAY_5: tm.tm_wday++;
1477 case PERL_ABDAY_4: tm.tm_wday++;
1478 case PERL_ABDAY_3: tm.tm_wday++;
1479 case PERL_ABDAY_2: tm.tm_wday++;
1480 case PERL_ABDAY_1:
1481 format = "%a";
1482 break;
1483
1484 case PERL_DAY_7: tm.tm_wday++;
1485 case PERL_DAY_6: tm.tm_wday++;
1486 case PERL_DAY_5: tm.tm_wday++;
1487 case PERL_DAY_4: tm.tm_wday++;
1488 case PERL_DAY_3: tm.tm_wday++;
1489 case PERL_DAY_2: tm.tm_wday++;
1490 case PERL_DAY_1:
1491 format = "%A";
1492 break;
1493
1494 case PERL_ABMON_12: tm.tm_mon++;
1495 case PERL_ABMON_11: tm.tm_mon++;
1496 case PERL_ABMON_10: tm.tm_mon++;
1497 case PERL_ABMON_9: tm.tm_mon++;
1498 case PERL_ABMON_8: tm.tm_mon++;
1499 case PERL_ABMON_7: tm.tm_mon++;
1500 case PERL_ABMON_6: tm.tm_mon++;
1501 case PERL_ABMON_5: tm.tm_mon++;
1502 case PERL_ABMON_4: tm.tm_mon++;
1503 case PERL_ABMON_3: tm.tm_mon++;
1504 case PERL_ABMON_2: tm.tm_mon++;
1505 case PERL_ABMON_1:
1506 format = "%b";
1507 break;
1508
1509 case PERL_MON_12: tm.tm_mon++;
1510 case PERL_MON_11: tm.tm_mon++;
1511 case PERL_MON_10: tm.tm_mon++;
1512 case PERL_MON_9: tm.tm_mon++;
1513 case PERL_MON_8: tm.tm_mon++;
1514 case PERL_MON_7: tm.tm_mon++;
1515 case PERL_MON_6: tm.tm_mon++;
1516 case PERL_MON_5: tm.tm_mon++;
1517 case PERL_MON_4: tm.tm_mon++;
1518 case PERL_MON_3: tm.tm_mon++;
1519 case PERL_MON_2: tm.tm_mon++;
1520 case PERL_MON_1:
1521 format = "%B";
1522 break;
1523
1524 case PERL_T_FMT_AMPM:
1525 format = "%r";
1526 return_format = TRUE;
1527 break;
1528
1529 case PERL_ERA_D_FMT:
1530 format = "%Ex";
1531 return_format = TRUE;
1532 break;
1533
1534 case PERL_ERA_T_FMT:
1535 format = "%EX";
1536 return_format = TRUE;
1537 break;
1538
1539 case PERL_ERA_D_T_FMT:
1540 format = "%Ec";
1541 return_format = TRUE;
1542 break;
1543
1544 case PERL_ALT_DIGITS:
1545 tm.tm_wday = 0;
1546 format = "%Ow"; /* Find the alternate digit for 0 */
1547 break;
1548 }
1549
1550 /* We can't use my_strftime() because it doesn't look at tm_wday */
1551 while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
1552 format, &tm))
1553 {
1554 /* A zero return means one of:
1555 * a) there wasn't enough space in PL_langinfo_buf
1556 * b) the format, like a plain %p, returns empty
1557 * c) it was an illegal format, though some implementations of
1558 * strftime will just return the illegal format as a plain
1559 * character sequence.
1560 *
1561 * To quickly test for case 'b)', try again but precede the
1562 * format with a plain character. If that result is still
1563 * empty, the problem is either 'a)' or 'c)' */
1564
1565 Size_t format_size = strlen(format) + 1;
1566 Size_t mod_size = format_size + 1;
1567 char * mod_format;
1568 char * temp_result;
1569
1570 Newx(mod_format, mod_size, char);
1571 Newx(temp_result, PL_langinfo_bufsize, char);
1572 *mod_format = '\a';
1573 my_strlcpy(mod_format + 1, format, mod_size);
1574 len = strftime(temp_result,
1575 PL_langinfo_bufsize,
1576 mod_format, &tm);
1577 Safefree(mod_format);
1578 Safefree(temp_result);
1579
1580 /* If 'len' is non-zero, it means that we had a case like %p
1581 * which means the current locale doesn't use a.m. or p.m., and
1582 * that is valid */
1583 if (len == 0) {
1584
1585 /* Here, still didn't work. If we get well beyond a
1586 * reasonable size, bail out to prevent an infinite loop. */
1587
1588 if (PL_langinfo_bufsize > 100 * format_size) {
1589 *PL_langinfo_buf = '\0';
1590 }
1591 else { /* Double the buffer size to retry; Add 1 in case
1592 original was 0, so we aren't stuck at 0. */
1593 PL_langinfo_bufsize *= 2;
1594 PL_langinfo_bufsize++;
1595 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
1596 continue;
1597 }
1598 }
1599
1600 break;
1601 }
1602
1603 /* Here, we got a result.
1604 *
1605 * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
1606 * alternate format for wday 0. If the value is the same as the
1607 * normal 0, there isn't an alternate, so clear the buffer. */
1608 if ( item == PERL_ALT_DIGITS
1609 && strEQ(PL_langinfo_buf, "0"))
1610 {
1611 *PL_langinfo_buf = '\0';
1612 }
1613
1614 /* ALT_DIGITS is problematic. Experiments on it showed that
1615 * strftime() did not always work properly when going from alt-9 to
1616 * alt-10. Only a few locales have this item defined, and in all
1617 * of them on Linux that khw was able to find, nl_langinfo() merely
1618 * returned the alt-0 character, possibly doubled. Most Unicode
1619 * digits are in blocks of 10 consecutive code points, so that is
1620 * sufficient information for those scripts, as we can infer alt-1,
1621 * alt-2, .... But for a Japanese locale, a CJK ideographic 0 is
1622 * returned, and the CJK digits are not in code point order, so you
1623 * can't really infer anything. The localedef for this locale did
1624 * specify the succeeding digits, so that strftime() works properly
1625 * on them, without needing to infer anything. But the
1626 * nl_langinfo() return did not give sufficient information for the
1627 * caller to understand what's going on. So until there is
1628 * evidence that it should work differently, this returns the alt-0
1629 * string for ALT_DIGITS.
1630 *
1631 * wday was chosen because its range is all a single digit. Things
1632 * like tm_sec have two digits as the minimum: '00' */
1633
1634 LOCALE_UNLOCK;
1635
1636 /* If to return the format, not the value, overwrite the buffer
1637 * with it. But some strftime()s will keep the original format if
1638 * illegal, so change those to "" */
1639 if (return_format) {
1640 if (strEQ(PL_langinfo_buf, format)) {
1641 *PL_langinfo_buf = '\0';
1642 }
1643 else {
1644 save_to_buffer(format, &PL_langinfo_buf,
1645 &PL_langinfo_bufsize, 0);
1646 }
1647 }
1648
1649 break;
1650
1651# endif
1652
1653 }
1654
1655 return PL_langinfo_buf;
1656
1657#endif
1658
a4f00dcc 1659}
b385bb4d 1660
98994639
HS
1661/*
1662 * Initialize locale awareness.
1663 */
1664int
1665Perl_init_i18nl10n(pTHX_ int printwarn)
1666{
0e92a118
KW
1667 /* printwarn is
1668 *
1669 * 0 if not to output warning when setup locale is bad
1670 * 1 if to output warning based on value of PERL_BADLANG
1671 * >1 if to output regardless of PERL_BADLANG
1672 *
1673 * returns
98994639 1674 * 1 = set ok or not applicable,
0e92a118
KW
1675 * 0 = fallback to a locale of lower priority
1676 * -1 = fallback to all locales failed, not even to the C locale
6b058d42
KW
1677 *
1678 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
1679 * set, debugging information is output.
1680 *
1681 * This looks more complicated than it is, mainly due to the #ifdefs.
1682 *
1683 * We try to set LC_ALL to the value determined by the environment. If
1684 * there is no LC_ALL on this platform, we try the individual categories we
1685 * know about. If this works, we are done.
1686 *
1687 * But if it doesn't work, we have to do something else. We search the
1688 * environment variables ourselves instead of relying on the system to do
1689 * it. We look at, in order, LC_ALL, LANG, a system default locale (if we
1690 * think there is one), and the ultimate fallback "C". This is all done in
1691 * the same loop as above to avoid duplicating code, but it makes things
7d4bcc4a
KW
1692 * more complex. The 'trial_locales' array is initialized with just one
1693 * element; it causes the behavior described in the paragraph above this to
1694 * happen. If that fails, we add elements to 'trial_locales', and do extra
1695 * loop iterations to cause the behavior described in this paragraph.
6b058d42
KW
1696 *
1697 * On Ultrix, the locale MUST come from the environment, so there is
1698 * preliminary code to set it. I (khw) am not sure that it is necessary,
1699 * and that this couldn't be folded into the loop, but barring any real
1700 * platforms to test on, it's staying as-is
1701 *
1702 * A slight complication is that in embedded Perls, the locale may already
1703 * be set-up, and we don't want to get it from the normal environment
1704 * variables. This is handled by having a special environment variable
1705 * indicate we're in this situation. We simply set setlocale's 2nd
1706 * parameter to be a NULL instead of "". That indicates to setlocale that
1707 * it is not to change anything, but to return the current value,
1708 * effectively initializing perl's db to what the locale already is.
1709 *
1710 * We play the same trick with NULL if a LC_ALL succeeds. We call
1711 * setlocale() on the individual categores with NULL to get their existing
1712 * values for our db, instead of trying to change them.
1713 * */
98994639 1714
0e92a118
KW
1715 int ok = 1;
1716
7d4bcc4a
KW
1717#ifndef USE_LOCALE
1718
1719 PERL_UNUSED_ARG(printwarn);
1720
1721#else /* USE_LOCALE */
1722# ifdef USE_LOCALE_CTYPE
1723
98994639 1724 char *curctype = NULL;
7d4bcc4a
KW
1725
1726# endif /* USE_LOCALE_CTYPE */
1727# ifdef USE_LOCALE_COLLATE
1728
98994639 1729 char *curcoll = NULL;
7d4bcc4a
KW
1730
1731# endif /* USE_LOCALE_COLLATE */
1732# ifdef USE_LOCALE_NUMERIC
1733
98994639 1734 char *curnum = NULL;
7d4bcc4a
KW
1735
1736# endif /* USE_LOCALE_NUMERIC */
1737# ifdef __GLIBC__
1738
175c4cf9 1739 const char * const language = savepv(PerlEnv_getenv("LANGUAGE"));
7d4bcc4a
KW
1740
1741# endif
65ebb059 1742
ccd65d51
KW
1743 /* NULL uses the existing already set up locale */
1744 const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
1745 ? NULL
1746 : "";
c3fcd832
KW
1747 const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */
1748 unsigned int trial_locales_count;
175c4cf9
KW
1749 const char * const lc_all = savepv(PerlEnv_getenv("LC_ALL"));
1750 const char * const lang = savepv(PerlEnv_getenv("LANG"));
98994639 1751 bool setlocale_failure = FALSE;
65ebb059 1752 unsigned int i;
175c4cf9
KW
1753
1754 /* A later getenv() could zap this, so only use here */
1755 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
1756
1757 const bool locwarn = (printwarn > 1
1758 || (printwarn
1759 && (! bad_lang_use_once
22ff3130
HS
1760 || (
1761 /* disallow with "" or "0" */
1762 *bad_lang_use_once
1763 && strNE("0", bad_lang_use_once)))));
0e92a118 1764 bool done = FALSE;
5d1187d1
KW
1765 char * sl_result; /* return from setlocale() */
1766 char * locale_param;
7d4bcc4a
KW
1767
1768# ifdef WIN32
1769
6bce99ee
JH
1770 /* In some systems you can find out the system default locale
1771 * and use that as the fallback locale. */
7d4bcc4a
KW
1772# define SYSTEM_DEFAULT_LOCALE
1773# endif
1774# ifdef SYSTEM_DEFAULT_LOCALE
1775
65ebb059 1776 const char *system_default_locale = NULL;
98994639 1777
7d4bcc4a
KW
1778# endif
1779# ifdef DEBUGGING
1780
8298454c 1781 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
7d4bcc4a
KW
1782
1783# define DEBUG_LOCALE_INIT(category, locale, result) \
2fcc0ca9
KW
1784 STMT_START { \
1785 if (debug_initialization) { \
1786 PerlIO_printf(Perl_debug_log, \
1787 "%s:%d: %s\n", \
1788 __FILE__, __LINE__, \
a4f00dcc 1789 setlocale_debug_string(category, \
2fcc0ca9
KW
1790 locale, \
1791 result)); \
1792 } \
1793 } STMT_END
2fcc0ca9 1794
7d4bcc4a
KW
1795# else
1796# define DEBUG_LOCALE_INIT(a,b,c)
1797# endif
1798
1799# ifndef LOCALE_ENVIRON_REQUIRED
1800
0e92a118 1801 PERL_UNUSED_VAR(done);
5d1187d1 1802 PERL_UNUSED_VAR(locale_param);
7d4bcc4a
KW
1803
1804# else
98994639
HS
1805
1806 /*
1807 * Ultrix setlocale(..., "") fails if there are no environment
1808 * variables from which to get a locale name.
1809 */
1810
7d4bcc4a
KW
1811# ifdef LC_ALL
1812
98994639 1813 if (lang) {
d2b24094 1814 sl_result = do_setlocale_c(LC_ALL, setlocale_init);
5d1187d1
KW
1815 DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result);
1816 if (sl_result)
98994639
HS
1817 done = TRUE;
1818 else
1819 setlocale_failure = TRUE;
1820 }
5d1187d1 1821 if (! setlocale_failure) {
7d4bcc4a
KW
1822
1823# ifdef USE_LOCALE_CTYPE
1824
5d1187d1
KW
1825 locale_param = (! done && (lang || PerlEnv_getenv("LC_CTYPE")))
1826 ? setlocale_init
1827 : NULL;
d2b24094 1828 curctype = do_setlocale_c(LC_CTYPE, locale_param);
5d1187d1
KW
1829 DEBUG_LOCALE_INIT(LC_CTYPE, locale_param, sl_result);
1830 if (! curctype)
98994639
HS
1831 setlocale_failure = TRUE;
1832 else
1833 curctype = savepv(curctype);
7d4bcc4a
KW
1834
1835# endif /* USE_LOCALE_CTYPE */
1836# ifdef USE_LOCALE_COLLATE
1837
5d1187d1
KW
1838 locale_param = (! done && (lang || PerlEnv_getenv("LC_COLLATE")))
1839 ? setlocale_init
1840 : NULL;
d2b24094 1841 curcoll = do_setlocale_c(LC_COLLATE, locale_param);
5d1187d1
KW
1842 DEBUG_LOCALE_INIT(LC_COLLATE, locale_param, sl_result);
1843 if (! curcoll)
98994639
HS
1844 setlocale_failure = TRUE;
1845 else
1846 curcoll = savepv(curcoll);
7d4bcc4a
KW
1847
1848# endif /* USE_LOCALE_COLLATE */
1849# ifdef USE_LOCALE_NUMERIC
1850
5d1187d1
KW
1851 locale_param = (! done && (lang || PerlEnv_getenv("LC_NUMERIC")))
1852 ? setlocale_init
1853 : NULL;
d2b24094 1854 curnum = do_setlocale_c(LC_NUMERIC, locale_param);
5d1187d1
KW
1855 DEBUG_LOCALE_INIT(LC_NUMERIC, locale_param, sl_result);
1856 if (! curnum)
98994639
HS
1857 setlocale_failure = TRUE;
1858 else
1859 curnum = savepv(curnum);
7d4bcc4a
KW
1860
1861# endif /* USE_LOCALE_NUMERIC */
1862# ifdef USE_LOCALE_MESSAGES
1863
5d1187d1
KW
1864 locale_param = (! done && (lang || PerlEnv_getenv("LC_MESSAGES")))
1865 ? setlocale_init
1866 : NULL;
d2b24094 1867 sl_result = do_setlocale_c(LC_MESSAGES, locale_param);
5d1187d1 1868 DEBUG_LOCALE_INIT(LC_MESSAGES, locale_param, sl_result);
9f42613c 1869 if (! sl_result) {
a782673d
KW
1870 setlocale_failure = TRUE;
1871 }
7d4bcc4a
KW
1872
1873# endif /* USE_LOCALE_MESSAGES */
1874# ifdef USE_LOCALE_MONETARY
1875
5d1187d1
KW
1876 locale_param = (! done && (lang || PerlEnv_getenv("LC_MONETARY")))
1877 ? setlocale_init
1878 : NULL;
d2b24094 1879 sl_result = do_setlocale_c(LC_MONETARY, locale_param);
5d1187d1
KW
1880 DEBUG_LOCALE_INIT(LC_MONETARY, locale_param, sl_result);
1881 if (! sl_result) {
c835d6be
KW
1882 setlocale_failure = TRUE;
1883 }
98994639 1884
7d4bcc4a 1885# endif /* USE_LOCALE_MONETARY */
98994639 1886
7d4bcc4a
KW
1887 }
1888
1889# endif /* LC_ALL */
1890# endif /* !LOCALE_ENVIRON_REQUIRED */
98994639 1891
65ebb059 1892 /* We try each locale in the list until we get one that works, or exhaust
20a240df
KW
1893 * the list. Normally the loop is executed just once. But if setting the
1894 * locale fails, inside the loop we add fallback trials to the array and so
1895 * will execute the loop multiple times */
c3fcd832
KW
1896 trial_locales[0] = setlocale_init;
1897 trial_locales_count = 1;
7d4bcc4a 1898
65ebb059
KW
1899 for (i= 0; i < trial_locales_count; i++) {
1900 const char * trial_locale = trial_locales[i];
1901
1902 if (i > 0) {
1903
1904 /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
1905 * when i==0, but I (khw) don't think that behavior makes much
1906 * sense */
1907 setlocale_failure = FALSE;
1908
7d4bcc4a
KW
1909# ifdef SYSTEM_DEFAULT_LOCALE
1910# ifdef WIN32
1911
65ebb059
KW
1912 /* On Windows machines, an entry of "" after the 0th means to use
1913 * the system default locale, which we now proceed to get. */
1914 if (strEQ(trial_locale, "")) {
1915 unsigned int j;
1916
1917 /* Note that this may change the locale, but we are going to do
1918 * that anyway just below */
837ce802 1919 system_default_locale = do_setlocale_c(LC_ALL, "");
5d1187d1 1920 DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
65ebb059 1921
7d4bcc4a 1922 /* Skip if invalid or if it's already on the list of locales to
65ebb059
KW
1923 * try */
1924 if (! system_default_locale) {
1925 goto next_iteration;
1926 }
1927 for (j = 0; j < trial_locales_count; j++) {
1928 if (strEQ(system_default_locale, trial_locales[j])) {
1929 goto next_iteration;
1930 }
1931 }
1932
1933 trial_locale = system_default_locale;
1934 }
7d4bcc4a
KW
1935# endif /* WIN32 */
1936# endif /* SYSTEM_DEFAULT_LOCALE */
65ebb059
KW
1937 }
1938
7d4bcc4a
KW
1939# ifdef LC_ALL
1940
d2b24094 1941 sl_result = do_setlocale_c(LC_ALL, trial_locale);
5d1187d1
KW
1942 DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result);
1943 if (! sl_result) {
49c85077 1944 setlocale_failure = TRUE;
7cd8b568
KW
1945 }
1946 else {
1947 /* Since LC_ALL succeeded, it should have changed all the other
1948 * categories it can to its value; so we massage things so that the
1949 * setlocales below just return their category's current values.
1950 * This adequately handles the case in NetBSD where LC_COLLATE may
1951 * not be defined for a locale, and setting it individually will
7d4bcc4a 1952 * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
7cd8b568
KW
1953 * the POSIX locale. */
1954 trial_locale = NULL;
1955 }
7d4bcc4a
KW
1956
1957# endif /* LC_ALL */
98994639 1958
49c85077 1959 if (!setlocale_failure) {
7d4bcc4a
KW
1960
1961# ifdef USE_LOCALE_CTYPE
1962
49c85077 1963 Safefree(curctype);
d2b24094 1964 curctype = do_setlocale_c(LC_CTYPE, trial_locale);
5d1187d1
KW
1965 DEBUG_LOCALE_INIT(LC_CTYPE, trial_locale, curctype);
1966 if (! curctype)
49c85077
KW
1967 setlocale_failure = TRUE;
1968 else
1969 curctype = savepv(curctype);
7d4bcc4a
KW
1970
1971# endif /* USE_LOCALE_CTYPE */
1972# ifdef USE_LOCALE_COLLATE
1973
49c85077 1974 Safefree(curcoll);
d2b24094 1975 curcoll = do_setlocale_c(LC_COLLATE, trial_locale);
5d1187d1
KW
1976 DEBUG_LOCALE_INIT(LC_COLLATE, trial_locale, curcoll);
1977 if (! curcoll)
49c85077
KW
1978 setlocale_failure = TRUE;
1979 else
1980 curcoll = savepv(curcoll);
7d4bcc4a
KW
1981
1982# endif /* USE_LOCALE_COLLATE */
1983# ifdef USE_LOCALE_NUMERIC
1984
49c85077 1985 Safefree(curnum);
d2b24094 1986 curnum = do_setlocale_c(LC_NUMERIC, trial_locale);
5d1187d1
KW
1987 DEBUG_LOCALE_INIT(LC_NUMERIC, trial_locale, curnum);
1988 if (! curnum)
49c85077
KW
1989 setlocale_failure = TRUE;
1990 else
1991 curnum = savepv(curnum);
7d4bcc4a
KW
1992
1993# endif /* USE_LOCALE_NUMERIC */
1994# ifdef USE_LOCALE_MESSAGES
1995
d2b24094 1996 sl_result = do_setlocale_c(LC_MESSAGES, trial_locale);
5d1187d1
KW
1997 DEBUG_LOCALE_INIT(LC_MESSAGES, trial_locale, sl_result);
1998 if (! (sl_result))
a782673d 1999 setlocale_failure = TRUE;
7d4bcc4a
KW
2000
2001# endif /* USE_LOCALE_MESSAGES */
2002# ifdef USE_LOCALE_MONETARY
2003
d2b24094 2004 sl_result = do_setlocale_c(LC_MONETARY, trial_locale);
5d1187d1
KW
2005 DEBUG_LOCALE_INIT(LC_MONETARY, trial_locale, sl_result);
2006 if (! (sl_result))
c835d6be 2007 setlocale_failure = TRUE;
7d4bcc4a
KW
2008
2009# endif /* USE_LOCALE_MONETARY */
c835d6be 2010
49c85077
KW
2011 if (! setlocale_failure) { /* Success */
2012 break;
2013 }
65ebb059 2014 }
98994639 2015
49c85077
KW
2016 /* Here, something failed; will need to try a fallback. */
2017 ok = 0;
65ebb059 2018
49c85077
KW
2019 if (i == 0) {
2020 unsigned int j;
98994639 2021
65ebb059 2022 if (locwarn) { /* Output failure info only on the first one */
7d4bcc4a
KW
2023
2024# ifdef LC_ALL
98994639 2025
49c85077
KW
2026 PerlIO_printf(Perl_error_log,
2027 "perl: warning: Setting locale failed.\n");
98994639 2028
7d4bcc4a 2029# else /* !LC_ALL */
98994639 2030
49c85077
KW
2031 PerlIO_printf(Perl_error_log,
2032 "perl: warning: Setting locale failed for the categories:\n\t");
7d4bcc4a
KW
2033
2034# ifdef USE_LOCALE_CTYPE
2035
49c85077
KW
2036 if (! curctype)
2037 PerlIO_printf(Perl_error_log, "LC_CTYPE ");
7d4bcc4a
KW
2038
2039# endif /* USE_LOCALE_CTYPE */
2040# ifdef USE_LOCALE_COLLATE
49c85077
KW
2041 if (! curcoll)
2042 PerlIO_printf(Perl_error_log, "LC_COLLATE ");
7d4bcc4a
KW
2043
2044# endif /* USE_LOCALE_COLLATE */
2045# ifdef USE_LOCALE_NUMERIC
2046
49c85077
KW
2047 if (! curnum)
2048 PerlIO_printf(Perl_error_log, "LC_NUMERIC ");
7d4bcc4a
KW
2049
2050# endif /* USE_LOCALE_NUMERIC */
2051
a782673d 2052 PerlIO_printf(Perl_error_log, "and possibly others\n");
98994639 2053
7d4bcc4a 2054# endif /* LC_ALL */
98994639 2055
49c85077
KW
2056 PerlIO_printf(Perl_error_log,
2057 "perl: warning: Please check that your locale settings:\n");
98994639 2058
7d4bcc4a
KW
2059# ifdef __GLIBC__
2060
49c85077
KW
2061 PerlIO_printf(Perl_error_log,
2062 "\tLANGUAGE = %c%s%c,\n",
2063 language ? '"' : '(',
2064 language ? language : "unset",
2065 language ? '"' : ')');
7d4bcc4a 2066# endif
98994639 2067
49c85077
KW
2068 PerlIO_printf(Perl_error_log,
2069 "\tLC_ALL = %c%s%c,\n",
2070 lc_all ? '"' : '(',
2071 lc_all ? lc_all : "unset",
2072 lc_all ? '"' : ')');
98994639 2073
7d4bcc4a
KW
2074# if defined(USE_ENVIRON_ARRAY)
2075
49c85077 2076 {
cd999af9 2077 char **e;
d5e32b93
KW
2078
2079 /* Look through the environment for any variables of the
2080 * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
2081 * already handled above. These are assumed to be locale
2082 * settings. Output them and their values. */
cd999af9 2083 for (e = environ; *e; e++) {
d5e32b93
KW
2084 const STRLEN prefix_len = sizeof("LC_") - 1;
2085 STRLEN uppers_len;
2086
cd999af9 2087 if ( strBEGINs(*e, "LC_")
c8b388b0 2088 && ! strBEGINs(*e, "LC_ALL=")
d5e32b93
KW
2089 && (uppers_len = strspn(*e + prefix_len,
2090 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
2091 && ((*e)[prefix_len + uppers_len] == '='))
cd999af9
KW
2092 {
2093 PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
d5e32b93
KW
2094 (int) (prefix_len + uppers_len), *e,
2095 *e + prefix_len + uppers_len + 1);
cd999af9
KW
2096 }
2097 }
49c85077 2098 }
7d4bcc4a
KW
2099
2100# else
2101
49c85077
KW
2102 PerlIO_printf(Perl_error_log,
2103 "\t(possibly more locale environment variables)\n");
7d4bcc4a
KW
2104
2105# endif
98994639 2106
49c85077
KW
2107 PerlIO_printf(Perl_error_log,
2108 "\tLANG = %c%s%c\n",
2109 lang ? '"' : '(',
2110 lang ? lang : "unset",
2111 lang ? '"' : ')');
98994639 2112
49c85077
KW
2113 PerlIO_printf(Perl_error_log,
2114 " are supported and installed on your system.\n");
2115 }
98994639 2116
65ebb059 2117 /* Calculate what fallback locales to try. We have avoided this
f6bab5f6 2118 * until we have to, because failure is quite unlikely. This will
65ebb059
KW
2119 * usually change the upper bound of the loop we are in.
2120 *
2121 * Since the system's default way of setting the locale has not
2122 * found one that works, We use Perl's defined ordering: LC_ALL,
2123 * LANG, and the C locale. We don't try the same locale twice, so
2124 * don't add to the list if already there. (On POSIX systems, the
2125 * LC_ALL element will likely be a repeat of the 0th element "",
6b058d42
KW
2126 * but there's no harm done by doing it explicitly.
2127 *
2128 * Note that this tries the LC_ALL environment variable even on
2129 * systems which have no LC_ALL locale setting. This may or may
2130 * not have been originally intentional, but there's no real need
2131 * to change the behavior. */
65ebb059
KW
2132 if (lc_all) {
2133 for (j = 0; j < trial_locales_count; j++) {
2134 if (strEQ(lc_all, trial_locales[j])) {
2135 goto done_lc_all;
2136 }
2137 }
2138 trial_locales[trial_locales_count++] = lc_all;
2139 }
2140 done_lc_all:
98994639 2141
65ebb059
KW
2142 if (lang) {
2143 for (j = 0; j < trial_locales_count; j++) {
2144 if (strEQ(lang, trial_locales[j])) {
2145 goto done_lang;
2146 }
2147 }
2148 trial_locales[trial_locales_count++] = lang;
2149 }
2150 done_lang:
2151
7d4bcc4a
KW
2152# if defined(WIN32) && defined(LC_ALL)
2153
65ebb059
KW
2154 /* For Windows, we also try the system default locale before "C".
2155 * (If there exists a Windows without LC_ALL we skip this because
2156 * it gets too complicated. For those, the "C" is the next
2157 * fallback possibility). The "" is the same as the 0th element of
2158 * the array, but the code at the loop above knows to treat it
2159 * differently when not the 0th */
2160 trial_locales[trial_locales_count++] = "";
7d4bcc4a
KW
2161
2162# endif
65ebb059
KW
2163
2164 for (j = 0; j < trial_locales_count; j++) {
2165 if (strEQ("C", trial_locales[j])) {
2166 goto done_C;
2167 }
2168 }
2169 trial_locales[trial_locales_count++] = "C";
98994639 2170
65ebb059
KW
2171 done_C: ;
2172 } /* end of first time through the loop */
98994639 2173
7d4bcc4a
KW
2174# ifdef WIN32
2175
65ebb059 2176 next_iteration: ;
7d4bcc4a
KW
2177
2178# endif
65ebb059
KW
2179
2180 } /* end of looping through the trial locales */
2181
2182 if (ok < 1) { /* If we tried to fallback */
2183 const char* msg;
2184 if (! setlocale_failure) { /* fallback succeeded */
2185 msg = "Falling back to";
2186 }
2187 else { /* fallback failed */
98994639 2188
65ebb059
KW
2189 /* We dropped off the end of the loop, so have to decrement i to
2190 * get back to the value the last time through */
2191 i--;
98994639 2192
65ebb059
KW
2193 ok = -1;
2194 msg = "Failed to fall back to";
2195
2196 /* To continue, we should use whatever values we've got */
7d4bcc4a
KW
2197
2198# ifdef USE_LOCALE_CTYPE
2199
49c85077 2200 Safefree(curctype);
837ce802 2201 curctype = savepv(do_setlocale_c(LC_CTYPE, NULL));
5d1187d1 2202 DEBUG_LOCALE_INIT(LC_CTYPE, NULL, curctype);
7d4bcc4a
KW
2203
2204# endif /* USE_LOCALE_CTYPE */
2205# ifdef USE_LOCALE_COLLATE
2206
49c85077 2207 Safefree(curcoll);
837ce802 2208 curcoll = savepv(do_setlocale_c(LC_COLLATE, NULL));
5d1187d1 2209 DEBUG_LOCALE_INIT(LC_COLLATE, NULL, curcoll);
7d4bcc4a
KW
2210
2211# endif /* USE_LOCALE_COLLATE */
2212# ifdef USE_LOCALE_NUMERIC
2213
49c85077 2214 Safefree(curnum);
837ce802 2215 curnum = savepv(do_setlocale_c(LC_NUMERIC, NULL));
5d1187d1 2216 DEBUG_LOCALE_INIT(LC_NUMERIC, NULL, curnum);
7d4bcc4a
KW
2217
2218# endif /* USE_LOCALE_NUMERIC */
2219
65ebb059
KW
2220 }
2221
2222 if (locwarn) {
2223 const char * description;
2224 const char * name = "";
2225 if (strEQ(trial_locales[i], "C")) {
2226 description = "the standard locale";
2227 name = "C";
2228 }
7d4bcc4a
KW
2229
2230# ifdef SYSTEM_DEFAULT_LOCALE
2231
65ebb059
KW
2232 else if (strEQ(trial_locales[i], "")) {
2233 description = "the system default locale";
2234 if (system_default_locale) {
2235 name = system_default_locale;
2236 }
2237 }
7d4bcc4a
KW
2238
2239# endif /* SYSTEM_DEFAULT_LOCALE */
2240
65ebb059
KW
2241 else {
2242 description = "a fallback locale";
2243 name = trial_locales[i];
2244 }
2245 if (name && strNE(name, "")) {
2246 PerlIO_printf(Perl_error_log,
2247 "perl: warning: %s %s (\"%s\").\n", msg, description, name);
2248 }
2249 else {
2250 PerlIO_printf(Perl_error_log,
2251 "perl: warning: %s %s.\n", msg, description);
2252 }
2253 }
2254 } /* End of tried to fallback */
98994639 2255
7d4bcc4a
KW
2256# ifdef USE_LOCALE_CTYPE
2257
98994639 2258 new_ctype(curctype);
98994639 2259
7d4bcc4a
KW
2260# endif /* USE_LOCALE_CTYPE */
2261# ifdef USE_LOCALE_COLLATE
2262
98994639 2263 new_collate(curcoll);
98994639 2264
7d4bcc4a
KW
2265# endif /* USE_LOCALE_COLLATE */
2266# ifdef USE_LOCALE_NUMERIC
2267
98994639 2268 new_numeric(curnum);
b310b053 2269
7d4bcc4a
KW
2270# endif /* USE_LOCALE_NUMERIC */
2271# if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
2272
49c85077
KW
2273 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
2274 * locale is UTF-8. If PL_utf8locale and PL_unicode (set by -C or by
2275 * $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the
2276 * PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open
2277 * discipline. */
c1284011 2278 PL_utf8locale = _is_cur_LC_category_utf8(LC_CTYPE);
49c85077 2279
a05d7ebb 2280 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
fde18df1
JH
2281 This is an alternative to using the -C command line switch
2282 (the -C if present will override this). */
2283 {
dd374669 2284 const char *p = PerlEnv_getenv("PERL_UNICODE");
a05d7ebb 2285 PL_unicode = p ? parse_unicode_opts(&p) : 0;
5a22a2bb
NC
2286 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
2287 PL_utf8cache = -1;
b310b053
JH
2288 }
2289
7d4bcc4a
KW
2290# endif
2291# ifdef USE_LOCALE_CTYPE
2292
43c5f42d 2293 Safefree(curctype);
7d4bcc4a
KW
2294
2295# endif /* USE_LOCALE_CTYPE */
2296# ifdef USE_LOCALE_COLLATE
2297
43c5f42d 2298 Safefree(curcoll);
7d4bcc4a
KW
2299
2300# endif /* USE_LOCALE_COLLATE */
2301# ifdef USE_LOCALE_NUMERIC
2302
43c5f42d 2303 Safefree(curnum);
8ef6e574 2304
7d4bcc4a
KW
2305# endif /* USE_LOCALE_NUMERIC */
2306
2307# ifdef __GLIBC__
2308
175c4cf9 2309 Safefree(language);
7d4bcc4a
KW
2310
2311# endif
175c4cf9
KW
2312
2313 Safefree(lc_all);
2314 Safefree(lang);
2315
e3305790 2316#endif /* USE_LOCALE */
2fcc0ca9 2317#ifdef DEBUGGING
7d4bcc4a 2318
2fcc0ca9 2319 /* So won't continue to output stuff */
27cdc72e 2320 DEBUG_INITIALIZATION_set(FALSE);
7d4bcc4a 2321
2fcc0ca9
KW
2322#endif
2323
98994639
HS
2324 return ok;
2325}
2326
98994639
HS
2327#ifdef USE_LOCALE_COLLATE
2328
a4a439fb 2329char *
a4a439fb
KW
2330Perl__mem_collxfrm(pTHX_ const char *input_string,
2331 STRLEN len, /* Length of 'input_string' */
2332 STRLEN *xlen, /* Set to length of returned string
2333 (not including the collation index
2334 prefix) */
2335 bool utf8 /* Is the input in UTF-8? */
6696cfa7 2336 )
98994639 2337{
a4a439fb
KW
2338
2339 /* _mem_collxfrm() is a bit like strxfrm() but with two important
2340 * differences. First, it handles embedded NULs. Second, it allocates a bit
2341 * more memory than needed for the transformed data itself. The real
55e5378d 2342 * transformed data begins at offset COLLXFRM_HDR_LEN. *xlen is set to
a4a439fb
KW
2343 * the length of that, and doesn't include the collation index size.
2344 * Please see sv_collxfrm() to see how this is used. */
2345
55e5378d
KW
2346#define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
2347
6696cfa7
KW
2348 char * s = (char *) input_string;
2349 STRLEN s_strlen = strlen(input_string);
79f120c8 2350 char *xbuf = NULL;
55e5378d 2351 STRLEN xAlloc; /* xalloc is a reserved word in VC */
17f41037 2352 STRLEN length_in_chars;
c664130f 2353 bool first_time = TRUE; /* Cleared after first loop iteration */
98994639 2354
a4a439fb
KW
2355 PERL_ARGS_ASSERT__MEM_COLLXFRM;
2356
2357 /* Must be NUL-terminated */
2358 assert(*(input_string + len) == '\0');
7918f24d 2359
79f120c8
KW
2360 /* If this locale has defective collation, skip */
2361 if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
c7202dee
KW
2362 DEBUG_L(PerlIO_printf(Perl_debug_log,
2363 "_mem_collxfrm: locale's collation is defective\n"));
79f120c8
KW
2364 goto bad;
2365 }
2366
6696cfa7
KW
2367 /* Replace any embedded NULs with the control that sorts before any others.
2368 * This will give as good as possible results on strings that don't
2369 * otherwise contain that character, but otherwise there may be
2370 * less-than-perfect results with that character and NUL. This is
fdc080f3 2371 * unavoidable unless we replace strxfrm with our own implementation. */
fd43f63c
KW
2372 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
2373 NUL */
6696cfa7
KW
2374 char * e = s + len;
2375 char * sans_nuls;
fdc080f3 2376 STRLEN sans_nuls_len;
94762aa0 2377 int try_non_controls;
afc4976f
KW
2378 char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
2379 making sure 2nd byte is NUL.
2380 */
2381 STRLEN this_replacement_len;
2382
1e4c9676
KW
2383 /* If we don't know what non-NUL control character sorts lowest for
2384 * this locale, find it */
f28f4d2a 2385 if (PL_strxfrm_NUL_replacement == '\0') {
6696cfa7 2386 int j;
afc4976f 2387 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
6696cfa7
KW
2388 includes the collation index
2389 prefixed. */
2390
91c0e2e0 2391 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
94762aa0
KW
2392
2393 /* Unlikely, but it may be that no control will work to replace
1e4c9676
KW
2394 * NUL, in which case we instead look for any character. Controls
2395 * are preferred because collation order is, in general, context
2396 * sensitive, with adjoining characters affecting the order, and
2397 * controls are less likely to have such interactions, allowing the
2398 * NUL-replacement to stand on its own. (Another way to look at it
2399 * is to imagine what would happen if the NUL were replaced by a
2400 * combining character; it wouldn't work out all that well.) */
94762aa0
KW
2401 for (try_non_controls = 0;
2402 try_non_controls < 2;
2403 try_non_controls++)
2404 {
d4ff9586
KW
2405 /* Look through all legal code points (NUL isn't) */
2406 for (j = 1; j < 256; j++) {
2407 char * x; /* j's xfrm plus collation index */
2408 STRLEN x_len; /* length of 'x' */
2409 STRLEN trial_len = 1;
736a4fed 2410 char cur_source[] = { '\0', '\0' };
d4ff9586 2411
736a4fed
KW
2412 /* Skip non-controls the first time through the loop. The
2413 * controls in a UTF-8 locale are the L1 ones */
afc4976f
KW
2414 if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
2415 ? ! isCNTRL_L1(j)
2416 : ! isCNTRL_LC(j))
2417 {
d4ff9586 2418 continue;
6696cfa7 2419 }
6696cfa7 2420
736a4fed
KW
2421 /* Create a 1-char string of the current code point */
2422 cur_source[0] = (char) j;
2423
d4ff9586
KW
2424 /* Then transform it */
2425 x = _mem_collxfrm(cur_source, trial_len, &x_len,
afc4976f 2426 0 /* The string is not in UTF-8 */);
6696cfa7 2427
1e4c9676 2428 /* Ignore any character that didn't successfully transform.
d4ff9586
KW
2429 * */
2430 if (! x) {
2431 continue;
2432 }
6696cfa7 2433
d4ff9586
KW
2434 /* If this character's transformation is lower than
2435 * the current lowest, this one becomes the lowest */
2436 if ( cur_min_x == NULL
2437 || strLT(x + COLLXFRM_HDR_LEN,
2438 cur_min_x + COLLXFRM_HDR_LEN))
2439 {
f28f4d2a 2440 PL_strxfrm_NUL_replacement = j;
d4ff9586 2441 cur_min_x = x;
d4ff9586
KW
2442 }
2443 else {
2444 Safefree(x);
2445 }
1e4c9676 2446 } /* end of loop through all 255 characters */
6696cfa7 2447
1e4c9676 2448 /* Stop looking if found */
94762aa0
KW
2449 if (cur_min_x) {
2450 break;
2451 }
2452
2453 /* Unlikely, but possible, if there aren't any controls that
2454 * work in the locale, repeat the loop, looking for any
2455 * character that works */
2456 DEBUG_L(PerlIO_printf(Perl_debug_log,
2457 "_mem_collxfrm: No control worked. Trying non-controls\n"));
1e4c9676 2458 } /* End of loop to try first the controls, then any char */
6696cfa7 2459
94762aa0
KW
2460 if (! cur_min_x) {
2461 DEBUG_L(PerlIO_printf(Perl_debug_log,
2462 "_mem_collxfrm: Couldn't find any character to replace"
2463 " embedded NULs in locale %s with", PL_collation_name));
2464 goto bad;
58eebef2
KW
2465 }
2466
94762aa0
KW
2467 DEBUG_L(PerlIO_printf(Perl_debug_log,
2468 "_mem_collxfrm: Replacing embedded NULs in locale %s with "
f28f4d2a 2469 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
94762aa0 2470
6696cfa7 2471 Safefree(cur_min_x);
1e4c9676 2472 } /* End of determining the character that is to replace NULs */
afc4976f
KW
2473
2474 /* If the replacement is variant under UTF-8, it must match the
2475 * UTF8-ness as the original */
f28f4d2a
KW
2476 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
2477 this_replacement_char[0] =
2478 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
2479 this_replacement_char[1] =
2480 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
afc4976f
KW
2481 this_replacement_len = 2;
2482 }
2483 else {
f28f4d2a 2484 this_replacement_char[0] = PL_strxfrm_NUL_replacement;
afc4976f
KW
2485 /* this_replacement_char[1] = '\0' was done at initialization */
2486 this_replacement_len = 1;
6696cfa7
KW
2487 }
2488
2489 /* The worst case length for the replaced string would be if every
2490 * character in it is NUL. Multiply that by the length of each
2491 * replacement, and allow for a trailing NUL */
afc4976f 2492 sans_nuls_len = (len * this_replacement_len) + 1;
fdc080f3 2493 Newx(sans_nuls, sans_nuls_len, char);
6696cfa7
KW
2494 *sans_nuls = '\0';
2495
6696cfa7
KW
2496 /* Replace each NUL with the lowest collating control. Loop until have
2497 * exhausted all the NULs */
2498 while (s + s_strlen < e) {
6069d6c5 2499 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
2500
2501 /* Do the actual replacement */
6069d6c5 2502 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
6696cfa7
KW
2503
2504 /* Move past the input NUL */
2505 s += s_strlen + 1;
2506 s_strlen = strlen(s);
2507 }
2508
2509 /* And add anything that trails the final NUL */
6069d6c5 2510 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
2511
2512 /* Switch so below we transform this modified string */
2513 s = sans_nuls;
2514 len = strlen(s);
1e4c9676 2515 } /* End of replacing NULs */
6696cfa7 2516
a4a439fb
KW
2517 /* Make sure the UTF8ness of the string and locale match */
2518 if (utf8 != PL_in_utf8_COLLATE_locale) {
2519 const char * const t = s; /* Temporary so we can later find where the
2520 input was */
2521
2522 /* Here they don't match. Change the string's to be what the locale is
2523 * expecting */
2524
2525 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
2526 s = (char *) bytes_to_utf8((const U8 *) s, &len);
2527 utf8 = TRUE;
2528 }
2529 else { /* locale is not UTF-8; but input is; downgrade the input */
2530
2531 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
2532
2533 /* If the downgrade was successful we are done, but if the input
2534 * contains things that require UTF-8 to represent, have to do
2535 * damage control ... */
2536 if (UNLIKELY(utf8)) {
2537
2538 /* What we do is construct a non-UTF-8 string with
2539 * 1) the characters representable by a single byte converted
2540 * to be so (if necessary);
2541 * 2) and the rest converted to collate the same as the
2542 * highest collating representable character. That makes
2543 * them collate at the end. This is similar to how we
2544 * handle embedded NULs, but we use the highest collating
2545 * code point instead of the smallest. Like the NUL case,
2546 * this isn't perfect, but is the best we can reasonably
2547 * do. Every above-255 code point will sort the same as
2548 * the highest-sorting 0-255 code point. If that code
2549 * point can combine in a sequence with some other code
2550 * points for weight calculations, us changing something to
2551 * be it can adversely affect the results. But in most
2552 * cases, it should work reasonably. And note that this is
2553 * really an illegal situation: using code points above 255
2554 * on a locale where only 0-255 are valid. If two strings
2555 * sort entirely equal, then the sort order for the
2556 * above-255 code points will be in code point order. */
2557
2558 utf8 = FALSE;
2559
2560 /* If we haven't calculated the code point with the maximum
2561 * collating order for this locale, do so now */
2562 if (! PL_strxfrm_max_cp) {
2563 int j;
2564
2565 /* The current transformed string that collates the
2566 * highest (except it also includes the prefixed collation
2567 * index. */
2568 char * cur_max_x = NULL;
2569
2570 /* Look through all legal code points (NUL isn't) */
2571 for (j = 1; j < 256; j++) {
2572 char * x;
2573 STRLEN x_len;
736a4fed 2574 char cur_source[] = { '\0', '\0' };
a4a439fb 2575
736a4fed
KW
2576 /* Create a 1-char string of the current code point */
2577 cur_source[0] = (char) j;
a4a439fb
KW
2578
2579 /* Then transform it */
2580 x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
2581
2582 /* If something went wrong (which it shouldn't), just
2583 * ignore this code point */
94762aa0 2584 if (! x) {
a4a439fb
KW
2585 continue;
2586 }
2587
2588 /* If this character's transformation is higher than
2589 * the current highest, this one becomes the highest */
2590 if ( cur_max_x == NULL
55e5378d
KW
2591 || strGT(x + COLLXFRM_HDR_LEN,
2592 cur_max_x + COLLXFRM_HDR_LEN))
a4a439fb
KW
2593 {
2594 PL_strxfrm_max_cp = j;
2595 cur_max_x = x;
2596 }
2597 else {
2598 Safefree(x);
2599 }
2600 }
2601
94762aa0
KW
2602 if (! cur_max_x) {
2603 DEBUG_L(PerlIO_printf(Perl_debug_log,
2604 "_mem_collxfrm: Couldn't find any character to"
2605 " replace above-Latin1 chars in locale %s with",
2606 PL_collation_name));
2607 goto bad;
2608 }
2609
58eebef2
KW
2610 DEBUG_L(PerlIO_printf(Perl_debug_log,
2611 "_mem_collxfrm: highest 1-byte collating character"
2612 " in locale %s is 0x%02X\n",
2613 PL_collation_name,
2614 PL_strxfrm_max_cp));
58eebef2 2615
a4a439fb
KW
2616 Safefree(cur_max_x);
2617 }
2618
2619 /* Here we know which legal code point collates the highest.
2620 * We are ready to construct the non-UTF-8 string. The length
2621 * will be at least 1 byte smaller than the input string
2622 * (because we changed at least one 2-byte character into a
2623 * single byte), but that is eaten up by the trailing NUL */
2624 Newx(s, len, char);
2625
2626 {
2627 STRLEN i;
2628 STRLEN d= 0;
042d9e50 2629 char * e = (char *) t + len;
a4a439fb
KW
2630
2631 for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
2632 U8 cur_char = t[i];
2633 if (UTF8_IS_INVARIANT(cur_char)) {
2634 s[d++] = cur_char;
2635 }
042d9e50 2636 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
a4a439fb
KW
2637 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
2638 }
2639 else { /* Replace illegal cp with highest collating
2640 one */
2641 s[d++] = PL_strxfrm_max_cp;
2642 }
2643 }
2644 s[d++] = '\0';
2645 Renew(s, d, char); /* Free up unused space */
2646 }
2647 }
2648 }
2649
2650 /* Here, we have constructed a modified version of the input. It could
2651 * be that we already had a modified copy before we did this version.
2652 * If so, that copy is no longer needed */
2653 if (t != input_string) {
2654 Safefree(t);
2655 }
2656 }
2657
17f41037
KW
2658 length_in_chars = (utf8)
2659 ? utf8_length((U8 *) s, (U8 *) s + len)
2660 : len;
2661
59c018b9
KW
2662 /* The first element in the output is the collation id, used by
2663 * sv_collxfrm(); then comes the space for the transformed string. The
2664 * equation should give us a good estimate as to how much is needed */
55e5378d 2665 xAlloc = COLLXFRM_HDR_LEN
a4a439fb 2666 + PL_collxfrm_base
17f41037 2667 + (PL_collxfrm_mult * length_in_chars);
a02a5408 2668 Newx(xbuf, xAlloc, char);
c7202dee
KW
2669 if (UNLIKELY(! xbuf)) {
2670 DEBUG_L(PerlIO_printf(Perl_debug_log,
2671 "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
98994639 2672 goto bad;
c7202dee 2673 }
98994639 2674
d35fca5f 2675 /* Store the collation id */
98994639 2676 *(U32*)xbuf = PL_collation_ix;
d35fca5f
KW
2677
2678 /* Then the transformation of the input. We loop until successful, or we
2679 * give up */
4ebeff16 2680 for (;;) {
1adab0a7 2681
55e5378d 2682 *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
4ebeff16
KW
2683
2684 /* If the transformed string occupies less space than we told strxfrm()
2685 * was available, it means it successfully transformed the whole
2686 * string. */
55e5378d 2687 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
17f41037 2688
1adab0a7
KW
2689 /* Some systems include a trailing NUL in the returned length.
2690 * Ignore it, using a loop in case multiple trailing NULs are
2691 * returned. */
2692 while ( (*xlen) > 0
2693 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
2694 {
2695 (*xlen)--;
2696 }
2697
17f41037
KW
2698 /* If the first try didn't get it, it means our prediction was low.
2699 * Modify the coefficients so that we predict a larger value in any
2700 * future transformations */
2701 if (! first_time) {
2702 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */
2703 STRLEN computed_guess = PL_collxfrm_base
2704 + (PL_collxfrm_mult * length_in_chars);
e1c30f0c
KW
2705
2706 /* On zero-length input, just keep current slope instead of
2707 * dividing by 0 */
2708 const STRLEN new_m = (length_in_chars != 0)
2709 ? needed / length_in_chars
2710 : PL_collxfrm_mult;
17f41037
KW
2711
2712 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
2713 "%s: %d: initial size of %zu bytes for a length "
2714 "%zu string was insufficient, %zu needed\n",
17f41037 2715 __FILE__, __LINE__,
b07929e4 2716 computed_guess, length_in_chars, needed));
17f41037
KW
2717
2718 /* If slope increased, use it, but discard this result for
2719 * length 1 strings, as we can't be sure that it's a real slope
2720 * change */
2721 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
7d4bcc4a
KW
2722
2723# ifdef DEBUGGING
2724
17f41037
KW
2725 STRLEN old_m = PL_collxfrm_mult;
2726 STRLEN old_b = PL_collxfrm_base;
7d4bcc4a
KW
2727
2728# endif
2729
17f41037
KW
2730 PL_collxfrm_mult = new_m;
2731 PL_collxfrm_base = 1; /* +1 For trailing NUL */
2732 computed_guess = PL_collxfrm_base
2733 + (PL_collxfrm_mult * length_in_chars);
2734 if (computed_guess < needed) {
2735 PL_collxfrm_base += needed - computed_guess;
2736 }
2737
2738 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
2739 "%s: %d: slope is now %zu; was %zu, base "
2740 "is now %zu; was %zu\n",
17f41037 2741 __FILE__, __LINE__,
b07929e4
KW
2742 PL_collxfrm_mult, old_m,
2743 PL_collxfrm_base, old_b));
17f41037
KW
2744 }
2745 else { /* Slope didn't change, but 'b' did */
2746 const STRLEN new_b = needed
2747 - computed_guess
2748 + PL_collxfrm_base;
2749 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4 2750 "%s: %d: base is now %zu; was %zu\n",
17f41037 2751 __FILE__, __LINE__,
b07929e4 2752 new_b, PL_collxfrm_base));
17f41037
KW
2753 PL_collxfrm_base = new_b;
2754 }
2755 }
2756
4ebeff16
KW
2757 break;
2758 }
bb0f664e 2759
c7202dee
KW
2760 if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
2761 DEBUG_L(PerlIO_printf(Perl_debug_log,
2762 "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
2763 *xlen, PERL_INT_MAX));
4ebeff16 2764 goto bad;
c7202dee 2765 }
d35fca5f 2766
c664130f 2767 /* A well-behaved strxfrm() returns exactly how much space it needs
1adab0a7
KW
2768 * (usually not including the trailing NUL) when it fails due to not
2769 * enough space being provided. Assume that this is the case unless
2770 * it's been proven otherwise */
c664130f 2771 if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
55e5378d 2772 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
c664130f
KW
2773 }
2774 else { /* Here, either:
2775 * 1) The strxfrm() has previously shown bad behavior; or
2776 * 2) It isn't the first time through the loop, which means
2777 * that the strxfrm() is now showing bad behavior, because
2778 * we gave it what it said was needed in the previous
2779 * iteration, and it came back saying it needed still more.
2780 * (Many versions of cygwin fit this. When the buffer size
2781 * isn't sufficient, they return the input size instead of
2782 * how much is needed.)
d4ff9586
KW
2783 * Increase the buffer size by a fixed percentage and try again.
2784 * */
6ddd902c 2785 xAlloc += (xAlloc / 4) + 1;
c664130f 2786 PL_strxfrm_is_behaved = FALSE;
c664130f 2787
7d4bcc4a
KW
2788# ifdef DEBUGGING
2789
58eebef2
KW
2790 if (DEBUG_Lv_TEST || debug_initialization) {
2791 PerlIO_printf(Perl_debug_log,
2792 "_mem_collxfrm required more space than previously calculated"
b07929e4 2793 " for locale %s, trying again with new guess=%d+%zu\n",
58eebef2 2794 PL_collation_name, (int) COLLXFRM_HDR_LEN,
b07929e4 2795 xAlloc - COLLXFRM_HDR_LEN);
58eebef2 2796 }
7d4bcc4a
KW
2797
2798# endif
2799
58eebef2 2800 }
c664130f 2801
4ebeff16 2802 Renew(xbuf, xAlloc, char);
c7202dee
KW
2803 if (UNLIKELY(! xbuf)) {
2804 DEBUG_L(PerlIO_printf(Perl_debug_log,
2805 "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
4ebeff16 2806 goto bad;
c7202dee 2807 }
c664130f
KW
2808
2809 first_time = FALSE;
4ebeff16 2810 }
98994639 2811
6696cfa7 2812
7d4bcc4a
KW
2813# ifdef DEBUGGING
2814
58eebef2 2815 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee
KW
2816
2817 print_collxfrm_input_and_return(s, s + len, xlen, utf8);
2818 PerlIO_printf(Perl_debug_log, "Its xfrm is:");
7e2f38b2
KW
2819 PerlIO_printf(Perl_debug_log, "%s\n",
2820 _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
2821 *xlen, 1));
58eebef2 2822 }
7d4bcc4a
KW
2823
2824# endif
58eebef2 2825
3c5f993e 2826 /* Free up unneeded space; retain ehough for trailing NUL */
55e5378d 2827 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
98994639 2828
6696cfa7
KW
2829 if (s != input_string) {
2830 Safefree(s);
98994639
HS
2831 }
2832
98994639
HS
2833 return xbuf;
2834
2835 bad:
2836 Safefree(xbuf);
6696cfa7
KW
2837 if (s != input_string) {
2838 Safefree(s);
2839 }
98994639 2840 *xlen = 0;
7d4bcc4a
KW
2841
2842# ifdef DEBUGGING
2843
58eebef2 2844 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee 2845 print_collxfrm_input_and_return(s, s + len, NULL, utf8);
58eebef2 2846 }
7d4bcc4a
KW
2847
2848# endif
2849
98994639
HS
2850 return NULL;
2851}
2852
7d4bcc4a 2853# ifdef DEBUGGING
c7202dee 2854
4cbaac56 2855STATIC void
c7202dee
KW
2856S_print_collxfrm_input_and_return(pTHX_
2857 const char * const s,
2858 const char * const e,
2859 const STRLEN * const xlen,
2860 const bool is_utf8)
2861{
c7202dee
KW
2862
2863 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
2864
511e4ff7
DM
2865 PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
2866 (UV)PL_collation_ix);
c7202dee 2867 if (xlen) {
08b6dc1d 2868 PerlIO_printf(Perl_debug_log, "%zu", *xlen);
c7202dee
KW
2869 }
2870 else {
2871 PerlIO_printf(Perl_debug_log, "NULL");
2872 }
2873 PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
2874 PL_collation_name);
9c8a6dc2
KW
2875 print_bytes_for_locale(s, e, is_utf8);
2876
2877 PerlIO_printf(Perl_debug_log, "'\n");
2878}
2879
2880STATIC void
2881S_print_bytes_for_locale(pTHX_
2882 const char * const s,
2883 const char * const e,
2884 const bool is_utf8)
2885{
2886 const char * t = s;
2887 bool prev_was_printable = TRUE;
2888 bool first_time = TRUE;
2889
2890 PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
c7202dee
KW
2891
2892 while (t < e) {
2893 UV cp = (is_utf8)
2894 ? utf8_to_uvchr_buf((U8 *) t, e, NULL)
2895 : * (U8 *) t;
2896 if (isPRINT(cp)) {
2897 if (! prev_was_printable) {
2898 PerlIO_printf(Perl_debug_log, " ");
2899 }
2900 PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
2901 prev_was_printable = TRUE;
2902 }
2903 else {
2904 if (! first_time) {
2905 PerlIO_printf(Perl_debug_log, " ");
2906 }
147e3846 2907 PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
c7202dee
KW
2908 prev_was_printable = FALSE;
2909 }
2910 t += (is_utf8) ? UTF8SKIP(t) : 1;
2911 first_time = FALSE;
2912 }
c7202dee
KW
2913}
2914
7d4bcc4a 2915# endif /* #ifdef DEBUGGING */
98994639 2916#endif /* USE_LOCALE_COLLATE */
58eebef2 2917
8ef6e574
KW
2918#ifdef USE_LOCALE
2919
c1284011
KW
2920bool
2921Perl__is_cur_LC_category_utf8(pTHX_ int category)
7d74bb61
KW
2922{
2923 /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
2924 * otherwise. 'category' may not be LC_ALL. If the platform doesn't have
119ee68b 2925 * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
609548d2
KW
2926 * could give the wrong result. The result will very likely be correct for
2927 * languages that have commonly used non-ASCII characters, but for notably
2928 * English, it comes down to if the locale's name ends in something like
2929 * "UTF-8". It errs on the side of not being a UTF-8 locale. */
7d74bb61
KW
2930
2931 char *save_input_locale = NULL;
7d74bb61
KW
2932 STRLEN final_pos;
2933
7d4bcc4a
KW
2934# ifdef LC_ALL
2935
7d74bb61 2936 assert(category != LC_ALL);
7d4bcc4a
KW
2937
2938# endif
7d74bb61
KW
2939
2940 /* First dispose of the trivial cases */
837ce802 2941 save_input_locale = do_setlocale_r(category, NULL);
7d74bb61 2942 if (! save_input_locale) {
69014004
KW
2943 DEBUG_L(PerlIO_printf(Perl_debug_log,
2944 "Could not find current locale for category %d\n",
2945 category));
7d74bb61
KW
2946 return FALSE; /* XXX maybe should croak */
2947 }
b07fffd1 2948 save_input_locale = stdize_locale(savepv(save_input_locale));
a39edc4c 2949 if (isNAME_C_OR_POSIX(save_input_locale)) {
69014004
KW
2950 DEBUG_L(PerlIO_printf(Perl_debug_log,
2951 "Current locale for category %d is %s\n",
2952 category, save_input_locale));
b07fffd1 2953 Safefree(save_input_locale);
7d74bb61
KW
2954 return FALSE;
2955 }
2956
7d4bcc4a 2957# if defined(USE_LOCALE_CTYPE) \
1d958db2 2958 && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET)))
7d74bb61 2959
1d958db2 2960 { /* Next try nl_langinfo or MB_CUR_MAX if available */
7d74bb61
KW
2961
2962 char *save_ctype_locale = NULL;
119ee68b 2963 bool is_utf8;
7d74bb61 2964
119ee68b 2965 if (category != LC_CTYPE) { /* These work only on LC_CTYPE */
7d74bb61
KW
2966
2967 /* Get the current LC_CTYPE locale */
837ce802 2968 save_ctype_locale = do_setlocale_c(LC_CTYPE, NULL);
7d74bb61 2969 if (! save_ctype_locale) {
69014004
KW
2970 DEBUG_L(PerlIO_printf(Perl_debug_log,
2971 "Could not find current locale for LC_CTYPE\n"));
7d74bb61
KW
2972 goto cant_use_nllanginfo;
2973 }
4f72bb37 2974 save_ctype_locale = stdize_locale(savepv(save_ctype_locale));
7d74bb61
KW
2975
2976 /* If LC_CTYPE and the desired category use the same locale, this
2977 * means that finding the value for LC_CTYPE is the same as finding
2978 * the value for the desired category. Otherwise, switch LC_CTYPE
2979 * to the desired category's locale */
2980 if (strEQ(save_ctype_locale, save_input_locale)) {
2981 Safefree(save_ctype_locale);
2982 save_ctype_locale = NULL;
2983 }
837ce802 2984 else if (! do_setlocale_c(LC_CTYPE, save_input_locale)) {
69014004
KW
2985 DEBUG_L(PerlIO_printf(Perl_debug_log,
2986 "Could not change LC_CTYPE locale to %s\n",
2987 save_input_locale));
7d74bb61
KW
2988 Safefree(save_ctype_locale);
2989 goto cant_use_nllanginfo;
2990 }
2991 }
2992
69014004
KW
2993 DEBUG_L(PerlIO_printf(Perl_debug_log, "Current LC_CTYPE locale=%s\n",
2994 save_input_locale));
2995
7d74bb61 2996 /* Here the current LC_CTYPE is set to the locale of the category whose
1d958db2
KW
2997 * information is desired. This means that nl_langinfo() and MB_CUR_MAX
2998 * should give the correct results */
119ee68b 2999
7d4bcc4a 3000# if defined(HAS_NL_LANGINFO) && defined(CODESET)
c70a3e68 3001 /* The task is easiest if has this POSIX 2001 function */
7d4bcc4a 3002
1d958db2 3003 {
c70a3e68
KW
3004 const char *codeset = my_nl_langinfo(PERL_CODESET, FALSE);
3005 /* FALSE => already in dest locale */
119ee68b 3006
c70a3e68
KW
3007 DEBUG_L(PerlIO_printf(Perl_debug_log,
3008 "\tnllanginfo returned CODESET '%s'\n", codeset));
3009
3010 if (codeset && strNE(codeset, "")) {
1d958db2
KW
3011 /* If we switched LC_CTYPE, switch back */
3012 if (save_ctype_locale) {
837ce802 3013 do_setlocale_c(LC_CTYPE, save_ctype_locale);
1d958db2
KW
3014 Safefree(save_ctype_locale);
3015 }
3016
3017 is_utf8 = foldEQ(codeset, STR_WITH_LEN("UTF-8"))
3018 || foldEQ(codeset, STR_WITH_LEN("UTF8"));
3019
69014004
KW
3020 DEBUG_L(PerlIO_printf(Perl_debug_log,
3021 "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
3022 codeset, is_utf8));
1d958db2
KW
3023 Safefree(save_input_locale);
3024 return is_utf8;
3025 }
119ee68b
KW
3026 }
3027
7d4bcc4a
KW
3028# endif
3029# ifdef MB_CUR_MAX
1d958db2
KW
3030
3031 /* Here, either we don't have nl_langinfo, or it didn't return a
3032 * codeset. Try MB_CUR_MAX */
3033
119ee68b
KW
3034 /* Standard UTF-8 needs at least 4 bytes to represent the maximum
3035 * Unicode code point. Since UTF-8 is the only non-single byte
3036 * encoding we handle, we just say any such encoding is UTF-8, and if
3037 * turns out to be wrong, other things will fail */
3038 is_utf8 = MB_CUR_MAX >= 4;
3039
69014004
KW
3040 DEBUG_L(PerlIO_printf(Perl_debug_log,
3041 "\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n",
3042 (int) MB_CUR_MAX, is_utf8));
3043
119ee68b
KW
3044 Safefree(save_input_locale);
3045
7d4bcc4a 3046# ifdef HAS_MBTOWC
119ee68b
KW
3047
3048 /* ... But, most system that have MB_CUR_MAX will also have mbtowc(),
3049 * since they are both in the C99 standard. We can feed a known byte
3050 * string to the latter function, and check that it gives the expected
3051 * result */
3052 if (is_utf8) {
3053 wchar_t wc;
856b881c 3054 PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
69014004 3055 errno = 0;
f019f68f 3056 if ((size_t)mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8))
119ee68b
KW
3057 != strlen(HYPHEN_UTF8)
3058 || wc != (wchar_t) 0x2010)
3059 {
3060 is_utf8 = FALSE;
abdcbdb8 3061 DEBUG_L(PerlIO_printf(Perl_debug_log, "\thyphen=U+%x\n", (unsigned int)wc));
69014004
KW
3062 DEBUG_L(PerlIO_printf(Perl_debug_log,
3063 "\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n",
3064 mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8)), errno));
119ee68b
KW
3065 }
3066 }
7d4bcc4a
KW
3067
3068# endif
119ee68b 3069
1d958db2
KW
3070 /* If we switched LC_CTYPE, switch back */
3071 if (save_ctype_locale) {
837ce802 3072 do_setlocale_c(LC_CTYPE, save_ctype_locale);
1d958db2 3073 Safefree(save_ctype_locale);
119ee68b 3074 }
7d74bb61 3075
1d958db2 3076 return is_utf8;
7d4bcc4a
KW
3077
3078# endif
3079
7d74bb61 3080 }
119ee68b 3081
7d74bb61
KW
3082 cant_use_nllanginfo:
3083
7d4bcc4a 3084# else /* nl_langinfo should work if available, so don't bother compiling this
0080c90a
KW
3085 fallback code. The final fallback of looking at the name is
3086 compiled, and will be executed if nl_langinfo fails */
7d74bb61 3087
97f4de96
KW
3088 /* nl_langinfo not available or failed somehow. Next try looking at the
3089 * currency symbol to see if it disambiguates things. Often that will be
3090 * in the native script, and if the symbol isn't in UTF-8, we know that the
3091 * locale isn't. If it is non-ASCII UTF-8, we infer that the locale is
609548d2
KW
3092 * too, as the odds of a non-UTF8 string being valid UTF-8 are quite small
3093 * */
fa9b773e 3094
7d4bcc4a
KW
3095# ifdef HAS_LOCALECONV
3096# ifdef USE_LOCALE_MONETARY
3097
fa9b773e
KW
3098 {
3099 char *save_monetary_locale = NULL;
fa9b773e 3100 bool only_ascii = FALSE;
13542a67
KW
3101 bool is_utf8 = FALSE;
3102 struct lconv* lc;
fa9b773e 3103
97f4de96
KW
3104 /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of
3105 * the desired category, if it isn't that locale already */
3106
fa9b773e
KW
3107 if (category != LC_MONETARY) {
3108
837ce802 3109 save_monetary_locale = do_setlocale_c(LC_MONETARY, NULL);
fa9b773e 3110 if (! save_monetary_locale) {
69014004
KW
3111 DEBUG_L(PerlIO_printf(Perl_debug_log,
3112 "Could not find current locale for LC_MONETARY\n"));
fa9b773e
KW
3113 goto cant_use_monetary;
3114 }
4f72bb37 3115 save_monetary_locale = stdize_locale(savepv(save_monetary_locale));
fa9b773e 3116
13542a67
KW
3117 if (strEQ(save_monetary_locale, save_input_locale)) {
3118 Safefree(save_monetary_locale);
3119 save_monetary_locale = NULL;
3120 }
837ce802 3121 else if (! do_setlocale_c(LC_MONETARY, save_input_locale)) {
59c234b4
KW
3122 DEBUG_L(PerlIO_printf(Perl_debug_log,
3123 "Could not change LC_MONETARY locale to %s\n",
3124 save_input_locale));
3125 Safefree(save_monetary_locale);
3126 goto cant_use_monetary;
fa9b773e
KW
3127 }
3128 }
3129
3130 /* Here the current LC_MONETARY is set to the locale of the category
3131 * whose information is desired. */
3132
13542a67
KW
3133 lc = localeconv();
3134 if (! lc
3135 || ! lc->currency_symbol
c5f058df 3136 || is_utf8_invariant_string((U8 *) lc->currency_symbol, 0))
13542a67
KW
3137 {
3138 DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3139 only_ascii = TRUE;
3140 }
3141 else {
3142 is_utf8 = is_utf8_string((U8 *) lc->currency_symbol, 0);
fa9b773e
KW
3143 }
3144
3145 /* If we changed it, restore LC_MONETARY to its original locale */
3146 if (save_monetary_locale) {
837ce802 3147 do_setlocale_c(LC_MONETARY, save_monetary_locale);
fa9b773e
KW
3148 Safefree(save_monetary_locale);
3149 }
3150
13542a67 3151 if (! only_ascii) {
fa9b773e 3152
59c234b4
KW
3153 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
3154 * otherwise assume the locale is UTF-8 if and only if the symbol
3155 * is non-ascii UTF-8. */
3156 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
3157 save_input_locale, is_utf8));
3158 Safefree(save_input_locale);
3159 return is_utf8;
13542a67 3160 }
fa9b773e
KW
3161 }
3162 cant_use_monetary:
3163
7d4bcc4a
KW
3164# endif /* USE_LOCALE_MONETARY */
3165# endif /* HAS_LOCALECONV */
fa9b773e 3166
7d4bcc4a 3167# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
15f7e74e
KW
3168
3169/* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
3170 * the names of the months and weekdays, timezone, and am/pm indicator */
3171 {
3172 char *save_time_locale = NULL;
3173 int hour = 10;
3174 bool is_dst = FALSE;
3175 int dom = 1;
3176 int month = 0;
3177 int i;
3178 char * formatted_time;
3179
3180
3181 /* Like above for LC_MONETARY, we set LC_TIME to the locale of the
3182 * desired category, if it isn't that locale already */
3183
3184 if (category != LC_TIME) {
3185
837ce802 3186 save_time_locale = do_setlocale_c(LC_TIME, NULL);
15f7e74e
KW
3187 if (! save_time_locale) {
3188 DEBUG_L(PerlIO_printf(Perl_debug_log,
3189 "Could not find current locale for LC_TIME\n"));
3190 goto cant_use_time;
3191 }
3192 save_time_locale = stdize_locale(savepv(save_time_locale));
3193
3194 if (strEQ(save_time_locale, save_input_locale)) {
3195 Safefree(save_time_locale);
3196 save_time_locale = NULL;
3197 }
837ce802 3198 else if (! do_setlocale_c(LC_TIME, save_input_locale)) {
15f7e74e
KW
3199 DEBUG_L(PerlIO_printf(Perl_debug_log,
3200 "Could not change LC_TIME locale to %s\n",
3201 save_input_locale));
3202 Safefree(save_time_locale);
3203 goto cant_use_time;
3204 }
3205 }
3206
3207 /* Here the current LC_TIME is set to the locale of the category
3208 * whose information is desired. Look at all the days of the week and
9f10db87 3209 * month names, and the timezone and am/pm indicator for UTF-8 variant
15f7e74e
KW
3210 * characters. The first such a one found will tell us if the locale
3211 * is UTF-8 or not */
3212
3213 for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
3214 formatted_time = my_strftime("%A %B %Z %p",
3ae5cd07 3215 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
c5f058df
KW
3216 if ( ! formatted_time
3217 || is_utf8_invariant_string((U8 *) formatted_time, 0))
3218 {
15f7e74e
KW
3219
3220 /* Here, we didn't find a non-ASCII. Try the next time through
3221 * with the complemented dst and am/pm, and try with the next
3222 * weekday. After we have gotten all weekdays, try the next
3223 * month */
3224 is_dst = ! is_dst;
3225 hour = (hour + 12) % 24;
3226 dom++;
3227 if (i > 6) {
3228 month++;
3229 }
3230 continue;
3231 }
3232
3233 /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
3234 * false otherwise. But first, restore LC_TIME to its original
3235 * locale if we changed it */
3236 if (save_time_locale) {
837ce802 3237 do_setlocale_c(LC_TIME, save_time_locale);
15f7e74e
KW
3238 Safefree(save_time_locale);
3239 }
3240
3241 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
3242 save_input_locale,
3243 is_utf8_string((U8 *) formatted_time, 0)));
3244 Safefree(save_input_locale);
3245 return is_utf8_string((U8 *) formatted_time, 0);
3246 }
3247
3248 /* Falling off the end of the loop indicates all the names were just
3249 * ASCII. Go on to the next test. If we changed it, restore LC_TIME
3250 * to its original locale */
3251 if (save_time_locale) {
837ce802 3252 do_setlocale_c(LC_TIME, save_time_locale);
15f7e74e
KW
3253 Safefree(save_time_locale);
3254 }
3255 DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3256 }
3257 cant_use_time:
3258
7d4bcc4a 3259# endif
15f7e74e 3260
7d4bcc4a 3261# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
855aeb93
JH
3262
3263/* This code is ifdefd out because it was found to not be necessary in testing
5857e934
KW
3264 * on our dromedary test machine, which has over 700 locales. There, this
3265 * added no value to looking at the currency symbol and the time strings. I
3266 * left it in so as to avoid rewriting it if real-world experience indicates
3267 * that dromedary is an outlier. Essentially, instead of returning abpve if we
855aeb93
JH
3268 * haven't found illegal utf8, we continue on and examine all the strerror()
3269 * messages on the platform for utf8ness. If all are ASCII, we still don't
3270 * know the answer; but otherwise we have a pretty good indication of the
5857e934
KW
3271 * utf8ness. The reason this doesn't help much is that the messages may not
3272 * have been translated into the locale. The currency symbol and time strings
3273 * are much more likely to have been translated. */
3274 {
855aeb93 3275 int e;
5857e934
KW
3276 bool is_utf8 = FALSE;
3277 bool non_ascii = FALSE;
855aeb93 3278 char *save_messages_locale = NULL;
5857e934 3279 const char * errmsg = NULL;
855aeb93 3280
5857e934
KW
3281 /* Like above, we set LC_MESSAGES to the locale of the desired
3282 * category, if it isn't that locale already */
855aeb93
JH
3283
3284 if (category != LC_MESSAGES) {
3285
837ce802 3286 save_messages_locale = do_setlocale_c(LC_MESSAGES, NULL);
855aeb93 3287 if (! save_messages_locale) {
5857e934
KW
3288 DEBUG_L(PerlIO_printf(Perl_debug_log,
3289 "Could not find current locale for LC_MESSAGES\n"));
855aeb93
JH
3290 goto cant_use_messages;
3291 }
5857e934 3292 save_messages_locale = stdize_locale(savepv(save_messages_locale));
855aeb93
JH
3293
3294 if (strEQ(save_messages_locale, save_input_locale)) {
5857e934
KW
3295 Safefree(save_messages_locale);
3296 save_messages_locale = NULL;
855aeb93 3297 }
837ce802 3298 else if (! do_setlocale_c(LC_MESSAGES, save_input_locale)) {
5857e934
KW
3299 DEBUG_L(PerlIO_printf(Perl_debug_log,
3300 "Could not change LC_MESSAGES locale to %s\n",
3301 save_input_locale));
855aeb93
JH
3302 Safefree(save_messages_locale);
3303 goto cant_use_messages;
3304 }
3305 }
3306
3307 /* Here the current LC_MESSAGES is set to the locale of the category
5857e934
KW
3308 * whose information is desired. Look through all the messages. We
3309 * can't use Strerror() here because it may expand to code that
3310 * segfaults in miniperl */
855aeb93 3311
5857e934
KW
3312 for (e = 0; e <= sys_nerr; e++) {
3313 errno = 0;
3314 errmsg = sys_errlist[e];
3315 if (errno || !errmsg) {
855aeb93
JH
3316 break;
3317 }
5857e934 3318 errmsg = savepv(errmsg);
c5f058df 3319 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
5857e934
KW
3320 non_ascii = TRUE;
3321 is_utf8 = is_utf8_string((U8 *) errmsg, 0);
3322 break;
855aeb93
JH
3323 }
3324 }
5857e934 3325 Safefree(errmsg);
855aeb93
JH
3326
3327 /* And, if we changed it, restore LC_MESSAGES to its original locale */
3328 if (save_messages_locale) {
837ce802 3329 do_setlocale_c(LC_MESSAGES, save_messages_locale);
855aeb93
JH
3330 Safefree(save_messages_locale);
3331 }
3332
5857e934
KW
3333 if (non_ascii) {
3334
3335 /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
3336 * any non-ascii means it is one; otherwise we assume it isn't */
3337 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
3338 save_input_locale,
3339 is_utf8));
3340 Safefree(save_input_locale);
3341 return is_utf8;
3342 }
855aeb93 3343
5857e934 3344 DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
855aeb93
JH
3345 }
3346 cant_use_messages:
3347
7d4bcc4a
KW
3348# endif
3349# endif /* the code that is compiled when no nl_langinfo */
0080c90a 3350
7d4bcc4a 3351# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
92c0a900 3352 UTF-8 locale */
7d4bcc4a 3353
97f4de96
KW
3354 /* As a last resort, look at the locale name to see if it matches
3355 * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
3356 * return of setlocale(), is actually defined to be opaque, so we can't
3357 * really rely on the absence of various substrings in the name to indicate
3358 * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
3359 * be a UTF-8 locale. Similarly for the other common names */
3360
3361 final_pos = strlen(save_input_locale) - 1;
3362 if (final_pos >= 3) {
3363 char *name = save_input_locale;
3364
3365 /* Find next 'U' or 'u' and look from there */
3366 while ((name += strcspn(name, "Uu") + 1)
3367 <= save_input_locale + final_pos - 2)
3368 {
c72493e0 3369 if ( isALPHA_FOLD_NE(*name, 't')
305b8651 3370 || isALPHA_FOLD_NE(*(name + 1), 'f'))
97f4de96
KW
3371 {
3372 continue;
3373 }
3374 name += 2;
3375 if (*(name) == '-') {
3376 if ((name > save_input_locale + final_pos - 1)) {
3377 break;
3378 }
3379 name++;
3380 }
3381 if (*(name) == '8') {
97f4de96
KW
3382 DEBUG_L(PerlIO_printf(Perl_debug_log,
3383 "Locale %s ends with UTF-8 in name\n",
3384 save_input_locale));
00c54b9c 3385 Safefree(save_input_locale);
97f4de96
KW
3386 return TRUE;
3387 }
3388 }
3389 DEBUG_L(PerlIO_printf(Perl_debug_log,
3390 "Locale %s doesn't end with UTF-8 in name\n",
3391 save_input_locale));
3392 }
3393
7d4bcc4a
KW
3394# endif
3395# ifdef WIN32
3396
97f4de96 3397 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
8a0832a1 3398 if (memENDs(save_input_locale, final_pos, "65001")) {
97f4de96 3399 DEBUG_L(PerlIO_printf(Perl_debug_log,
8a0832a1 3400 "Locale %s ends with 65001 in name, is UTF-8 locale\n",
97f4de96
KW
3401 save_input_locale));
3402 Safefree(save_input_locale);
3403 return TRUE;
3404 }
7d4bcc4a
KW
3405
3406# endif
97f4de96
KW
3407
3408 /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
3409 * since we are about to return FALSE anyway, there is no point in doing
3410 * this extra work */
7d4bcc4a
KW
3411
3412# if 0
97f4de96
KW
3413 if (instr(save_input_locale, "8859")) {
3414 DEBUG_L(PerlIO_printf(Perl_debug_log,
3415 "Locale %s has 8859 in name, not UTF-8 locale\n",
3416 save_input_locale));
3417 Safefree(save_input_locale);
3418 return FALSE;
3419 }
7d4bcc4a 3420# endif
97f4de96 3421
69014004
KW
3422 DEBUG_L(PerlIO_printf(Perl_debug_log,
3423 "Assuming locale %s is not a UTF-8 locale\n",
3424 save_input_locale));
fa9b773e 3425 Safefree(save_input_locale);
7d74bb61
KW
3426 return FALSE;
3427}
3428
8ef6e574 3429#endif
7d74bb61 3430
d6ded950
KW
3431
3432bool
3433Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
3434{
1a4f13e1 3435 dVAR;
d6ded950
KW
3436 /* Internal function which returns if we are in the scope of a pragma that
3437 * enables the locale category 'category'. 'compiling' should indicate if
3438 * this is during the compilation phase (TRUE) or not (FALSE). */
3439
3440 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
3441
3442 SV *categories = cop_hints_fetch_pvs(cop, "locale", 0);
3443 if (! categories || categories == &PL_sv_placeholder) {
3444 return FALSE;
3445 }
3446
3447 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
3448 * a valid unsigned */
3449 assert(category >= -1);
3450 return cBOOL(SvUV(categories) & (1U << (category + 1)));
3451}
3452
2c6ee1a7 3453char *
6ebbc862
KW
3454Perl_my_strerror(pTHX_ const int errnum)
3455{
3456 /* Returns a mortalized copy of the text of the error message associated
3457 * with 'errnum'. It uses the current locale's text unless the platform
3458 * doesn't have the LC_MESSAGES category or we are not being called from
3459 * within the scope of 'use locale'. In the former case, it uses whatever
3460 * strerror returns; in the latter case it uses the text from the C locale.
3461 *
3462 * The function just calls strerror(), but temporarily switches, if needed,
3463 * to the C locale */
3464
3465 char *errstr;
52770946 3466 dVAR;
6ebbc862 3467
52770946 3468#ifndef USE_LOCALE_MESSAGES
6ebbc862 3469
52770946
KW
3470 /* If platform doesn't have messages category, we don't do any switching to
3471 * the C locale; we just use whatever strerror() returns */
3472
3473 errstr = savepv(Strerror(errnum));
3474
3475#else /* Has locale messages */
3476
3477 const bool within_locale_scope = IN_LC(LC_MESSAGES);
2c6ee1a7 3478
7aaa36b1
KW
3479# if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
3480
3481 /* This function is trivial if we have strerror_l() */
3482
3483 if (within_locale_scope) {
3484 errstr = strerror(errnum);
3485 }
3486 else {
3487 errstr = strerror_l(errnum, PL_C_locale_obj);
3488 }
3489
3490 errstr = savepv(errstr);
3491
3492# else /* Doesn't have strerror_l(). */
3493
3494# ifdef USE_POSIX_2008_LOCALE
3495
fcd0e682 3496 locale_t save_locale = NULL;
7aaa36b1
KW
3497
3498# else
3499
fcd0e682 3500 char * save_locale = NULL;
c9dda6da 3501 bool locale_is_C = FALSE;
2c6ee1a7 3502
6ebbc862
KW
3503 /* We have a critical section to prevent another thread from changing the
3504 * locale out from under us (or zapping the buffer returned from
3505 * setlocale() ) */
3506 LOCALE_LOCK;
3507
7aaa36b1 3508# endif
6ebbc862 3509
9c8a6dc2
KW
3510 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3511 "my_strerror called with errnum %d\n", errnum));
6ebbc862 3512 if (! within_locale_scope) {
c9dda6da 3513 errno = 0;
a0b53297 3514
f1d2176b 3515# ifdef USE_POSIX_2008_LOCALE /* Use the thread-safe locale functions */
6ebbc862 3516
9c8a6dc2
KW
3517 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3518 "Not within locale scope, about to call"
3519 " uselocale(0x%p)\n", PL_C_locale_obj));
6ebbc862 3520 save_locale = uselocale(PL_C_locale_obj);
c9dda6da
KW
3521 if (! save_locale) {
3522 DEBUG_L(PerlIO_printf(Perl_debug_log,
9c8a6dc2
KW
3523 "uselocale failed, errno=%d\n", errno));
3524 }
3525 else {
3526 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3527 "uselocale returned 0x%p\n", save_locale));
c9dda6da 3528 }
6ebbc862 3529
7aaa36b1 3530# else /* Not thread-safe build */
a0b53297 3531
837ce802 3532 save_locale = do_setlocale_c(LC_MESSAGES, NULL);
c9dda6da
KW
3533 if (! save_locale) {
3534 DEBUG_L(PerlIO_printf(Perl_debug_log,
3535 "setlocale failed, errno=%d\n", errno));
3536 }
3537 else {
3538 locale_is_C = isNAME_C_OR_POSIX(save_locale);
2c6ee1a7 3539
c9dda6da
KW
3540 /* Switch to the C locale if not already in it */
3541 if (! locale_is_C) {
2c6ee1a7 3542
c9dda6da
KW
3543 /* The setlocale() just below likely will zap 'save_locale', so
3544 * create a copy. */
3545 save_locale = savepv(save_locale);
837ce802 3546 do_setlocale_c(LC_MESSAGES, "C");
c9dda6da 3547 }
6ebbc862 3548 }
2c6ee1a7 3549
7aaa36b1 3550# endif
2c6ee1a7 3551
6ebbc862 3552 } /* end of ! within_locale_scope */
9c8a6dc2
KW
3553 else {
3554 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n",
3555 __FILE__, __LINE__));
3556 }
a0b53297 3557
9c8a6dc2
KW
3558 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3559 "Any locale change has been done; about to call Strerror\n"));
52770946 3560 errstr = savepv(Strerror(errnum));
6ebbc862
KW
3561
3562 if (! within_locale_scope) {
c9dda6da 3563 errno = 0;
a0b53297 3564
f1d2176b 3565# ifdef USE_POSIX_2008_LOCALE
6ebbc862 3566
9c8a6dc2
KW
3567 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3568 "%s: %d: not within locale scope, restoring the locale\n",
3569 __FILE__, __LINE__));
c9dda6da
KW
3570 if (save_locale && ! uselocale(save_locale)) {
3571 DEBUG_L(PerlIO_printf(Perl_debug_log,
3572 "uselocale restore failed, errno=%d\n", errno));
3573 }
2c6ee1a7 3574 }
6ebbc862 3575
7aaa36b1 3576# else
6ebbc862 3577
c9dda6da 3578 if (save_locale && ! locale_is_C) {
837ce802 3579 if (! do_setlocale_c(LC_MESSAGES, save_locale)) {
c9dda6da
KW
3580 DEBUG_L(PerlIO_printf(Perl_debug_log,
3581 "setlocale restore failed, errno=%d\n", errno));
3582 }
6ebbc862
KW
3583 Safefree(save_locale);
3584 }
3585 }
3586
3587 LOCALE_UNLOCK;
3588
7aaa36b1
KW
3589# endif
3590# endif /* End of doesn't have strerror_l */
52770946 3591#endif /* End of does have locale messages */
6affbbf0
KW
3592
3593#ifdef DEBUGGING
3594
3595 if (DEBUG_Lv_TEST) {
3596 PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '");
3597 print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);
3598 PerlIO_printf(Perl_debug_log, "'\n");
3599 }
3600
2c6ee1a7
KW
3601#endif
3602
52770946 3603 SAVEFREEPV(errstr);
6ebbc862 3604 return errstr;
2c6ee1a7
KW
3605}
3606
66610fdd 3607/*
747c467a 3608
747c467a
KW
3609=for apidoc sync_locale
3610
3611Changing the program's locale should be avoided by XS code. Nevertheless,
3612certain non-Perl libraries called from XS, such as C<Gtk> do so. When this
3613happens, Perl needs to be told that the locale has changed. Use this function
3614to do so, before returning to Perl.
3615
3616=cut
3617*/
3618
3619void
3620Perl_sync_locale(pTHX)
3621{
9f82ea3e 3622 char * newlocale;
747c467a
KW
3623
3624#ifdef USE_LOCALE_CTYPE
7d4bcc4a 3625
9f82ea3e
KW
3626 newlocale = do_setlocale_c(LC_CTYPE, NULL);
3627 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3628 "%s:%d: %s\n", __FILE__, __LINE__,
3629 setlocale_debug_string(LC_CTYPE, NULL, newlocale)));
3630 new_ctype(newlocale);
747c467a 3631
7d4bcc4a 3632#endif /* USE_LOCALE_CTYPE */
747c467a 3633#ifdef USE_LOCALE_COLLATE
7d4bcc4a 3634
9f82ea3e
KW
3635 newlocale = do_setlocale_c(LC_COLLATE, NULL);
3636 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3637 "%s:%d: %s\n", __FILE__, __LINE__,
3638 setlocale_debug_string(LC_COLLATE, NULL, newlocale)));
3639 new_collate(newlocale);
747c467a 3640
7d4bcc4a 3641#endif
747c467a 3642#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 3643
9f82ea3e
KW
3644 newlocale = do_setlocale_c(LC_NUMERIC, NULL);
3645 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3646 "%s:%d: %s\n", __FILE__, __LINE__,
3647 setlocale_debug_string(LC_NUMERIC, NULL, newlocale)));
3648 new_numeric(newlocale);
7d4bcc4a 3649
747c467a
KW
3650#endif /* USE_LOCALE_NUMERIC */
3651
3652}
3653
5d1187d1
KW
3654#if defined(DEBUGGING) && defined(USE_LOCALE)
3655
a4f00dcc
KW
3656STATIC char *
3657S_setlocale_debug_string(const int category, /* category number,
5d1187d1
KW
3658 like LC_ALL */
3659 const char* const locale, /* locale name */
3660
3661 /* return value from setlocale() when attempting to
3662 * set 'category' to 'locale' */
3663 const char* const retval)
3664{
3665 /* Returns a pointer to a NUL-terminated string in static storage with
3666 * added text about the info passed in. This is not thread safe and will
3667 * be overwritten by the next call, so this should be used just to
fa07b8e5 3668 * formulate a string to immediately print or savepv() on. */
5d1187d1 3669
398a990f
DM
3670 /* initialise to a non-null value to keep it out of BSS and so keep
3671 * -DPERL_GLOBAL_STRUCT_PRIVATE happy */
60b45a7d
KW
3672 static char ret[128] = "If you can read this, thank your buggy C"
3673 " library strlcpy(), and change your hints file"
3674 " to undef it";
fa07b8e5 3675 my_strlcpy(ret, "setlocale(", sizeof(ret));
5d1187d1
KW
3676
3677 switch (category) {
3678 default:
fa07b8e5 3679 my_snprintf(ret, sizeof(ret), "%s? %d", ret, category);
5d1187d1 3680 break;
7d4bcc4a
KW
3681
3682# ifdef LC_ALL
3683
5d1187d1 3684 case LC_ALL:
fa07b8e5 3685 my_strlcat(ret, "LC_ALL", sizeof(ret));
5d1187d1 3686 break;
7d4bcc4a
KW
3687
3688# endif
3689# ifdef LC_CTYPE
3690
5d1187d1 3691 case LC_CTYPE:
fa07b8e5 3692 my_strlcat(ret, "LC_CTYPE", sizeof(ret));
5d1187d1 3693 break;
7d4bcc4a
KW
3694
3695# endif
3696# ifdef LC_NUMERIC
3697
5d1187d1 3698 case LC_NUMERIC:
fa07b8e5 3699 my_strlcat(ret, "LC_NUMERIC", sizeof(ret));
5d1187d1 3700 break;
7d4bcc4a
KW
3701
3702# endif
3703# ifdef LC_COLLATE
3704
5d1187d1 3705 case LC_COLLATE:
fa07b8e5 3706 my_strlcat(ret, "LC_COLLATE", sizeof(ret));
5d1187d1