This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
locale.c: Change static fcn name
[perl5.git] / locale.c
CommitLineData
98994639
HS
1/* locale.c
2 *
1129b882
NC
3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
98994639
HS
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550 12 * A Elbereth Gilthoniel,
cdad3b53 13 * silivren penna míriel
4ac71550 14 * o menel aglar elenath!
cdad3b53 15 * Na-chaered palan-díriel
4ac71550
TC
16 * o galadhremmin ennorath,
17 * Fanuilos, le linnathon
18 * nef aear, si nef aearon!
19 *
20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
98994639
HS
21 */
22
166f8a29
DM
23/* utility functions for handling locale-specific stuff like what
24 * character represents the decimal point.
0d071d52 25 *
7d4bcc4a
KW
26 * All C programs have an underlying locale. Perl code generally doesn't pay
27 * any attention to it except within the scope of a 'use locale'. For most
0d071d52
KW
28 * categories, it accomplishes this by just using different operations if it is
29 * in such scope than if not. However, various libc functions called by Perl
30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31 * are used to toggle between the current locale and the C locale depending on
a9ad02a8
KW
32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is
33 * switched to the C locale for outputting the message unless within the scope
34 * of 'use locale'.
166f8a29
DM
35 */
36
98994639
HS
37#include "EXTERN.h"
38#define PERL_IN_LOCALE_C
f7416781 39#include "perl_langinfo.h"
98994639
HS
40#include "perl.h"
41
a4af207c
JH
42#include "reentr.h"
43
2fcc0ca9
KW
44/* If the environment says to, we can output debugging information during
45 * initialization. This is done before option parsing, and before any thread
46 * creation, so can be a file-level static */
47#ifdef DEBUGGING
7d4bcc4a 48# ifdef PERL_GLOBAL_STRUCT
27cdc72e 49 /* no global syms allowed */
7d4bcc4a
KW
50# define debug_initialization 0
51# define DEBUG_INITIALIZATION_set(v)
52# else
2fcc0ca9 53static bool debug_initialization = FALSE;
7d4bcc4a
KW
54# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
55# endif
2fcc0ca9
KW
56#endif
57
8ef6e574
KW
58#ifdef USE_LOCALE
59
98994639 60/*
0d071d52
KW
61 * Standardize the locale name from a string returned by 'setlocale', possibly
62 * modifying that string.
98994639 63 *
0ef2a2b2 64 * The typical return value of setlocale() is either
98994639
HS
65 * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
66 * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
67 * (the space-separated values represent the various sublocales,
0ef2a2b2 68 * in some unspecified order). This is not handled by this function.
98994639
HS
69 *
70 * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
0ef2a2b2
KW
71 * which is harmful for further use of the string in setlocale(). This
72 * function removes the trailing new line and everything up through the '='
98994639
HS
73 *
74 */
75STATIC char *
76S_stdize_locale(pTHX_ char *locs)
77{
7452cf6a 78 const char * const s = strchr(locs, '=');
98994639
HS
79 bool okay = TRUE;
80
7918f24d
NC
81 PERL_ARGS_ASSERT_STDIZE_LOCALE;
82
8772537c
AL
83 if (s) {
84 const char * const t = strchr(s, '.');
98994639 85 okay = FALSE;
8772537c
AL
86 if (t) {
87 const char * const u = strchr(t, '\n');
88 if (u && (u[1] == 0)) {
89 const STRLEN len = u - s;
90 Move(s + 1, locs, len, char);
91 locs[len] = 0;
92 okay = TRUE;
98994639
HS
93 }
94 }
95 }
96
97 if (!okay)
98 Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
99
100 return locs;
101}
102
8ef6e574
KW
103#endif
104
a4f00dcc 105STATIC void
86799d2d 106S_set_numeric_radix(pTHX_ const bool use_locale)
98994639 107{
86799d2d
KW
108 /* If 'use_locale' is FALSE, set to use a dot for the radix character. If
109 * TRUE, use the radix character derived from the current locale */
7d4bcc4a 110
86799d2d
KW
111#if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \
112 || defined(HAS_NL_LANGINFO))
98994639 113
86799d2d
KW
114 /* We only set up the radix SV if we are to use a locale radix ... */
115 if (use_locale) {
116 const char * radix = my_nl_langinfo(PERL_RADIXCHAR, FALSE);
117 /* FALSE => already in dest locale */
118
119 /* ... and the character being used isn't a dot */
120 if (strNE(radix, ".")) {
121 if (PL_numeric_radix_sv) {
122 sv_setpv(PL_numeric_radix_sv, radix);
123 }
124 else {
125 PL_numeric_radix_sv = newSVpv(radix, 0);
126 }
127
128 if ( ! is_utf8_invariant_string(
129 (U8 *) SvPVX(PL_numeric_radix_sv), SvCUR(PL_numeric_radix_sv))
130 && is_utf8_string(
131 (U8 *) SvPVX(PL_numeric_radix_sv), SvCUR(PL_numeric_radix_sv))
c1284011 132 && _is_cur_LC_category_utf8(LC_NUMERIC))
28acfe03 133 {
86799d2d 134 SvUTF8_on(PL_numeric_radix_sv);
28acfe03 135 }
86799d2d
KW
136 goto done;
137 }
98994639 138 }
69014004 139
86799d2d
KW
140 SvREFCNT_dec(PL_numeric_radix_sv);
141 PL_numeric_radix_sv = NULL;
142
143 done: ;
144
145# ifdef DEBUGGING
7d4bcc4a 146
2fcc0ca9
KW
147 if (DEBUG_L_TEST || debug_initialization) {
148 PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
69014004 149 (PL_numeric_radix_sv)
37b7e435
KW
150 ? SvPVX(PL_numeric_radix_sv)
151 : "NULL",
152 (PL_numeric_radix_sv)
39eb7305 153 ? cBOOL(SvUTF8(PL_numeric_radix_sv))
2fcc0ca9
KW
154 : 0);
155 }
69014004 156
86799d2d
KW
157# endif
158#endif /* USE_LOCALE_NUMERIC and can find the radix char */
7d4bcc4a 159
98994639
HS
160}
161
a39edc4c
KW
162/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
163 * return of setlocale(), then this is extremely likely to be the C or POSIX
164 * locale. However, the output of setlocale() is documented to be opaque, but
165 * the odds are extremely small that it would return these two strings for some
166 * other locale. Note that VMS in these two locales includes many non-ASCII
167 * characters as controls and punctuation (below are hex bytes):
168 * cntrl: 00-1F 7F 84-97 9B-9F
169 * punct: 21-2F 3A-40 5B-60 7B-7E A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
170 * Oddly, none there are listed as alphas, though some represent alphabetics
171 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
7d4bcc4a
KW
172#define isNAME_C_OR_POSIX(name) \
173 ( (name) != NULL \
174 && (( *(name) == 'C' && (*(name + 1)) == '\0') \
175 || strEQ((name), "POSIX")))
a39edc4c 176
98994639 177void
8772537c 178Perl_new_numeric(pTHX_ const char *newnum)
98994639 179{
7d4bcc4a
KW
180
181#ifndef USE_LOCALE_NUMERIC
182
183 PERL_UNUSED_ARG(newnum);
184
185#else
0d071d52
KW
186
187 /* Called after all libc setlocale() calls affecting LC_NUMERIC, to tell
188 * core Perl this and that 'newnum' is the name of the new locale.
189 * It installs this locale as the current underlying default.
190 *
191 * The default locale and the C locale can be toggled between by use of the
192 * set_numeric_local() and set_numeric_standard() functions, which should
193 * probably not be called directly, but only via macros like
194 * SET_NUMERIC_STANDARD() in perl.h.
195 *
196 * The toggling is necessary mainly so that a non-dot radix decimal point
197 * character can be output, while allowing internal calculations to use a
198 * dot.
199 *
200 * This sets several interpreter-level variables:
bb304765 201 * PL_numeric_name The underlying locale's name: a copy of 'newnum'
0d071d52 202 * PL_numeric_local A boolean indicating if the toggled state is such
7738054c
KW
203 * that the current locale is the program's underlying
204 * locale
205 * PL_numeric_standard An int indicating if the toggled state is such
206 * that the current locale is the C locale. If non-zero,
207 * it is in C; if > 1, it means it may not be toggled away
208 * from C.
0d071d52
KW
209 * Note that both of the last two variables can be true at the same time,
210 * if the underlying locale is C. (Toggling is a no-op under these
211 * circumstances.)
212 *
213 * Any code changing the locale (outside this file) should use
214 * POSIX::setlocale, which calls this function. Therefore this function
215 * should be called directly only from this file and from
216 * POSIX::setlocale() */
217
b03f34cf 218 char *save_newnum;
98994639
HS
219
220 if (! newnum) {
43c5f42d
NC
221 Safefree(PL_numeric_name);
222 PL_numeric_name = NULL;
98994639
HS
223 PL_numeric_standard = TRUE;
224 PL_numeric_local = TRUE;
225 return;
226 }
227
b03f34cf 228 save_newnum = stdize_locale(savepv(newnum));
abe1abcf
KW
229
230 PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
231 PL_numeric_local = TRUE;
232
b03f34cf 233 if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
98994639 234 Safefree(PL_numeric_name);
b03f34cf 235 PL_numeric_name = save_newnum;
b03f34cf 236 }
abe1abcf
KW
237 else {
238 Safefree(save_newnum);
239 }
4c28b29c
KW
240
241 /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't
242 * have to worry about the radix being a non-dot. (Core operations that
243 * need the underlying locale change to it temporarily). */
244 set_numeric_standard();
245
98994639 246#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 247
98994639
HS
248}
249
250void
251Perl_set_numeric_standard(pTHX)
252{
7d4bcc4a 253
98994639 254#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 255
28c1bf33
KW
256 /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like
257 * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The
258 * macro avoids calling this routine if toggling isn't necessary according
259 * to our records (which could be wrong if some XS code has changed the
260 * locale behind our back) */
0d071d52 261
a9b8c0d8
KW
262 setlocale(LC_NUMERIC, "C");
263 PL_numeric_standard = TRUE;
264 PL_numeric_local = isNAME_C_OR_POSIX(PL_numeric_name);
86799d2d 265 set_numeric_radix(0);
7d4bcc4a
KW
266
267# ifdef DEBUGGING
268
2fcc0ca9
KW
269 if (DEBUG_L_TEST || debug_initialization) {
270 PerlIO_printf(Perl_debug_log,
271 "Underlying LC_NUMERIC locale now is C\n");
272 }
98994639 273
7d4bcc4a 274# endif
98994639 275#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 276
98994639
HS
277}
278
279void
280Perl_set_numeric_local(pTHX)
281{
7d4bcc4a 282
98994639 283#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 284
28c1bf33 285 /* Toggle the LC_NUMERIC locale to the current underlying default. Most
7d4bcc4a
KW
286 * code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h
287 * instead of calling this directly. The macro avoids calling this routine
288 * if toggling isn't necessary according to our records (which could be
289 * wrong if some XS code has changed the locale behind our back) */
a9b8c0d8
KW
290
291 setlocale(LC_NUMERIC, PL_numeric_name);
292 PL_numeric_standard = isNAME_C_OR_POSIX(PL_numeric_name);
293 PL_numeric_local = TRUE;
86799d2d 294 set_numeric_radix(1);
7d4bcc4a
KW
295
296# ifdef DEBUGGING
297
2fcc0ca9
KW
298 if (DEBUG_L_TEST || debug_initialization) {
299 PerlIO_printf(Perl_debug_log,
69014004 300 "Underlying LC_NUMERIC locale now is %s\n",
2fcc0ca9
KW
301 PL_numeric_name);
302 }
98994639 303
7d4bcc4a 304# endif
98994639 305#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 306
98994639
HS
307}
308
309/*
310 * Set up for a new ctype locale.
311 */
a4f00dcc
KW
312STATIC void
313S_new_ctype(pTHX_ const char *newctype)
98994639 314{
7d4bcc4a
KW
315
316#ifndef USE_LOCALE_CTYPE
317
318 PERL_ARGS_ASSERT_NEW_CTYPE;
319 PERL_UNUSED_ARG(newctype);
320 PERL_UNUSED_CONTEXT;
321
322#else
0d071d52
KW
323
324 /* Called after all libc setlocale() calls affecting LC_CTYPE, to tell
325 * core Perl this and that 'newctype' is the name of the new locale.
326 *
327 * This function sets up the folding arrays for all 256 bytes, assuming
328 * that tofold() is tolc() since fold case is not a concept in POSIX,
329 *
330 * Any code changing the locale (outside this file) should use
331 * POSIX::setlocale, which calls this function. Therefore this function
332 * should be called directly only from this file and from
333 * POSIX::setlocale() */
334
27da23d5 335 dVAR;
68067e4e 336 UV i;
98994639 337
7918f24d
NC
338 PERL_ARGS_ASSERT_NEW_CTYPE;
339
215c5139
KW
340 /* We will replace any bad locale warning with 1) nothing if the new one is
341 * ok; or 2) a new warning for the bad new locale */
342 if (PL_warn_locale) {
343 SvREFCNT_dec_NN(PL_warn_locale);
344 PL_warn_locale = NULL;
345 }
346
c1284011 347 PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
31f05a37
KW
348
349 /* A UTF-8 locale gets standard rules. But note that code still has to
350 * handle this specially because of the three problematic code points */
351 if (PL_in_utf8_CTYPE_locale) {
352 Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
353 }
354 else {
8c6180a9
KW
355 /* Assume enough space for every character being bad. 4 spaces each
356 * for the 94 printable characters that are output like "'x' "; and 5
357 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
358 * NUL */
359 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ];
360
cc9eaeb0
KW
361 /* Don't check for problems if we are suppressing the warnings */
362 bool check_for_problems = ckWARN_d(WARN_LOCALE)
363 || UNLIKELY(DEBUG_L_TEST);
8c6180a9
KW
364 bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
365 to start */
366 unsigned int bad_count = 0; /* Count of bad characters */
367
baa60164
KW
368 for (i = 0; i < 256; i++) {
369 if (isUPPER_LC((U8) i))
370 PL_fold_locale[i] = (U8) toLOWER_LC((U8) i);
371 else if (isLOWER_LC((U8) i))
372 PL_fold_locale[i] = (U8) toUPPER_LC((U8) i);
373 else
374 PL_fold_locale[i] = (U8) i;
8c6180a9
KW
375
376 /* If checking for locale problems, see if the native ASCII-range
377 * printables plus \n and \t are in their expected categories in
378 * the new locale. If not, this could mean big trouble, upending
379 * Perl's and most programs' assumptions, like having a
380 * metacharacter with special meaning become a \w. Fortunately,
381 * it's very rare to find locales that aren't supersets of ASCII
382 * nowadays. It isn't a problem for most controls to be changed
383 * into something else; we check only \n and \t, though perhaps \r
384 * could be an issue as well. */
7d4bcc4a 385 if ( check_for_problems
8c6180a9
KW
386 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
387 {
7d4bcc4a 388 if (( isALPHANUMERIC_A(i) && ! isALPHANUMERIC_LC(i))
8c6180a9
KW
389 || (isPUNCT_A(i) && ! isPUNCT_LC(i))
390 || (isBLANK_A(i) && ! isBLANK_LC(i))
391 || (i == '\n' && ! isCNTRL_LC(i)))
392 {
393 if (bad_count) { /* Separate multiple entries with a
394 blank */
395 bad_chars_list[bad_count++] = ' ';
396 }
397 bad_chars_list[bad_count++] = '\'';
398 if (isPRINT_A(i)) {
399 bad_chars_list[bad_count++] = (char) i;
400 }
401 else {
402 bad_chars_list[bad_count++] = '\\';
403 if (i == '\n') {
404 bad_chars_list[bad_count++] = 'n';
405 }
406 else {
407 assert(i == '\t');
408 bad_chars_list[bad_count++] = 't';
409 }
410 }
411 bad_chars_list[bad_count++] = '\'';
412 bad_chars_list[bad_count] = '\0';
413 }
414 }
415 }
416
7d4bcc4a
KW
417# ifdef MB_CUR_MAX
418
8c6180a9 419 /* We only handle single-byte locales (outside of UTF-8 ones; so if
d35fca5f 420 * this locale requires more than one byte, there are going to be
8c6180a9 421 * problems. */
9c8a6dc2
KW
422 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
423 "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
424 __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
425
ba1a4362
KW
426 if (check_for_problems && MB_CUR_MAX > 1
427
428 /* Some platforms return MB_CUR_MAX > 1 for even the "C"
429 * locale. Just assume that the implementation for them (plus
430 * for POSIX) is correct and the > 1 value is spurious. (Since
431 * these are specially handled to never be considered UTF-8
432 * locales, as long as this is the only problem, everything
433 * should work fine */
434 && strNE(newctype, "C") && strNE(newctype, "POSIX"))
435 {
8c6180a9
KW
436 multi_byte_locale = TRUE;
437 }
7d4bcc4a
KW
438
439# endif
8c6180a9
KW
440
441 if (bad_count || multi_byte_locale) {
780fcc9f 442 PL_warn_locale = Perl_newSVpvf(aTHX_
8c6180a9 443 "Locale '%s' may not work well.%s%s%s\n",
780fcc9f 444 newctype,
8c6180a9
KW
445 (multi_byte_locale)
446 ? " Some characters in it are not recognized by"
447 " Perl."
448 : "",
449 (bad_count)
450 ? "\nThe following characters (and maybe others)"
451 " may not have the same meaning as the Perl"
452 " program expects:\n"
453 : "",
454 (bad_count)
455 ? bad_chars_list
456 : ""
457 );
cc9eaeb0 458 /* If we are actually in the scope of the locale or are debugging,
bddebb56
KW
459 * output the message now. If not in that scope, we save the
460 * message to be output at the first operation using this locale,
461 * if that actually happens. Most programs don't use locales, so
462 * they are immune to bad ones. */
cc9eaeb0 463 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
780fcc9f
KW
464
465 /* We have to save 'newctype' because the setlocale() just
466 * below may destroy it. The next setlocale() further down
467 * should restore it properly so that the intermediate change
468 * here is transparent to this function's caller */
469 const char * const badlocale = savepv(newctype);
470
471 setlocale(LC_CTYPE, "C");
472
473 /* The '0' below suppresses a bogus gcc compiler warning */
474 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
bddebb56 475
780fcc9f 476 setlocale(LC_CTYPE, badlocale);
c0f3a893 477 Safefree(badlocale);
bddebb56
KW
478
479 if (IN_LC(LC_CTYPE)) {
480 SvREFCNT_dec_NN(PL_warn_locale);
481 PL_warn_locale = NULL;
482 }
780fcc9f 483 }
baa60164 484 }
31f05a37 485 }
98994639
HS
486
487#endif /* USE_LOCALE_CTYPE */
7d4bcc4a 488
98994639
HS
489}
490
98994639 491void
2726666d
KW
492Perl__warn_problematic_locale()
493{
2726666d
KW
494
495#ifdef USE_LOCALE_CTYPE
496
5f04a188
KW
497 dTHX;
498
499 /* Internal-to-core function that outputs the message in PL_warn_locale,
500 * and then NULLS it. Should be called only through the macro
501 * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */
502
2726666d
KW
503 if (PL_warn_locale) {
504 /*GCC_DIAG_IGNORE(-Wformat-security); Didn't work */
505 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
506 SvPVX(PL_warn_locale),
507 0 /* dummy to avoid compiler warning */ );
508 /* GCC_DIAG_RESTORE; */
509 SvREFCNT_dec_NN(PL_warn_locale);
510 PL_warn_locale = NULL;
511 }
512
513#endif
514
515}
516
a4f00dcc
KW
517STATIC void
518S_new_collate(pTHX_ const char *newcoll)
98994639 519{
7d4bcc4a
KW
520
521#ifndef USE_LOCALE_COLLATE
522
523 PERL_UNUSED_ARG(newcoll);
524 PERL_UNUSED_CONTEXT;
525
526#else
0d071d52
KW
527
528 /* Called after all libc setlocale() calls affecting LC_COLLATE, to tell
529 * core Perl this and that 'newcoll' is the name of the new locale.
530 *
d35fca5f
KW
531 * The design of locale collation is that every locale change is given an
532 * index 'PL_collation_ix'. The first time a string particpates in an
533 * operation that requires collation while locale collation is active, it
534 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
535 * magic includes the collation index, and the transformation of the string
536 * by strxfrm(), q.v. That transformation is used when doing comparisons,
537 * instead of the string itself. If a string changes, the magic is
538 * cleared. The next time the locale changes, the index is incremented,
539 * and so we know during a comparison that the transformation is not
540 * necessarily still valid, and so is recomputed. Note that if the locale
541 * changes enough times, the index could wrap (a U32), and it is possible
542 * that a transformation would improperly be considered valid, leading to
543 * an unlikely bug */
0d071d52 544
98994639
HS
545 if (! newcoll) {
546 if (PL_collation_name) {
547 ++PL_collation_ix;
548 Safefree(PL_collation_name);
549 PL_collation_name = NULL;
550 }
551 PL_collation_standard = TRUE;
00bf60ca 552 is_standard_collation:
98994639
HS
553 PL_collxfrm_base = 0;
554 PL_collxfrm_mult = 2;
165a1c52 555 PL_in_utf8_COLLATE_locale = FALSE;
f28f4d2a 556 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 557 PL_strxfrm_max_cp = 0;
98994639
HS
558 return;
559 }
560
d35fca5f 561 /* If this is not the same locale as currently, set the new one up */
98994639
HS
562 if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
563 ++PL_collation_ix;
564 Safefree(PL_collation_name);
565 PL_collation_name = stdize_locale(savepv(newcoll));
a39edc4c 566 PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
00bf60ca
KW
567 if (PL_collation_standard) {
568 goto is_standard_collation;
569 }
98994639 570
165a1c52 571 PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
f28f4d2a 572 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 573 PL_strxfrm_max_cp = 0;
165a1c52 574
59c018b9
KW
575 /* A locale collation definition includes primary, secondary, tertiary,
576 * etc. weights for each character. To sort, the primary weights are
577 * used, and only if they compare equal, then the secondary weights are
578 * used, and only if they compare equal, then the tertiary, etc.
579 *
580 * strxfrm() works by taking the input string, say ABC, and creating an
581 * output transformed string consisting of first the primary weights,
582 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
583 * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
584 * may not have weights at every level. In our example, let's say B
585 * doesn't have a tertiary weight, and A doesn't have a secondary
586 * weight. The constructed string is then going to be
587 * A¹B¹C¹ B²C² A³C³ ....
588 * This has the desired effect that strcmp() will look at the secondary
589 * or tertiary weights only if the strings compare equal at all higher
590 * priority weights. The spaces shown here, like in
c342d20e 591 * "A¹B¹C¹ A²B²C² "
59c018b9
KW
592 * are not just for readability. In the general case, these must
593 * actually be bytes, which we will call here 'separator weights'; and
594 * they must be smaller than any other weight value, but since these
595 * are C strings, only the terminating one can be a NUL (some
596 * implementations may include a non-NUL separator weight just before
597 * the NUL). Implementations tend to reserve 01 for the separator
598 * weights. They are needed so that a shorter string's secondary
599 * weights won't be misconstrued as primary weights of a longer string,
600 * etc. By making them smaller than any other weight, the shorter
601 * string will sort first. (Actually, if all secondary weights are
602 * smaller than all primary ones, there is no need for a separator
603 * weight between those two levels, etc.)
604 *
605 * The length of the transformed string is roughly a linear function of
606 * the input string. It's not exactly linear because some characters
607 * don't have weights at all levels. When we call strxfrm() we have to
608 * allocate some memory to hold the transformed string. The
609 * calculations below try to find coefficients 'm' and 'b' for this
610 * locale so that m*x + b equals how much space we need, given the size
611 * of the input string in 'x'. If we calculate too small, we increase
612 * the size as needed, and call strxfrm() again, but it is better to
613 * get it right the first time to avoid wasted expensive string
614 * transformations. */
615
98994639 616 {
79f120c8
KW
617 /* We use the string below to find how long the tranformation of it
618 * is. Almost all locales are supersets of ASCII, or at least the
619 * ASCII letters. We use all of them, half upper half lower,
620 * because if we used fewer, we might hit just the ones that are
621 * outliers in a particular locale. Most of the strings being
622 * collated will contain a preponderance of letters, and even if
623 * they are above-ASCII, they are likely to have the same number of
624 * weight levels as the ASCII ones. It turns out that digits tend
625 * to have fewer levels, and some punctuation has more, but those
626 * are relatively sparse in text, and khw believes this gives a
627 * reasonable result, but it could be changed if experience so
628 * dictates. */
629 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
630 char * x_longer; /* Transformed 'longer' */
631 Size_t x_len_longer; /* Length of 'x_longer' */
632
633 char * x_shorter; /* We also transform a substring of 'longer' */
634 Size_t x_len_shorter;
635
a4a439fb 636 /* _mem_collxfrm() is used get the transformation (though here we
79f120c8
KW
637 * are interested only in its length). It is used because it has
638 * the intelligence to handle all cases, but to work, it needs some
639 * values of 'm' and 'b' to get it started. For the purposes of
640 * this calculation we use a very conservative estimate of 'm' and
641 * 'b'. This assumes a weight can be multiple bytes, enough to
642 * hold any UV on the platform, and there are 5 levels, 4 weight
643 * bytes, and a trailing NUL. */
644 PL_collxfrm_base = 5;
645 PL_collxfrm_mult = 5 * sizeof(UV);
646
647 /* Find out how long the transformation really is */
a4a439fb
KW
648 x_longer = _mem_collxfrm(longer,
649 sizeof(longer) - 1,
650 &x_len_longer,
651
652 /* We avoid converting to UTF-8 in the
653 * called function by telling it the
654 * string is in UTF-8 if the locale is a
655 * UTF-8 one. Since the string passed
656 * here is invariant under UTF-8, we can
657 * claim it's UTF-8 even though it isn't.
658 * */
659 PL_in_utf8_COLLATE_locale);
79f120c8
KW
660 Safefree(x_longer);
661
662 /* Find out how long the transformation of a substring of 'longer'
663 * is. Together the lengths of these transformations are
664 * sufficient to calculate 'm' and 'b'. The substring is all of
665 * 'longer' except the first character. This minimizes the chances
666 * of being swayed by outliers */
a4a439fb 667 x_shorter = _mem_collxfrm(longer + 1,
79f120c8 668 sizeof(longer) - 2,
a4a439fb
KW
669 &x_len_shorter,
670 PL_in_utf8_COLLATE_locale);
79f120c8
KW
671 Safefree(x_shorter);
672
673 /* If the results are nonsensical for this simple test, the whole
674 * locale definition is suspect. Mark it so that locale collation
675 * is not active at all for it. XXX Should we warn? */
676 if ( x_len_shorter == 0
677 || x_len_longer == 0
678 || x_len_shorter >= x_len_longer)
679 {
680 PL_collxfrm_mult = 0;
681 PL_collxfrm_base = 0;
682 }
683 else {
684 SSize_t base; /* Temporary */
685
686 /* We have both: m * strlen(longer) + b = x_len_longer
687 * m * strlen(shorter) + b = x_len_shorter;
688 * subtracting yields:
689 * m * (strlen(longer) - strlen(shorter))
690 * = x_len_longer - x_len_shorter
691 * But we have set things up so that 'shorter' is 1 byte smaller
692 * than 'longer'. Hence:
693 * m = x_len_longer - x_len_shorter
694 *
695 * But if something went wrong, make sure the multiplier is at
696 * least 1.
697 */
698 if (x_len_longer > x_len_shorter) {
699 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
700 }
701 else {
702 PL_collxfrm_mult = 1;
703 }
704
705 /* mx + b = len
706 * so: b = len - mx
707 * but in case something has gone wrong, make sure it is
708 * non-negative */
709 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
710 if (base < 0) {
711 base = 0;
712 }
713
714 /* Add 1 for the trailing NUL */
715 PL_collxfrm_base = base + 1;
716 }
58eebef2 717
7d4bcc4a
KW
718# ifdef DEBUGGING
719
58eebef2
KW
720 if (DEBUG_L_TEST || debug_initialization) {
721 PerlIO_printf(Perl_debug_log,
b07929e4
KW
722 "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
723 "x_len_longer=%zu,"
724 " collate multipler=%zu, collate base=%zu\n",
58eebef2
KW
725 __FILE__, __LINE__,
726 PL_in_utf8_COLLATE_locale,
727 x_len_shorter, x_len_longer,
728 PL_collxfrm_mult, PL_collxfrm_base);
729 }
7d4bcc4a
KW
730# endif
731
98994639
HS
732 }
733 }
734
735#endif /* USE_LOCALE_COLLATE */
7d4bcc4a 736
98994639
HS
737}
738
a4f00dcc 739#ifndef WIN32 /* No wrapper except on Windows */
b385bb4d 740
7d4bcc4a 741# define my_setlocale(a,b) setlocale(a,b)
a4f00dcc
KW
742
743#else /* WIN32 */
744
b8cc575c
KW
745#define my_setlocale(a,b) win32_setlocale(a,b)
746
a4f00dcc 747STATIC char *
b8cc575c 748S_win32_setlocale(pTHX_ int category, const char* locale)
b385bb4d
KW
749{
750 /* This, for Windows, emulates POSIX setlocale() behavior. There is no
7d4bcc4a
KW
751 * difference between the two unless the input locale is "", which normally
752 * means on Windows to get the machine default, which is set via the
753 * computer's "Regional and Language Options" (or its current equivalent).
754 * In POSIX, it instead means to find the locale from the user's
755 * environment. This routine changes the Windows behavior to first look in
756 * the environment, and, if anything is found, use that instead of going to
757 * the machine default. If there is no environment override, the machine
758 * default is used, by calling the real setlocale() with "".
759 *
760 * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
761 * use the particular category's variable if set; otherwise to use the LANG
762 * variable. */
b385bb4d 763
175c4cf9 764 bool override_LC_ALL = FALSE;
89f7b9aa
KW
765 char * result;
766
b385bb4d 767 if (locale && strEQ(locale, "")) {
7d4bcc4a
KW
768
769# ifdef LC_ALL
770
b385bb4d
KW
771 locale = PerlEnv_getenv("LC_ALL");
772 if (! locale) {
7d4bcc4a
KW
773
774# endif
b385bb4d 775 switch (category) {
7d4bcc4a
KW
776
777# ifdef LC_ALL
b385bb4d 778 case LC_ALL:
481465ea 779 override_LC_ALL = TRUE;
b385bb4d 780 break; /* We already know its variable isn't set */
7d4bcc4a
KW
781
782# endif
783# ifdef USE_LOCALE_TIME
784
b385bb4d
KW
785 case LC_TIME:
786 locale = PerlEnv_getenv("LC_TIME");
787 break;
7d4bcc4a
KW
788
789# endif
790# ifdef USE_LOCALE_CTYPE
791
b385bb4d
KW
792 case LC_CTYPE:
793 locale = PerlEnv_getenv("LC_CTYPE");
794 break;
7d4bcc4a
KW
795
796# endif
797# ifdef USE_LOCALE_COLLATE
798
b385bb4d
KW
799 case LC_COLLATE:
800 locale = PerlEnv_getenv("LC_COLLATE");
801 break;
7d4bcc4a
KW
802
803# endif
804# ifdef USE_LOCALE_MONETARY
805
b385bb4d
KW
806 case LC_MONETARY:
807 locale = PerlEnv_getenv("LC_MONETARY");
808 break;
7d4bcc4a
KW
809
810# endif
811# ifdef USE_LOCALE_NUMERIC
812
b385bb4d
KW
813 case LC_NUMERIC:
814 locale = PerlEnv_getenv("LC_NUMERIC");
815 break;
7d4bcc4a
KW
816
817# endif
818# ifdef USE_LOCALE_MESSAGES
819
b385bb4d
KW
820 case LC_MESSAGES:
821 locale = PerlEnv_getenv("LC_MESSAGES");
822 break;
7d4bcc4a
KW
823
824# endif
825
b385bb4d
KW
826 default:
827 /* This is a category, like PAPER_SIZE that we don't
828 * know about; and so can't provide a wrapper. */
829 break;
830 }
831 if (! locale) {
832 locale = PerlEnv_getenv("LANG");
481465ea 833 if (! locale) {
b385bb4d
KW
834 locale = "";
835 }
836 }
7d4bcc4a
KW
837
838# ifdef LC_ALL
839
b385bb4d 840 }
7d4bcc4a
KW
841
842# endif
843
b385bb4d
KW
844 }
845
89f7b9aa 846 result = setlocale(category, locale);
bbc98134 847 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
a4f00dcc 848 setlocale_debug_string(category, locale, result)));
89f7b9aa 849
481465ea 850 if (! override_LC_ALL) {
89f7b9aa
KW
851 return result;
852 }
853
dfd77d7a 854 /* Here the input category was LC_ALL, and we have set it to what is in the
481465ea
KW
855 * LANG variable or the system default if there is no LANG. But these have
856 * lower priority than the other LC_foo variables, so override it for each
857 * one that is set. (If they are set to "", it means to use the same thing
858 * we just set LC_ALL to, so can skip) */
7d4bcc4a
KW
859
860# ifdef USE_LOCALE_TIME
861
89f7b9aa 862 result = PerlEnv_getenv("LC_TIME");
730252b2 863 if (result && strNE(result, "")) {
89f7b9aa 864 setlocale(LC_TIME, result);
bbc98134
KW
865 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
866 __FILE__, __LINE__,
a4f00dcc 867 setlocale_debug_string(LC_TIME, result, "not captured")));
89f7b9aa 868 }
7d4bcc4a
KW
869
870# endif
871# ifdef USE_LOCALE_CTYPE
872
89f7b9aa 873 result = PerlEnv_getenv("LC_CTYPE");
730252b2 874 if (result && strNE(result, "")) {
89f7b9aa 875 setlocale(LC_CTYPE, result);
bbc98134
KW
876 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
877 __FILE__, __LINE__,
a4f00dcc 878 setlocale_debug_string(LC_CTYPE, result, "not captured")));
89f7b9aa 879 }
7d4bcc4a
KW
880
881# endif
882# ifdef USE_LOCALE_COLLATE
883
89f7b9aa 884 result = PerlEnv_getenv("LC_COLLATE");
730252b2 885 if (result && strNE(result, "")) {
89f7b9aa 886 setlocale(LC_COLLATE, result);
bbc98134
KW
887 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
888 __FILE__, __LINE__,
a4f00dcc 889 setlocale_debug_string(LC_COLLATE, result, "not captured")));
89f7b9aa 890 }
7d4bcc4a
KW
891
892# endif
893# ifdef USE_LOCALE_MONETARY
894
89f7b9aa 895 result = PerlEnv_getenv("LC_MONETARY");
730252b2 896 if (result && strNE(result, "")) {
89f7b9aa 897 setlocale(LC_MONETARY, result);
bbc98134
KW
898 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
899 __FILE__, __LINE__,
a4f00dcc 900 setlocale_debug_string(LC_MONETARY, result, "not captured")));
89f7b9aa 901 }
7d4bcc4a
KW
902
903# endif
904# ifdef USE_LOCALE_NUMERIC
905
89f7b9aa 906 result = PerlEnv_getenv("LC_NUMERIC");
730252b2 907 if (result && strNE(result, "")) {
89f7b9aa 908 setlocale(LC_NUMERIC, result);
bbc98134
KW
909 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
910 __FILE__, __LINE__,
a4f00dcc 911 setlocale_debug_string(LC_NUMERIC, result, "not captured")));
89f7b9aa 912 }
7d4bcc4a
KW
913
914# endif
915# ifdef USE_LOCALE_MESSAGES
916
89f7b9aa 917 result = PerlEnv_getenv("LC_MESSAGES");
730252b2 918 if (result && strNE(result, "")) {
89f7b9aa 919 setlocale(LC_MESSAGES, result);
bbc98134
KW
920 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
921 __FILE__, __LINE__,
a4f00dcc 922 setlocale_debug_string(LC_MESSAGES, result, "not captured")));
89f7b9aa 923 }
7d4bcc4a
KW
924
925# endif
89f7b9aa 926
bbc98134
KW
927 result = setlocale(LC_ALL, NULL);
928 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
929 __FILE__, __LINE__,
a4f00dcc 930 setlocale_debug_string(LC_ALL, NULL, result)));
89f7b9aa 931
bbc98134 932 return result;
b385bb4d
KW
933}
934
935#endif
936
a4f00dcc
KW
937char *
938Perl_setlocale(int category, const char * locale)
939{
940 /* This wraps POSIX::setlocale() */
941
942 char * retval;
943 dTHX;
944
a4f00dcc
KW
945#ifdef USE_LOCALE_NUMERIC
946
947 /* A NULL locale means only query what the current one is. We
948 * have the LC_NUMERIC name saved, because we are normally switched
949 * into the C locale for it. Switch back so an LC_ALL query will yield
950 * the correct results; all other categories don't require special
951 * handling */
952 if (locale == NULL) {
953 if (category == LC_NUMERIC) {
954 return savepv(PL_numeric_name);
955 }
956
7d4bcc4a 957# ifdef LC_ALL
a4f00dcc
KW
958
959 else if (category == LC_ALL) {
960 SET_NUMERIC_UNDERLYING();
961 }
962
7d4bcc4a 963# endif
a4f00dcc
KW
964
965 }
966
967#endif
968
969 retval = my_setlocale(category, locale);
970
971 DEBUG_L(PerlIO_printf(Perl_debug_log,
972 "%s:%d: %s\n", __FILE__, __LINE__,
973 setlocale_debug_string(category, locale, retval)));
974 if (! retval) {
975 /* Should never happen that a query would return an error, but be
976 * sure and reset to C locale */
977 if (locale == 0) {
978 SET_NUMERIC_STANDARD();
979 }
7d4bcc4a 980
a4f00dcc
KW
981 return NULL;
982 }
983
984 /* Save retval since subsequent setlocale() calls may overwrite it. */
985 retval = savepv(retval);
986
987 /* If locale == NULL, we are just querying the state, but may have switched
988 * to NUMERIC_UNDERLYING. Switch back before returning. */
989 if (locale == NULL) {
990 SET_NUMERIC_STANDARD();
991 return retval;
992 }
993 else { /* Now that have switched locales, we have to update our records to
994 correspond */
995
996#ifdef USE_LOCALE_CTYPE
997
998 if ( category == LC_CTYPE
999
7d4bcc4a 1000# ifdef LC_ALL
a4f00dcc
KW
1001
1002 || category == LC_ALL
1003
7d4bcc4a 1004# endif
a4f00dcc
KW
1005
1006 )
1007 {
1008 char *newctype;
1009
7d4bcc4a 1010# ifdef LC_ALL
a4f00dcc
KW
1011
1012 if (category == LC_ALL) {
1013 newctype = setlocale(LC_CTYPE, NULL);
1014 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1015 "%s:%d: %s\n", __FILE__, __LINE__,
1016 setlocale_debug_string(LC_CTYPE, NULL, newctype)));
1017 }
1018 else
1019
7d4bcc4a 1020# endif
a4f00dcc
KW
1021
1022 newctype = retval;
1023 new_ctype(newctype);
1024 }
1025
1026#endif /* USE_LOCALE_CTYPE */
a4f00dcc
KW
1027#ifdef USE_LOCALE_COLLATE
1028
1029 if ( category == LC_COLLATE
1030
7d4bcc4a 1031# ifdef LC_ALL
a4f00dcc
KW
1032
1033 || category == LC_ALL
1034
7d4bcc4a 1035# endif
a4f00dcc
KW
1036
1037 )
1038 {
1039 char *newcoll;
1040
7d4bcc4a 1041# ifdef LC_ALL
a4f00dcc
KW
1042
1043 if (category == LC_ALL) {
1044 newcoll = setlocale(LC_COLLATE, NULL);
1045 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1046 "%s:%d: %s\n", __FILE__, __LINE__,
1047 setlocale_debug_string(LC_COLLATE, NULL, newcoll)));
1048 }
1049 else
1050
7d4bcc4a 1051# endif
a4f00dcc
KW
1052
1053 newcoll = retval;
1054 new_collate(newcoll);
1055 }
1056
1057#endif /* USE_LOCALE_COLLATE */
a4f00dcc
KW
1058#ifdef USE_LOCALE_NUMERIC
1059
1060 if ( category == LC_NUMERIC
1061
7d4bcc4a 1062# ifdef LC_ALL
a4f00dcc
KW
1063
1064 || category == LC_ALL
1065
7d4bcc4a 1066# endif
a4f00dcc
KW
1067
1068 )
1069 {
1070 char *newnum;
1071
7d4bcc4a 1072# ifdef LC_ALL
a4f00dcc
KW
1073
1074 if (category == LC_ALL) {
1075 newnum = setlocale(LC_NUMERIC, NULL);
1076 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1077 "%s:%d: %s\n", __FILE__, __LINE__,
1078 setlocale_debug_string(LC_NUMERIC, NULL, newnum)));
1079 }
1080 else
1081
7d4bcc4a 1082# endif
a4f00dcc
KW
1083
1084 newnum = retval;
1085 new_numeric(newnum);
1086 }
1087
1088#endif /* USE_LOCALE_NUMERIC */
1089
1090 }
1091
1092 return retval;
1093
f7416781
KW
1094
1095}
1096
1097PERL_STATIC_INLINE const char *
1098S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset)
1099{
1100 /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size',
1101 * growing it if necessary */
1102
1103 const Size_t string_size = strlen(string) + offset + 1;
1104
1105 PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
1106
1107 if (*buf_size == 0) {
1108 Newx(*buf, string_size, char);
1109 *buf_size = string_size;
1110 }
1111 else if (string_size > *buf_size) {
1112 Renew(*buf, string_size, char);
1113 *buf_size = string_size;
1114 }
1115
1116 Copy(string, *buf + offset, string_size - offset, char);
1117 return *buf;
1118}
1119
1120/*
1121
1122=head1 Locale-related functions and macros
1123
1124=for apidoc Perl_langinfo
1125
7d4bcc4a 1126This is an (almost ª) drop-in replacement for the system C<L<nl_langinfo(3)>>,
f7416781
KW
1127taking the same C<item> parameter values, and returning the same information.
1128But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
1129of Perl's locale handling from your code, and can be used on systems that lack
1130a native C<nl_langinfo>.
1131
1132Expanding on these:
1133
1134=over
1135
1136=item *
1137
1138It delivers the correct results for the C<RADIXCHAR> and C<THOUSESEP> items,
1139without you having to write extra code. The reason for the extra code would be
1140because these are from the C<LC_NUMERIC> locale category, which is normally
1141kept set to the C locale by Perl, no matter what the underlying locale is
1142supposed to be, and so to get the expected results, you have to temporarily
1143toggle into the underlying locale, and later toggle back. (You could use
1144plain C<nl_langinfo> and C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this
1145but then you wouldn't get the other advantages of C<Perl_langinfo()>; not
1146keeping C<LC_NUMERIC> in the C locale would break a lot of CPAN, which is
1147expecting the radix (decimal point) character to be a dot.)
1148
1149=item *
1150
1151Depending on C<item>, it works on systems that don't have C<nl_langinfo>, hence
1152makes your code more portable. Of the fifty-some possible items specified by
1153the POSIX 2008 standard,
1154L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
1155only two are completely unimplemented. It uses various techniques to recover
1156the other items, including calling C<L<localeconv(3)>>, and C<L<strftime(3)>>,
1157both of which are specified in C89, so should be always be available. Later
1158C<strftime()> versions have additional capabilities; C<""> is returned for
1159those not available on your system.
1160
1161The details for those items which may differ from what this emulation returns
1162and what a native C<nl_langinfo()> would return are:
1163
1164=over
1165
1166=item C<CODESET>
1167
1168=item C<ERA>
1169
1170Unimplemented, so returns C<"">.
1171
1172=item C<YESEXPR>
1173
1174=item C<NOEXPR>
1175
1176Only the values for English are returned. Earlier POSIX standards also
1177specified C<YESSTR> and C<NOSTR>, but these have been removed from POSIX 2008,
1178and aren't supported by C<Perl_langinfo>.
1179
1180=item C<D_FMT>
1181
1182Always evaluates to C<%x>, the locale's appropriate date representation.
1183
1184=item C<T_FMT>
1185
1186Always evaluates to C<%X>, the locale's appropriate time representation.
1187
1188=item C<D_T_FMT>
1189
1190Always evaluates to C<%c>, the locale's appropriate date and time
1191representation.
1192
1193=item C<CRNCYSTR>
1194
1195The return may be incorrect for those rare locales where the currency symbol
1196replaces the radix character.
1197Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1198to work differently.
1199
1200=item C<ALT_DIGITS>
1201
1202Currently this gives the same results as Linux does.
1203Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1204to work differently.
1205
1206=item C<ERA_D_FMT>
1207
1208=item C<ERA_T_FMT>
1209
1210=item C<ERA_D_T_FMT>
1211
1212=item C<T_FMT_AMPM>
1213
1214These are derived by using C<strftime()>, and not all versions of that function
1215know about them. C<""> is returned for these on such systems.
1216
1217=back
1218
1219When using C<Perl_langinfo> on systems that don't have a native
1220C<nl_langinfo()>, you must
1221
1222 #include "perl_langinfo.h"
1223
1224before the C<perl.h> C<#include>. You can replace your C<langinfo.h>
1225C<#include> with this one. (Doing it this way keeps out the symbols that plain
1226C<langinfo.h> imports into the namespace for code that doesn't need it.)
1227
1228You also should not use the bare C<langinfo.h> item names, but should preface
1229them with C<PERL_>, so use C<PERL_RADIXCHAR> instead of plain C<RADIXCHAR>.
1230The C<PERL_I<foo>> versions will also work for this function on systems that do
1231have a native C<nl_langinfo>.
1232
1233=item *
1234
1235It is thread-friendly, returning its result in a buffer that won't be
1236overwritten by another thread, so you don't have to code for that possibility.
1237The buffer can be overwritten by the next call to C<nl_langinfo> or
1238C<Perl_langinfo> in the same thread.
1239
1240=item *
1241
7d4bcc4a 1242ª It returns S<C<const char *>>, whereas plain C<nl_langinfo()> returns S<C<char
f7416781
KW
1243*>>, but you are (only by documentation) forbidden to write into the buffer.
1244By declaring this C<const>, the compiler enforces this restriction. The extra
1245C<const> is why this isn't an unequivocal drop-in replacement for
1246C<nl_langinfo>.
1247
1248=back
1249
1250The original impetus for C<Perl_langinfo()> was so that code that needs to
1251find out the current currency symbol, floating point radix character, or digit
1252grouping separator can use, on all systems, the simpler and more
1253thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
1254pain to make thread-friendly. For other fields returned by C<localeconv>, it
1255is better to use the methods given in L<perlcall> to call
1256L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
1257
1258=cut
1259
1260*/
1261
1262const char *
1263#ifdef HAS_NL_LANGINFO
1264Perl_langinfo(const nl_item item)
1265#else
1266Perl_langinfo(const int item)
1267#endif
1268{
f61748ac
KW
1269 return my_nl_langinfo(item, TRUE);
1270}
1271
1272const char *
1273#ifdef HAS_NL_LANGINFO
1274S_my_nl_langinfo(const nl_item item, bool toggle)
1275#else
1276S_my_nl_langinfo(const int item, bool toggle)
1277#endif
1278{
ae74815b 1279 dTHX;
f7416781 1280
ab340fff
KW
1281#if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
1282#if ! defined(HAS_POSIX_2008_LOCALE)
f7416781 1283
ab340fff 1284 /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
ae74815b
KW
1285 * for those items dependent on it. This must be copied to a buffer before
1286 * switching back, as some systems destroy the buffer when setlocale() is
1287 * called */
f7416781
KW
1288
1289 LOCALE_LOCK;
1290
1291 if (toggle) {
1292 if (item == PERL_RADIXCHAR || item == PERL_THOUSEP) {
1293 setlocale(LC_NUMERIC, PL_numeric_name);
1294 }
1295 else {
1296 toggle = FALSE;
1297 }
1298 }
1299
1300 save_to_buffer(nl_langinfo(item), &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1301
1302 if (toggle) {
1303 setlocale(LC_NUMERIC, "C");
1304 }
1305
1306 LOCALE_UNLOCK;
1307
1308 return PL_langinfo_buf;
1309
ab340fff
KW
1310# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */
1311
1312 bool do_free = FALSE;
1313 locale_t cur = uselocale((locale_t) 0);
1314
1315 if (cur == LC_GLOBAL_LOCALE) {
1316 cur = duplocale(LC_GLOBAL_LOCALE);
1317 do_free = TRUE;
1318 }
1319
1320 if ( toggle
1321 && (item == PERL_RADIXCHAR || item == PERL_THOUSEP))
1322 {
1323 cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
1324 do_free = TRUE;
1325 }
1326
1327 save_to_buffer(nl_langinfo_l(item, cur),
1328 &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1329 if (do_free) {
1330 freelocale(cur);
1331 }
1332
1333 return PL_langinfo_buf;
1334
1335# endif
f7416781 1336#else /* Below, emulate nl_langinfo as best we can */
f7416781
KW
1337# ifdef HAS_LOCALECONV
1338
1339 const struct lconv* lc;
1340
1341# endif
1342# ifdef HAS_STRFTIME
1343
1344 struct tm tm;
1345 bool return_format = FALSE; /* Return the %format, not the value */
1346 const char * format;
1347
1348# endif
1349
1350 /* We copy the results to a per-thread buffer, even if not multi-threaded.
1351 * This is in part to simplify this code, and partly because we need a
1352 * buffer anyway for strftime(), and partly because a call of localeconv()
1353 * could otherwise wipe out the buffer, and the programmer would not be
1354 * expecting this, as this is a nl_langinfo() substitute after all, so s/he
1355 * might be thinking their localeconv() is safe until another localeconv()
1356 * call. */
1357
1358 switch (item) {
1359 Size_t len;
1360 const char * retval;
1361
1362 /* These 2 are unimplemented */
1363 case PERL_CODESET:
1364 case PERL_ERA: /* For use with strftime() %E modifier */
1365
1366 default:
1367 return "";
1368
1369 /* We use only an English set, since we don't know any more */
1370 case PERL_YESEXPR: return "^[+1yY]";
1371 case PERL_NOEXPR: return "^[-0nN]";
1372
1373# ifdef HAS_LOCALECONV
1374
1375 case PERL_CRNCYSTR:
1376
1377 LOCALE_LOCK;
1378
1379 lc = localeconv();
1380 if (! lc || ! lc->currency_symbol || strEQ("", lc->currency_symbol))
1381 {
1382 LOCALE_UNLOCK;
1383 return "";
1384 }
1385
1386 /* Leave the first spot empty to be filled in below */
1387 save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
1388 &PL_langinfo_bufsize, 1);
1389 if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
1390 { /* khw couldn't figure out how the localedef specifications
1391 would show that the $ should replace the radix; this is
1392 just a guess as to how it might work.*/
1393 *PL_langinfo_buf = '.';
1394 }
1395 else if (lc->p_cs_precedes) {
1396 *PL_langinfo_buf = '-';
1397 }
1398 else {
1399 *PL_langinfo_buf = '+';
1400 }
1401
1402 LOCALE_UNLOCK;
1403 break;
1404
1405 case PERL_RADIXCHAR:
1406 case PERL_THOUSEP:
1407
1408 LOCALE_LOCK;
1409
1410 if (toggle) {
1411 setlocale(LC_NUMERIC, PL_numeric_name);
1412 }
1413
1414 lc = localeconv();
1415 if (! lc) {
1416 retval = "";
1417 }
1418 else switch (item) {
1419 case PERL_RADIXCHAR:
1420 if (! lc->decimal_point) {
1421 retval = "";
1422 }
1423 else {
1424 retval = lc->decimal_point;
1425 }
1426 break;
1427
1428 case PERL_THOUSEP:
1429 if (! lc->thousands_sep || strEQ("", lc->thousands_sep)) {
1430 retval = "";
1431 }
1432 else {
1433 retval = lc->thousands_sep;
1434 }
1435 break;
1436
1437 default:
1438 LOCALE_UNLOCK;
1439 Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
1440 __FILE__, __LINE__, item);
1441 }
1442
1443 save_to_buffer(retval, &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1444
1445 if (toggle) {
1446 setlocale(LC_NUMERIC, "C");
1447 }
1448
1449 LOCALE_UNLOCK;
1450
1451 break;
1452
1453# endif
1454# ifdef HAS_STRFTIME
1455
1456 /* These are defined by C89, so we assume that strftime supports them,
1457 * and so are returned unconditionally; they may not be what the locale
1458 * actually says, but should give good enough results for someone using
1459 * them as formats (as opposed to trying to parse them to figure out
7d4bcc4a 1460 * what the locale says). The other format items are actually tested to
f7416781
KW
1461 * verify they work on the platform */
1462 case PERL_D_FMT: return "%x";
1463 case PERL_T_FMT: return "%X";
1464 case PERL_D_T_FMT: return "%c";
1465
1466 /* These formats are only available in later strfmtime's */
1467 case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT:
1468 case PERL_T_FMT_AMPM:
1469
1470 /* The rest can be gotten from most versions of strftime(). */
1471 case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3:
1472 case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6:
1473 case PERL_ABDAY_7:
1474 case PERL_ALT_DIGITS:
1475 case PERL_AM_STR: case PERL_PM_STR:
1476 case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3:
1477 case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6:
1478 case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9:
1479 case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12:
1480 case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4:
1481 case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7:
1482 case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4:
1483 case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8:
1484 case PERL_MON_9: case PERL_MON_10: case PERL_MON_11: case PERL_MON_12:
1485
1486 LOCALE_LOCK;
1487
1488 init_tm(&tm); /* Precaution against core dumps */
1489 tm.tm_sec = 30;
1490 tm.tm_min = 30;
1491 tm.tm_hour = 6;
1492 tm.tm_year = 2017 - 1900;
1493 tm.tm_wday = 0;
1494 tm.tm_mon = 0;
1495 switch (item) {
1496 default:
1497 LOCALE_UNLOCK;
1498 Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
1499 __FILE__, __LINE__, item);
1500 NOT_REACHED; /* NOTREACHED */
1501
1502 case PERL_PM_STR: tm.tm_hour = 18;
1503 case PERL_AM_STR:
1504 format = "%p";
1505 break;
1506
1507 case PERL_ABDAY_7: tm.tm_wday++;
1508 case PERL_ABDAY_6: tm.tm_wday++;
1509 case PERL_ABDAY_5: tm.tm_wday++;
1510 case PERL_ABDAY_4: tm.tm_wday++;
1511 case PERL_ABDAY_3: tm.tm_wday++;
1512 case PERL_ABDAY_2: tm.tm_wday++;
1513 case PERL_ABDAY_1:
1514 format = "%a";
1515 break;
1516
1517 case PERL_DAY_7: tm.tm_wday++;
1518 case PERL_DAY_6: tm.tm_wday++;
1519 case PERL_DAY_5: tm.tm_wday++;
1520 case PERL_DAY_4: tm.tm_wday++;
1521 case PERL_DAY_3: tm.tm_wday++;
1522 case PERL_DAY_2: tm.tm_wday++;
1523 case PERL_DAY_1:
1524 format = "%A";
1525 break;
1526
1527 case PERL_ABMON_12: tm.tm_mon++;
1528 case PERL_ABMON_11: tm.tm_mon++;
1529 case PERL_ABMON_10: tm.tm_mon++;
1530 case PERL_ABMON_9: tm.tm_mon++;
1531 case PERL_ABMON_8: tm.tm_mon++;
1532 case PERL_ABMON_7: tm.tm_mon++;
1533 case PERL_ABMON_6: tm.tm_mon++;
1534 case PERL_ABMON_5: tm.tm_mon++;
1535 case PERL_ABMON_4: tm.tm_mon++;
1536 case PERL_ABMON_3: tm.tm_mon++;
1537 case PERL_ABMON_2: tm.tm_mon++;
1538 case PERL_ABMON_1:
1539 format = "%b";
1540 break;
1541
1542 case PERL_MON_12: tm.tm_mon++;
1543 case PERL_MON_11: tm.tm_mon++;
1544 case PERL_MON_10: tm.tm_mon++;
1545 case PERL_MON_9: tm.tm_mon++;
1546 case PERL_MON_8: tm.tm_mon++;
1547 case PERL_MON_7: tm.tm_mon++;
1548 case PERL_MON_6: tm.tm_mon++;
1549 case PERL_MON_5: tm.tm_mon++;
1550 case PERL_MON_4: tm.tm_mon++;
1551 case PERL_MON_3: tm.tm_mon++;
1552 case PERL_MON_2: tm.tm_mon++;
1553 case PERL_MON_1:
1554 format = "%B";
1555 break;
1556
1557 case PERL_T_FMT_AMPM:
1558 format = "%r";
1559 return_format = TRUE;
1560 break;
1561
1562 case PERL_ERA_D_FMT:
1563 format = "%Ex";
1564 return_format = TRUE;
1565 break;
1566
1567 case PERL_ERA_T_FMT:
1568 format = "%EX";
1569 return_format = TRUE;
1570 break;
1571
1572 case PERL_ERA_D_T_FMT:
1573 format = "%Ec";
1574 return_format = TRUE;
1575 break;
1576
1577 case PERL_ALT_DIGITS:
1578 tm.tm_wday = 0;
1579 format = "%Ow"; /* Find the alternate digit for 0 */
1580 break;
1581 }
1582
1583 /* We can't use my_strftime() because it doesn't look at tm_wday */
1584 while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
1585 format, &tm))
1586 {
1587 /* A zero return means one of:
1588 * a) there wasn't enough space in PL_langinfo_buf
1589 * b) the format, like a plain %p, returns empty
1590 * c) it was an illegal format, though some implementations of
1591 * strftime will just return the illegal format as a plain
1592 * character sequence.
1593 *
1594 * To quickly test for case 'b)', try again but precede the
1595 * format with a plain character. If that result is still
1596 * empty, the problem is either 'a)' or 'c)' */
1597
1598 Size_t format_size = strlen(format) + 1;
1599 Size_t mod_size = format_size + 1;
1600 char * mod_format;
1601 char * temp_result;
1602
1603 Newx(mod_format, mod_size, char);
1604 Newx(temp_result, PL_langinfo_bufsize, char);
1605 *mod_format = '\a';
1606 my_strlcpy(mod_format + 1, format, mod_size);
1607 len = strftime(temp_result,
1608 PL_langinfo_bufsize,
1609 mod_format, &tm);
1610 Safefree(mod_format);
1611 Safefree(temp_result);
1612
1613 /* If 'len' is non-zero, it means that we had a case like %p
1614 * which means the current locale doesn't use a.m. or p.m., and
1615 * that is valid */
1616 if (len == 0) {
1617
1618 /* Here, still didn't work. If we get well beyond a
1619 * reasonable size, bail out to prevent an infinite loop. */
1620
1621 if (PL_langinfo_bufsize > 100 * format_size) {
1622 *PL_langinfo_buf = '\0';
1623 }
1624 else { /* Double the buffer size to retry; Add 1 in case
1625 original was 0, so we aren't stuck at 0. */
1626 PL_langinfo_bufsize *= 2;
1627 PL_langinfo_bufsize++;
1628 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
1629 continue;
1630 }
1631 }
1632
1633 break;
1634 }
1635
1636 /* Here, we got a result.
1637 *
1638 * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
1639 * alternate format for wday 0. If the value is the same as the
1640 * normal 0, there isn't an alternate, so clear the buffer. */
1641 if ( item == PERL_ALT_DIGITS
1642 && strEQ(PL_langinfo_buf, "0"))
1643 {
1644 *PL_langinfo_buf = '\0';
1645 }
1646
1647 /* ALT_DIGITS is problematic. Experiments on it showed that
1648 * strftime() did not always work properly when going from alt-9 to
1649 * alt-10. Only a few locales have this item defined, and in all
1650 * of them on Linux that khw was able to find, nl_langinfo() merely
1651 * returned the alt-0 character, possibly doubled. Most Unicode
1652 * digits are in blocks of 10 consecutive code points, so that is
1653 * sufficient information for those scripts, as we can infer alt-1,
1654 * alt-2, .... But for a Japanese locale, a CJK ideographic 0 is
1655 * returned, and the CJK digits are not in code point order, so you
1656 * can't really infer anything. The localedef for this locale did
1657 * specify the succeeding digits, so that strftime() works properly
1658 * on them, without needing to infer anything. But the
1659 * nl_langinfo() return did not give sufficient information for the
1660 * caller to understand what's going on. So until there is
1661 * evidence that it should work differently, this returns the alt-0
1662 * string for ALT_DIGITS.
1663 *
1664 * wday was chosen because its range is all a single digit. Things
1665 * like tm_sec have two digits as the minimum: '00' */
1666
1667 LOCALE_UNLOCK;
1668
1669 /* If to return the format, not the value, overwrite the buffer
1670 * with it. But some strftime()s will keep the original format if
1671 * illegal, so change those to "" */
1672 if (return_format) {
1673 if (strEQ(PL_langinfo_buf, format)) {
1674 *PL_langinfo_buf = '\0';
1675 }
1676 else {
1677 save_to_buffer(format, &PL_langinfo_buf,
1678 &PL_langinfo_bufsize, 0);
1679 }
1680 }
1681
1682 break;
1683
1684# endif
1685
1686 }
1687
1688 return PL_langinfo_buf;
1689
1690#endif
1691
a4f00dcc 1692}
b385bb4d 1693
98994639
HS
1694/*
1695 * Initialize locale awareness.
1696 */
1697int
1698Perl_init_i18nl10n(pTHX_ int printwarn)
1699{
0e92a118
KW
1700 /* printwarn is
1701 *
1702 * 0 if not to output warning when setup locale is bad
1703 * 1 if to output warning based on value of PERL_BADLANG
1704 * >1 if to output regardless of PERL_BADLANG
1705 *
1706 * returns
98994639 1707 * 1 = set ok or not applicable,
0e92a118
KW
1708 * 0 = fallback to a locale of lower priority
1709 * -1 = fallback to all locales failed, not even to the C locale
6b058d42
KW
1710 *
1711 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
1712 * set, debugging information is output.
1713 *
1714 * This looks more complicated than it is, mainly due to the #ifdefs.
1715 *
1716 * We try to set LC_ALL to the value determined by the environment. If
1717 * there is no LC_ALL on this platform, we try the individual categories we
1718 * know about. If this works, we are done.
1719 *
1720 * But if it doesn't work, we have to do something else. We search the
1721 * environment variables ourselves instead of relying on the system to do
1722 * it. We look at, in order, LC_ALL, LANG, a system default locale (if we
1723 * think there is one), and the ultimate fallback "C". This is all done in
1724 * the same loop as above to avoid duplicating code, but it makes things
7d4bcc4a
KW
1725 * more complex. The 'trial_locales' array is initialized with just one
1726 * element; it causes the behavior described in the paragraph above this to
1727 * happen. If that fails, we add elements to 'trial_locales', and do extra
1728 * loop iterations to cause the behavior described in this paragraph.
6b058d42
KW
1729 *
1730 * On Ultrix, the locale MUST come from the environment, so there is
1731 * preliminary code to set it. I (khw) am not sure that it is necessary,
1732 * and that this couldn't be folded into the loop, but barring any real
1733 * platforms to test on, it's staying as-is
1734 *
1735 * A slight complication is that in embedded Perls, the locale may already
1736 * be set-up, and we don't want to get it from the normal environment
1737 * variables. This is handled by having a special environment variable
1738 * indicate we're in this situation. We simply set setlocale's 2nd
1739 * parameter to be a NULL instead of "". That indicates to setlocale that
1740 * it is not to change anything, but to return the current value,
1741 * effectively initializing perl's db to what the locale already is.
1742 *
1743 * We play the same trick with NULL if a LC_ALL succeeds. We call
1744 * setlocale() on the individual categores with NULL to get their existing
1745 * values for our db, instead of trying to change them.
1746 * */
98994639 1747
0e92a118
KW
1748 int ok = 1;
1749
7d4bcc4a
KW
1750#ifndef USE_LOCALE
1751
1752 PERL_UNUSED_ARG(printwarn);
1753
1754#else /* USE_LOCALE */
1755# ifdef USE_LOCALE_CTYPE
1756
98994639 1757 char *curctype = NULL;
7d4bcc4a
KW
1758
1759# endif /* USE_LOCALE_CTYPE */
1760# ifdef USE_LOCALE_COLLATE
1761
98994639 1762 char *curcoll = NULL;
7d4bcc4a
KW
1763
1764# endif /* USE_LOCALE_COLLATE */
1765# ifdef USE_LOCALE_NUMERIC
1766
98994639 1767 char *curnum = NULL;
7d4bcc4a
KW
1768
1769# endif /* USE_LOCALE_NUMERIC */
1770# ifdef __GLIBC__
1771
175c4cf9 1772 const char * const language = savepv(PerlEnv_getenv("LANGUAGE"));
7d4bcc4a
KW
1773
1774# endif
65ebb059 1775
ccd65d51
KW
1776 /* NULL uses the existing already set up locale */
1777 const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
1778 ? NULL
1779 : "";
c3fcd832
KW
1780 const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */
1781 unsigned int trial_locales_count;
175c4cf9
KW
1782 const char * const lc_all = savepv(PerlEnv_getenv("LC_ALL"));
1783 const char * const lang = savepv(PerlEnv_getenv("LANG"));
98994639 1784 bool setlocale_failure = FALSE;
65ebb059 1785 unsigned int i;
175c4cf9
KW
1786
1787 /* A later getenv() could zap this, so only use here */
1788 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
1789
1790 const bool locwarn = (printwarn > 1
1791 || (printwarn
1792 && (! bad_lang_use_once
22ff3130
HS
1793 || (
1794 /* disallow with "" or "0" */
1795 *bad_lang_use_once
1796 && strNE("0", bad_lang_use_once)))));
0e92a118 1797 bool done = FALSE;
5d1187d1
KW
1798 char * sl_result; /* return from setlocale() */
1799 char * locale_param;
7d4bcc4a
KW
1800
1801# ifdef WIN32
1802
6bce99ee
JH
1803 /* In some systems you can find out the system default locale
1804 * and use that as the fallback locale. */
7d4bcc4a
KW
1805# define SYSTEM_DEFAULT_LOCALE
1806# endif
1807# ifdef SYSTEM_DEFAULT_LOCALE
1808
65ebb059 1809 const char *system_default_locale = NULL;
98994639 1810
7d4bcc4a
KW
1811# endif
1812# ifdef DEBUGGING
1813
8298454c 1814 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
7d4bcc4a
KW
1815
1816# define DEBUG_LOCALE_INIT(category, locale, result) \
2fcc0ca9
KW
1817 STMT_START { \
1818 if (debug_initialization) { \
1819 PerlIO_printf(Perl_debug_log, \
1820 "%s:%d: %s\n", \
1821 __FILE__, __LINE__, \
a4f00dcc 1822 setlocale_debug_string(category, \
2fcc0ca9
KW
1823 locale, \
1824 result)); \
1825 } \
1826 } STMT_END
2fcc0ca9 1827
7d4bcc4a
KW
1828# else
1829# define DEBUG_LOCALE_INIT(a,b,c)
1830# endif
1831
1832# ifndef LOCALE_ENVIRON_REQUIRED
1833
0e92a118 1834 PERL_UNUSED_VAR(done);
5d1187d1 1835 PERL_UNUSED_VAR(locale_param);
7d4bcc4a
KW
1836
1837# else
98994639
HS
1838
1839 /*
1840 * Ultrix setlocale(..., "") fails if there are no environment
1841 * variables from which to get a locale name.
1842 */
1843
7d4bcc4a
KW
1844# ifdef LC_ALL
1845
98994639 1846 if (lang) {
5d1187d1
KW
1847 sl_result = my_setlocale(LC_ALL, setlocale_init);
1848 DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result);
1849 if (sl_result)
98994639
HS
1850 done = TRUE;
1851 else
1852 setlocale_failure = TRUE;
1853 }
5d1187d1 1854 if (! setlocale_failure) {
7d4bcc4a
KW
1855
1856# ifdef USE_LOCALE_CTYPE
1857
5d1187d1
KW
1858 locale_param = (! done && (lang || PerlEnv_getenv("LC_CTYPE")))
1859 ? setlocale_init
1860 : NULL;
1861 curctype = my_setlocale(LC_CTYPE, locale_param);
1862 DEBUG_LOCALE_INIT(LC_CTYPE, locale_param, sl_result);
1863 if (! curctype)
98994639
HS
1864 setlocale_failure = TRUE;
1865 else
1866 curctype = savepv(curctype);
7d4bcc4a
KW
1867
1868# endif /* USE_LOCALE_CTYPE */
1869# ifdef USE_LOCALE_COLLATE
1870
5d1187d1
KW
1871 locale_param = (! done && (lang || PerlEnv_getenv("LC_COLLATE")))
1872 ? setlocale_init
1873 : NULL;
1874 curcoll = my_setlocale(LC_COLLATE, locale_param);
1875 DEBUG_LOCALE_INIT(LC_COLLATE, locale_param, sl_result);
1876 if (! curcoll)
98994639
HS
1877 setlocale_failure = TRUE;
1878 else
1879 curcoll = savepv(curcoll);
7d4bcc4a
KW
1880
1881# endif /* USE_LOCALE_COLLATE */
1882# ifdef USE_LOCALE_NUMERIC
1883
5d1187d1
KW
1884 locale_param = (! done && (lang || PerlEnv_getenv("LC_NUMERIC")))
1885 ? setlocale_init
1886 : NULL;
1887 curnum = my_setlocale(LC_NUMERIC, locale_param);
1888 DEBUG_LOCALE_INIT(LC_NUMERIC, locale_param, sl_result);
1889 if (! curnum)
98994639
HS
1890 setlocale_failure = TRUE;
1891 else
1892 curnum = savepv(curnum);
7d4bcc4a
KW
1893
1894# endif /* USE_LOCALE_NUMERIC */
1895# ifdef USE_LOCALE_MESSAGES
1896
5d1187d1
KW
1897 locale_param = (! done && (lang || PerlEnv_getenv("LC_MESSAGES")))
1898 ? setlocale_init
1899 : NULL;
1900 sl_result = my_setlocale(LC_MESSAGES, locale_param);
1901 DEBUG_LOCALE_INIT(LC_MESSAGES, locale_param, sl_result);
9f42613c 1902 if (! sl_result) {
a782673d
KW
1903 setlocale_failure = TRUE;
1904 }
7d4bcc4a
KW
1905
1906# endif /* USE_LOCALE_MESSAGES */
1907# ifdef USE_LOCALE_MONETARY
1908
5d1187d1
KW
1909 locale_param = (! done && (lang || PerlEnv_getenv("LC_MONETARY")))
1910 ? setlocale_init
1911 : NULL;
1912 sl_result = my_setlocale(LC_MONETARY, locale_param);
1913 DEBUG_LOCALE_INIT(LC_MONETARY, locale_param, sl_result);
1914 if (! sl_result) {
c835d6be
KW
1915 setlocale_failure = TRUE;
1916 }
98994639 1917
7d4bcc4a 1918# endif /* USE_LOCALE_MONETARY */
98994639 1919
7d4bcc4a
KW
1920 }
1921
1922# endif /* LC_ALL */
1923# endif /* !LOCALE_ENVIRON_REQUIRED */
98994639 1924
65ebb059 1925 /* We try each locale in the list until we get one that works, or exhaust
20a240df
KW
1926 * the list. Normally the loop is executed just once. But if setting the
1927 * locale fails, inside the loop we add fallback trials to the array and so
1928 * will execute the loop multiple times */
c3fcd832
KW
1929 trial_locales[0] = setlocale_init;
1930 trial_locales_count = 1;
7d4bcc4a 1931
65ebb059
KW
1932 for (i= 0; i < trial_locales_count; i++) {
1933 const char * trial_locale = trial_locales[i];
1934
1935 if (i > 0) {
1936
1937 /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
1938 * when i==0, but I (khw) don't think that behavior makes much
1939 * sense */
1940 setlocale_failure = FALSE;
1941
7d4bcc4a
KW
1942# ifdef SYSTEM_DEFAULT_LOCALE
1943# ifdef WIN32
1944
65ebb059
KW
1945 /* On Windows machines, an entry of "" after the 0th means to use
1946 * the system default locale, which we now proceed to get. */
1947 if (strEQ(trial_locale, "")) {
1948 unsigned int j;
1949
1950 /* Note that this may change the locale, but we are going to do
1951 * that anyway just below */
1952 system_default_locale = setlocale(LC_ALL, "");
5d1187d1 1953 DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
65ebb059 1954
7d4bcc4a 1955 /* Skip if invalid or if it's already on the list of locales to
65ebb059
KW
1956 * try */
1957 if (! system_default_locale) {
1958 goto next_iteration;
1959 }
1960 for (j = 0; j < trial_locales_count; j++) {
1961 if (strEQ(system_default_locale, trial_locales[j])) {
1962 goto next_iteration;
1963 }
1964 }
1965
1966 trial_locale = system_default_locale;
1967 }
7d4bcc4a
KW
1968# endif /* WIN32 */
1969# endif /* SYSTEM_DEFAULT_LOCALE */
65ebb059
KW
1970 }
1971
7d4bcc4a
KW
1972# ifdef LC_ALL
1973
5d1187d1
KW
1974 sl_result = my_setlocale(LC_ALL, trial_locale);
1975 DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result);
1976 if (! sl_result) {
49c85077 1977 setlocale_failure = TRUE;
7cd8b568
KW
1978 }
1979 else {
1980 /* Since LC_ALL succeeded, it should have changed all the other
1981 * categories it can to its value; so we massage things so that the
1982 * setlocales below just return their category's current values.
1983 * This adequately handles the case in NetBSD where LC_COLLATE may
1984 * not be defined for a locale, and setting it individually will
7d4bcc4a 1985 * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
7cd8b568
KW
1986 * the POSIX locale. */
1987 trial_locale = NULL;
1988 }
7d4bcc4a
KW
1989
1990# endif /* LC_ALL */
98994639 1991
49c85077 1992 if (!setlocale_failure) {
7d4bcc4a
KW
1993
1994# ifdef USE_LOCALE_CTYPE
1995
49c85077 1996 Safefree(curctype);
5d1187d1
KW
1997 curctype = my_setlocale(LC_CTYPE, trial_locale);
1998 DEBUG_LOCALE_INIT(LC_CTYPE, trial_locale, curctype);
1999 if (! curctype)
49c85077
KW
2000 setlocale_failure = TRUE;
2001 else
2002 curctype = savepv(curctype);
7d4bcc4a
KW
2003
2004# endif /* USE_LOCALE_CTYPE */
2005# ifdef USE_LOCALE_COLLATE
2006
49c85077 2007 Safefree(curcoll);
5d1187d1
KW
2008 curcoll = my_setlocale(LC_COLLATE, trial_locale);
2009 DEBUG_LOCALE_INIT(LC_COLLATE, trial_locale, curcoll);
2010 if (! curcoll)
49c85077
KW
2011 setlocale_failure = TRUE;
2012 else
2013 curcoll = savepv(curcoll);
7d4bcc4a
KW
2014
2015# endif /* USE_LOCALE_COLLATE */
2016# ifdef USE_LOCALE_NUMERIC
2017
49c85077 2018 Safefree(curnum);
5d1187d1
KW
2019 curnum = my_setlocale(LC_NUMERIC, trial_locale);
2020 DEBUG_LOCALE_INIT(LC_NUMERIC, trial_locale, curnum);
2021 if (! curnum)
49c85077
KW
2022 setlocale_failure = TRUE;
2023 else
2024 curnum = savepv(curnum);
7d4bcc4a
KW
2025
2026# endif /* USE_LOCALE_NUMERIC */
2027# ifdef USE_LOCALE_MESSAGES
2028
5d1187d1
KW
2029 sl_result = my_setlocale(LC_MESSAGES, trial_locale);
2030 DEBUG_LOCALE_INIT(LC_MESSAGES, trial_locale, sl_result);
2031 if (! (sl_result))
a782673d 2032 setlocale_failure = TRUE;
7d4bcc4a
KW
2033
2034# endif /* USE_LOCALE_MESSAGES */
2035# ifdef USE_LOCALE_MONETARY
2036
5d1187d1
KW
2037 sl_result = my_setlocale(LC_MONETARY, trial_locale);
2038 DEBUG_LOCALE_INIT(LC_MONETARY, trial_locale, sl_result);
2039 if (! (sl_result))
c835d6be 2040 setlocale_failure = TRUE;
7d4bcc4a
KW
2041
2042# endif /* USE_LOCALE_MONETARY */
c835d6be 2043
49c85077
KW
2044 if (! setlocale_failure) { /* Success */
2045 break;
2046 }
65ebb059 2047 }
98994639 2048
49c85077
KW
2049 /* Here, something failed; will need to try a fallback. */
2050 ok = 0;
65ebb059 2051
49c85077
KW
2052 if (i == 0) {
2053 unsigned int j;
98994639 2054
65ebb059 2055 if (locwarn) { /* Output failure info only on the first one */
7d4bcc4a
KW
2056
2057# ifdef LC_ALL
98994639 2058
49c85077
KW
2059 PerlIO_printf(Perl_error_log,
2060 "perl: warning: Setting locale failed.\n");
98994639 2061
7d4bcc4a 2062# else /* !LC_ALL */
98994639 2063
49c85077
KW
2064 PerlIO_printf(Perl_error_log,
2065 "perl: warning: Setting locale failed for the categories:\n\t");
7d4bcc4a
KW
2066
2067# ifdef USE_LOCALE_CTYPE
2068
49c85077
KW
2069 if (! curctype)
2070 PerlIO_printf(Perl_error_log, "LC_CTYPE ");
7d4bcc4a
KW
2071
2072# endif /* USE_LOCALE_CTYPE */
2073# ifdef USE_LOCALE_COLLATE
49c85077
KW
2074 if (! curcoll)
2075 PerlIO_printf(Perl_error_log, "LC_COLLATE ");
7d4bcc4a
KW
2076
2077# endif /* USE_LOCALE_COLLATE */
2078# ifdef USE_LOCALE_NUMERIC
2079
49c85077
KW
2080 if (! curnum)
2081 PerlIO_printf(Perl_error_log, "LC_NUMERIC ");
7d4bcc4a
KW
2082
2083# endif /* USE_LOCALE_NUMERIC */
2084
a782673d 2085 PerlIO_printf(Perl_error_log, "and possibly others\n");
98994639 2086
7d4bcc4a 2087# endif /* LC_ALL */
98994639 2088
49c85077
KW
2089 PerlIO_printf(Perl_error_log,
2090 "perl: warning: Please check that your locale settings:\n");
98994639 2091
7d4bcc4a
KW
2092# ifdef __GLIBC__
2093
49c85077
KW
2094 PerlIO_printf(Perl_error_log,
2095 "\tLANGUAGE = %c%s%c,\n",
2096 language ? '"' : '(',
2097 language ? language : "unset",
2098 language ? '"' : ')');
7d4bcc4a 2099# endif
98994639 2100
49c85077
KW
2101 PerlIO_printf(Perl_error_log,
2102 "\tLC_ALL = %c%s%c,\n",
2103 lc_all ? '"' : '(',
2104 lc_all ? lc_all : "unset",
2105 lc_all ? '"' : ')');
98994639 2106
7d4bcc4a
KW
2107# if defined(USE_ENVIRON_ARRAY)
2108
49c85077 2109 {
cd999af9 2110 char **e;
d5e32b93
KW
2111
2112 /* Look through the environment for any variables of the
2113 * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
2114 * already handled above. These are assumed to be locale
2115 * settings. Output them and their values. */
cd999af9 2116 for (e = environ; *e; e++) {
d5e32b93
KW
2117 const STRLEN prefix_len = sizeof("LC_") - 1;
2118 STRLEN uppers_len;
2119
cd999af9 2120 if ( strBEGINs(*e, "LC_")
c8b388b0 2121 && ! strBEGINs(*e, "LC_ALL=")
d5e32b93
KW
2122 && (uppers_len = strspn(*e + prefix_len,
2123 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
2124 && ((*e)[prefix_len + uppers_len] == '='))
cd999af9
KW
2125 {
2126 PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
d5e32b93
KW
2127 (int) (prefix_len + uppers_len), *e,
2128 *e + prefix_len + uppers_len + 1);
cd999af9
KW
2129 }
2130 }
49c85077 2131 }
7d4bcc4a
KW
2132
2133# else
2134
49c85077
KW
2135 PerlIO_printf(Perl_error_log,
2136 "\t(possibly more locale environment variables)\n");
7d4bcc4a
KW
2137
2138# endif
98994639 2139
49c85077
KW
2140 PerlIO_printf(Perl_error_log,
2141 "\tLANG = %c%s%c\n",
2142 lang ? '"' : '(',
2143 lang ? lang : "unset",
2144 lang ? '"' : ')');
98994639 2145
49c85077
KW
2146 PerlIO_printf(Perl_error_log,
2147 " are supported and installed on your system.\n");
2148 }
98994639 2149
65ebb059 2150 /* Calculate what fallback locales to try. We have avoided this
f6bab5f6 2151 * until we have to, because failure is quite unlikely. This will
65ebb059
KW
2152 * usually change the upper bound of the loop we are in.
2153 *
2154 * Since the system's default way of setting the locale has not
2155 * found one that works, We use Perl's defined ordering: LC_ALL,
2156 * LANG, and the C locale. We don't try the same locale twice, so
2157 * don't add to the list if already there. (On POSIX systems, the
2158 * LC_ALL element will likely be a repeat of the 0th element "",
6b058d42
KW
2159 * but there's no harm done by doing it explicitly.
2160 *
2161 * Note that this tries the LC_ALL environment variable even on
2162 * systems which have no LC_ALL locale setting. This may or may
2163 * not have been originally intentional, but there's no real need
2164 * to change the behavior. */
65ebb059
KW
2165 if (lc_all) {
2166 for (j = 0; j < trial_locales_count; j++) {
2167 if (strEQ(lc_all, trial_locales[j])) {
2168 goto done_lc_all;
2169 }
2170 }
2171 trial_locales[trial_locales_count++] = lc_all;
2172 }
2173 done_lc_all:
98994639 2174
65ebb059
KW
2175 if (lang) {
2176 for (j = 0; j < trial_locales_count; j++) {
2177 if (strEQ(lang, trial_locales[j])) {
2178 goto done_lang;
2179 }
2180 }
2181 trial_locales[trial_locales_count++] = lang;
2182 }
2183 done_lang:
2184
7d4bcc4a
KW
2185# if defined(WIN32) && defined(LC_ALL)
2186
65ebb059
KW
2187 /* For Windows, we also try the system default locale before "C".
2188 * (If there exists a Windows without LC_ALL we skip this because
2189 * it gets too complicated. For those, the "C" is the next
2190 * fallback possibility). The "" is the same as the 0th element of
2191 * the array, but the code at the loop above knows to treat it
2192 * differently when not the 0th */
2193 trial_locales[trial_locales_count++] = "";
7d4bcc4a
KW
2194
2195# endif
65ebb059
KW
2196
2197 for (j = 0; j < trial_locales_count; j++) {
2198 if (strEQ("C", trial_locales[j])) {
2199 goto done_C;
2200 }
2201 }
2202 trial_locales[trial_locales_count++] = "C";
98994639 2203
65ebb059
KW
2204 done_C: ;
2205 } /* end of first time through the loop */
98994639 2206
7d4bcc4a
KW
2207# ifdef WIN32
2208
65ebb059 2209 next_iteration: ;
7d4bcc4a
KW
2210
2211# endif
65ebb059
KW
2212
2213 } /* end of looping through the trial locales */
2214
2215 if (ok < 1) { /* If we tried to fallback */
2216 const char* msg;
2217 if (! setlocale_failure) { /* fallback succeeded */
2218 msg = "Falling back to";
2219 }
2220 else { /* fallback failed */
98994639 2221
65ebb059
KW
2222 /* We dropped off the end of the loop, so have to decrement i to
2223 * get back to the value the last time through */
2224 i--;
98994639 2225
65ebb059
KW
2226 ok = -1;
2227 msg = "Failed to fall back to";
2228
2229 /* To continue, we should use whatever values we've got */
7d4bcc4a
KW
2230
2231# ifdef USE_LOCALE_CTYPE
2232
49c85077
KW
2233 Safefree(curctype);
2234 curctype = savepv(setlocale(LC_CTYPE, NULL));
5d1187d1 2235 DEBUG_LOCALE_INIT(LC_CTYPE, NULL, curctype);
7d4bcc4a
KW
2236
2237# endif /* USE_LOCALE_CTYPE */
2238# ifdef USE_LOCALE_COLLATE
2239
49c85077
KW
2240 Safefree(curcoll);
2241 curcoll = savepv(setlocale(LC_COLLATE, NULL));
5d1187d1 2242 DEBUG_LOCALE_INIT(LC_COLLATE, NULL, curcoll);
7d4bcc4a
KW
2243
2244# endif /* USE_LOCALE_COLLATE */
2245# ifdef USE_LOCALE_NUMERIC
2246
49c85077
KW
2247 Safefree(curnum);
2248 curnum = savepv(setlocale(LC_NUMERIC, NULL));
5d1187d1 2249 DEBUG_LOCALE_INIT(LC_NUMERIC, NULL, curnum);
7d4bcc4a
KW
2250
2251# endif /* USE_LOCALE_NUMERIC */
2252
65ebb059
KW
2253 }
2254
2255 if (locwarn) {
2256 const char * description;
2257 const char * name = "";
2258 if (strEQ(trial_locales[i], "C")) {
2259 description = "the standard locale";
2260 name = "C";
2261 }
7d4bcc4a
KW
2262
2263# ifdef SYSTEM_DEFAULT_LOCALE
2264
65ebb059
KW
2265 else if (strEQ(trial_locales[i], "")) {
2266 description = "the system default locale";
2267 if (system_default_locale) {
2268 name = system_default_locale;
2269 }
2270 }
7d4bcc4a
KW
2271
2272# endif /* SYSTEM_DEFAULT_LOCALE */
2273
65ebb059
KW
2274 else {
2275 description = "a fallback locale";
2276 name = trial_locales[i];
2277 }
2278 if (name && strNE(name, "")) {
2279 PerlIO_printf(Perl_error_log,
2280 "perl: warning: %s %s (\"%s\").\n", msg, description, name);
2281 }
2282 else {
2283 PerlIO_printf(Perl_error_log,
2284 "perl: warning: %s %s.\n", msg, description);
2285 }
2286 }
2287 } /* End of tried to fallback */
98994639 2288
7d4bcc4a
KW
2289# ifdef USE_LOCALE_CTYPE
2290
98994639 2291 new_ctype(curctype);
98994639 2292
7d4bcc4a
KW
2293# endif /* USE_LOCALE_CTYPE */
2294# ifdef USE_LOCALE_COLLATE
2295
98994639 2296 new_collate(curcoll);
98994639 2297
7d4bcc4a
KW
2298# endif /* USE_LOCALE_COLLATE */
2299# ifdef USE_LOCALE_NUMERIC
2300
98994639 2301 new_numeric(curnum);
b310b053 2302
7d4bcc4a
KW
2303# endif /* USE_LOCALE_NUMERIC */
2304# if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
2305
49c85077
KW
2306 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
2307 * locale is UTF-8. If PL_utf8locale and PL_unicode (set by -C or by
2308 * $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the
2309 * PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open
2310 * discipline. */
c1284011 2311 PL_utf8locale = _is_cur_LC_category_utf8(LC_CTYPE);
49c85077 2312
a05d7ebb 2313 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
fde18df1
JH
2314 This is an alternative to using the -C command line switch
2315 (the -C if present will override this). */
2316 {
dd374669 2317 const char *p = PerlEnv_getenv("PERL_UNICODE");
a05d7ebb 2318 PL_unicode = p ? parse_unicode_opts(&p) : 0;
5a22a2bb
NC
2319 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
2320 PL_utf8cache = -1;
b310b053
JH
2321 }
2322
7d4bcc4a
KW
2323# endif
2324# ifdef USE_LOCALE_CTYPE
2325
43c5f42d 2326 Safefree(curctype);
7d4bcc4a
KW
2327
2328# endif /* USE_LOCALE_CTYPE */
2329# ifdef USE_LOCALE_COLLATE
2330
43c5f42d 2331 Safefree(curcoll);
7d4bcc4a
KW
2332
2333# endif /* USE_LOCALE_COLLATE */
2334# ifdef USE_LOCALE_NUMERIC
2335
43c5f42d 2336 Safefree(curnum);
8ef6e574 2337
7d4bcc4a
KW
2338# endif /* USE_LOCALE_NUMERIC */
2339
2340# ifdef __GLIBC__
2341
175c4cf9 2342 Safefree(language);
7d4bcc4a
KW
2343
2344# endif
175c4cf9
KW
2345
2346 Safefree(lc_all);
2347 Safefree(lang);
2348
e3305790 2349#endif /* USE_LOCALE */
2fcc0ca9 2350#ifdef DEBUGGING
7d4bcc4a 2351
2fcc0ca9 2352 /* So won't continue to output stuff */
27cdc72e 2353 DEBUG_INITIALIZATION_set(FALSE);
7d4bcc4a 2354
2fcc0ca9
KW
2355#endif
2356
98994639
HS
2357 return ok;
2358}
2359
98994639
HS
2360#ifdef USE_LOCALE_COLLATE
2361
a4a439fb 2362char *
a4a439fb
KW
2363Perl__mem_collxfrm(pTHX_ const char *input_string,
2364 STRLEN len, /* Length of 'input_string' */
2365 STRLEN *xlen, /* Set to length of returned string
2366 (not including the collation index
2367 prefix) */
2368 bool utf8 /* Is the input in UTF-8? */
6696cfa7 2369 )
98994639 2370{
a4a439fb
KW
2371
2372 /* _mem_collxfrm() is a bit like strxfrm() but with two important
2373 * differences. First, it handles embedded NULs. Second, it allocates a bit
2374 * more memory than needed for the transformed data itself. The real
55e5378d 2375 * transformed data begins at offset COLLXFRM_HDR_LEN. *xlen is set to
a4a439fb
KW
2376 * the length of that, and doesn't include the collation index size.
2377 * Please see sv_collxfrm() to see how this is used. */
2378
55e5378d
KW
2379#define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
2380
6696cfa7
KW
2381 char * s = (char *) input_string;
2382 STRLEN s_strlen = strlen(input_string);
79f120c8 2383 char *xbuf = NULL;
55e5378d 2384 STRLEN xAlloc; /* xalloc is a reserved word in VC */
17f41037 2385 STRLEN length_in_chars;
c664130f 2386 bool first_time = TRUE; /* Cleared after first loop iteration */
98994639 2387
a4a439fb
KW
2388 PERL_ARGS_ASSERT__MEM_COLLXFRM;
2389
2390 /* Must be NUL-terminated */
2391 assert(*(input_string + len) == '\0');
7918f24d 2392
79f120c8
KW
2393 /* If this locale has defective collation, skip */
2394 if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
c7202dee
KW
2395 DEBUG_L(PerlIO_printf(Perl_debug_log,
2396 "_mem_collxfrm: locale's collation is defective\n"));
79f120c8
KW
2397 goto bad;
2398 }
2399
6696cfa7
KW
2400 /* Replace any embedded NULs with the control that sorts before any others.
2401 * This will give as good as possible results on strings that don't
2402 * otherwise contain that character, but otherwise there may be
2403 * less-than-perfect results with that character and NUL. This is
fdc080f3 2404 * unavoidable unless we replace strxfrm with our own implementation. */
fd43f63c
KW
2405 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
2406 NUL */
6696cfa7
KW
2407 char * e = s + len;
2408 char * sans_nuls;
fdc080f3 2409 STRLEN sans_nuls_len;
94762aa0 2410 int try_non_controls;
afc4976f
KW
2411 char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
2412 making sure 2nd byte is NUL.
2413 */
2414 STRLEN this_replacement_len;
2415
1e4c9676
KW
2416 /* If we don't know what non-NUL control character sorts lowest for
2417 * this locale, find it */
f28f4d2a 2418 if (PL_strxfrm_NUL_replacement == '\0') {
6696cfa7 2419 int j;
afc4976f 2420 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
6696cfa7
KW
2421 includes the collation index
2422 prefixed. */
2423
91c0e2e0 2424 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
94762aa0
KW
2425
2426 /* Unlikely, but it may be that no control will work to replace
1e4c9676
KW
2427 * NUL, in which case we instead look for any character. Controls
2428 * are preferred because collation order is, in general, context
2429 * sensitive, with adjoining characters affecting the order, and
2430 * controls are less likely to have such interactions, allowing the
2431 * NUL-replacement to stand on its own. (Another way to look at it
2432 * is to imagine what would happen if the NUL were replaced by a
2433 * combining character; it wouldn't work out all that well.) */
94762aa0
KW
2434 for (try_non_controls = 0;
2435 try_non_controls < 2;
2436 try_non_controls++)
2437 {
d4ff9586
KW
2438 /* Look through all legal code points (NUL isn't) */
2439 for (j = 1; j < 256; j++) {
2440 char * x; /* j's xfrm plus collation index */
2441 STRLEN x_len; /* length of 'x' */
2442 STRLEN trial_len = 1;
736a4fed 2443 char cur_source[] = { '\0', '\0' };
d4ff9586 2444
736a4fed
KW
2445 /* Skip non-controls the first time through the loop. The
2446 * controls in a UTF-8 locale are the L1 ones */
afc4976f
KW
2447 if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
2448 ? ! isCNTRL_L1(j)
2449 : ! isCNTRL_LC(j))
2450 {
d4ff9586 2451 continue;
6696cfa7 2452 }
6696cfa7 2453
736a4fed
KW
2454 /* Create a 1-char string of the current code point */
2455 cur_source[0] = (char) j;
2456
d4ff9586
KW
2457 /* Then transform it */
2458 x = _mem_collxfrm(cur_source, trial_len, &x_len,
afc4976f 2459 0 /* The string is not in UTF-8 */);
6696cfa7 2460
1e4c9676 2461 /* Ignore any character that didn't successfully transform.
d4ff9586
KW
2462 * */
2463 if (! x) {
2464 continue;
2465 }
6696cfa7 2466
d4ff9586
KW
2467 /* If this character's transformation is lower than
2468 * the current lowest, this one becomes the lowest */
2469 if ( cur_min_x == NULL
2470 || strLT(x + COLLXFRM_HDR_LEN,
2471 cur_min_x + COLLXFRM_HDR_LEN))
2472 {
f28f4d2a 2473 PL_strxfrm_NUL_replacement = j;
d4ff9586 2474 cur_min_x = x;
d4ff9586
KW
2475 }
2476 else {
2477 Safefree(x);
2478 }
1e4c9676 2479 } /* end of loop through all 255 characters */
6696cfa7 2480
1e4c9676 2481 /* Stop looking if found */
94762aa0
KW
2482 if (cur_min_x) {
2483 break;
2484 }
2485
2486 /* Unlikely, but possible, if there aren't any controls that
2487 * work in the locale, repeat the loop, looking for any
2488 * character that works */
2489 DEBUG_L(PerlIO_printf(Perl_debug_log,
2490 "_mem_collxfrm: No control worked. Trying non-controls\n"));
1e4c9676 2491 } /* End of loop to try first the controls, then any char */
6696cfa7 2492
94762aa0
KW
2493 if (! cur_min_x) {
2494 DEBUG_L(PerlIO_printf(Perl_debug_log,
2495 "_mem_collxfrm: Couldn't find any character to replace"
2496 " embedded NULs in locale %s with", PL_collation_name));
2497 goto bad;
58eebef2
KW
2498 }
2499
94762aa0
KW
2500 DEBUG_L(PerlIO_printf(Perl_debug_log,
2501 "_mem_collxfrm: Replacing embedded NULs in locale %s with "
f28f4d2a 2502 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
94762aa0 2503
6696cfa7 2504 Safefree(cur_min_x);
1e4c9676 2505 } /* End of determining the character that is to replace NULs */
afc4976f
KW
2506
2507 /* If the replacement is variant under UTF-8, it must match the
2508 * UTF8-ness as the original */
f28f4d2a
KW
2509 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
2510 this_replacement_char[0] =
2511 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
2512 this_replacement_char[1] =
2513 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
afc4976f
KW
2514 this_replacement_len = 2;
2515 }
2516 else {
f28f4d2a 2517 this_replacement_char[0] = PL_strxfrm_NUL_replacement;
afc4976f
KW
2518 /* this_replacement_char[1] = '\0' was done at initialization */
2519 this_replacement_len = 1;
6696cfa7
KW
2520 }
2521
2522 /* The worst case length for the replaced string would be if every
2523 * character in it is NUL. Multiply that by the length of each
2524 * replacement, and allow for a trailing NUL */
afc4976f 2525 sans_nuls_len = (len * this_replacement_len) + 1;
fdc080f3 2526 Newx(sans_nuls, sans_nuls_len, char);
6696cfa7
KW
2527 *sans_nuls = '\0';
2528
6696cfa7
KW
2529 /* Replace each NUL with the lowest collating control. Loop until have
2530 * exhausted all the NULs */
2531 while (s + s_strlen < e) {
6069d6c5 2532 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
2533
2534 /* Do the actual replacement */
6069d6c5 2535 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
6696cfa7
KW
2536
2537 /* Move past the input NUL */
2538 s += s_strlen + 1;
2539 s_strlen = strlen(s);
2540 }
2541
2542 /* And add anything that trails the final NUL */
6069d6c5 2543 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
2544
2545 /* Switch so below we transform this modified string */
2546 s = sans_nuls;
2547 len = strlen(s);
1e4c9676 2548 } /* End of replacing NULs */
6696cfa7 2549
a4a439fb
KW
2550 /* Make sure the UTF8ness of the string and locale match */
2551 if (utf8 != PL_in_utf8_COLLATE_locale) {
2552 const char * const t = s; /* Temporary so we can later find where the
2553 input was */
2554
2555 /* Here they don't match. Change the string's to be what the locale is
2556 * expecting */
2557
2558 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
2559 s = (char *) bytes_to_utf8((const U8 *) s, &len);
2560 utf8 = TRUE;
2561 }
2562 else { /* locale is not UTF-8; but input is; downgrade the input */
2563
2564 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
2565
2566 /* If the downgrade was successful we are done, but if the input
2567 * contains things that require UTF-8 to represent, have to do
2568 * damage control ... */
2569 if (UNLIKELY(utf8)) {
2570
2571 /* What we do is construct a non-UTF-8 string with
2572 * 1) the characters representable by a single byte converted
2573 * to be so (if necessary);
2574 * 2) and the rest converted to collate the same as the
2575 * highest collating representable character. That makes
2576 * them collate at the end. This is similar to how we
2577 * handle embedded NULs, but we use the highest collating
2578 * code point instead of the smallest. Like the NUL case,
2579 * this isn't perfect, but is the best we can reasonably
2580 * do. Every above-255 code point will sort the same as
2581 * the highest-sorting 0-255 code point. If that code
2582 * point can combine in a sequence with some other code
2583 * points for weight calculations, us changing something to
2584 * be it can adversely affect the results. But in most
2585 * cases, it should work reasonably. And note that this is
2586 * really an illegal situation: using code points above 255
2587 * on a locale where only 0-255 are valid. If two strings
2588 * sort entirely equal, then the sort order for the
2589 * above-255 code points will be in code point order. */
2590
2591 utf8 = FALSE;
2592
2593 /* If we haven't calculated the code point with the maximum
2594 * collating order for this locale, do so now */
2595 if (! PL_strxfrm_max_cp) {
2596 int j;
2597
2598 /* The current transformed string that collates the
2599 * highest (except it also includes the prefixed collation
2600 * index. */
2601 char * cur_max_x = NULL;
2602
2603 /* Look through all legal code points (NUL isn't) */
2604 for (j = 1; j < 256; j++) {
2605 char * x;
2606 STRLEN x_len;
736a4fed 2607 char cur_source[] = { '\0', '\0' };
a4a439fb 2608
736a4fed
KW
2609 /* Create a 1-char string of the current code point */
2610 cur_source[0] = (char) j;
a4a439fb
KW
2611
2612 /* Then transform it */
2613 x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
2614
2615 /* If something went wrong (which it shouldn't), just
2616 * ignore this code point */
94762aa0 2617 if (! x) {
a4a439fb
KW
2618 continue;
2619 }
2620
2621 /* If this character's transformation is higher than
2622 * the current highest, this one becomes the highest */
2623 if ( cur_max_x == NULL
55e5378d
KW
2624 || strGT(x + COLLXFRM_HDR_LEN,
2625 cur_max_x + COLLXFRM_HDR_LEN))
a4a439fb
KW
2626 {
2627 PL_strxfrm_max_cp = j;
2628 cur_max_x = x;
2629 }
2630 else {
2631 Safefree(x);
2632 }
2633 }
2634
94762aa0
KW
2635 if (! cur_max_x) {
2636 DEBUG_L(PerlIO_printf(Perl_debug_log,
2637 "_mem_collxfrm: Couldn't find any character to"
2638 " replace above-Latin1 chars in locale %s with",
2639 PL_collation_name));
2640 goto bad;
2641 }
2642
58eebef2
KW
2643 DEBUG_L(PerlIO_printf(Perl_debug_log,
2644 "_mem_collxfrm: highest 1-byte collating character"
2645 " in locale %s is 0x%02X\n",
2646 PL_collation_name,
2647 PL_strxfrm_max_cp));
58eebef2 2648
a4a439fb
KW
2649 Safefree(cur_max_x);
2650 }
2651
2652 /* Here we know which legal code point collates the highest.
2653 * We are ready to construct the non-UTF-8 string. The length
2654 * will be at least 1 byte smaller than the input string
2655 * (because we changed at least one 2-byte character into a
2656 * single byte), but that is eaten up by the trailing NUL */
2657 Newx(s, len, char);
2658
2659 {
2660 STRLEN i;
2661 STRLEN d= 0;
042d9e50 2662 char * e = (char *) t + len;
a4a439fb
KW
2663
2664 for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
2665 U8 cur_char = t[i];
2666 if (UTF8_IS_INVARIANT(cur_char)) {
2667 s[d++] = cur_char;
2668 }
042d9e50 2669 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
a4a439fb
KW
2670 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
2671 }
2672 else { /* Replace illegal cp with highest collating
2673 one */
2674 s[d++] = PL_strxfrm_max_cp;
2675 }
2676 }
2677 s[d++] = '\0';
2678 Renew(s, d, char); /* Free up unused space */
2679 }
2680 }
2681 }
2682
2683 /* Here, we have constructed a modified version of the input. It could
2684 * be that we already had a modified copy before we did this version.
2685 * If so, that copy is no longer needed */
2686 if (t != input_string) {
2687 Safefree(t);
2688 }
2689 }
2690
17f41037
KW
2691 length_in_chars = (utf8)
2692 ? utf8_length((U8 *) s, (U8 *) s + len)
2693 : len;
2694
59c018b9
KW
2695 /* The first element in the output is the collation id, used by
2696 * sv_collxfrm(); then comes the space for the transformed string. The
2697 * equation should give us a good estimate as to how much is needed */
55e5378d 2698 xAlloc = COLLXFRM_HDR_LEN
a4a439fb 2699 + PL_collxfrm_base
17f41037 2700 + (PL_collxfrm_mult * length_in_chars);
a02a5408 2701 Newx(xbuf, xAlloc, char);
c7202dee
KW
2702 if (UNLIKELY(! xbuf)) {
2703 DEBUG_L(PerlIO_printf(Perl_debug_log,
2704 "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
98994639 2705 goto bad;
c7202dee 2706 }
98994639 2707
d35fca5f 2708 /* Store the collation id */
98994639 2709 *(U32*)xbuf = PL_collation_ix;
d35fca5f
KW
2710
2711 /* Then the transformation of the input. We loop until successful, or we
2712 * give up */
4ebeff16 2713 for (;;) {
1adab0a7 2714
55e5378d 2715 *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
4ebeff16
KW
2716
2717 /* If the transformed string occupies less space than we told strxfrm()
2718 * was available, it means it successfully transformed the whole
2719 * string. */
55e5378d 2720 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
17f41037 2721
1adab0a7
KW
2722 /* Some systems include a trailing NUL in the returned length.
2723 * Ignore it, using a loop in case multiple trailing NULs are
2724 * returned. */
2725 while ( (*xlen) > 0
2726 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
2727 {
2728 (*xlen)--;
2729 }
2730
17f41037
KW
2731 /* If the first try didn't get it, it means our prediction was low.
2732 * Modify the coefficients so that we predict a larger value in any
2733 * future transformations */
2734 if (! first_time) {
2735 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */
2736 STRLEN computed_guess = PL_collxfrm_base
2737 + (PL_collxfrm_mult * length_in_chars);
e1c30f0c
KW
2738
2739 /* On zero-length input, just keep current slope instead of
2740 * dividing by 0 */
2741 const STRLEN new_m = (length_in_chars != 0)
2742 ? needed / length_in_chars
2743 : PL_collxfrm_mult;
17f41037
KW
2744
2745 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
2746 "%s: %d: initial size of %zu bytes for a length "
2747 "%zu string was insufficient, %zu needed\n",
17f41037 2748 __FILE__, __LINE__,
b07929e4 2749 computed_guess, length_in_chars, needed));
17f41037
KW
2750
2751 /* If slope increased, use it, but discard this result for
2752 * length 1 strings, as we can't be sure that it's a real slope
2753 * change */
2754 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
7d4bcc4a
KW
2755
2756# ifdef DEBUGGING
2757
17f41037
KW
2758 STRLEN old_m = PL_collxfrm_mult;
2759 STRLEN old_b = PL_collxfrm_base;
7d4bcc4a
KW
2760
2761# endif
2762
17f41037
KW
2763 PL_collxfrm_mult = new_m;
2764 PL_collxfrm_base = 1; /* +1 For trailing NUL */
2765 computed_guess = PL_collxfrm_base
2766 + (PL_collxfrm_mult * length_in_chars);
2767 if (computed_guess < needed) {
2768 PL_collxfrm_base += needed - computed_guess;
2769 }
2770
2771 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
2772 "%s: %d: slope is now %zu; was %zu, base "
2773 "is now %zu; was %zu\n",
17f41037 2774 __FILE__, __LINE__,
b07929e4
KW
2775 PL_collxfrm_mult, old_m,
2776 PL_collxfrm_base, old_b));
17f41037
KW
2777 }
2778 else { /* Slope didn't change, but 'b' did */
2779 const STRLEN new_b = needed
2780 - computed_guess
2781 + PL_collxfrm_base;
2782 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4 2783 "%s: %d: base is now %zu; was %zu\n",
17f41037 2784 __FILE__, __LINE__,
b07929e4 2785 new_b, PL_collxfrm_base));
17f41037
KW
2786 PL_collxfrm_base = new_b;
2787 }
2788 }
2789
4ebeff16
KW
2790 break;
2791 }
bb0f664e 2792
c7202dee
KW
2793 if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
2794 DEBUG_L(PerlIO_printf(Perl_debug_log,
2795 "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
2796 *xlen, PERL_INT_MAX));
4ebeff16 2797 goto bad;
c7202dee 2798 }
d35fca5f 2799
c664130f 2800 /* A well-behaved strxfrm() returns exactly how much space it needs
1adab0a7
KW
2801 * (usually not including the trailing NUL) when it fails due to not
2802 * enough space being provided. Assume that this is the case unless
2803 * it's been proven otherwise */
c664130f 2804 if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
55e5378d 2805 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
c664130f
KW
2806 }
2807 else { /* Here, either:
2808 * 1) The strxfrm() has previously shown bad behavior; or
2809 * 2) It isn't the first time through the loop, which means
2810 * that the strxfrm() is now showing bad behavior, because
2811 * we gave it what it said was needed in the previous
2812 * iteration, and it came back saying it needed still more.
2813 * (Many versions of cygwin fit this. When the buffer size
2814 * isn't sufficient, they return the input size instead of
2815 * how much is needed.)
d4ff9586
KW
2816 * Increase the buffer size by a fixed percentage and try again.
2817 * */
6ddd902c 2818 xAlloc += (xAlloc / 4) + 1;
c664130f 2819 PL_strxfrm_is_behaved = FALSE;
c664130f 2820
7d4bcc4a
KW
2821# ifdef DEBUGGING
2822
58eebef2
KW
2823 if (DEBUG_Lv_TEST || debug_initialization) {
2824 PerlIO_printf(Perl_debug_log,
2825 "_mem_collxfrm required more space than previously calculated"
b07929e4 2826 " for locale %s, trying again with new guess=%d+%zu\n",
58eebef2 2827 PL_collation_name, (int) COLLXFRM_HDR_LEN,
b07929e4 2828 xAlloc - COLLXFRM_HDR_LEN);
58eebef2 2829 }
7d4bcc4a
KW
2830
2831# endif
2832
58eebef2 2833 }
c664130f 2834
4ebeff16 2835 Renew(xbuf, xAlloc, char);
c7202dee
KW
2836 if (UNLIKELY(! xbuf)) {
2837 DEBUG_L(PerlIO_printf(Perl_debug_log,
2838 "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
4ebeff16 2839 goto bad;
c7202dee 2840 }
c664130f
KW
2841
2842 first_time = FALSE;
4ebeff16 2843 }
98994639 2844
6696cfa7 2845
7d4bcc4a
KW
2846# ifdef DEBUGGING
2847
58eebef2 2848 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee
KW
2849
2850 print_collxfrm_input_and_return(s, s + len, xlen, utf8);
2851 PerlIO_printf(Perl_debug_log, "Its xfrm is:");
7e2f38b2
KW
2852 PerlIO_printf(Perl_debug_log, "%s\n",
2853 _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
2854 *xlen, 1));
58eebef2 2855 }
7d4bcc4a
KW
2856
2857# endif
58eebef2 2858
3c5f993e 2859 /* Free up unneeded space; retain ehough for trailing NUL */
55e5378d 2860 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
98994639 2861
6696cfa7
KW
2862 if (s != input_string) {
2863 Safefree(s);
98994639
HS
2864 }
2865
98994639
HS
2866 return xbuf;
2867
2868 bad:
2869 Safefree(xbuf);
6696cfa7
KW
2870 if (s != input_string) {
2871 Safefree(s);
2872 }
98994639 2873 *xlen = 0;
7d4bcc4a
KW
2874
2875# ifdef DEBUGGING
2876
58eebef2 2877 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee 2878 print_collxfrm_input_and_return(s, s + len, NULL, utf8);
58eebef2 2879 }
7d4bcc4a
KW
2880
2881# endif
2882
98994639
HS
2883 return NULL;
2884}
2885
7d4bcc4a 2886# ifdef DEBUGGING
c7202dee 2887
4cbaac56 2888STATIC void
c7202dee
KW
2889S_print_collxfrm_input_and_return(pTHX_
2890 const char * const s,
2891 const char * const e,
2892 const STRLEN * const xlen,
2893 const bool is_utf8)
2894{
c7202dee
KW
2895
2896 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
2897
511e4ff7
DM
2898 PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
2899 (UV)PL_collation_ix);
c7202dee 2900 if (xlen) {
08b6dc1d 2901 PerlIO_printf(Perl_debug_log, "%zu", *xlen);
c7202dee
KW
2902 }
2903 else {
2904 PerlIO_printf(Perl_debug_log, "NULL");
2905 }
2906 PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
2907 PL_collation_name);
9c8a6dc2
KW
2908 print_bytes_for_locale(s, e, is_utf8);
2909
2910 PerlIO_printf(Perl_debug_log, "'\n");
2911}
2912
2913STATIC void
2914S_print_bytes_for_locale(pTHX_
2915 const char * const s,
2916 const char * const e,
2917 const bool is_utf8)
2918{
2919 const char * t = s;
2920 bool prev_was_printable = TRUE;
2921 bool first_time = TRUE;
2922
2923 PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
c7202dee
KW
2924
2925 while (t < e) {
2926 UV cp = (is_utf8)
2927 ? utf8_to_uvchr_buf((U8 *) t, e, NULL)
2928 : * (U8 *) t;
2929 if (isPRINT(cp)) {
2930 if (! prev_was_printable) {
2931 PerlIO_printf(Perl_debug_log, " ");
2932 }
2933 PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
2934 prev_was_printable = TRUE;
2935 }
2936 else {
2937 if (! first_time) {
2938 PerlIO_printf(Perl_debug_log, " ");
2939 }
147e3846 2940 PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
c7202dee
KW
2941 prev_was_printable = FALSE;
2942 }
2943 t += (is_utf8) ? UTF8SKIP(t) : 1;
2944 first_time = FALSE;
2945 }
c7202dee
KW
2946}
2947
7d4bcc4a 2948# endif /* #ifdef DEBUGGING */
98994639 2949#endif /* USE_LOCALE_COLLATE */
58eebef2 2950
8ef6e574
KW
2951#ifdef USE_LOCALE
2952
c1284011
KW
2953bool
2954Perl__is_cur_LC_category_utf8(pTHX_ int category)
7d74bb61
KW
2955{
2956 /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
2957 * otherwise. 'category' may not be LC_ALL. If the platform doesn't have
119ee68b 2958 * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
609548d2
KW
2959 * could give the wrong result. The result will very likely be correct for
2960 * languages that have commonly used non-ASCII characters, but for notably
2961 * English, it comes down to if the locale's name ends in something like
2962 * "UTF-8". It errs on the side of not being a UTF-8 locale. */
7d74bb61
KW
2963
2964 char *save_input_locale = NULL;
7d74bb61
KW
2965 STRLEN final_pos;
2966
7d4bcc4a
KW
2967# ifdef LC_ALL
2968
7d74bb61 2969 assert(category != LC_ALL);
7d4bcc4a
KW
2970
2971# endif
7d74bb61
KW
2972
2973 /* First dispose of the trivial cases */
b07fffd1 2974 save_input_locale = setlocale(category, NULL);
7d74bb61 2975 if (! save_input_locale) {
69014004
KW
2976 DEBUG_L(PerlIO_printf(Perl_debug_log,
2977 "Could not find current locale for category %d\n",
2978 category));
7d74bb61
KW
2979 return FALSE; /* XXX maybe should croak */
2980 }
b07fffd1 2981 save_input_locale = stdize_locale(savepv(save_input_locale));
a39edc4c 2982 if (isNAME_C_OR_POSIX(save_input_locale)) {
69014004
KW
2983 DEBUG_L(PerlIO_printf(Perl_debug_log,
2984 "Current locale for category %d is %s\n",
2985 category, save_input_locale));
b07fffd1 2986 Safefree(save_input_locale);
7d74bb61
KW
2987 return FALSE;
2988 }
2989
7d4bcc4a 2990# if defined(USE_LOCALE_CTYPE) \
1d958db2 2991 && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET)))
7d74bb61 2992
1d958db2 2993 { /* Next try nl_langinfo or MB_CUR_MAX if available */
7d74bb61
KW
2994
2995 char *save_ctype_locale = NULL;
119ee68b 2996 bool is_utf8;
7d74bb61 2997
119ee68b 2998 if (category != LC_CTYPE) { /* These work only on LC_CTYPE */
7d74bb61
KW
2999
3000 /* Get the current LC_CTYPE locale */
4f72bb37 3001 save_ctype_locale = setlocale(LC_CTYPE, NULL);
7d74bb61 3002 if (! save_ctype_locale) {
69014004
KW
3003 DEBUG_L(PerlIO_printf(Perl_debug_log,
3004 "Could not find current locale for LC_CTYPE\n"));
7d74bb61
KW
3005 goto cant_use_nllanginfo;
3006 }
4f72bb37 3007 save_ctype_locale = stdize_locale(savepv(save_ctype_locale));
7d74bb61
KW
3008
3009 /* If LC_CTYPE and the desired category use the same locale, this
3010 * means that finding the value for LC_CTYPE is the same as finding
3011 * the value for the desired category. Otherwise, switch LC_CTYPE
3012 * to the desired category's locale */
3013 if (strEQ(save_ctype_locale, save_input_locale)) {
3014 Safefree(save_ctype_locale);
3015 save_ctype_locale = NULL;
3016 }
3017 else if (! setlocale(LC_CTYPE, save_input_locale)) {
69014004
KW
3018 DEBUG_L(PerlIO_printf(Perl_debug_log,
3019 "Could not change LC_CTYPE locale to %s\n",
3020 save_input_locale));
7d74bb61
KW
3021 Safefree(save_ctype_locale);
3022 goto cant_use_nllanginfo;
3023 }
3024 }
3025
69014004
KW
3026 DEBUG_L(PerlIO_printf(Perl_debug_log, "Current LC_CTYPE locale=%s\n",
3027 save_input_locale));
3028
7d74bb61 3029 /* Here the current LC_CTYPE is set to the locale of the category whose
1d958db2
KW
3030 * information is desired. This means that nl_langinfo() and MB_CUR_MAX
3031 * should give the correct results */
119ee68b 3032
7d4bcc4a
KW
3033# if defined(HAS_NL_LANGINFO) && defined(CODESET)
3034
1d958db2 3035 {
4f72bb37 3036 char *codeset = nl_langinfo(CODESET);
1d958db2 3037 if (codeset && strNE(codeset, "")) {
4f72bb37 3038 codeset = savepv(codeset);
119ee68b 3039
1d958db2
KW
3040 /* If we switched LC_CTYPE, switch back */
3041 if (save_ctype_locale) {
3042 setlocale(LC_CTYPE, save_ctype_locale);
3043 Safefree(save_ctype_locale);
3044 }
3045
3046 is_utf8 = foldEQ(codeset, STR_WITH_LEN("UTF-8"))
3047 || foldEQ(codeset, STR_WITH_LEN("UTF8"));
3048
69014004
KW
3049 DEBUG_L(PerlIO_printf(Perl_debug_log,
3050 "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
3051 codeset, is_utf8));
1d958db2
KW
3052 Safefree(codeset);
3053 Safefree(save_input_locale);
3054 return is_utf8;
3055 }
119ee68b
KW
3056 }
3057
7d4bcc4a
KW
3058# endif
3059# ifdef MB_CUR_MAX
1d958db2
KW
3060
3061 /* Here, either we don't have nl_langinfo, or it didn't return a
3062 * codeset. Try MB_CUR_MAX */
3063
119ee68b
KW
3064 /* Standard UTF-8 needs at least 4 bytes to represent the maximum
3065 * Unicode code point. Since UTF-8 is the only non-single byte
3066 * encoding we handle, we just say any such encoding is UTF-8, and if
3067 * turns out to be wrong, other things will fail */
3068 is_utf8 = MB_CUR_MAX >= 4;
3069
69014004
KW
3070 DEBUG_L(PerlIO_printf(Perl_debug_log,
3071 "\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n",
3072 (int) MB_CUR_MAX, is_utf8));
3073
119ee68b
KW
3074 Safefree(save_input_locale);
3075
7d4bcc4a 3076# ifdef HAS_MBTOWC
119ee68b
KW
3077
3078 /* ... But, most system that have MB_CUR_MAX will also have mbtowc(),
3079 * since they are both in the C99 standard. We can feed a known byte
3080 * string to the latter function, and check that it gives the expected
3081 * result */
3082 if (is_utf8) {
3083 wchar_t wc;
856b881c 3084 PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
69014004 3085 errno = 0;
f019f68f 3086 if ((size_t)mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8))
119ee68b
KW
3087 != strlen(HYPHEN_UTF8)
3088 || wc != (wchar_t) 0x2010)
3089 {
3090 is_utf8 = FALSE;
abdcbdb8 3091 DEBUG_L(PerlIO_printf(Perl_debug_log, "\thyphen=U+%x\n", (unsigned int)wc));
69014004
KW
3092 DEBUG_L(PerlIO_printf(Perl_debug_log,
3093 "\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n",
3094 mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8)), errno));
119ee68b
KW
3095 }
3096 }
7d4bcc4a
KW
3097
3098# endif
119ee68b 3099
1d958db2
KW
3100 /* If we switched LC_CTYPE, switch back */
3101 if (save_ctype_locale) {
3102 setlocale(LC_CTYPE, save_ctype_locale);
3103 Safefree(save_ctype_locale);
119ee68b 3104 }
7d74bb61 3105
1d958db2 3106 return is_utf8;
7d4bcc4a
KW
3107
3108# endif
3109
7d74bb61 3110 }
119ee68b 3111
7d74bb61
KW
3112 cant_use_nllanginfo:
3113
7d4bcc4a 3114# else /* nl_langinfo should work if available, so don't bother compiling this
0080c90a
KW
3115 fallback code. The final fallback of looking at the name is
3116 compiled, and will be executed if nl_langinfo fails */
7d74bb61 3117
97f4de96
KW
3118 /* nl_langinfo not available or failed somehow. Next try looking at the
3119 * currency symbol to see if it disambiguates things. Often that will be
3120 * in the native script, and if the symbol isn't in UTF-8, we know that the
3121 * locale isn't. If it is non-ASCII UTF-8, we infer that the locale is
609548d2
KW
3122 * too, as the odds of a non-UTF8 string being valid UTF-8 are quite small
3123 * */
fa9b773e 3124
7d4bcc4a
KW
3125# ifdef HAS_LOCALECONV
3126# ifdef USE_LOCALE_MONETARY
3127
fa9b773e
KW
3128 {
3129 char *save_monetary_locale = NULL;
fa9b773e 3130 bool only_ascii = FALSE;
13542a67
KW
3131 bool is_utf8 = FALSE;
3132 struct lconv* lc;
fa9b773e 3133
97f4de96
KW
3134 /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of
3135 * the desired category, if it isn't that locale already */
3136
fa9b773e
KW
3137 if (category != LC_MONETARY) {
3138
4f72bb37 3139 save_monetary_locale = setlocale(LC_MONETARY, NULL);
fa9b773e 3140 if (! save_monetary_locale) {
69014004
KW
3141 DEBUG_L(PerlIO_printf(Perl_debug_log,
3142 "Could not find current locale for LC_MONETARY\n"));
fa9b773e
KW
3143 goto cant_use_monetary;
3144 }
4f72bb37 3145 save_monetary_locale = stdize_locale(savepv(save_monetary_locale));
fa9b773e 3146
13542a67
KW
3147 if (strEQ(save_monetary_locale, save_input_locale)) {
3148 Safefree(save_monetary_locale);
3149 save_monetary_locale = NULL;
3150 }
3151 else if (! setlocale(LC_MONETARY, save_input_locale)) {
59c234b4
KW
3152 DEBUG_L(PerlIO_printf(Perl_debug_log,
3153 "Could not change LC_MONETARY locale to %s\n",
3154 save_input_locale));
3155 Safefree(save_monetary_locale);
3156 goto cant_use_monetary;
fa9b773e
KW
3157 }
3158 }
3159
3160 /* Here the current LC_MONETARY is set to the locale of the category
3161 * whose information is desired. */
3162
13542a67
KW
3163 lc = localeconv();
3164 if (! lc
3165 || ! lc->currency_symbol
c5f058df 3166 || is_utf8_invariant_string((U8 *) lc->currency_symbol, 0))
13542a67
KW
3167 {
3168 DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3169 only_ascii = TRUE;
3170 }
3171 else {
3172 is_utf8 = is_utf8_string((U8 *) lc->currency_symbol, 0);
fa9b773e
KW
3173 }
3174
3175 /* If we changed it, restore LC_MONETARY to its original locale */
3176 if (save_monetary_locale) {
3177 setlocale(LC_MONETARY, save_monetary_locale);
3178 Safefree(save_monetary_locale);
3179 }
3180
13542a67 3181 if (! only_ascii) {
fa9b773e 3182
59c234b4
KW
3183 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
3184 * otherwise assume the locale is UTF-8 if and only if the symbol
3185 * is non-ascii UTF-8. */
3186 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
3187 save_input_locale, is_utf8));
3188 Safefree(save_input_locale);
3189 return is_utf8;
13542a67 3190 }
fa9b773e
KW
3191 }
3192 cant_use_monetary:
3193
7d4bcc4a
KW
3194# endif /* USE_LOCALE_MONETARY */
3195# endif /* HAS_LOCALECONV */
fa9b773e 3196
7d4bcc4a 3197# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
15f7e74e
KW
3198
3199/* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
3200 * the names of the months and weekdays, timezone, and am/pm indicator */
3201 {
3202 char *save_time_locale = NULL;
3203 int hour = 10;
3204 bool is_dst = FALSE;
3205 int dom = 1;
3206 int month = 0;
3207 int i;
3208 char * formatted_time;
3209
3210
3211 /* Like above for LC_MONETARY, we set LC_TIME to the locale of the
3212 * desired category, if it isn't that locale already */
3213
3214 if (category != LC_TIME) {
3215
3216 save_time_locale = setlocale(LC_TIME, NULL);
3217 if (! save_time_locale) {
3218 DEBUG_L(PerlIO_printf(Perl_debug_log,
3219 "Could not find current locale for LC_TIME\n"));
3220 goto cant_use_time;
3221 }
3222 save_time_locale = stdize_locale(savepv(save_time_locale));
3223
3224 if (strEQ(save_time_locale, save_input_locale)) {
3225 Safefree(save_time_locale);
3226 save_time_locale = NULL;
3227 }
3228 else if (! setlocale(LC_TIME, save_input_locale)) {
3229 DEBUG_L(PerlIO_printf(Perl_debug_log,
3230 "Could not change LC_TIME locale to %s\n",
3231 save_input_locale));
3232 Safefree(save_time_locale);
3233 goto cant_use_time;
3234 }
3235 }
3236
3237 /* Here the current LC_TIME is set to the locale of the category
3238 * whose information is desired. Look at all the days of the week and
9f10db87 3239 * month names, and the timezone and am/pm indicator for UTF-8 variant
15f7e74e
KW
3240 * characters. The first such a one found will tell us if the locale
3241 * is UTF-8 or not */
3242
3243 for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
3244 formatted_time = my_strftime("%A %B %Z %p",
3ae5cd07 3245 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
c5f058df
KW
3246 if ( ! formatted_time
3247 || is_utf8_invariant_string((U8 *) formatted_time, 0))
3248 {
15f7e74e
KW
3249
3250 /* Here, we didn't find a non-ASCII. Try the next time through
3251 * with the complemented dst and am/pm, and try with the next
3252 * weekday. After we have gotten all weekdays, try the next
3253 * month */
3254 is_dst = ! is_dst;
3255 hour = (hour + 12) % 24;
3256 dom++;
3257 if (i > 6) {
3258 month++;
3259 }
3260 continue;
3261 }
3262
3263 /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
3264 * false otherwise. But first, restore LC_TIME to its original
3265 * locale if we changed it */
3266 if (save_time_locale) {
3267 setlocale(LC_TIME, save_time_locale);
3268 Safefree(save_time_locale);
3269 }
3270
3271 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
3272 save_input_locale,
3273 is_utf8_string((U8 *) formatted_time, 0)));
3274 Safefree(save_input_locale);
3275 return is_utf8_string((U8 *) formatted_time, 0);
3276 }
3277
3278 /* Falling off the end of the loop indicates all the names were just
3279 * ASCII. Go on to the next test. If we changed it, restore LC_TIME
3280 * to its original locale */
3281 if (save_time_locale) {
3282 setlocale(LC_TIME, save_time_locale);
3283 Safefree(save_time_locale);
3284 }
3285 DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3286 }
3287 cant_use_time:
3288
7d4bcc4a 3289# endif
15f7e74e 3290
7d4bcc4a 3291# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
855aeb93
JH
3292
3293/* This code is ifdefd out because it was found to not be necessary in testing
5857e934
KW
3294 * on our dromedary test machine, which has over 700 locales. There, this
3295 * added no value to looking at the currency symbol and the time strings. I
3296 * left it in so as to avoid rewriting it if real-world experience indicates
3297 * that dromedary is an outlier. Essentially, instead of returning abpve if we
855aeb93
JH
3298 * haven't found illegal utf8, we continue on and examine all the strerror()
3299 * messages on the platform for utf8ness. If all are ASCII, we still don't
3300 * know the answer; but otherwise we have a pretty good indication of the
5857e934
KW
3301 * utf8ness. The reason this doesn't help much is that the messages may not
3302 * have been translated into the locale. The currency symbol and time strings
3303 * are much more likely to have been translated. */
3304 {
855aeb93 3305 int e;
5857e934
KW
3306 bool is_utf8 = FALSE;
3307 bool non_ascii = FALSE;
855aeb93 3308 char *save_messages_locale = NULL;
5857e934 3309 const char * errmsg = NULL;
855aeb93 3310
5857e934
KW
3311 /* Like above, we set LC_MESSAGES to the locale of the desired
3312 * category, if it isn't that locale already */
855aeb93
JH
3313
3314 if (category != LC_MESSAGES) {
3315
5857e934 3316 save_messages_locale = setlocale(LC_MESSAGES, NULL);
855aeb93 3317 if (! save_messages_locale) {
5857e934
KW
3318 DEBUG_L(PerlIO_printf(Perl_debug_log,
3319 "Could not find current locale for LC_MESSAGES\n"));
855aeb93
JH
3320 goto cant_use_messages;
3321 }
5857e934 3322 save_messages_locale = stdize_locale(savepv(save_messages_locale));
855aeb93
JH
3323
3324 if (strEQ(save_messages_locale, save_input_locale)) {
5857e934
KW
3325 Safefree(save_messages_locale);
3326 save_messages_locale = NULL;
855aeb93
JH
3327 }
3328 else if (! setlocale(LC_MESSAGES, save_input_locale)) {
5857e934
KW
3329 DEBUG_L(PerlIO_printf(Perl_debug_log,
3330 "Could not change LC_MESSAGES locale to %s\n",
3331 save_input_locale));
855aeb93
JH
3332 Safefree(save_messages_locale);
3333 goto cant_use_messages;
3334 }
3335 }
3336
3337 /* Here the current LC_MESSAGES is set to the locale of the category
5857e934
KW
3338 * whose information is desired. Look through all the messages. We
3339 * can't use Strerror() here because it may expand to code that
3340 * segfaults in miniperl */
855aeb93 3341
5857e934
KW
3342 for (e = 0; e <= sys_nerr; e++) {
3343 errno = 0;
3344 errmsg = sys_errlist[e];
3345 if (errno || !errmsg) {
855aeb93
JH
3346 break;
3347 }
5857e934 3348 errmsg = savepv(errmsg);
c5f058df 3349 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
5857e934
KW
3350 non_ascii = TRUE;
3351 is_utf8 = is_utf8_string((U8 *) errmsg, 0);
3352 break;
855aeb93
JH
3353 }
3354 }
5857e934 3355 Safefree(errmsg);
855aeb93
JH
3356
3357 /* And, if we changed it, restore LC_MESSAGES to its original locale */
3358 if (save_messages_locale) {
3359 setlocale(LC_MESSAGES, save_messages_locale);
3360 Safefree(save_messages_locale);
3361 }
3362
5857e934
KW
3363 if (non_ascii) {
3364
3365 /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
3366 * any non-ascii means it is one; otherwise we assume it isn't */
3367 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
3368 save_input_locale,
3369 is_utf8));
3370 Safefree(save_input_locale);
3371 return is_utf8;
3372 }
855aeb93 3373
5857e934 3374 DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
855aeb93
JH
3375 }
3376 cant_use_messages:
3377
7d4bcc4a
KW
3378# endif
3379# endif /* the code that is compiled when no nl_langinfo */
0080c90a 3380
7d4bcc4a 3381# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
92c0a900 3382 UTF-8 locale */
7d4bcc4a 3383
97f4de96
KW
3384 /* As a last resort, look at the locale name to see if it matches
3385 * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
3386 * return of setlocale(), is actually defined to be opaque, so we can't
3387 * really rely on the absence of various substrings in the name to indicate
3388 * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
3389 * be a UTF-8 locale. Similarly for the other common names */
3390
3391 final_pos = strlen(save_input_locale) - 1;
3392 if (final_pos >= 3) {
3393 char *name = save_input_locale;
3394
3395 /* Find next 'U' or 'u' and look from there */
3396 while ((name += strcspn(name, "Uu") + 1)
3397 <= save_input_locale + final_pos - 2)
3398 {
7d4bcc4a 3399 if ( !isALPHA_FOLD_NE(*name, 't')
305b8651 3400 || isALPHA_FOLD_NE(*(name + 1), 'f'))
97f4de96
KW
3401 {
3402 continue;
3403 }
3404 name += 2;
3405 if (*(name) == '-') {
3406 if ((name > save_input_locale + final_pos - 1)) {
3407 break;
3408 }
3409 name++;
3410 }
3411 if (*(name) == '8') {
97f4de96
KW
3412 DEBUG_L(PerlIO_printf(Perl_debug_log,
3413 "Locale %s ends with UTF-8 in name\n",
3414 save_input_locale));
00c54b9c 3415 Safefree(save_input_locale);
97f4de96
KW
3416 return TRUE;
3417 }
3418 }
3419 DEBUG_L(PerlIO_printf(Perl_debug_log,
3420 "Locale %s doesn't end with UTF-8 in name\n",
3421 save_input_locale));
3422 }
3423
7d4bcc4a
KW
3424# endif
3425# ifdef WIN32
3426
97f4de96 3427 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
8a0832a1 3428 if (memENDs(save_input_locale, final_pos, "65001")) {
97f4de96 3429 DEBUG_L(PerlIO_printf(Perl_debug_log,
8a0832a1 3430 "Locale %s ends with 65001 in name, is UTF-8 locale\n",
97f4de96
KW
3431 save_input_locale));
3432 Safefree(save_input_locale);
3433 return TRUE;
3434 }
7d4bcc4a
KW
3435
3436# endif
97f4de96
KW
3437
3438 /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
3439 * since we are about to return FALSE anyway, there is no point in doing
3440 * this extra work */
7d4bcc4a
KW
3441
3442# if 0
97f4de96
KW
3443 if (instr(save_input_locale, "8859")) {
3444 DEBUG_L(PerlIO_printf(Perl_debug_log,
3445 "Locale %s has 8859 in name, not UTF-8 locale\n",
3446 save_input_locale));
3447 Safefree(save_input_locale);
3448 return FALSE;
3449 }
7d4bcc4a 3450# endif
97f4de96 3451
69014004
KW
3452 DEBUG_L(PerlIO_printf(Perl_debug_log,
3453 "Assuming locale %s is not a UTF-8 locale\n",
3454 save_input_locale));
fa9b773e 3455 Safefree(save_input_locale);
7d74bb61
KW
3456 return FALSE;
3457}
3458
8ef6e574 3459#endif
7d74bb61 3460
d6ded950
KW
3461
3462bool
3463Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
3464{
1a4f13e1 3465 dVAR;
d6ded950
KW
3466 /* Internal function which returns if we are in the scope of a pragma that
3467 * enables the locale category 'category'. 'compiling' should indicate if
3468 * this is during the compilation phase (TRUE) or not (FALSE). */
3469
3470 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
3471
3472 SV *categories = cop_hints_fetch_pvs(cop, "locale", 0);
3473 if (! categories || categories == &PL_sv_placeholder) {
3474 return FALSE;
3475 }
3476
3477 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
3478 * a valid unsigned */
3479 assert(category >= -1);
3480 return cBOOL(SvUV(categories) & (1U << (category + 1)));
3481}
3482
2c6ee1a7 3483char *
6ebbc862
KW
3484Perl_my_strerror(pTHX_ const int errnum)
3485{
3486 /* Returns a mortalized copy of the text of the error message associated
3487 * with 'errnum'. It uses the current locale's text unless the platform
3488 * doesn't have the LC_MESSAGES category or we are not being called from
3489 * within the scope of 'use locale'. In the former case, it uses whatever
3490 * strerror returns; in the latter case it uses the text from the C locale.
3491 *
3492 * The function just calls strerror(), but temporarily switches, if needed,
3493 * to the C locale */
3494
3495 char *errstr;
52770946 3496 dVAR;
6ebbc862 3497
52770946 3498#ifndef USE_LOCALE_MESSAGES
6ebbc862 3499
52770946
KW
3500 /* If platform doesn't have messages category, we don't do any switching to
3501 * the C locale; we just use whatever strerror() returns */
3502
3503 errstr = savepv(Strerror(errnum));
3504
3505#else /* Has locale messages */
3506
3507 const bool within_locale_scope = IN_LC(LC_MESSAGES);
2c6ee1a7 3508
7aaa36b1
KW
3509# if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
3510
3511 /* This function is trivial if we have strerror_l() */
3512
3513 if (within_locale_scope) {
3514 errstr = strerror(errnum);
3515 }
3516 else {
3517 errstr = strerror_l(errnum, PL_C_locale_obj);
3518 }
3519
3520 errstr = savepv(errstr);
3521
3522# else /* Doesn't have strerror_l(). */
3523
3524# ifdef USE_POSIX_2008_LOCALE
3525
fcd0e682 3526 locale_t save_locale = NULL;
7aaa36b1
KW
3527
3528# else
3529
fcd0e682 3530 char * save_locale = NULL;
c9dda6da 3531 bool locale_is_C = FALSE;
2c6ee1a7 3532
6ebbc862
KW
3533 /* We have a critical section to prevent another thread from changing the
3534 * locale out from under us (or zapping the buffer returned from
3535 * setlocale() ) */
3536 LOCALE_LOCK;
3537
7aaa36b1 3538# endif
6ebbc862 3539
9c8a6dc2
KW
3540 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3541 "my_strerror called with errnum %d\n", errnum));
6ebbc862 3542 if (! within_locale_scope) {
c9dda6da 3543 errno = 0;
a0b53297 3544
f1d2176b 3545# ifdef USE_POSIX_2008_LOCALE /* Use the thread-safe locale functions */
6ebbc862 3546
9c8a6dc2
KW
3547 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3548 "Not within locale scope, about to call"
3549 " uselocale(0x%p)\n", PL_C_locale_obj));
6ebbc862 3550 save_locale = uselocale(PL_C_locale_obj);
c9dda6da
KW
3551 if (! save_locale) {
3552 DEBUG_L(PerlIO_printf(Perl_debug_log,
9c8a6dc2
KW
3553 "uselocale failed, errno=%d\n", errno));
3554 }
3555 else {
3556 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3557 "uselocale returned 0x%p\n", save_locale));
c9dda6da 3558 }
6ebbc862 3559
7aaa36b1 3560# else /* Not thread-safe build */
a0b53297
KW
3561
3562 save_locale = setlocale(LC_MESSAGES, NULL);
c9dda6da
KW
3563 if (! save_locale) {
3564 DEBUG_L(PerlIO_printf(Perl_debug_log,
3565 "setlocale failed, errno=%d\n", errno));
3566 }
3567 else {
3568 locale_is_C = isNAME_C_OR_POSIX(save_locale);
2c6ee1a7 3569
c9dda6da
KW
3570 /* Switch to the C locale if not already in it */
3571 if (! locale_is_C) {
2c6ee1a7 3572
c9dda6da
KW
3573 /* The setlocale() just below likely will zap 'save_locale', so
3574 * create a copy. */
3575 save_locale = savepv(save_locale);
3576 setlocale(LC_MESSAGES, "C");
3577 }
6ebbc862 3578 }
2c6ee1a7 3579
7aaa36b1 3580# endif
2c6ee1a7 3581
6ebbc862 3582 } /* end of ! within_locale_scope */
9c8a6dc2
KW
3583 else {
3584 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n",
3585 __FILE__, __LINE__));
3586 }
a0b53297 3587
9c8a6dc2
KW
3588 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3589 "Any locale change has been done; about to call Strerror\n"));
52770946 3590 errstr = savepv(Strerror(errnum));
6ebbc862
KW
3591
3592 if (! within_locale_scope) {
c9dda6da 3593 errno = 0;
a0b53297 3594
f1d2176b 3595# ifdef USE_POSIX_2008_LOCALE
6ebbc862 3596
9c8a6dc2
KW
3597 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3598 "%s: %d: not within locale scope, restoring the locale\n",
3599 __FILE__, __LINE__));
c9dda6da
KW
3600 if (save_locale && ! uselocale(save_locale)) {
3601 DEBUG_L(PerlIO_printf(Perl_debug_log,
3602 "uselocale restore failed, errno=%d\n", errno));
3603 }
2c6ee1a7 3604 }
6ebbc862 3605
7aaa36b1 3606# else
6ebbc862 3607
c9dda6da
KW
3608 if (save_locale && ! locale_is_C) {
3609 if (! setlocale(LC_MESSAGES, save_locale)) {
3610 DEBUG_L(PerlIO_printf(Perl_debug_log,
3611 "setlocale restore failed, errno=%d\n", errno));
3612 }
6ebbc862
KW
3613 Safefree(save_locale);
3614 }
3615 }
3616
3617 LOCALE_UNLOCK;
3618
7aaa36b1
KW
3619# endif
3620# endif /* End of doesn't have strerror_l */
52770946 3621#endif /* End of does have locale messages */
6affbbf0
KW
3622
3623#ifdef DEBUGGING
3624
3625 if (DEBUG_Lv_TEST) {
3626 PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '");
3627 print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);
3628 PerlIO_printf(Perl_debug_log, "'\n");
3629 }
3630
2c6ee1a7
KW
3631#endif
3632
52770946 3633 SAVEFREEPV(errstr);
6ebbc862 3634 return errstr;
2c6ee1a7
KW
3635}
3636
66610fdd 3637/*
747c467a 3638
747c467a
KW
3639=for apidoc sync_locale
3640
3641Changing the program's locale should be avoided by XS code. Nevertheless,
3642certain non-Perl libraries called from XS, such as C<Gtk> do so. When this
3643happens, Perl needs to be told that the locale has changed. Use this function
3644to do so, before returning to Perl.
3645
3646=cut
3647*/
3648
3649void
3650Perl_sync_locale(pTHX)
3651{
3652
3653#ifdef USE_LOCALE_CTYPE
7d4bcc4a 3654
747c467a 3655 new_ctype(setlocale(LC_CTYPE, NULL));
747c467a 3656
7d4bcc4a 3657#endif /* USE_LOCALE_CTYPE */
747c467a 3658#ifdef USE_LOCALE_COLLATE
7d4bcc4a 3659
747c467a 3660 new_collate(setlocale(LC_COLLATE, NULL));
747c467a 3661
7d4bcc4a 3662#endif
747c467a 3663#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 3664
747c467a
KW
3665 set_numeric_local(); /* Switch from "C" to underlying LC_NUMERIC */
3666 new_numeric(setlocale(LC_NUMERIC, NULL));
7d4bcc4a 3667
747c467a
KW
3668#endif /* USE_LOCALE_NUMERIC */
3669
3670}
3671
5d1187d1
KW
3672#if defined(DEBUGGING) && defined(USE_LOCALE)
3673
a4f00dcc
KW
3674STATIC char *
3675S_setlocale_debug_string(const int category, /* category number,
5d1187d1
KW
3676 like LC_ALL */
3677 const char* const locale, /* locale name */
3678
3679 /* return value from setlocale() when attempting to
3680 * set 'category' to 'locale' */
3681 const char* const retval)
3682{
3683 /* Returns a pointer to a NUL-terminated string in static storage with
3684 * added text about the info passed in. This is not thread safe and will
3685 * be overwritten by the next call, so this should be used just to
fa07b8e5 3686 * formulate a string to immediately print or savepv() on. */
5d1187d1 3687
398a990f
DM
3688 /* initialise to a non-null value to keep it out of BSS and so keep
3689 * -DPERL_GLOBAL_STRUCT_PRIVATE happy */
60b45a7d
KW
3690 static char ret[128] = "If you can read this, thank your buggy C"
3691 " library strlcpy(), and change your hints file"
3692 " to undef it";
fa07b8e5 3693 my_strlcpy(ret, "setlocale(", sizeof(ret));
5d1187d1
KW
3694
3695 switch (category) {
3696 default:
fa07b8e5 3697 my_snprintf(ret, sizeof(ret), "%s? %d", ret, category);
5d1187d1 3698 break;
7d4bcc4a
KW
3699
3700# ifdef LC_ALL
3701
5d1187d1 3702 case LC_ALL:
fa07b8e5 3703 my_strlcat(ret, "LC_ALL", sizeof(ret));
5d1187d1 3704 break;
7d4bcc4a
KW
3705
3706# endif
3707# ifdef LC_CTYPE
3708
5d1187d1 3709 case LC_CTYPE:
fa07b8e5 3710 my_strlcat(ret, "LC_CTYPE", sizeof(ret));
5d1187d1 3711 break;
7d4bcc4a
KW
3712
3713# endif
3714# ifdef LC_NUMERIC
3715
5d1187d1 3716 case LC_NUMERIC:
fa07b8e5 3717 my_strlcat(ret, "LC_NUMERIC", sizeof(ret));
5d1187d1 3718 break;
7d4bcc4a
KW
3719
3720# endif
3721# ifdef LC_COLLATE
3722
5d1187d1 3723 case LC_COLLATE:
fa07b8e5 3724 my_strlcat(ret, "LC_COLLATE", sizeof(ret));
5d1187d1 3725 break;
7d4bcc4a
KW
3726
3727# endif
3728# ifdef LC_TIME
3729
5d1187d1 3730 case LC_TIME:
fa07b8e5 3731 my_strlcat(ret, "LC_TIME", sizeof(ret));
5d1187d1 3732 break;
7d4bcc4a
KW
3733
3734# endif
3735# ifdef LC_MONETARY
3736
5d1187d1 3737 case LC_MONETARY:
fa07b8e5 3738 my_strlcat(ret, "LC_MONETARY", sizeof(ret));
5d1187d1 3739 break;
7d4bcc4a
KW
3740
3741# endif
3742# ifdef LC_MESSAGES
3743
5d1187d1 3744 case LC_MESSAGES:
fa07b8e5 3745 my_strlcat(ret, "LC_MESSAGES", sizeof(ret));
5d1187d1 3746 break;
7d4bcc4a
KW
3747
3748# endif
3749
5d1187d1
KW
3750 }
3751
fa07b8e5 3752 my_strlcat(ret, ", ", sizeof(ret));
5d1187d1
KW
3753
3754 if (locale) {
fa07b8e5
KW
3755 my_strlcat(ret, "\"", sizeof(ret));
3756 my_strlcat(ret, locale, sizeof(ret));
3757 my_strlcat(ret, "\"", sizeof(ret));
5d1187d1
KW
3758 }
3759 else {
fa07b8e5 3760 my_strlcat(ret, "NULL", sizeof(ret));
5d1187d1
KW
3761 }
3762
fa07b8e5 3763 my_strlcat(ret, ") returned ", sizeof(ret));
5d1187d1
KW