This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
use OPpAVHVSWITCH_MASK
[perl5.git] / locale.c
CommitLineData
98994639
HS
1/* locale.c
2 *
1129b882
NC
3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
98994639
HS
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550 12 * A Elbereth Gilthoniel,
cdad3b53 13 * silivren penna míriel
4ac71550 14 * o menel aglar elenath!
cdad3b53 15 * Na-chaered palan-díriel
4ac71550
TC
16 * o galadhremmin ennorath,
17 * Fanuilos, le linnathon
18 * nef aear, si nef aearon!
19 *
20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
98994639
HS
21 */
22
166f8a29
DM
23/* utility functions for handling locale-specific stuff like what
24 * character represents the decimal point.
0d071d52
KW
25 *
26 * All C programs have an underlying locale. Perl generally doesn't pay any
27 * attention to it except within the scope of a 'use locale'. For most
28 * categories, it accomplishes this by just using different operations if it is
29 * in such scope than if not. However, various libc functions called by Perl
30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31 * are used to toggle between the current locale and the C locale depending on
a9ad02a8
KW
32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is
33 * switched to the C locale for outputting the message unless within the scope
34 * of 'use locale'.
166f8a29
DM
35 */
36
98994639
HS
37#include "EXTERN.h"
38#define PERL_IN_LOCALE_C
39#include "perl.h"
40
b310b053
JH
41#ifdef I_LANGINFO
42# include <langinfo.h>
43#endif
44
a4af207c
JH
45#include "reentr.h"
46
2fcc0ca9
KW
47/* If the environment says to, we can output debugging information during
48 * initialization. This is done before option parsing, and before any thread
49 * creation, so can be a file-level static */
50#ifdef DEBUGGING
27cdc72e
DM
51# ifdef PERL_GLOBAL_STRUCT
52 /* no global syms allowed */
53# define debug_initialization 0
54# define DEBUG_INITIALIZATION_set(v)
55# else
2fcc0ca9 56static bool debug_initialization = FALSE;
27cdc72e
DM
57# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
58# endif
2fcc0ca9
KW
59#endif
60
8ef6e574
KW
61#ifdef USE_LOCALE
62
98994639 63/*
0d071d52
KW
64 * Standardize the locale name from a string returned by 'setlocale', possibly
65 * modifying that string.
98994639 66 *
0ef2a2b2 67 * The typical return value of setlocale() is either
98994639
HS
68 * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
69 * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
70 * (the space-separated values represent the various sublocales,
0ef2a2b2 71 * in some unspecified order). This is not handled by this function.
98994639
HS
72 *
73 * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
0ef2a2b2
KW
74 * which is harmful for further use of the string in setlocale(). This
75 * function removes the trailing new line and everything up through the '='
98994639
HS
76 *
77 */
78STATIC char *
79S_stdize_locale(pTHX_ char *locs)
80{
7452cf6a 81 const char * const s = strchr(locs, '=');
98994639
HS
82 bool okay = TRUE;
83
7918f24d
NC
84 PERL_ARGS_ASSERT_STDIZE_LOCALE;
85
8772537c
AL
86 if (s) {
87 const char * const t = strchr(s, '.');
98994639 88 okay = FALSE;
8772537c
AL
89 if (t) {
90 const char * const u = strchr(t, '\n');
91 if (u && (u[1] == 0)) {
92 const STRLEN len = u - s;
93 Move(s + 1, locs, len, char);
94 locs[len] = 0;
95 okay = TRUE;
98994639
HS
96 }
97 }
98 }
99
100 if (!okay)
101 Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
102
103 return locs;
104}
105
8ef6e574
KW
106#endif
107
a4f00dcc
KW
108STATIC void
109S_set_numeric_radix(pTHX)
98994639
HS
110{
111#ifdef USE_LOCALE_NUMERIC
112# ifdef HAS_LOCALECONV
7452cf6a 113 const struct lconv* const lc = localeconv();
98994639 114
98994639
HS
115 if (lc && lc->decimal_point) {
116 if (lc->decimal_point[0] == '.' && lc->decimal_point[1] == 0) {
117 SvREFCNT_dec(PL_numeric_radix_sv);
a0714e2c 118 PL_numeric_radix_sv = NULL;
98994639
HS
119 }
120 else {
121 if (PL_numeric_radix_sv)
122 sv_setpv(PL_numeric_radix_sv, lc->decimal_point);
123 else
124 PL_numeric_radix_sv = newSVpv(lc->decimal_point, 0);
c5f058df 125 if (! is_utf8_invariant_string((U8 *) lc->decimal_point, 0)
28acfe03 126 && is_utf8_string((U8 *) lc->decimal_point, 0)
c1284011 127 && _is_cur_LC_category_utf8(LC_NUMERIC))
28acfe03
KW
128 {
129 SvUTF8_on(PL_numeric_radix_sv);
130 }
98994639
HS
131 }
132 }
133 else
a0714e2c 134 PL_numeric_radix_sv = NULL;
69014004 135
2fcc0ca9
KW
136#ifdef DEBUGGING
137 if (DEBUG_L_TEST || debug_initialization) {
138 PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
69014004 139 (PL_numeric_radix_sv)
37b7e435
KW
140 ? SvPVX(PL_numeric_radix_sv)
141 : "NULL",
142 (PL_numeric_radix_sv)
39eb7305 143 ? cBOOL(SvUTF8(PL_numeric_radix_sv))
2fcc0ca9
KW
144 : 0);
145 }
146#endif
69014004 147
98994639
HS
148# endif /* HAS_LOCALECONV */
149#endif /* USE_LOCALE_NUMERIC */
150}
151
a39edc4c
KW
152/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
153 * return of setlocale(), then this is extremely likely to be the C or POSIX
154 * locale. However, the output of setlocale() is documented to be opaque, but
155 * the odds are extremely small that it would return these two strings for some
156 * other locale. Note that VMS in these two locales includes many non-ASCII
157 * characters as controls and punctuation (below are hex bytes):
158 * cntrl: 00-1F 7F 84-97 9B-9F
159 * punct: 21-2F 3A-40 5B-60 7B-7E A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
160 * Oddly, none there are listed as alphas, though some represent alphabetics
161 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
98b630b3
KW
162#define isNAME_C_OR_POSIX(name) ((name) != NULL \
163 && ((*(name) == 'C' && (*(name + 1)) == '\0') \
a39edc4c
KW
164 || strEQ((name), "POSIX")))
165
98994639 166void
8772537c 167Perl_new_numeric(pTHX_ const char *newnum)
98994639
HS
168{
169#ifdef USE_LOCALE_NUMERIC
0d071d52
KW
170
171 /* Called after all libc setlocale() calls affecting LC_NUMERIC, to tell
172 * core Perl this and that 'newnum' is the name of the new locale.
173 * It installs this locale as the current underlying default.
174 *
175 * The default locale and the C locale can be toggled between by use of the
176 * set_numeric_local() and set_numeric_standard() functions, which should
177 * probably not be called directly, but only via macros like
178 * SET_NUMERIC_STANDARD() in perl.h.
179 *
180 * The toggling is necessary mainly so that a non-dot radix decimal point
181 * character can be output, while allowing internal calculations to use a
182 * dot.
183 *
184 * This sets several interpreter-level variables:
bb304765 185 * PL_numeric_name The underlying locale's name: a copy of 'newnum'
0d071d52 186 * PL_numeric_local A boolean indicating if the toggled state is such
7738054c
KW
187 * that the current locale is the program's underlying
188 * locale
189 * PL_numeric_standard An int indicating if the toggled state is such
190 * that the current locale is the C locale. If non-zero,
191 * it is in C; if > 1, it means it may not be toggled away
192 * from C.
0d071d52
KW
193 * Note that both of the last two variables can be true at the same time,
194 * if the underlying locale is C. (Toggling is a no-op under these
195 * circumstances.)
196 *
197 * Any code changing the locale (outside this file) should use
198 * POSIX::setlocale, which calls this function. Therefore this function
199 * should be called directly only from this file and from
200 * POSIX::setlocale() */
201
b03f34cf 202 char *save_newnum;
98994639
HS
203
204 if (! newnum) {
43c5f42d
NC
205 Safefree(PL_numeric_name);
206 PL_numeric_name = NULL;
98994639
HS
207 PL_numeric_standard = TRUE;
208 PL_numeric_local = TRUE;
209 return;
210 }
211
b03f34cf 212 save_newnum = stdize_locale(savepv(newnum));
abe1abcf
KW
213
214 PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
215 PL_numeric_local = TRUE;
216
b03f34cf 217 if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
98994639 218 Safefree(PL_numeric_name);
b03f34cf 219 PL_numeric_name = save_newnum;
b03f34cf 220 }
abe1abcf
KW
221 else {
222 Safefree(save_newnum);
223 }
4c28b29c
KW
224
225 /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't
226 * have to worry about the radix being a non-dot. (Core operations that
227 * need the underlying locale change to it temporarily). */
228 set_numeric_standard();
229
e19f01cb 230 set_numeric_radix();
6959d69d 231
f2ce9e1c
JH
232#else
233 PERL_UNUSED_ARG(newnum);
98994639
HS
234#endif /* USE_LOCALE_NUMERIC */
235}
236
237void
238Perl_set_numeric_standard(pTHX)
239{
240#ifdef USE_LOCALE_NUMERIC
28c1bf33
KW
241 /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like
242 * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The
243 * macro avoids calling this routine if toggling isn't necessary according
244 * to our records (which could be wrong if some XS code has changed the
245 * locale behind our back) */
0d071d52 246
a9b8c0d8
KW
247 setlocale(LC_NUMERIC, "C");
248 PL_numeric_standard = TRUE;
249 PL_numeric_local = isNAME_C_OR_POSIX(PL_numeric_name);
250 set_numeric_radix();
2fcc0ca9
KW
251#ifdef DEBUGGING
252 if (DEBUG_L_TEST || debug_initialization) {
253 PerlIO_printf(Perl_debug_log,
254 "Underlying LC_NUMERIC locale now is C\n");
255 }
256#endif
98994639
HS
257
258#endif /* USE_LOCALE_NUMERIC */
259}
260
261void
262Perl_set_numeric_local(pTHX)
263{
264#ifdef USE_LOCALE_NUMERIC
28c1bf33
KW
265 /* Toggle the LC_NUMERIC locale to the current underlying default. Most
266 * code should use the macros like SET_NUMERIC_LOCAL() in perl.h instead of
267 * calling this directly. The macro avoids calling this routine if
268 * toggling isn't necessary according to our records (which could be wrong
269 * if some XS code has changed the locale behind our back) */
a9b8c0d8
KW
270
271 setlocale(LC_NUMERIC, PL_numeric_name);
272 PL_numeric_standard = isNAME_C_OR_POSIX(PL_numeric_name);
273 PL_numeric_local = TRUE;
274 set_numeric_radix();
2fcc0ca9
KW
275#ifdef DEBUGGING
276 if (DEBUG_L_TEST || debug_initialization) {
277 PerlIO_printf(Perl_debug_log,
69014004 278 "Underlying LC_NUMERIC locale now is %s\n",
2fcc0ca9
KW
279 PL_numeric_name);
280 }
281#endif
98994639
HS
282
283#endif /* USE_LOCALE_NUMERIC */
284}
285
286/*
287 * Set up for a new ctype locale.
288 */
a4f00dcc
KW
289STATIC void
290S_new_ctype(pTHX_ const char *newctype)
98994639
HS
291{
292#ifdef USE_LOCALE_CTYPE
0d071d52
KW
293
294 /* Called after all libc setlocale() calls affecting LC_CTYPE, to tell
295 * core Perl this and that 'newctype' is the name of the new locale.
296 *
297 * This function sets up the folding arrays for all 256 bytes, assuming
298 * that tofold() is tolc() since fold case is not a concept in POSIX,
299 *
300 * Any code changing the locale (outside this file) should use
301 * POSIX::setlocale, which calls this function. Therefore this function
302 * should be called directly only from this file and from
303 * POSIX::setlocale() */
304
27da23d5 305 dVAR;
68067e4e 306 UV i;
98994639 307
7918f24d
NC
308 PERL_ARGS_ASSERT_NEW_CTYPE;
309
215c5139
KW
310 /* We will replace any bad locale warning with 1) nothing if the new one is
311 * ok; or 2) a new warning for the bad new locale */
312 if (PL_warn_locale) {
313 SvREFCNT_dec_NN(PL_warn_locale);
314 PL_warn_locale = NULL;
315 }
316
c1284011 317 PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
31f05a37
KW
318
319 /* A UTF-8 locale gets standard rules. But note that code still has to
320 * handle this specially because of the three problematic code points */
321 if (PL_in_utf8_CTYPE_locale) {
322 Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
323 }
324 else {
8c6180a9
KW
325 /* Assume enough space for every character being bad. 4 spaces each
326 * for the 94 printable characters that are output like "'x' "; and 5
327 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
328 * NUL */
329 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ];
330
cc9eaeb0
KW
331 /* Don't check for problems if we are suppressing the warnings */
332 bool check_for_problems = ckWARN_d(WARN_LOCALE)
333 || UNLIKELY(DEBUG_L_TEST);
8c6180a9
KW
334 bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
335 to start */
336 unsigned int bad_count = 0; /* Count of bad characters */
337
baa60164
KW
338 for (i = 0; i < 256; i++) {
339 if (isUPPER_LC((U8) i))
340 PL_fold_locale[i] = (U8) toLOWER_LC((U8) i);
341 else if (isLOWER_LC((U8) i))
342 PL_fold_locale[i] = (U8) toUPPER_LC((U8) i);
343 else
344 PL_fold_locale[i] = (U8) i;
8c6180a9
KW
345
346 /* If checking for locale problems, see if the native ASCII-range
347 * printables plus \n and \t are in their expected categories in
348 * the new locale. If not, this could mean big trouble, upending
349 * Perl's and most programs' assumptions, like having a
350 * metacharacter with special meaning become a \w. Fortunately,
351 * it's very rare to find locales that aren't supersets of ASCII
352 * nowadays. It isn't a problem for most controls to be changed
353 * into something else; we check only \n and \t, though perhaps \r
354 * could be an issue as well. */
355 if (check_for_problems
356 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
357 {
358 if ((isALPHANUMERIC_A(i) && ! isALPHANUMERIC_LC(i))
359 || (isPUNCT_A(i) && ! isPUNCT_LC(i))
360 || (isBLANK_A(i) && ! isBLANK_LC(i))
361 || (i == '\n' && ! isCNTRL_LC(i)))
362 {
363 if (bad_count) { /* Separate multiple entries with a
364 blank */
365 bad_chars_list[bad_count++] = ' ';
366 }
367 bad_chars_list[bad_count++] = '\'';
368 if (isPRINT_A(i)) {
369 bad_chars_list[bad_count++] = (char) i;
370 }
371 else {
372 bad_chars_list[bad_count++] = '\\';
373 if (i == '\n') {
374 bad_chars_list[bad_count++] = 'n';
375 }
376 else {
377 assert(i == '\t');
378 bad_chars_list[bad_count++] = 't';
379 }
380 }
381 bad_chars_list[bad_count++] = '\'';
382 bad_chars_list[bad_count] = '\0';
383 }
384 }
385 }
386
387#ifdef MB_CUR_MAX
388 /* We only handle single-byte locales (outside of UTF-8 ones; so if
d35fca5f 389 * this locale requires more than one byte, there are going to be
8c6180a9 390 * problems. */
9c8a6dc2
KW
391 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
392 "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
393 __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
394
ba1a4362
KW
395 if (check_for_problems && MB_CUR_MAX > 1
396
397 /* Some platforms return MB_CUR_MAX > 1 for even the "C"
398 * locale. Just assume that the implementation for them (plus
399 * for POSIX) is correct and the > 1 value is spurious. (Since
400 * these are specially handled to never be considered UTF-8
401 * locales, as long as this is the only problem, everything
402 * should work fine */
403 && strNE(newctype, "C") && strNE(newctype, "POSIX"))
404 {
8c6180a9
KW
405 multi_byte_locale = TRUE;
406 }
407#endif
408
409 if (bad_count || multi_byte_locale) {
780fcc9f 410 PL_warn_locale = Perl_newSVpvf(aTHX_
8c6180a9 411 "Locale '%s' may not work well.%s%s%s\n",
780fcc9f 412 newctype,
8c6180a9
KW
413 (multi_byte_locale)
414 ? " Some characters in it are not recognized by"
415 " Perl."
416 : "",
417 (bad_count)
418 ? "\nThe following characters (and maybe others)"
419 " may not have the same meaning as the Perl"
420 " program expects:\n"
421 : "",
422 (bad_count)
423 ? bad_chars_list
424 : ""
425 );
cc9eaeb0 426 /* If we are actually in the scope of the locale or are debugging,
bddebb56
KW
427 * output the message now. If not in that scope, we save the
428 * message to be output at the first operation using this locale,
429 * if that actually happens. Most programs don't use locales, so
430 * they are immune to bad ones. */
cc9eaeb0 431 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
780fcc9f
KW
432
433 /* We have to save 'newctype' because the setlocale() just
434 * below may destroy it. The next setlocale() further down
435 * should restore it properly so that the intermediate change
436 * here is transparent to this function's caller */
437 const char * const badlocale = savepv(newctype);
438
439 setlocale(LC_CTYPE, "C");
440
441 /* The '0' below suppresses a bogus gcc compiler warning */
442 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
bddebb56 443
780fcc9f 444 setlocale(LC_CTYPE, badlocale);
c0f3a893 445 Safefree(badlocale);
bddebb56
KW
446
447 if (IN_LC(LC_CTYPE)) {
448 SvREFCNT_dec_NN(PL_warn_locale);
449 PL_warn_locale = NULL;
450 }
780fcc9f 451 }
baa60164 452 }
31f05a37 453 }
98994639
HS
454
455#endif /* USE_LOCALE_CTYPE */
7918f24d 456 PERL_ARGS_ASSERT_NEW_CTYPE;
8772537c 457 PERL_UNUSED_ARG(newctype);
96a5add6 458 PERL_UNUSED_CONTEXT;
98994639
HS
459}
460
98994639 461void
2726666d
KW
462Perl__warn_problematic_locale()
463{
2726666d
KW
464
465#ifdef USE_LOCALE_CTYPE
466
5f04a188
KW
467 dTHX;
468
469 /* Internal-to-core function that outputs the message in PL_warn_locale,
470 * and then NULLS it. Should be called only through the macro
471 * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */
472
2726666d
KW
473 if (PL_warn_locale) {
474 /*GCC_DIAG_IGNORE(-Wformat-security); Didn't work */
475 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
476 SvPVX(PL_warn_locale),
477 0 /* dummy to avoid compiler warning */ );
478 /* GCC_DIAG_RESTORE; */
479 SvREFCNT_dec_NN(PL_warn_locale);
480 PL_warn_locale = NULL;
481 }
482
483#endif
484
485}
486
a4f00dcc
KW
487STATIC void
488S_new_collate(pTHX_ const char *newcoll)
98994639
HS
489{
490#ifdef USE_LOCALE_COLLATE
0d071d52
KW
491
492 /* Called after all libc setlocale() calls affecting LC_COLLATE, to tell
493 * core Perl this and that 'newcoll' is the name of the new locale.
494 *
495 * Any code changing the locale (outside this file) should use
496 * POSIX::setlocale, which calls this function. Therefore this function
497 * should be called directly only from this file and from
d35fca5f
KW
498 * POSIX::setlocale().
499 *
500 * The design of locale collation is that every locale change is given an
501 * index 'PL_collation_ix'. The first time a string particpates in an
502 * operation that requires collation while locale collation is active, it
503 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
504 * magic includes the collation index, and the transformation of the string
505 * by strxfrm(), q.v. That transformation is used when doing comparisons,
506 * instead of the string itself. If a string changes, the magic is
507 * cleared. The next time the locale changes, the index is incremented,
508 * and so we know during a comparison that the transformation is not
509 * necessarily still valid, and so is recomputed. Note that if the locale
510 * changes enough times, the index could wrap (a U32), and it is possible
511 * that a transformation would improperly be considered valid, leading to
512 * an unlikely bug */
0d071d52 513
98994639
HS
514 if (! newcoll) {
515 if (PL_collation_name) {
516 ++PL_collation_ix;
517 Safefree(PL_collation_name);
518 PL_collation_name = NULL;
519 }
520 PL_collation_standard = TRUE;
00bf60ca 521 is_standard_collation:
98994639
HS
522 PL_collxfrm_base = 0;
523 PL_collxfrm_mult = 2;
165a1c52 524 PL_in_utf8_COLLATE_locale = FALSE;
f28f4d2a 525 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 526 PL_strxfrm_max_cp = 0;
98994639
HS
527 return;
528 }
529
d35fca5f 530 /* If this is not the same locale as currently, set the new one up */
98994639
HS
531 if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
532 ++PL_collation_ix;
533 Safefree(PL_collation_name);
534 PL_collation_name = stdize_locale(savepv(newcoll));
a39edc4c 535 PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
00bf60ca
KW
536 if (PL_collation_standard) {
537 goto is_standard_collation;
538 }
98994639 539
165a1c52 540 PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
f28f4d2a 541 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 542 PL_strxfrm_max_cp = 0;
165a1c52 543
59c018b9
KW
544 /* A locale collation definition includes primary, secondary, tertiary,
545 * etc. weights for each character. To sort, the primary weights are
546 * used, and only if they compare equal, then the secondary weights are
547 * used, and only if they compare equal, then the tertiary, etc.
548 *
549 * strxfrm() works by taking the input string, say ABC, and creating an
550 * output transformed string consisting of first the primary weights,
551 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
552 * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
553 * may not have weights at every level. In our example, let's say B
554 * doesn't have a tertiary weight, and A doesn't have a secondary
555 * weight. The constructed string is then going to be
556 * A¹B¹C¹ B²C² A³C³ ....
557 * This has the desired effect that strcmp() will look at the secondary
558 * or tertiary weights only if the strings compare equal at all higher
559 * priority weights. The spaces shown here, like in
c342d20e 560 * "A¹B¹C¹ A²B²C² "
59c018b9
KW
561 * are not just for readability. In the general case, these must
562 * actually be bytes, which we will call here 'separator weights'; and
563 * they must be smaller than any other weight value, but since these
564 * are C strings, only the terminating one can be a NUL (some
565 * implementations may include a non-NUL separator weight just before
566 * the NUL). Implementations tend to reserve 01 for the separator
567 * weights. They are needed so that a shorter string's secondary
568 * weights won't be misconstrued as primary weights of a longer string,
569 * etc. By making them smaller than any other weight, the shorter
570 * string will sort first. (Actually, if all secondary weights are
571 * smaller than all primary ones, there is no need for a separator
572 * weight between those two levels, etc.)
573 *
574 * The length of the transformed string is roughly a linear function of
575 * the input string. It's not exactly linear because some characters
576 * don't have weights at all levels. When we call strxfrm() we have to
577 * allocate some memory to hold the transformed string. The
578 * calculations below try to find coefficients 'm' and 'b' for this
579 * locale so that m*x + b equals how much space we need, given the size
580 * of the input string in 'x'. If we calculate too small, we increase
581 * the size as needed, and call strxfrm() again, but it is better to
582 * get it right the first time to avoid wasted expensive string
583 * transformations. */
584
98994639 585 {
79f120c8
KW
586 /* We use the string below to find how long the tranformation of it
587 * is. Almost all locales are supersets of ASCII, or at least the
588 * ASCII letters. We use all of them, half upper half lower,
589 * because if we used fewer, we might hit just the ones that are
590 * outliers in a particular locale. Most of the strings being
591 * collated will contain a preponderance of letters, and even if
592 * they are above-ASCII, they are likely to have the same number of
593 * weight levels as the ASCII ones. It turns out that digits tend
594 * to have fewer levels, and some punctuation has more, but those
595 * are relatively sparse in text, and khw believes this gives a
596 * reasonable result, but it could be changed if experience so
597 * dictates. */
598 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
599 char * x_longer; /* Transformed 'longer' */
600 Size_t x_len_longer; /* Length of 'x_longer' */
601
602 char * x_shorter; /* We also transform a substring of 'longer' */
603 Size_t x_len_shorter;
604
a4a439fb 605 /* _mem_collxfrm() is used get the transformation (though here we
79f120c8
KW
606 * are interested only in its length). It is used because it has
607 * the intelligence to handle all cases, but to work, it needs some
608 * values of 'm' and 'b' to get it started. For the purposes of
609 * this calculation we use a very conservative estimate of 'm' and
610 * 'b'. This assumes a weight can be multiple bytes, enough to
611 * hold any UV on the platform, and there are 5 levels, 4 weight
612 * bytes, and a trailing NUL. */
613 PL_collxfrm_base = 5;
614 PL_collxfrm_mult = 5 * sizeof(UV);
615
616 /* Find out how long the transformation really is */
a4a439fb
KW
617 x_longer = _mem_collxfrm(longer,
618 sizeof(longer) - 1,
619 &x_len_longer,
620
621 /* We avoid converting to UTF-8 in the
622 * called function by telling it the
623 * string is in UTF-8 if the locale is a
624 * UTF-8 one. Since the string passed
625 * here is invariant under UTF-8, we can
626 * claim it's UTF-8 even though it isn't.
627 * */
628 PL_in_utf8_COLLATE_locale);
79f120c8
KW
629 Safefree(x_longer);
630
631 /* Find out how long the transformation of a substring of 'longer'
632 * is. Together the lengths of these transformations are
633 * sufficient to calculate 'm' and 'b'. The substring is all of
634 * 'longer' except the first character. This minimizes the chances
635 * of being swayed by outliers */
a4a439fb 636 x_shorter = _mem_collxfrm(longer + 1,
79f120c8 637 sizeof(longer) - 2,
a4a439fb
KW
638 &x_len_shorter,
639 PL_in_utf8_COLLATE_locale);
79f120c8
KW
640 Safefree(x_shorter);
641
642 /* If the results are nonsensical for this simple test, the whole
643 * locale definition is suspect. Mark it so that locale collation
644 * is not active at all for it. XXX Should we warn? */
645 if ( x_len_shorter == 0
646 || x_len_longer == 0
647 || x_len_shorter >= x_len_longer)
648 {
649 PL_collxfrm_mult = 0;
650 PL_collxfrm_base = 0;
651 }
652 else {
653 SSize_t base; /* Temporary */
654
655 /* We have both: m * strlen(longer) + b = x_len_longer
656 * m * strlen(shorter) + b = x_len_shorter;
657 * subtracting yields:
658 * m * (strlen(longer) - strlen(shorter))
659 * = x_len_longer - x_len_shorter
660 * But we have set things up so that 'shorter' is 1 byte smaller
661 * than 'longer'. Hence:
662 * m = x_len_longer - x_len_shorter
663 *
664 * But if something went wrong, make sure the multiplier is at
665 * least 1.
666 */
667 if (x_len_longer > x_len_shorter) {
668 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
669 }
670 else {
671 PL_collxfrm_mult = 1;
672 }
673
674 /* mx + b = len
675 * so: b = len - mx
676 * but in case something has gone wrong, make sure it is
677 * non-negative */
678 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
679 if (base < 0) {
680 base = 0;
681 }
682
683 /* Add 1 for the trailing NUL */
684 PL_collxfrm_base = base + 1;
685 }
58eebef2
KW
686
687#ifdef DEBUGGING
688 if (DEBUG_L_TEST || debug_initialization) {
689 PerlIO_printf(Perl_debug_log,
b07929e4
KW
690 "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
691 "x_len_longer=%zu,"
692 " collate multipler=%zu, collate base=%zu\n",
58eebef2
KW
693 __FILE__, __LINE__,
694 PL_in_utf8_COLLATE_locale,
695 x_len_shorter, x_len_longer,
696 PL_collxfrm_mult, PL_collxfrm_base);
697 }
698#endif
98994639
HS
699 }
700 }
701
f2ce9e1c
JH
702#else
703 PERL_UNUSED_ARG(newcoll);
98994639
HS
704#endif /* USE_LOCALE_COLLATE */
705}
706
a4f00dcc 707#ifndef WIN32 /* No wrapper except on Windows */
b385bb4d 708
a4f00dcc
KW
709#define my_setlocale(a,b) setlocale(a,b)
710
711#else /* WIN32 */
712
713STATIC char *
714S_my_setlocale(pTHX_ int category, const char* locale)
b385bb4d
KW
715{
716 /* This, for Windows, emulates POSIX setlocale() behavior. There is no
717 * difference unless the input locale is "", which means on Windows to get
718 * the machine default, which is set via the computer's "Regional and
719 * Language Options" (or its current equivalent). In POSIX, it instead
720 * means to find the locale from the user's environment. This routine
721 * looks in the environment, and, if anything is found, uses that instead
722 * of going to the machine default. If there is no environment override,
723 * the machine default is used, as normal, by calling the real setlocale()
724 * with "". The POSIX behavior is to use the LC_ALL variable if set;
725 * otherwise to use the particular category's variable if set; otherwise to
726 * use the LANG variable. */
727
175c4cf9 728 bool override_LC_ALL = FALSE;
89f7b9aa
KW
729 char * result;
730
b385bb4d
KW
731 if (locale && strEQ(locale, "")) {
732# ifdef LC_ALL
733 locale = PerlEnv_getenv("LC_ALL");
734 if (! locale) {
735#endif
736 switch (category) {
737# ifdef LC_ALL
738 case LC_ALL:
481465ea 739 override_LC_ALL = TRUE;
b385bb4d
KW
740 break; /* We already know its variable isn't set */
741# endif
742# ifdef USE_LOCALE_TIME
743 case LC_TIME:
744 locale = PerlEnv_getenv("LC_TIME");
745 break;
746# endif
747# ifdef USE_LOCALE_CTYPE
748 case LC_CTYPE:
749 locale = PerlEnv_getenv("LC_CTYPE");
750 break;
751# endif
752# ifdef USE_LOCALE_COLLATE
753 case LC_COLLATE:
754 locale = PerlEnv_getenv("LC_COLLATE");
755 break;
756# endif
757# ifdef USE_LOCALE_MONETARY
758 case LC_MONETARY:
759 locale = PerlEnv_getenv("LC_MONETARY");
760 break;
761# endif
762# ifdef USE_LOCALE_NUMERIC
763 case LC_NUMERIC:
764 locale = PerlEnv_getenv("LC_NUMERIC");
765 break;
766# endif
767# ifdef USE_LOCALE_MESSAGES
768 case LC_MESSAGES:
769 locale = PerlEnv_getenv("LC_MESSAGES");
770 break;
771# endif
772 default:
773 /* This is a category, like PAPER_SIZE that we don't
774 * know about; and so can't provide a wrapper. */
775 break;
776 }
777 if (! locale) {
778 locale = PerlEnv_getenv("LANG");
481465ea 779 if (! locale) {
b385bb4d
KW
780 locale = "";
781 }
782 }
783# ifdef LC_ALL
784 }
785# endif
786 }
787
89f7b9aa 788 result = setlocale(category, locale);
bbc98134 789 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
a4f00dcc 790 setlocale_debug_string(category, locale, result)));
89f7b9aa 791
481465ea 792 if (! override_LC_ALL) {
89f7b9aa
KW
793 return result;
794 }
795
dfd77d7a 796 /* Here the input category was LC_ALL, and we have set it to what is in the
481465ea
KW
797 * LANG variable or the system default if there is no LANG. But these have
798 * lower priority than the other LC_foo variables, so override it for each
799 * one that is set. (If they are set to "", it means to use the same thing
800 * we just set LC_ALL to, so can skip) */
89f7b9aa
KW
801# ifdef USE_LOCALE_TIME
802 result = PerlEnv_getenv("LC_TIME");
730252b2 803 if (result && strNE(result, "")) {
89f7b9aa 804 setlocale(LC_TIME, result);
bbc98134
KW
805 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
806 __FILE__, __LINE__,
a4f00dcc 807 setlocale_debug_string(LC_TIME, result, "not captured")));
89f7b9aa
KW
808 }
809# endif
810# ifdef USE_LOCALE_CTYPE
811 result = PerlEnv_getenv("LC_CTYPE");
730252b2 812 if (result && strNE(result, "")) {
89f7b9aa 813 setlocale(LC_CTYPE, result);
bbc98134
KW
814 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
815 __FILE__, __LINE__,
a4f00dcc 816 setlocale_debug_string(LC_CTYPE, result, "not captured")));
89f7b9aa
KW
817 }
818# endif
819# ifdef USE_LOCALE_COLLATE
820 result = PerlEnv_getenv("LC_COLLATE");
730252b2 821 if (result && strNE(result, "")) {
89f7b9aa 822 setlocale(LC_COLLATE, result);
bbc98134
KW
823 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
824 __FILE__, __LINE__,
a4f00dcc 825 setlocale_debug_string(LC_COLLATE, result, "not captured")));
89f7b9aa
KW
826 }
827# endif
828# ifdef USE_LOCALE_MONETARY
829 result = PerlEnv_getenv("LC_MONETARY");
730252b2 830 if (result && strNE(result, "")) {
89f7b9aa 831 setlocale(LC_MONETARY, result);
bbc98134
KW
832 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
833 __FILE__, __LINE__,
a4f00dcc 834 setlocale_debug_string(LC_MONETARY, result, "not captured")));
89f7b9aa
KW
835 }
836# endif
837# ifdef USE_LOCALE_NUMERIC
838 result = PerlEnv_getenv("LC_NUMERIC");
730252b2 839 if (result && strNE(result, "")) {
89f7b9aa 840 setlocale(LC_NUMERIC, result);
bbc98134
KW
841 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
842 __FILE__, __LINE__,
a4f00dcc 843 setlocale_debug_string(LC_NUMERIC, result, "not captured")));
89f7b9aa
KW
844 }
845# endif
846# ifdef USE_LOCALE_MESSAGES
847 result = PerlEnv_getenv("LC_MESSAGES");
730252b2 848 if (result && strNE(result, "")) {
89f7b9aa 849 setlocale(LC_MESSAGES, result);
bbc98134
KW
850 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
851 __FILE__, __LINE__,
a4f00dcc 852 setlocale_debug_string(LC_MESSAGES, result, "not captured")));
89f7b9aa
KW
853 }
854# endif
855
bbc98134
KW
856 result = setlocale(LC_ALL, NULL);
857 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
858 __FILE__, __LINE__,
a4f00dcc 859 setlocale_debug_string(LC_ALL, NULL, result)));
89f7b9aa 860
bbc98134 861 return result;
b385bb4d
KW
862}
863
864#endif
865
a4f00dcc
KW
866char *
867Perl_setlocale(int category, const char * locale)
868{
869 /* This wraps POSIX::setlocale() */
870
871 char * retval;
872 dTHX;
873
874
875#ifdef USE_LOCALE_NUMERIC
876
877 /* A NULL locale means only query what the current one is. We
878 * have the LC_NUMERIC name saved, because we are normally switched
879 * into the C locale for it. Switch back so an LC_ALL query will yield
880 * the correct results; all other categories don't require special
881 * handling */
882 if (locale == NULL) {
883 if (category == LC_NUMERIC) {
884 return savepv(PL_numeric_name);
885 }
886
887# ifdef LC_ALL
888
889 else if (category == LC_ALL) {
890 SET_NUMERIC_UNDERLYING();
891 }
892
893# endif
894
895 }
896
897#endif
898
899 retval = my_setlocale(category, locale);
900
901 DEBUG_L(PerlIO_printf(Perl_debug_log,
902 "%s:%d: %s\n", __FILE__, __LINE__,
903 setlocale_debug_string(category, locale, retval)));
904 if (! retval) {
905 /* Should never happen that a query would return an error, but be
906 * sure and reset to C locale */
907 if (locale == 0) {
908 SET_NUMERIC_STANDARD();
909 }
910 return NULL;
911 }
912
913 /* Save retval since subsequent setlocale() calls may overwrite it. */
914 retval = savepv(retval);
915
916 /* If locale == NULL, we are just querying the state, but may have switched
917 * to NUMERIC_UNDERLYING. Switch back before returning. */
918 if (locale == NULL) {
919 SET_NUMERIC_STANDARD();
920 return retval;
921 }
922 else { /* Now that have switched locales, we have to update our records to
923 correspond */
924
925#ifdef USE_LOCALE_CTYPE
926
927 if ( category == LC_CTYPE
928
929# ifdef LC_ALL
930
931 || category == LC_ALL
932
933# endif
934
935 )
936 {
937 char *newctype;
938
939# ifdef LC_ALL
940
941 if (category == LC_ALL) {
942 newctype = setlocale(LC_CTYPE, NULL);
943 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
944 "%s:%d: %s\n", __FILE__, __LINE__,
945 setlocale_debug_string(LC_CTYPE, NULL, newctype)));
946 }
947 else
948
949# endif
950
951 newctype = retval;
952 new_ctype(newctype);
953 }
954
955#endif /* USE_LOCALE_CTYPE */
956
957#ifdef USE_LOCALE_COLLATE
958
959 if ( category == LC_COLLATE
960
961# ifdef LC_ALL
962
963 || category == LC_ALL
964
965# endif
966
967 )
968 {
969 char *newcoll;
970
971# ifdef LC_ALL
972
973 if (category == LC_ALL) {
974 newcoll = setlocale(LC_COLLATE, NULL);
975 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
976 "%s:%d: %s\n", __FILE__, __LINE__,
977 setlocale_debug_string(LC_COLLATE, NULL, newcoll)));
978 }
979 else
980
981# endif
982
983 newcoll = retval;
984 new_collate(newcoll);
985 }
986
987#endif /* USE_LOCALE_COLLATE */
988
989#ifdef USE_LOCALE_NUMERIC
990
991 if ( category == LC_NUMERIC
992
993# ifdef LC_ALL
994
995 || category == LC_ALL
996
997# endif
998
999 )
1000 {
1001 char *newnum;
1002
1003# ifdef LC_ALL
1004
1005 if (category == LC_ALL) {
1006 newnum = setlocale(LC_NUMERIC, NULL);
1007 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1008 "%s:%d: %s\n", __FILE__, __LINE__,
1009 setlocale_debug_string(LC_NUMERIC, NULL, newnum)));
1010 }
1011 else
1012
1013# endif
1014
1015 newnum = retval;
1016 new_numeric(newnum);
1017 }
1018
1019#endif /* USE_LOCALE_NUMERIC */
1020
1021 }
1022
1023 return retval;
1024
1025}
b385bb4d 1026
98994639
HS
1027/*
1028 * Initialize locale awareness.
1029 */
1030int
1031Perl_init_i18nl10n(pTHX_ int printwarn)
1032{
0e92a118
KW
1033 /* printwarn is
1034 *
1035 * 0 if not to output warning when setup locale is bad
1036 * 1 if to output warning based on value of PERL_BADLANG
1037 * >1 if to output regardless of PERL_BADLANG
1038 *
1039 * returns
98994639 1040 * 1 = set ok or not applicable,
0e92a118
KW
1041 * 0 = fallback to a locale of lower priority
1042 * -1 = fallback to all locales failed, not even to the C locale
6b058d42
KW
1043 *
1044 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
1045 * set, debugging information is output.
1046 *
1047 * This looks more complicated than it is, mainly due to the #ifdefs.
1048 *
1049 * We try to set LC_ALL to the value determined by the environment. If
1050 * there is no LC_ALL on this platform, we try the individual categories we
1051 * know about. If this works, we are done.
1052 *
1053 * But if it doesn't work, we have to do something else. We search the
1054 * environment variables ourselves instead of relying on the system to do
1055 * it. We look at, in order, LC_ALL, LANG, a system default locale (if we
1056 * think there is one), and the ultimate fallback "C". This is all done in
1057 * the same loop as above to avoid duplicating code, but it makes things
1058 * more complex. After the original failure, we add the fallback
1059 * possibilities to the list of locales to try, and iterate the loop
1060 * through them all until one succeeds.
1061 *
1062 * On Ultrix, the locale MUST come from the environment, so there is
1063 * preliminary code to set it. I (khw) am not sure that it is necessary,
1064 * and that this couldn't be folded into the loop, but barring any real
1065 * platforms to test on, it's staying as-is
1066 *
1067 * A slight complication is that in embedded Perls, the locale may already
1068 * be set-up, and we don't want to get it from the normal environment
1069 * variables. This is handled by having a special environment variable
1070 * indicate we're in this situation. We simply set setlocale's 2nd
1071 * parameter to be a NULL instead of "". That indicates to setlocale that
1072 * it is not to change anything, but to return the current value,
1073 * effectively initializing perl's db to what the locale already is.
1074 *
1075 * We play the same trick with NULL if a LC_ALL succeeds. We call
1076 * setlocale() on the individual categores with NULL to get their existing
1077 * values for our db, instead of trying to change them.
1078 * */
98994639 1079
0e92a118
KW
1080 int ok = 1;
1081
98994639 1082#if defined(USE_LOCALE)
98994639
HS
1083#ifdef USE_LOCALE_CTYPE
1084 char *curctype = NULL;
1085#endif /* USE_LOCALE_CTYPE */
1086#ifdef USE_LOCALE_COLLATE
1087 char *curcoll = NULL;
1088#endif /* USE_LOCALE_COLLATE */
1089#ifdef USE_LOCALE_NUMERIC
1090 char *curnum = NULL;
1091#endif /* USE_LOCALE_NUMERIC */
1092#ifdef __GLIBC__
175c4cf9 1093 const char * const language = savepv(PerlEnv_getenv("LANGUAGE"));
98994639 1094#endif
65ebb059 1095
ccd65d51
KW
1096 /* NULL uses the existing already set up locale */
1097 const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
1098 ? NULL
1099 : "";
c3fcd832
KW
1100 const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */
1101 unsigned int trial_locales_count;
175c4cf9
KW
1102 const char * const lc_all = savepv(PerlEnv_getenv("LC_ALL"));
1103 const char * const lang = savepv(PerlEnv_getenv("LANG"));
98994639 1104 bool setlocale_failure = FALSE;
65ebb059
KW
1105 unsigned int i;
1106 char *p;
175c4cf9
KW
1107
1108 /* A later getenv() could zap this, so only use here */
1109 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
1110
1111 const bool locwarn = (printwarn > 1
1112 || (printwarn
1113 && (! bad_lang_use_once
22ff3130
HS
1114 || (
1115 /* disallow with "" or "0" */
1116 *bad_lang_use_once
1117 && strNE("0", bad_lang_use_once)))));
0e92a118 1118 bool done = FALSE;
5d1187d1
KW
1119 char * sl_result; /* return from setlocale() */
1120 char * locale_param;
6bce99ee
JH
1121#ifdef WIN32
1122 /* In some systems you can find out the system default locale
1123 * and use that as the fallback locale. */
1124# define SYSTEM_DEFAULT_LOCALE
1125#endif
1126#ifdef SYSTEM_DEFAULT_LOCALE
65ebb059 1127 const char *system_default_locale = NULL;
6bce99ee 1128#endif
98994639 1129
2fcc0ca9 1130#ifdef DEBUGGING
8298454c 1131 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
2fcc0ca9
KW
1132# define DEBUG_LOCALE_INIT(category, locale, result) \
1133 STMT_START { \
1134 if (debug_initialization) { \
1135 PerlIO_printf(Perl_debug_log, \
1136 "%s:%d: %s\n", \
1137 __FILE__, __LINE__, \
a4f00dcc 1138 setlocale_debug_string(category, \
2fcc0ca9
KW
1139 locale, \
1140 result)); \
1141 } \
1142 } STMT_END
1143#else
1144# define DEBUG_LOCALE_INIT(a,b,c)
1145#endif
1146
0e92a118
KW
1147#ifndef LOCALE_ENVIRON_REQUIRED
1148 PERL_UNUSED_VAR(done);
5d1187d1 1149 PERL_UNUSED_VAR(locale_param);
0e92a118 1150#else
98994639
HS
1151
1152 /*
1153 * Ultrix setlocale(..., "") fails if there are no environment
1154 * variables from which to get a locale name.
1155 */
1156
b3e384bf 1157# ifdef LC_ALL
98994639 1158 if (lang) {
5d1187d1
KW
1159 sl_result = my_setlocale(LC_ALL, setlocale_init);
1160 DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result);
1161 if (sl_result)
98994639
HS
1162 done = TRUE;
1163 else
1164 setlocale_failure = TRUE;
1165 }
5d1187d1 1166 if (! setlocale_failure) {
b3e384bf 1167# ifdef USE_LOCALE_CTYPE
5d1187d1
KW
1168 locale_param = (! done && (lang || PerlEnv_getenv("LC_CTYPE")))
1169 ? setlocale_init
1170 : NULL;
1171 curctype = my_setlocale(LC_CTYPE, locale_param);
1172 DEBUG_LOCALE_INIT(LC_CTYPE, locale_param, sl_result);
1173 if (! curctype)
98994639
HS
1174 setlocale_failure = TRUE;
1175 else
1176 curctype = savepv(curctype);
b3e384bf
KW
1177# endif /* USE_LOCALE_CTYPE */
1178# ifdef USE_LOCALE_COLLATE
5d1187d1
KW
1179 locale_param = (! done && (lang || PerlEnv_getenv("LC_COLLATE")))
1180 ? setlocale_init
1181 : NULL;
1182 curcoll = my_setlocale(LC_COLLATE, locale_param);
1183 DEBUG_LOCALE_INIT(LC_COLLATE, locale_param, sl_result);
1184 if (! curcoll)
98994639
HS
1185 setlocale_failure = TRUE;
1186 else
1187 curcoll = savepv(curcoll);
b3e384bf
KW
1188# endif /* USE_LOCALE_COLLATE */
1189# ifdef USE_LOCALE_NUMERIC
5d1187d1
KW
1190 locale_param = (! done && (lang || PerlEnv_getenv("LC_NUMERIC")))
1191 ? setlocale_init
1192 : NULL;
1193 curnum = my_setlocale(LC_NUMERIC, locale_param);
1194 DEBUG_LOCALE_INIT(LC_NUMERIC, locale_param, sl_result);
1195 if (! curnum)
98994639
HS
1196 setlocale_failure = TRUE;
1197 else
1198 curnum = savepv(curnum);
b3e384bf 1199# endif /* USE_LOCALE_NUMERIC */
a782673d 1200# ifdef USE_LOCALE_MESSAGES
5d1187d1
KW
1201 locale_param = (! done && (lang || PerlEnv_getenv("LC_MESSAGES")))
1202 ? setlocale_init
1203 : NULL;
1204 sl_result = my_setlocale(LC_MESSAGES, locale_param);
1205 DEBUG_LOCALE_INIT(LC_MESSAGES, locale_param, sl_result);
9f42613c 1206 if (! sl_result) {
a782673d
KW
1207 setlocale_failure = TRUE;
1208 }
1209# endif /* USE_LOCALE_MESSAGES */
c835d6be 1210# ifdef USE_LOCALE_MONETARY
5d1187d1
KW
1211 locale_param = (! done && (lang || PerlEnv_getenv("LC_MONETARY")))
1212 ? setlocale_init
1213 : NULL;
1214 sl_result = my_setlocale(LC_MONETARY, locale_param);
1215 DEBUG_LOCALE_INIT(LC_MONETARY, locale_param, sl_result);
1216 if (! sl_result) {
c835d6be
KW
1217 setlocale_failure = TRUE;
1218 }
1219# endif /* USE_LOCALE_MONETARY */
98994639
HS
1220 }
1221
b3e384bf 1222# endif /* LC_ALL */
98994639
HS
1223
1224#endif /* !LOCALE_ENVIRON_REQUIRED */
1225
65ebb059 1226 /* We try each locale in the list until we get one that works, or exhaust
20a240df
KW
1227 * the list. Normally the loop is executed just once. But if setting the
1228 * locale fails, inside the loop we add fallback trials to the array and so
1229 * will execute the loop multiple times */
c3fcd832
KW
1230 trial_locales[0] = setlocale_init;
1231 trial_locales_count = 1;
65ebb059
KW
1232 for (i= 0; i < trial_locales_count; i++) {
1233 const char * trial_locale = trial_locales[i];
1234
1235 if (i > 0) {
1236
1237 /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
1238 * when i==0, but I (khw) don't think that behavior makes much
1239 * sense */
1240 setlocale_failure = FALSE;
1241
6bce99ee
JH
1242#ifdef SYSTEM_DEFAULT_LOCALE
1243# ifdef WIN32
65ebb059
KW
1244 /* On Windows machines, an entry of "" after the 0th means to use
1245 * the system default locale, which we now proceed to get. */
1246 if (strEQ(trial_locale, "")) {
1247 unsigned int j;
1248
1249 /* Note that this may change the locale, but we are going to do
1250 * that anyway just below */
1251 system_default_locale = setlocale(LC_ALL, "");
5d1187d1 1252 DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
65ebb059
KW
1253
1254 /* Skip if invalid or it's already on the list of locales to
1255 * try */
1256 if (! system_default_locale) {
1257 goto next_iteration;
1258 }
1259 for (j = 0; j < trial_locales_count; j++) {
1260 if (strEQ(system_default_locale, trial_locales[j])) {
1261 goto next_iteration;
1262 }
1263 }
1264
1265 trial_locale = system_default_locale;
1266 }
6bce99ee
JH
1267# endif /* WIN32 */
1268#endif /* SYSTEM_DEFAULT_LOCALE */
65ebb059
KW
1269 }
1270
98994639 1271#ifdef LC_ALL
5d1187d1
KW
1272 sl_result = my_setlocale(LC_ALL, trial_locale);
1273 DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result);
1274 if (! sl_result) {
49c85077 1275 setlocale_failure = TRUE;
7cd8b568
KW
1276 }
1277 else {
1278 /* Since LC_ALL succeeded, it should have changed all the other
1279 * categories it can to its value; so we massage things so that the
1280 * setlocales below just return their category's current values.
1281 * This adequately handles the case in NetBSD where LC_COLLATE may
1282 * not be defined for a locale, and setting it individually will
1283 * fail, whereas setting LC_ALL suceeds, leaving LC_COLLATE set to
1284 * the POSIX locale. */
1285 trial_locale = NULL;
1286 }
98994639
HS
1287#endif /* LC_ALL */
1288
49c85077 1289 if (!setlocale_failure) {
98994639 1290#ifdef USE_LOCALE_CTYPE
49c85077 1291 Safefree(curctype);
5d1187d1
KW
1292 curctype = my_setlocale(LC_CTYPE, trial_locale);
1293 DEBUG_LOCALE_INIT(LC_CTYPE, trial_locale, curctype);
1294 if (! curctype)
49c85077
KW
1295 setlocale_failure = TRUE;
1296 else
1297 curctype = savepv(curctype);
98994639
HS
1298#endif /* USE_LOCALE_CTYPE */
1299#ifdef USE_LOCALE_COLLATE
49c85077 1300 Safefree(curcoll);
5d1187d1
KW
1301 curcoll = my_setlocale(LC_COLLATE, trial_locale);
1302 DEBUG_LOCALE_INIT(LC_COLLATE, trial_locale, curcoll);
1303 if (! curcoll)
49c85077
KW
1304 setlocale_failure = TRUE;
1305 else
1306 curcoll = savepv(curcoll);
98994639
HS
1307#endif /* USE_LOCALE_COLLATE */
1308#ifdef USE_LOCALE_NUMERIC
49c85077 1309 Safefree(curnum);
5d1187d1
KW
1310 curnum = my_setlocale(LC_NUMERIC, trial_locale);
1311 DEBUG_LOCALE_INIT(LC_NUMERIC, trial_locale, curnum);
1312 if (! curnum)
49c85077
KW
1313 setlocale_failure = TRUE;
1314 else
1315 curnum = savepv(curnum);
98994639 1316#endif /* USE_LOCALE_NUMERIC */
a782673d 1317#ifdef USE_LOCALE_MESSAGES
5d1187d1
KW
1318 sl_result = my_setlocale(LC_MESSAGES, trial_locale);
1319 DEBUG_LOCALE_INIT(LC_MESSAGES, trial_locale, sl_result);
1320 if (! (sl_result))
a782673d
KW
1321 setlocale_failure = TRUE;
1322#endif /* USE_LOCALE_MESSAGES */
c835d6be 1323#ifdef USE_LOCALE_MONETARY
5d1187d1
KW
1324 sl_result = my_setlocale(LC_MONETARY, trial_locale);
1325 DEBUG_LOCALE_INIT(LC_MONETARY, trial_locale, sl_result);
1326 if (! (sl_result))
c835d6be
KW
1327 setlocale_failure = TRUE;
1328#endif /* USE_LOCALE_MONETARY */
1329
49c85077
KW
1330 if (! setlocale_failure) { /* Success */
1331 break;
1332 }
65ebb059 1333 }
98994639 1334
49c85077
KW
1335 /* Here, something failed; will need to try a fallback. */
1336 ok = 0;
65ebb059 1337
49c85077
KW
1338 if (i == 0) {
1339 unsigned int j;
98994639 1340
65ebb059 1341 if (locwarn) { /* Output failure info only on the first one */
98994639
HS
1342#ifdef LC_ALL
1343
49c85077
KW
1344 PerlIO_printf(Perl_error_log,
1345 "perl: warning: Setting locale failed.\n");
98994639
HS
1346
1347#else /* !LC_ALL */
1348
49c85077
KW
1349 PerlIO_printf(Perl_error_log,
1350 "perl: warning: Setting locale failed for the categories:\n\t");
20a240df 1351# ifdef USE_LOCALE_CTYPE
49c85077
KW
1352 if (! curctype)
1353 PerlIO_printf(Perl_error_log, "LC_CTYPE ");
20a240df
KW
1354# endif /* USE_LOCALE_CTYPE */
1355# ifdef USE_LOCALE_COLLATE
49c85077
KW
1356 if (! curcoll)
1357 PerlIO_printf(Perl_error_log, "LC_COLLATE ");
20a240df
KW
1358# endif /* USE_LOCALE_COLLATE */
1359# ifdef USE_LOCALE_NUMERIC
49c85077
KW
1360 if (! curnum)
1361 PerlIO_printf(Perl_error_log, "LC_NUMERIC ");
20a240df 1362# endif /* USE_LOCALE_NUMERIC */
a782673d 1363 PerlIO_printf(Perl_error_log, "and possibly others\n");
98994639
HS
1364
1365#endif /* LC_ALL */
1366
49c85077
KW
1367 PerlIO_printf(Perl_error_log,
1368 "perl: warning: Please check that your locale settings:\n");
98994639
HS
1369
1370#ifdef __GLIBC__
49c85077
KW
1371 PerlIO_printf(Perl_error_log,
1372 "\tLANGUAGE = %c%s%c,\n",
1373 language ? '"' : '(',
1374 language ? language : "unset",
1375 language ? '"' : ')');
98994639
HS
1376#endif
1377
49c85077
KW
1378 PerlIO_printf(Perl_error_log,
1379 "\tLC_ALL = %c%s%c,\n",
1380 lc_all ? '"' : '(',
1381 lc_all ? lc_all : "unset",
1382 lc_all ? '"' : ')');
98994639
HS
1383
1384#if defined(USE_ENVIRON_ARRAY)
49c85077
KW
1385 {
1386 char **e;
1387 for (e = environ; *e; e++) {
0eb7f56a
YO
1388 if (strEQs(*e, "LC_")
1389 && strNEs(*e, "LC_ALL=")
49c85077
KW
1390 && (p = strchr(*e, '=')))
1391 PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
1392 (int)(p - *e), *e, p + 1);
1393 }
1394 }
98994639 1395#else
49c85077
KW
1396 PerlIO_printf(Perl_error_log,
1397 "\t(possibly more locale environment variables)\n");
98994639
HS
1398#endif
1399
49c85077
KW
1400 PerlIO_printf(Perl_error_log,
1401 "\tLANG = %c%s%c\n",
1402 lang ? '"' : '(',
1403 lang ? lang : "unset",
1404 lang ? '"' : ')');
98994639 1405
49c85077
KW
1406 PerlIO_printf(Perl_error_log,
1407 " are supported and installed on your system.\n");
1408 }
98994639 1409
65ebb059 1410 /* Calculate what fallback locales to try. We have avoided this
f6bab5f6 1411 * until we have to, because failure is quite unlikely. This will
65ebb059
KW
1412 * usually change the upper bound of the loop we are in.
1413 *
1414 * Since the system's default way of setting the locale has not
1415 * found one that works, We use Perl's defined ordering: LC_ALL,
1416 * LANG, and the C locale. We don't try the same locale twice, so
1417 * don't add to the list if already there. (On POSIX systems, the
1418 * LC_ALL element will likely be a repeat of the 0th element "",
6b058d42
KW
1419 * but there's no harm done by doing it explicitly.
1420 *
1421 * Note that this tries the LC_ALL environment variable even on
1422 * systems which have no LC_ALL locale setting. This may or may
1423 * not have been originally intentional, but there's no real need
1424 * to change the behavior. */
65ebb059
KW
1425 if (lc_all) {
1426 for (j = 0; j < trial_locales_count; j++) {
1427 if (strEQ(lc_all, trial_locales[j])) {
1428 goto done_lc_all;
1429 }
1430 }
1431 trial_locales[trial_locales_count++] = lc_all;
1432 }
1433 done_lc_all:
98994639 1434
65ebb059
KW
1435 if (lang) {
1436 for (j = 0; j < trial_locales_count; j++) {
1437 if (strEQ(lang, trial_locales[j])) {
1438 goto done_lang;
1439 }
1440 }
1441 trial_locales[trial_locales_count++] = lang;
1442 }
1443 done_lang:
1444
1445#if defined(WIN32) && defined(LC_ALL)
1446 /* For Windows, we also try the system default locale before "C".
1447 * (If there exists a Windows without LC_ALL we skip this because
1448 * it gets too complicated. For those, the "C" is the next
1449 * fallback possibility). The "" is the same as the 0th element of
1450 * the array, but the code at the loop above knows to treat it
1451 * differently when not the 0th */
1452 trial_locales[trial_locales_count++] = "";
1453#endif
1454
1455 for (j = 0; j < trial_locales_count; j++) {
1456 if (strEQ("C", trial_locales[j])) {
1457 goto done_C;
1458 }
1459 }
1460 trial_locales[trial_locales_count++] = "C";
98994639 1461
65ebb059
KW
1462 done_C: ;
1463 } /* end of first time through the loop */
98994639 1464
65ebb059
KW
1465#ifdef WIN32
1466 next_iteration: ;
1467#endif
1468
1469 } /* end of looping through the trial locales */
1470
1471 if (ok < 1) { /* If we tried to fallback */
1472 const char* msg;
1473 if (! setlocale_failure) { /* fallback succeeded */
1474 msg = "Falling back to";
1475 }
1476 else { /* fallback failed */
98994639 1477
65ebb059
KW
1478 /* We dropped off the end of the loop, so have to decrement i to
1479 * get back to the value the last time through */
1480 i--;
98994639 1481
65ebb059
KW
1482 ok = -1;
1483 msg = "Failed to fall back to";
1484
1485 /* To continue, we should use whatever values we've got */
98994639 1486#ifdef USE_LOCALE_CTYPE
49c85077
KW
1487 Safefree(curctype);
1488 curctype = savepv(setlocale(LC_CTYPE, NULL));
5d1187d1 1489 DEBUG_LOCALE_INIT(LC_CTYPE, NULL, curctype);
98994639
HS
1490#endif /* USE_LOCALE_CTYPE */
1491#ifdef USE_LOCALE_COLLATE
49c85077
KW
1492 Safefree(curcoll);
1493 curcoll = savepv(setlocale(LC_COLLATE, NULL));
5d1187d1 1494 DEBUG_LOCALE_INIT(LC_COLLATE, NULL, curcoll);
98994639
HS
1495#endif /* USE_LOCALE_COLLATE */
1496#ifdef USE_LOCALE_NUMERIC
49c85077
KW
1497 Safefree(curnum);
1498 curnum = savepv(setlocale(LC_NUMERIC, NULL));
5d1187d1 1499 DEBUG_LOCALE_INIT(LC_NUMERIC, NULL, curnum);
98994639 1500#endif /* USE_LOCALE_NUMERIC */
65ebb059
KW
1501 }
1502
1503 if (locwarn) {
1504 const char * description;
1505 const char * name = "";
1506 if (strEQ(trial_locales[i], "C")) {
1507 description = "the standard locale";
1508 name = "C";
1509 }
6bce99ee 1510#ifdef SYSTEM_DEFAULT_LOCALE
65ebb059
KW
1511 else if (strEQ(trial_locales[i], "")) {
1512 description = "the system default locale";
1513 if (system_default_locale) {
1514 name = system_default_locale;
1515 }
1516 }
6bce99ee 1517#endif /* SYSTEM_DEFAULT_LOCALE */
65ebb059
KW
1518 else {
1519 description = "a fallback locale";
1520 name = trial_locales[i];
1521 }
1522 if (name && strNE(name, "")) {
1523 PerlIO_printf(Perl_error_log,
1524 "perl: warning: %s %s (\"%s\").\n", msg, description, name);
1525 }
1526 else {
1527 PerlIO_printf(Perl_error_log,
1528 "perl: warning: %s %s.\n", msg, description);
1529 }
1530 }
1531 } /* End of tried to fallback */
98994639
HS
1532
1533#ifdef USE_LOCALE_CTYPE
1534 new_ctype(curctype);
1535#endif /* USE_LOCALE_CTYPE */
1536
1537#ifdef USE_LOCALE_COLLATE
1538 new_collate(curcoll);
1539#endif /* USE_LOCALE_COLLATE */
1540
1541#ifdef USE_LOCALE_NUMERIC
1542 new_numeric(curnum);
1543#endif /* USE_LOCALE_NUMERIC */
b310b053 1544
8ef6e574 1545#if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
49c85077
KW
1546 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
1547 * locale is UTF-8. If PL_utf8locale and PL_unicode (set by -C or by
1548 * $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the
1549 * PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open
1550 * discipline. */
c1284011 1551 PL_utf8locale = _is_cur_LC_category_utf8(LC_CTYPE);
49c85077 1552
a05d7ebb 1553 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
fde18df1
JH
1554 This is an alternative to using the -C command line switch
1555 (the -C if present will override this). */
1556 {
dd374669 1557 const char *p = PerlEnv_getenv("PERL_UNICODE");
a05d7ebb 1558 PL_unicode = p ? parse_unicode_opts(&p) : 0;
5a22a2bb
NC
1559 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
1560 PL_utf8cache = -1;
b310b053 1561 }
ec71e770 1562#endif
b310b053 1563
98994639 1564#ifdef USE_LOCALE_CTYPE
43c5f42d 1565 Safefree(curctype);
98994639
HS
1566#endif /* USE_LOCALE_CTYPE */
1567#ifdef USE_LOCALE_COLLATE
43c5f42d 1568 Safefree(curcoll);
98994639
HS
1569#endif /* USE_LOCALE_COLLATE */
1570#ifdef USE_LOCALE_NUMERIC
43c5f42d 1571 Safefree(curnum);
98994639 1572#endif /* USE_LOCALE_NUMERIC */
8ef6e574 1573
175c4cf9
KW
1574#ifdef __GLIBC__
1575 Safefree(language);
1576#endif
1577
1578 Safefree(lc_all);
1579 Safefree(lang);
1580
e3305790
KW
1581#else /* !USE_LOCALE */
1582 PERL_UNUSED_ARG(printwarn);
1583#endif /* USE_LOCALE */
1584
2fcc0ca9
KW
1585#ifdef DEBUGGING
1586 /* So won't continue to output stuff */
27cdc72e 1587 DEBUG_INITIALIZATION_set(FALSE);
2fcc0ca9
KW
1588#endif
1589
98994639
HS
1590 return ok;
1591}
1592
98994639
HS
1593#ifdef USE_LOCALE_COLLATE
1594
a4a439fb 1595char *
a4a439fb
KW
1596Perl__mem_collxfrm(pTHX_ const char *input_string,
1597 STRLEN len, /* Length of 'input_string' */
1598 STRLEN *xlen, /* Set to length of returned string
1599 (not including the collation index
1600 prefix) */
1601 bool utf8 /* Is the input in UTF-8? */
6696cfa7 1602 )
98994639 1603{
a4a439fb
KW
1604
1605 /* _mem_collxfrm() is a bit like strxfrm() but with two important
1606 * differences. First, it handles embedded NULs. Second, it allocates a bit
1607 * more memory than needed for the transformed data itself. The real
55e5378d 1608 * transformed data begins at offset COLLXFRM_HDR_LEN. *xlen is set to
a4a439fb
KW
1609 * the length of that, and doesn't include the collation index size.
1610 * Please see sv_collxfrm() to see how this is used. */
1611
55e5378d
KW
1612#define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
1613
6696cfa7
KW
1614 char * s = (char *) input_string;
1615 STRLEN s_strlen = strlen(input_string);
79f120c8 1616 char *xbuf = NULL;
55e5378d 1617 STRLEN xAlloc; /* xalloc is a reserved word in VC */
17f41037 1618 STRLEN length_in_chars;
c664130f 1619 bool first_time = TRUE; /* Cleared after first loop iteration */
98994639 1620
a4a439fb
KW
1621 PERL_ARGS_ASSERT__MEM_COLLXFRM;
1622
1623 /* Must be NUL-terminated */
1624 assert(*(input_string + len) == '\0');
7918f24d 1625
79f120c8
KW
1626 /* If this locale has defective collation, skip */
1627 if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
c7202dee
KW
1628 DEBUG_L(PerlIO_printf(Perl_debug_log,
1629 "_mem_collxfrm: locale's collation is defective\n"));
79f120c8
KW
1630 goto bad;
1631 }
1632
6696cfa7
KW
1633 /* Replace any embedded NULs with the control that sorts before any others.
1634 * This will give as good as possible results on strings that don't
1635 * otherwise contain that character, but otherwise there may be
1636 * less-than-perfect results with that character and NUL. This is
fdc080f3 1637 * unavoidable unless we replace strxfrm with our own implementation. */
1e4c9676 1638 if (s_strlen < len) { /* Only execute if there is an embedded NUL */
6696cfa7
KW
1639 char * e = s + len;
1640 char * sans_nuls;
fdc080f3
KW
1641 STRLEN sans_nuls_len;
1642 STRLEN sans_nuls_pos;
94762aa0 1643 int try_non_controls;
afc4976f
KW
1644 char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
1645 making sure 2nd byte is NUL.
1646 */
1647 STRLEN this_replacement_len;
1648
1e4c9676
KW
1649 /* If we don't know what non-NUL control character sorts lowest for
1650 * this locale, find it */
f28f4d2a 1651 if (PL_strxfrm_NUL_replacement == '\0') {
6696cfa7 1652 int j;
afc4976f 1653 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
6696cfa7
KW
1654 includes the collation index
1655 prefixed. */
1656
91c0e2e0 1657 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
94762aa0
KW
1658
1659 /* Unlikely, but it may be that no control will work to replace
1e4c9676
KW
1660 * NUL, in which case we instead look for any character. Controls
1661 * are preferred because collation order is, in general, context
1662 * sensitive, with adjoining characters affecting the order, and
1663 * controls are less likely to have such interactions, allowing the
1664 * NUL-replacement to stand on its own. (Another way to look at it
1665 * is to imagine what would happen if the NUL were replaced by a
1666 * combining character; it wouldn't work out all that well.) */
94762aa0
KW
1667 for (try_non_controls = 0;
1668 try_non_controls < 2;
1669 try_non_controls++)
1670 {
d4ff9586
KW
1671 /* Look through all legal code points (NUL isn't) */
1672 for (j = 1; j < 256; j++) {
1673 char * x; /* j's xfrm plus collation index */
1674 STRLEN x_len; /* length of 'x' */
1675 STRLEN trial_len = 1;
736a4fed 1676 char cur_source[] = { '\0', '\0' };
d4ff9586 1677
736a4fed
KW
1678 /* Skip non-controls the first time through the loop. The
1679 * controls in a UTF-8 locale are the L1 ones */
afc4976f
KW
1680 if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
1681 ? ! isCNTRL_L1(j)
1682 : ! isCNTRL_LC(j))
1683 {
d4ff9586 1684 continue;
6696cfa7 1685 }
6696cfa7 1686
736a4fed
KW
1687 /* Create a 1-char string of the current code point */
1688 cur_source[0] = (char) j;
1689
d4ff9586
KW
1690 /* Then transform it */
1691 x = _mem_collxfrm(cur_source, trial_len, &x_len,
afc4976f 1692 0 /* The string is not in UTF-8 */);
6696cfa7 1693
1e4c9676 1694 /* Ignore any character that didn't successfully transform.
d4ff9586
KW
1695 * */
1696 if (! x) {
1697 continue;
1698 }
6696cfa7 1699
d4ff9586
KW
1700 /* If this character's transformation is lower than
1701 * the current lowest, this one becomes the lowest */
1702 if ( cur_min_x == NULL
1703 || strLT(x + COLLXFRM_HDR_LEN,
1704 cur_min_x + COLLXFRM_HDR_LEN))
1705 {
f28f4d2a 1706 PL_strxfrm_NUL_replacement = j;
d4ff9586 1707 cur_min_x = x;
d4ff9586
KW
1708 }
1709 else {
1710 Safefree(x);
1711 }
1e4c9676 1712 } /* end of loop through all 255 characters */
6696cfa7 1713
1e4c9676 1714 /* Stop looking if found */
94762aa0
KW
1715 if (cur_min_x) {
1716 break;
1717 }
1718
1719 /* Unlikely, but possible, if there aren't any controls that
1720 * work in the locale, repeat the loop, looking for any
1721 * character that works */
1722 DEBUG_L(PerlIO_printf(Perl_debug_log,
1723 "_mem_collxfrm: No control worked. Trying non-controls\n"));
1e4c9676 1724 } /* End of loop to try first the controls, then any char */
6696cfa7 1725
94762aa0
KW
1726 if (! cur_min_x) {
1727 DEBUG_L(PerlIO_printf(Perl_debug_log,
1728 "_mem_collxfrm: Couldn't find any character to replace"
1729 " embedded NULs in locale %s with", PL_collation_name));
1730 goto bad;
58eebef2
KW
1731 }
1732
94762aa0
KW
1733 DEBUG_L(PerlIO_printf(Perl_debug_log,
1734 "_mem_collxfrm: Replacing embedded NULs in locale %s with "
f28f4d2a 1735 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
94762aa0 1736
6696cfa7 1737 Safefree(cur_min_x);
1e4c9676 1738 } /* End of determining the character that is to replace NULs */
afc4976f
KW
1739
1740 /* If the replacement is variant under UTF-8, it must match the
1741 * UTF8-ness as the original */
f28f4d2a
KW
1742 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
1743 this_replacement_char[0] =
1744 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
1745 this_replacement_char[1] =
1746 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
afc4976f
KW
1747 this_replacement_len = 2;
1748 }
1749 else {
f28f4d2a 1750 this_replacement_char[0] = PL_strxfrm_NUL_replacement;
afc4976f
KW
1751 /* this_replacement_char[1] = '\0' was done at initialization */
1752 this_replacement_len = 1;
6696cfa7
KW
1753 }
1754
1755 /* The worst case length for the replaced string would be if every
1756 * character in it is NUL. Multiply that by the length of each
1757 * replacement, and allow for a trailing NUL */
afc4976f 1758 sans_nuls_len = (len * this_replacement_len) + 1;
fdc080f3 1759 Newx(sans_nuls, sans_nuls_len, char);
6696cfa7 1760 *sans_nuls = '\0';
fdc080f3 1761 sans_nuls_pos = 0;
6696cfa7 1762
6696cfa7
KW
1763 /* Replace each NUL with the lowest collating control. Loop until have
1764 * exhausted all the NULs */
1765 while (s + s_strlen < e) {
fdc080f3
KW
1766 sans_nuls_pos = my_strlcat(sans_nuls + sans_nuls_pos,
1767 s,
1768 sans_nuls_len);
6696cfa7
KW
1769
1770 /* Do the actual replacement */
fdc080f3 1771 sans_nuls_pos = my_strlcat(sans_nuls + sans_nuls_pos,
afc4976f 1772 this_replacement_char,
fdc080f3 1773 sans_nuls_len);
6696cfa7
KW
1774
1775 /* Move past the input NUL */
1776 s += s_strlen + 1;
1777 s_strlen = strlen(s);
1778 }
1779
1780 /* And add anything that trails the final NUL */
fdc080f3 1781 my_strlcat(sans_nuls + sans_nuls_pos, s, sans_nuls_len);
6696cfa7
KW
1782
1783 /* Switch so below we transform this modified string */
1784 s = sans_nuls;
1785 len = strlen(s);
1e4c9676 1786 } /* End of replacing NULs */
6696cfa7 1787
a4a439fb
KW
1788 /* Make sure the UTF8ness of the string and locale match */
1789 if (utf8 != PL_in_utf8_COLLATE_locale) {
1790 const char * const t = s; /* Temporary so we can later find where the
1791 input was */
1792
1793 /* Here they don't match. Change the string's to be what the locale is
1794 * expecting */
1795
1796 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
1797 s = (char *) bytes_to_utf8((const U8 *) s, &len);
1798 utf8 = TRUE;
1799 }
1800 else { /* locale is not UTF-8; but input is; downgrade the input */
1801
1802 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
1803
1804 /* If the downgrade was successful we are done, but if the input
1805 * contains things that require UTF-8 to represent, have to do
1806 * damage control ... */
1807 if (UNLIKELY(utf8)) {
1808
1809 /* What we do is construct a non-UTF-8 string with
1810 * 1) the characters representable by a single byte converted
1811 * to be so (if necessary);
1812 * 2) and the rest converted to collate the same as the
1813 * highest collating representable character. That makes
1814 * them collate at the end. This is similar to how we
1815 * handle embedded NULs, but we use the highest collating
1816 * code point instead of the smallest. Like the NUL case,
1817 * this isn't perfect, but is the best we can reasonably
1818 * do. Every above-255 code point will sort the same as
1819 * the highest-sorting 0-255 code point. If that code
1820 * point can combine in a sequence with some other code
1821 * points for weight calculations, us changing something to
1822 * be it can adversely affect the results. But in most
1823 * cases, it should work reasonably. And note that this is
1824 * really an illegal situation: using code points above 255
1825 * on a locale where only 0-255 are valid. If two strings
1826 * sort entirely equal, then the sort order for the
1827 * above-255 code points will be in code point order. */
1828
1829 utf8 = FALSE;
1830
1831 /* If we haven't calculated the code point with the maximum
1832 * collating order for this locale, do so now */
1833 if (! PL_strxfrm_max_cp) {
1834 int j;
1835
1836 /* The current transformed string that collates the
1837 * highest (except it also includes the prefixed collation
1838 * index. */
1839 char * cur_max_x = NULL;
1840
1841 /* Look through all legal code points (NUL isn't) */
1842 for (j = 1; j < 256; j++) {
1843 char * x;
1844 STRLEN x_len;
736a4fed 1845 char cur_source[] = { '\0', '\0' };
a4a439fb 1846
736a4fed
KW
1847 /* Create a 1-char string of the current code point */
1848 cur_source[0] = (char) j;
a4a439fb
KW
1849
1850 /* Then transform it */
1851 x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
1852
1853 /* If something went wrong (which it shouldn't), just
1854 * ignore this code point */
94762aa0 1855 if (! x) {
a4a439fb
KW
1856 continue;
1857 }
1858
1859 /* If this character's transformation is higher than
1860 * the current highest, this one becomes the highest */
1861 if ( cur_max_x == NULL
55e5378d
KW
1862 || strGT(x + COLLXFRM_HDR_LEN,
1863 cur_max_x + COLLXFRM_HDR_LEN))
a4a439fb
KW
1864 {
1865 PL_strxfrm_max_cp = j;
1866 cur_max_x = x;
1867 }
1868 else {
1869 Safefree(x);
1870 }
1871 }
1872
94762aa0
KW
1873 if (! cur_max_x) {
1874 DEBUG_L(PerlIO_printf(Perl_debug_log,
1875 "_mem_collxfrm: Couldn't find any character to"
1876 " replace above-Latin1 chars in locale %s with",
1877 PL_collation_name));
1878 goto bad;
1879 }
1880
58eebef2
KW
1881 DEBUG_L(PerlIO_printf(Perl_debug_log,
1882 "_mem_collxfrm: highest 1-byte collating character"
1883 " in locale %s is 0x%02X\n",
1884 PL_collation_name,
1885 PL_strxfrm_max_cp));
58eebef2 1886
a4a439fb
KW
1887 Safefree(cur_max_x);
1888 }
1889
1890 /* Here we know which legal code point collates the highest.
1891 * We are ready to construct the non-UTF-8 string. The length
1892 * will be at least 1 byte smaller than the input string
1893 * (because we changed at least one 2-byte character into a
1894 * single byte), but that is eaten up by the trailing NUL */
1895 Newx(s, len, char);
1896
1897 {
1898 STRLEN i;
1899 STRLEN d= 0;
042d9e50 1900 char * e = (char *) t + len;
a4a439fb
KW
1901
1902 for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
1903 U8 cur_char = t[i];
1904 if (UTF8_IS_INVARIANT(cur_char)) {
1905 s[d++] = cur_char;
1906 }
042d9e50 1907 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
a4a439fb
KW
1908 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
1909 }
1910 else { /* Replace illegal cp with highest collating
1911 one */
1912 s[d++] = PL_strxfrm_max_cp;
1913 }
1914 }
1915 s[d++] = '\0';
1916 Renew(s, d, char); /* Free up unused space */
1917 }
1918 }
1919 }
1920
1921 /* Here, we have constructed a modified version of the input. It could
1922 * be that we already had a modified copy before we did this version.
1923 * If so, that copy is no longer needed */
1924 if (t != input_string) {
1925 Safefree(t);
1926 }
1927 }
1928
17f41037
KW
1929 length_in_chars = (utf8)
1930 ? utf8_length((U8 *) s, (U8 *) s + len)
1931 : len;
1932
59c018b9
KW
1933 /* The first element in the output is the collation id, used by
1934 * sv_collxfrm(); then comes the space for the transformed string. The
1935 * equation should give us a good estimate as to how much is needed */
55e5378d 1936 xAlloc = COLLXFRM_HDR_LEN
a4a439fb 1937 + PL_collxfrm_base
17f41037 1938 + (PL_collxfrm_mult * length_in_chars);
a02a5408 1939 Newx(xbuf, xAlloc, char);
c7202dee
KW
1940 if (UNLIKELY(! xbuf)) {
1941 DEBUG_L(PerlIO_printf(Perl_debug_log,
1942 "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
98994639 1943 goto bad;
c7202dee 1944 }
98994639 1945
d35fca5f 1946 /* Store the collation id */
98994639 1947 *(U32*)xbuf = PL_collation_ix;
d35fca5f
KW
1948
1949 /* Then the transformation of the input. We loop until successful, or we
1950 * give up */
4ebeff16 1951 for (;;) {
1adab0a7 1952
55e5378d 1953 *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
4ebeff16
KW
1954
1955 /* If the transformed string occupies less space than we told strxfrm()
1956 * was available, it means it successfully transformed the whole
1957 * string. */
55e5378d 1958 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
17f41037 1959
1adab0a7
KW
1960 /* Some systems include a trailing NUL in the returned length.
1961 * Ignore it, using a loop in case multiple trailing NULs are
1962 * returned. */
1963 while ( (*xlen) > 0
1964 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
1965 {
1966 (*xlen)--;
1967 }
1968
17f41037
KW
1969 /* If the first try didn't get it, it means our prediction was low.
1970 * Modify the coefficients so that we predict a larger value in any
1971 * future transformations */
1972 if (! first_time) {
1973 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */
1974 STRLEN computed_guess = PL_collxfrm_base
1975 + (PL_collxfrm_mult * length_in_chars);
e1c30f0c
KW
1976
1977 /* On zero-length input, just keep current slope instead of
1978 * dividing by 0 */
1979 const STRLEN new_m = (length_in_chars != 0)
1980 ? needed / length_in_chars
1981 : PL_collxfrm_mult;
17f41037
KW
1982
1983 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
1984 "%s: %d: initial size of %zu bytes for a length "
1985 "%zu string was insufficient, %zu needed\n",
17f41037 1986 __FILE__, __LINE__,
b07929e4 1987 computed_guess, length_in_chars, needed));
17f41037
KW
1988
1989 /* If slope increased, use it, but discard this result for
1990 * length 1 strings, as we can't be sure that it's a real slope
1991 * change */
1992 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
1993#ifdef DEBUGGING
1994 STRLEN old_m = PL_collxfrm_mult;
1995 STRLEN old_b = PL_collxfrm_base;
1996#endif
1997 PL_collxfrm_mult = new_m;
1998 PL_collxfrm_base = 1; /* +1 For trailing NUL */
1999 computed_guess = PL_collxfrm_base
2000 + (PL_collxfrm_mult * length_in_chars);
2001 if (computed_guess < needed) {
2002 PL_collxfrm_base += needed - computed_guess;
2003 }
2004
2005 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
2006 "%s: %d: slope is now %zu; was %zu, base "
2007 "is now %zu; was %zu\n",
17f41037 2008 __FILE__, __LINE__,
b07929e4
KW
2009 PL_collxfrm_mult, old_m,
2010 PL_collxfrm_base, old_b));
17f41037
KW
2011 }
2012 else { /* Slope didn't change, but 'b' did */
2013 const STRLEN new_b = needed
2014 - computed_guess
2015 + PL_collxfrm_base;
2016 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4 2017 "%s: %d: base is now %zu; was %zu\n",
17f41037 2018 __FILE__, __LINE__,
b07929e4 2019 new_b, PL_collxfrm_base));
17f41037
KW
2020 PL_collxfrm_base = new_b;
2021 }
2022 }
2023
4ebeff16
KW
2024 break;
2025 }
bb0f664e 2026
c7202dee
KW
2027 if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
2028 DEBUG_L(PerlIO_printf(Perl_debug_log,
2029 "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
2030 *xlen, PERL_INT_MAX));
4ebeff16 2031 goto bad;
c7202dee 2032 }
d35fca5f 2033
c664130f 2034 /* A well-behaved strxfrm() returns exactly how much space it needs
1adab0a7
KW
2035 * (usually not including the trailing NUL) when it fails due to not
2036 * enough space being provided. Assume that this is the case unless
2037 * it's been proven otherwise */
c664130f 2038 if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
55e5378d 2039 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
c664130f
KW
2040 }
2041 else { /* Here, either:
2042 * 1) The strxfrm() has previously shown bad behavior; or
2043 * 2) It isn't the first time through the loop, which means
2044 * that the strxfrm() is now showing bad behavior, because
2045 * we gave it what it said was needed in the previous
2046 * iteration, and it came back saying it needed still more.
2047 * (Many versions of cygwin fit this. When the buffer size
2048 * isn't sufficient, they return the input size instead of
2049 * how much is needed.)
d4ff9586
KW
2050 * Increase the buffer size by a fixed percentage and try again.
2051 * */
6ddd902c 2052 xAlloc += (xAlloc / 4) + 1;
c664130f 2053 PL_strxfrm_is_behaved = FALSE;
c664130f 2054
58eebef2
KW
2055#ifdef DEBUGGING
2056 if (DEBUG_Lv_TEST || debug_initialization) {
2057 PerlIO_printf(Perl_debug_log,
2058 "_mem_collxfrm required more space than previously calculated"
b07929e4 2059 " for locale %s, trying again with new guess=%d+%zu\n",
58eebef2 2060 PL_collation_name, (int) COLLXFRM_HDR_LEN,
b07929e4 2061 xAlloc - COLLXFRM_HDR_LEN);
58eebef2
KW
2062 }
2063#endif
2064 }
c664130f 2065
4ebeff16 2066 Renew(xbuf, xAlloc, char);
c7202dee
KW
2067 if (UNLIKELY(! xbuf)) {
2068 DEBUG_L(PerlIO_printf(Perl_debug_log,
2069 "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
4ebeff16 2070 goto bad;
c7202dee 2071 }
c664130f
KW
2072
2073 first_time = FALSE;
4ebeff16 2074 }
98994639 2075
6696cfa7 2076
58eebef2
KW
2077#ifdef DEBUGGING
2078 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee
KW
2079
2080 print_collxfrm_input_and_return(s, s + len, xlen, utf8);
2081 PerlIO_printf(Perl_debug_log, "Its xfrm is:");
7e2f38b2
KW
2082 PerlIO_printf(Perl_debug_log, "%s\n",
2083 _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
2084 *xlen, 1));
58eebef2
KW
2085 }
2086#endif
2087
3c5f993e 2088 /* Free up unneeded space; retain ehough for trailing NUL */
55e5378d 2089 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
98994639 2090
6696cfa7
KW
2091 if (s != input_string) {
2092 Safefree(s);
98994639
HS
2093 }
2094
98994639
HS
2095 return xbuf;
2096
2097 bad:
2098 Safefree(xbuf);
6696cfa7
KW
2099 if (s != input_string) {
2100 Safefree(s);
2101 }
98994639 2102 *xlen = 0;
58eebef2
KW
2103#ifdef DEBUGGING
2104 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee 2105 print_collxfrm_input_and_return(s, s + len, NULL, utf8);
58eebef2
KW
2106 }
2107#endif
98994639
HS
2108 return NULL;
2109}
2110
c7202dee
KW
2111#ifdef DEBUGGING
2112
4cbaac56 2113STATIC void
c7202dee
KW
2114S_print_collxfrm_input_and_return(pTHX_
2115 const char * const s,
2116 const char * const e,
2117 const STRLEN * const xlen,
2118 const bool is_utf8)
2119{
c7202dee
KW
2120
2121 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
2122
511e4ff7
DM
2123 PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
2124 (UV)PL_collation_ix);
c7202dee 2125 if (xlen) {
147e3846 2126 PerlIO_printf(Perl_debug_log, "%" UVuf, (UV) *xlen);
c7202dee
KW
2127 }
2128 else {
2129 PerlIO_printf(Perl_debug_log, "NULL");
2130 }
2131 PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
2132 PL_collation_name);
9c8a6dc2
KW
2133 print_bytes_for_locale(s, e, is_utf8);
2134
2135 PerlIO_printf(Perl_debug_log, "'\n");
2136}
2137
2138STATIC void
2139S_print_bytes_for_locale(pTHX_
2140 const char * const s,
2141 const char * const e,
2142 const bool is_utf8)
2143{
2144 const char * t = s;
2145 bool prev_was_printable = TRUE;
2146 bool first_time = TRUE;
2147
2148 PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
c7202dee
KW
2149
2150 while (t < e) {
2151 UV cp = (is_utf8)
2152 ? utf8_to_uvchr_buf((U8 *) t, e, NULL)
2153 : * (U8 *) t;
2154 if (isPRINT(cp)) {
2155 if (! prev_was_printable) {
2156 PerlIO_printf(Perl_debug_log, " ");
2157 }
2158 PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
2159 prev_was_printable = TRUE;
2160 }
2161 else {
2162 if (! first_time) {
2163 PerlIO_printf(Perl_debug_log, " ");
2164 }
147e3846 2165 PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
c7202dee
KW
2166 prev_was_printable = FALSE;
2167 }
2168 t += (is_utf8) ? UTF8SKIP(t) : 1;
2169 first_time = FALSE;
2170 }
c7202dee
KW
2171}
2172
2173#endif /* #ifdef DEBUGGING */
2174
98994639 2175#endif /* USE_LOCALE_COLLATE */
58eebef2 2176
8ef6e574
KW
2177#ifdef USE_LOCALE
2178
c1284011
KW
2179bool
2180Perl__is_cur_LC_category_utf8(pTHX_ int category)
7d74bb61
KW
2181{
2182 /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
2183 * otherwise. 'category' may not be LC_ALL. If the platform doesn't have
119ee68b 2184 * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
609548d2
KW
2185 * could give the wrong result. The result will very likely be correct for
2186 * languages that have commonly used non-ASCII characters, but for notably
2187 * English, it comes down to if the locale's name ends in something like
2188 * "UTF-8". It errs on the side of not being a UTF-8 locale. */
7d74bb61
KW
2189
2190 char *save_input_locale = NULL;
7d74bb61
KW
2191 STRLEN final_pos;
2192
8ef6e574 2193#ifdef LC_ALL
7d74bb61 2194 assert(category != LC_ALL);
8ef6e574 2195#endif
7d74bb61
KW
2196
2197 /* First dispose of the trivial cases */
b07fffd1 2198 save_input_locale = setlocale(category, NULL);
7d74bb61 2199 if (! save_input_locale) {
69014004
KW
2200 DEBUG_L(PerlIO_printf(Perl_debug_log,
2201 "Could not find current locale for category %d\n",
2202 category));
7d74bb61
KW
2203 return FALSE; /* XXX maybe should croak */
2204 }
b07fffd1 2205 save_input_locale = stdize_locale(savepv(save_input_locale));
a39edc4c 2206 if (isNAME_C_OR_POSIX(save_input_locale)) {
69014004
KW
2207 DEBUG_L(PerlIO_printf(Perl_debug_log,
2208 "Current locale for category %d is %s\n",
2209 category, save_input_locale));
b07fffd1 2210 Safefree(save_input_locale);
7d74bb61
KW
2211 return FALSE;
2212 }
2213
1d958db2
KW
2214#if defined(USE_LOCALE_CTYPE) \
2215 && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET)))
7d74bb61 2216
1d958db2 2217 { /* Next try nl_langinfo or MB_CUR_MAX if available */
7d74bb61
KW
2218
2219 char *save_ctype_locale = NULL;
119ee68b 2220 bool is_utf8;
7d74bb61 2221
119ee68b 2222 if (category != LC_CTYPE) { /* These work only on LC_CTYPE */
7d74bb61
KW
2223
2224 /* Get the current LC_CTYPE locale */
4f72bb37 2225 save_ctype_locale = setlocale(LC_CTYPE, NULL);
7d74bb61 2226 if (! save_ctype_locale) {
69014004
KW
2227 DEBUG_L(PerlIO_printf(Perl_debug_log,
2228 "Could not find current locale for LC_CTYPE\n"));
7d74bb61
KW
2229 goto cant_use_nllanginfo;
2230 }
4f72bb37 2231 save_ctype_locale = stdize_locale(savepv(save_ctype_locale));
7d74bb61
KW
2232
2233 /* If LC_CTYPE and the desired category use the same locale, this
2234 * means that finding the value for LC_CTYPE is the same as finding
2235 * the value for the desired category. Otherwise, switch LC_CTYPE
2236 * to the desired category's locale */
2237 if (strEQ(save_ctype_locale, save_input_locale)) {
2238 Safefree(save_ctype_locale);
2239 save_ctype_locale = NULL;
2240 }
2241 else if (! setlocale(LC_CTYPE, save_input_locale)) {
69014004
KW
2242 DEBUG_L(PerlIO_printf(Perl_debug_log,
2243 "Could not change LC_CTYPE locale to %s\n",
2244 save_input_locale));
7d74bb61
KW
2245 Safefree(save_ctype_locale);
2246 goto cant_use_nllanginfo;
2247 }
2248 }
2249
69014004
KW
2250 DEBUG_L(PerlIO_printf(Perl_debug_log, "Current LC_CTYPE locale=%s\n",
2251 save_input_locale));
2252
7d74bb61 2253 /* Here the current LC_CTYPE is set to the locale of the category whose
1d958db2
KW
2254 * information is desired. This means that nl_langinfo() and MB_CUR_MAX
2255 * should give the correct results */
119ee68b 2256
1d958db2
KW
2257# if defined(HAS_NL_LANGINFO) && defined(CODESET)
2258 {
4f72bb37 2259 char *codeset = nl_langinfo(CODESET);
1d958db2 2260 if (codeset && strNE(codeset, "")) {
4f72bb37 2261 codeset = savepv(codeset);
119ee68b 2262
1d958db2
KW
2263 /* If we switched LC_CTYPE, switch back */
2264 if (save_ctype_locale) {
2265 setlocale(LC_CTYPE, save_ctype_locale);
2266 Safefree(save_ctype_locale);
2267 }
2268
2269 is_utf8 = foldEQ(codeset, STR_WITH_LEN("UTF-8"))
2270 || foldEQ(codeset, STR_WITH_LEN("UTF8"));
2271
69014004
KW
2272 DEBUG_L(PerlIO_printf(Perl_debug_log,
2273 "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
2274 codeset, is_utf8));
1d958db2
KW
2275 Safefree(codeset);
2276 Safefree(save_input_locale);
2277 return is_utf8;
2278 }
119ee68b
KW
2279 }
2280
1d958db2
KW
2281# endif
2282# ifdef MB_CUR_MAX
2283
2284 /* Here, either we don't have nl_langinfo, or it didn't return a
2285 * codeset. Try MB_CUR_MAX */
2286
119ee68b
KW
2287 /* Standard UTF-8 needs at least 4 bytes to represent the maximum
2288 * Unicode code point. Since UTF-8 is the only non-single byte
2289 * encoding we handle, we just say any such encoding is UTF-8, and if
2290 * turns out to be wrong, other things will fail */
2291 is_utf8 = MB_CUR_MAX >= 4;
2292
69014004
KW
2293 DEBUG_L(PerlIO_printf(Perl_debug_log,
2294 "\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n",
2295 (int) MB_CUR_MAX, is_utf8));
2296
119ee68b
KW
2297 Safefree(save_input_locale);
2298
2299# ifdef HAS_MBTOWC
2300
2301 /* ... But, most system that have MB_CUR_MAX will also have mbtowc(),
2302 * since they are both in the C99 standard. We can feed a known byte
2303 * string to the latter function, and check that it gives the expected
2304 * result */
2305 if (is_utf8) {
2306 wchar_t wc;
856b881c 2307 PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
69014004 2308 errno = 0;
f019f68f 2309 if ((size_t)mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8))
119ee68b
KW
2310 != strlen(HYPHEN_UTF8)
2311 || wc != (wchar_t) 0x2010)
2312 {
2313 is_utf8 = FALSE;
abdcbdb8 2314 DEBUG_L(PerlIO_printf(Perl_debug_log, "\thyphen=U+%x\n", (unsigned int)wc));
69014004
KW
2315 DEBUG_L(PerlIO_printf(Perl_debug_log,
2316 "\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n",
2317 mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8)), errno));
119ee68b
KW
2318 }
2319 }
119ee68b
KW
2320# endif
2321
1d958db2
KW
2322 /* If we switched LC_CTYPE, switch back */
2323 if (save_ctype_locale) {
2324 setlocale(LC_CTYPE, save_ctype_locale);
2325 Safefree(save_ctype_locale);
119ee68b 2326 }
7d74bb61 2327
1d958db2 2328 return is_utf8;
119ee68b 2329# endif
7d74bb61 2330 }
119ee68b 2331
7d74bb61
KW
2332 cant_use_nllanginfo:
2333
0080c90a
KW
2334#else /* nl_langinfo should work if available, so don't bother compiling this
2335 fallback code. The final fallback of looking at the name is
2336 compiled, and will be executed if nl_langinfo fails */
7d74bb61 2337
97f4de96
KW
2338 /* nl_langinfo not available or failed somehow. Next try looking at the
2339 * currency symbol to see if it disambiguates things. Often that will be
2340 * in the native script, and if the symbol isn't in UTF-8, we know that the
2341 * locale isn't. If it is non-ASCII UTF-8, we infer that the locale is
609548d2
KW
2342 * too, as the odds of a non-UTF8 string being valid UTF-8 are quite small
2343 * */
fa9b773e
KW
2344
2345#ifdef HAS_LOCALECONV
fa9b773e 2346# ifdef USE_LOCALE_MONETARY
fa9b773e
KW
2347 {
2348 char *save_monetary_locale = NULL;
fa9b773e 2349 bool only_ascii = FALSE;
13542a67
KW
2350 bool is_utf8 = FALSE;
2351 struct lconv* lc;
fa9b773e 2352
97f4de96
KW
2353 /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of
2354 * the desired category, if it isn't that locale already */
2355
fa9b773e
KW
2356 if (category != LC_MONETARY) {
2357
4f72bb37 2358 save_monetary_locale = setlocale(LC_MONETARY, NULL);
fa9b773e 2359 if (! save_monetary_locale) {
69014004
KW
2360 DEBUG_L(PerlIO_printf(Perl_debug_log,
2361 "Could not find current locale for LC_MONETARY\n"));
fa9b773e
KW
2362 goto cant_use_monetary;
2363 }
4f72bb37 2364 save_monetary_locale = stdize_locale(savepv(save_monetary_locale));
fa9b773e 2365
13542a67
KW
2366 if (strEQ(save_monetary_locale, save_input_locale)) {
2367 Safefree(save_monetary_locale);
2368 save_monetary_locale = NULL;
2369 }
2370 else if (! setlocale(LC_MONETARY, save_input_locale)) {
59c234b4
KW
2371 DEBUG_L(PerlIO_printf(Perl_debug_log,
2372 "Could not change LC_MONETARY locale to %s\n",
2373 save_input_locale));
2374 Safefree(save_monetary_locale);
2375 goto cant_use_monetary;
fa9b773e
KW
2376 }
2377 }
2378
2379 /* Here the current LC_MONETARY is set to the locale of the category
2380 * whose information is desired. */
2381
13542a67
KW
2382 lc = localeconv();
2383 if (! lc
2384 || ! lc->currency_symbol
c5f058df 2385 || is_utf8_invariant_string((U8 *) lc->currency_symbol, 0))
13542a67
KW
2386 {
2387 DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
2388 only_ascii = TRUE;
2389 }
2390 else {
2391 is_utf8 = is_utf8_string((U8 *) lc->currency_symbol, 0);
fa9b773e
KW
2392 }
2393
2394 /* If we changed it, restore LC_MONETARY to its original locale */
2395 if (save_monetary_locale) {
2396 setlocale(LC_MONETARY, save_monetary_locale);
2397 Safefree(save_monetary_locale);
2398 }
2399
13542a67 2400 if (! only_ascii) {
fa9b773e 2401
59c234b4
KW
2402 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
2403 * otherwise assume the locale is UTF-8 if and only if the symbol
2404 * is non-ascii UTF-8. */
2405 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
2406 save_input_locale, is_utf8));
2407 Safefree(save_input_locale);
2408 return is_utf8;
13542a67 2409 }
fa9b773e
KW
2410 }
2411 cant_use_monetary:
2412
2413# endif /* USE_LOCALE_MONETARY */
2414#endif /* HAS_LOCALECONV */
2415
15f7e74e
KW
2416#if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
2417
2418/* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
2419 * the names of the months and weekdays, timezone, and am/pm indicator */
2420 {
2421 char *save_time_locale = NULL;
2422 int hour = 10;
2423 bool is_dst = FALSE;
2424 int dom = 1;
2425 int month = 0;
2426 int i;
2427 char * formatted_time;
2428
2429
2430 /* Like above for LC_MONETARY, we set LC_TIME to the locale of the
2431 * desired category, if it isn't that locale already */
2432
2433 if (category != LC_TIME) {
2434
2435 save_time_locale = setlocale(LC_TIME, NULL);
2436 if (! save_time_locale) {
2437 DEBUG_L(PerlIO_printf(Perl_debug_log,
2438 "Could not find current locale for LC_TIME\n"));
2439 goto cant_use_time;
2440 }
2441 save_time_locale = stdize_locale(savepv(save_time_locale));
2442
2443 if (strEQ(save_time_locale, save_input_locale)) {
2444 Safefree(save_time_locale);
2445 save_time_locale = NULL;
2446 }
2447 else if (! setlocale(LC_TIME, save_input_locale)) {
2448 DEBUG_L(PerlIO_printf(Perl_debug_log,
2449 "Could not change LC_TIME locale to %s\n",
2450 save_input_locale));
2451 Safefree(save_time_locale);
2452 goto cant_use_time;
2453 }
2454 }
2455
2456 /* Here the current LC_TIME is set to the locale of the category
2457 * whose information is desired. Look at all the days of the week and
9f10db87 2458 * month names, and the timezone and am/pm indicator for UTF-8 variant
15f7e74e
KW
2459 * characters. The first such a one found will tell us if the locale
2460 * is UTF-8 or not */
2461
2462 for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
2463 formatted_time = my_strftime("%A %B %Z %p",
2464 0, 0, hour, dom, month, 112, 0, 0, is_dst);
c5f058df
KW
2465 if ( ! formatted_time
2466 || is_utf8_invariant_string((U8 *) formatted_time, 0))
2467 {
15f7e74e
KW
2468
2469 /* Here, we didn't find a non-ASCII. Try the next time through
2470 * with the complemented dst and am/pm, and try with the next
2471 * weekday. After we have gotten all weekdays, try the next
2472 * month */
2473 is_dst = ! is_dst;
2474 hour = (hour + 12) % 24;
2475 dom++;
2476 if (i > 6) {
2477 month++;
2478 }
2479 continue;
2480 }
2481
2482 /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
2483 * false otherwise. But first, restore LC_TIME to its original
2484 * locale if we changed it */
2485 if (save_time_locale) {
2486 setlocale(LC_TIME, save_time_locale);
2487 Safefree(save_time_locale);
2488 }
2489
2490 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
2491 save_input_locale,
2492 is_utf8_string((U8 *) formatted_time, 0)));
2493 Safefree(save_input_locale);
2494 return is_utf8_string((U8 *) formatted_time, 0);
2495 }
2496
2497 /* Falling off the end of the loop indicates all the names were just
2498 * ASCII. Go on to the next test. If we changed it, restore LC_TIME
2499 * to its original locale */
2500 if (save_time_locale) {
2501 setlocale(LC_TIME, save_time_locale);
2502 Safefree(save_time_locale);
2503 }
2504 DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
2505 }
2506 cant_use_time:
2507
2508#endif
2509
2510#if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
855aeb93
JH
2511
2512/* This code is ifdefd out because it was found to not be necessary in testing
5857e934
KW
2513 * on our dromedary test machine, which has over 700 locales. There, this
2514 * added no value to looking at the currency symbol and the time strings. I
2515 * left it in so as to avoid rewriting it if real-world experience indicates
2516 * that dromedary is an outlier. Essentially, instead of returning abpve if we
855aeb93
JH
2517 * haven't found illegal utf8, we continue on and examine all the strerror()
2518 * messages on the platform for utf8ness. If all are ASCII, we still don't
2519 * know the answer; but otherwise we have a pretty good indication of the
5857e934
KW
2520 * utf8ness. The reason this doesn't help much is that the messages may not
2521 * have been translated into the locale. The currency symbol and time strings
2522 * are much more likely to have been translated. */
2523 {
855aeb93 2524 int e;
5857e934
KW
2525 bool is_utf8 = FALSE;
2526 bool non_ascii = FALSE;
855aeb93 2527 char *save_messages_locale = NULL;
5857e934 2528 const char * errmsg = NULL;
855aeb93 2529
5857e934
KW
2530 /* Like above, we set LC_MESSAGES to the locale of the desired
2531 * category, if it isn't that locale already */
855aeb93
JH
2532
2533 if (category != LC_MESSAGES) {
2534
5857e934 2535 save_messages_locale = setlocale(LC_MESSAGES, NULL);
855aeb93 2536 if (! save_messages_locale) {
5857e934
KW
2537 DEBUG_L(PerlIO_printf(Perl_debug_log,
2538 "Could not find current locale for LC_MESSAGES\n"));
855aeb93
JH
2539 goto cant_use_messages;
2540 }
5857e934 2541 save_messages_locale = stdize_locale(savepv(save_messages_locale));
855aeb93
JH
2542
2543 if (strEQ(save_messages_locale, save_input_locale)) {
5857e934
KW
2544 Safefree(save_messages_locale);
2545 save_messages_locale = NULL;
855aeb93
JH
2546 }
2547 else if (! setlocale(LC_MESSAGES, save_input_locale)) {
5857e934
KW
2548 DEBUG_L(PerlIO_printf(Perl_debug_log,
2549 "Could not change LC_MESSAGES locale to %s\n",
2550 save_input_locale));
855aeb93
JH
2551 Safefree(save_messages_locale);
2552 goto cant_use_messages;
2553 }
2554 }
2555
2556 /* Here the current LC_MESSAGES is set to the locale of the category
5857e934
KW
2557 * whose information is desired. Look through all the messages. We
2558 * can't use Strerror() here because it may expand to code that
2559 * segfaults in miniperl */
855aeb93 2560
5857e934
KW
2561 for (e = 0; e <= sys_nerr; e++) {
2562 errno = 0;
2563 errmsg = sys_errlist[e];
2564 if (errno || !errmsg) {
855aeb93
JH
2565 break;
2566 }
5857e934 2567 errmsg = savepv(errmsg);
c5f058df 2568 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
5857e934
KW
2569 non_ascii = TRUE;
2570 is_utf8 = is_utf8_string((U8 *) errmsg, 0);
2571 break;
855aeb93
JH
2572 }
2573 }
5857e934 2574 Safefree(errmsg);
855aeb93
JH
2575
2576 /* And, if we changed it, restore LC_MESSAGES to its original locale */
2577 if (save_messages_locale) {
2578 setlocale(LC_MESSAGES, save_messages_locale);
2579 Safefree(save_messages_locale);
2580 }
2581
5857e934
KW
2582 if (non_ascii) {
2583
2584 /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
2585 * any non-ascii means it is one; otherwise we assume it isn't */
2586 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
2587 save_input_locale,
2588 is_utf8));
2589 Safefree(save_input_locale);
2590 return is_utf8;
2591 }
855aeb93 2592
5857e934 2593 DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
855aeb93
JH
2594 }
2595 cant_use_messages:
2596
2597#endif
fa9b773e 2598
0080c90a
KW
2599#endif /* the code that is compiled when no nl_langinfo */
2600
92c0a900
KW
2601#ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
2602 UTF-8 locale */
97f4de96
KW
2603 /* As a last resort, look at the locale name to see if it matches
2604 * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
2605 * return of setlocale(), is actually defined to be opaque, so we can't
2606 * really rely on the absence of various substrings in the name to indicate
2607 * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
2608 * be a UTF-8 locale. Similarly for the other common names */
2609
2610 final_pos = strlen(save_input_locale) - 1;
2611 if (final_pos >= 3) {
2612 char *name = save_input_locale;
2613
2614 /* Find next 'U' or 'u' and look from there */
2615 while ((name += strcspn(name, "Uu") + 1)
2616 <= save_input_locale + final_pos - 2)
2617 {
305b8651
KW
2618 if (!isALPHA_FOLD_NE(*name, 't')
2619 || isALPHA_FOLD_NE(*(name + 1), 'f'))
97f4de96
KW
2620 {
2621 continue;
2622 }
2623 name += 2;
2624 if (*(name) == '-') {
2625 if ((name > save_input_locale + final_pos - 1)) {
2626 break;
2627 }
2628 name++;
2629 }
2630 if (*(name) == '8') {
97f4de96
KW
2631 DEBUG_L(PerlIO_printf(Perl_debug_log,
2632 "Locale %s ends with UTF-8 in name\n",
2633 save_input_locale));
00c54b9c 2634 Safefree(save_input_locale);
97f4de96
KW
2635 return TRUE;
2636 }
2637 }
2638 DEBUG_L(PerlIO_printf(Perl_debug_log,
2639 "Locale %s doesn't end with UTF-8 in name\n",
2640 save_input_locale));
2641 }
92c0a900 2642#endif
97f4de96
KW
2643
2644#ifdef WIN32
2645 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
2646 if (final_pos >= 4
2647 && *(save_input_locale + final_pos - 0) == '1'
2648 && *(save_input_locale + final_pos - 1) == '0'
2649 && *(save_input_locale + final_pos - 2) == '0'
2650 && *(save_input_locale + final_pos - 3) == '5'
2651 && *(save_input_locale + final_pos - 4) == '6')
2652 {
2653 DEBUG_L(PerlIO_printf(Perl_debug_log,
2654 "Locale %s ends with 10056 in name, is UTF-8 locale\n",
2655 save_input_locale));
2656 Safefree(save_input_locale);
2657 return TRUE;
2658 }
2659#endif
2660
2661 /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
2662 * since we are about to return FALSE anyway, there is no point in doing
2663 * this extra work */
2664#if 0
2665 if (instr(save_input_locale, "8859")) {
2666 DEBUG_L(PerlIO_printf(Perl_debug_log,
2667 "Locale %s has 8859 in name, not UTF-8 locale\n",
2668 save_input_locale));
2669 Safefree(save_input_locale);
2670 return FALSE;
2671 }
2672#endif
2673
69014004
KW
2674 DEBUG_L(PerlIO_printf(Perl_debug_log,
2675 "Assuming locale %s is not a UTF-8 locale\n",
2676 save_input_locale));
fa9b773e 2677 Safefree(save_input_locale);
7d74bb61
KW
2678 return FALSE;
2679}
2680
8ef6e574 2681#endif
7d74bb61 2682
d6ded950
KW
2683
2684bool
2685Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
2686{
1a4f13e1 2687 dVAR;
d6ded950
KW
2688 /* Internal function which returns if we are in the scope of a pragma that
2689 * enables the locale category 'category'. 'compiling' should indicate if
2690 * this is during the compilation phase (TRUE) or not (FALSE). */
2691
2692 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
2693
2694 SV *categories = cop_hints_fetch_pvs(cop, "locale", 0);
2695 if (! categories || categories == &PL_sv_placeholder) {
2696 return FALSE;
2697 }
2698
2699 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
2700 * a valid unsigned */
2701 assert(category >= -1);
2702 return cBOOL(SvUV(categories) & (1U << (category + 1)));
2703}
2704
2c6ee1a7 2705char *
6ebbc862
KW
2706Perl_my_strerror(pTHX_ const int errnum)
2707{
2708 /* Returns a mortalized copy of the text of the error message associated
2709 * with 'errnum'. It uses the current locale's text unless the platform
2710 * doesn't have the LC_MESSAGES category or we are not being called from
2711 * within the scope of 'use locale'. In the former case, it uses whatever
2712 * strerror returns; in the latter case it uses the text from the C locale.
2713 *
2714 * The function just calls strerror(), but temporarily switches, if needed,
2715 * to the C locale */
2716
2717 char *errstr;
2718
2719#ifdef USE_LOCALE_MESSAGES /* If platform doesn't have messages category, we
2720 don't do any switching to the C locale; we just
2721 use whatever strerror() returns */
2722 const bool within_locale_scope = IN_LC(LC_MESSAGES);
2723
a0b53297 2724 dVAR;
2c6ee1a7 2725
6ebbc862 2726# ifdef USE_THREAD_SAFE_LOCALE
fcd0e682 2727 locale_t save_locale = NULL;
6ebbc862 2728# else
fcd0e682 2729 char * save_locale = NULL;
c9dda6da 2730 bool locale_is_C = FALSE;
2c6ee1a7 2731
6ebbc862
KW
2732 /* We have a critical section to prevent another thread from changing the
2733 * locale out from under us (or zapping the buffer returned from
2734 * setlocale() ) */
2735 LOCALE_LOCK;
2736
2737# endif
2738
9c8a6dc2
KW
2739 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2740 "my_strerror called with errnum %d\n", errnum));
6ebbc862 2741 if (! within_locale_scope) {
c9dda6da 2742 errno = 0;
a0b53297 2743
6ebbc862
KW
2744# ifdef USE_THREAD_SAFE_LOCALE /* Use the thread-safe locale functions */
2745
9c8a6dc2
KW
2746 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2747 "Not within locale scope, about to call"
2748 " uselocale(0x%p)\n", PL_C_locale_obj));
6ebbc862 2749 save_locale = uselocale(PL_C_locale_obj);
c9dda6da
KW
2750 if (! save_locale) {
2751 DEBUG_L(PerlIO_printf(Perl_debug_log,
9c8a6dc2
KW
2752 "uselocale failed, errno=%d\n", errno));
2753 }
2754 else {
2755 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2756 "uselocale returned 0x%p\n", save_locale));
c9dda6da 2757 }
6ebbc862
KW
2758
2759# else /* Not thread-safe build */
a0b53297
KW
2760
2761 save_locale = setlocale(LC_MESSAGES, NULL);
c9dda6da
KW
2762 if (! save_locale) {
2763 DEBUG_L(PerlIO_printf(Perl_debug_log,
2764 "setlocale failed, errno=%d\n", errno));
2765 }
2766 else {
2767 locale_is_C = isNAME_C_OR_POSIX(save_locale);
2c6ee1a7 2768
c9dda6da
KW
2769 /* Switch to the C locale if not already in it */
2770 if (! locale_is_C) {
2c6ee1a7 2771
c9dda6da
KW
2772 /* The setlocale() just below likely will zap 'save_locale', so
2773 * create a copy. */
2774 save_locale = savepv(save_locale);
2775 setlocale(LC_MESSAGES, "C");
2776 }
6ebbc862 2777 }
2c6ee1a7 2778
6ebbc862 2779# endif
2c6ee1a7 2780
6ebbc862 2781 } /* end of ! within_locale_scope */
9c8a6dc2
KW
2782 else {
2783 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n",
2784 __FILE__, __LINE__));
2785 }
a0b53297 2786
6ebbc862 2787#endif
a0b53297 2788
9c8a6dc2
KW
2789 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2790 "Any locale change has been done; about to call Strerror\n"));
6ebbc862
KW
2791 errstr = Strerror(errnum);
2792 if (errstr) {
866a55c1
KW
2793
2794#ifdef DEBUGGING
2795
9c8a6dc2
KW
2796 if (DEBUG_Lv_TEST) {
2797 PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '");
2798 print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);
2799 PerlIO_printf(Perl_debug_log, "'\n");
2800 }
2801
866a55c1
KW
2802#endif
2803
6ebbc862
KW
2804 errstr = savepv(errstr);
2805 SAVEFREEPV(errstr);
2806 }
2807
2808#ifdef USE_LOCALE_MESSAGES
2809
2810 if (! within_locale_scope) {
c9dda6da 2811 errno = 0;
a0b53297 2812
6ebbc862
KW
2813# ifdef USE_THREAD_SAFE_LOCALE
2814
9c8a6dc2
KW
2815 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2816 "%s: %d: not within locale scope, restoring the locale\n",
2817 __FILE__, __LINE__));
c9dda6da
KW
2818 if (save_locale && ! uselocale(save_locale)) {
2819 DEBUG_L(PerlIO_printf(Perl_debug_log,
2820 "uselocale restore failed, errno=%d\n", errno));
2821 }
2c6ee1a7 2822 }
6ebbc862
KW
2823
2824# else
2825
c9dda6da
KW
2826 if (save_locale && ! locale_is_C) {
2827 if (! setlocale(LC_MESSAGES, save_locale)) {
2828 DEBUG_L(PerlIO_printf(Perl_debug_log,
2829 "setlocale restore failed, errno=%d\n", errno));
2830 }
6ebbc862
KW
2831 Safefree(save_locale);
2832 }
2833 }
2834
2835 LOCALE_UNLOCK;
2836
2837# endif
2c6ee1a7
KW
2838#endif
2839
6ebbc862 2840 return errstr;
2c6ee1a7
KW
2841}
2842
66610fdd 2843/*
747c467a
KW
2844
2845=head1 Locale-related functions and macros
2846
2847=for apidoc sync_locale
2848
2849Changing the program's locale should be avoided by XS code. Nevertheless,
2850certain non-Perl libraries called from XS, such as C<Gtk> do so. When this
2851happens, Perl needs to be told that the locale has changed. Use this function
2852to do so, before returning to Perl.
2853
2854=cut
2855*/
2856
2857void
2858Perl_sync_locale(pTHX)
2859{
2860
2861#ifdef USE_LOCALE_CTYPE
2862 new_ctype(setlocale(LC_CTYPE, NULL));
2863#endif /* USE_LOCALE_CTYPE */
2864
2865#ifdef USE_LOCALE_COLLATE
2866 new_collate(setlocale(LC_COLLATE, NULL));
2867#endif
2868
2869#ifdef USE_LOCALE_NUMERIC
2870 set_numeric_local(); /* Switch from "C" to underlying LC_NUMERIC */
2871 new_numeric(setlocale(LC_NUMERIC, NULL));
2872#endif /* USE_LOCALE_NUMERIC */
2873
2874}
2875
5d1187d1
KW
2876#if defined(DEBUGGING) && defined(USE_LOCALE)
2877
a4f00dcc
KW
2878STATIC char *
2879S_setlocale_debug_string(const int category, /* category number,
5d1187d1
KW
2880 like LC_ALL */
2881 const char* const locale, /* locale name */
2882
2883 /* return value from setlocale() when attempting to
2884 * set 'category' to 'locale' */
2885 const char* const retval)
2886{
2887 /* Returns a pointer to a NUL-terminated string in static storage with
2888 * added text about the info passed in. This is not thread safe and will
2889 * be overwritten by the next call, so this should be used just to
fa07b8e5 2890 * formulate a string to immediately print or savepv() on. */
5d1187d1 2891
398a990f
DM
2892 /* initialise to a non-null value to keep it out of BSS and so keep
2893 * -DPERL_GLOBAL_STRUCT_PRIVATE happy */
60b45a7d
KW
2894 static char ret[128] = "If you can read this, thank your buggy C"
2895 " library strlcpy(), and change your hints file"
2896 " to undef it";
fa07b8e5 2897 my_strlcpy(ret, "setlocale(", sizeof(ret));
5d1187d1
KW
2898
2899 switch (category) {
2900 default:
fa07b8e5 2901 my_snprintf(ret, sizeof(ret), "%s? %d", ret, category);
5d1187d1
KW
2902 break;
2903# ifdef LC_ALL
2904 case LC_ALL:
fa07b8e5 2905 my_strlcat(ret, "LC_ALL", sizeof(ret));
5d1187d1
KW
2906 break;
2907# endif
2908# ifdef LC_CTYPE
2909 case LC_CTYPE:
fa07b8e5 2910 my_strlcat(ret, "LC_CTYPE", sizeof(ret));
5d1187d1
KW
2911 break;
2912# endif
2913# ifdef LC_NUMERIC
2914 case LC_NUMERIC:
fa07b8e5 2915 my_strlcat(ret, "LC_NUMERIC", sizeof(ret));
5d1187d1
KW
2916 break;
2917# endif
2918# ifdef LC_COLLATE
2919 case LC_COLLATE:
fa07b8e5 2920 my_strlcat(ret, "LC_COLLATE", sizeof(ret));
5d1187d1
KW
2921 break;
2922# endif
2923# ifdef LC_TIME
2924 case LC_TIME:
fa07b8e5 2925 my_strlcat(ret, "LC_TIME", sizeof(ret));
5d1187d1
KW
2926 break;
2927# endif
2928# ifdef LC_MONETARY
2929 case LC_MONETARY:
fa07b8e5 2930 my_strlcat(ret, "LC_MONETARY", sizeof(ret));
5d1187d1
KW
2931 break;
2932# endif
2933# ifdef LC_MESSAGES
2934 case LC_MESSAGES:
fa07b8e5 2935 my_strlcat(ret, "LC_MESSAGES", sizeof(ret));
5d1187d1
KW
2936 break;
2937# endif
2938 }
2939
fa07b8e5 2940 my_strlcat(ret, ", ", sizeof(ret));
5d1187d1
KW
2941
2942 if (locale) {
fa07b8e5
KW
2943 my_strlcat(ret, "\"", sizeof(ret));
2944 my_strlcat(ret, locale, sizeof(ret));
2945 my_strlcat(ret, "\"", sizeof(ret));
5d1187d1
KW
2946 }
2947 else {
fa07b8e5 2948 my_strlcat(ret, "NULL", sizeof(ret));
5d1187d1
KW
2949 }
2950
fa07b8e5 2951 my_strlcat(ret, ") returned ", sizeof(ret));
5d1187d1
KW
2952
2953 if (retval) {
fa07b8e5
KW
2954 my_strlcat(ret, "\"", sizeof(ret));
2955 my_strlcat(ret, retval, sizeof(ret));
2956 my_strlcat(ret, "\"", sizeof(ret));
5d1187d1
KW
2957 }
2958 else {
fa07b8e5 2959 my_strlcat(ret, "NULL", sizeof(ret));
5d1187d1
KW
2960 }
2961
2962 assert(strlen(ret) < sizeof(ret));
2963
2964 return ret;
2965}
2966
2967#endif
747c467a
KW
2968
2969
2970/*
14d04a33 2971 * ex: set ts=8 sts=4 sw=4 et:
37442d52 2972 */