This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Teach perl about more locale categories
[perl5.git] / locale.c
CommitLineData
98994639
HS
1/* locale.c
2 *
1129b882
NC
3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
98994639
HS
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550 12 * A Elbereth Gilthoniel,
cdad3b53 13 * silivren penna míriel
4ac71550 14 * o menel aglar elenath!
cdad3b53 15 * Na-chaered palan-díriel
4ac71550
TC
16 * o galadhremmin ennorath,
17 * Fanuilos, le linnathon
18 * nef aear, si nef aearon!
19 *
20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
98994639
HS
21 */
22
166f8a29
DM
23/* utility functions for handling locale-specific stuff like what
24 * character represents the decimal point.
0d071d52 25 *
7d4bcc4a
KW
26 * All C programs have an underlying locale. Perl code generally doesn't pay
27 * any attention to it except within the scope of a 'use locale'. For most
0d071d52
KW
28 * categories, it accomplishes this by just using different operations if it is
29 * in such scope than if not. However, various libc functions called by Perl
30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31 * are used to toggle between the current locale and the C locale depending on
a9ad02a8
KW
32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is
33 * switched to the C locale for outputting the message unless within the scope
34 * of 'use locale'.
166f8a29
DM
35 */
36
98994639
HS
37#include "EXTERN.h"
38#define PERL_IN_LOCALE_C
f7416781 39#include "perl_langinfo.h"
98994639
HS
40#include "perl.h"
41
a4af207c
JH
42#include "reentr.h"
43
2fcc0ca9
KW
44/* If the environment says to, we can output debugging information during
45 * initialization. This is done before option parsing, and before any thread
46 * creation, so can be a file-level static */
5a4b0634
KW
47#if ! defined(DEBUGGING) || defined(PERL_GLOBAL_STRUCT)
48# define debug_initialization 0
49# define DEBUG_INITIALIZATION_set(v)
50#else
2fcc0ca9 51static bool debug_initialization = FALSE;
5a4b0634 52# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
2fcc0ca9
KW
53#endif
54
291a84fb
KW
55/* strlen() of a literal string constant. We might want this more general,
56 * but using it in just this file for now. A problem with more generality is
57 * the compiler warnings about comparing unlike signs */
ff1b739b
KW
58#define STRLENs(s) (sizeof("" s "") - 1)
59
63e5b0d7
KW
60/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
61 * return of setlocale(), then this is extremely likely to be the C or POSIX
62 * locale. However, the output of setlocale() is documented to be opaque, but
63 * the odds are extremely small that it would return these two strings for some
64 * other locale. Note that VMS in these two locales includes many non-ASCII
65 * characters as controls and punctuation (below are hex bytes):
66 * cntrl: 84-97 9B-9F
67 * punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
68 * Oddly, none there are listed as alphas, though some represent alphabetics
69 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
70#define isNAME_C_OR_POSIX(name) \
71 ( (name) != NULL \
72 && (( *(name) == 'C' && (*(name + 1)) == '\0') \
73 || strEQ((name), "POSIX")))
74
8ef6e574
KW
75#ifdef USE_LOCALE
76
98994639 77/*
0d071d52
KW
78 * Standardize the locale name from a string returned by 'setlocale', possibly
79 * modifying that string.
98994639 80 *
0ef2a2b2 81 * The typical return value of setlocale() is either
98994639
HS
82 * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
83 * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
84 * (the space-separated values represent the various sublocales,
0ef2a2b2 85 * in some unspecified order). This is not handled by this function.
98994639
HS
86 *
87 * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
0ef2a2b2
KW
88 * which is harmful for further use of the string in setlocale(). This
89 * function removes the trailing new line and everything up through the '='
98994639
HS
90 *
91 */
92STATIC char *
93S_stdize_locale(pTHX_ char *locs)
94{
7452cf6a 95 const char * const s = strchr(locs, '=');
98994639
HS
96 bool okay = TRUE;
97
7918f24d
NC
98 PERL_ARGS_ASSERT_STDIZE_LOCALE;
99
8772537c
AL
100 if (s) {
101 const char * const t = strchr(s, '.');
98994639 102 okay = FALSE;
8772537c
AL
103 if (t) {
104 const char * const u = strchr(t, '\n');
105 if (u && (u[1] == 0)) {
106 const STRLEN len = u - s;
107 Move(s + 1, locs, len, char);
108 locs[len] = 0;
109 okay = TRUE;
98994639
HS
110 }
111 }
112 }
113
114 if (!okay)
115 Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
116
117 return locs;
118}
119
e5f10d49
KW
120/* Two parallel arrays; first the locale categories Perl uses on this system;
121 * the second array is their names. These arrays are in mostly arbitrary
122 * order. */
123
124const int categories[] = {
125
126# ifdef USE_LOCALE_NUMERIC
127 LC_NUMERIC,
128# endif
129# ifdef USE_LOCALE_CTYPE
130 LC_CTYPE,
131# endif
132# ifdef USE_LOCALE_COLLATE
133 LC_COLLATE,
134# endif
135# ifdef USE_LOCALE_TIME
136 LC_TIME,
137# endif
138# ifdef USE_LOCALE_MESSAGES
139 LC_MESSAGES,
140# endif
141# ifdef USE_LOCALE_MONETARY
142 LC_MONETARY,
143# endif
9821811f
KW
144# ifdef USE_LOCALE_ADDRESS
145 LC_ADDRESS,
146# endif
147# ifdef USE_LOCALE_IDENTIFICATION
148 LC_IDENTIFICATION,
149# endif
150# ifdef USE_LOCALE_MEASUREMENT
151 LC_MEASUREMENT,
152# endif
153# ifdef USE_LOCALE_PAPER
154 LC_PAPER,
155# endif
156# ifdef USE_LOCALE_TELEPHONE
157 LC_TELEPHONE,
158# endif
e5f10d49
KW
159# ifdef LC_ALL
160 LC_ALL,
161# endif
162 -1 /* Placeholder because C doesn't allow a
163 trailing comma, and it would get complicated
164 with all the #ifdef's */
165};
166
167/* The top-most real element is LC_ALL */
168
169const char * category_names[] = {
170
171# ifdef USE_LOCALE_NUMERIC
172 "LC_NUMERIC",
173# endif
174# ifdef USE_LOCALE_CTYPE
175 "LC_CTYPE",
176# endif
177# ifdef USE_LOCALE_COLLATE
178 "LC_COLLATE",
179# endif
180# ifdef USE_LOCALE_TIME
181 "LC_TIME",
182# endif
183# ifdef USE_LOCALE_MESSAGES
184 "LC_MESSAGES",
185# endif
186# ifdef USE_LOCALE_MONETARY
187 "LC_MONETARY",
188# endif
9821811f
KW
189# ifdef USE_LOCALE_ADDRESS
190 "LC_ADDRESS",
191# endif
192# ifdef USE_LOCALE_IDENTIFICATION
193 "LC_IDENTIFICATION",
194# endif
195# ifdef USE_LOCALE_MEASUREMENT
196 "LC_MEASUREMENT",
197# endif
198# ifdef USE_LOCALE_PAPER
199 "LC_PAPER",
200# endif
201# ifdef USE_LOCALE_TELEPHONE
202 "LC_TELEPHONE",
203# endif
e5f10d49
KW
204# ifdef LC_ALL
205 "LC_ALL",
206# endif
207 NULL /* Placeholder */
208 };
209
210# ifdef LC_ALL
211
212 /* On systems with LC_ALL, it is kept in the highest index position. (-2
213 * to account for the final unused placeholder element.) */
214# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
215
216# else
217
218 /* On systems without LC_ALL, we pretend it is there, one beyond the real
219 * top element, hence in the unused placeholder element. */
220# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
221
222# endif
223
224/* Pretending there is an LC_ALL element just above allows us to avoid most
225 * special cases. Most loops through these arrays in the code below are
226 * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'. They will work
227 * on either type of system. But the code must be written to not access the
948523db
KW
228 * element at 'LC_ALL_INDEX' except on platforms that have it. This can be
229 * checked for at compile time by using the #define LC_ALL_INDEX which is only
230 * defined if we do have LC_ALL. */
e5f10d49 231
b09aaf40
KW
232STATIC const char *
233S_category_name(const int category)
234{
235 unsigned int i;
236
237#ifdef LC_ALL
238
239 if (category == LC_ALL) {
240 return "LC_ALL";
241 }
242
243#endif
244
245 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
246 if (category == categories[i]) {
247 return category_names[i];
248 }
249 }
250
251 {
252 const char suffix[] = " (unknown)";
253 int temp = category;
254 Size_t length = sizeof(suffix) + 1;
255 char * unknown;
256 dTHX;
257
258 if (temp < 0) {
259 length++;
260 temp = - temp;
261 }
262
263 /* Calculate the number of digits */
264 while (temp >= 10) {
265 temp /= 10;
266 length++;
267 }
268
269 Newx(unknown, length, char);
270 my_snprintf(unknown, length, "%d%s", category, suffix);
271 SAVEFREEPV(unknown);
272 return unknown;
273 }
274}
275
948523db
KW
276/* Now create LC_foo_INDEX #defines for just those categories on this system */
277# ifdef USE_LOCALE_NUMERIC
278# define LC_NUMERIC_INDEX 0
279# define _DUMMY_NUMERIC LC_NUMERIC_INDEX
280# else
281# define _DUMMY_NUMERIC -1
282# endif
283# ifdef USE_LOCALE_CTYPE
284# define LC_CTYPE_INDEX _DUMMY_NUMERIC + 1
285# define _DUMMY_CTYPE LC_CTYPE_INDEX
286# else
287# define _DUMMY_CTYPE _DUMMY_NUMERIC
288# endif
289# ifdef USE_LOCALE_COLLATE
290# define LC_COLLATE_INDEX _DUMMY_CTYPE + 1
291# define _DUMMY_COLLATE LC_COLLATE_INDEX
292# else
293# define _DUMMY_COLLATE _DUMMY_COLLATE
294# endif
295# ifdef USE_LOCALE_TIME
296# define LC_TIME_INDEX _DUMMY_COLLATE + 1
297# define _DUMMY_TIME LC_TIME_INDEX
298# else
299# define _DUMMY_TIME _DUMMY_COLLATE
300# endif
301# ifdef USE_LOCALE_MESSAGES
302# define LC_MESSAGES_INDEX _DUMMY_TIME + 1
303# define _DUMMY_MESSAGES LC_MESSAGES_INDEX
304# else
305# define _DUMMY_MESSAGES _DUMMY_TIME
306# endif
307# ifdef USE_LOCALE_MONETARY
308# define LC_MONETARY_INDEX _DUMMY_MESSAGES + 1
309# define _DUMMY_MONETARY LC_MONETARY_INDEX
310# else
311# define _DUMMY_MONETARY _DUMMY_MESSAGES
312# endif
9821811f
KW
313# ifdef USE_LOCALE_ADDRESS
314# define LC_ADDRESS_INDEX _DUMMY_MONETARY + 1
315# define _DUMMY_ADDRESS LC_ADDRESS_INDEX
316# else
317# define _DUMMY_ADDRESS _DUMMY_MONETARY
318# endif
319# ifdef USE_LOCALE_IDENTIFICATION
320# define LC_IDENTIFICATION_INDEX _DUMMY_ADDRESS + 1
321# define _DUMMY_IDENTIFICATION LC_IDENTIFICATION_INDEX
322# else
323# define _DUMMY_IDENTIFICATION _DUMMY_ADDRESS
324# endif
325# ifdef USE_LOCALE_MEASUREMENT
326# define LC_MEASUREMENT_INDEX _DUMMY_IDENTIFICATION + 1
327# define _DUMMY_MEASUREMENT LC_MEASUREMENT_INDEX
328# else
329# define _DUMMY_MEASUREMENT _DUMMY_IDENTIFICATION
330# endif
331# ifdef USE_LOCALE_PAPER
332# define LC_PAPER_INDEX _DUMMY_MEASUREMENT + 1
333# define _DUMMY_PAPER LC_PAPER_INDEX
334# else
335# define _DUMMY_PAPER _DUMMY_MEASUREMENT
336# endif
337# ifdef USE_LOCALE_TELEPHONE
338# define LC_TELEPHONE_INDEX _DUMMY_PAPER + 1
339# define _DUMMY_TELEPHONE LC_TELEPHONE_INDEX
340# else
341# define _DUMMY_TELEPHONE _DUMMY_PAPER
342# endif
948523db 343# ifdef LC_ALL
9821811f 344# define LC_ALL_INDEX _DUMMY_TELEPHONE + 1
948523db
KW
345# endif
346#endif /* ifdef USE_LOCALE */
8ef6e574 347
d2b24094
KW
348/* Windows requres a customized base-level setlocale() */
349# ifdef WIN32
350# define my_setlocale(cat, locale) win32_setlocale(cat, locale)
351# else
352# define my_setlocale(cat, locale) setlocale(cat, locale)
353# endif
354
837ce802
KW
355/* Just placeholders for now. "_c" is intended to be called when the category
356 * is a constant known at compile time; "_r", not known until run time */
d2b24094
KW
357# define do_setlocale_c(category, locale) my_setlocale(category, locale)
358# define do_setlocale_r(category, locale) my_setlocale(category, locale)
837ce802 359
a4f00dcc 360STATIC void
86799d2d 361S_set_numeric_radix(pTHX_ const bool use_locale)
98994639 362{
86799d2d
KW
363 /* If 'use_locale' is FALSE, set to use a dot for the radix character. If
364 * TRUE, use the radix character derived from the current locale */
7d4bcc4a 365
86799d2d
KW
366#if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \
367 || defined(HAS_NL_LANGINFO))
98994639 368
86799d2d
KW
369 /* We only set up the radix SV if we are to use a locale radix ... */
370 if (use_locale) {
371 const char * radix = my_nl_langinfo(PERL_RADIXCHAR, FALSE);
372 /* FALSE => already in dest locale */
86799d2d
KW
373 /* ... and the character being used isn't a dot */
374 if (strNE(radix, ".")) {
2213a3be
KW
375 const U8 * first_variant;
376
86799d2d
KW
377 if (PL_numeric_radix_sv) {
378 sv_setpv(PL_numeric_radix_sv, radix);
379 }
380 else {
381 PL_numeric_radix_sv = newSVpv(radix, 0);
382 }
383
2213a3be
KW
384 /* If there is a byte variant under UTF-8, and if the remainder of
385 * the string starting there is valid UTF-8, and we are in a UTF-8
386 * locale, then mark the radix as being in UTF-8 */
387 if ( ! is_utf8_invariant_string_loc(
388 (U8 *) SvPVX(PL_numeric_radix_sv),
389 SvCUR(PL_numeric_radix_sv),
390 &first_variant)
391 && is_utf8_string(first_variant,
392 SvCUR(PL_numeric_radix_sv)
393 - ((char *) first_variant
394 - SvPVX(PL_numeric_radix_sv)))
c1284011 395 && _is_cur_LC_category_utf8(LC_NUMERIC))
28acfe03 396 {
86799d2d 397 SvUTF8_on(PL_numeric_radix_sv);
28acfe03 398 }
86799d2d
KW
399 goto done;
400 }
98994639 401 }
69014004 402
86799d2d
KW
403 SvREFCNT_dec(PL_numeric_radix_sv);
404 PL_numeric_radix_sv = NULL;
405
406 done: ;
407
408# ifdef DEBUGGING
7d4bcc4a 409
2fcc0ca9
KW
410 if (DEBUG_L_TEST || debug_initialization) {
411 PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
69014004 412 (PL_numeric_radix_sv)
37b7e435
KW
413 ? SvPVX(PL_numeric_radix_sv)
414 : "NULL",
415 (PL_numeric_radix_sv)
39eb7305 416 ? cBOOL(SvUTF8(PL_numeric_radix_sv))
2fcc0ca9
KW
417 : 0);
418 }
69014004 419
86799d2d
KW
420# endif
421#endif /* USE_LOCALE_NUMERIC and can find the radix char */
7d4bcc4a 422
98994639
HS
423}
424
a39edc4c 425
98994639 426void
8772537c 427Perl_new_numeric(pTHX_ const char *newnum)
98994639 428{
7d4bcc4a
KW
429
430#ifndef USE_LOCALE_NUMERIC
431
432 PERL_UNUSED_ARG(newnum);
433
434#else
0d071d52 435
291a84fb 436 /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell
0d071d52
KW
437 * core Perl this and that 'newnum' is the name of the new locale.
438 * It installs this locale as the current underlying default.
439 *
440 * The default locale and the C locale can be toggled between by use of the
5792c642
KW
441 * set_numeric_underlying() and set_numeric_standard() functions, which
442 * should probably not be called directly, but only via macros like
0d071d52
KW
443 * SET_NUMERIC_STANDARD() in perl.h.
444 *
445 * The toggling is necessary mainly so that a non-dot radix decimal point
446 * character can be output, while allowing internal calculations to use a
447 * dot.
448 *
449 * This sets several interpreter-level variables:
bb304765 450 * PL_numeric_name The underlying locale's name: a copy of 'newnum'
892e6465 451 * PL_numeric_underlying A boolean indicating if the toggled state is such
7738054c
KW
452 * that the current locale is the program's underlying
453 * locale
454 * PL_numeric_standard An int indicating if the toggled state is such
455 * that the current locale is the C locale. If non-zero,
456 * it is in C; if > 1, it means it may not be toggled away
457 * from C.
0d071d52
KW
458 * Note that both of the last two variables can be true at the same time,
459 * if the underlying locale is C. (Toggling is a no-op under these
460 * circumstances.)
461 *
462 * Any code changing the locale (outside this file) should use
463 * POSIX::setlocale, which calls this function. Therefore this function
464 * should be called directly only from this file and from
465 * POSIX::setlocale() */
466
b03f34cf 467 char *save_newnum;
98994639
HS
468
469 if (! newnum) {
43c5f42d
NC
470 Safefree(PL_numeric_name);
471 PL_numeric_name = NULL;
98994639 472 PL_numeric_standard = TRUE;
892e6465 473 PL_numeric_underlying = TRUE;
98994639
HS
474 return;
475 }
476
b03f34cf 477 save_newnum = stdize_locale(savepv(newnum));
abe1abcf
KW
478
479 PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
892e6465 480 PL_numeric_underlying = TRUE;
abe1abcf 481
b03f34cf 482 if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
98994639 483 Safefree(PL_numeric_name);
b03f34cf 484 PL_numeric_name = save_newnum;
b03f34cf 485 }
abe1abcf
KW
486 else {
487 Safefree(save_newnum);
488 }
4c28b29c
KW
489
490 /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't
491 * have to worry about the radix being a non-dot. (Core operations that
492 * need the underlying locale change to it temporarily). */
493 set_numeric_standard();
494
98994639 495#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 496
98994639
HS
497}
498
499void
500Perl_set_numeric_standard(pTHX)
501{
7d4bcc4a 502
98994639 503#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 504
28c1bf33
KW
505 /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like
506 * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The
507 * macro avoids calling this routine if toggling isn't necessary according
508 * to our records (which could be wrong if some XS code has changed the
509 * locale behind our back) */
0d071d52 510
837ce802 511 do_setlocale_c(LC_NUMERIC, "C");
a9b8c0d8 512 PL_numeric_standard = TRUE;
892e6465 513 PL_numeric_underlying = isNAME_C_OR_POSIX(PL_numeric_name);
86799d2d 514 set_numeric_radix(0);
7d4bcc4a
KW
515
516# ifdef DEBUGGING
517
2fcc0ca9
KW
518 if (DEBUG_L_TEST || debug_initialization) {
519 PerlIO_printf(Perl_debug_log,
58e4a467 520 "LC_NUMERIC locale now is standard C\n");
2fcc0ca9 521 }
98994639 522
7d4bcc4a 523# endif
98994639 524#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 525
98994639
HS
526}
527
528void
5792c642 529Perl_set_numeric_underlying(pTHX)
98994639 530{
7d4bcc4a 531
98994639 532#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 533
28c1bf33 534 /* Toggle the LC_NUMERIC locale to the current underlying default. Most
7d4bcc4a
KW
535 * code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h
536 * instead of calling this directly. The macro avoids calling this routine
537 * if toggling isn't necessary according to our records (which could be
538 * wrong if some XS code has changed the locale behind our back) */
a9b8c0d8 539
837ce802 540 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
a9b8c0d8 541 PL_numeric_standard = isNAME_C_OR_POSIX(PL_numeric_name);
892e6465 542 PL_numeric_underlying = TRUE;
86799d2d 543 set_numeric_radix(1);
7d4bcc4a
KW
544
545# ifdef DEBUGGING
546
2fcc0ca9
KW
547 if (DEBUG_L_TEST || debug_initialization) {
548 PerlIO_printf(Perl_debug_log,
58e4a467 549 "LC_NUMERIC locale now is %s\n",
2fcc0ca9
KW
550 PL_numeric_name);
551 }
98994639 552
7d4bcc4a 553# endif
98994639 554#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 555
98994639
HS
556}
557
558/*
559 * Set up for a new ctype locale.
560 */
a4f00dcc
KW
561STATIC void
562S_new_ctype(pTHX_ const char *newctype)
98994639 563{
7d4bcc4a
KW
564
565#ifndef USE_LOCALE_CTYPE
566
567 PERL_ARGS_ASSERT_NEW_CTYPE;
568 PERL_UNUSED_ARG(newctype);
569 PERL_UNUSED_CONTEXT;
570
571#else
0d071d52 572
291a84fb 573 /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
0d071d52
KW
574 * core Perl this and that 'newctype' is the name of the new locale.
575 *
576 * This function sets up the folding arrays for all 256 bytes, assuming
577 * that tofold() is tolc() since fold case is not a concept in POSIX,
578 *
579 * Any code changing the locale (outside this file) should use
580 * POSIX::setlocale, which calls this function. Therefore this function
581 * should be called directly only from this file and from
582 * POSIX::setlocale() */
583
27da23d5 584 dVAR;
68067e4e 585 UV i;
98994639 586
7918f24d
NC
587 PERL_ARGS_ASSERT_NEW_CTYPE;
588
215c5139
KW
589 /* We will replace any bad locale warning with 1) nothing if the new one is
590 * ok; or 2) a new warning for the bad new locale */
591 if (PL_warn_locale) {
592 SvREFCNT_dec_NN(PL_warn_locale);
593 PL_warn_locale = NULL;
594 }
595
c1284011 596 PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
31f05a37
KW
597
598 /* A UTF-8 locale gets standard rules. But note that code still has to
599 * handle this specially because of the three problematic code points */
600 if (PL_in_utf8_CTYPE_locale) {
601 Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
602 }
603 else {
8c6180a9
KW
604 /* Assume enough space for every character being bad. 4 spaces each
605 * for the 94 printable characters that are output like "'x' "; and 5
606 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
607 * NUL */
608 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ];
609
cc9eaeb0
KW
610 /* Don't check for problems if we are suppressing the warnings */
611 bool check_for_problems = ckWARN_d(WARN_LOCALE)
612 || UNLIKELY(DEBUG_L_TEST);
8c6180a9
KW
613 bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
614 to start */
615 unsigned int bad_count = 0; /* Count of bad characters */
616
baa60164 617 for (i = 0; i < 256; i++) {
5a8a8f21
KW
618 if (isupper(i))
619 PL_fold_locale[i] = (U8) tolower(i);
620 else if (islower(i))
621 PL_fold_locale[i] = (U8) toupper(i);
baa60164
KW
622 else
623 PL_fold_locale[i] = (U8) i;
8c6180a9
KW
624
625 /* If checking for locale problems, see if the native ASCII-range
626 * printables plus \n and \t are in their expected categories in
627 * the new locale. If not, this could mean big trouble, upending
628 * Perl's and most programs' assumptions, like having a
629 * metacharacter with special meaning become a \w. Fortunately,
630 * it's very rare to find locales that aren't supersets of ASCII
631 * nowadays. It isn't a problem for most controls to be changed
632 * into something else; we check only \n and \t, though perhaps \r
633 * could be an issue as well. */
7d4bcc4a 634 if ( check_for_problems
8c6180a9
KW
635 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
636 {
adf947ba
KW
637 if ( cBOOL(isalnum(i)) != cBOOL(isALPHANUMERIC(i))
638 || cBOOL(isalpha(i)) != cBOOL(isALPHA_A(i))
639 || cBOOL(isdigit(i)) != cBOOL(isDIGIT_A(i))
640 || cBOOL(isgraph(i)) != cBOOL(isGRAPH_A(i))
641 || cBOOL(islower(i)) != cBOOL(isLOWER_A(i))
642 || cBOOL(isprint(i)) != cBOOL(isPRINT_A(i))
643 || cBOOL(ispunct(i)) != cBOOL(isPUNCT_A(i))
644 || cBOOL(isspace(i)) != cBOOL(isSPACE_A(i))
645 || cBOOL(isupper(i)) != cBOOL(isUPPER_A(i))
646 || cBOOL(isxdigit(i))!= cBOOL(isXDIGIT_A(i))
647 || tolower(i) != (int) toLOWER_A(i)
648 || toupper(i) != (int) toUPPER_A(i)
649 || (i == '\n' && ! isCNTRL_LC(i)))
8c6180a9
KW
650 {
651 if (bad_count) { /* Separate multiple entries with a
652 blank */
653 bad_chars_list[bad_count++] = ' ';
654 }
655 bad_chars_list[bad_count++] = '\'';
656 if (isPRINT_A(i)) {
657 bad_chars_list[bad_count++] = (char) i;
658 }
659 else {
660 bad_chars_list[bad_count++] = '\\';
661 if (i == '\n') {
662 bad_chars_list[bad_count++] = 'n';
663 }
664 else {
665 assert(i == '\t');
666 bad_chars_list[bad_count++] = 't';
667 }
668 }
669 bad_chars_list[bad_count++] = '\'';
670 bad_chars_list[bad_count] = '\0';
671 }
672 }
673 }
674
7d4bcc4a
KW
675# ifdef MB_CUR_MAX
676
8c6180a9 677 /* We only handle single-byte locales (outside of UTF-8 ones; so if
d35fca5f 678 * this locale requires more than one byte, there are going to be
8c6180a9 679 * problems. */
9c8a6dc2
KW
680 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
681 "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
682 __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
683
ba1a4362
KW
684 if (check_for_problems && MB_CUR_MAX > 1
685
686 /* Some platforms return MB_CUR_MAX > 1 for even the "C"
687 * locale. Just assume that the implementation for them (plus
688 * for POSIX) is correct and the > 1 value is spurious. (Since
689 * these are specially handled to never be considered UTF-8
690 * locales, as long as this is the only problem, everything
691 * should work fine */
692 && strNE(newctype, "C") && strNE(newctype, "POSIX"))
693 {
8c6180a9
KW
694 multi_byte_locale = TRUE;
695 }
7d4bcc4a
KW
696
697# endif
8c6180a9
KW
698
699 if (bad_count || multi_byte_locale) {
780fcc9f 700 PL_warn_locale = Perl_newSVpvf(aTHX_
8c6180a9 701 "Locale '%s' may not work well.%s%s%s\n",
780fcc9f 702 newctype,
8c6180a9
KW
703 (multi_byte_locale)
704 ? " Some characters in it are not recognized by"
705 " Perl."
706 : "",
707 (bad_count)
708 ? "\nThe following characters (and maybe others)"
709 " may not have the same meaning as the Perl"
710 " program expects:\n"
711 : "",
712 (bad_count)
713 ? bad_chars_list
714 : ""
715 );
cc9eaeb0 716 /* If we are actually in the scope of the locale or are debugging,
bddebb56
KW
717 * output the message now. If not in that scope, we save the
718 * message to be output at the first operation using this locale,
719 * if that actually happens. Most programs don't use locales, so
720 * they are immune to bad ones. */
cc9eaeb0 721 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
780fcc9f
KW
722
723 /* We have to save 'newctype' because the setlocale() just
724 * below may destroy it. The next setlocale() further down
725 * should restore it properly so that the intermediate change
726 * here is transparent to this function's caller */
727 const char * const badlocale = savepv(newctype);
728
837ce802 729 do_setlocale_c(LC_CTYPE, "C");
780fcc9f
KW
730
731 /* The '0' below suppresses a bogus gcc compiler warning */
732 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
bddebb56 733
837ce802 734 do_setlocale_c(LC_CTYPE, badlocale);
c0f3a893 735 Safefree(badlocale);
bddebb56
KW
736
737 if (IN_LC(LC_CTYPE)) {
738 SvREFCNT_dec_NN(PL_warn_locale);
739 PL_warn_locale = NULL;
740 }
780fcc9f 741 }
baa60164 742 }
31f05a37 743 }
98994639
HS
744
745#endif /* USE_LOCALE_CTYPE */
7d4bcc4a 746
98994639
HS
747}
748
98994639 749void
2726666d
KW
750Perl__warn_problematic_locale()
751{
2726666d
KW
752
753#ifdef USE_LOCALE_CTYPE
754
5f04a188
KW
755 dTHX;
756
757 /* Internal-to-core function that outputs the message in PL_warn_locale,
758 * and then NULLS it. Should be called only through the macro
759 * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */
760
2726666d 761 if (PL_warn_locale) {
2726666d
KW
762 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
763 SvPVX(PL_warn_locale),
764 0 /* dummy to avoid compiler warning */ );
2726666d
KW
765 SvREFCNT_dec_NN(PL_warn_locale);
766 PL_warn_locale = NULL;
767 }
768
769#endif
770
771}
772
a4f00dcc
KW
773STATIC void
774S_new_collate(pTHX_ const char *newcoll)
98994639 775{
7d4bcc4a
KW
776
777#ifndef USE_LOCALE_COLLATE
778
779 PERL_UNUSED_ARG(newcoll);
780 PERL_UNUSED_CONTEXT;
781
782#else
0d071d52 783
291a84fb 784 /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
0d071d52
KW
785 * core Perl this and that 'newcoll' is the name of the new locale.
786 *
d35fca5f
KW
787 * The design of locale collation is that every locale change is given an
788 * index 'PL_collation_ix'. The first time a string particpates in an
789 * operation that requires collation while locale collation is active, it
790 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
791 * magic includes the collation index, and the transformation of the string
792 * by strxfrm(), q.v. That transformation is used when doing comparisons,
793 * instead of the string itself. If a string changes, the magic is
794 * cleared. The next time the locale changes, the index is incremented,
795 * and so we know during a comparison that the transformation is not
796 * necessarily still valid, and so is recomputed. Note that if the locale
797 * changes enough times, the index could wrap (a U32), and it is possible
798 * that a transformation would improperly be considered valid, leading to
799 * an unlikely bug */
0d071d52 800
98994639
HS
801 if (! newcoll) {
802 if (PL_collation_name) {
803 ++PL_collation_ix;
804 Safefree(PL_collation_name);
805 PL_collation_name = NULL;
806 }
807 PL_collation_standard = TRUE;
00bf60ca 808 is_standard_collation:
98994639
HS
809 PL_collxfrm_base = 0;
810 PL_collxfrm_mult = 2;
165a1c52 811 PL_in_utf8_COLLATE_locale = FALSE;
f28f4d2a 812 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 813 PL_strxfrm_max_cp = 0;
98994639
HS
814 return;
815 }
816
d35fca5f 817 /* If this is not the same locale as currently, set the new one up */
98994639
HS
818 if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
819 ++PL_collation_ix;
820 Safefree(PL_collation_name);
821 PL_collation_name = stdize_locale(savepv(newcoll));
a39edc4c 822 PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
00bf60ca
KW
823 if (PL_collation_standard) {
824 goto is_standard_collation;
825 }
98994639 826
165a1c52 827 PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
f28f4d2a 828 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 829 PL_strxfrm_max_cp = 0;
165a1c52 830
59c018b9
KW
831 /* A locale collation definition includes primary, secondary, tertiary,
832 * etc. weights for each character. To sort, the primary weights are
833 * used, and only if they compare equal, then the secondary weights are
834 * used, and only if they compare equal, then the tertiary, etc.
835 *
836 * strxfrm() works by taking the input string, say ABC, and creating an
837 * output transformed string consisting of first the primary weights,
838 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
839 * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
840 * may not have weights at every level. In our example, let's say B
841 * doesn't have a tertiary weight, and A doesn't have a secondary
842 * weight. The constructed string is then going to be
843 * A¹B¹C¹ B²C² A³C³ ....
844 * This has the desired effect that strcmp() will look at the secondary
845 * or tertiary weights only if the strings compare equal at all higher
846 * priority weights. The spaces shown here, like in
c342d20e 847 * "A¹B¹C¹ A²B²C² "
59c018b9
KW
848 * are not just for readability. In the general case, these must
849 * actually be bytes, which we will call here 'separator weights'; and
850 * they must be smaller than any other weight value, but since these
851 * are C strings, only the terminating one can be a NUL (some
852 * implementations may include a non-NUL separator weight just before
853 * the NUL). Implementations tend to reserve 01 for the separator
854 * weights. They are needed so that a shorter string's secondary
855 * weights won't be misconstrued as primary weights of a longer string,
856 * etc. By making them smaller than any other weight, the shorter
857 * string will sort first. (Actually, if all secondary weights are
858 * smaller than all primary ones, there is no need for a separator
859 * weight between those two levels, etc.)
860 *
861 * The length of the transformed string is roughly a linear function of
862 * the input string. It's not exactly linear because some characters
863 * don't have weights at all levels. When we call strxfrm() we have to
864 * allocate some memory to hold the transformed string. The
865 * calculations below try to find coefficients 'm' and 'b' for this
866 * locale so that m*x + b equals how much space we need, given the size
867 * of the input string in 'x'. If we calculate too small, we increase
868 * the size as needed, and call strxfrm() again, but it is better to
869 * get it right the first time to avoid wasted expensive string
870 * transformations. */
871
98994639 872 {
79f120c8
KW
873 /* We use the string below to find how long the tranformation of it
874 * is. Almost all locales are supersets of ASCII, or at least the
875 * ASCII letters. We use all of them, half upper half lower,
876 * because if we used fewer, we might hit just the ones that are
877 * outliers in a particular locale. Most of the strings being
878 * collated will contain a preponderance of letters, and even if
879 * they are above-ASCII, they are likely to have the same number of
880 * weight levels as the ASCII ones. It turns out that digits tend
881 * to have fewer levels, and some punctuation has more, but those
882 * are relatively sparse in text, and khw believes this gives a
883 * reasonable result, but it could be changed if experience so
884 * dictates. */
885 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
886 char * x_longer; /* Transformed 'longer' */
887 Size_t x_len_longer; /* Length of 'x_longer' */
888
889 char * x_shorter; /* We also transform a substring of 'longer' */
890 Size_t x_len_shorter;
891
a4a439fb 892 /* _mem_collxfrm() is used get the transformation (though here we
79f120c8
KW
893 * are interested only in its length). It is used because it has
894 * the intelligence to handle all cases, but to work, it needs some
895 * values of 'm' and 'b' to get it started. For the purposes of
896 * this calculation we use a very conservative estimate of 'm' and
897 * 'b'. This assumes a weight can be multiple bytes, enough to
898 * hold any UV on the platform, and there are 5 levels, 4 weight
899 * bytes, and a trailing NUL. */
900 PL_collxfrm_base = 5;
901 PL_collxfrm_mult = 5 * sizeof(UV);
902
903 /* Find out how long the transformation really is */
a4a439fb
KW
904 x_longer = _mem_collxfrm(longer,
905 sizeof(longer) - 1,
906 &x_len_longer,
907
908 /* We avoid converting to UTF-8 in the
909 * called function by telling it the
910 * string is in UTF-8 if the locale is a
911 * UTF-8 one. Since the string passed
912 * here is invariant under UTF-8, we can
913 * claim it's UTF-8 even though it isn't.
914 * */
915 PL_in_utf8_COLLATE_locale);
79f120c8
KW
916 Safefree(x_longer);
917
918 /* Find out how long the transformation of a substring of 'longer'
919 * is. Together the lengths of these transformations are
920 * sufficient to calculate 'm' and 'b'. The substring is all of
921 * 'longer' except the first character. This minimizes the chances
922 * of being swayed by outliers */
a4a439fb 923 x_shorter = _mem_collxfrm(longer + 1,
79f120c8 924 sizeof(longer) - 2,
a4a439fb
KW
925 &x_len_shorter,
926 PL_in_utf8_COLLATE_locale);
79f120c8
KW
927 Safefree(x_shorter);
928
929 /* If the results are nonsensical for this simple test, the whole
930 * locale definition is suspect. Mark it so that locale collation
931 * is not active at all for it. XXX Should we warn? */
932 if ( x_len_shorter == 0
933 || x_len_longer == 0
934 || x_len_shorter >= x_len_longer)
935 {
936 PL_collxfrm_mult = 0;
937 PL_collxfrm_base = 0;
938 }
939 else {
940 SSize_t base; /* Temporary */
941
942 /* We have both: m * strlen(longer) + b = x_len_longer
943 * m * strlen(shorter) + b = x_len_shorter;
944 * subtracting yields:
945 * m * (strlen(longer) - strlen(shorter))
946 * = x_len_longer - x_len_shorter
947 * But we have set things up so that 'shorter' is 1 byte smaller
948 * than 'longer'. Hence:
949 * m = x_len_longer - x_len_shorter
950 *
951 * But if something went wrong, make sure the multiplier is at
952 * least 1.
953 */
954 if (x_len_longer > x_len_shorter) {
955 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
956 }
957 else {
958 PL_collxfrm_mult = 1;
959 }
960
961 /* mx + b = len
962 * so: b = len - mx
963 * but in case something has gone wrong, make sure it is
964 * non-negative */
965 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
966 if (base < 0) {
967 base = 0;
968 }
969
970 /* Add 1 for the trailing NUL */
971 PL_collxfrm_base = base + 1;
972 }
58eebef2 973
7d4bcc4a
KW
974# ifdef DEBUGGING
975
58eebef2
KW
976 if (DEBUG_L_TEST || debug_initialization) {
977 PerlIO_printf(Perl_debug_log,
b07929e4
KW
978 "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
979 "x_len_longer=%zu,"
980 " collate multipler=%zu, collate base=%zu\n",
58eebef2
KW
981 __FILE__, __LINE__,
982 PL_in_utf8_COLLATE_locale,
983 x_len_shorter, x_len_longer,
984 PL_collxfrm_mult, PL_collxfrm_base);
985 }
7d4bcc4a
KW
986# endif
987
98994639
HS
988 }
989 }
990
991#endif /* USE_LOCALE_COLLATE */
7d4bcc4a 992
98994639
HS
993}
994
d2b24094 995#ifdef WIN32
b8cc575c 996
a4f00dcc 997STATIC char *
b8cc575c 998S_win32_setlocale(pTHX_ int category, const char* locale)
b385bb4d
KW
999{
1000 /* This, for Windows, emulates POSIX setlocale() behavior. There is no
7d4bcc4a
KW
1001 * difference between the two unless the input locale is "", which normally
1002 * means on Windows to get the machine default, which is set via the
1003 * computer's "Regional and Language Options" (or its current equivalent).
1004 * In POSIX, it instead means to find the locale from the user's
1005 * environment. This routine changes the Windows behavior to first look in
1006 * the environment, and, if anything is found, use that instead of going to
1007 * the machine default. If there is no environment override, the machine
1008 * default is used, by calling the real setlocale() with "".
1009 *
1010 * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
1011 * use the particular category's variable if set; otherwise to use the LANG
1012 * variable. */
b385bb4d 1013
175c4cf9 1014 bool override_LC_ALL = FALSE;
89f7b9aa 1015 char * result;
e5f10d49 1016 unsigned int i;
89f7b9aa 1017
b385bb4d 1018 if (locale && strEQ(locale, "")) {
7d4bcc4a
KW
1019
1020# ifdef LC_ALL
1021
b385bb4d
KW
1022 locale = PerlEnv_getenv("LC_ALL");
1023 if (! locale) {
e5f10d49
KW
1024 if (category == LC_ALL) {
1025 override_LC_ALL = TRUE;
1026 }
1027 else {
7d4bcc4a
KW
1028
1029# endif
7d4bcc4a 1030
e5f10d49
KW
1031 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1032 if (category == categories[i]) {
1033 locale = PerlEnv_getenv(category_names[i]);
1034 goto found_locale;
1035 }
1036 }
7d4bcc4a 1037
b385bb4d 1038 locale = PerlEnv_getenv("LANG");
481465ea 1039 if (! locale) {
b385bb4d
KW
1040 locale = "";
1041 }
e5f10d49
KW
1042
1043 found_locale: ;
7d4bcc4a
KW
1044
1045# ifdef LC_ALL
1046
e5f10d49 1047 }
b385bb4d 1048 }
7d4bcc4a
KW
1049
1050# endif
1051
b385bb4d
KW
1052 }
1053
89f7b9aa 1054 result = setlocale(category, locale);
bbc98134 1055 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
a4f00dcc 1056 setlocale_debug_string(category, locale, result)));
89f7b9aa 1057
481465ea 1058 if (! override_LC_ALL) {
89f7b9aa
KW
1059 return result;
1060 }
1061
dfd77d7a 1062 /* Here the input category was LC_ALL, and we have set it to what is in the
481465ea
KW
1063 * LANG variable or the system default if there is no LANG. But these have
1064 * lower priority than the other LC_foo variables, so override it for each
1065 * one that is set. (If they are set to "", it means to use the same thing
1066 * we just set LC_ALL to, so can skip) */
7d4bcc4a 1067
948523db 1068 for (i = 0; i < LC_ALL_INDEX; i++) {
e5f10d49
KW
1069 result = PerlEnv_getenv(category_names[i]);
1070 if (result && strNE(result, "")) {
1071 setlocale(categories[i], result);
1072 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
1073 __FILE__, __LINE__,
1074 setlocale_debug_string(categories[i], result, "not captured")));
1075 }
89f7b9aa 1076 }
7d4bcc4a 1077
bbc98134
KW
1078 result = setlocale(LC_ALL, NULL);
1079 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
1080 __FILE__, __LINE__,
a4f00dcc 1081 setlocale_debug_string(LC_ALL, NULL, result)));
89f7b9aa 1082
bbc98134 1083 return result;
b385bb4d
KW
1084}
1085
1086#endif
1087
a4f00dcc
KW
1088char *
1089Perl_setlocale(int category, const char * locale)
1090{
1091 /* This wraps POSIX::setlocale() */
1092
1093 char * retval;
1159483a 1094 char * newlocale;
a4f00dcc
KW
1095 dTHX;
1096
a4f00dcc
KW
1097#ifdef USE_LOCALE_NUMERIC
1098
291a84fb
KW
1099 /* A NULL locale means only query what the current one is. We have the
1100 * LC_NUMERIC name saved, because we are normally switched into the C
1101 * locale for it. For an LC_ALL query, switch back to get the correct
1102 * results. All other categories don't require special handling */
a4f00dcc
KW
1103 if (locale == NULL) {
1104 if (category == LC_NUMERIC) {
1105 return savepv(PL_numeric_name);
1106 }
1107
7d4bcc4a 1108# ifdef LC_ALL
a4f00dcc
KW
1109
1110 else if (category == LC_ALL) {
1111 SET_NUMERIC_UNDERLYING();
1112 }
1113
7d4bcc4a 1114# endif
a4f00dcc
KW
1115
1116 }
1117
1118#endif
1119
e5f10d49
KW
1120 /* Save retval since subsequent setlocale() calls may overwrite it. */
1121 retval = savepv(do_setlocale_r(category, locale));
a4f00dcc
KW
1122
1123 DEBUG_L(PerlIO_printf(Perl_debug_log,
1124 "%s:%d: %s\n", __FILE__, __LINE__,
1125 setlocale_debug_string(category, locale, retval)));
1126 if (! retval) {
1127 /* Should never happen that a query would return an error, but be
1128 * sure and reset to C locale */
1129 if (locale == 0) {
1130 SET_NUMERIC_STANDARD();
1131 }
7d4bcc4a 1132
a4f00dcc
KW
1133 return NULL;
1134 }
1135
a4f00dcc
KW
1136 /* If locale == NULL, we are just querying the state, but may have switched
1137 * to NUMERIC_UNDERLYING. Switch back before returning. */
1138 if (locale == NULL) {
1139 SET_NUMERIC_STANDARD();
1140 return retval;
1141 }
a4f00dcc 1142
1159483a
KW
1143 /* Now that have switched locales, we have to update our records to
1144 * correspond. */
a4f00dcc 1145
1159483a 1146 switch (category) {
a4f00dcc 1147
1159483a 1148#ifdef USE_LOCALE_CTYPE
a4f00dcc 1149
1159483a
KW
1150 case LC_CTYPE:
1151 new_ctype(retval);
1152 break;
a4f00dcc 1153
1159483a 1154#endif
a4f00dcc
KW
1155#ifdef USE_LOCALE_COLLATE
1156
1159483a
KW
1157 case LC_COLLATE:
1158 new_collate(retval);
1159 break;
a4f00dcc 1160
1159483a 1161#endif
a4f00dcc
KW
1162#ifdef USE_LOCALE_NUMERIC
1163
1159483a
KW
1164 case LC_NUMERIC:
1165 new_numeric(retval);
1166 break;
a4f00dcc 1167
1159483a
KW
1168#endif
1169#ifdef LC_ALL
a4f00dcc 1170
1159483a 1171 case LC_ALL:
a4f00dcc 1172
1159483a
KW
1173 /* LC_ALL updates all the things we care about. The values may not
1174 * be the same as 'retval', as the locale "" may have set things
1175 * individually */
a4f00dcc 1176
1159483a 1177# ifdef USE_LOCALE_CTYPE
a4f00dcc 1178
1159483a
KW
1179 newlocale = do_setlocale_c(LC_CTYPE, NULL);
1180 new_ctype(newlocale);
a4f00dcc 1181
1159483a
KW
1182# endif /* USE_LOCALE_CTYPE */
1183# ifdef USE_LOCALE_COLLATE
1184
1185 newlocale = do_setlocale_c(LC_COLLATE, NULL);
1186 new_collate(newlocale);
a4f00dcc 1187
7d4bcc4a 1188# endif
1159483a 1189# ifdef USE_LOCALE_NUMERIC
a4f00dcc 1190
1159483a
KW
1191 newlocale = do_setlocale_c(LC_NUMERIC, NULL);
1192 new_numeric(newlocale);
a4f00dcc 1193
1159483a
KW
1194# endif /* USE_LOCALE_NUMERIC */
1195#endif /* LC_ALL */
a4f00dcc 1196
1159483a
KW
1197 default:
1198 break;
a4f00dcc
KW
1199 }
1200
1201 return retval;
1202
f7416781
KW
1203
1204}
1205
1206PERL_STATIC_INLINE const char *
1207S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset)
1208{
1209 /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size',
1210 * growing it if necessary */
1211
1212 const Size_t string_size = strlen(string) + offset + 1;
1213
1214 PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
1215
1216 if (*buf_size == 0) {
1217 Newx(*buf, string_size, char);
1218 *buf_size = string_size;
1219 }
1220 else if (string_size > *buf_size) {
1221 Renew(*buf, string_size, char);
1222 *buf_size = string_size;
1223 }
1224
1225 Copy(string, *buf + offset, string_size - offset, char);
1226 return *buf;
1227}
1228
1229/*
1230
1231=head1 Locale-related functions and macros
1232
1233=for apidoc Perl_langinfo
1234
7d4bcc4a 1235This is an (almost ª) drop-in replacement for the system C<L<nl_langinfo(3)>>,
f7416781
KW
1236taking the same C<item> parameter values, and returning the same information.
1237But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
1238of Perl's locale handling from your code, and can be used on systems that lack
1239a native C<nl_langinfo>.
1240
1241Expanding on these:
1242
1243=over
1244
1245=item *
1246
1247It delivers the correct results for the C<RADIXCHAR> and C<THOUSESEP> items,
1248without you having to write extra code. The reason for the extra code would be
1249because these are from the C<LC_NUMERIC> locale category, which is normally
1250kept set to the C locale by Perl, no matter what the underlying locale is
1251supposed to be, and so to get the expected results, you have to temporarily
1252toggle into the underlying locale, and later toggle back. (You could use
1253plain C<nl_langinfo> and C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this
1254but then you wouldn't get the other advantages of C<Perl_langinfo()>; not
1255keeping C<LC_NUMERIC> in the C locale would break a lot of CPAN, which is
1256expecting the radix (decimal point) character to be a dot.)
1257
1258=item *
1259
1260Depending on C<item>, it works on systems that don't have C<nl_langinfo>, hence
1261makes your code more portable. Of the fifty-some possible items specified by
1262the POSIX 2008 standard,
1263L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
1264only two are completely unimplemented. It uses various techniques to recover
1265the other items, including calling C<L<localeconv(3)>>, and C<L<strftime(3)>>,
1266both of which are specified in C89, so should be always be available. Later
1267C<strftime()> versions have additional capabilities; C<""> is returned for
1268those not available on your system.
1269
1270The details for those items which may differ from what this emulation returns
1271and what a native C<nl_langinfo()> would return are:
1272
1273=over
1274
1275=item C<CODESET>
1276
1277=item C<ERA>
1278
1279Unimplemented, so returns C<"">.
1280
1281=item C<YESEXPR>
1282
c1566110
KW
1283=item C<YESSTR>
1284
f7416781
KW
1285=item C<NOEXPR>
1286
c1566110
KW
1287=item C<NOSTR>
1288
1289Only the values for English are returned. C<YESSTR> and C<NOSTR> have been
1290removed from POSIX 2008, and are retained for backwards compatibility. Your
1291platform's C<nl_langinfo> may not support them.
f7416781
KW
1292
1293=item C<D_FMT>
1294
1295Always evaluates to C<%x>, the locale's appropriate date representation.
1296
1297=item C<T_FMT>
1298
1299Always evaluates to C<%X>, the locale's appropriate time representation.
1300
1301=item C<D_T_FMT>
1302
1303Always evaluates to C<%c>, the locale's appropriate date and time
1304representation.
1305
1306=item C<CRNCYSTR>
1307
1308The return may be incorrect for those rare locales where the currency symbol
1309replaces the radix character.
1310Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1311to work differently.
1312
1313=item C<ALT_DIGITS>
1314
1315Currently this gives the same results as Linux does.
1316Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1317to work differently.
1318
1319=item C<ERA_D_FMT>
1320
1321=item C<ERA_T_FMT>
1322
1323=item C<ERA_D_T_FMT>
1324
1325=item C<T_FMT_AMPM>
1326
1327These are derived by using C<strftime()>, and not all versions of that function
1328know about them. C<""> is returned for these on such systems.
1329
1330=back
1331
1332When using C<Perl_langinfo> on systems that don't have a native
1333C<nl_langinfo()>, you must
1334
1335 #include "perl_langinfo.h"
1336
1337before the C<perl.h> C<#include>. You can replace your C<langinfo.h>
1338C<#include> with this one. (Doing it this way keeps out the symbols that plain
1339C<langinfo.h> imports into the namespace for code that doesn't need it.)
1340
1341You also should not use the bare C<langinfo.h> item names, but should preface
1342them with C<PERL_>, so use C<PERL_RADIXCHAR> instead of plain C<RADIXCHAR>.
1343The C<PERL_I<foo>> versions will also work for this function on systems that do
1344have a native C<nl_langinfo>.
1345
1346=item *
1347
1348It is thread-friendly, returning its result in a buffer that won't be
1349overwritten by another thread, so you don't have to code for that possibility.
1350The buffer can be overwritten by the next call to C<nl_langinfo> or
1351C<Perl_langinfo> in the same thread.
1352
1353=item *
1354
7d4bcc4a 1355ª It returns S<C<const char *>>, whereas plain C<nl_langinfo()> returns S<C<char
f7416781
KW
1356*>>, but you are (only by documentation) forbidden to write into the buffer.
1357By declaring this C<const>, the compiler enforces this restriction. The extra
1358C<const> is why this isn't an unequivocal drop-in replacement for
1359C<nl_langinfo>.
1360
1361=back
1362
1363The original impetus for C<Perl_langinfo()> was so that code that needs to
1364find out the current currency symbol, floating point radix character, or digit
1365grouping separator can use, on all systems, the simpler and more
1366thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
1367pain to make thread-friendly. For other fields returned by C<localeconv>, it
1368is better to use the methods given in L<perlcall> to call
1369L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
1370
1371=cut
1372
1373*/
1374
1375const char *
1376#ifdef HAS_NL_LANGINFO
1377Perl_langinfo(const nl_item item)
1378#else
1379Perl_langinfo(const int item)
1380#endif
1381{
f61748ac
KW
1382 return my_nl_langinfo(item, TRUE);
1383}
1384
1385const char *
1386#ifdef HAS_NL_LANGINFO
1387S_my_nl_langinfo(const nl_item item, bool toggle)
1388#else
1389S_my_nl_langinfo(const int item, bool toggle)
1390#endif
1391{
ae74815b 1392 dTHX;
f7416781 1393
ab340fff
KW
1394#if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
1395#if ! defined(HAS_POSIX_2008_LOCALE)
f7416781 1396
ab340fff 1397 /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
ae74815b
KW
1398 * for those items dependent on it. This must be copied to a buffer before
1399 * switching back, as some systems destroy the buffer when setlocale() is
1400 * called */
f7416781
KW
1401
1402 LOCALE_LOCK;
1403
1404 if (toggle) {
1405 if (item == PERL_RADIXCHAR || item == PERL_THOUSEP) {
837ce802 1406 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
f7416781
KW
1407 }
1408 else {
1409 toggle = FALSE;
1410 }
1411 }
1412
1413 save_to_buffer(nl_langinfo(item), &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1414
1415 if (toggle) {
837ce802 1416 do_setlocale_c(LC_NUMERIC, "C");
f7416781
KW
1417 }
1418
1419 LOCALE_UNLOCK;
1420
ab340fff
KW
1421# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */
1422
1423 bool do_free = FALSE;
1424 locale_t cur = uselocale((locale_t) 0);
1425
1426 if (cur == LC_GLOBAL_LOCALE) {
1427 cur = duplocale(LC_GLOBAL_LOCALE);
1428 do_free = TRUE;
1429 }
1430
1431 if ( toggle
1432 && (item == PERL_RADIXCHAR || item == PERL_THOUSEP))
1433 {
1434 cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
1435 do_free = TRUE;
1436 }
1437
1438 save_to_buffer(nl_langinfo_l(item, cur),
1439 &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1440 if (do_free) {
1441 freelocale(cur);
1442 }
1443
c1566110
KW
1444# endif
1445
1446 if (strEQ(PL_langinfo_buf, "")) {
1447 if (item == PERL_YESSTR) {
1448 return "yes";
1449 }
1450 if (item == PERL_NOSTR) {
1451 return "no";
1452 }
1453 }
1454
ab340fff
KW
1455 return PL_langinfo_buf;
1456
f7416781 1457#else /* Below, emulate nl_langinfo as best we can */
f7416781
KW
1458# ifdef HAS_LOCALECONV
1459
1460 const struct lconv* lc;
1461
1462# endif
1463# ifdef HAS_STRFTIME
1464
1465 struct tm tm;
1466 bool return_format = FALSE; /* Return the %format, not the value */
1467 const char * format;
1468
1469# endif
1470
1471 /* We copy the results to a per-thread buffer, even if not multi-threaded.
1472 * This is in part to simplify this code, and partly because we need a
1473 * buffer anyway for strftime(), and partly because a call of localeconv()
1474 * could otherwise wipe out the buffer, and the programmer would not be
1475 * expecting this, as this is a nl_langinfo() substitute after all, so s/he
1476 * might be thinking their localeconv() is safe until another localeconv()
1477 * call. */
1478
1479 switch (item) {
1480 Size_t len;
1481 const char * retval;
1482
1483 /* These 2 are unimplemented */
1484 case PERL_CODESET:
1485 case PERL_ERA: /* For use with strftime() %E modifier */
1486
1487 default:
1488 return "";
1489
1490 /* We use only an English set, since we don't know any more */
1491 case PERL_YESEXPR: return "^[+1yY]";
c1566110 1492 case PERL_YESSTR: return "yes";
f7416781 1493 case PERL_NOEXPR: return "^[-0nN]";
c1566110 1494 case PERL_NOSTR: return "no";
f7416781
KW
1495
1496# ifdef HAS_LOCALECONV
1497
1498 case PERL_CRNCYSTR:
1499
1500 LOCALE_LOCK;
1501
291a84fb
KW
1502 /* We don't bother with localeconv_l() because any system that has
1503 * it is likely to also have nl_langinfo() */
1504
f7416781
KW
1505 lc = localeconv();
1506 if (! lc || ! lc->currency_symbol || strEQ("", lc->currency_symbol))
1507 {
1508 LOCALE_UNLOCK;
1509 return "";
1510 }
1511
1512 /* Leave the first spot empty to be filled in below */
1513 save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
1514 &PL_langinfo_bufsize, 1);
1515 if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
1516 { /* khw couldn't figure out how the localedef specifications
1517 would show that the $ should replace the radix; this is
1518 just a guess as to how it might work.*/
1519 *PL_langinfo_buf = '.';
1520 }
1521 else if (lc->p_cs_precedes) {
1522 *PL_langinfo_buf = '-';
1523 }
1524 else {
1525 *PL_langinfo_buf = '+';
1526 }
1527
1528 LOCALE_UNLOCK;
1529 break;
1530
1531 case PERL_RADIXCHAR:
1532 case PERL_THOUSEP:
1533
1534 LOCALE_LOCK;
1535
1536 if (toggle) {
837ce802 1537 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
f7416781
KW
1538 }
1539
1540 lc = localeconv();
1541 if (! lc) {
1542 retval = "";
1543 }
33394adc
KW
1544 else {
1545 retval = (item == PERL_RADIXCHAR)
1546 ? lc->decimal_point
1547 : lc->thousands_sep;
1548 if (! retval) {
1549 retval = "";
1550 }
f7416781
KW
1551 }
1552
1553 save_to_buffer(retval, &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1554
1555 if (toggle) {
837ce802 1556 do_setlocale_c(LC_NUMERIC, "C");
f7416781
KW
1557 }
1558
1559 LOCALE_UNLOCK;
1560
1561 break;
1562
1563# endif
1564# ifdef HAS_STRFTIME
1565
1566 /* These are defined by C89, so we assume that strftime supports them,
1567 * and so are returned unconditionally; they may not be what the locale
1568 * actually says, but should give good enough results for someone using
1569 * them as formats (as opposed to trying to parse them to figure out
7d4bcc4a 1570 * what the locale says). The other format items are actually tested to
f7416781
KW
1571 * verify they work on the platform */
1572 case PERL_D_FMT: return "%x";
1573 case PERL_T_FMT: return "%X";
1574 case PERL_D_T_FMT: return "%c";
1575
1576 /* These formats are only available in later strfmtime's */
1577 case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT:
1578 case PERL_T_FMT_AMPM:
1579
1580 /* The rest can be gotten from most versions of strftime(). */
1581 case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3:
1582 case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6:
1583 case PERL_ABDAY_7:
1584 case PERL_ALT_DIGITS:
1585 case PERL_AM_STR: case PERL_PM_STR:
1586 case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3:
1587 case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6:
1588 case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9:
1589 case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12:
1590 case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4:
1591 case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7:
1592 case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4:
1593 case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8:
1594 case PERL_MON_9: case PERL_MON_10: case PERL_MON_11: case PERL_MON_12:
1595
1596 LOCALE_LOCK;
1597
1598 init_tm(&tm); /* Precaution against core dumps */
1599 tm.tm_sec = 30;
1600 tm.tm_min = 30;
1601 tm.tm_hour = 6;
1602 tm.tm_year = 2017 - 1900;
1603 tm.tm_wday = 0;
1604 tm.tm_mon = 0;
1605 switch (item) {
1606 default:
1607 LOCALE_UNLOCK;
1608 Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
1609 __FILE__, __LINE__, item);
1610 NOT_REACHED; /* NOTREACHED */
1611
1612 case PERL_PM_STR: tm.tm_hour = 18;
1613 case PERL_AM_STR:
1614 format = "%p";
1615 break;
1616
1617 case PERL_ABDAY_7: tm.tm_wday++;
1618 case PERL_ABDAY_6: tm.tm_wday++;
1619 case PERL_ABDAY_5: tm.tm_wday++;
1620 case PERL_ABDAY_4: tm.tm_wday++;
1621 case PERL_ABDAY_3: tm.tm_wday++;
1622 case PERL_ABDAY_2: tm.tm_wday++;
1623 case PERL_ABDAY_1:
1624 format = "%a";
1625 break;
1626
1627 case PERL_DAY_7: tm.tm_wday++;
1628 case PERL_DAY_6: tm.tm_wday++;
1629 case PERL_DAY_5: tm.tm_wday++;
1630 case PERL_DAY_4: tm.tm_wday++;
1631 case PERL_DAY_3: tm.tm_wday++;
1632 case PERL_DAY_2: tm.tm_wday++;
1633 case PERL_DAY_1:
1634 format = "%A";
1635 break;
1636
1637 case PERL_ABMON_12: tm.tm_mon++;
1638 case PERL_ABMON_11: tm.tm_mon++;
1639 case PERL_ABMON_10: tm.tm_mon++;
1640 case PERL_ABMON_9: tm.tm_mon++;
1641 case PERL_ABMON_8: tm.tm_mon++;
1642 case PERL_ABMON_7: tm.tm_mon++;
1643 case PERL_ABMON_6: tm.tm_mon++;
1644 case PERL_ABMON_5: tm.tm_mon++;
1645 case PERL_ABMON_4: tm.tm_mon++;
1646 case PERL_ABMON_3: tm.tm_mon++;
1647 case PERL_ABMON_2: tm.tm_mon++;
1648 case PERL_ABMON_1:
1649 format = "%b";
1650 break;
1651
1652 case PERL_MON_12: tm.tm_mon++;
1653 case PERL_MON_11: tm.tm_mon++;
1654 case PERL_MON_10: tm.tm_mon++;
1655 case PERL_MON_9: tm.tm_mon++;
1656 case PERL_MON_8: tm.tm_mon++;
1657 case PERL_MON_7: tm.tm_mon++;
1658 case PERL_MON_6: tm.tm_mon++;
1659 case PERL_MON_5: tm.tm_mon++;
1660 case PERL_MON_4: tm.tm_mon++;
1661 case PERL_MON_3: tm.tm_mon++;
1662 case PERL_MON_2: tm.tm_mon++;
1663 case PERL_MON_1:
1664 format = "%B";
1665 break;
1666
1667 case PERL_T_FMT_AMPM:
1668 format = "%r";
1669 return_format = TRUE;
1670 break;
1671
1672 case PERL_ERA_D_FMT:
1673 format = "%Ex";
1674 return_format = TRUE;
1675 break;
1676
1677 case PERL_ERA_T_FMT:
1678 format = "%EX";
1679 return_format = TRUE;
1680 break;
1681
1682 case PERL_ERA_D_T_FMT:
1683 format = "%Ec";
1684 return_format = TRUE;
1685 break;
1686
1687 case PERL_ALT_DIGITS:
1688 tm.tm_wday = 0;
1689 format = "%Ow"; /* Find the alternate digit for 0 */
1690 break;
1691 }
1692
1693 /* We can't use my_strftime() because it doesn't look at tm_wday */
1694 while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
1695 format, &tm))
1696 {
1697 /* A zero return means one of:
1698 * a) there wasn't enough space in PL_langinfo_buf
1699 * b) the format, like a plain %p, returns empty
1700 * c) it was an illegal format, though some implementations of
1701 * strftime will just return the illegal format as a plain
1702 * character sequence.
1703 *
1704 * To quickly test for case 'b)', try again but precede the
1705 * format with a plain character. If that result is still
1706 * empty, the problem is either 'a)' or 'c)' */
1707
1708 Size_t format_size = strlen(format) + 1;
1709 Size_t mod_size = format_size + 1;
1710 char * mod_format;
1711 char * temp_result;
1712
1713 Newx(mod_format, mod_size, char);
1714 Newx(temp_result, PL_langinfo_bufsize, char);
1715 *mod_format = '\a';
1716 my_strlcpy(mod_format + 1, format, mod_size);
1717 len = strftime(temp_result,
1718 PL_langinfo_bufsize,
1719 mod_format, &tm);
1720 Safefree(mod_format);
1721 Safefree(temp_result);
1722
1723 /* If 'len' is non-zero, it means that we had a case like %p
1724 * which means the current locale doesn't use a.m. or p.m., and
1725 * that is valid */
1726 if (len == 0) {
1727
1728 /* Here, still didn't work. If we get well beyond a
1729 * reasonable size, bail out to prevent an infinite loop. */
1730
1731 if (PL_langinfo_bufsize > 100 * format_size) {
1732 *PL_langinfo_buf = '\0';
1733 }
1734 else { /* Double the buffer size to retry; Add 1 in case
1735 original was 0, so we aren't stuck at 0. */
1736 PL_langinfo_bufsize *= 2;
1737 PL_langinfo_bufsize++;
1738 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
1739 continue;
1740 }
1741 }
1742
1743 break;
1744 }
1745
1746 /* Here, we got a result.
1747 *
1748 * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
1749 * alternate format for wday 0. If the value is the same as the
1750 * normal 0, there isn't an alternate, so clear the buffer. */
1751 if ( item == PERL_ALT_DIGITS
1752 && strEQ(PL_langinfo_buf, "0"))
1753 {
1754 *PL_langinfo_buf = '\0';
1755 }
1756
1757 /* ALT_DIGITS is problematic. Experiments on it showed that
1758 * strftime() did not always work properly when going from alt-9 to
1759 * alt-10. Only a few locales have this item defined, and in all
1760 * of them on Linux that khw was able to find, nl_langinfo() merely
1761 * returned the alt-0 character, possibly doubled. Most Unicode
1762 * digits are in blocks of 10 consecutive code points, so that is
1763 * sufficient information for those scripts, as we can infer alt-1,
1764 * alt-2, .... But for a Japanese locale, a CJK ideographic 0 is
1765 * returned, and the CJK digits are not in code point order, so you
1766 * can't really infer anything. The localedef for this locale did
1767 * specify the succeeding digits, so that strftime() works properly
1768 * on them, without needing to infer anything. But the
1769 * nl_langinfo() return did not give sufficient information for the
1770 * caller to understand what's going on. So until there is
1771 * evidence that it should work differently, this returns the alt-0
1772 * string for ALT_DIGITS.
1773 *
1774 * wday was chosen because its range is all a single digit. Things
1775 * like tm_sec have two digits as the minimum: '00' */
1776
1777 LOCALE_UNLOCK;
1778
1779 /* If to return the format, not the value, overwrite the buffer
1780 * with it. But some strftime()s will keep the original format if
1781 * illegal, so change those to "" */
1782 if (return_format) {
1783 if (strEQ(PL_langinfo_buf, format)) {
1784 *PL_langinfo_buf = '\0';
1785 }
1786 else {
1787 save_to_buffer(format, &PL_langinfo_buf,
1788 &PL_langinfo_bufsize, 0);
1789 }
1790 }
1791
1792 break;
1793
1794# endif
1795
1796 }
1797
1798 return PL_langinfo_buf;
1799
1800#endif
1801
a4f00dcc 1802}
b385bb4d 1803
98994639
HS
1804/*
1805 * Initialize locale awareness.
1806 */
1807int
1808Perl_init_i18nl10n(pTHX_ int printwarn)
1809{
0e92a118
KW
1810 /* printwarn is
1811 *
1812 * 0 if not to output warning when setup locale is bad
1813 * 1 if to output warning based on value of PERL_BADLANG
1814 * >1 if to output regardless of PERL_BADLANG
1815 *
1816 * returns
98994639 1817 * 1 = set ok or not applicable,
0e92a118
KW
1818 * 0 = fallback to a locale of lower priority
1819 * -1 = fallback to all locales failed, not even to the C locale
6b058d42
KW
1820 *
1821 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
1822 * set, debugging information is output.
1823 *
1824 * This looks more complicated than it is, mainly due to the #ifdefs.
1825 *
1826 * We try to set LC_ALL to the value determined by the environment. If
1827 * there is no LC_ALL on this platform, we try the individual categories we
1828 * know about. If this works, we are done.
1829 *
1830 * But if it doesn't work, we have to do something else. We search the
1831 * environment variables ourselves instead of relying on the system to do
1832 * it. We look at, in order, LC_ALL, LANG, a system default locale (if we
1833 * think there is one), and the ultimate fallback "C". This is all done in
1834 * the same loop as above to avoid duplicating code, but it makes things
7d4bcc4a
KW
1835 * more complex. The 'trial_locales' array is initialized with just one
1836 * element; it causes the behavior described in the paragraph above this to
1837 * happen. If that fails, we add elements to 'trial_locales', and do extra
1838 * loop iterations to cause the behavior described in this paragraph.
6b058d42
KW
1839 *
1840 * On Ultrix, the locale MUST come from the environment, so there is
1841 * preliminary code to set it. I (khw) am not sure that it is necessary,
1842 * and that this couldn't be folded into the loop, but barring any real
1843 * platforms to test on, it's staying as-is
1844 *
1845 * A slight complication is that in embedded Perls, the locale may already
1846 * be set-up, and we don't want to get it from the normal environment
1847 * variables. This is handled by having a special environment variable
1848 * indicate we're in this situation. We simply set setlocale's 2nd
1849 * parameter to be a NULL instead of "". That indicates to setlocale that
1850 * it is not to change anything, but to return the current value,
1851 * effectively initializing perl's db to what the locale already is.
1852 *
1853 * We play the same trick with NULL if a LC_ALL succeeds. We call
1854 * setlocale() on the individual categores with NULL to get their existing
1855 * values for our db, instead of trying to change them.
1856 * */
98994639 1857
0e92a118
KW
1858 int ok = 1;
1859
7d4bcc4a
KW
1860#ifndef USE_LOCALE
1861
1862 PERL_UNUSED_ARG(printwarn);
1863
1864#else /* USE_LOCALE */
7d4bcc4a
KW
1865# ifdef __GLIBC__
1866
175c4cf9 1867 const char * const language = savepv(PerlEnv_getenv("LANGUAGE"));
7d4bcc4a
KW
1868
1869# endif
65ebb059 1870
ccd65d51
KW
1871 /* NULL uses the existing already set up locale */
1872 const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
1873 ? NULL
1874 : "";
c3fcd832
KW
1875 const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */
1876 unsigned int trial_locales_count;
175c4cf9
KW
1877 const char * const lc_all = savepv(PerlEnv_getenv("LC_ALL"));
1878 const char * const lang = savepv(PerlEnv_getenv("LANG"));
98994639 1879 bool setlocale_failure = FALSE;
65ebb059 1880 unsigned int i;
175c4cf9
KW
1881
1882 /* A later getenv() could zap this, so only use here */
1883 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
1884
1885 const bool locwarn = (printwarn > 1
e5f10d49
KW
1886 || ( printwarn
1887 && ( ! bad_lang_use_once
22ff3130 1888 || (
e5f10d49
KW
1889 /* disallow with "" or "0" */
1890 *bad_lang_use_once
1891 && strNE("0", bad_lang_use_once)))));
ea92aad8 1892
291a84fb 1893 /* setlocale() return vals; not copied so must be looked at immediately */
8de4332b 1894 const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
291a84fb
KW
1895
1896 /* current locale for given category; should have been copied so aren't
1897 * volatile */
8de4332b 1898 const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
291a84fb 1899
7d4bcc4a
KW
1900# ifdef WIN32
1901
6bce99ee
JH
1902 /* In some systems you can find out the system default locale
1903 * and use that as the fallback locale. */
7d4bcc4a
KW
1904# define SYSTEM_DEFAULT_LOCALE
1905# endif
1906# ifdef SYSTEM_DEFAULT_LOCALE
1907
65ebb059 1908 const char *system_default_locale = NULL;
98994639 1909
7d4bcc4a 1910# endif
948523db
KW
1911
1912# ifndef DEBUGGING
1913# define DEBUG_LOCALE_INIT(a,b,c)
1914# else
7d4bcc4a 1915
8298454c 1916 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
7d4bcc4a
KW
1917
1918# define DEBUG_LOCALE_INIT(category, locale, result) \
2fcc0ca9
KW
1919 STMT_START { \
1920 if (debug_initialization) { \
1921 PerlIO_printf(Perl_debug_log, \
1922 "%s:%d: %s\n", \
1923 __FILE__, __LINE__, \
a4f00dcc 1924 setlocale_debug_string(category, \
2fcc0ca9
KW
1925 locale, \
1926 result)); \
1927 } \
1928 } STMT_END
2fcc0ca9 1929
948523db
KW
1930/* Make sure the parallel arrays are properly set up */
1931# ifdef USE_LOCALE_NUMERIC
1932 assert(categories[LC_NUMERIC_INDEX] == LC_NUMERIC);
1933 assert(strEQ(category_names[LC_NUMERIC_INDEX], "LC_NUMERIC"));
1934# endif
1935# ifdef USE_LOCALE_CTYPE
1936 assert(categories[LC_CTYPE_INDEX] == LC_CTYPE);
1937 assert(strEQ(category_names[LC_CTYPE_INDEX], "LC_CTYPE"));
1938# endif
1939# ifdef USE_LOCALE_COLLATE
1940 assert(categories[LC_COLLATE_INDEX] == LC_COLLATE);
1941 assert(strEQ(category_names[LC_COLLATE_INDEX], "LC_COLLATE"));
1942# endif
1943# ifdef USE_LOCALE_TIME
1944 assert(categories[LC_TIME_INDEX] == LC_TIME);
1945 assert(strEQ(category_names[LC_TIME_INDEX], "LC_TIME"));
1946# endif
1947# ifdef USE_LOCALE_MESSAGES
1948 assert(categories[LC_MESSAGES_INDEX] == LC_MESSAGES);
1949 assert(strEQ(category_names[LC_MESSAGES_INDEX], "LC_MESSAGES"));
1950# endif
1951# ifdef USE_LOCALE_MONETARY
1952 assert(categories[LC_MONETARY_INDEX] == LC_MONETARY);
1953 assert(strEQ(category_names[LC_MONETARY_INDEX], "LC_MONETARY"));
1954# endif
9821811f
KW
1955# ifdef USE_LOCALE_ADDRESS
1956 assert(categories[LC_ADDRESS_INDEX] == LC_ADDRESS);
1957 assert(strEQ(category_names[LC_ADDRESS_INDEX], "LC_ADDRESS"));
1958# endif
1959# ifdef USE_LOCALE_IDENTIFICATION
1960 assert(categories[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION);
1961 assert(strEQ(category_names[LC_IDENTIFICATION_INDEX], "LC_IDENTIFICATION"));
1962# endif
1963# ifdef USE_LOCALE_MEASUREMENT
1964 assert(categories[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT);
1965 assert(strEQ(category_names[LC_MEASUREMENT_INDEX], "LC_MEASUREMENT"));
1966# endif
1967# ifdef USE_LOCALE_PAPER
1968 assert(categories[LC_PAPER_INDEX] == LC_PAPER);
1969 assert(strEQ(category_names[LC_PAPER_INDEX], "LC_PAPER"));
1970# endif
1971# ifdef USE_LOCALE_TELEPHONE
1972 assert(categories[LC_TELEPHONE_INDEX] == LC_TELEPHONE);
1973 assert(strEQ(category_names[LC_TELEPHONE_INDEX], "LC_TELEPHONE"));
1974# endif
948523db
KW
1975# ifdef LC_ALL
1976 assert(categories[LC_ALL_INDEX] == LC_ALL);
1977 assert(strEQ(category_names[LC_ALL_INDEX], "LC_ALL"));
1978 assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX);
1979# endif
1980# endif /* DEBUGGING */
b56c4436 1981# ifdef LOCALE_ENVIRON_REQUIRED
98994639
HS
1982
1983 /*
1984 * Ultrix setlocale(..., "") fails if there are no environment
1985 * variables from which to get a locale name.
1986 */
1987
b56c4436
KW
1988# ifndef LC_ALL
1989# error Ultrix without LC_ALL not implemented
1990# else
7d4bcc4a 1991
b56c4436
KW
1992 {
1993 bool done = FALSE;
ea92aad8
KW
1994 if (lang) {
1995 sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, setlocale_init);
1996 DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result[LC_ALL_INDEX]);
1997 if (sl_result[LC_ALL_INDEX])
1998 done = TRUE;
1999 else
e5f10d49 2000 setlocale_failure = TRUE;
ea92aad8
KW
2001 }
2002 if (! setlocale_failure) {
2003 const char * locale_param;
2004 for (i = 0; i < LC_ALL_INDEX; i++) {
2005 locale_param = (! done && (lang || PerlEnv_getenv(category_names[i])))
2006 ? setlocale_init
2007 : NULL;
2008 sl_result[i] = do_setlocale_r(categories[i], locale_param);
2009 if (! sl_result[i]) {
2010 setlocale_failure = TRUE;
2011 }
2012 DEBUG_LOCALE_INIT(categories[i], locale_param, sl_result[i]);
e5f10d49 2013 }
c835d6be 2014 }
7d4bcc4a
KW
2015 }
2016
2017# endif /* LC_ALL */
e5f10d49 2018# endif /* LOCALE_ENVIRON_REQUIRED */
98994639 2019
65ebb059 2020 /* We try each locale in the list until we get one that works, or exhaust
20a240df
KW
2021 * the list. Normally the loop is executed just once. But if setting the
2022 * locale fails, inside the loop we add fallback trials to the array and so
2023 * will execute the loop multiple times */
c3fcd832
KW
2024 trial_locales[0] = setlocale_init;
2025 trial_locales_count = 1;
7d4bcc4a 2026
65ebb059
KW
2027 for (i= 0; i < trial_locales_count; i++) {
2028 const char * trial_locale = trial_locales[i];
2029
2030 if (i > 0) {
2031
2032 /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
2033 * when i==0, but I (khw) don't think that behavior makes much
2034 * sense */
2035 setlocale_failure = FALSE;
2036
7d4bcc4a 2037# ifdef SYSTEM_DEFAULT_LOCALE
291a84fb 2038# ifdef WIN32 /* Note that assumes Win32 has LC_ALL */
7d4bcc4a 2039
65ebb059
KW
2040 /* On Windows machines, an entry of "" after the 0th means to use
2041 * the system default locale, which we now proceed to get. */
2042 if (strEQ(trial_locale, "")) {
2043 unsigned int j;
2044
2045 /* Note that this may change the locale, but we are going to do
2046 * that anyway just below */
837ce802 2047 system_default_locale = do_setlocale_c(LC_ALL, "");
5d1187d1 2048 DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
65ebb059 2049
7d4bcc4a 2050 /* Skip if invalid or if it's already on the list of locales to
65ebb059
KW
2051 * try */
2052 if (! system_default_locale) {
2053 goto next_iteration;
2054 }
2055 for (j = 0; j < trial_locales_count; j++) {
2056 if (strEQ(system_default_locale, trial_locales[j])) {
2057 goto next_iteration;
2058 }
2059 }
2060
2061 trial_locale = system_default_locale;
2062 }
ec0202b5
KW
2063# else
2064# error SYSTEM_DEFAULT_LOCALE only implemented for Win32
2065# endif
7d4bcc4a 2066# endif /* SYSTEM_DEFAULT_LOCALE */
291a84fb
KW
2067
2068 } /* For i > 0 */
65ebb059 2069
7d4bcc4a
KW
2070# ifdef LC_ALL
2071
948523db
KW
2072 sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, trial_locale);
2073 DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result[LC_ALL_INDEX]);
2074 if (! sl_result[LC_ALL_INDEX]) {
49c85077 2075 setlocale_failure = TRUE;
7cd8b568
KW
2076 }
2077 else {
2078 /* Since LC_ALL succeeded, it should have changed all the other
2079 * categories it can to its value; so we massage things so that the
2080 * setlocales below just return their category's current values.
2081 * This adequately handles the case in NetBSD where LC_COLLATE may
2082 * not be defined for a locale, and setting it individually will
7d4bcc4a 2083 * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
7cd8b568
KW
2084 * the POSIX locale. */
2085 trial_locale = NULL;
2086 }
7d4bcc4a
KW
2087
2088# endif /* LC_ALL */
98994639 2089
e5f10d49
KW
2090 if (! setlocale_failure) {
2091 unsigned int j;
2092 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
2093 curlocales[j]
2094 = savepv(do_setlocale_r(categories[j], trial_locale));
2095 if (! curlocales[j]) {
2096 setlocale_failure = TRUE;
2097 }
2098 DEBUG_LOCALE_INIT(categories[j], trial_locale, curlocales[j]);
2099 }
c835d6be 2100
e5f10d49
KW
2101 if (! setlocale_failure) { /* All succeeded */
2102 break; /* Exit trial_locales loop */
49c85077 2103 }
65ebb059 2104 }
98994639 2105
49c85077
KW
2106 /* Here, something failed; will need to try a fallback. */
2107 ok = 0;
65ebb059 2108
49c85077
KW
2109 if (i == 0) {
2110 unsigned int j;
98994639 2111
65ebb059 2112 if (locwarn) { /* Output failure info only on the first one */
7d4bcc4a
KW
2113
2114# ifdef LC_ALL
98994639 2115
49c85077
KW
2116 PerlIO_printf(Perl_error_log,
2117 "perl: warning: Setting locale failed.\n");
98994639 2118
7d4bcc4a 2119# else /* !LC_ALL */
98994639 2120
49c85077
KW
2121 PerlIO_printf(Perl_error_log,
2122 "perl: warning: Setting locale failed for the categories:\n\t");
7d4bcc4a 2123
e5f10d49
KW
2124 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
2125 if (! curlocales[j]) {
2126 PerlIO_printf(Perl_error_log, category_names[j]);
2127 }
2128 else {
2129 Safefree(curlocales[j]);
2130 }
2131 }
7d4bcc4a 2132
7d4bcc4a 2133# endif /* LC_ALL */
98994639 2134
49c85077
KW
2135 PerlIO_printf(Perl_error_log,
2136 "perl: warning: Please check that your locale settings:\n");
98994639 2137
7d4bcc4a
KW
2138# ifdef __GLIBC__
2139
49c85077
KW
2140 PerlIO_printf(Perl_error_log,
2141 "\tLANGUAGE = %c%s%c,\n",
2142 language ? '"' : '(',
2143 language ? language : "unset",
2144 language ? '"' : ')');
7d4bcc4a 2145# endif
98994639 2146
49c85077
KW
2147 PerlIO_printf(Perl_error_log,
2148 "\tLC_ALL = %c%s%c,\n",
2149 lc_all ? '"' : '(',
2150 lc_all ? lc_all : "unset",
2151 lc_all ? '"' : ')');
98994639 2152
7d4bcc4a
KW
2153# if defined(USE_ENVIRON_ARRAY)
2154
49c85077 2155 {
cd999af9 2156 char **e;
d5e32b93
KW
2157
2158 /* Look through the environment for any variables of the
2159 * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
2160 * already handled above. These are assumed to be locale
2161 * settings. Output them and their values. */
cd999af9 2162 for (e = environ; *e; e++) {
d5e32b93
KW
2163 const STRLEN prefix_len = sizeof("LC_") - 1;
2164 STRLEN uppers_len;
2165
cd999af9 2166 if ( strBEGINs(*e, "LC_")
c8b388b0 2167 && ! strBEGINs(*e, "LC_ALL=")
d5e32b93
KW
2168 && (uppers_len = strspn(*e + prefix_len,
2169 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
2170 && ((*e)[prefix_len + uppers_len] == '='))
cd999af9
KW
2171 {
2172 PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
d5e32b93
KW
2173 (int) (prefix_len + uppers_len), *e,
2174 *e + prefix_len + uppers_len + 1);
cd999af9
KW
2175 }
2176 }
49c85077 2177 }
7d4bcc4a
KW
2178
2179# else
2180
49c85077
KW
2181 PerlIO_printf(Perl_error_log,
2182 "\t(possibly more locale environment variables)\n");
7d4bcc4a
KW
2183
2184# endif
98994639 2185
49c85077
KW
2186 PerlIO_printf(Perl_error_log,
2187 "\tLANG = %c%s%c\n",
2188 lang ? '"' : '(',
2189 lang ? lang : "unset",
2190 lang ? '"' : ')');
98994639 2191
49c85077
KW
2192 PerlIO_printf(Perl_error_log,
2193 " are supported and installed on your system.\n");
2194 }
98994639 2195
65ebb059 2196 /* Calculate what fallback locales to try. We have avoided this
f6bab5f6 2197 * until we have to, because failure is quite unlikely. This will
65ebb059
KW
2198 * usually change the upper bound of the loop we are in.
2199 *
2200 * Since the system's default way of setting the locale has not
2201 * found one that works, We use Perl's defined ordering: LC_ALL,
2202 * LANG, and the C locale. We don't try the same locale twice, so
2203 * don't add to the list if already there. (On POSIX systems, the
2204 * LC_ALL element will likely be a repeat of the 0th element "",
6b058d42
KW
2205 * but there's no harm done by doing it explicitly.
2206 *
2207 * Note that this tries the LC_ALL environment variable even on
2208 * systems which have no LC_ALL locale setting. This may or may
2209 * not have been originally intentional, but there's no real need
2210 * to change the behavior. */
65ebb059
KW
2211 if (lc_all) {
2212 for (j = 0; j < trial_locales_count; j++) {
2213 if (strEQ(lc_all, trial_locales[j])) {
2214 goto done_lc_all;
2215 }
2216 }
2217 trial_locales[trial_locales_count++] = lc_all;
2218 }
2219 done_lc_all:
98994639 2220
65ebb059
KW
2221 if (lang) {
2222 for (j = 0; j < trial_locales_count; j++) {
2223 if (strEQ(lang, trial_locales[j])) {
2224 goto done_lang;
2225 }
2226 }
2227 trial_locales[trial_locales_count++] = lang;
2228 }
2229 done_lang:
2230
7d4bcc4a
KW
2231# if defined(WIN32) && defined(LC_ALL)
2232
65ebb059
KW
2233 /* For Windows, we also try the system default locale before "C".
2234 * (If there exists a Windows without LC_ALL we skip this because
2235 * it gets too complicated. For those, the "C" is the next
2236 * fallback possibility). The "" is the same as the 0th element of
2237 * the array, but the code at the loop above knows to treat it
2238 * differently when not the 0th */
2239 trial_locales[trial_locales_count++] = "";
7d4bcc4a
KW
2240
2241# endif
65ebb059
KW
2242
2243 for (j = 0; j < trial_locales_count; j++) {
2244 if (strEQ("C", trial_locales[j])) {
2245 goto done_C;
2246 }
2247 }
2248 trial_locales[trial_locales_count++] = "C";
98994639 2249
65ebb059
KW
2250 done_C: ;
2251 } /* end of first time through the loop */
98994639 2252
7d4bcc4a
KW
2253# ifdef WIN32
2254
65ebb059 2255 next_iteration: ;
7d4bcc4a
KW
2256
2257# endif
65ebb059
KW
2258
2259 } /* end of looping through the trial locales */
2260
2261 if (ok < 1) { /* If we tried to fallback */
2262 const char* msg;
2263 if (! setlocale_failure) { /* fallback succeeded */
2264 msg = "Falling back to";
2265 }
2266 else { /* fallback failed */
e5f10d49 2267 unsigned int j;
98994639 2268
65ebb059
KW
2269 /* We dropped off the end of the loop, so have to decrement i to
2270 * get back to the value the last time through */
2271 i--;
98994639 2272
65ebb059
KW
2273 ok = -1;
2274 msg = "Failed to fall back to";
2275
2276 /* To continue, we should use whatever values we've got */
7d4bcc4a 2277
e5f10d49
KW
2278 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
2279 Safefree(curlocales[j]);
2280 curlocales[j] = savepv(do_setlocale_r(categories[j], NULL));
2281 DEBUG_LOCALE_INIT(categories[j], NULL, curlocales[j]);
2282 }
65ebb059
KW
2283 }
2284
2285 if (locwarn) {
2286 const char * description;
2287 const char * name = "";
2288 if (strEQ(trial_locales[i], "C")) {
2289 description = "the standard locale";
2290 name = "C";
2291 }
7d4bcc4a
KW
2292
2293# ifdef SYSTEM_DEFAULT_LOCALE
2294
65ebb059
KW
2295 else if (strEQ(trial_locales[i], "")) {
2296 description = "the system default locale";
2297 if (system_default_locale) {
2298 name = system_default_locale;
2299 }
2300 }
7d4bcc4a
KW
2301
2302# endif /* SYSTEM_DEFAULT_LOCALE */
2303
65ebb059
KW
2304 else {
2305 description = "a fallback locale";
2306 name = trial_locales[i];
2307 }
2308 if (name && strNE(name, "")) {
2309 PerlIO_printf(Perl_error_log,
2310 "perl: warning: %s %s (\"%s\").\n", msg, description, name);
2311 }
2312 else {
2313 PerlIO_printf(Perl_error_log,
2314 "perl: warning: %s %s.\n", msg, description);
2315 }
2316 }
2317 } /* End of tried to fallback */
98994639 2318
e5f10d49
KW
2319 /* Done with finding the locales; update our records */
2320
7d4bcc4a
KW
2321# ifdef USE_LOCALE_CTYPE
2322
948523db 2323 new_ctype(curlocales[LC_CTYPE_INDEX]);
98994639 2324
e5f10d49 2325# endif
7d4bcc4a
KW
2326# ifdef USE_LOCALE_COLLATE
2327
948523db 2328 new_collate(curlocales[LC_COLLATE_INDEX]);
98994639 2329
e5f10d49 2330# endif
7d4bcc4a
KW
2331# ifdef USE_LOCALE_NUMERIC
2332
948523db 2333 new_numeric(curlocales[LC_NUMERIC_INDEX]);
e5f10d49
KW
2334
2335# endif
2336
e5f10d49 2337
948523db 2338 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
e5f10d49
KW
2339 Safefree(curlocales[i]);
2340 }
b310b053 2341
7d4bcc4a
KW
2342# if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
2343
49c85077
KW
2344 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
2345 * locale is UTF-8. If PL_utf8locale and PL_unicode (set by -C or by
2346 * $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the
2347 * PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open
2348 * discipline. */
c1284011 2349 PL_utf8locale = _is_cur_LC_category_utf8(LC_CTYPE);
49c85077 2350
a05d7ebb 2351 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
fde18df1
JH
2352 This is an alternative to using the -C command line switch
2353 (the -C if present will override this). */
2354 {
dd374669 2355 const char *p = PerlEnv_getenv("PERL_UNICODE");
a05d7ebb 2356 PL_unicode = p ? parse_unicode_opts(&p) : 0;
5a22a2bb
NC
2357 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
2358 PL_utf8cache = -1;
b310b053
JH
2359 }
2360
7d4bcc4a 2361# endif
7d4bcc4a
KW
2362# ifdef __GLIBC__
2363
175c4cf9 2364 Safefree(language);
7d4bcc4a
KW
2365
2366# endif
175c4cf9
KW
2367
2368 Safefree(lc_all);
2369 Safefree(lang);
2370
e3305790 2371#endif /* USE_LOCALE */
2fcc0ca9 2372#ifdef DEBUGGING
7d4bcc4a 2373
2fcc0ca9 2374 /* So won't continue to output stuff */
27cdc72e 2375 DEBUG_INITIALIZATION_set(FALSE);
7d4bcc4a 2376
2fcc0ca9
KW
2377#endif
2378
98994639
HS
2379 return ok;
2380}
2381
98994639
HS
2382#ifdef USE_LOCALE_COLLATE
2383
a4a439fb 2384char *
a4a439fb
KW
2385Perl__mem_collxfrm(pTHX_ const char *input_string,
2386 STRLEN len, /* Length of 'input_string' */
2387 STRLEN *xlen, /* Set to length of returned string
2388 (not including the collation index
2389 prefix) */
2390 bool utf8 /* Is the input in UTF-8? */
6696cfa7 2391 )
98994639 2392{
a4a439fb
KW
2393
2394 /* _mem_collxfrm() is a bit like strxfrm() but with two important
2395 * differences. First, it handles embedded NULs. Second, it allocates a bit
2396 * more memory than needed for the transformed data itself. The real
55e5378d 2397 * transformed data begins at offset COLLXFRM_HDR_LEN. *xlen is set to
a4a439fb
KW
2398 * the length of that, and doesn't include the collation index size.
2399 * Please see sv_collxfrm() to see how this is used. */
2400
55e5378d
KW
2401#define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
2402
6696cfa7
KW
2403 char * s = (char *) input_string;
2404 STRLEN s_strlen = strlen(input_string);
79f120c8 2405 char *xbuf = NULL;
55e5378d 2406 STRLEN xAlloc; /* xalloc is a reserved word in VC */
17f41037 2407 STRLEN length_in_chars;
c664130f 2408 bool first_time = TRUE; /* Cleared after first loop iteration */
98994639 2409
a4a439fb
KW
2410 PERL_ARGS_ASSERT__MEM_COLLXFRM;
2411
2412 /* Must be NUL-terminated */
2413 assert(*(input_string + len) == '\0');
7918f24d 2414
79f120c8
KW
2415 /* If this locale has defective collation, skip */
2416 if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
c7202dee
KW
2417 DEBUG_L(PerlIO_printf(Perl_debug_log,
2418 "_mem_collxfrm: locale's collation is defective\n"));
79f120c8
KW
2419 goto bad;
2420 }
2421
6696cfa7
KW
2422 /* Replace any embedded NULs with the control that sorts before any others.
2423 * This will give as good as possible results on strings that don't
2424 * otherwise contain that character, but otherwise there may be
2425 * less-than-perfect results with that character and NUL. This is
fdc080f3 2426 * unavoidable unless we replace strxfrm with our own implementation. */
fd43f63c
KW
2427 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
2428 NUL */
6696cfa7
KW
2429 char * e = s + len;
2430 char * sans_nuls;
fdc080f3 2431 STRLEN sans_nuls_len;
94762aa0 2432 int try_non_controls;
afc4976f
KW
2433 char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
2434 making sure 2nd byte is NUL.
2435 */
2436 STRLEN this_replacement_len;
2437
1e4c9676
KW
2438 /* If we don't know what non-NUL control character sorts lowest for
2439 * this locale, find it */
f28f4d2a 2440 if (PL_strxfrm_NUL_replacement == '\0') {
6696cfa7 2441 int j;
afc4976f 2442 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
6696cfa7
KW
2443 includes the collation index
2444 prefixed. */
2445
91c0e2e0 2446 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
94762aa0
KW
2447
2448 /* Unlikely, but it may be that no control will work to replace
1e4c9676
KW
2449 * NUL, in which case we instead look for any character. Controls
2450 * are preferred because collation order is, in general, context
2451 * sensitive, with adjoining characters affecting the order, and
2452 * controls are less likely to have such interactions, allowing the
2453 * NUL-replacement to stand on its own. (Another way to look at it
2454 * is to imagine what would happen if the NUL were replaced by a
2455 * combining character; it wouldn't work out all that well.) */
94762aa0
KW
2456 for (try_non_controls = 0;
2457 try_non_controls < 2;
2458 try_non_controls++)
2459 {
d4ff9586
KW
2460 /* Look through all legal code points (NUL isn't) */
2461 for (j = 1; j < 256; j++) {
2462 char * x; /* j's xfrm plus collation index */
2463 STRLEN x_len; /* length of 'x' */
2464 STRLEN trial_len = 1;
736a4fed 2465 char cur_source[] = { '\0', '\0' };
d4ff9586 2466
736a4fed
KW
2467 /* Skip non-controls the first time through the loop. The
2468 * controls in a UTF-8 locale are the L1 ones */
afc4976f
KW
2469 if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
2470 ? ! isCNTRL_L1(j)
2471 : ! isCNTRL_LC(j))
2472 {
d4ff9586 2473 continue;
6696cfa7 2474 }
6696cfa7 2475
736a4fed
KW
2476 /* Create a 1-char string of the current code point */
2477 cur_source[0] = (char) j;
2478
d4ff9586
KW
2479 /* Then transform it */
2480 x = _mem_collxfrm(cur_source, trial_len, &x_len,
afc4976f 2481 0 /* The string is not in UTF-8 */);
6696cfa7 2482
1e4c9676 2483 /* Ignore any character that didn't successfully transform.
d4ff9586
KW
2484 * */
2485 if (! x) {
2486 continue;
2487 }
6696cfa7 2488
d4ff9586
KW
2489 /* If this character's transformation is lower than
2490 * the current lowest, this one becomes the lowest */
2491 if ( cur_min_x == NULL
2492 || strLT(x + COLLXFRM_HDR_LEN,
2493 cur_min_x + COLLXFRM_HDR_LEN))
2494 {
f28f4d2a 2495 PL_strxfrm_NUL_replacement = j;
d4ff9586 2496 cur_min_x = x;
d4ff9586
KW
2497 }
2498 else {
2499 Safefree(x);
2500 }
1e4c9676 2501 } /* end of loop through all 255 characters */
6696cfa7 2502
1e4c9676 2503 /* Stop looking if found */
94762aa0
KW
2504 if (cur_min_x) {
2505 break;
2506 }
2507
2508 /* Unlikely, but possible, if there aren't any controls that
2509 * work in the locale, repeat the loop, looking for any
2510 * character that works */
2511 DEBUG_L(PerlIO_printf(Perl_debug_log,
2512 "_mem_collxfrm: No control worked. Trying non-controls\n"));
1e4c9676 2513 } /* End of loop to try first the controls, then any char */
6696cfa7 2514
94762aa0
KW
2515 if (! cur_min_x) {
2516 DEBUG_L(PerlIO_printf(Perl_debug_log,
2517 "_mem_collxfrm: Couldn't find any character to replace"
2518 " embedded NULs in locale %s with", PL_collation_name));
2519 goto bad;
58eebef2
KW
2520 }
2521
94762aa0
KW
2522 DEBUG_L(PerlIO_printf(Perl_debug_log,
2523 "_mem_collxfrm: Replacing embedded NULs in locale %s with "
f28f4d2a 2524 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
94762aa0 2525
6696cfa7 2526 Safefree(cur_min_x);
1e4c9676 2527 } /* End of determining the character that is to replace NULs */
afc4976f
KW
2528
2529 /* If the replacement is variant under UTF-8, it must match the
291a84fb 2530 * UTF8-ness of the original */
f28f4d2a
KW
2531 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
2532 this_replacement_char[0] =
2533 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
2534 this_replacement_char[1] =
2535 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
afc4976f
KW
2536 this_replacement_len = 2;
2537 }
2538 else {
f28f4d2a 2539 this_replacement_char[0] = PL_strxfrm_NUL_replacement;
afc4976f
KW
2540 /* this_replacement_char[1] = '\0' was done at initialization */
2541 this_replacement_len = 1;
6696cfa7
KW
2542 }
2543
2544 /* The worst case length for the replaced string would be if every
2545 * character in it is NUL. Multiply that by the length of each
2546 * replacement, and allow for a trailing NUL */
afc4976f 2547 sans_nuls_len = (len * this_replacement_len) + 1;
fdc080f3 2548 Newx(sans_nuls, sans_nuls_len, char);
6696cfa7
KW
2549 *sans_nuls = '\0';
2550
6696cfa7
KW
2551 /* Replace each NUL with the lowest collating control. Loop until have
2552 * exhausted all the NULs */
2553 while (s + s_strlen < e) {
6069d6c5 2554 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
2555
2556 /* Do the actual replacement */
6069d6c5 2557 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
6696cfa7
KW
2558
2559 /* Move past the input NUL */
2560 s += s_strlen + 1;
2561 s_strlen = strlen(s);
2562 }
2563
2564 /* And add anything that trails the final NUL */
6069d6c5 2565 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
2566
2567 /* Switch so below we transform this modified string */
2568 s = sans_nuls;
2569 len = strlen(s);
1e4c9676 2570 } /* End of replacing NULs */
6696cfa7 2571
a4a439fb
KW
2572 /* Make sure the UTF8ness of the string and locale match */
2573 if (utf8 != PL_in_utf8_COLLATE_locale) {
2574 const char * const t = s; /* Temporary so we can later find where the
2575 input was */
2576
2577 /* Here they don't match. Change the string's to be what the locale is
2578 * expecting */
2579
2580 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
2581 s = (char *) bytes_to_utf8((const U8 *) s, &len);
2582 utf8 = TRUE;
2583 }
2584 else { /* locale is not UTF-8; but input is; downgrade the input */
2585
2586 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
2587
2588 /* If the downgrade was successful we are done, but if the input
2589 * contains things that require UTF-8 to represent, have to do
2590 * damage control ... */
2591 if (UNLIKELY(utf8)) {
2592
2593 /* What we do is construct a non-UTF-8 string with
2594 * 1) the characters representable by a single byte converted
2595 * to be so (if necessary);
2596 * 2) and the rest converted to collate the same as the
2597 * highest collating representable character. That makes
2598 * them collate at the end. This is similar to how we
2599 * handle embedded NULs, but we use the highest collating
2600 * code point instead of the smallest. Like the NUL case,
2601 * this isn't perfect, but is the best we can reasonably
2602 * do. Every above-255 code point will sort the same as
2603 * the highest-sorting 0-255 code point. If that code
2604 * point can combine in a sequence with some other code
2605 * points for weight calculations, us changing something to
2606 * be it can adversely affect the results. But in most
2607 * cases, it should work reasonably. And note that this is
2608 * really an illegal situation: using code points above 255
2609 * on a locale where only 0-255 are valid. If two strings
2610 * sort entirely equal, then the sort order for the
2611 * above-255 code points will be in code point order. */
2612
2613 utf8 = FALSE;
2614
2615 /* If we haven't calculated the code point with the maximum
2616 * collating order for this locale, do so now */
2617 if (! PL_strxfrm_max_cp) {
2618 int j;
2619
2620 /* The current transformed string that collates the
2621 * highest (except it also includes the prefixed collation
2622 * index. */
2623 char * cur_max_x = NULL;
2624
2625 /* Look through all legal code points (NUL isn't) */
2626 for (j = 1; j < 256; j++) {
2627 char * x;
2628 STRLEN x_len;
736a4fed 2629 char cur_source[] = { '\0', '\0' };
a4a439fb 2630
736a4fed
KW
2631 /* Create a 1-char string of the current code point */
2632 cur_source[0] = (char) j;
a4a439fb
KW
2633
2634 /* Then transform it */
2635 x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
2636
2637 /* If something went wrong (which it shouldn't), just
2638 * ignore this code point */
94762aa0 2639 if (! x) {
a4a439fb
KW
2640 continue;
2641 }
2642
2643 /* If this character's transformation is higher than
2644 * the current highest, this one becomes the highest */
2645 if ( cur_max_x == NULL
55e5378d
KW
2646 || strGT(x + COLLXFRM_HDR_LEN,
2647 cur_max_x + COLLXFRM_HDR_LEN))
a4a439fb
KW
2648 {
2649 PL_strxfrm_max_cp = j;
2650 cur_max_x = x;
2651 }
2652 else {
2653 Safefree(x);
2654 }
2655 }
2656
94762aa0
KW
2657 if (! cur_max_x) {
2658 DEBUG_L(PerlIO_printf(Perl_debug_log,
2659 "_mem_collxfrm: Couldn't find any character to"
2660 " replace above-Latin1 chars in locale %s with",
2661 PL_collation_name));
2662 goto bad;
2663 }
2664
58eebef2
KW
2665 DEBUG_L(PerlIO_printf(Perl_debug_log,
2666 "_mem_collxfrm: highest 1-byte collating character"
2667 " in locale %s is 0x%02X\n",
2668 PL_collation_name,
2669 PL_strxfrm_max_cp));
58eebef2 2670
a4a439fb
KW
2671 Safefree(cur_max_x);
2672 }
2673
2674 /* Here we know which legal code point collates the highest.
2675 * We are ready to construct the non-UTF-8 string. The length
2676 * will be at least 1 byte smaller than the input string
2677 * (because we changed at least one 2-byte character into a
2678 * single byte), but that is eaten up by the trailing NUL */
2679 Newx(s, len, char);
2680
2681 {
2682 STRLEN i;
2683 STRLEN d= 0;
042d9e50 2684 char * e = (char *) t + len;
a4a439fb
KW
2685
2686 for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
2687 U8 cur_char = t[i];
2688 if (UTF8_IS_INVARIANT(cur_char)) {
2689 s[d++] = cur_char;
2690 }
042d9e50 2691 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
a4a439fb
KW
2692 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
2693 }
2694 else { /* Replace illegal cp with highest collating
2695 one */
2696 s[d++] = PL_strxfrm_max_cp;
2697 }
2698 }
2699 s[d++] = '\0';
2700 Renew(s, d, char); /* Free up unused space */
2701 }
2702 }
2703 }
2704
2705 /* Here, we have constructed a modified version of the input. It could
2706 * be that we already had a modified copy before we did this version.
2707 * If so, that copy is no longer needed */
2708 if (t != input_string) {
2709 Safefree(t);
2710 }
2711 }
2712
17f41037
KW
2713 length_in_chars = (utf8)
2714 ? utf8_length((U8 *) s, (U8 *) s + len)
2715 : len;
2716
59c018b9
KW
2717 /* The first element in the output is the collation id, used by
2718 * sv_collxfrm(); then comes the space for the transformed string. The
2719 * equation should give us a good estimate as to how much is needed */
55e5378d 2720 xAlloc = COLLXFRM_HDR_LEN
a4a439fb 2721 + PL_collxfrm_base
17f41037 2722 + (PL_collxfrm_mult * length_in_chars);
a02a5408 2723 Newx(xbuf, xAlloc, char);
c7202dee
KW
2724 if (UNLIKELY(! xbuf)) {
2725 DEBUG_L(PerlIO_printf(Perl_debug_log,
2726 "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
98994639 2727 goto bad;
c7202dee 2728 }
98994639 2729
d35fca5f 2730 /* Store the collation id */
98994639 2731 *(U32*)xbuf = PL_collation_ix;
d35fca5f
KW
2732
2733 /* Then the transformation of the input. We loop until successful, or we
2734 * give up */
4ebeff16 2735 for (;;) {
1adab0a7 2736
55e5378d 2737 *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
4ebeff16
KW
2738
2739 /* If the transformed string occupies less space than we told strxfrm()
2740 * was available, it means it successfully transformed the whole
2741 * string. */
55e5378d 2742 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
17f41037 2743
1adab0a7
KW
2744 /* Some systems include a trailing NUL in the returned length.
2745 * Ignore it, using a loop in case multiple trailing NULs are
2746 * returned. */
2747 while ( (*xlen) > 0
2748 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
2749 {
2750 (*xlen)--;
2751 }
2752
17f41037
KW
2753 /* If the first try didn't get it, it means our prediction was low.
2754 * Modify the coefficients so that we predict a larger value in any
2755 * future transformations */
2756 if (! first_time) {
2757 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */
2758 STRLEN computed_guess = PL_collxfrm_base
2759 + (PL_collxfrm_mult * length_in_chars);
e1c30f0c
KW
2760
2761 /* On zero-length input, just keep current slope instead of
2762 * dividing by 0 */
2763 const STRLEN new_m = (length_in_chars != 0)
2764 ? needed / length_in_chars
2765 : PL_collxfrm_mult;
17f41037
KW
2766
2767 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
2768 "%s: %d: initial size of %zu bytes for a length "
2769 "%zu string was insufficient, %zu needed\n",
17f41037 2770 __FILE__, __LINE__,
b07929e4 2771 computed_guess, length_in_chars, needed));
17f41037
KW
2772
2773 /* If slope increased, use it, but discard this result for
2774 * length 1 strings, as we can't be sure that it's a real slope
2775 * change */
2776 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
7d4bcc4a
KW
2777
2778# ifdef DEBUGGING
2779
17f41037
KW
2780 STRLEN old_m = PL_collxfrm_mult;
2781 STRLEN old_b = PL_collxfrm_base;
7d4bcc4a
KW
2782
2783# endif
2784
17f41037
KW
2785 PL_collxfrm_mult = new_m;
2786 PL_collxfrm_base = 1; /* +1 For trailing NUL */
2787 computed_guess = PL_collxfrm_base
2788 + (PL_collxfrm_mult * length_in_chars);
2789 if (computed_guess < needed) {
2790 PL_collxfrm_base += needed - computed_guess;
2791 }
2792
2793 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
2794 "%s: %d: slope is now %zu; was %zu, base "
2795 "is now %zu; was %zu\n",
17f41037 2796 __FILE__, __LINE__,
b07929e4
KW
2797 PL_collxfrm_mult, old_m,
2798 PL_collxfrm_base, old_b));
17f41037
KW
2799 }
2800 else { /* Slope didn't change, but 'b' did */
2801 const STRLEN new_b = needed
2802 - computed_guess
2803 + PL_collxfrm_base;
2804 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4 2805 "%s: %d: base is now %zu; was %zu\n",
17f41037 2806 __FILE__, __LINE__,
b07929e4 2807 new_b, PL_collxfrm_base));
17f41037
KW
2808 PL_collxfrm_base = new_b;
2809 }
2810 }
2811
4ebeff16
KW
2812 break;
2813 }
bb0f664e 2814
c7202dee
KW
2815 if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
2816 DEBUG_L(PerlIO_printf(Perl_debug_log,
2817 "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
2818 *xlen, PERL_INT_MAX));
4ebeff16 2819 goto bad;
c7202dee 2820 }
d35fca5f 2821
c664130f 2822 /* A well-behaved strxfrm() returns exactly how much space it needs
1adab0a7
KW
2823 * (usually not including the trailing NUL) when it fails due to not
2824 * enough space being provided. Assume that this is the case unless
2825 * it's been proven otherwise */
c664130f 2826 if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
55e5378d 2827 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
c664130f
KW
2828 }
2829 else { /* Here, either:
2830 * 1) The strxfrm() has previously shown bad behavior; or
2831 * 2) It isn't the first time through the loop, which means
2832 * that the strxfrm() is now showing bad behavior, because
2833 * we gave it what it said was needed in the previous
2834 * iteration, and it came back saying it needed still more.
2835 * (Many versions of cygwin fit this. When the buffer size
2836 * isn't sufficient, they return the input size instead of
2837 * how much is needed.)
d4ff9586
KW
2838 * Increase the buffer size by a fixed percentage and try again.
2839 * */
6ddd902c 2840 xAlloc += (xAlloc / 4) + 1;
c664130f 2841 PL_strxfrm_is_behaved = FALSE;
c664130f 2842
7d4bcc4a
KW
2843# ifdef DEBUGGING
2844
58eebef2
KW
2845 if (DEBUG_Lv_TEST || debug_initialization) {
2846 PerlIO_printf(Perl_debug_log,
2847 "_mem_collxfrm required more space than previously calculated"
b07929e4 2848 " for locale %s, trying again with new guess=%d+%zu\n",
58eebef2 2849 PL_collation_name, (int) COLLXFRM_HDR_LEN,
b07929e4 2850 xAlloc - COLLXFRM_HDR_LEN);
58eebef2 2851 }
7d4bcc4a
KW
2852
2853# endif
2854
58eebef2 2855 }
c664130f 2856
4ebeff16 2857 Renew(xbuf, xAlloc, char);
c7202dee
KW
2858 if (UNLIKELY(! xbuf)) {
2859 DEBUG_L(PerlIO_printf(Perl_debug_log,
2860 "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
4ebeff16 2861 goto bad;
c7202dee 2862 }
c664130f
KW
2863
2864 first_time = FALSE;
4ebeff16 2865 }
98994639 2866
6696cfa7 2867
7d4bcc4a
KW
2868# ifdef DEBUGGING
2869
58eebef2 2870 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee
KW
2871
2872 print_collxfrm_input_and_return(s, s + len, xlen, utf8);
2873 PerlIO_printf(Perl_debug_log, "Its xfrm is:");
7e2f38b2
KW
2874 PerlIO_printf(Perl_debug_log, "%s\n",
2875 _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
2876 *xlen, 1));
58eebef2 2877 }
7d4bcc4a
KW
2878
2879# endif
58eebef2 2880
3c5f993e 2881 /* Free up unneeded space; retain ehough for trailing NUL */
55e5378d 2882 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
98994639 2883
6696cfa7
KW
2884 if (s != input_string) {
2885 Safefree(s);
98994639
HS
2886 }
2887
98994639
HS
2888 return xbuf;
2889
2890 bad:
2891 Safefree(xbuf);
6696cfa7
KW
2892 if (s != input_string) {
2893 Safefree(s);
2894 }
98994639 2895 *xlen = 0;
7d4bcc4a
KW
2896
2897# ifdef DEBUGGING
2898
58eebef2 2899 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee 2900 print_collxfrm_input_and_return(s, s + len, NULL, utf8);
58eebef2 2901 }
7d4bcc4a
KW
2902
2903# endif
2904
98994639
HS
2905 return NULL;
2906}
2907
7d4bcc4a 2908# ifdef DEBUGGING
c7202dee 2909
4cbaac56 2910STATIC void
c7202dee
KW
2911S_print_collxfrm_input_and_return(pTHX_
2912 const char * const s,
2913 const char * const e,
2914 const STRLEN * const xlen,
2915 const bool is_utf8)
2916{
c7202dee
KW
2917
2918 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
2919
511e4ff7
DM
2920 PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
2921 (UV)PL_collation_ix);
c7202dee 2922 if (xlen) {
08b6dc1d 2923 PerlIO_printf(Perl_debug_log, "%zu", *xlen);
c7202dee
KW
2924 }
2925 else {
2926 PerlIO_printf(Perl_debug_log, "NULL");
2927 }
2928 PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
2929 PL_collation_name);
9c8a6dc2
KW
2930 print_bytes_for_locale(s, e, is_utf8);
2931
2932 PerlIO_printf(Perl_debug_log, "'\n");
2933}
2934
2935STATIC void
2936S_print_bytes_for_locale(pTHX_
2937 const char * const s,
2938 const char * const e,
2939 const bool is_utf8)
2940{
2941 const char * t = s;
2942 bool prev_was_printable = TRUE;
2943 bool first_time = TRUE;
2944
2945 PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
c7202dee
KW
2946
2947 while (t < e) {
2948 UV cp = (is_utf8)
2949 ? utf8_to_uvchr_buf((U8 *) t, e, NULL)
2950 : * (U8 *) t;
2951 if (isPRINT(cp)) {
2952 if (! prev_was_printable) {
2953 PerlIO_printf(Perl_debug_log, " ");
2954 }
2955 PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
2956 prev_was_printable = TRUE;
2957 }
2958 else {
2959 if (! first_time) {
2960 PerlIO_printf(Perl_debug_log, " ");
2961 }
147e3846 2962 PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
c7202dee
KW
2963 prev_was_printable = FALSE;
2964 }
2965 t += (is_utf8) ? UTF8SKIP(t) : 1;
2966 first_time = FALSE;
2967 }
c7202dee
KW
2968}
2969
7d4bcc4a 2970# endif /* #ifdef DEBUGGING */
98994639 2971#endif /* USE_LOCALE_COLLATE */
58eebef2 2972
8ef6e574
KW
2973#ifdef USE_LOCALE
2974
c1284011
KW
2975bool
2976Perl__is_cur_LC_category_utf8(pTHX_ int category)
7d74bb61
KW
2977{
2978 /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
2979 * otherwise. 'category' may not be LC_ALL. If the platform doesn't have
119ee68b 2980 * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
609548d2
KW
2981 * could give the wrong result. The result will very likely be correct for
2982 * languages that have commonly used non-ASCII characters, but for notably
2983 * English, it comes down to if the locale's name ends in something like
2984 * "UTF-8". It errs on the side of not being a UTF-8 locale. */
7d74bb61 2985
8de4332b 2986 const char *save_input_locale = NULL;
7d74bb61
KW
2987 STRLEN final_pos;
2988
7d4bcc4a
KW
2989# ifdef LC_ALL
2990
7d74bb61 2991 assert(category != LC_ALL);
7d4bcc4a
KW
2992
2993# endif
7d74bb61
KW
2994
2995 /* First dispose of the trivial cases */
837ce802 2996 save_input_locale = do_setlocale_r(category, NULL);
7d74bb61 2997 if (! save_input_locale) {
69014004
KW
2998 DEBUG_L(PerlIO_printf(Perl_debug_log,
2999 "Could not find current locale for category %d\n",
3000 category));
7d74bb61
KW
3001 return FALSE; /* XXX maybe should croak */
3002 }
b07fffd1 3003 save_input_locale = stdize_locale(savepv(save_input_locale));
a39edc4c 3004 if (isNAME_C_OR_POSIX(save_input_locale)) {
69014004
KW
3005 DEBUG_L(PerlIO_printf(Perl_debug_log,
3006 "Current locale for category %d is %s\n",
3007 category, save_input_locale));
b07fffd1 3008 Safefree(save_input_locale);
7d74bb61
KW
3009 return FALSE;
3010 }
3011
7d4bcc4a 3012# if defined(USE_LOCALE_CTYPE) \
1d958db2 3013 && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET)))
7d74bb61 3014
1d958db2 3015 { /* Next try nl_langinfo or MB_CUR_MAX if available */
7d74bb61
KW
3016
3017 char *save_ctype_locale = NULL;
119ee68b 3018 bool is_utf8;
7d74bb61 3019
119ee68b 3020 if (category != LC_CTYPE) { /* These work only on LC_CTYPE */
7d74bb61
KW
3021
3022 /* Get the current LC_CTYPE locale */
837ce802 3023 save_ctype_locale = do_setlocale_c(LC_CTYPE, NULL);
7d74bb61 3024 if (! save_ctype_locale) {
69014004
KW
3025 DEBUG_L(PerlIO_printf(Perl_debug_log,
3026 "Could not find current locale for LC_CTYPE\n"));
7d74bb61
KW
3027 goto cant_use_nllanginfo;
3028 }
4f72bb37 3029 save_ctype_locale = stdize_locale(savepv(save_ctype_locale));
7d74bb61
KW
3030
3031 /* If LC_CTYPE and the desired category use the same locale, this
3032 * means that finding the value for LC_CTYPE is the same as finding
3033 * the value for the desired category. Otherwise, switch LC_CTYPE
3034 * to the desired category's locale */
3035 if (strEQ(save_ctype_locale, save_input_locale)) {
3036 Safefree(save_ctype_locale);
3037 save_ctype_locale = NULL;
3038 }
837ce802 3039 else if (! do_setlocale_c(LC_CTYPE, save_input_locale)) {
69014004
KW
3040 DEBUG_L(PerlIO_printf(Perl_debug_log,
3041 "Could not change LC_CTYPE locale to %s\n",
3042 save_input_locale));
7d74bb61
KW
3043 Safefree(save_ctype_locale);
3044 goto cant_use_nllanginfo;
3045 }
3046 }
3047
69014004
KW
3048 DEBUG_L(PerlIO_printf(Perl_debug_log, "Current LC_CTYPE locale=%s\n",
3049 save_input_locale));
3050
7d74bb61 3051 /* Here the current LC_CTYPE is set to the locale of the category whose
1d958db2
KW
3052 * information is desired. This means that nl_langinfo() and MB_CUR_MAX
3053 * should give the correct results */
119ee68b 3054
7d4bcc4a
KW
3055# if defined(HAS_NL_LANGINFO) && defined(CODESET)
3056
291a84fb 3057 { /* The task is easiest if the platform has this POSIX 2001 function */
c70a3e68
KW
3058 const char *codeset = my_nl_langinfo(PERL_CODESET, FALSE);
3059 /* FALSE => already in dest locale */
119ee68b 3060
c70a3e68
KW
3061 DEBUG_L(PerlIO_printf(Perl_debug_log,
3062 "\tnllanginfo returned CODESET '%s'\n", codeset));
3063
3064 if (codeset && strNE(codeset, "")) {
1d958db2
KW
3065 /* If we switched LC_CTYPE, switch back */
3066 if (save_ctype_locale) {
837ce802 3067 do_setlocale_c(LC_CTYPE, save_ctype_locale);
1d958db2
KW
3068 Safefree(save_ctype_locale);
3069 }
3070
abc1d81d
KW
3071 is_utf8 = ( ( strlen(codeset) == STRLENs("UTF-8")
3072 && foldEQ(codeset, STR_WITH_LEN("UTF-8")))
3073 || ( strlen(codeset) == STRLENs("UTF8")
3074 && foldEQ(codeset, STR_WITH_LEN("UTF8"))));
1d958db2 3075
69014004
KW
3076 DEBUG_L(PerlIO_printf(Perl_debug_log,
3077 "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
3078 codeset, is_utf8));
1d958db2
KW
3079 Safefree(save_input_locale);
3080 return is_utf8;
3081 }
119ee68b
KW
3082 }
3083
7d4bcc4a
KW
3084# endif
3085# ifdef MB_CUR_MAX
1d958db2
KW
3086
3087 /* Here, either we don't have nl_langinfo, or it didn't return a
3088 * codeset. Try MB_CUR_MAX */
3089
119ee68b
KW
3090 /* Standard UTF-8 needs at least 4 bytes to represent the maximum
3091 * Unicode code point. Since UTF-8 is the only non-single byte
3092 * encoding we handle, we just say any such encoding is UTF-8, and if
3093 * turns out to be wrong, other things will fail */
5f7616bd 3094 is_utf8 = (unsigned) MB_CUR_MAX >= STRLENs(MAX_UNICODE_UTF8);
119ee68b 3095
69014004
KW
3096 DEBUG_L(PerlIO_printf(Perl_debug_log,
3097 "\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n",
3098 (int) MB_CUR_MAX, is_utf8));
3099
119ee68b
KW
3100 Safefree(save_input_locale);
3101
7d4bcc4a 3102# ifdef HAS_MBTOWC
119ee68b
KW
3103
3104 /* ... But, most system that have MB_CUR_MAX will also have mbtowc(),
3105 * since they are both in the C99 standard. We can feed a known byte
3106 * string to the latter function, and check that it gives the expected
3107 * result */
3108 if (is_utf8) {
3109 wchar_t wc;
51fc4b19
KW
3110 int len;
3111
856b881c 3112 PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
69014004 3113 errno = 0;
b1d4925c 3114 len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
51fc4b19 3115
b1d4925c
KW
3116
3117 if ( len != STRLENs(REPLACEMENT_CHARACTER_UTF8)
5579633c 3118 || wc != (wchar_t) UNICODE_REPLACEMENT)
119ee68b
KW
3119 {
3120 is_utf8 = FALSE;
b1d4925c
KW
3121 DEBUG_L(PerlIO_printf(Perl_debug_log, "\replacement=U+%x\n",
3122 (unsigned int)wc));
69014004
KW
3123 DEBUG_L(PerlIO_printf(Perl_debug_log,
3124 "\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n",
51fc4b19 3125 len, errno));
119ee68b
KW
3126 }
3127 }
7d4bcc4a
KW
3128
3129# endif
119ee68b 3130
1d958db2
KW
3131 /* If we switched LC_CTYPE, switch back */
3132 if (save_ctype_locale) {
837ce802 3133 do_setlocale_c(LC_CTYPE, save_ctype_locale);
1d958db2 3134 Safefree(save_ctype_locale);
119ee68b 3135 }
7d74bb61 3136
1d958db2 3137 return is_utf8;
7d4bcc4a
KW
3138
3139# endif
3140
7d74bb61 3141 }
119ee68b 3142
7d74bb61
KW
3143 cant_use_nllanginfo:
3144
7d4bcc4a 3145# else /* nl_langinfo should work if available, so don't bother compiling this
0080c90a
KW
3146 fallback code. The final fallback of looking at the name is
3147 compiled, and will be executed if nl_langinfo fails */
7d74bb61 3148
97f4de96
KW
3149 /* nl_langinfo not available or failed somehow. Next try looking at the
3150 * currency symbol to see if it disambiguates things. Often that will be
3151 * in the native script, and if the symbol isn't in UTF-8, we know that the
3152 * locale isn't. If it is non-ASCII UTF-8, we infer that the locale is
609548d2
KW
3153 * too, as the odds of a non-UTF8 string being valid UTF-8 are quite small
3154 * */
fa9b773e 3155
7d4bcc4a
KW
3156# ifdef HAS_LOCALECONV
3157# ifdef USE_LOCALE_MONETARY
3158
fa9b773e
KW
3159 {
3160 char *save_monetary_locale = NULL;
fa9b773e 3161 bool only_ascii = FALSE;
13542a67
KW
3162 bool is_utf8 = FALSE;
3163 struct lconv* lc;
fa9b773e 3164
97f4de96
KW
3165 /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of
3166 * the desired category, if it isn't that locale already */
3167
fa9b773e
KW
3168 if (category != LC_MONETARY) {
3169
837ce802 3170 save_monetary_locale = do_setlocale_c(LC_MONETARY, NULL);
fa9b773e 3171 if (! save_monetary_locale) {
69014004
KW
3172 DEBUG_L(PerlIO_printf(Perl_debug_log,
3173 "Could not find current locale for LC_MONETARY\n"));
fa9b773e
KW
3174 goto cant_use_monetary;
3175 }
4f72bb37 3176 save_monetary_locale = stdize_locale(savepv(save_monetary_locale));
fa9b773e 3177
13542a67
KW
3178 if (strEQ(save_monetary_locale, save_input_locale)) {
3179 Safefree(save_monetary_locale);
3180 save_monetary_locale = NULL;
3181 }
837ce802 3182 else if (! do_setlocale_c(LC_MONETARY, save_input_locale)) {
59c234b4
KW
3183 DEBUG_L(PerlIO_printf(Perl_debug_log,
3184 "Could not change LC_MONETARY locale to %s\n",
3185 save_input_locale));
3186 Safefree(save_monetary_locale);
3187 goto cant_use_monetary;
fa9b773e
KW
3188 }
3189 }
3190
3191 /* Here the current LC_MONETARY is set to the locale of the category
3192 * whose information is desired. */
3193
13542a67
KW
3194 lc = localeconv();
3195 if (! lc
3196 || ! lc->currency_symbol
c5f058df 3197 || is_utf8_invariant_string((U8 *) lc->currency_symbol, 0))
13542a67
KW
3198 {
3199 DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3200 only_ascii = TRUE;
3201 }
3202 else {
3203 is_utf8 = is_utf8_string((U8 *) lc->currency_symbol, 0);
fa9b773e
KW
3204 }
3205
3206 /* If we changed it, restore LC_MONETARY to its original locale */
3207 if (save_monetary_locale) {
837ce802 3208 do_setlocale_c(LC_MONETARY, save_monetary_locale);
fa9b773e
KW
3209 Safefree(save_monetary_locale);
3210 }
3211
13542a67 3212 if (! only_ascii) {
fa9b773e 3213
59c234b4
KW
3214 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
3215 * otherwise assume the locale is UTF-8 if and only if the symbol
3216 * is non-ascii UTF-8. */
3217 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
3218 save_input_locale, is_utf8));
3219 Safefree(save_input_locale);
3220 return is_utf8;
13542a67 3221 }
fa9b773e
KW
3222 }
3223 cant_use_monetary:
3224
7d4bcc4a
KW
3225# endif /* USE_LOCALE_MONETARY */
3226# endif /* HAS_LOCALECONV */
fa9b773e 3227
7d4bcc4a 3228# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
15f7e74e
KW
3229
3230/* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
3231 * the names of the months and weekdays, timezone, and am/pm indicator */
3232 {
3233 char *save_time_locale = NULL;
3234 int hour = 10;
3235 bool is_dst = FALSE;
3236 int dom = 1;
3237 int month = 0;
3238 int i;
3239 char * formatted_time;
3240
3241
3242 /* Like above for LC_MONETARY, we set LC_TIME to the locale of the
3243 * desired category, if it isn't that locale already */
3244
3245 if (category != LC_TIME) {
3246
837ce802 3247 save_time_locale = do_setlocale_c(LC_TIME, NULL);
15f7e74e
KW
3248 if (! save_time_locale) {
3249 DEBUG_L(PerlIO_printf(Perl_debug_log,
3250 "Could not find current locale for LC_TIME\n"));
3251 goto cant_use_time;
3252 }
3253 save_time_locale = stdize_locale(savepv(save_time_locale));
3254
3255 if (strEQ(save_time_locale, save_input_locale)) {
3256 Safefree(save_time_locale);
3257 save_time_locale = NULL;
3258 }
837ce802 3259 else if (! do_setlocale_c(LC_TIME, save_input_locale)) {
15f7e74e
KW
3260 DEBUG_L(PerlIO_printf(Perl_debug_log,
3261 "Could not change LC_TIME locale to %s\n",
3262 save_input_locale));
3263 Safefree(save_time_locale);
3264 goto cant_use_time;
3265 }
3266 }
3267
3268 /* Here the current LC_TIME is set to the locale of the category
3269 * whose information is desired. Look at all the days of the week and
9f10db87 3270 * month names, and the timezone and am/pm indicator for UTF-8 variant
15f7e74e
KW
3271 * characters. The first such a one found will tell us if the locale
3272 * is UTF-8 or not */
3273
3274 for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
3275 formatted_time = my_strftime("%A %B %Z %p",
3ae5cd07 3276 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
c5f058df
KW
3277 if ( ! formatted_time
3278 || is_utf8_invariant_string((U8 *) formatted_time, 0))
3279 {
15f7e74e
KW
3280
3281 /* Here, we didn't find a non-ASCII. Try the next time through
3282 * with the complemented dst and am/pm, and try with the next
3283 * weekday. After we have gotten all weekdays, try the next
3284 * month */
3285 is_dst = ! is_dst;
3286 hour = (hour + 12) % 24;
3287 dom++;
3288 if (i > 6) {
3289 month++;
3290 }
3291 continue;
3292 }
3293
3294 /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
3295 * false otherwise. But first, restore LC_TIME to its original
3296 * locale if we changed it */
3297 if (save_time_locale) {
837ce802 3298 do_setlocale_c(LC_TIME, save_time_locale);
15f7e74e
KW
3299 Safefree(save_time_locale);
3300 }
3301
3302 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
3303 save_input_locale,
3304 is_utf8_string((U8 *) formatted_time, 0)));
3305 Safefree(save_input_locale);
3306 return is_utf8_string((U8 *) formatted_time, 0);
3307 }
3308
3309 /* Falling off the end of the loop indicates all the names were just
3310 * ASCII. Go on to the next test. If we changed it, restore LC_TIME
3311 * to its original locale */
3312 if (save_time_locale) {
837ce802 3313 do_setlocale_c(LC_TIME, save_time_locale);
15f7e74e
KW
3314 Safefree(save_time_locale);
3315 }
3316 DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3317 }
3318 cant_use_time:
3319
7d4bcc4a 3320# endif
15f7e74e 3321
7d4bcc4a 3322# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
855aeb93
JH
3323
3324/* This code is ifdefd out because it was found to not be necessary in testing
5857e934
KW
3325 * on our dromedary test machine, which has over 700 locales. There, this
3326 * added no value to looking at the currency symbol and the time strings. I
3327 * left it in so as to avoid rewriting it if real-world experience indicates
3328 * that dromedary is an outlier. Essentially, instead of returning abpve if we
855aeb93
JH
3329 * haven't found illegal utf8, we continue on and examine all the strerror()
3330 * messages on the platform for utf8ness. If all are ASCII, we still don't
3331 * know the answer; but otherwise we have a pretty good indication of the
5857e934
KW
3332 * utf8ness. The reason this doesn't help much is that the messages may not
3333 * have been translated into the locale. The currency symbol and time strings
3334 * are much more likely to have been translated. */
3335 {
855aeb93 3336 int e;
5857e934
KW
3337 bool is_utf8 = FALSE;
3338 bool non_ascii = FALSE;
855aeb93 3339 char *save_messages_locale = NULL;
5857e934 3340 const char * errmsg = NULL;
855aeb93 3341
5857e934
KW
3342 /* Like above, we set LC_MESSAGES to the locale of the desired
3343 * category, if it isn't that locale already */
855aeb93
JH
3344
3345 if (category != LC_MESSAGES) {
3346
837ce802 3347 save_messages_locale = do_setlocale_c(LC_MESSAGES, NULL);
855aeb93 3348 if (! save_messages_locale) {
5857e934
KW
3349 DEBUG_L(PerlIO_printf(Perl_debug_log,
3350 "Could not find current locale for LC_MESSAGES\n"));
855aeb93
JH
3351 goto cant_use_messages;
3352 }
5857e934 3353 save_messages_locale = stdize_locale(savepv(save_messages_locale));
855aeb93
JH
3354
3355 if (strEQ(save_messages_locale, save_input_locale)) {
5857e934
KW
3356 Safefree(save_messages_locale);
3357 save_messages_locale = NULL;
855aeb93 3358 }
837ce802 3359 else if (! do_setlocale_c(LC_MESSAGES, save_input_locale)) {
5857e934
KW
3360 DEBUG_L(PerlIO_printf(Perl_debug_log,
3361 "Could not change LC_MESSAGES locale to %s\n",
3362 save_input_locale));
855aeb93
JH
3363 Safefree(save_messages_locale);
3364 goto cant_use_messages;
3365 }
3366 }
3367
3368 /* Here the current LC_MESSAGES is set to the locale of the category
5857e934
KW
3369 * whose information is desired. Look through all the messages. We
3370 * can't use Strerror() here because it may expand to code that
3371 * segfaults in miniperl */
855aeb93 3372
5857e934
KW
3373 for (e = 0; e <= sys_nerr; e++) {
3374 errno = 0;
3375 errmsg = sys_errlist[e];
3376 if (errno || !errmsg) {
855aeb93
JH
3377 break;
3378 }
5857e934 3379 errmsg = savepv(errmsg);
c5f058df 3380 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
5857e934
KW
3381 non_ascii = TRUE;
3382 is_utf8 = is_utf8_string((U8 *) errmsg, 0);
3383 break;
855aeb93
JH
3384 }
3385 }
5857e934 3386 Safefree(errmsg);
855aeb93
JH
3387
3388 /* And, if we changed it, restore LC_MESSAGES to its original locale */
3389 if (save_messages_locale) {
837ce802 3390 do_setlocale_c(LC_MESSAGES, save_messages_locale);
855aeb93
JH
3391 Safefree(save_messages_locale);
3392 }
3393
5857e934
KW
3394 if (non_ascii) {
3395
3396 /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
3397 * any non-ascii means it is one; otherwise we assume it isn't */
3398 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
3399 save_input_locale,
3400 is_utf8));
3401 Safefree(save_input_locale);
3402 return is_utf8;
3403 }
855aeb93 3404
5857e934 3405 DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
855aeb93
JH
3406 }
3407 cant_use_messages:
3408
7d4bcc4a
KW
3409# endif
3410# endif /* the code that is compiled when no nl_langinfo */
0080c90a 3411
7d4bcc4a 3412# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
92c0a900 3413 UTF-8 locale */
7d4bcc4a 3414
97f4de96
KW
3415 /* As a last resort, look at the locale name to see if it matches
3416 * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
3417 * return of setlocale(), is actually defined to be opaque, so we can't
3418 * really rely on the absence of various substrings in the name to indicate
3419 * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
3420 * be a UTF-8 locale. Similarly for the other common names */
3421
3422 final_pos = strlen(save_input_locale) - 1;
3423 if (final_pos >= 3) {
8de4332b 3424 const char *name = save_input_locale;
97f4de96
KW
3425
3426 /* Find next 'U' or 'u' and look from there */
3427 while ((name += strcspn(name, "Uu") + 1)
3428 <= save_input_locale + final_pos - 2)
3429 {
c72493e0 3430 if ( isALPHA_FOLD_NE(*name, 't')
305b8651 3431 || isALPHA_FOLD_NE(*(name + 1), 'f'))
97f4de96
KW
3432 {
3433 continue;
3434 }
3435 name += 2;
3436 if (*(name) == '-') {
3437 if ((name > save_input_locale + final_pos - 1)) {
3438 break;
3439 }
3440 name++;
3441 }
3442 if (*(name) == '8') {
97f4de96
KW
3443 DEBUG_L(PerlIO_printf(Perl_debug_log,
3444 "Locale %s ends with UTF-8 in name\n",
3445 save_input_locale));
00c54b9c 3446 Safefree(save_input_locale);
97f4de96
KW
3447 return TRUE;
3448 }
3449 }
3450 DEBUG_L(PerlIO_printf(Perl_debug_log,
3451 "Locale %s doesn't end with UTF-8 in name\n",
3452 save_input_locale));
3453 }
3454
7d4bcc4a
KW
3455# endif
3456# ifdef WIN32
3457
97f4de96 3458 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
8a0832a1 3459 if (memENDs(save_input_locale, final_pos, "65001")) {
97f4de96 3460 DEBUG_L(PerlIO_printf(Perl_debug_log,
8a0832a1 3461 "Locale %s ends with 65001 in name, is UTF-8 locale\n",
97f4de96
KW
3462 save_input_locale));
3463 Safefree(save_input_locale);
3464 return TRUE;
3465 }
7d4bcc4a
KW
3466
3467# endif
97f4de96
KW
3468
3469 /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
3470 * since we are about to return FALSE anyway, there is no point in doing
3471 * this extra work */
7d4bcc4a
KW
3472
3473# if 0
97f4de96
KW
3474 if (instr(save_input_locale, "8859")) {
3475 DEBUG_L(PerlIO_printf(Perl_debug_log,
3476 "Locale %s has 8859 in name, not UTF-8 locale\n",
3477 save_input_locale));
3478 Safefree(save_input_locale);
3479 return FALSE;
3480 }
7d4bcc4a 3481# endif
97f4de96 3482
69014004
KW
3483 DEBUG_L(PerlIO_printf(Perl_debug_log,
3484 "Assuming locale %s is not a UTF-8 locale\n",
3485 save_input_locale));
fa9b773e 3486 Safefree(save_input_locale);
7d74bb61
KW
3487 return FALSE;
3488}
3489
8ef6e574 3490#endif
7d74bb61 3491
d6ded950
KW
3492
3493bool
3494Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
3495{
1a4f13e1 3496 dVAR;
d6ded950
KW
3497 /* Internal function which returns if we are in the scope of a pragma that
3498 * enables the locale category 'category'. 'compiling' should indicate if
3499 * this is during the compilation phase (TRUE) or not (FALSE). */
3500
3501 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
3502
3503 SV *categories = cop_hints_fetch_pvs(cop, "locale", 0);
3504 if (! categories || categories == &PL_sv_placeholder) {
3505 return FALSE;
3506 }
3507
3508 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
3509 * a valid unsigned */
3510 assert(category >= -1);
3511 return cBOOL(SvUV(categories) & (1U << (category + 1)));
3512}
3513
2c6ee1a7 3514char *
6ebbc862
KW
3515Perl_my_strerror(pTHX_ const int errnum)
3516{
3517 /* Returns a mortalized copy of the text of the error message associated
3518 * with 'errnum'. It uses the current locale's text unless the platform
3519 * doesn't have the LC_MESSAGES category or we are not being called from
3520 * within the scope of 'use locale'. In the former case, it uses whatever
3521 * strerror returns; in the latter case it uses the text from the C locale.
3522 *
3523 * The function just calls strerror(), but temporarily switches, if needed,
3524 * to the C locale */
3525
3526 char *errstr;
52770946 3527 dVAR;
6ebbc862 3528
52770946 3529#ifndef USE_LOCALE_MESSAGES
6ebbc862 3530
52770946
KW
3531 /* If platform doesn't have messages category, we don't do any switching to
3532 * the C locale; we just use whatever strerror() returns */
3533
3534 errstr = savepv(Strerror(errnum));
3535
3536#else /* Has locale messages */
3537
3538 const bool within_locale_scope = IN_LC(LC_MESSAGES);
2c6ee1a7 3539
7aaa36b1
KW
3540# if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
3541
43cb6651
KW
3542 /* This function is trivial if we don't have to worry about thread safety
3543 * and have strerror_l(), as it handles the switch of locales so we don't
3544 * have to deal with that. We don't have to worry about thread safety if
3545 * this is an unthreaded build, or if strerror_r() is also available. Both
3546 * it and strerror_l() are thread-safe. Plain strerror() isn't thread
3547 * safe. But on threaded builds when strerror_r() is available, the
3548 * apparent call to strerror() below is actually a macro that
3549 * behind-the-scenes calls strerror_r().
3550 */
3551
3552# if ! defined(USE_ITHREADS) || defined(HAS_STRERROR_R)
7aaa36b1
KW
3553
3554 if (within_locale_scope) {
4eb27fc5 3555 errstr = savepv(strerror(errnum));
7aaa36b1
KW
3556 }
3557 else {
4eb27fc5 3558 errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
7aaa36b1
KW
3559 }
3560
43cb6651
KW
3561# else
3562
3563 /* Here we have strerror_l(), but not strerror_r() and we are on a
3564 * threaded-build. We use strerror_l() for everything, constructing a
3565 * locale to pass to it if necessary */
3566
3567 bool do_free = FALSE;
3568 locale_t locale_to_use;
3569
3570 if (within_locale_scope) {
3571 locale_to_use = uselocale((locale_t) 0);
3572 if (locale_to_use == LC_GLOBAL_LOCALE) {
3573 locale_to_use = duplocale(LC_GLOBAL_LOCALE);
3574 do_free = TRUE;
3575 }
3576 }
3577 else { /* Use C locale if not within 'use locale' scope */
3578 locale_to_use = PL_C_locale_obj;
3579 }
3580
3581 errstr = savepv(strerror_l(errnum, locale_to_use));
3582
3583 if (do_free) {
3584 freelocale(locale_to_use);
3585 }
3586
3587# endif
3588# else /* Doesn't have strerror_l() */
7aaa36b1
KW
3589
3590# ifdef USE_POSIX_2008_LOCALE
3591
fcd0e682 3592 locale_t save_locale = NULL;
7aaa36b1
KW
3593
3594# else
3595
8de4332b 3596 const char * save_locale = NULL;
c9dda6da 3597 bool locale_is_C = FALSE;
2c6ee1a7 3598
6ebbc862
KW
3599 /* We have a critical section to prevent another thread from changing the
3600 * locale out from under us (or zapping the buffer returned from
3601 * setlocale() ) */
3602 LOCALE_LOCK;
3603
7aaa36b1 3604# endif
6ebbc862 3605
9c8a6dc2
KW
3606 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3607 "my_strerror called with errnum %d\n", errnum));
6ebbc862 3608 if (! within_locale_scope) {
c9dda6da 3609 errno = 0;
a0b53297 3610
f1d2176b 3611# ifdef USE_POSIX_2008_LOCALE /* Use the thread-safe locale functions */
6ebbc862 3612
9c8a6dc2
KW
3613 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3614 "Not within locale scope, about to call"
3615 " uselocale(0x%p)\n", PL_C_locale_obj));
6ebbc862 3616 save_locale = uselocale(PL_C_locale_obj);
c9dda6da
KW
3617 if (! save_locale) {
3618 DEBUG_L(PerlIO_printf(Perl_debug_log,
9c8a6dc2
KW
3619 "uselocale failed, errno=%d\n", errno));
3620 }
3621 else {
3622 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3623 "uselocale returned 0x%p\n", save_locale));
c9dda6da 3624 }
6ebbc862 3625
7aaa36b1 3626# else /* Not thread-safe build */
a0b53297 3627
837ce802 3628 save_locale = do_setlocale_c(LC_MESSAGES, NULL);
c9dda6da
KW
3629 if (! save_locale) {
3630 DEBUG_L(PerlIO_printf(Perl_debug_log,
3631 "setlocale failed, errno=%d\n", errno));
3632 }
3633 else {
3634 locale_is_C = isNAME_C_OR_POSIX(save_locale);
2c6ee1a7 3635
c9dda6da
KW
3636 /* Switch to the C locale if not already in it */
3637 if (! locale_is_C) {
2c6ee1a7 3638
c9dda6da
KW
3639 /* The setlocale() just below likely will zap 'save_locale', so
3640 * create a copy. */
3641 save_locale = savepv(save_locale);
837ce802 3642 do_setlocale_c(LC_MESSAGES, "C");
c9dda6da 3643 }
6ebbc862 3644 }
2c6ee1a7 3645
7aaa36b1 3646# endif
2c6ee1a7 3647
6ebbc862 3648 } /* end of ! within_locale_scope */
9c8a6dc2
KW
3649 else {
3650 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n",
3651 __FILE__, __LINE__));
3652 }
a0b53297 3653
9c8a6dc2
KW
3654 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3655 "Any locale change has been done; about to call Strerror\n"));
52770946 3656 errstr = savepv(Strerror(errnum));
6ebbc862
KW
3657
3658 if (! within_locale_scope) {
c9dda6da 3659 errno = 0;
a0b53297 3660
f1d2176b 3661# ifdef USE_POSIX_2008_LOCALE
6ebbc862 3662
9c8a6dc2
KW
3663 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3664 "%s: %d: not within locale scope, restoring the locale\n",
3665 __FILE__, __LINE__));
c9dda6da
KW
3666 if (save_locale && ! uselocale(save_locale)) {
3667 DEBUG_L(PerlIO_printf(Perl_debug_log,
3668 "uselocale restore failed, errno=%d\n", errno));
3669 }
2c6ee1a7 3670 }
6ebbc862 3671
7aaa36b1 3672# else
6ebbc862 3673
c9dda6da 3674 if (save_locale && ! locale_is_C) {
837ce802 3675 if (! do_setlocale_c(LC_MESSAGES, save_locale)) {
c9dda6da
KW
3676 DEBUG_L(PerlIO_printf(Perl_debug_log,
3677 "setlocale restore failed, errno=%d\n", errno));
3678 }
6ebbc862
KW
3679 Safefree(save_locale);
3680 }
3681 }
3682
3683 LOCALE_UNLOCK;
3684
7aaa36b1
KW
3685# endif
3686# endif /* End of doesn't have strerror_l */
52770946 3687#endif /* End of does have locale messages */
6affbbf0
KW
3688
3689#ifdef DEBUGGING
3690
3691 if (DEBUG_Lv_TEST) {
3692 PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '");
3693 print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);
3694 PerlIO_printf(Perl_debug_log, "'\n");
3695 }
3696
2c6ee1a7
KW
3697#endif
3698
52770946 3699 SAVEFREEPV(errstr);
6ebbc862 3700 return errstr;
2c6ee1a7
KW
3701}
3702
66610fdd 3703/*
747c467a 3704
747c467a
KW
3705=for apidoc sync_locale
3706
3707Changing the program's locale should be avoided by XS code. Nevertheless,
3708certain non-Perl libraries called from XS, such as C<Gtk> do so. When this
3709happens, Perl needs to be told that the locale has changed. Use this function
3710to do so, before returning to Perl.
3711
3712=cut
3713*/
3714
3715void
3716Perl_sync_locale(pTHX)
3717{
9f82ea3e 3718 char * newlocale;
747c467a
KW
3719
3720#ifdef USE_LOCALE_CTYPE
7d4bcc4a 3721
9f82ea3e
KW
3722 newlocale = do_setlocale_c(LC_CTYPE, NULL);
3723 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3724 "%s:%d: %s\n", __FILE__, __LINE__,
3725 setlocale_debug_string(LC_CTYPE, NULL, newlocale)));
3726 new_ctype(newlocale);
747c467a 3727
7d4bcc4a 3728#endif /* USE_LOCALE_CTYPE */
747c467a 3729#ifdef USE_LOCALE_COLLATE
7d4bcc4a 3730
9f82ea3e
KW
3731 newlocale = do_setlocale_c(LC_COLLATE, NULL);
3732 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3733 "%s:%d: %s\n", __FILE__, __LINE__,
3734 setlocale_debug_string(LC_COLLATE, NULL, newlocale)));
3735 new_collate(newlocale);
747c467a 3736
7d4bcc4a 3737#endif
747c467a 3738#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 3739
9f82ea3e
KW
3740 newlocale = do_setlocale_c(LC_NUMERIC, NULL);
3741 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3742 "%s:%d: %s\n", __FILE__, __LINE__,
3743 setlocale_debug_string(LC_NUMERIC, NULL, newlocale)));
3744 new_numeric(newlocale);
7d4bcc4a 3745
747c467a
KW
3746#endif /* USE_LOCALE_NUMERIC */
3747
3748}
3749
5d1187d1
KW
3750#if defined(DEBUGGING) && defined(USE_LOCALE)
3751