This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
locale.c: White-space only
[perl5.git] / locale.c
CommitLineData
98994639
HS
1/* locale.c
2 *
1129b882
NC
3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
98994639
HS
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550 12 * A Elbereth Gilthoniel,
cdad3b53 13 * silivren penna míriel
4ac71550 14 * o menel aglar elenath!
cdad3b53 15 * Na-chaered palan-díriel
4ac71550
TC
16 * o galadhremmin ennorath,
17 * Fanuilos, le linnathon
18 * nef aear, si nef aearon!
19 *
20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
98994639
HS
21 */
22
166f8a29
DM
23/* utility functions for handling locale-specific stuff like what
24 * character represents the decimal point.
0d071d52 25 *
7d4bcc4a
KW
26 * All C programs have an underlying locale. Perl code generally doesn't pay
27 * any attention to it except within the scope of a 'use locale'. For most
0d071d52
KW
28 * categories, it accomplishes this by just using different operations if it is
29 * in such scope than if not. However, various libc functions called by Perl
30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31 * are used to toggle between the current locale and the C locale depending on
a9ad02a8
KW
32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is
33 * switched to the C locale for outputting the message unless within the scope
34 * of 'use locale'.
166f8a29
DM
35 */
36
98994639
HS
37#include "EXTERN.h"
38#define PERL_IN_LOCALE_C
f7416781 39#include "perl_langinfo.h"
98994639
HS
40#include "perl.h"
41
a4af207c
JH
42#include "reentr.h"
43
0dec74cd
KW
44#ifdef I_WCHAR
45# include <wchar.h>
46#endif
47
2fcc0ca9
KW
48/* If the environment says to, we can output debugging information during
49 * initialization. This is done before option parsing, and before any thread
50 * creation, so can be a file-level static */
5a4b0634
KW
51#if ! defined(DEBUGGING) || defined(PERL_GLOBAL_STRUCT)
52# define debug_initialization 0
53# define DEBUG_INITIALIZATION_set(v)
54#else
2fcc0ca9 55static bool debug_initialization = FALSE;
5a4b0634 56# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
2fcc0ca9
KW
57#endif
58
291a84fb
KW
59/* strlen() of a literal string constant. We might want this more general,
60 * but using it in just this file for now. A problem with more generality is
61 * the compiler warnings about comparing unlike signs */
ff1b739b
KW
62#define STRLENs(s) (sizeof("" s "") - 1)
63
63e5b0d7
KW
64/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
65 * return of setlocale(), then this is extremely likely to be the C or POSIX
66 * locale. However, the output of setlocale() is documented to be opaque, but
67 * the odds are extremely small that it would return these two strings for some
68 * other locale. Note that VMS in these two locales includes many non-ASCII
69 * characters as controls and punctuation (below are hex bytes):
70 * cntrl: 84-97 9B-9F
71 * punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
72 * Oddly, none there are listed as alphas, though some represent alphabetics
73 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
74#define isNAME_C_OR_POSIX(name) \
75 ( (name) != NULL \
76 && (( *(name) == 'C' && (*(name + 1)) == '\0') \
77 || strEQ((name), "POSIX")))
78
8ef6e574
KW
79#ifdef USE_LOCALE
80
47280b20
KW
81/* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far
82 * looked up. This is in the form of a C string: */
83
84#define UTF8NESS_SEP "\v"
85#define UTF8NESS_PREFIX "\f"
86
87/* So, the string looks like:
98994639 88 *
47280b20 89 * \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0
98994639 90 *
47280b20
KW
91 * where the digit 0 after the \a indicates that the locale starting just
92 * after the preceding \v is not UTF-8, and the digit 1 mean it is. */
93
94STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1);
95STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1);
96
97#define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \
98 UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0"
99
100/* The cache is initialized to C_and_POSIX_utf8ness at start up. These are
101 * kept there always. The remining portion of the cache is LRU, with the
102 * oldest looked-up locale at the tail end */
103
98994639
HS
104STATIC char *
105S_stdize_locale(pTHX_ char *locs)
106{
47280b20
KW
107 /* Standardize the locale name from a string returned by 'setlocale',
108 * possibly modifying that string.
109 *
110 * The typical return value of setlocale() is either
111 * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
112 * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
113 * (the space-separated values represent the various sublocales,
114 * in some unspecified order). This is not handled by this function.
115 *
116 * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
117 * which is harmful for further use of the string in setlocale(). This
118 * function removes the trailing new line and everything up through the '='
119 * */
120
7452cf6a 121 const char * const s = strchr(locs, '=');
98994639
HS
122 bool okay = TRUE;
123
7918f24d
NC
124 PERL_ARGS_ASSERT_STDIZE_LOCALE;
125
8772537c
AL
126 if (s) {
127 const char * const t = strchr(s, '.');
98994639 128 okay = FALSE;
8772537c
AL
129 if (t) {
130 const char * const u = strchr(t, '\n');
131 if (u && (u[1] == 0)) {
132 const STRLEN len = u - s;
133 Move(s + 1, locs, len, char);
134 locs[len] = 0;
135 okay = TRUE;
98994639
HS
136 }
137 }
138 }
139
140 if (!okay)
141 Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
142
143 return locs;
144}
145
e5f10d49
KW
146/* Two parallel arrays; first the locale categories Perl uses on this system;
147 * the second array is their names. These arrays are in mostly arbitrary
148 * order. */
149
150const int categories[] = {
151
152# ifdef USE_LOCALE_NUMERIC
153 LC_NUMERIC,
154# endif
155# ifdef USE_LOCALE_CTYPE
156 LC_CTYPE,
157# endif
158# ifdef USE_LOCALE_COLLATE
159 LC_COLLATE,
160# endif
161# ifdef USE_LOCALE_TIME
162 LC_TIME,
163# endif
164# ifdef USE_LOCALE_MESSAGES
165 LC_MESSAGES,
166# endif
167# ifdef USE_LOCALE_MONETARY
168 LC_MONETARY,
169# endif
9821811f
KW
170# ifdef USE_LOCALE_ADDRESS
171 LC_ADDRESS,
172# endif
173# ifdef USE_LOCALE_IDENTIFICATION
174 LC_IDENTIFICATION,
175# endif
176# ifdef USE_LOCALE_MEASUREMENT
177 LC_MEASUREMENT,
178# endif
179# ifdef USE_LOCALE_PAPER
180 LC_PAPER,
181# endif
182# ifdef USE_LOCALE_TELEPHONE
183 LC_TELEPHONE,
184# endif
e5f10d49
KW
185# ifdef LC_ALL
186 LC_ALL,
187# endif
188 -1 /* Placeholder because C doesn't allow a
189 trailing comma, and it would get complicated
190 with all the #ifdef's */
191};
192
193/* The top-most real element is LC_ALL */
194
195const char * category_names[] = {
196
197# ifdef USE_LOCALE_NUMERIC
198 "LC_NUMERIC",
199# endif
200# ifdef USE_LOCALE_CTYPE
201 "LC_CTYPE",
202# endif
203# ifdef USE_LOCALE_COLLATE
204 "LC_COLLATE",
205# endif
206# ifdef USE_LOCALE_TIME
207 "LC_TIME",
208# endif
209# ifdef USE_LOCALE_MESSAGES
210 "LC_MESSAGES",
211# endif
212# ifdef USE_LOCALE_MONETARY
213 "LC_MONETARY",
214# endif
9821811f
KW
215# ifdef USE_LOCALE_ADDRESS
216 "LC_ADDRESS",
217# endif
218# ifdef USE_LOCALE_IDENTIFICATION
219 "LC_IDENTIFICATION",
220# endif
221# ifdef USE_LOCALE_MEASUREMENT
222 "LC_MEASUREMENT",
223# endif
224# ifdef USE_LOCALE_PAPER
225 "LC_PAPER",
226# endif
227# ifdef USE_LOCALE_TELEPHONE
228 "LC_TELEPHONE",
229# endif
e5f10d49
KW
230# ifdef LC_ALL
231 "LC_ALL",
232# endif
233 NULL /* Placeholder */
234 };
235
236# ifdef LC_ALL
237
238 /* On systems with LC_ALL, it is kept in the highest index position. (-2
239 * to account for the final unused placeholder element.) */
240# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
241
242# else
243
244 /* On systems without LC_ALL, we pretend it is there, one beyond the real
245 * top element, hence in the unused placeholder element. */
246# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
247
248# endif
249
250/* Pretending there is an LC_ALL element just above allows us to avoid most
251 * special cases. Most loops through these arrays in the code below are
252 * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'. They will work
253 * on either type of system. But the code must be written to not access the
948523db
KW
254 * element at 'LC_ALL_INDEX' except on platforms that have it. This can be
255 * checked for at compile time by using the #define LC_ALL_INDEX which is only
256 * defined if we do have LC_ALL. */
e5f10d49 257
b09aaf40
KW
258STATIC const char *
259S_category_name(const int category)
260{
261 unsigned int i;
262
263#ifdef LC_ALL
264
265 if (category == LC_ALL) {
266 return "LC_ALL";
267 }
268
269#endif
270
271 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
272 if (category == categories[i]) {
273 return category_names[i];
274 }
275 }
276
277 {
278 const char suffix[] = " (unknown)";
279 int temp = category;
280 Size_t length = sizeof(suffix) + 1;
281 char * unknown;
282 dTHX;
283
284 if (temp < 0) {
285 length++;
286 temp = - temp;
287 }
288
289 /* Calculate the number of digits */
290 while (temp >= 10) {
291 temp /= 10;
292 length++;
293 }
294
295 Newx(unknown, length, char);
296 my_snprintf(unknown, length, "%d%s", category, suffix);
297 SAVEFREEPV(unknown);
298 return unknown;
299 }
300}
301
948523db
KW
302/* Now create LC_foo_INDEX #defines for just those categories on this system */
303# ifdef USE_LOCALE_NUMERIC
304# define LC_NUMERIC_INDEX 0
305# define _DUMMY_NUMERIC LC_NUMERIC_INDEX
306# else
307# define _DUMMY_NUMERIC -1
308# endif
309# ifdef USE_LOCALE_CTYPE
310# define LC_CTYPE_INDEX _DUMMY_NUMERIC + 1
311# define _DUMMY_CTYPE LC_CTYPE_INDEX
312# else
313# define _DUMMY_CTYPE _DUMMY_NUMERIC
314# endif
315# ifdef USE_LOCALE_COLLATE
316# define LC_COLLATE_INDEX _DUMMY_CTYPE + 1
317# define _DUMMY_COLLATE LC_COLLATE_INDEX
318# else
319# define _DUMMY_COLLATE _DUMMY_COLLATE
320# endif
321# ifdef USE_LOCALE_TIME
322# define LC_TIME_INDEX _DUMMY_COLLATE + 1
323# define _DUMMY_TIME LC_TIME_INDEX
324# else
325# define _DUMMY_TIME _DUMMY_COLLATE
326# endif
327# ifdef USE_LOCALE_MESSAGES
328# define LC_MESSAGES_INDEX _DUMMY_TIME + 1
329# define _DUMMY_MESSAGES LC_MESSAGES_INDEX
330# else
331# define _DUMMY_MESSAGES _DUMMY_TIME
332# endif
333# ifdef USE_LOCALE_MONETARY
334# define LC_MONETARY_INDEX _DUMMY_MESSAGES + 1
335# define _DUMMY_MONETARY LC_MONETARY_INDEX
336# else
337# define _DUMMY_MONETARY _DUMMY_MESSAGES
338# endif
9821811f
KW
339# ifdef USE_LOCALE_ADDRESS
340# define LC_ADDRESS_INDEX _DUMMY_MONETARY + 1
341# define _DUMMY_ADDRESS LC_ADDRESS_INDEX
342# else
343# define _DUMMY_ADDRESS _DUMMY_MONETARY
344# endif
345# ifdef USE_LOCALE_IDENTIFICATION
346# define LC_IDENTIFICATION_INDEX _DUMMY_ADDRESS + 1
347# define _DUMMY_IDENTIFICATION LC_IDENTIFICATION_INDEX
348# else
349# define _DUMMY_IDENTIFICATION _DUMMY_ADDRESS
350# endif
351# ifdef USE_LOCALE_MEASUREMENT
352# define LC_MEASUREMENT_INDEX _DUMMY_IDENTIFICATION + 1
353# define _DUMMY_MEASUREMENT LC_MEASUREMENT_INDEX
354# else
355# define _DUMMY_MEASUREMENT _DUMMY_IDENTIFICATION
356# endif
357# ifdef USE_LOCALE_PAPER
358# define LC_PAPER_INDEX _DUMMY_MEASUREMENT + 1
359# define _DUMMY_PAPER LC_PAPER_INDEX
360# else
361# define _DUMMY_PAPER _DUMMY_MEASUREMENT
362# endif
363# ifdef USE_LOCALE_TELEPHONE
364# define LC_TELEPHONE_INDEX _DUMMY_PAPER + 1
365# define _DUMMY_TELEPHONE LC_TELEPHONE_INDEX
366# else
367# define _DUMMY_TELEPHONE _DUMMY_PAPER
368# endif
948523db 369# ifdef LC_ALL
9821811f 370# define LC_ALL_INDEX _DUMMY_TELEPHONE + 1
948523db
KW
371# endif
372#endif /* ifdef USE_LOCALE */
8ef6e574 373
d2b24094
KW
374/* Windows requres a customized base-level setlocale() */
375# ifdef WIN32
376# define my_setlocale(cat, locale) win32_setlocale(cat, locale)
377# else
378# define my_setlocale(cat, locale) setlocale(cat, locale)
379# endif
380
837ce802
KW
381/* Just placeholders for now. "_c" is intended to be called when the category
382 * is a constant known at compile time; "_r", not known until run time */
d2b24094
KW
383# define do_setlocale_c(category, locale) my_setlocale(category, locale)
384# define do_setlocale_r(category, locale) my_setlocale(category, locale)
837ce802 385
a4f00dcc 386STATIC void
86799d2d 387S_set_numeric_radix(pTHX_ const bool use_locale)
98994639 388{
86799d2d
KW
389 /* If 'use_locale' is FALSE, set to use a dot for the radix character. If
390 * TRUE, use the radix character derived from the current locale */
7d4bcc4a 391
86799d2d
KW
392#if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \
393 || defined(HAS_NL_LANGINFO))
98994639 394
87f8e8e7
KW
395 const char * radix = (use_locale)
396 ? my_nl_langinfo(PERL_RADIXCHAR, FALSE)
397 /* FALSE => already in dest locale */
398 : ".";
2213a3be 399
87f8e8e7 400 sv_setpv(PL_numeric_radix_sv, radix);
86799d2d 401
87f8e8e7
KW
402 /* If this is valid UTF-8 that isn't totally ASCII, and we are in
403 * a UTF-8 locale, then mark the radix as being in UTF-8 */
404 if (is_utf8_non_invariant_string((U8 *) SvPVX(PL_numeric_radix_sv),
7a393424 405 SvCUR(PL_numeric_radix_sv))
87f8e8e7
KW
406 && _is_cur_LC_category_utf8(LC_NUMERIC))
407 {
408 SvUTF8_on(PL_numeric_radix_sv);
409 }
86799d2d
KW
410
411# ifdef DEBUGGING
7d4bcc4a 412
2fcc0ca9
KW
413 if (DEBUG_L_TEST || debug_initialization) {
414 PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
3ca88433
KW
415 SvPVX(PL_numeric_radix_sv),
416 cBOOL(SvUTF8(PL_numeric_radix_sv)));
2fcc0ca9 417 }
69014004 418
86799d2d
KW
419# endif
420#endif /* USE_LOCALE_NUMERIC and can find the radix char */
7d4bcc4a 421
98994639
HS
422}
423
a39edc4c 424
98994639 425void
8772537c 426Perl_new_numeric(pTHX_ const char *newnum)
98994639 427{
7d4bcc4a
KW
428
429#ifndef USE_LOCALE_NUMERIC
430
431 PERL_UNUSED_ARG(newnum);
432
433#else
0d071d52 434
291a84fb 435 /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell
0d071d52
KW
436 * core Perl this and that 'newnum' is the name of the new locale.
437 * It installs this locale as the current underlying default.
438 *
439 * The default locale and the C locale can be toggled between by use of the
5792c642
KW
440 * set_numeric_underlying() and set_numeric_standard() functions, which
441 * should probably not be called directly, but only via macros like
0d071d52
KW
442 * SET_NUMERIC_STANDARD() in perl.h.
443 *
444 * The toggling is necessary mainly so that a non-dot radix decimal point
445 * character can be output, while allowing internal calculations to use a
446 * dot.
447 *
448 * This sets several interpreter-level variables:
bb304765 449 * PL_numeric_name The underlying locale's name: a copy of 'newnum'
892e6465 450 * PL_numeric_underlying A boolean indicating if the toggled state is such
7738054c
KW
451 * that the current locale is the program's underlying
452 * locale
453 * PL_numeric_standard An int indicating if the toggled state is such
4c68b815
KW
454 * that the current locale is the C locale or
455 * indistinguishable from the C locale. If non-zero, it
456 * is in C; if > 1, it means it may not be toggled away
7738054c 457 * from C.
4c68b815
KW
458 * PL_numeric_underlying_is_standard A bool kept by this function
459 * indicating that the underlying locale and the standard
460 * C locale are indistinguishable for the purposes of
461 * LC_NUMERIC. This happens when both of the above two
462 * variables are true at the same time. (Toggling is a
463 * no-op under these circumstances.) This variable is
464 * used to avoid having to recalculate.
0d071d52
KW
465 * Any code changing the locale (outside this file) should use
466 * POSIX::setlocale, which calls this function. Therefore this function
467 * should be called directly only from this file and from
468 * POSIX::setlocale() */
469
b03f34cf 470 char *save_newnum;
98994639
HS
471
472 if (! newnum) {
43c5f42d
NC
473 Safefree(PL_numeric_name);
474 PL_numeric_name = NULL;
98994639 475 PL_numeric_standard = TRUE;
892e6465 476 PL_numeric_underlying = TRUE;
4c68b815 477 PL_numeric_underlying_is_standard = TRUE;
98994639
HS
478 return;
479 }
480
b03f34cf 481 save_newnum = stdize_locale(savepv(newnum));
892e6465 482 PL_numeric_underlying = TRUE;
4c68b815
KW
483 PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
484
485 /* If its name isn't C nor POSIX, it could still be indistinguishable from
486 * them */
487 if (! PL_numeric_standard) {
488 PL_numeric_standard = cBOOL(strEQ(".", my_nl_langinfo(PERL_RADIXCHAR,
489 FALSE /* Don't toggle locale */ ))
490 && strEQ("", my_nl_langinfo(PERL_THOUSEP,
491 FALSE)));
492 }
abe1abcf 493
4c68b815 494 /* Save the new name if it isn't the same as the previous one, if any */
b03f34cf 495 if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
98994639 496 Safefree(PL_numeric_name);
b03f34cf 497 PL_numeric_name = save_newnum;
b03f34cf 498 }
abe1abcf
KW
499 else {
500 Safefree(save_newnum);
501 }
4c28b29c 502
4c68b815
KW
503 PL_numeric_underlying_is_standard = PL_numeric_standard;
504
e1aa2579
KW
505# if defined(HAS_NEWLOCALE) && ! defined(NO_POSIX_2008_LOCALE)
506
507 PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
508 PL_numeric_name,
509 PL_underlying_numeric_obj);
510
511#endif
512
4c68b815
KW
513 if (DEBUG_L_TEST || debug_initialization) {
514 PerlIO_printf(Perl_debug_log, "Called new_numeric with %s, PL_numeric_name=%s\n", newnum, PL_numeric_name);
515 }
516
4c28b29c
KW
517 /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't
518 * have to worry about the radix being a non-dot. (Core operations that
519 * need the underlying locale change to it temporarily). */
520 set_numeric_standard();
521
98994639 522#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 523
98994639
HS
524}
525
526void
527Perl_set_numeric_standard(pTHX)
528{
7d4bcc4a 529
98994639 530#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 531
28c1bf33
KW
532 /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like
533 * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The
534 * macro avoids calling this routine if toggling isn't necessary according
535 * to our records (which could be wrong if some XS code has changed the
536 * locale behind our back) */
0d071d52 537
837ce802 538 do_setlocale_c(LC_NUMERIC, "C");
a9b8c0d8 539 PL_numeric_standard = TRUE;
4c68b815 540 PL_numeric_underlying = PL_numeric_underlying_is_standard;
86799d2d 541 set_numeric_radix(0);
7d4bcc4a
KW
542
543# ifdef DEBUGGING
544
2fcc0ca9
KW
545 if (DEBUG_L_TEST || debug_initialization) {
546 PerlIO_printf(Perl_debug_log,
58e4a467 547 "LC_NUMERIC locale now is standard C\n");
2fcc0ca9 548 }
98994639 549
7d4bcc4a 550# endif
98994639 551#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 552
98994639
HS
553}
554
555void
5792c642 556Perl_set_numeric_underlying(pTHX)
98994639 557{
7d4bcc4a 558
98994639 559#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 560
28c1bf33 561 /* Toggle the LC_NUMERIC locale to the current underlying default. Most
7d4bcc4a
KW
562 * code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h
563 * instead of calling this directly. The macro avoids calling this routine
564 * if toggling isn't necessary according to our records (which could be
565 * wrong if some XS code has changed the locale behind our back) */
a9b8c0d8 566
837ce802 567 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
4c68b815 568 PL_numeric_standard = PL_numeric_underlying_is_standard;
892e6465 569 PL_numeric_underlying = TRUE;
3c62fd46 570 set_numeric_radix(! PL_numeric_standard);
7d4bcc4a
KW
571
572# ifdef DEBUGGING
573
2fcc0ca9
KW
574 if (DEBUG_L_TEST || debug_initialization) {
575 PerlIO_printf(Perl_debug_log,
58e4a467 576 "LC_NUMERIC locale now is %s\n",
2fcc0ca9
KW
577 PL_numeric_name);
578 }
98994639 579
7d4bcc4a 580# endif
98994639 581#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 582
98994639
HS
583}
584
585/*
586 * Set up for a new ctype locale.
587 */
a4f00dcc
KW
588STATIC void
589S_new_ctype(pTHX_ const char *newctype)
98994639 590{
7d4bcc4a
KW
591
592#ifndef USE_LOCALE_CTYPE
593
594 PERL_ARGS_ASSERT_NEW_CTYPE;
595 PERL_UNUSED_ARG(newctype);
596 PERL_UNUSED_CONTEXT;
597
598#else
0d071d52 599
291a84fb 600 /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
0d071d52
KW
601 * core Perl this and that 'newctype' is the name of the new locale.
602 *
603 * This function sets up the folding arrays for all 256 bytes, assuming
604 * that tofold() is tolc() since fold case is not a concept in POSIX,
605 *
606 * Any code changing the locale (outside this file) should use
607 * POSIX::setlocale, which calls this function. Therefore this function
608 * should be called directly only from this file and from
609 * POSIX::setlocale() */
610
27da23d5 611 dVAR;
68067e4e 612 UV i;
98994639 613
7918f24d
NC
614 PERL_ARGS_ASSERT_NEW_CTYPE;
615
215c5139
KW
616 /* We will replace any bad locale warning with 1) nothing if the new one is
617 * ok; or 2) a new warning for the bad new locale */
618 if (PL_warn_locale) {
619 SvREFCNT_dec_NN(PL_warn_locale);
620 PL_warn_locale = NULL;
621 }
622
c1284011 623 PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
31f05a37
KW
624
625 /* A UTF-8 locale gets standard rules. But note that code still has to
626 * handle this specially because of the three problematic code points */
627 if (PL_in_utf8_CTYPE_locale) {
628 Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
629 }
630 else {
8c6180a9
KW
631 /* Assume enough space for every character being bad. 4 spaces each
632 * for the 94 printable characters that are output like "'x' "; and 5
633 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
634 * NUL */
635 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ];
636
cc9eaeb0
KW
637 /* Don't check for problems if we are suppressing the warnings */
638 bool check_for_problems = ckWARN_d(WARN_LOCALE)
639 || UNLIKELY(DEBUG_L_TEST);
8c6180a9
KW
640 bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
641 to start */
642 unsigned int bad_count = 0; /* Count of bad characters */
643
baa60164 644 for (i = 0; i < 256; i++) {
5a8a8f21
KW
645 if (isupper(i))
646 PL_fold_locale[i] = (U8) tolower(i);
647 else if (islower(i))
648 PL_fold_locale[i] = (U8) toupper(i);
baa60164
KW
649 else
650 PL_fold_locale[i] = (U8) i;
8c6180a9
KW
651
652 /* If checking for locale problems, see if the native ASCII-range
653 * printables plus \n and \t are in their expected categories in
654 * the new locale. If not, this could mean big trouble, upending
655 * Perl's and most programs' assumptions, like having a
656 * metacharacter with special meaning become a \w. Fortunately,
657 * it's very rare to find locales that aren't supersets of ASCII
658 * nowadays. It isn't a problem for most controls to be changed
659 * into something else; we check only \n and \t, though perhaps \r
660 * could be an issue as well. */
7d4bcc4a 661 if ( check_for_problems
8c6180a9
KW
662 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
663 {
adf947ba
KW
664 if ( cBOOL(isalnum(i)) != cBOOL(isALPHANUMERIC(i))
665 || cBOOL(isalpha(i)) != cBOOL(isALPHA_A(i))
666 || cBOOL(isdigit(i)) != cBOOL(isDIGIT_A(i))
667 || cBOOL(isgraph(i)) != cBOOL(isGRAPH_A(i))
668 || cBOOL(islower(i)) != cBOOL(isLOWER_A(i))
669 || cBOOL(isprint(i)) != cBOOL(isPRINT_A(i))
670 || cBOOL(ispunct(i)) != cBOOL(isPUNCT_A(i))
671 || cBOOL(isspace(i)) != cBOOL(isSPACE_A(i))
672 || cBOOL(isupper(i)) != cBOOL(isUPPER_A(i))
673 || cBOOL(isxdigit(i))!= cBOOL(isXDIGIT_A(i))
674 || tolower(i) != (int) toLOWER_A(i)
675 || toupper(i) != (int) toUPPER_A(i)
676 || (i == '\n' && ! isCNTRL_LC(i)))
8c6180a9
KW
677 {
678 if (bad_count) { /* Separate multiple entries with a
679 blank */
680 bad_chars_list[bad_count++] = ' ';
681 }
682 bad_chars_list[bad_count++] = '\'';
683 if (isPRINT_A(i)) {
684 bad_chars_list[bad_count++] = (char) i;
685 }
686 else {
687 bad_chars_list[bad_count++] = '\\';
688 if (i == '\n') {
689 bad_chars_list[bad_count++] = 'n';
690 }
691 else {
692 assert(i == '\t');
693 bad_chars_list[bad_count++] = 't';
694 }
695 }
696 bad_chars_list[bad_count++] = '\'';
697 bad_chars_list[bad_count] = '\0';
698 }
699 }
700 }
701
7d4bcc4a
KW
702# ifdef MB_CUR_MAX
703
8c6180a9 704 /* We only handle single-byte locales (outside of UTF-8 ones; so if
d35fca5f 705 * this locale requires more than one byte, there are going to be
8c6180a9 706 * problems. */
9c8a6dc2
KW
707 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
708 "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
709 __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
710
ba1a4362
KW
711 if (check_for_problems && MB_CUR_MAX > 1
712
713 /* Some platforms return MB_CUR_MAX > 1 for even the "C"
714 * locale. Just assume that the implementation for them (plus
715 * for POSIX) is correct and the > 1 value is spurious. (Since
716 * these are specially handled to never be considered UTF-8
717 * locales, as long as this is the only problem, everything
718 * should work fine */
719 && strNE(newctype, "C") && strNE(newctype, "POSIX"))
720 {
8c6180a9
KW
721 multi_byte_locale = TRUE;
722 }
7d4bcc4a
KW
723
724# endif
8c6180a9
KW
725
726 if (bad_count || multi_byte_locale) {
780fcc9f 727 PL_warn_locale = Perl_newSVpvf(aTHX_
8c6180a9 728 "Locale '%s' may not work well.%s%s%s\n",
780fcc9f 729 newctype,
8c6180a9
KW
730 (multi_byte_locale)
731 ? " Some characters in it are not recognized by"
732 " Perl."
733 : "",
734 (bad_count)
735 ? "\nThe following characters (and maybe others)"
736 " may not have the same meaning as the Perl"
737 " program expects:\n"
738 : "",
739 (bad_count)
740 ? bad_chars_list
741 : ""
742 );
cc9eaeb0 743 /* If we are actually in the scope of the locale or are debugging,
bddebb56
KW
744 * output the message now. If not in that scope, we save the
745 * message to be output at the first operation using this locale,
746 * if that actually happens. Most programs don't use locales, so
747 * they are immune to bad ones. */
cc9eaeb0 748 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
780fcc9f
KW
749
750 /* We have to save 'newctype' because the setlocale() just
751 * below may destroy it. The next setlocale() further down
752 * should restore it properly so that the intermediate change
753 * here is transparent to this function's caller */
754 const char * const badlocale = savepv(newctype);
755
837ce802 756 do_setlocale_c(LC_CTYPE, "C");
780fcc9f
KW
757
758 /* The '0' below suppresses a bogus gcc compiler warning */
759 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
bddebb56 760
837ce802 761 do_setlocale_c(LC_CTYPE, badlocale);
c0f3a893 762 Safefree(badlocale);
bddebb56
KW
763
764 if (IN_LC(LC_CTYPE)) {
765 SvREFCNT_dec_NN(PL_warn_locale);
766 PL_warn_locale = NULL;
767 }
780fcc9f 768 }
baa60164 769 }
31f05a37 770 }
98994639
HS
771
772#endif /* USE_LOCALE_CTYPE */
7d4bcc4a 773
98994639
HS
774}
775
98994639 776void
2726666d
KW
777Perl__warn_problematic_locale()
778{
2726666d
KW
779
780#ifdef USE_LOCALE_CTYPE
781
5f04a188
KW
782 dTHX;
783
784 /* Internal-to-core function that outputs the message in PL_warn_locale,
785 * and then NULLS it. Should be called only through the macro
786 * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */
787
2726666d 788 if (PL_warn_locale) {
2726666d
KW
789 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
790 SvPVX(PL_warn_locale),
791 0 /* dummy to avoid compiler warning */ );
2726666d
KW
792 SvREFCNT_dec_NN(PL_warn_locale);
793 PL_warn_locale = NULL;
794 }
795
796#endif
797
798}
799
a4f00dcc
KW
800STATIC void
801S_new_collate(pTHX_ const char *newcoll)
98994639 802{
7d4bcc4a
KW
803
804#ifndef USE_LOCALE_COLLATE
805
806 PERL_UNUSED_ARG(newcoll);
807 PERL_UNUSED_CONTEXT;
808
809#else
0d071d52 810
291a84fb 811 /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
0d071d52
KW
812 * core Perl this and that 'newcoll' is the name of the new locale.
813 *
d35fca5f
KW
814 * The design of locale collation is that every locale change is given an
815 * index 'PL_collation_ix'. The first time a string particpates in an
816 * operation that requires collation while locale collation is active, it
817 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
818 * magic includes the collation index, and the transformation of the string
819 * by strxfrm(), q.v. That transformation is used when doing comparisons,
820 * instead of the string itself. If a string changes, the magic is
821 * cleared. The next time the locale changes, the index is incremented,
822 * and so we know during a comparison that the transformation is not
823 * necessarily still valid, and so is recomputed. Note that if the locale
824 * changes enough times, the index could wrap (a U32), and it is possible
825 * that a transformation would improperly be considered valid, leading to
826 * an unlikely bug */
0d071d52 827
98994639
HS
828 if (! newcoll) {
829 if (PL_collation_name) {
830 ++PL_collation_ix;
831 Safefree(PL_collation_name);
832 PL_collation_name = NULL;
833 }
834 PL_collation_standard = TRUE;
00bf60ca 835 is_standard_collation:
98994639
HS
836 PL_collxfrm_base = 0;
837 PL_collxfrm_mult = 2;
165a1c52 838 PL_in_utf8_COLLATE_locale = FALSE;
f28f4d2a 839 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 840 PL_strxfrm_max_cp = 0;
98994639
HS
841 return;
842 }
843
d35fca5f 844 /* If this is not the same locale as currently, set the new one up */
98994639
HS
845 if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
846 ++PL_collation_ix;
847 Safefree(PL_collation_name);
848 PL_collation_name = stdize_locale(savepv(newcoll));
a39edc4c 849 PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
00bf60ca
KW
850 if (PL_collation_standard) {
851 goto is_standard_collation;
852 }
98994639 853
165a1c52 854 PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
f28f4d2a 855 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 856 PL_strxfrm_max_cp = 0;
165a1c52 857
59c018b9
KW
858 /* A locale collation definition includes primary, secondary, tertiary,
859 * etc. weights for each character. To sort, the primary weights are
860 * used, and only if they compare equal, then the secondary weights are
861 * used, and only if they compare equal, then the tertiary, etc.
862 *
863 * strxfrm() works by taking the input string, say ABC, and creating an
864 * output transformed string consisting of first the primary weights,
865 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
866 * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
867 * may not have weights at every level. In our example, let's say B
868 * doesn't have a tertiary weight, and A doesn't have a secondary
869 * weight. The constructed string is then going to be
870 * A¹B¹C¹ B²C² A³C³ ....
871 * This has the desired effect that strcmp() will look at the secondary
872 * or tertiary weights only if the strings compare equal at all higher
873 * priority weights. The spaces shown here, like in
c342d20e 874 * "A¹B¹C¹ A²B²C² "
59c018b9
KW
875 * are not just for readability. In the general case, these must
876 * actually be bytes, which we will call here 'separator weights'; and
877 * they must be smaller than any other weight value, but since these
878 * are C strings, only the terminating one can be a NUL (some
879 * implementations may include a non-NUL separator weight just before
880 * the NUL). Implementations tend to reserve 01 for the separator
881 * weights. They are needed so that a shorter string's secondary
882 * weights won't be misconstrued as primary weights of a longer string,
883 * etc. By making them smaller than any other weight, the shorter
884 * string will sort first. (Actually, if all secondary weights are
885 * smaller than all primary ones, there is no need for a separator
886 * weight between those two levels, etc.)
887 *
888 * The length of the transformed string is roughly a linear function of
889 * the input string. It's not exactly linear because some characters
890 * don't have weights at all levels. When we call strxfrm() we have to
891 * allocate some memory to hold the transformed string. The
892 * calculations below try to find coefficients 'm' and 'b' for this
893 * locale so that m*x + b equals how much space we need, given the size
894 * of the input string in 'x'. If we calculate too small, we increase
895 * the size as needed, and call strxfrm() again, but it is better to
896 * get it right the first time to avoid wasted expensive string
897 * transformations. */
898
98994639 899 {
79f120c8
KW
900 /* We use the string below to find how long the tranformation of it
901 * is. Almost all locales are supersets of ASCII, or at least the
902 * ASCII letters. We use all of them, half upper half lower,
903 * because if we used fewer, we might hit just the ones that are
904 * outliers in a particular locale. Most of the strings being
905 * collated will contain a preponderance of letters, and even if
906 * they are above-ASCII, they are likely to have the same number of
907 * weight levels as the ASCII ones. It turns out that digits tend
908 * to have fewer levels, and some punctuation has more, but those
909 * are relatively sparse in text, and khw believes this gives a
910 * reasonable result, but it could be changed if experience so
911 * dictates. */
912 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
913 char * x_longer; /* Transformed 'longer' */
914 Size_t x_len_longer; /* Length of 'x_longer' */
915
916 char * x_shorter; /* We also transform a substring of 'longer' */
917 Size_t x_len_shorter;
918
a4a439fb 919 /* _mem_collxfrm() is used get the transformation (though here we
79f120c8
KW
920 * are interested only in its length). It is used because it has
921 * the intelligence to handle all cases, but to work, it needs some
922 * values of 'm' and 'b' to get it started. For the purposes of
923 * this calculation we use a very conservative estimate of 'm' and
924 * 'b'. This assumes a weight can be multiple bytes, enough to
925 * hold any UV on the platform, and there are 5 levels, 4 weight
926 * bytes, and a trailing NUL. */
927 PL_collxfrm_base = 5;
928 PL_collxfrm_mult = 5 * sizeof(UV);
929
930 /* Find out how long the transformation really is */
a4a439fb
KW
931 x_longer = _mem_collxfrm(longer,
932 sizeof(longer) - 1,
933 &x_len_longer,
934
935 /* We avoid converting to UTF-8 in the
936 * called function by telling it the
937 * string is in UTF-8 if the locale is a
938 * UTF-8 one. Since the string passed
939 * here is invariant under UTF-8, we can
940 * claim it's UTF-8 even though it isn't.
941 * */
942 PL_in_utf8_COLLATE_locale);
79f120c8
KW
943 Safefree(x_longer);
944
945 /* Find out how long the transformation of a substring of 'longer'
946 * is. Together the lengths of these transformations are
947 * sufficient to calculate 'm' and 'b'. The substring is all of
948 * 'longer' except the first character. This minimizes the chances
949 * of being swayed by outliers */
a4a439fb 950 x_shorter = _mem_collxfrm(longer + 1,
79f120c8 951 sizeof(longer) - 2,
a4a439fb
KW
952 &x_len_shorter,
953 PL_in_utf8_COLLATE_locale);
79f120c8
KW
954 Safefree(x_shorter);
955
956 /* If the results are nonsensical for this simple test, the whole
957 * locale definition is suspect. Mark it so that locale collation
958 * is not active at all for it. XXX Should we warn? */
959 if ( x_len_shorter == 0
960 || x_len_longer == 0
961 || x_len_shorter >= x_len_longer)
962 {
963 PL_collxfrm_mult = 0;
964 PL_collxfrm_base = 0;
965 }
966 else {
967 SSize_t base; /* Temporary */
968
969 /* We have both: m * strlen(longer) + b = x_len_longer
970 * m * strlen(shorter) + b = x_len_shorter;
971 * subtracting yields:
972 * m * (strlen(longer) - strlen(shorter))
973 * = x_len_longer - x_len_shorter
974 * But we have set things up so that 'shorter' is 1 byte smaller
975 * than 'longer'. Hence:
976 * m = x_len_longer - x_len_shorter
977 *
978 * But if something went wrong, make sure the multiplier is at
979 * least 1.
980 */
981 if (x_len_longer > x_len_shorter) {
982 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
983 }
984 else {
985 PL_collxfrm_mult = 1;
986 }
987
988 /* mx + b = len
989 * so: b = len - mx
990 * but in case something has gone wrong, make sure it is
991 * non-negative */
992 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
993 if (base < 0) {
994 base = 0;
995 }
996
997 /* Add 1 for the trailing NUL */
998 PL_collxfrm_base = base + 1;
999 }
58eebef2 1000
7d4bcc4a
KW
1001# ifdef DEBUGGING
1002
58eebef2
KW
1003 if (DEBUG_L_TEST || debug_initialization) {
1004 PerlIO_printf(Perl_debug_log,
b07929e4
KW
1005 "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
1006 "x_len_longer=%zu,"
1007 " collate multipler=%zu, collate base=%zu\n",
58eebef2
KW
1008 __FILE__, __LINE__,
1009 PL_in_utf8_COLLATE_locale,
1010 x_len_shorter, x_len_longer,
1011 PL_collxfrm_mult, PL_collxfrm_base);
1012 }
7d4bcc4a
KW
1013# endif
1014
98994639
HS
1015 }
1016 }
1017
1018#endif /* USE_LOCALE_COLLATE */
7d4bcc4a 1019
98994639
HS
1020}
1021
d2b24094 1022#ifdef WIN32
b8cc575c 1023
a4f00dcc 1024STATIC char *
b8cc575c 1025S_win32_setlocale(pTHX_ int category, const char* locale)
b385bb4d
KW
1026{
1027 /* This, for Windows, emulates POSIX setlocale() behavior. There is no
7d4bcc4a
KW
1028 * difference between the two unless the input locale is "", which normally
1029 * means on Windows to get the machine default, which is set via the
1030 * computer's "Regional and Language Options" (or its current equivalent).
1031 * In POSIX, it instead means to find the locale from the user's
1032 * environment. This routine changes the Windows behavior to first look in
1033 * the environment, and, if anything is found, use that instead of going to
1034 * the machine default. If there is no environment override, the machine
1035 * default is used, by calling the real setlocale() with "".
1036 *
1037 * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
1038 * use the particular category's variable if set; otherwise to use the LANG
1039 * variable. */
b385bb4d 1040
175c4cf9 1041 bool override_LC_ALL = FALSE;
89f7b9aa 1042 char * result;
e5f10d49 1043 unsigned int i;
89f7b9aa 1044
b385bb4d 1045 if (locale && strEQ(locale, "")) {
7d4bcc4a
KW
1046
1047# ifdef LC_ALL
1048
b385bb4d
KW
1049 locale = PerlEnv_getenv("LC_ALL");
1050 if (! locale) {
e5f10d49
KW
1051 if (category == LC_ALL) {
1052 override_LC_ALL = TRUE;
1053 }
1054 else {
7d4bcc4a
KW
1055
1056# endif
7d4bcc4a 1057
e5f10d49
KW
1058 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1059 if (category == categories[i]) {
1060 locale = PerlEnv_getenv(category_names[i]);
1061 goto found_locale;
1062 }
1063 }
7d4bcc4a 1064
b385bb4d 1065 locale = PerlEnv_getenv("LANG");
481465ea 1066 if (! locale) {
b385bb4d
KW
1067 locale = "";
1068 }
e5f10d49
KW
1069
1070 found_locale: ;
7d4bcc4a
KW
1071
1072# ifdef LC_ALL
1073
e5f10d49 1074 }
b385bb4d 1075 }
7d4bcc4a
KW
1076
1077# endif
1078
b385bb4d
KW
1079 }
1080
89f7b9aa 1081 result = setlocale(category, locale);
bbc98134 1082 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
a4f00dcc 1083 setlocale_debug_string(category, locale, result)));
89f7b9aa 1084
481465ea 1085 if (! override_LC_ALL) {
89f7b9aa
KW
1086 return result;
1087 }
1088
dfd77d7a 1089 /* Here the input category was LC_ALL, and we have set it to what is in the
481465ea
KW
1090 * LANG variable or the system default if there is no LANG. But these have
1091 * lower priority than the other LC_foo variables, so override it for each
1092 * one that is set. (If they are set to "", it means to use the same thing
1093 * we just set LC_ALL to, so can skip) */
7d4bcc4a 1094
948523db 1095 for (i = 0; i < LC_ALL_INDEX; i++) {
e5f10d49
KW
1096 result = PerlEnv_getenv(category_names[i]);
1097 if (result && strNE(result, "")) {
1098 setlocale(categories[i], result);
1099 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
1100 __FILE__, __LINE__,
1101 setlocale_debug_string(categories[i], result, "not captured")));
1102 }
89f7b9aa 1103 }
7d4bcc4a 1104
bbc98134
KW
1105 result = setlocale(LC_ALL, NULL);
1106 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
1107 __FILE__, __LINE__,
a4f00dcc 1108 setlocale_debug_string(LC_ALL, NULL, result)));
89f7b9aa 1109
bbc98134 1110 return result;
b385bb4d
KW
1111}
1112
1113#endif
1114
a4f00dcc
KW
1115char *
1116Perl_setlocale(int category, const char * locale)
1117{
1118 /* This wraps POSIX::setlocale() */
1119
1120 char * retval;
1159483a 1121 char * newlocale;
a4f00dcc
KW
1122 dTHX;
1123
a4f00dcc
KW
1124#ifdef USE_LOCALE_NUMERIC
1125
291a84fb
KW
1126 /* A NULL locale means only query what the current one is. We have the
1127 * LC_NUMERIC name saved, because we are normally switched into the C
1128 * locale for it. For an LC_ALL query, switch back to get the correct
1129 * results. All other categories don't require special handling */
a4f00dcc
KW
1130 if (locale == NULL) {
1131 if (category == LC_NUMERIC) {
1132 return savepv(PL_numeric_name);
1133 }
1134
7d4bcc4a 1135# ifdef LC_ALL
a4f00dcc 1136
c0d737a8
KW
1137 else if (category == LC_ALL && ! PL_numeric_underlying) {
1138
a4f00dcc
KW
1139 SET_NUMERIC_UNDERLYING();
1140 }
1141
7d4bcc4a 1142# endif
a4f00dcc
KW
1143
1144 }
1145
1146#endif
1147
e5f10d49
KW
1148 /* Save retval since subsequent setlocale() calls may overwrite it. */
1149 retval = savepv(do_setlocale_r(category, locale));
a4f00dcc
KW
1150
1151 DEBUG_L(PerlIO_printf(Perl_debug_log,
1152 "%s:%d: %s\n", __FILE__, __LINE__,
1153 setlocale_debug_string(category, locale, retval)));
1154 if (! retval) {
1155 /* Should never happen that a query would return an error, but be
1156 * sure and reset to C locale */
1157 if (locale == 0) {
1158 SET_NUMERIC_STANDARD();
1159 }
7d4bcc4a 1160
a4f00dcc
KW
1161 return NULL;
1162 }
1163
a4f00dcc
KW
1164 /* If locale == NULL, we are just querying the state, but may have switched
1165 * to NUMERIC_UNDERLYING. Switch back before returning. */
1166 if (locale == NULL) {
1167 SET_NUMERIC_STANDARD();
1168 return retval;
1169 }
a4f00dcc 1170
1159483a
KW
1171 /* Now that have switched locales, we have to update our records to
1172 * correspond. */
a4f00dcc 1173
1159483a 1174 switch (category) {
a4f00dcc 1175
1159483a 1176#ifdef USE_LOCALE_CTYPE
a4f00dcc 1177
1159483a
KW
1178 case LC_CTYPE:
1179 new_ctype(retval);
1180 break;
a4f00dcc 1181
1159483a 1182#endif
a4f00dcc
KW
1183#ifdef USE_LOCALE_COLLATE
1184
1159483a
KW
1185 case LC_COLLATE:
1186 new_collate(retval);
1187 break;
a4f00dcc 1188
1159483a 1189#endif
a4f00dcc
KW
1190#ifdef USE_LOCALE_NUMERIC
1191
1159483a
KW
1192 case LC_NUMERIC:
1193 new_numeric(retval);
1194 break;
a4f00dcc 1195
1159483a
KW
1196#endif
1197#ifdef LC_ALL
a4f00dcc 1198
1159483a 1199 case LC_ALL:
a4f00dcc 1200
1159483a
KW
1201 /* LC_ALL updates all the things we care about. The values may not
1202 * be the same as 'retval', as the locale "" may have set things
1203 * individually */
a4f00dcc 1204
1159483a 1205# ifdef USE_LOCALE_CTYPE
a4f00dcc 1206
1159483a
KW
1207 newlocale = do_setlocale_c(LC_CTYPE, NULL);
1208 new_ctype(newlocale);
a4f00dcc 1209
1159483a
KW
1210# endif /* USE_LOCALE_CTYPE */
1211# ifdef USE_LOCALE_COLLATE
1212
1213 newlocale = do_setlocale_c(LC_COLLATE, NULL);
1214 new_collate(newlocale);
a4f00dcc 1215
7d4bcc4a 1216# endif
1159483a 1217# ifdef USE_LOCALE_NUMERIC
a4f00dcc 1218
1159483a
KW
1219 newlocale = do_setlocale_c(LC_NUMERIC, NULL);
1220 new_numeric(newlocale);
a4f00dcc 1221
1159483a
KW
1222# endif /* USE_LOCALE_NUMERIC */
1223#endif /* LC_ALL */
a4f00dcc 1224
1159483a
KW
1225 default:
1226 break;
a4f00dcc
KW
1227 }
1228
1229 return retval;
1230
f7416781
KW
1231
1232}
1233
1234PERL_STATIC_INLINE const char *
1235S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset)
1236{
1237 /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size',
1238 * growing it if necessary */
1239
1240 const Size_t string_size = strlen(string) + offset + 1;
1241
1242 PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
1243
1244 if (*buf_size == 0) {
1245 Newx(*buf, string_size, char);
1246 *buf_size = string_size;
1247 }
1248 else if (string_size > *buf_size) {
1249 Renew(*buf, string_size, char);
1250 *buf_size = string_size;
1251 }
1252
1253 Copy(string, *buf + offset, string_size - offset, char);
1254 return *buf;
1255}
1256
1257/*
1258
1259=head1 Locale-related functions and macros
1260
1261=for apidoc Perl_langinfo
1262
7d4bcc4a 1263This is an (almost ª) drop-in replacement for the system C<L<nl_langinfo(3)>>,
f7416781
KW
1264taking the same C<item> parameter values, and returning the same information.
1265But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
1266of Perl's locale handling from your code, and can be used on systems that lack
1267a native C<nl_langinfo>.
1268
1269Expanding on these:
1270
1271=over
1272
1273=item *
1274
1275It delivers the correct results for the C<RADIXCHAR> and C<THOUSESEP> items,
1276without you having to write extra code. The reason for the extra code would be
1277because these are from the C<LC_NUMERIC> locale category, which is normally
1278kept set to the C locale by Perl, no matter what the underlying locale is
1279supposed to be, and so to get the expected results, you have to temporarily
1280toggle into the underlying locale, and later toggle back. (You could use
1281plain C<nl_langinfo> and C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this
1282but then you wouldn't get the other advantages of C<Perl_langinfo()>; not
1283keeping C<LC_NUMERIC> in the C locale would break a lot of CPAN, which is
1284expecting the radix (decimal point) character to be a dot.)
1285
1286=item *
1287
1288Depending on C<item>, it works on systems that don't have C<nl_langinfo>, hence
1289makes your code more portable. Of the fifty-some possible items specified by
1290the POSIX 2008 standard,
1291L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
1292only two are completely unimplemented. It uses various techniques to recover
1293the other items, including calling C<L<localeconv(3)>>, and C<L<strftime(3)>>,
1294both of which are specified in C89, so should be always be available. Later
1295C<strftime()> versions have additional capabilities; C<""> is returned for
1296those not available on your system.
1297
1298The details for those items which may differ from what this emulation returns
1299and what a native C<nl_langinfo()> would return are:
1300
1301=over
1302
1303=item C<CODESET>
1304
1305=item C<ERA>
1306
1307Unimplemented, so returns C<"">.
1308
1309=item C<YESEXPR>
1310
c1566110
KW
1311=item C<YESSTR>
1312
f7416781
KW
1313=item C<NOEXPR>
1314
c1566110
KW
1315=item C<NOSTR>
1316
1317Only the values for English are returned. C<YESSTR> and C<NOSTR> have been
1318removed from POSIX 2008, and are retained for backwards compatibility. Your
1319platform's C<nl_langinfo> may not support them.
f7416781
KW
1320
1321=item C<D_FMT>
1322
1323Always evaluates to C<%x>, the locale's appropriate date representation.
1324
1325=item C<T_FMT>
1326
1327Always evaluates to C<%X>, the locale's appropriate time representation.
1328
1329=item C<D_T_FMT>
1330
1331Always evaluates to C<%c>, the locale's appropriate date and time
1332representation.
1333
1334=item C<CRNCYSTR>
1335
1336The return may be incorrect for those rare locales where the currency symbol
1337replaces the radix character.
1338Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1339to work differently.
1340
1341=item C<ALT_DIGITS>
1342
1343Currently this gives the same results as Linux does.
1344Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1345to work differently.
1346
1347=item C<ERA_D_FMT>
1348
1349=item C<ERA_T_FMT>
1350
1351=item C<ERA_D_T_FMT>
1352
1353=item C<T_FMT_AMPM>
1354
1355These are derived by using C<strftime()>, and not all versions of that function
1356know about them. C<""> is returned for these on such systems.
1357
1358=back
1359
1360When using C<Perl_langinfo> on systems that don't have a native
1361C<nl_langinfo()>, you must
1362
1363 #include "perl_langinfo.h"
1364
1365before the C<perl.h> C<#include>. You can replace your C<langinfo.h>
1366C<#include> with this one. (Doing it this way keeps out the symbols that plain
1367C<langinfo.h> imports into the namespace for code that doesn't need it.)
1368
1369You also should not use the bare C<langinfo.h> item names, but should preface
1370them with C<PERL_>, so use C<PERL_RADIXCHAR> instead of plain C<RADIXCHAR>.
1371The C<PERL_I<foo>> versions will also work for this function on systems that do
1372have a native C<nl_langinfo>.
1373
1374=item *
1375
1376It is thread-friendly, returning its result in a buffer that won't be
1377overwritten by another thread, so you don't have to code for that possibility.
1378The buffer can be overwritten by the next call to C<nl_langinfo> or
1379C<Perl_langinfo> in the same thread.
1380
1381=item *
1382
7d4bcc4a 1383ª It returns S<C<const char *>>, whereas plain C<nl_langinfo()> returns S<C<char
f7416781
KW
1384*>>, but you are (only by documentation) forbidden to write into the buffer.
1385By declaring this C<const>, the compiler enforces this restriction. The extra
1386C<const> is why this isn't an unequivocal drop-in replacement for
1387C<nl_langinfo>.
1388
1389=back
1390
1391The original impetus for C<Perl_langinfo()> was so that code that needs to
1392find out the current currency symbol, floating point radix character, or digit
1393grouping separator can use, on all systems, the simpler and more
1394thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
1395pain to make thread-friendly. For other fields returned by C<localeconv>, it
1396is better to use the methods given in L<perlcall> to call
1397L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
1398
1399=cut
1400
1401*/
1402
1403const char *
1404#ifdef HAS_NL_LANGINFO
1405Perl_langinfo(const nl_item item)
1406#else
1407Perl_langinfo(const int item)
1408#endif
1409{
f61748ac
KW
1410 return my_nl_langinfo(item, TRUE);
1411}
1412
1413const char *
1414#ifdef HAS_NL_LANGINFO
1415S_my_nl_langinfo(const nl_item item, bool toggle)
1416#else
1417S_my_nl_langinfo(const int item, bool toggle)
1418#endif
1419{
ae74815b 1420 dTHX;
f7416781 1421
5a854ab3
KW
1422 /* We only need to toggle into the underlying LC_NUMERIC locale for these
1423 * two items, and only if not already there */
1424 if (toggle && (( item != PERL_RADIXCHAR && item != PERL_THOUSEP)
1425 || PL_numeric_underlying))
1426 {
1427 toggle = FALSE;
1428 }
1429
ab340fff
KW
1430#if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
1431#if ! defined(HAS_POSIX_2008_LOCALE)
f7416781 1432
ab340fff 1433 /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
ae74815b
KW
1434 * for those items dependent on it. This must be copied to a buffer before
1435 * switching back, as some systems destroy the buffer when setlocale() is
1436 * called */
f7416781 1437
038d3702
KW
1438 {
1439 DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
1440
b2fee59c 1441 if (toggle) {
038d3702 1442 STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
b2fee59c 1443 }
f7416781 1444
5acc4454
KW
1445 LOCALE_LOCK; /* Prevent interference from another thread executing
1446 this code section (the only call to nl_langinfo in
1447 the core) */
1448
b2fee59c
KW
1449 save_to_buffer(nl_langinfo(item), &PL_langinfo_buf,
1450 &PL_langinfo_bufsize, 0);
f7416781 1451
5acc4454
KW
1452 LOCALE_UNLOCK;
1453
b2fee59c 1454 if (toggle) {
038d3702 1455 RESTORE_LC_NUMERIC();
b2fee59c 1456 }
f7416781
KW
1457 }
1458
ab340fff
KW
1459# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */
1460
5a854ab3 1461 {
b2fee59c
KW
1462 bool do_free = FALSE;
1463 locale_t cur = uselocale((locale_t) 0);
ab340fff 1464
b2fee59c
KW
1465 if (cur == LC_GLOBAL_LOCALE) {
1466 cur = duplocale(LC_GLOBAL_LOCALE);
1467 do_free = TRUE;
1468 }
ab340fff 1469
b2fee59c 1470 if (toggle) {
e1aa2579
KW
1471 if (PL_underlying_numeric_obj) {
1472 cur = PL_underlying_numeric_obj;
1473 }
1474 else {
1475 cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
1476 do_free = TRUE;
1477 }
b2fee59c 1478 }
ab340fff 1479
b2fee59c
KW
1480 save_to_buffer(nl_langinfo_l(item, cur),
1481 &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1482 if (do_free) {
1483 freelocale(cur);
1484 }
5a854ab3 1485 }
ab340fff 1486
c1566110
KW
1487# endif
1488
1489 if (strEQ(PL_langinfo_buf, "")) {
1490 if (item == PERL_YESSTR) {
1491 return "yes";
1492 }
1493 if (item == PERL_NOSTR) {
1494 return "no";
1495 }
1496 }
1497
ab340fff
KW
1498 return PL_langinfo_buf;
1499
f7416781 1500#else /* Below, emulate nl_langinfo as best we can */
43dd6b15
KW
1501
1502 {
1503
f7416781
KW
1504# ifdef HAS_LOCALECONV
1505
43dd6b15 1506 const struct lconv* lc;
038d3702 1507 DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
f7416781
KW
1508
1509# endif
1510# ifdef HAS_STRFTIME
1511
43dd6b15
KW
1512 struct tm tm;
1513 bool return_format = FALSE; /* Return the %format, not the value */
1514 const char * format;
f7416781
KW
1515
1516# endif
1517
43dd6b15
KW
1518 /* We copy the results to a per-thread buffer, even if not
1519 * multi-threaded. This is in part to simplify this code, and partly
1520 * because we need a buffer anyway for strftime(), and partly because a
1521 * call of localeconv() could otherwise wipe out the buffer, and the
1522 * programmer would not be expecting this, as this is a nl_langinfo()
1523 * substitute after all, so s/he might be thinking their localeconv()
1524 * is safe until another localeconv() call. */
f7416781 1525
43dd6b15
KW
1526 switch (item) {
1527 Size_t len;
1528 const char * retval;
f7416781 1529
43dd6b15
KW
1530 /* These 2 are unimplemented */
1531 case PERL_CODESET:
1532 case PERL_ERA: /* For use with strftime() %E modifier */
f7416781 1533
43dd6b15
KW
1534 default:
1535 return "";
f7416781 1536
43dd6b15
KW
1537 /* We use only an English set, since we don't know any more */
1538 case PERL_YESEXPR: return "^[+1yY]";
1539 case PERL_YESSTR: return "yes";
1540 case PERL_NOEXPR: return "^[-0nN]";
1541 case PERL_NOSTR: return "no";
f7416781
KW
1542
1543# ifdef HAS_LOCALECONV
1544
43dd6b15 1545 case PERL_CRNCYSTR:
f7416781 1546
43dd6b15
KW
1547 /* We don't bother with localeconv_l() because any system that
1548 * has it is likely to also have nl_langinfo() */
291a84fb 1549
5acc4454
KW
1550 LOCALE_LOCK; /* Prevent interference with other threads
1551 using localeconv() */
1552
43dd6b15
KW
1553 lc = localeconv();
1554 if ( ! lc
1555 || ! lc->currency_symbol
1556 || strEQ("", lc->currency_symbol))
1557 {
1558 LOCALE_UNLOCK;
1559 return "";
1560 }
f7416781 1561
43dd6b15
KW
1562 /* Leave the first spot empty to be filled in below */
1563 save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
1564 &PL_langinfo_bufsize, 1);
1565 if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
1566 { /* khw couldn't figure out how the localedef specifications
1567 would show that the $ should replace the radix; this is
1568 just a guess as to how it might work.*/
1569 *PL_langinfo_buf = '.';
1570 }
1571 else if (lc->p_cs_precedes) {
1572 *PL_langinfo_buf = '-';
1573 }
1574 else {
1575 *PL_langinfo_buf = '+';
1576 }
f7416781 1577
43dd6b15
KW
1578 LOCALE_UNLOCK;
1579 break;
f7416781 1580
43dd6b15
KW
1581 case PERL_RADIXCHAR:
1582 case PERL_THOUSEP:
f7416781 1583
43dd6b15 1584 if (toggle) {
038d3702 1585 STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
c0d737a8 1586 }
f7416781 1587
5acc4454
KW
1588 LOCALE_LOCK; /* Prevent interference with other threads
1589 using localeconv() */
1590
43dd6b15
KW
1591 lc = localeconv();
1592 if (! lc) {
33394adc
KW
1593 retval = "";
1594 }
43dd6b15
KW
1595 else {
1596 retval = (item == PERL_RADIXCHAR)
1597 ? lc->decimal_point
1598 : lc->thousands_sep;
1599 if (! retval) {
1600 retval = "";
1601 }
1602 }
f7416781 1603
43dd6b15
KW
1604 save_to_buffer(retval, &PL_langinfo_buf,
1605 &PL_langinfo_bufsize, 0);
f7416781 1606
5acc4454
KW
1607 LOCALE_UNLOCK;
1608
43dd6b15 1609 if (toggle) {
038d3702 1610 RESTORE_LC_NUMERIC();
43dd6b15 1611 }
f7416781 1612
43dd6b15 1613 break;
f7416781
KW
1614
1615# endif
1616# ifdef HAS_STRFTIME
1617
43dd6b15
KW
1618 /* These are defined by C89, so we assume that strftime supports
1619 * them, and so are returned unconditionally; they may not be what
1620 * the locale actually says, but should give good enough results
1621 * for someone using them as formats (as opposed to trying to parse
1622 * them to figure out what the locale says). The other format
1623 * items are actually tested to verify they work on the platform */
1624 case PERL_D_FMT: return "%x";
1625 case PERL_T_FMT: return "%X";
1626 case PERL_D_T_FMT: return "%c";
1627
1628 /* These formats are only available in later strfmtime's */
1629 case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT:
1630 case PERL_T_FMT_AMPM:
1631
1632 /* The rest can be gotten from most versions of strftime(). */
1633 case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3:
1634 case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6:
1635 case PERL_ABDAY_7:
1636 case PERL_ALT_DIGITS:
1637 case PERL_AM_STR: case PERL_PM_STR:
1638 case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3:
1639 case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6:
1640 case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9:
1641 case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12:
1642 case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4:
1643 case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7:
1644 case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4:
1645 case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8:
1646 case PERL_MON_9: case PERL_MON_10: case PERL_MON_11:
1647 case PERL_MON_12:
1648
1649 LOCALE_LOCK;
1650
1651 init_tm(&tm); /* Precaution against core dumps */
1652 tm.tm_sec = 30;
1653 tm.tm_min = 30;
1654 tm.tm_hour = 6;
1655 tm.tm_year = 2017 - 1900;
1656 tm.tm_wday = 0;
1657 tm.tm_mon = 0;
1658 switch (item) {
1659 default:
1660 LOCALE_UNLOCK;
1661 Perl_croak(aTHX_
1662 "panic: %s: %d: switch case: %d problem",
1663 __FILE__, __LINE__, item);
1664 NOT_REACHED; /* NOTREACHED */
1665
1666 case PERL_PM_STR: tm.tm_hour = 18;
1667 case PERL_AM_STR:
1668 format = "%p";
1669 break;
1670
1671 case PERL_ABDAY_7: tm.tm_wday++;
1672 case PERL_ABDAY_6: tm.tm_wday++;
1673 case PERL_ABDAY_5: tm.tm_wday++;
1674 case PERL_ABDAY_4: tm.tm_wday++;
1675 case PERL_ABDAY_3: tm.tm_wday++;
1676 case PERL_ABDAY_2: tm.tm_wday++;
1677 case PERL_ABDAY_1:
1678 format = "%a";
1679 break;
1680
1681 case PERL_DAY_7: tm.tm_wday++;
1682 case PERL_DAY_6: tm.tm_wday++;
1683 case PERL_DAY_5: tm.tm_wday++;
1684 case PERL_DAY_4: tm.tm_wday++;
1685 case PERL_DAY_3: tm.tm_wday++;
1686 case PERL_DAY_2: tm.tm_wday++;
1687 case PERL_DAY_1:
1688 format = "%A";
1689 break;
1690
1691 case PERL_ABMON_12: tm.tm_mon++;
1692 case PERL_ABMON_11: tm.tm_mon++;
1693 case PERL_ABMON_10: tm.tm_mon++;
1694 case PERL_ABMON_9: tm.tm_mon++;
1695 case PERL_ABMON_8: tm.tm_mon++;
1696 case PERL_ABMON_7: tm.tm_mon++;
1697 case PERL_ABMON_6: tm.tm_mon++;
1698 case PERL_ABMON_5: tm.tm_mon++;
1699 case PERL_ABMON_4: tm.tm_mon++;
1700 case PERL_ABMON_3: tm.tm_mon++;
1701 case PERL_ABMON_2: tm.tm_mon++;
1702 case PERL_ABMON_1:
1703 format = "%b";
1704 break;
1705
1706 case PERL_MON_12: tm.tm_mon++;
1707 case PERL_MON_11: tm.tm_mon++;
1708 case PERL_MON_10: tm.tm_mon++;
1709 case PERL_MON_9: tm.tm_mon++;
1710 case PERL_MON_8: tm.tm_mon++;
1711 case PERL_MON_7: tm.tm_mon++;
1712 case PERL_MON_6: tm.tm_mon++;
1713 case PERL_MON_5: tm.tm_mon++;
1714 case PERL_MON_4: tm.tm_mon++;
1715 case PERL_MON_3: tm.tm_mon++;
1716 case PERL_MON_2: tm.tm_mon++;
1717 case PERL_MON_1:
1718 format = "%B";
1719 break;
1720
1721 case PERL_T_FMT_AMPM:
1722 format = "%r";
1723 return_format = TRUE;
1724 break;
1725
1726 case PERL_ERA_D_FMT:
1727 format = "%Ex";
1728 return_format = TRUE;
1729 break;
1730
1731 case PERL_ERA_T_FMT:
1732 format = "%EX";
1733 return_format = TRUE;
1734 break;
1735
1736 case PERL_ERA_D_T_FMT:
1737 format = "%Ec";
1738 return_format = TRUE;
1739 break;
1740
1741 case PERL_ALT_DIGITS:
1742 tm.tm_wday = 0;
1743 format = "%Ow"; /* Find the alternate digit for 0 */
1744 break;
1745 }
f7416781 1746
43dd6b15
KW
1747 /* We can't use my_strftime() because it doesn't look at
1748 * tm_wday */
1749 while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
1750 format, &tm))
1751 {
1752 /* A zero return means one of:
1753 * a) there wasn't enough space in PL_langinfo_buf
1754 * b) the format, like a plain %p, returns empty
1755 * c) it was an illegal format, though some
1756 * implementations of strftime will just return the
1757 * illegal format as a plain character sequence.
1758 *
1759 * To quickly test for case 'b)', try again but precede
1760 * the format with a plain character. If that result is
1761 * still empty, the problem is either 'a)' or 'c)' */
1762
1763 Size_t format_size = strlen(format) + 1;
1764 Size_t mod_size = format_size + 1;
1765 char * mod_format;
1766 char * temp_result;
1767
1768 Newx(mod_format, mod_size, char);
1769 Newx(temp_result, PL_langinfo_bufsize, char);
6873aa47 1770 *mod_format = ' ';
43dd6b15
KW
1771 my_strlcpy(mod_format + 1, format, mod_size);
1772 len = strftime(temp_result,
1773 PL_langinfo_bufsize,
1774 mod_format, &tm);
1775 Safefree(mod_format);
1776 Safefree(temp_result);
1777
1778 /* If 'len' is non-zero, it means that we had a case like
1779 * %p which means the current locale doesn't use a.m. or
1780 * p.m., and that is valid */
1781 if (len == 0) {
1782
1783 /* Here, still didn't work. If we get well beyond a
1784 * reasonable size, bail out to prevent an infinite
1785 * loop. */
1786
1787 if (PL_langinfo_bufsize > 100 * format_size) {
1788 *PL_langinfo_buf = '\0';
1789 }
1790 else {
1791 /* Double the buffer size to retry; Add 1 in case
1792 * original was 0, so we aren't stuck at 0. */
1793 PL_langinfo_bufsize *= 2;
1794 PL_langinfo_bufsize++;
1795 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
1796 continue;
1797 }
1798 }
f7416781 1799
f7416781 1800 break;
43dd6b15 1801 }
f7416781 1802
43dd6b15
KW
1803 /* Here, we got a result.
1804 *
1805 * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
1806 * alternate format for wday 0. If the value is the same as
1807 * the normal 0, there isn't an alternate, so clear the buffer.
1808 * */
1809 if ( item == PERL_ALT_DIGITS
1810 && strEQ(PL_langinfo_buf, "0"))
1811 {
1812 *PL_langinfo_buf = '\0';
1813 }
f7416781 1814
43dd6b15
KW
1815 /* ALT_DIGITS is problematic. Experiments on it showed that
1816 * strftime() did not always work properly when going from
1817 * alt-9 to alt-10. Only a few locales have this item defined,
1818 * and in all of them on Linux that khw was able to find,
1819 * nl_langinfo() merely returned the alt-0 character, possibly
1820 * doubled. Most Unicode digits are in blocks of 10
1821 * consecutive code points, so that is sufficient information
1822 * for those scripts, as we can infer alt-1, alt-2, .... But
1823 * for a Japanese locale, a CJK ideographic 0 is returned, and
1824 * the CJK digits are not in code point order, so you can't
1825 * really infer anything. The localedef for this locale did
1826 * specify the succeeding digits, so that strftime() works
1827 * properly on them, without needing to infer anything. But
1828 * the nl_langinfo() return did not give sufficient information
1829 * for the caller to understand what's going on. So until
1830 * there is evidence that it should work differently, this
1831 * returns the alt-0 string for ALT_DIGITS.
1832 *
1833 * wday was chosen because its range is all a single digit.
1834 * Things like tm_sec have two digits as the minimum: '00' */
f7416781 1835
43dd6b15 1836 LOCALE_UNLOCK;
f7416781 1837
43dd6b15
KW
1838 /* If to return the format, not the value, overwrite the buffer
1839 * with it. But some strftime()s will keep the original format
1840 * if illegal, so change those to "" */
1841 if (return_format) {
1842 if (strEQ(PL_langinfo_buf, format)) {
f7416781
KW
1843 *PL_langinfo_buf = '\0';
1844 }
43dd6b15
KW
1845 else {
1846 save_to_buffer(format, &PL_langinfo_buf,
1847 &PL_langinfo_bufsize, 0);
f7416781
KW
1848 }
1849 }
1850
1851 break;
f7416781
KW
1852
1853# endif
1854
43dd6b15 1855 }
f7416781
KW
1856 }
1857
1858 return PL_langinfo_buf;
1859
1860#endif
1861
a4f00dcc 1862}
b385bb4d 1863
98994639
HS
1864/*
1865 * Initialize locale awareness.
1866 */
1867int
1868Perl_init_i18nl10n(pTHX_ int printwarn)
1869{
0e92a118
KW
1870 /* printwarn is
1871 *
1872 * 0 if not to output warning when setup locale is bad
1873 * 1 if to output warning based on value of PERL_BADLANG
1874 * >1 if to output regardless of PERL_BADLANG
1875 *
1876 * returns
98994639 1877 * 1 = set ok or not applicable,
0e92a118
KW
1878 * 0 = fallback to a locale of lower priority
1879 * -1 = fallback to all locales failed, not even to the C locale
6b058d42
KW
1880 *
1881 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
1882 * set, debugging information is output.
1883 *
1884 * This looks more complicated than it is, mainly due to the #ifdefs.
1885 *
1886 * We try to set LC_ALL to the value determined by the environment. If
1887 * there is no LC_ALL on this platform, we try the individual categories we
1888 * know about. If this works, we are done.
1889 *
1890 * But if it doesn't work, we have to do something else. We search the
1891 * environment variables ourselves instead of relying on the system to do
1892 * it. We look at, in order, LC_ALL, LANG, a system default locale (if we
1893 * think there is one), and the ultimate fallback "C". This is all done in
1894 * the same loop as above to avoid duplicating code, but it makes things
7d4bcc4a
KW
1895 * more complex. The 'trial_locales' array is initialized with just one
1896 * element; it causes the behavior described in the paragraph above this to
1897 * happen. If that fails, we add elements to 'trial_locales', and do extra
1898 * loop iterations to cause the behavior described in this paragraph.
6b058d42
KW
1899 *
1900 * On Ultrix, the locale MUST come from the environment, so there is
1901 * preliminary code to set it. I (khw) am not sure that it is necessary,
1902 * and that this couldn't be folded into the loop, but barring any real
1903 * platforms to test on, it's staying as-is
1904 *
1905 * A slight complication is that in embedded Perls, the locale may already
1906 * be set-up, and we don't want to get it from the normal environment
1907 * variables. This is handled by having a special environment variable
1908 * indicate we're in this situation. We simply set setlocale's 2nd
1909 * parameter to be a NULL instead of "". That indicates to setlocale that
1910 * it is not to change anything, but to return the current value,
1911 * effectively initializing perl's db to what the locale already is.
1912 *
1913 * We play the same trick with NULL if a LC_ALL succeeds. We call
1914 * setlocale() on the individual categores with NULL to get their existing
1915 * values for our db, instead of trying to change them.
1916 * */
98994639 1917
0e92a118
KW
1918 int ok = 1;
1919
7d4bcc4a
KW
1920#ifndef USE_LOCALE
1921
1922 PERL_UNUSED_ARG(printwarn);
1923
1924#else /* USE_LOCALE */
7d4bcc4a
KW
1925# ifdef __GLIBC__
1926
175c4cf9 1927 const char * const language = savepv(PerlEnv_getenv("LANGUAGE"));
7d4bcc4a
KW
1928
1929# endif
65ebb059 1930
ccd65d51
KW
1931 /* NULL uses the existing already set up locale */
1932 const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
1933 ? NULL
1934 : "";
c3fcd832
KW
1935 const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */
1936 unsigned int trial_locales_count;
175c4cf9
KW
1937 const char * const lc_all = savepv(PerlEnv_getenv("LC_ALL"));
1938 const char * const lang = savepv(PerlEnv_getenv("LANG"));
98994639 1939 bool setlocale_failure = FALSE;
65ebb059 1940 unsigned int i;
175c4cf9
KW
1941
1942 /* A later getenv() could zap this, so only use here */
1943 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
1944
1945 const bool locwarn = (printwarn > 1
e5f10d49
KW
1946 || ( printwarn
1947 && ( ! bad_lang_use_once
22ff3130 1948 || (
e5f10d49
KW
1949 /* disallow with "" or "0" */
1950 *bad_lang_use_once
1951 && strNE("0", bad_lang_use_once)))));
ea92aad8 1952
291a84fb 1953 /* setlocale() return vals; not copied so must be looked at immediately */
8de4332b 1954 const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
291a84fb
KW
1955
1956 /* current locale for given category; should have been copied so aren't
1957 * volatile */
8de4332b 1958 const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
291a84fb 1959
7d4bcc4a
KW
1960# ifdef WIN32
1961
6bce99ee
JH
1962 /* In some systems you can find out the system default locale
1963 * and use that as the fallback locale. */
7d4bcc4a
KW
1964# define SYSTEM_DEFAULT_LOCALE
1965# endif
1966# ifdef SYSTEM_DEFAULT_LOCALE
1967
65ebb059 1968 const char *system_default_locale = NULL;
98994639 1969
7d4bcc4a 1970# endif
948523db
KW
1971
1972# ifndef DEBUGGING
1973# define DEBUG_LOCALE_INIT(a,b,c)
1974# else
7d4bcc4a 1975
8298454c 1976 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
7d4bcc4a
KW
1977
1978# define DEBUG_LOCALE_INIT(category, locale, result) \
2fcc0ca9
KW
1979 STMT_START { \
1980 if (debug_initialization) { \
1981 PerlIO_printf(Perl_debug_log, \
1982 "%s:%d: %s\n", \
1983 __FILE__, __LINE__, \
a4f00dcc 1984 setlocale_debug_string(category, \
2fcc0ca9
KW
1985 locale, \
1986 result)); \
1987 } \
1988 } STMT_END
2fcc0ca9 1989
948523db
KW
1990/* Make sure the parallel arrays are properly set up */
1991# ifdef USE_LOCALE_NUMERIC
1992 assert(categories[LC_NUMERIC_INDEX] == LC_NUMERIC);
1993 assert(strEQ(category_names[LC_NUMERIC_INDEX], "LC_NUMERIC"));
1994# endif
1995# ifdef USE_LOCALE_CTYPE
1996 assert(categories[LC_CTYPE_INDEX] == LC_CTYPE);
1997 assert(strEQ(category_names[LC_CTYPE_INDEX], "LC_CTYPE"));
1998# endif
1999# ifdef USE_LOCALE_COLLATE
2000 assert(categories[LC_COLLATE_INDEX] == LC_COLLATE);
2001 assert(strEQ(category_names[LC_COLLATE_INDEX], "LC_COLLATE"));
2002# endif
2003# ifdef USE_LOCALE_TIME
2004 assert(categories[LC_TIME_INDEX] == LC_TIME);
2005 assert(strEQ(category_names[LC_TIME_INDEX], "LC_TIME"));
2006# endif
2007# ifdef USE_LOCALE_MESSAGES
2008 assert(categories[LC_MESSAGES_INDEX] == LC_MESSAGES);
2009 assert(strEQ(category_names[LC_MESSAGES_INDEX], "LC_MESSAGES"));
2010# endif
2011# ifdef USE_LOCALE_MONETARY
2012 assert(categories[LC_MONETARY_INDEX] == LC_MONETARY);
2013 assert(strEQ(category_names[LC_MONETARY_INDEX], "LC_MONETARY"));
2014# endif
9821811f
KW
2015# ifdef USE_LOCALE_ADDRESS
2016 assert(categories[LC_ADDRESS_INDEX] == LC_ADDRESS);
2017 assert(strEQ(category_names[LC_ADDRESS_INDEX], "LC_ADDRESS"));
2018# endif
2019# ifdef USE_LOCALE_IDENTIFICATION
2020 assert(categories[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION);
2021 assert(strEQ(category_names[LC_IDENTIFICATION_INDEX], "LC_IDENTIFICATION"));
2022# endif
2023# ifdef USE_LOCALE_MEASUREMENT
2024 assert(categories[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT);
2025 assert(strEQ(category_names[LC_MEASUREMENT_INDEX], "LC_MEASUREMENT"));
2026# endif
2027# ifdef USE_LOCALE_PAPER
2028 assert(categories[LC_PAPER_INDEX] == LC_PAPER);
2029 assert(strEQ(category_names[LC_PAPER_INDEX], "LC_PAPER"));
2030# endif
2031# ifdef USE_LOCALE_TELEPHONE
2032 assert(categories[LC_TELEPHONE_INDEX] == LC_TELEPHONE);
2033 assert(strEQ(category_names[LC_TELEPHONE_INDEX], "LC_TELEPHONE"));
2034# endif
948523db
KW
2035# ifdef LC_ALL
2036 assert(categories[LC_ALL_INDEX] == LC_ALL);
2037 assert(strEQ(category_names[LC_ALL_INDEX], "LC_ALL"));
2038 assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX);
2039# endif
2040# endif /* DEBUGGING */
47280b20
KW
2041
2042 /* Initialize the cache of the program's UTF-8ness for the always known
2043 * locales C and POSIX */
2044 my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness,
2045 sizeof(PL_locale_utf8ness));
2046
3ca88433
KW
2047 PL_numeric_radix_sv = newSVpvs(".");
2048
b56c4436 2049# ifdef LOCALE_ENVIRON_REQUIRED
98994639
HS
2050
2051 /*
2052 * Ultrix setlocale(..., "") fails if there are no environment
2053 * variables from which to get a locale name.
2054 */
2055
b56c4436
KW
2056# ifndef LC_ALL
2057# error Ultrix without LC_ALL not implemented
2058# else
7d4bcc4a 2059
b56c4436
KW
2060 {
2061 bool done = FALSE;
ea92aad8
KW
2062 if (lang) {
2063 sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, setlocale_init);
2064 DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result[LC_ALL_INDEX]);
2065 if (sl_result[LC_ALL_INDEX])
2066 done = TRUE;
2067 else
e5f10d49 2068 setlocale_failure = TRUE;
ea92aad8
KW
2069 }
2070 if (! setlocale_failure) {
2071 const char * locale_param;
2072 for (i = 0; i < LC_ALL_INDEX; i++) {
2073 locale_param = (! done && (lang || PerlEnv_getenv(category_names[i])))
2074 ? setlocale_init
2075 : NULL;
2076 sl_result[i] = do_setlocale_r(categories[i], locale_param);
2077 if (! sl_result[i]) {
2078 setlocale_failure = TRUE;
2079 }
2080 DEBUG_LOCALE_INIT(categories[i], locale_param, sl_result[i]);
e5f10d49 2081 }
c835d6be 2082 }
7d4bcc4a
KW
2083 }
2084
2085# endif /* LC_ALL */
e5f10d49 2086# endif /* LOCALE_ENVIRON_REQUIRED */
98994639 2087
65ebb059 2088 /* We try each locale in the list until we get one that works, or exhaust
20a240df
KW
2089 * the list. Normally the loop is executed just once. But if setting the
2090 * locale fails, inside the loop we add fallback trials to the array and so
2091 * will execute the loop multiple times */
c3fcd832
KW
2092 trial_locales[0] = setlocale_init;
2093 trial_locales_count = 1;
7d4bcc4a 2094
65ebb059
KW
2095 for (i= 0; i < trial_locales_count; i++) {
2096 const char * trial_locale = trial_locales[i];
2097
2098 if (i > 0) {
2099
2100 /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
2101 * when i==0, but I (khw) don't think that behavior makes much
2102 * sense */
2103 setlocale_failure = FALSE;
2104
7d4bcc4a 2105# ifdef SYSTEM_DEFAULT_LOCALE
291a84fb 2106# ifdef WIN32 /* Note that assumes Win32 has LC_ALL */
7d4bcc4a 2107
65ebb059
KW
2108 /* On Windows machines, an entry of "" after the 0th means to use
2109 * the system default locale, which we now proceed to get. */
2110 if (strEQ(trial_locale, "")) {
2111 unsigned int j;
2112
2113 /* Note that this may change the locale, but we are going to do
2114 * that anyway just below */
837ce802 2115 system_default_locale = do_setlocale_c(LC_ALL, "");
5d1187d1 2116 DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
65ebb059 2117
7d4bcc4a 2118 /* Skip if invalid or if it's already on the list of locales to
65ebb059
KW
2119 * try */
2120 if (! system_default_locale) {
2121 goto next_iteration;
2122 }
2123 for (j = 0; j < trial_locales_count; j++) {
2124 if (strEQ(system_default_locale, trial_locales[j])) {
2125 goto next_iteration;
2126 }
2127 }
2128
2129 trial_locale = system_default_locale;
2130 }
ec0202b5
KW
2131# else
2132# error SYSTEM_DEFAULT_LOCALE only implemented for Win32
2133# endif
7d4bcc4a 2134# endif /* SYSTEM_DEFAULT_LOCALE */
291a84fb
KW
2135
2136 } /* For i > 0 */
65ebb059 2137
7d4bcc4a
KW
2138# ifdef LC_ALL
2139
948523db
KW
2140 sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, trial_locale);
2141 DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result[LC_ALL_INDEX]);
2142 if (! sl_result[LC_ALL_INDEX]) {
49c85077 2143 setlocale_failure = TRUE;
7cd8b568
KW
2144 }
2145 else {
2146 /* Since LC_ALL succeeded, it should have changed all the other
2147 * categories it can to its value; so we massage things so that the
2148 * setlocales below just return their category's current values.
2149 * This adequately handles the case in NetBSD where LC_COLLATE may
2150 * not be defined for a locale, and setting it individually will
7d4bcc4a 2151 * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
7cd8b568
KW
2152 * the POSIX locale. */
2153 trial_locale = NULL;
2154 }
7d4bcc4a
KW
2155
2156# endif /* LC_ALL */
98994639 2157
e5f10d49
KW
2158 if (! setlocale_failure) {
2159 unsigned int j;
2160 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
2161 curlocales[j]
2162 = savepv(do_setlocale_r(categories[j], trial_locale));
2163 if (! curlocales[j]) {
2164 setlocale_failure = TRUE;
2165 }
2166 DEBUG_LOCALE_INIT(categories[j], trial_locale, curlocales[j]);
2167 }
c835d6be 2168
e5f10d49
KW
2169 if (! setlocale_failure) { /* All succeeded */
2170 break; /* Exit trial_locales loop */
49c85077 2171 }
65ebb059 2172 }
98994639 2173
49c85077
KW
2174 /* Here, something failed; will need to try a fallback. */
2175 ok = 0;
65ebb059 2176
49c85077
KW
2177 if (i == 0) {
2178 unsigned int j;
98994639 2179
65ebb059 2180 if (locwarn) { /* Output failure info only on the first one */
7d4bcc4a
KW
2181
2182# ifdef LC_ALL
98994639 2183
49c85077
KW
2184 PerlIO_printf(Perl_error_log,
2185 "perl: warning: Setting locale failed.\n");
98994639 2186
7d4bcc4a 2187# else /* !LC_ALL */
98994639 2188
49c85077
KW
2189 PerlIO_printf(Perl_error_log,
2190 "perl: warning: Setting locale failed for the categories:\n\t");
7d4bcc4a 2191
e5f10d49
KW
2192 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
2193 if (! curlocales[j]) {
2194 PerlIO_printf(Perl_error_log, category_names[j]);
2195 }
2196 else {
2197 Safefree(curlocales[j]);
2198 }
2199 }
7d4bcc4a 2200
7d4bcc4a 2201# endif /* LC_ALL */
98994639 2202
49c85077
KW
2203 PerlIO_printf(Perl_error_log,
2204 "perl: warning: Please check that your locale settings:\n");
98994639 2205
7d4bcc4a
KW
2206# ifdef __GLIBC__
2207
49c85077
KW
2208 PerlIO_printf(Perl_error_log,
2209 "\tLANGUAGE = %c%s%c,\n",
2210 language ? '"' : '(',
2211 language ? language : "unset",
2212 language ? '"' : ')');
7d4bcc4a 2213# endif
98994639 2214
49c85077
KW
2215 PerlIO_printf(Perl_error_log,
2216 "\tLC_ALL = %c%s%c,\n",
2217 lc_all ? '"' : '(',
2218 lc_all ? lc_all : "unset",
2219 lc_all ? '"' : ')');
98994639 2220
7d4bcc4a
KW
2221# if defined(USE_ENVIRON_ARRAY)
2222
49c85077 2223 {
cd999af9 2224 char **e;
d5e32b93
KW
2225
2226 /* Look through the environment for any variables of the
2227 * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
2228 * already handled above. These are assumed to be locale
2229 * settings. Output them and their values. */
cd999af9 2230 for (e = environ; *e; e++) {
d5e32b93
KW
2231 const STRLEN prefix_len = sizeof("LC_") - 1;
2232 STRLEN uppers_len;
2233
cd999af9 2234 if ( strBEGINs(*e, "LC_")
c8b388b0 2235 && ! strBEGINs(*e, "LC_ALL=")
d5e32b93
KW
2236 && (uppers_len = strspn(*e + prefix_len,
2237 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
2238 && ((*e)[prefix_len + uppers_len] == '='))
cd999af9
KW
2239 {
2240 PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
d5e32b93
KW
2241 (int) (prefix_len + uppers_len), *e,
2242 *e + prefix_len + uppers_len + 1);
cd999af9
KW
2243 }
2244 }
49c85077 2245 }
7d4bcc4a
KW
2246
2247# else
2248
49c85077
KW
2249 PerlIO_printf(Perl_error_log,
2250 "\t(possibly more locale environment variables)\n");
7d4bcc4a
KW
2251
2252# endif
98994639 2253
49c85077
KW
2254 PerlIO_printf(Perl_error_log,
2255 "\tLANG = %c%s%c\n",
2256 lang ? '"' : '(',
2257 lang ? lang : "unset",
2258 lang ? '"' : ')');
98994639 2259
49c85077
KW
2260 PerlIO_printf(Perl_error_log,
2261 " are supported and installed on your system.\n");
2262 }
98994639 2263
65ebb059 2264 /* Calculate what fallback locales to try. We have avoided this
f6bab5f6 2265 * until we have to, because failure is quite unlikely. This will
65ebb059
KW
2266 * usually change the upper bound of the loop we are in.
2267 *
2268 * Since the system's default way of setting the locale has not
2269 * found one that works, We use Perl's defined ordering: LC_ALL,
2270 * LANG, and the C locale. We don't try the same locale twice, so
2271 * don't add to the list if already there. (On POSIX systems, the
2272 * LC_ALL element will likely be a repeat of the 0th element "",
6b058d42
KW
2273 * but there's no harm done by doing it explicitly.
2274 *
2275 * Note that this tries the LC_ALL environment variable even on
2276 * systems which have no LC_ALL locale setting. This may or may
2277 * not have been originally intentional, but there's no real need
2278 * to change the behavior. */
65ebb059
KW
2279 if (lc_all) {
2280 for (j = 0; j < trial_locales_count; j++) {
2281 if (strEQ(lc_all, trial_locales[j])) {
2282 goto done_lc_all;
2283 }
2284 }
2285 trial_locales[trial_locales_count++] = lc_all;
2286 }
2287 done_lc_all:
98994639 2288
65ebb059
KW
2289 if (lang) {
2290 for (j = 0; j < trial_locales_count; j++) {
2291 if (strEQ(lang, trial_locales[j])) {
2292 goto done_lang;
2293 }
2294 }
2295 trial_locales[trial_locales_count++] = lang;
2296 }
2297 done_lang:
2298
7d4bcc4a
KW
2299# if defined(WIN32) && defined(LC_ALL)
2300
65ebb059
KW
2301 /* For Windows, we also try the system default locale before "C".
2302 * (If there exists a Windows without LC_ALL we skip this because
2303 * it gets too complicated. For those, the "C" is the next
2304 * fallback possibility). The "" is the same as the 0th element of
2305 * the array, but the code at the loop above knows to treat it
2306 * differently when not the 0th */
2307 trial_locales[trial_locales_count++] = "";
7d4bcc4a
KW
2308
2309# endif
65ebb059
KW
2310
2311 for (j = 0; j < trial_locales_count; j++) {
2312 if (strEQ("C", trial_locales[j])) {
2313 goto done_C;
2314 }
2315 }
2316 trial_locales[trial_locales_count++] = "C";
98994639 2317
65ebb059
KW
2318 done_C: ;
2319 } /* end of first time through the loop */
98994639 2320
7d4bcc4a
KW
2321# ifdef WIN32
2322
65ebb059 2323 next_iteration: ;
7d4bcc4a
KW
2324
2325# endif
65ebb059
KW
2326
2327 } /* end of looping through the trial locales */
2328
2329 if (ok < 1) { /* If we tried to fallback */
2330 const char* msg;
2331 if (! setlocale_failure) { /* fallback succeeded */
2332 msg = "Falling back to";
2333 }
2334 else { /* fallback failed */
e5f10d49 2335 unsigned int j;
98994639 2336
65ebb059
KW
2337 /* We dropped off the end of the loop, so have to decrement i to
2338 * get back to the value the last time through */
2339 i--;
98994639 2340
65ebb059
KW
2341 ok = -1;
2342 msg = "Failed to fall back to";
2343
2344 /* To continue, we should use whatever values we've got */
7d4bcc4a 2345
e5f10d49
KW
2346 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
2347 Safefree(curlocales[j]);
2348 curlocales[j] = savepv(do_setlocale_r(categories[j], NULL));
2349 DEBUG_LOCALE_INIT(categories[j], NULL, curlocales[j]);
2350 }
65ebb059
KW
2351 }
2352
2353 if (locwarn) {
2354 const char * description;
2355 const char * name = "";
2356 if (strEQ(trial_locales[i], "C")) {
2357 description = "the standard locale";
2358 name = "C";
2359 }
7d4bcc4a
KW
2360
2361# ifdef SYSTEM_DEFAULT_LOCALE
2362
65ebb059
KW
2363 else if (strEQ(trial_locales[i], "")) {
2364 description = "the system default locale";
2365 if (system_default_locale) {
2366 name = system_default_locale;
2367 }
2368 }
7d4bcc4a
KW
2369
2370# endif /* SYSTEM_DEFAULT_LOCALE */
2371
65ebb059
KW
2372 else {
2373 description = "a fallback locale";
2374 name = trial_locales[i];
2375 }
2376 if (name && strNE(name, "")) {
2377 PerlIO_printf(Perl_error_log,
2378 "perl: warning: %s %s (\"%s\").\n", msg, description, name);
2379 }
2380 else {
2381 PerlIO_printf(Perl_error_log,
2382 "perl: warning: %s %s.\n", msg, description);
2383 }
2384 }
2385 } /* End of tried to fallback */
98994639 2386
e5f10d49
KW
2387 /* Done with finding the locales; update our records */
2388
7d4bcc4a
KW
2389# ifdef USE_LOCALE_CTYPE
2390
948523db 2391 new_ctype(curlocales[LC_CTYPE_INDEX]);
98994639 2392
e5f10d49 2393# endif
7d4bcc4a
KW
2394# ifdef USE_LOCALE_COLLATE
2395
948523db 2396 new_collate(curlocales[LC_COLLATE_INDEX]);
98994639 2397
e5f10d49 2398# endif
7d4bcc4a
KW
2399# ifdef USE_LOCALE_NUMERIC
2400
948523db 2401 new_numeric(curlocales[LC_NUMERIC_INDEX]);
e5f10d49
KW
2402
2403# endif
2404
948523db 2405 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
47280b20
KW
2406
2407# if defined(USE_ITHREADS)
2408
2409 /* This caches whether each category's locale is UTF-8 or not. This
2410 * may involve changing the locale. It is ok to do this at
2411 * initialization time before any threads have started, but not later.
2412 * Caching means that if the program heeds our dictate not to change
2413 * locales in threaded applications, this data will remain valid, and
2414 * it may get queried without changing locales. If the environment is
2415 * such that all categories have the same locale, this isn't needed, as
2416 * the code will not change the locale; but this handles the uncommon
2417 * case where the environment has disparate locales for the categories
2418 * */
2419 (void) _is_cur_LC_category_utf8(categories[i]);
2420
2421# endif
2422
e5f10d49
KW
2423 Safefree(curlocales[i]);
2424 }
b310b053 2425
7d4bcc4a
KW
2426# if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
2427
49c85077 2428 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
50bf02bd
KW
2429 * locale is UTF-8. The call to new_ctype() just above has already
2430 * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
2431 * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
2432 * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
2433 * STDIN, STDOUT, STDERR, _and_ the default open discipline. */
2434 PL_utf8locale = PL_in_utf8_CTYPE_locale;
49c85077 2435
a05d7ebb 2436 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
fde18df1
JH
2437 This is an alternative to using the -C command line switch
2438 (the -C if present will override this). */
2439 {
dd374669 2440 const char *p = PerlEnv_getenv("PERL_UNICODE");
a05d7ebb 2441 PL_unicode = p ? parse_unicode_opts(&p) : 0;
5a22a2bb
NC
2442 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
2443 PL_utf8cache = -1;
b310b053
JH
2444 }
2445
7d4bcc4a 2446# endif
7d4bcc4a
KW
2447# ifdef __GLIBC__
2448
175c4cf9 2449 Safefree(language);
7d4bcc4a
KW
2450
2451# endif
175c4cf9
KW
2452
2453 Safefree(lc_all);
2454 Safefree(lang);
2455
e3305790 2456#endif /* USE_LOCALE */
2fcc0ca9 2457#ifdef DEBUGGING
7d4bcc4a 2458
2fcc0ca9 2459 /* So won't continue to output stuff */
27cdc72e 2460 DEBUG_INITIALIZATION_set(FALSE);
7d4bcc4a 2461
2fcc0ca9
KW
2462#endif
2463
98994639
HS
2464 return ok;
2465}
2466
98994639
HS
2467#ifdef USE_LOCALE_COLLATE
2468
a4a439fb 2469char *
a4a439fb
KW
2470Perl__mem_collxfrm(pTHX_ const char *input_string,
2471 STRLEN len, /* Length of 'input_string' */
2472 STRLEN *xlen, /* Set to length of returned string
2473 (not including the collation index
2474 prefix) */
2475 bool utf8 /* Is the input in UTF-8? */
6696cfa7 2476 )
98994639 2477{
a4a439fb
KW
2478
2479 /* _mem_collxfrm() is a bit like strxfrm() but with two important
2480 * differences. First, it handles embedded NULs. Second, it allocates a bit
2481 * more memory than needed for the transformed data itself. The real
55e5378d 2482 * transformed data begins at offset COLLXFRM_HDR_LEN. *xlen is set to
a4a439fb
KW
2483 * the length of that, and doesn't include the collation index size.
2484 * Please see sv_collxfrm() to see how this is used. */
2485
55e5378d
KW
2486#define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
2487
6696cfa7
KW
2488 char * s = (char *) input_string;
2489 STRLEN s_strlen = strlen(input_string);
79f120c8 2490 char *xbuf = NULL;
55e5378d 2491 STRLEN xAlloc; /* xalloc is a reserved word in VC */
17f41037 2492 STRLEN length_in_chars;
c664130f 2493 bool first_time = TRUE; /* Cleared after first loop iteration */
98994639 2494
a4a439fb
KW
2495 PERL_ARGS_ASSERT__MEM_COLLXFRM;
2496
2497 /* Must be NUL-terminated */
2498 assert(*(input_string + len) == '\0');
7918f24d 2499
79f120c8
KW
2500 /* If this locale has defective collation, skip */
2501 if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
c7202dee
KW
2502 DEBUG_L(PerlIO_printf(Perl_debug_log,
2503 "_mem_collxfrm: locale's collation is defective\n"));
79f120c8
KW
2504 goto bad;
2505 }
2506
6696cfa7
KW
2507 /* Replace any embedded NULs with the control that sorts before any others.
2508 * This will give as good as possible results on strings that don't
2509 * otherwise contain that character, but otherwise there may be
2510 * less-than-perfect results with that character and NUL. This is
fdc080f3 2511 * unavoidable unless we replace strxfrm with our own implementation. */
fd43f63c
KW
2512 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
2513 NUL */
6696cfa7
KW
2514 char * e = s + len;
2515 char * sans_nuls;
fdc080f3 2516 STRLEN sans_nuls_len;
94762aa0 2517 int try_non_controls;
afc4976f
KW
2518 char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
2519 making sure 2nd byte is NUL.
2520 */
2521 STRLEN this_replacement_len;
2522
1e4c9676
KW
2523 /* If we don't know what non-NUL control character sorts lowest for
2524 * this locale, find it */
f28f4d2a 2525 if (PL_strxfrm_NUL_replacement == '\0') {
6696cfa7 2526 int j;
afc4976f 2527 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
6696cfa7
KW
2528 includes the collation index
2529 prefixed. */
2530
91c0e2e0 2531 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
94762aa0
KW
2532
2533 /* Unlikely, but it may be that no control will work to replace
1e4c9676
KW
2534 * NUL, in which case we instead look for any character. Controls
2535 * are preferred because collation order is, in general, context
2536 * sensitive, with adjoining characters affecting the order, and
2537 * controls are less likely to have such interactions, allowing the
2538 * NUL-replacement to stand on its own. (Another way to look at it
2539 * is to imagine what would happen if the NUL were replaced by a
2540 * combining character; it wouldn't work out all that well.) */
94762aa0
KW
2541 for (try_non_controls = 0;
2542 try_non_controls < 2;
2543 try_non_controls++)
2544 {
d4ff9586
KW
2545 /* Look through all legal code points (NUL isn't) */
2546 for (j = 1; j < 256; j++) {
2547 char * x; /* j's xfrm plus collation index */
2548 STRLEN x_len; /* length of 'x' */
2549 STRLEN trial_len = 1;
736a4fed 2550 char cur_source[] = { '\0', '\0' };
d4ff9586 2551
736a4fed
KW
2552 /* Skip non-controls the first time through the loop. The
2553 * controls in a UTF-8 locale are the L1 ones */
afc4976f
KW
2554 if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
2555 ? ! isCNTRL_L1(j)
2556 : ! isCNTRL_LC(j))
2557 {
d4ff9586 2558 continue;
6696cfa7 2559 }
6696cfa7 2560
736a4fed
KW
2561 /* Create a 1-char string of the current code point */
2562 cur_source[0] = (char) j;
2563
d4ff9586
KW
2564 /* Then transform it */
2565 x = _mem_collxfrm(cur_source, trial_len, &x_len,
afc4976f 2566 0 /* The string is not in UTF-8 */);
6696cfa7 2567
1e4c9676 2568 /* Ignore any character that didn't successfully transform.
d4ff9586
KW
2569 * */
2570 if (! x) {
2571 continue;
2572 }
6696cfa7 2573
d4ff9586
KW
2574 /* If this character's transformation is lower than
2575 * the current lowest, this one becomes the lowest */
2576 if ( cur_min_x == NULL
2577 || strLT(x + COLLXFRM_HDR_LEN,
2578 cur_min_x + COLLXFRM_HDR_LEN))
2579 {
f28f4d2a 2580 PL_strxfrm_NUL_replacement = j;
d4ff9586 2581 cur_min_x = x;
d4ff9586
KW
2582 }
2583 else {
2584 Safefree(x);
2585 }
1e4c9676 2586 } /* end of loop through all 255 characters */
6696cfa7 2587
1e4c9676 2588 /* Stop looking if found */
94762aa0
KW
2589 if (cur_min_x) {
2590 break;
2591 }
2592
2593 /* Unlikely, but possible, if there aren't any controls that
2594 * work in the locale, repeat the loop, looking for any
2595 * character that works */
2596 DEBUG_L(PerlIO_printf(Perl_debug_log,
2597 "_mem_collxfrm: No control worked. Trying non-controls\n"));
1e4c9676 2598 } /* End of loop to try first the controls, then any char */
6696cfa7 2599
94762aa0
KW
2600 if (! cur_min_x) {
2601 DEBUG_L(PerlIO_printf(Perl_debug_log,
2602 "_mem_collxfrm: Couldn't find any character to replace"
2603 " embedded NULs in locale %s with", PL_collation_name));
2604 goto bad;
58eebef2
KW
2605 }
2606
94762aa0
KW
2607 DEBUG_L(PerlIO_printf(Perl_debug_log,
2608 "_mem_collxfrm: Replacing embedded NULs in locale %s with "
f28f4d2a 2609 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
94762aa0 2610
6696cfa7 2611 Safefree(cur_min_x);
1e4c9676 2612 } /* End of determining the character that is to replace NULs */
afc4976f
KW
2613
2614 /* If the replacement is variant under UTF-8, it must match the
291a84fb 2615 * UTF8-ness of the original */
f28f4d2a
KW
2616 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
2617 this_replacement_char[0] =
2618 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
2619 this_replacement_char[1] =
2620 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
afc4976f
KW
2621 this_replacement_len = 2;
2622 }
2623 else {
f28f4d2a 2624 this_replacement_char[0] = PL_strxfrm_NUL_replacement;
afc4976f
KW
2625 /* this_replacement_char[1] = '\0' was done at initialization */
2626 this_replacement_len = 1;
6696cfa7
KW
2627 }
2628
2629 /* The worst case length for the replaced string would be if every
2630 * character in it is NUL. Multiply that by the length of each
2631 * replacement, and allow for a trailing NUL */
afc4976f 2632 sans_nuls_len = (len * this_replacement_len) + 1;
fdc080f3 2633 Newx(sans_nuls, sans_nuls_len, char);
6696cfa7
KW
2634 *sans_nuls = '\0';
2635
6696cfa7
KW
2636 /* Replace each NUL with the lowest collating control. Loop until have
2637 * exhausted all the NULs */
2638 while (s + s_strlen < e) {
6069d6c5 2639 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
2640
2641 /* Do the actual replacement */
6069d6c5 2642 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
6696cfa7
KW
2643
2644 /* Move past the input NUL */
2645 s += s_strlen + 1;
2646 s_strlen = strlen(s);
2647 }
2648
2649 /* And add anything that trails the final NUL */
6069d6c5 2650 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
2651
2652 /* Switch so below we transform this modified string */
2653 s = sans_nuls;
2654 len = strlen(s);
1e4c9676 2655 } /* End of replacing NULs */
6696cfa7 2656
a4a439fb
KW
2657 /* Make sure the UTF8ness of the string and locale match */
2658 if (utf8 != PL_in_utf8_COLLATE_locale) {
2659 const char * const t = s; /* Temporary so we can later find where the
2660 input was */
2661
2662 /* Here they don't match. Change the string's to be what the locale is
2663 * expecting */
2664
2665 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
2666 s = (char *) bytes_to_utf8((const U8 *) s, &len);
2667 utf8 = TRUE;
2668 }
2669 else { /* locale is not UTF-8; but input is; downgrade the input */
2670
2671 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
2672
2673 /* If the downgrade was successful we are done, but if the input
2674 * contains things that require UTF-8 to represent, have to do
2675 * damage control ... */
2676 if (UNLIKELY(utf8)) {
2677
2678 /* What we do is construct a non-UTF-8 string with
2679 * 1) the characters representable by a single byte converted
2680 * to be so (if necessary);
2681 * 2) and the rest converted to collate the same as the
2682 * highest collating representable character. That makes
2683 * them collate at the end. This is similar to how we
2684 * handle embedded NULs, but we use the highest collating
2685 * code point instead of the smallest. Like the NUL case,
2686 * this isn't perfect, but is the best we can reasonably
2687 * do. Every above-255 code point will sort the same as
2688 * the highest-sorting 0-255 code point. If that code
2689 * point can combine in a sequence with some other code
2690 * points for weight calculations, us changing something to
2691 * be it can adversely affect the results. But in most
2692 * cases, it should work reasonably. And note that this is
2693 * really an illegal situation: using code points above 255
2694 * on a locale where only 0-255 are valid. If two strings
2695 * sort entirely equal, then the sort order for the
2696 * above-255 code points will be in code point order. */
2697
2698 utf8 = FALSE;
2699
2700 /* If we haven't calculated the code point with the maximum
2701 * collating order for this locale, do so now */
2702 if (! PL_strxfrm_max_cp) {
2703 int j;
2704
2705 /* The current transformed string that collates the
2706 * highest (except it also includes the prefixed collation
2707 * index. */
2708 char * cur_max_x = NULL;
2709
2710 /* Look through all legal code points (NUL isn't) */
2711 for (j = 1; j < 256; j++) {
2712 char * x;
2713 STRLEN x_len;
736a4fed 2714 char cur_source[] = { '\0', '\0' };
a4a439fb 2715
736a4fed
KW
2716 /* Create a 1-char string of the current code point */
2717 cur_source[0] = (char) j;
a4a439fb
KW
2718
2719 /* Then transform it */
2720 x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
2721
2722 /* If something went wrong (which it shouldn't), just
2723 * ignore this code point */
94762aa0 2724 if (! x) {
a4a439fb
KW
2725 continue;
2726 }
2727
2728 /* If this character's transformation is higher than
2729 * the current highest, this one becomes the highest */
2730 if ( cur_max_x == NULL
55e5378d
KW
2731 || strGT(x + COLLXFRM_HDR_LEN,
2732 cur_max_x + COLLXFRM_HDR_LEN))
a4a439fb
KW
2733 {
2734 PL_strxfrm_max_cp = j;
2735 cur_max_x = x;
2736 }
2737 else {
2738 Safefree(x);
2739 }
2740 }
2741
94762aa0
KW
2742 if (! cur_max_x) {
2743 DEBUG_L(PerlIO_printf(Perl_debug_log,
2744 "_mem_collxfrm: Couldn't find any character to"
2745 " replace above-Latin1 chars in locale %s with",
2746 PL_collation_name));
2747 goto bad;
2748 }
2749
58eebef2
KW
2750 DEBUG_L(PerlIO_printf(Perl_debug_log,
2751 "_mem_collxfrm: highest 1-byte collating character"
2752 " in locale %s is 0x%02X\n",
2753 PL_collation_name,
2754 PL_strxfrm_max_cp));
58eebef2 2755
a4a439fb
KW
2756 Safefree(cur_max_x);
2757 }
2758
2759 /* Here we know which legal code point collates the highest.
2760 * We are ready to construct the non-UTF-8 string. The length
2761 * will be at least 1 byte smaller than the input string
2762 * (because we changed at least one 2-byte character into a
2763 * single byte), but that is eaten up by the trailing NUL */
2764 Newx(s, len, char);
2765
2766 {
2767 STRLEN i;
2768 STRLEN d= 0;
042d9e50 2769 char * e = (char *) t + len;
a4a439fb
KW
2770
2771 for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
2772 U8 cur_char = t[i];
2773 if (UTF8_IS_INVARIANT(cur_char)) {
2774 s[d++] = cur_char;
2775 }
042d9e50 2776 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
a4a439fb
KW
2777 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
2778 }
2779 else { /* Replace illegal cp with highest collating
2780 one */
2781 s[d++] = PL_strxfrm_max_cp;
2782 }
2783 }
2784 s[d++] = '\0';
2785 Renew(s, d, char); /* Free up unused space */
2786 }
2787 }
2788 }
2789
2790 /* Here, we have constructed a modified version of the input. It could
2791 * be that we already had a modified copy before we did this version.
2792 * If so, that copy is no longer needed */
2793 if (t != input_string) {
2794 Safefree(t);
2795 }
2796 }
2797
17f41037
KW
2798 length_in_chars = (utf8)
2799 ? utf8_length((U8 *) s, (U8 *) s + len)
2800 : len;
2801
59c018b9
KW
2802 /* The first element in the output is the collation id, used by
2803 * sv_collxfrm(); then comes the space for the transformed string. The
2804 * equation should give us a good estimate as to how much is needed */
55e5378d 2805 xAlloc = COLLXFRM_HDR_LEN
a4a439fb 2806 + PL_collxfrm_base
17f41037 2807 + (PL_collxfrm_mult * length_in_chars);
a02a5408 2808 Newx(xbuf, xAlloc, char);
c7202dee
KW
2809 if (UNLIKELY(! xbuf)) {
2810 DEBUG_L(PerlIO_printf(Perl_debug_log,
2811 "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
98994639 2812 goto bad;
c7202dee 2813 }
98994639 2814
d35fca5f 2815 /* Store the collation id */
98994639 2816 *(U32*)xbuf = PL_collation_ix;
d35fca5f
KW
2817
2818 /* Then the transformation of the input. We loop until successful, or we
2819 * give up */
4ebeff16 2820 for (;;) {
1adab0a7 2821
55e5378d 2822 *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
4ebeff16
KW
2823
2824 /* If the transformed string occupies less space than we told strxfrm()
2825 * was available, it means it successfully transformed the whole
2826 * string. */
55e5378d 2827 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
17f41037 2828
1adab0a7
KW
2829 /* Some systems include a trailing NUL in the returned length.
2830 * Ignore it, using a loop in case multiple trailing NULs are
2831 * returned. */
2832 while ( (*xlen) > 0
2833 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
2834 {
2835 (*xlen)--;
2836 }
2837
17f41037
KW
2838 /* If the first try didn't get it, it means our prediction was low.
2839 * Modify the coefficients so that we predict a larger value in any
2840 * future transformations */
2841 if (! first_time) {
2842 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */
2843 STRLEN computed_guess = PL_collxfrm_base
2844 + (PL_collxfrm_mult * length_in_chars);
e1c30f0c
KW
2845
2846 /* On zero-length input, just keep current slope instead of
2847 * dividing by 0 */
2848 const STRLEN new_m = (length_in_chars != 0)
2849 ? needed / length_in_chars
2850 : PL_collxfrm_mult;
17f41037
KW
2851
2852 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
2853 "%s: %d: initial size of %zu bytes for a length "
2854 "%zu string was insufficient, %zu needed\n",
17f41037 2855 __FILE__, __LINE__,
b07929e4 2856 computed_guess, length_in_chars, needed));
17f41037
KW
2857
2858 /* If slope increased, use it, but discard this result for
2859 * length 1 strings, as we can't be sure that it's a real slope
2860 * change */
2861 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
7d4bcc4a
KW
2862
2863# ifdef DEBUGGING
2864
17f41037
KW
2865 STRLEN old_m = PL_collxfrm_mult;
2866 STRLEN old_b = PL_collxfrm_base;
7d4bcc4a
KW
2867
2868# endif
2869
17f41037
KW
2870 PL_collxfrm_mult = new_m;
2871 PL_collxfrm_base = 1; /* +1 For trailing NUL */
2872 computed_guess = PL_collxfrm_base
2873 + (PL_collxfrm_mult * length_in_chars);
2874 if (computed_guess < needed) {
2875 PL_collxfrm_base += needed - computed_guess;
2876 }
2877
2878 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
2879 "%s: %d: slope is now %zu; was %zu, base "
2880 "is now %zu; was %zu\n",
17f41037 2881 __FILE__, __LINE__,
b07929e4
KW
2882 PL_collxfrm_mult, old_m,
2883 PL_collxfrm_base, old_b));
17f41037
KW
2884 }
2885 else { /* Slope didn't change, but 'b' did */
2886 const STRLEN new_b = needed
2887 - computed_guess
2888 + PL_collxfrm_base;
2889 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4 2890 "%s: %d: base is now %zu; was %zu\n",
17f41037 2891 __FILE__, __LINE__,
b07929e4 2892 new_b, PL_collxfrm_base));
17f41037
KW
2893 PL_collxfrm_base = new_b;
2894 }
2895 }
2896
4ebeff16
KW
2897 break;
2898 }
bb0f664e 2899
c7202dee
KW
2900 if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
2901 DEBUG_L(PerlIO_printf(Perl_debug_log,
2902 "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
2903 *xlen, PERL_INT_MAX));
4ebeff16 2904 goto bad;
c7202dee 2905 }
d35fca5f 2906
c664130f 2907 /* A well-behaved strxfrm() returns exactly how much space it needs
1adab0a7
KW
2908 * (usually not including the trailing NUL) when it fails due to not
2909 * enough space being provided. Assume that this is the case unless
2910 * it's been proven otherwise */
c664130f 2911 if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
55e5378d 2912 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
c664130f
KW
2913 }
2914 else { /* Here, either:
2915 * 1) The strxfrm() has previously shown bad behavior; or
2916 * 2) It isn't the first time through the loop, which means
2917 * that the strxfrm() is now showing bad behavior, because
2918 * we gave it what it said was needed in the previous
2919 * iteration, and it came back saying it needed still more.
2920 * (Many versions of cygwin fit this. When the buffer size
2921 * isn't sufficient, they return the input size instead of
2922 * how much is needed.)
d4ff9586
KW
2923 * Increase the buffer size by a fixed percentage and try again.
2924 * */
6ddd902c 2925 xAlloc += (xAlloc / 4) + 1;
c664130f 2926 PL_strxfrm_is_behaved = FALSE;
c664130f 2927
7d4bcc4a
KW
2928# ifdef DEBUGGING
2929
58eebef2
KW
2930 if (DEBUG_Lv_TEST || debug_initialization) {
2931 PerlIO_printf(Perl_debug_log,
2932 "_mem_collxfrm required more space than previously calculated"
b07929e4 2933 " for locale %s, trying again with new guess=%d+%zu\n",
58eebef2 2934 PL_collation_name, (int) COLLXFRM_HDR_LEN,
b07929e4 2935 xAlloc - COLLXFRM_HDR_LEN);
58eebef2 2936 }
7d4bcc4a
KW
2937
2938# endif
2939
58eebef2 2940 }
c664130f 2941
4ebeff16 2942 Renew(xbuf, xAlloc, char);
c7202dee
KW
2943 if (UNLIKELY(! xbuf)) {
2944 DEBUG_L(PerlIO_printf(Perl_debug_log,
2945 "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
4ebeff16 2946 goto bad;
c7202dee 2947 }
c664130f
KW
2948
2949 first_time = FALSE;
4ebeff16 2950 }
98994639 2951
6696cfa7 2952
7d4bcc4a
KW
2953# ifdef DEBUGGING
2954
58eebef2 2955 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee
KW
2956
2957 print_collxfrm_input_and_return(s, s + len, xlen, utf8);
2958 PerlIO_printf(Perl_debug_log, "Its xfrm is:");
7e2f38b2
KW
2959 PerlIO_printf(Perl_debug_log, "%s\n",
2960 _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
2961 *xlen, 1));
58eebef2 2962 }
7d4bcc4a
KW
2963
2964# endif
58eebef2 2965
3c5f993e 2966 /* Free up unneeded space; retain ehough for trailing NUL */
55e5378d 2967 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
98994639 2968
6696cfa7
KW
2969 if (s != input_string) {
2970 Safefree(s);
98994639
HS
2971 }
2972
98994639
HS
2973 return xbuf;
2974
2975 bad:
2976 Safefree(xbuf);
6696cfa7
KW
2977 if (s != input_string) {
2978 Safefree(s);
2979 }
98994639 2980 *xlen = 0;
7d4bcc4a
KW
2981
2982# ifdef DEBUGGING
2983
58eebef2 2984 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee 2985 print_collxfrm_input_and_return(s, s + len, NULL, utf8);
58eebef2 2986 }
7d4bcc4a
KW
2987
2988# endif
2989
98994639
HS
2990 return NULL;
2991}
2992
7d4bcc4a 2993# ifdef DEBUGGING
c7202dee 2994
4cbaac56 2995STATIC void
c7202dee
KW
2996S_print_collxfrm_input_and_return(pTHX_
2997 const char * const s,
2998 const char * const e,
2999 const STRLEN * const xlen,
3000 const bool is_utf8)
3001{
c7202dee
KW
3002
3003 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
3004
511e4ff7
DM
3005 PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
3006 (UV)PL_collation_ix);
c7202dee 3007 if (xlen) {
08b6dc1d 3008 PerlIO_printf(Perl_debug_log, "%zu", *xlen);
c7202dee
KW
3009 }
3010 else {
3011 PerlIO_printf(Perl_debug_log, "NULL");
3012 }
3013 PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
3014 PL_collation_name);
9c8a6dc2
KW
3015 print_bytes_for_locale(s, e, is_utf8);
3016
3017 PerlIO_printf(Perl_debug_log, "'\n");
3018}
3019
3020STATIC void
3021S_print_bytes_for_locale(pTHX_
3022 const char * const s,
3023 const char * const e,
3024 const bool is_utf8)
3025{
3026 const char * t = s;
3027 bool prev_was_printable = TRUE;
3028 bool first_time = TRUE;
3029
3030 PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
c7202dee
KW
3031
3032 while (t < e) {
3033 UV cp = (is_utf8)
3034 ? utf8_to_uvchr_buf((U8 *) t, e, NULL)
3035 : * (U8 *) t;
3036 if (isPRINT(cp)) {
3037 if (! prev_was_printable) {
3038 PerlIO_printf(Perl_debug_log, " ");
3039 }
3040 PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
3041 prev_was_printable = TRUE;
3042 }
3043 else {
3044 if (! first_time) {
3045 PerlIO_printf(Perl_debug_log, " ");
3046 }
147e3846 3047 PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
c7202dee
KW
3048 prev_was_printable = FALSE;
3049 }
3050 t += (is_utf8) ? UTF8SKIP(t) : 1;
3051 first_time = FALSE;
3052 }
c7202dee
KW
3053}
3054
7d4bcc4a 3055# endif /* #ifdef DEBUGGING */
98994639 3056#endif /* USE_LOCALE_COLLATE */
58eebef2 3057
8ef6e574
KW
3058#ifdef USE_LOCALE
3059
962aa53f
KW
3060STATIC const char *
3061S_switch_category_locale_to_template(pTHX_ const int switch_category, const int template_category, const char * template_locale)
3062{
3063 /* Changes the locale for LC_'switch_category" to that of
3064 * LC_'template_category', if they aren't already the same. If not NULL,
3065 * 'template_locale' is the locale that 'template_category' is in.
3066 *
3067 * Returns the original locale for 'switch_category' so can be switched
3068 * back to with the companion function restore_switched_locale(), (NULL if
3069 * no restoral is necessary.) */
3070
3071 char * restore_to_locale = NULL;
3072
3073 if (switch_category == template_category) { /* No changes needed */
3074 return NULL;
3075 }
3076
3077 /* Find the original locale of the category we may need to change, so that
3078 * it can be restored to later */
3079 restore_to_locale = do_setlocale_r(switch_category, NULL);
3080 if (! restore_to_locale) {
3081 Perl_croak(aTHX_
3082 "panic: %s: %d: Could not find current %s locale, errno=%d\n",
3083 __FILE__, __LINE__, category_name(switch_category), errno);
3084 }
3085 restore_to_locale = stdize_locale(savepv(restore_to_locale));
3086
3087 /* If the locale of the template category wasn't passed in, find it now */
3088 if (template_locale == NULL) {
3089 template_locale = do_setlocale_r(template_category, NULL);
3090 if (! template_locale) {
3091 Perl_croak(aTHX_
3092 "panic: %s: %d: Could not find current %s locale, errno=%d\n",
3093 __FILE__, __LINE__, category_name(template_category), errno);
3094 }
3095 }
3096
3097 /* It the locales are the same, there's nothing to do */
3098 if (strEQ(restore_to_locale, template_locale)) {
3099 Safefree(restore_to_locale);
3100
3101 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n",
3102 category_name(switch_category), restore_to_locale));
3103
3104 return NULL;
3105 }
3106
3107 /* Finally, change the locale to the template one */
3108 if (! do_setlocale_r(switch_category, template_locale)) {
3109 Perl_croak(aTHX_
3110 "panic: %s: %d: Could not change %s locale to %s, errno=%d\n",
3111 __FILE__, __LINE__, category_name(switch_category),
3112 template_locale, errno);
3113 }
3114
3115 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale switched to %s\n",
3116 category_name(switch_category), template_locale));
3117
3118 return restore_to_locale;
3119}
3120
3121STATIC void
3122S_restore_switched_locale(pTHX_ const int category, const char * const original_locale)
3123{
3124 /* Restores the locale for LC_'category' to 'original_locale', or do
3125 * nothing if the latter parameter is NULL */
3126
3127 if (original_locale == NULL) {
3128 return;
3129 }
3130
3131 if (! do_setlocale_r(category, original_locale)) {
3132 Perl_croak(aTHX_
3133 "panic: %s: %d: setlocale %s restore to %s failed, errno=%d\n",
3134 __FILE__, __LINE__,
3135 category_name(category), original_locale, errno);
3136 }
3137
3138 Safefree(original_locale);
3139}
3140
c1284011
KW
3141bool
3142Perl__is_cur_LC_category_utf8(pTHX_ int category)
7d74bb61
KW
3143{
3144 /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
3145 * otherwise. 'category' may not be LC_ALL. If the platform doesn't have
119ee68b 3146 * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
609548d2
KW
3147 * could give the wrong result. The result will very likely be correct for
3148 * languages that have commonly used non-ASCII characters, but for notably
3149 * English, it comes down to if the locale's name ends in something like
3150 * "UTF-8". It errs on the side of not being a UTF-8 locale. */
7d74bb61 3151
47280b20 3152 /* Name of current locale corresponding to the input category */
8de4332b 3153 const char *save_input_locale = NULL;
47280b20
KW
3154
3155 bool is_utf8 = FALSE; /* The return value */
7d74bb61 3156
47280b20
KW
3157 /* The variables below are for the cache of previous lookups using this
3158 * function. The cache is a C string, described at the definition for
3159 * 'C_and_POSIX_utf8ness'.
3160 *
3161 * The first part of the cache is fixed, for the C and POSIX locales. The
3162 * varying part starts just after them. */
3163 char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness);
3164
3165 Size_t utf8ness_cache_size; /* Size of the varying portion */
3166 Size_t input_name_len; /* Length in bytes of save_input_locale */
3167 Size_t input_name_len_with_overhead; /* plus extra chars used to store
3168 the name in the cache */
3169 char * delimited; /* The name plus the delimiters used to store
3170 it in the cache */
3171 char * name_pos; /* position of 'delimited' in the cache, or 0
3172 if not there */
3173
3174
7d4bcc4a
KW
3175# ifdef LC_ALL
3176
7d74bb61 3177 assert(category != LC_ALL);
7d4bcc4a
KW
3178
3179# endif
7d74bb61 3180
47280b20 3181 /* Get the desired category's locale */
837ce802 3182 save_input_locale = do_setlocale_r(category, NULL);
7d74bb61 3183 if (! save_input_locale) {
47280b20 3184 Perl_croak(aTHX_
d707d779
KW
3185 "panic: %s: %d: Could not find current %s locale, errno=%d\n",
3186 __FILE__, __LINE__, category_name(category), errno);
7d74bb61 3187 }
47280b20 3188
b07fffd1 3189 save_input_locale = stdize_locale(savepv(save_input_locale));
47280b20
KW
3190 DEBUG_L(PerlIO_printf(Perl_debug_log,
3191 "Current locale for %s is %s\n",
3192 category_name(category), save_input_locale));
3193
3194 input_name_len = strlen(save_input_locale);
3195
3196 /* In our cache, each name is accompanied by two delimiters and a single
3197 * utf8ness digit */
3198 input_name_len_with_overhead = input_name_len + 3;
3199
3200 /* Allocate and populate space for a copy of the name surrounded by the
3201 * delimiters */
3202 Newx(delimited, input_name_len_with_overhead, char);
3203 delimited[0] = UTF8NESS_SEP[0];
3204 Copy(save_input_locale, delimited + 1, input_name_len, char);
3205 delimited[input_name_len+1] = UTF8NESS_PREFIX[0];
3206 delimited[input_name_len+2] = '\0';
3207
3208 /* And see if that is in the cache */
3209 name_pos = instr(PL_locale_utf8ness, delimited);
3210 if (name_pos) {
3211 is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0';
3212
3213# ifdef DEBUGGING
3214
3215 if (DEBUG_Lv_TEST || debug_initialization) {
3216 PerlIO_printf(Perl_debug_log, "UTF8ness for locale %s=%d, \n",
3217 save_input_locale, is_utf8);
3218 }
3219
3220# endif
3221
3222 /* And, if not already in that position, move it to the beginning of
3223 * the non-constant portion of the list, since it is the most recently
3224 * used. (We don't have to worry about overflow, since just moving
3225 * existing names around) */
3226 if (name_pos > utf8ness_cache) {
3227 Move(utf8ness_cache,
3228 utf8ness_cache + input_name_len_with_overhead,
3229 name_pos - utf8ness_cache, char);
3230 Copy(delimited,
3231 utf8ness_cache,
3232 input_name_len_with_overhead - 1, char);
3233 utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
3234 }
3235
3236 Safefree(delimited);
b07fffd1 3237 Safefree(save_input_locale);
47280b20 3238 return is_utf8;
7d74bb61
KW
3239 }
3240
47280b20
KW
3241 /* Here we don't have stored the utf8ness for the input locale. We have to
3242 * calculate it */
3243
94646a69
KW
3244# if defined(USE_LOCALE_CTYPE) \
3245 && ( (defined(HAS_NL_LANGINFO) && defined(CODESET)) \
3246 || (defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)))
7d74bb61 3247
0dec74cd 3248 {
962aa53f
KW
3249 const char *original_ctype_locale
3250 = switch_category_locale_to_template(LC_CTYPE,
3251 category,
3252 save_input_locale);
69014004 3253
7d74bb61 3254 /* Here the current LC_CTYPE is set to the locale of the category whose
0dec74cd 3255 * information is desired. This means that nl_langinfo() and mbtowc()
1d958db2 3256 * should give the correct results */
119ee68b 3257
0dec74cd
KW
3258# ifdef MB_CUR_MAX /* But we can potentially rule out UTF-8ness, avoiding
3259 calling the functions if we have this */
3260
3261 /* Standard UTF-8 needs at least 4 bytes to represent the maximum
3262 * Unicode code point. */
3263
3264 DEBUG_L(PerlIO_printf(Perl_debug_log, "\tMB_CUR_MAX=%d\n",
3265 (int) MB_CUR_MAX));
3266 if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) {
3267 is_utf8 = FALSE;
b5b2847c
KW
3268 restore_switched_locale(LC_CTYPE, original_ctype_locale);
3269 goto finish_and_return;
0dec74cd
KW
3270 }
3271
3272# endif
7d4bcc4a
KW
3273# if defined(HAS_NL_LANGINFO) && defined(CODESET)
3274
0dec74cd
KW
3275 { /* The task is easiest if the platform has this POSIX 2001 function.
3276 Except on some platforms it can wrongly return "", so have to have
3277 a fallback. And it can return that it's UTF-8, even if there are
3278 variances from that. For example, Turkish locales may use the
3279 alternate dotted I rules, and sometimes it appears to be a
3280 defective locale definition. XXX We should probably check for
3281 these in the Latin1 range and warn */
c70a3e68
KW
3282 const char *codeset = my_nl_langinfo(PERL_CODESET, FALSE);
3283 /* FALSE => already in dest locale */
119ee68b 3284
c70a3e68
KW
3285 DEBUG_L(PerlIO_printf(Perl_debug_log,
3286 "\tnllanginfo returned CODESET '%s'\n", codeset));
3287
3288 if (codeset && strNE(codeset, "")) {
1d958db2 3289
1d075173
KW
3290 /* If the implementation of foldEQ() somehow were
3291 * to change to not go byte-by-byte, this could
3292 * read past end of string, as only one length is
3293 * checked. But currently, a premature NUL will
3294 * compare false, and it will stop there */
3295 is_utf8 = cBOOL( foldEQ(codeset, STR_WITH_LEN("UTF-8"))
3296 || foldEQ(codeset, STR_WITH_LEN("UTF8")));
1d958db2 3297
69014004
KW
3298 DEBUG_L(PerlIO_printf(Perl_debug_log,
3299 "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
3300 codeset, is_utf8));
b5b2847c
KW
3301 restore_switched_locale(LC_CTYPE, original_ctype_locale);
3302 goto finish_and_return;
1d958db2 3303 }
119ee68b
KW
3304 }
3305
7d4bcc4a 3306# endif
94646a69 3307# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
0dec74cd
KW
3308 /* We can see if this is a UTF-8-like locale if have mbtowc(). It was a
3309 * late adder to C89, so very likely to have it. However, testing has
3310 * shown that, like nl_langinfo() above, there are locales that are not
3311 * strictly UTF-8 that this will return that they are */
69014004 3312
0dec74cd 3313 {
119ee68b 3314 wchar_t wc;
51fc4b19
KW
3315 int len;
3316
94646a69
KW
3317# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS)
3318
3319 mbstate_t ps;
3320
3321# endif
3322
3323 /* mbrtowc() and mbtowc() convert a byte string to a wide
3324 * character. Feed a byte string to one of them and check that the
3325 * result is the expected Unicode code point */
3326
3327# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS)
3328 /* Prefer this function if available, as it's reentrant */
3329
3330 memset(&ps, 0, sizeof(ps));;
3331 PERL_UNUSED_RESULT(mbrtowc(&wc, NULL, 0, &ps)); /* Reset any shift
3332 state */
3333 errno = 0;
3334 len = mbrtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8), &ps);
3335
3336# else
0dec74cd 3337
856b881c 3338 PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
69014004 3339 errno = 0;
b1d4925c 3340 len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
51fc4b19 3341
94646a69
KW
3342# endif
3343
0dec74cd
KW
3344 DEBUG_L(PerlIO_printf(Perl_debug_log,
3345 "\treturn from mbtowc; len=%d; code_point=%x; errno=%d\n",
3346 len, (unsigned int) wc, errno));
b1d4925c 3347
0dec74cd
KW
3348 is_utf8 = cBOOL( len == STRLENs(REPLACEMENT_CHARACTER_UTF8)
3349 && wc == (wchar_t) UNICODE_REPLACEMENT);
119ee68b 3350 }
7d4bcc4a 3351
962aa53f 3352 restore_switched_locale(LC_CTYPE, original_ctype_locale);
47280b20 3353 goto finish_and_return;
7d74bb61 3354 }
119ee68b 3355
0dec74cd
KW
3356# endif
3357# else
7d74bb61 3358
0dec74cd
KW
3359 /* Here, we must have a C89 compiler that doesn't have mbtowc(). Next
3360 * try looking at the currency symbol to see if it disambiguates
3361 * things. Often that will be in the native script, and if the symbol
3362 * isn't in UTF-8, we know that the locale isn't. If it is non-ASCII
3363 * UTF-8, we infer that the locale is too, as the odds of a non-UTF8
3364 * string being valid UTF-8 are quite small */
fa9b773e 3365
7d4bcc4a
KW
3366# ifdef HAS_LOCALECONV
3367# ifdef USE_LOCALE_MONETARY
3368
9db16864
KW
3369 {
3370 const char *original_monetary_locale
3371 = switch_category_locale_to_template(LC_MONETARY,
3372 category,
3373 save_input_locale);
3374 bool only_ascii = FALSE;
3375 struct lconv* lc;
fa9b773e 3376
9db16864
KW
3377 /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of
3378 * the desired category, if it isn't that locale already */
97f4de96 3379
fa9b773e 3380
9db16864
KW
3381 /* Here the current LC_MONETARY is set to the locale of the category
3382 * whose information is desired. */
fa9b773e 3383
9db16864
KW
3384 lc = localeconv();
3385 if (! lc
3386 || ! lc->currency_symbol
3387 || is_utf8_invariant_string((U8 *) lc->currency_symbol, 0))
3388 {
3389 DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3390 only_ascii = TRUE;
3391 }
3392 else {
3393 is_utf8 = is_utf8_string((U8 *) lc->currency_symbol, 0);
3394 }
fa9b773e 3395
9db16864 3396 restore_switched_locale(LC_MONETARY, original_monetary_locale);
fa9b773e 3397
9db16864 3398 if (! only_ascii) {
fa9b773e 3399
9db16864
KW
3400 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
3401 * otherwise assume the locale is UTF-8 if and only if the symbol
3402 * is non-ascii UTF-8. */
3403 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
3404 save_input_locale, is_utf8));
3405 goto finish_and_return;
3406 }
13542a67 3407 }
fa9b773e 3408
7d4bcc4a
KW
3409# endif /* USE_LOCALE_MONETARY */
3410# endif /* HAS_LOCALECONV */
fa9b773e 3411
7d4bcc4a 3412# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
15f7e74e 3413
9db16864
KW
3414 /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
3415 * the names of the months and weekdays, timezone, and am/pm indicator */
3416 {
3417 const char *original_time_locale
3418 = switch_category_locale_to_template(LC_TIME,
3419 category,
3420 save_input_locale);
3421 int hour = 10;
3422 bool is_dst = FALSE;
3423 int dom = 1;
3424 int month = 0;
3425 int i;
3426 char * formatted_time;
3427
3428 /* Here the current LC_TIME is set to the locale of the category
3429 * whose information is desired. Look at all the days of the week and
3430 * month names, and the timezone and am/pm indicator for UTF-8 variant
3431 * characters. The first such a one found will tell us if the locale
3432 * is UTF-8 or not */
3433
3434 for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
3435 formatted_time = my_strftime("%A %B %Z %p",
3436 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
3437 if ( ! formatted_time
3438 || is_utf8_invariant_string((U8 *) formatted_time, 0))
3439 {
15f7e74e 3440
9db16864
KW
3441 /* Here, we didn't find a non-ASCII. Try the next time through
3442 * with the complemented dst and am/pm, and try with the next
3443 * weekday. After we have gotten all weekdays, try the next
3444 * month */
3445 is_dst = ! is_dst;
3446 hour = (hour + 12) % 24;
3447 dom++;
3448 if (i > 6) {
3449 month++;
3450 }
3451 continue;
15f7e74e 3452 }
9db16864
KW
3453
3454 /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
3455 * false otherwise. But first, restore LC_TIME to its original
3456 * locale if we changed it */
3457 restore_switched_locale(LC_TIME, original_time_locale);
3458
3459 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
3460 save_input_locale,
3461 is_utf8_string((U8 *) formatted_time, 0)));
3462 is_utf8 = is_utf8_string((U8 *) formatted_time, 0);
3463 goto finish_and_return;
15f7e74e
KW
3464 }
3465
9db16864
KW
3466 /* Falling off the end of the loop indicates all the names were just
3467 * ASCII. Go on to the next test. If we changed it, restore LC_TIME
3468 * to its original locale */
962aa53f 3469 restore_switched_locale(LC_TIME, original_time_locale);
9db16864 3470 DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
15f7e74e
KW
3471 }
3472
7d4bcc4a 3473# endif
15f7e74e 3474
7d4bcc4a 3475# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
855aeb93 3476
9db16864
KW
3477 /* This code is ifdefd out because it was found to not be necessary in testing
3478 * on our dromedary test machine, which has over 700 locales. There, this
3479 * added no value to looking at the currency symbol and the time strings. I
3480 * left it in so as to avoid rewriting it if real-world experience indicates
3481 * that dromedary is an outlier. Essentially, instead of returning abpve if we
3482 * haven't found illegal utf8, we continue on and examine all the strerror()
3483 * messages on the platform for utf8ness. If all are ASCII, we still don't
3484 * know the answer; but otherwise we have a pretty good indication of the
3485 * utf8ness. The reason this doesn't help much is that the messages may not
3486 * have been translated into the locale. The currency symbol and time strings
3487 * are much more likely to have been translated. */
3488 {
3489 int e;
3490 bool non_ascii = FALSE;
3491 const char *original_messages_locale
3492 = switch_category_locale_to_template(LC_MESSAGES,
3493 category,
3494 save_input_locale);
3495 const char * errmsg = NULL;
3496
3497 /* Here the current LC_MESSAGES is set to the locale of the category
3498 * whose information is desired. Look through all the messages. We
3499 * can't use Strerror() here because it may expand to code that
3500 * segfaults in miniperl */
3501
3502 for (e = 0; e <= sys_nerr; e++) {
3503 errno = 0;
3504 errmsg = sys_errlist[e];
3505 if (errno || !errmsg) {
3506 break;
3507 }
3508 errmsg = savepv(errmsg);
3509 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
3510 non_ascii = TRUE;
3511 is_utf8 = is_utf8_string((U8 *) errmsg, 0);
3512 break;
3513 }
855aeb93 3514 }
9db16864 3515 Safefree(errmsg);
855aeb93 3516
9db16864 3517 restore_switched_locale(LC_MESSAGES, original_messages_locale);
855aeb93 3518
9db16864 3519 if (non_ascii) {
5857e934 3520
9db16864
KW
3521 /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
3522 * any non-ascii means it is one; otherwise we assume it isn't */
3523 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
3524 save_input_locale,
3525 is_utf8));
3526 goto finish_and_return;
3527 }
855aeb93 3528
9db16864
KW
3529 DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3530 }
855aeb93 3531
7d4bcc4a 3532# endif
0dec74cd 3533# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
92c0a900 3534 UTF-8 locale */
7d4bcc4a 3535
97f4de96
KW
3536 /* As a last resort, look at the locale name to see if it matches
3537 * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
3538 * return of setlocale(), is actually defined to be opaque, so we can't
3539 * really rely on the absence of various substrings in the name to indicate
3540 * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
3541 * be a UTF-8 locale. Similarly for the other common names */
3542
0dec74cd 3543 {
c36d8df8 3544 const Size_t final_pos = strlen(save_input_locale) - 1;
0dec74cd 3545
c36d8df8
KW
3546 if (final_pos >= 3) {
3547 const char *name = save_input_locale;
97f4de96 3548
c36d8df8
KW
3549 /* Find next 'U' or 'u' and look from there */
3550 while ((name += strcspn(name, "Uu") + 1)
3551 <= save_input_locale + final_pos - 2)
97f4de96 3552 {
c36d8df8
KW
3553 if ( isALPHA_FOLD_NE(*name, 't')
3554 || isALPHA_FOLD_NE(*(name + 1), 'f'))
3555 {
3556 continue;
3557 }
3558 name += 2;
3559 if (*(name) == '-') {
3560 if ((name > save_input_locale + final_pos - 1)) {
3561 break;
3562 }
3563 name++;
3564 }
3565 if (*(name) == '8') {
3566 DEBUG_L(PerlIO_printf(Perl_debug_log,
3567 "Locale %s ends with UTF-8 in name\n",
3568 save_input_locale));
3569 is_utf8 = TRUE;
3570 goto finish_and_return;
97f4de96 3571 }
97f4de96 3572 }
c36d8df8
KW
3573 DEBUG_L(PerlIO_printf(Perl_debug_log,
3574 "Locale %s doesn't end with UTF-8 in name\n",
3575 save_input_locale));
97f4de96 3576 }
97f4de96 3577
4d8d465a 3578# ifdef WIN32
7d4bcc4a 3579
c36d8df8
KW
3580 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
3581 if (memENDs(save_input_locale, final_pos, "65001")) {
3582 DEBUG_L(PerlIO_printf(Perl_debug_log,
8a0832a1 3583 "Locale %s ends with 65001 in name, is UTF-8 locale\n",
97f4de96 3584 save_input_locale));
c36d8df8
KW
3585 is_utf8 = TRUE;
3586 goto finish_and_return;
3587 }
0dec74cd 3588 }
7d4bcc4a 3589
4d8d465a
KW
3590# endif
3591# endif
97f4de96
KW
3592
3593 /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
3594 * since we are about to return FALSE anyway, there is no point in doing
3595 * this extra work */
7d4bcc4a 3596
0dec74cd 3597# if 0
97f4de96
KW
3598 if (instr(save_input_locale, "8859")) {
3599 DEBUG_L(PerlIO_printf(Perl_debug_log,
3600 "Locale %s has 8859 in name, not UTF-8 locale\n",
3601 save_input_locale));
47280b20
KW
3602 is_utf8 = FALSE;
3603 goto finish_and_return;
97f4de96 3604 }
0dec74cd 3605# endif
97f4de96 3606
69014004
KW
3607 DEBUG_L(PerlIO_printf(Perl_debug_log,
3608 "Assuming locale %s is not a UTF-8 locale\n",
3609 save_input_locale));
47280b20
KW
3610 is_utf8 = FALSE;
3611
0dec74cd
KW
3612# endif /* the code that is compiled when no modern LC_CTYPE */
3613
47280b20
KW
3614 finish_and_return:
3615
3616 /* Cache this result so we don't have to go through all this next time. */
3617 utf8ness_cache_size = sizeof(PL_locale_utf8ness)
3618 - (utf8ness_cache - PL_locale_utf8ness);
3619
3620 /* But we can't save it if it is too large for the total space available */
3621 if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) {
3622 Size_t utf8ness_cache_len = strlen(utf8ness_cache);
3623
3624 /* Here it can fit, but we may need to clear out the oldest cached
3625 * result(s) to do so. Check */
3626 if (utf8ness_cache_len + input_name_len_with_overhead
3627 >= utf8ness_cache_size)
3628 {
3629 /* Here we have to clear something out to make room for this.
3630 * Start looking at the rightmost place where it could fit and find
3631 * the beginning of the entry that extends past that. */
3632 char * cutoff = (char *) my_memrchr(utf8ness_cache,
3633 UTF8NESS_SEP[0],
3634 utf8ness_cache_size
3635 - input_name_len_with_overhead);
3636
3637 assert(cutoff);
3638 assert(cutoff >= utf8ness_cache);
3639
3640 /* This and all subsequent entries must be removed */
3641 *cutoff = '\0';
3642 utf8ness_cache_len = strlen(utf8ness_cache);
3643 }
3644
3645 /* Make space for the new entry */
3646 Move(utf8ness_cache,
3647 utf8ness_cache + input_name_len_with_overhead,
3648 utf8ness_cache_len + 1 /* Incl. trailing NUL */, char);
3649
3650 /* And insert it */
3651 Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char);
3652 utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
3653
3654 if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1]
3655 & (PERL_UINTMAX_T) ~1) != '0')
3656 {
3657 Perl_croak(aTHX_
3658 "panic: %s: %d: Corrupt utf8ness_cache=%s\nlen=%u,"
3659 " inserted_name=%s, its_len=%u\n",
3660 __FILE__, __LINE__,
3661 PL_locale_utf8ness, strlen(PL_locale_utf8ness),
3662 delimited, input_name_len_with_overhead);
3663 }
3664 }
3665
3666# ifdef DEBUGGING
3667
3668 if (DEBUG_Lv_TEST || debug_initialization) {
3669 PerlIO_printf(Perl_debug_log,
3670 "PL_locale_utf8ness is now %s; returning %d\n",
3671 PL_locale_utf8ness, is_utf8);
3672 }
3673
3674# endif
3675
3676 Safefree(delimited);
fa9b773e 3677 Safefree(save_input_locale);
47280b20 3678 return is_utf8;
7d74bb61
KW
3679}
3680
8ef6e574 3681#endif
7d74bb61 3682
d6ded950
KW
3683bool
3684Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
3685{
1a4f13e1 3686 dVAR;
d6ded950
KW
3687 /* Internal function which returns if we are in the scope of a pragma that
3688 * enables the locale category 'category'. 'compiling' should indicate if
3689 * this is during the compilation phase (TRUE) or not (FALSE). */
3690
3691 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
3692
3693 SV *categories = cop_hints_fetch_pvs(cop, "locale", 0);
3694 if (! categories || categories == &PL_sv_placeholder) {
3695 return FALSE;
3696 }
3697
3698 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
3699 * a valid unsigned */
3700 assert(category >= -1);
3701 return cBOOL(SvUV(categories) & (1U << (category + 1)));
3702}
3703
2c6ee1a7 3704char *
6ebbc862
KW
3705Perl_my_strerror(pTHX_ const int errnum)
3706{
3707 /* Returns a mortalized copy of the text of the error message associated
3708 * with 'errnum'. It uses the current locale's text unless the platform
3709 * doesn't have the LC_MESSAGES category or we are not being called from
3710 * within the scope of 'use locale'. In the former case, it uses whatever
3711 * strerror returns; in the latter case it uses the text from the C locale.
3712 *
3713 * The function just calls strerror(), but temporarily switches, if needed,
3714 * to the C locale */
3715
3716 char *errstr;
52770946 3717 dVAR;
6ebbc862 3718
52770946 3719#ifndef USE_LOCALE_MESSAGES
6ebbc862 3720
52770946
KW
3721 /* If platform doesn't have messages category, we don't do any switching to
3722 * the C locale; we just use whatever strerror() returns */
3723
3724 errstr = savepv(Strerror(errnum));
3725
3726#else /* Has locale messages */
3727
3728 const bool within_locale_scope = IN_LC(LC_MESSAGES);
2c6ee1a7 3729
7aaa36b1
KW
3730# if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
3731
43cb6651
KW
3732 /* This function is trivial if we don't have to worry about thread safety
3733 * and have strerror_l(), as it handles the switch of locales so we don't
3734 * have to deal with that. We don't have to worry about thread safety if
3735 * this is an unthreaded build, or if strerror_r() is also available. Both
3736 * it and strerror_l() are thread-safe. Plain strerror() isn't thread
3737 * safe. But on threaded builds when strerror_r() is available, the
3738 * apparent call to strerror() below is actually a macro that
3739 * behind-the-scenes calls strerror_r().
3740 */
3741
3742# if ! defined(USE_ITHREADS) || defined(HAS_STRERROR_R)
7aaa36b1
KW
3743
3744 if (within_locale_scope) {
4eb27fc5 3745 errstr = savepv(strerror(errnum));
7aaa36b1
KW
3746 }
3747 else {
4eb27fc5 3748 errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
7aaa36b1
KW
3749 }
3750
43cb6651
KW
3751# else
3752
3753 /* Here we have strerror_l(), but not strerror_r() and we are on a
3754 * threaded-build. We use strerror_l() for everything, constructing a
3755 * locale to pass to it if necessary */
3756
3757 bool do_free = FALSE;
3758 locale_t locale_to_use;
3759
3760 if (within_locale_scope) {
3761 locale_to_use = uselocale((locale_t) 0);
3762 if (locale_to_use == LC_GLOBAL_LOCALE) {
3763 locale_to_use = duplocale(LC_GLOBAL_LOCALE);
3764 do_free = TRUE;