This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
locale.c: Rmv unnecessary locale toggle
[perl5.git] / locale.c
CommitLineData
98994639
HS
1/* locale.c
2 *
1129b882
NC
3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
98994639
HS
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550 12 * A Elbereth Gilthoniel,
cdad3b53 13 * silivren penna míriel
4ac71550 14 * o menel aglar elenath!
cdad3b53 15 * Na-chaered palan-díriel
4ac71550
TC
16 * o galadhremmin ennorath,
17 * Fanuilos, le linnathon
18 * nef aear, si nef aearon!
19 *
20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
98994639
HS
21 */
22
166f8a29
DM
23/* utility functions for handling locale-specific stuff like what
24 * character represents the decimal point.
0d071d52 25 *
7d4bcc4a
KW
26 * All C programs have an underlying locale. Perl code generally doesn't pay
27 * any attention to it except within the scope of a 'use locale'. For most
0d071d52
KW
28 * categories, it accomplishes this by just using different operations if it is
29 * in such scope than if not. However, various libc functions called by Perl
30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31 * are used to toggle between the current locale and the C locale depending on
a9ad02a8
KW
32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is
33 * switched to the C locale for outputting the message unless within the scope
34 * of 'use locale'.
166f8a29
DM
35 */
36
98994639
HS
37#include "EXTERN.h"
38#define PERL_IN_LOCALE_C
f7416781 39#include "perl_langinfo.h"
98994639
HS
40#include "perl.h"
41
a4af207c
JH
42#include "reentr.h"
43
0dec74cd
KW
44#ifdef I_WCHAR
45# include <wchar.h>
46#endif
47
2fcc0ca9
KW
48/* If the environment says to, we can output debugging information during
49 * initialization. This is done before option parsing, and before any thread
50 * creation, so can be a file-level static */
5a4b0634
KW
51#if ! defined(DEBUGGING) || defined(PERL_GLOBAL_STRUCT)
52# define debug_initialization 0
53# define DEBUG_INITIALIZATION_set(v)
54#else
2fcc0ca9 55static bool debug_initialization = FALSE;
5a4b0634 56# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
2fcc0ca9
KW
57#endif
58
291a84fb
KW
59/* strlen() of a literal string constant. We might want this more general,
60 * but using it in just this file for now. A problem with more generality is
61 * the compiler warnings about comparing unlike signs */
ff1b739b
KW
62#define STRLENs(s) (sizeof("" s "") - 1)
63
63e5b0d7
KW
64/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
65 * return of setlocale(), then this is extremely likely to be the C or POSIX
66 * locale. However, the output of setlocale() is documented to be opaque, but
67 * the odds are extremely small that it would return these two strings for some
68 * other locale. Note that VMS in these two locales includes many non-ASCII
69 * characters as controls and punctuation (below are hex bytes):
70 * cntrl: 84-97 9B-9F
71 * punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
72 * Oddly, none there are listed as alphas, though some represent alphabetics
73 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
74#define isNAME_C_OR_POSIX(name) \
75 ( (name) != NULL \
76 && (( *(name) == 'C' && (*(name + 1)) == '\0') \
77 || strEQ((name), "POSIX")))
78
8ef6e574
KW
79#ifdef USE_LOCALE
80
47280b20
KW
81/* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far
82 * looked up. This is in the form of a C string: */
83
84#define UTF8NESS_SEP "\v"
85#define UTF8NESS_PREFIX "\f"
86
87/* So, the string looks like:
98994639 88 *
47280b20 89 * \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0
98994639 90 *
47280b20
KW
91 * where the digit 0 after the \a indicates that the locale starting just
92 * after the preceding \v is not UTF-8, and the digit 1 mean it is. */
93
94STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1);
95STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1);
96
97#define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \
98 UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0"
99
100/* The cache is initialized to C_and_POSIX_utf8ness at start up. These are
101 * kept there always. The remining portion of the cache is LRU, with the
102 * oldest looked-up locale at the tail end */
103
98994639
HS
104STATIC char *
105S_stdize_locale(pTHX_ char *locs)
106{
47280b20
KW
107 /* Standardize the locale name from a string returned by 'setlocale',
108 * possibly modifying that string.
109 *
110 * The typical return value of setlocale() is either
111 * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
112 * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
113 * (the space-separated values represent the various sublocales,
114 * in some unspecified order). This is not handled by this function.
115 *
116 * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
117 * which is harmful for further use of the string in setlocale(). This
118 * function removes the trailing new line and everything up through the '='
119 * */
120
7452cf6a 121 const char * const s = strchr(locs, '=');
98994639
HS
122 bool okay = TRUE;
123
7918f24d
NC
124 PERL_ARGS_ASSERT_STDIZE_LOCALE;
125
8772537c
AL
126 if (s) {
127 const char * const t = strchr(s, '.');
98994639 128 okay = FALSE;
8772537c
AL
129 if (t) {
130 const char * const u = strchr(t, '\n');
131 if (u && (u[1] == 0)) {
132 const STRLEN len = u - s;
133 Move(s + 1, locs, len, char);
134 locs[len] = 0;
135 okay = TRUE;
98994639
HS
136 }
137 }
138 }
139
140 if (!okay)
141 Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
142
143 return locs;
144}
145
e5f10d49
KW
146/* Two parallel arrays; first the locale categories Perl uses on this system;
147 * the second array is their names. These arrays are in mostly arbitrary
148 * order. */
149
150const int categories[] = {
151
152# ifdef USE_LOCALE_NUMERIC
153 LC_NUMERIC,
154# endif
155# ifdef USE_LOCALE_CTYPE
156 LC_CTYPE,
157# endif
158# ifdef USE_LOCALE_COLLATE
159 LC_COLLATE,
160# endif
161# ifdef USE_LOCALE_TIME
162 LC_TIME,
163# endif
164# ifdef USE_LOCALE_MESSAGES
165 LC_MESSAGES,
166# endif
167# ifdef USE_LOCALE_MONETARY
168 LC_MONETARY,
169# endif
9821811f
KW
170# ifdef USE_LOCALE_ADDRESS
171 LC_ADDRESS,
172# endif
173# ifdef USE_LOCALE_IDENTIFICATION
174 LC_IDENTIFICATION,
175# endif
176# ifdef USE_LOCALE_MEASUREMENT
177 LC_MEASUREMENT,
178# endif
179# ifdef USE_LOCALE_PAPER
180 LC_PAPER,
181# endif
182# ifdef USE_LOCALE_TELEPHONE
183 LC_TELEPHONE,
184# endif
e5f10d49
KW
185# ifdef LC_ALL
186 LC_ALL,
187# endif
188 -1 /* Placeholder because C doesn't allow a
189 trailing comma, and it would get complicated
190 with all the #ifdef's */
191};
192
193/* The top-most real element is LC_ALL */
194
195const char * category_names[] = {
196
197# ifdef USE_LOCALE_NUMERIC
198 "LC_NUMERIC",
199# endif
200# ifdef USE_LOCALE_CTYPE
201 "LC_CTYPE",
202# endif
203# ifdef USE_LOCALE_COLLATE
204 "LC_COLLATE",
205# endif
206# ifdef USE_LOCALE_TIME
207 "LC_TIME",
208# endif
209# ifdef USE_LOCALE_MESSAGES
210 "LC_MESSAGES",
211# endif
212# ifdef USE_LOCALE_MONETARY
213 "LC_MONETARY",
214# endif
9821811f
KW
215# ifdef USE_LOCALE_ADDRESS
216 "LC_ADDRESS",
217# endif
218# ifdef USE_LOCALE_IDENTIFICATION
219 "LC_IDENTIFICATION",
220# endif
221# ifdef USE_LOCALE_MEASUREMENT
222 "LC_MEASUREMENT",
223# endif
224# ifdef USE_LOCALE_PAPER
225 "LC_PAPER",
226# endif
227# ifdef USE_LOCALE_TELEPHONE
228 "LC_TELEPHONE",
229# endif
e5f10d49
KW
230# ifdef LC_ALL
231 "LC_ALL",
232# endif
233 NULL /* Placeholder */
234 };
235
236# ifdef LC_ALL
237
238 /* On systems with LC_ALL, it is kept in the highest index position. (-2
239 * to account for the final unused placeholder element.) */
240# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
241
242# else
243
244 /* On systems without LC_ALL, we pretend it is there, one beyond the real
245 * top element, hence in the unused placeholder element. */
246# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
247
248# endif
249
250/* Pretending there is an LC_ALL element just above allows us to avoid most
251 * special cases. Most loops through these arrays in the code below are
252 * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'. They will work
253 * on either type of system. But the code must be written to not access the
948523db
KW
254 * element at 'LC_ALL_INDEX' except on platforms that have it. This can be
255 * checked for at compile time by using the #define LC_ALL_INDEX which is only
256 * defined if we do have LC_ALL. */
e5f10d49 257
b09aaf40
KW
258STATIC const char *
259S_category_name(const int category)
260{
261 unsigned int i;
262
263#ifdef LC_ALL
264
265 if (category == LC_ALL) {
266 return "LC_ALL";
267 }
268
269#endif
270
271 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
272 if (category == categories[i]) {
273 return category_names[i];
274 }
275 }
276
277 {
278 const char suffix[] = " (unknown)";
279 int temp = category;
280 Size_t length = sizeof(suffix) + 1;
281 char * unknown;
282 dTHX;
283
284 if (temp < 0) {
285 length++;
286 temp = - temp;
287 }
288
289 /* Calculate the number of digits */
290 while (temp >= 10) {
291 temp /= 10;
292 length++;
293 }
294
295 Newx(unknown, length, char);
296 my_snprintf(unknown, length, "%d%s", category, suffix);
297 SAVEFREEPV(unknown);
298 return unknown;
299 }
300}
301
948523db
KW
302/* Now create LC_foo_INDEX #defines for just those categories on this system */
303# ifdef USE_LOCALE_NUMERIC
304# define LC_NUMERIC_INDEX 0
305# define _DUMMY_NUMERIC LC_NUMERIC_INDEX
306# else
307# define _DUMMY_NUMERIC -1
308# endif
309# ifdef USE_LOCALE_CTYPE
310# define LC_CTYPE_INDEX _DUMMY_NUMERIC + 1
311# define _DUMMY_CTYPE LC_CTYPE_INDEX
312# else
313# define _DUMMY_CTYPE _DUMMY_NUMERIC
314# endif
315# ifdef USE_LOCALE_COLLATE
316# define LC_COLLATE_INDEX _DUMMY_CTYPE + 1
317# define _DUMMY_COLLATE LC_COLLATE_INDEX
318# else
319# define _DUMMY_COLLATE _DUMMY_COLLATE
320# endif
321# ifdef USE_LOCALE_TIME
322# define LC_TIME_INDEX _DUMMY_COLLATE + 1
323# define _DUMMY_TIME LC_TIME_INDEX
324# else
325# define _DUMMY_TIME _DUMMY_COLLATE
326# endif
327# ifdef USE_LOCALE_MESSAGES
328# define LC_MESSAGES_INDEX _DUMMY_TIME + 1
329# define _DUMMY_MESSAGES LC_MESSAGES_INDEX
330# else
331# define _DUMMY_MESSAGES _DUMMY_TIME
332# endif
333# ifdef USE_LOCALE_MONETARY
334# define LC_MONETARY_INDEX _DUMMY_MESSAGES + 1
335# define _DUMMY_MONETARY LC_MONETARY_INDEX
336# else
337# define _DUMMY_MONETARY _DUMMY_MESSAGES
338# endif
9821811f
KW
339# ifdef USE_LOCALE_ADDRESS
340# define LC_ADDRESS_INDEX _DUMMY_MONETARY + 1
341# define _DUMMY_ADDRESS LC_ADDRESS_INDEX
342# else
343# define _DUMMY_ADDRESS _DUMMY_MONETARY
344# endif
345# ifdef USE_LOCALE_IDENTIFICATION
346# define LC_IDENTIFICATION_INDEX _DUMMY_ADDRESS + 1
347# define _DUMMY_IDENTIFICATION LC_IDENTIFICATION_INDEX
348# else
349# define _DUMMY_IDENTIFICATION _DUMMY_ADDRESS
350# endif
351# ifdef USE_LOCALE_MEASUREMENT
352# define LC_MEASUREMENT_INDEX _DUMMY_IDENTIFICATION + 1
353# define _DUMMY_MEASUREMENT LC_MEASUREMENT_INDEX
354# else
355# define _DUMMY_MEASUREMENT _DUMMY_IDENTIFICATION
356# endif
357# ifdef USE_LOCALE_PAPER
358# define LC_PAPER_INDEX _DUMMY_MEASUREMENT + 1
359# define _DUMMY_PAPER LC_PAPER_INDEX
360# else
361# define _DUMMY_PAPER _DUMMY_MEASUREMENT
362# endif
363# ifdef USE_LOCALE_TELEPHONE
364# define LC_TELEPHONE_INDEX _DUMMY_PAPER + 1
365# define _DUMMY_TELEPHONE LC_TELEPHONE_INDEX
366# else
367# define _DUMMY_TELEPHONE _DUMMY_PAPER
368# endif
948523db 369# ifdef LC_ALL
9821811f 370# define LC_ALL_INDEX _DUMMY_TELEPHONE + 1
948523db
KW
371# endif
372#endif /* ifdef USE_LOCALE */
8ef6e574 373
d2b24094
KW
374/* Windows requres a customized base-level setlocale() */
375# ifdef WIN32
376# define my_setlocale(cat, locale) win32_setlocale(cat, locale)
377# else
378# define my_setlocale(cat, locale) setlocale(cat, locale)
379# endif
380
837ce802
KW
381/* Just placeholders for now. "_c" is intended to be called when the category
382 * is a constant known at compile time; "_r", not known until run time */
d2b24094
KW
383# define do_setlocale_c(category, locale) my_setlocale(category, locale)
384# define do_setlocale_r(category, locale) my_setlocale(category, locale)
837ce802 385
a4f00dcc 386STATIC void
86799d2d 387S_set_numeric_radix(pTHX_ const bool use_locale)
98994639 388{
86799d2d
KW
389 /* If 'use_locale' is FALSE, set to use a dot for the radix character. If
390 * TRUE, use the radix character derived from the current locale */
7d4bcc4a 391
86799d2d
KW
392#if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \
393 || defined(HAS_NL_LANGINFO))
98994639 394
87f8e8e7
KW
395 const char * radix = (use_locale)
396 ? my_nl_langinfo(PERL_RADIXCHAR, FALSE)
397 /* FALSE => already in dest locale */
398 : ".";
2213a3be 399
87f8e8e7 400 sv_setpv(PL_numeric_radix_sv, radix);
86799d2d 401
87f8e8e7
KW
402 /* If this is valid UTF-8 that isn't totally ASCII, and we are in
403 * a UTF-8 locale, then mark the radix as being in UTF-8 */
404 if (is_utf8_non_invariant_string((U8 *) SvPVX(PL_numeric_radix_sv),
7a393424 405 SvCUR(PL_numeric_radix_sv))
87f8e8e7
KW
406 && _is_cur_LC_category_utf8(LC_NUMERIC))
407 {
408 SvUTF8_on(PL_numeric_radix_sv);
409 }
86799d2d
KW
410
411# ifdef DEBUGGING
7d4bcc4a 412
2fcc0ca9
KW
413 if (DEBUG_L_TEST || debug_initialization) {
414 PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
3ca88433
KW
415 SvPVX(PL_numeric_radix_sv),
416 cBOOL(SvUTF8(PL_numeric_radix_sv)));
2fcc0ca9 417 }
69014004 418
86799d2d
KW
419# endif
420#endif /* USE_LOCALE_NUMERIC and can find the radix char */
7d4bcc4a 421
98994639
HS
422}
423
a39edc4c 424
98994639 425void
8772537c 426Perl_new_numeric(pTHX_ const char *newnum)
98994639 427{
7d4bcc4a
KW
428
429#ifndef USE_LOCALE_NUMERIC
430
431 PERL_UNUSED_ARG(newnum);
432
433#else
0d071d52 434
291a84fb 435 /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell
0d071d52
KW
436 * core Perl this and that 'newnum' is the name of the new locale.
437 * It installs this locale as the current underlying default.
438 *
439 * The default locale and the C locale can be toggled between by use of the
5792c642
KW
440 * set_numeric_underlying() and set_numeric_standard() functions, which
441 * should probably not be called directly, but only via macros like
0d071d52
KW
442 * SET_NUMERIC_STANDARD() in perl.h.
443 *
444 * The toggling is necessary mainly so that a non-dot radix decimal point
445 * character can be output, while allowing internal calculations to use a
446 * dot.
447 *
448 * This sets several interpreter-level variables:
bb304765 449 * PL_numeric_name The underlying locale's name: a copy of 'newnum'
892e6465 450 * PL_numeric_underlying A boolean indicating if the toggled state is such
7738054c
KW
451 * that the current locale is the program's underlying
452 * locale
453 * PL_numeric_standard An int indicating if the toggled state is such
4c68b815
KW
454 * that the current locale is the C locale or
455 * indistinguishable from the C locale. If non-zero, it
456 * is in C; if > 1, it means it may not be toggled away
7738054c 457 * from C.
4c68b815
KW
458 * PL_numeric_underlying_is_standard A bool kept by this function
459 * indicating that the underlying locale and the standard
460 * C locale are indistinguishable for the purposes of
461 * LC_NUMERIC. This happens when both of the above two
462 * variables are true at the same time. (Toggling is a
463 * no-op under these circumstances.) This variable is
464 * used to avoid having to recalculate.
0d071d52
KW
465 * Any code changing the locale (outside this file) should use
466 * POSIX::setlocale, which calls this function. Therefore this function
467 * should be called directly only from this file and from
468 * POSIX::setlocale() */
469
b03f34cf 470 char *save_newnum;
98994639
HS
471
472 if (! newnum) {
43c5f42d
NC
473 Safefree(PL_numeric_name);
474 PL_numeric_name = NULL;
98994639 475 PL_numeric_standard = TRUE;
892e6465 476 PL_numeric_underlying = TRUE;
4c68b815 477 PL_numeric_underlying_is_standard = TRUE;
98994639
HS
478 return;
479 }
480
b03f34cf 481 save_newnum = stdize_locale(savepv(newnum));
892e6465 482 PL_numeric_underlying = TRUE;
4c68b815
KW
483 PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
484
485 /* If its name isn't C nor POSIX, it could still be indistinguishable from
486 * them */
487 if (! PL_numeric_standard) {
488 PL_numeric_standard = cBOOL(strEQ(".", my_nl_langinfo(PERL_RADIXCHAR,
489 FALSE /* Don't toggle locale */ ))
490 && strEQ("", my_nl_langinfo(PERL_THOUSEP,
491 FALSE)));
492 }
abe1abcf 493
4c68b815 494 /* Save the new name if it isn't the same as the previous one, if any */
b03f34cf 495 if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
98994639 496 Safefree(PL_numeric_name);
b03f34cf 497 PL_numeric_name = save_newnum;
b03f34cf 498 }
abe1abcf
KW
499 else {
500 Safefree(save_newnum);
501 }
4c28b29c 502
4c68b815
KW
503 PL_numeric_underlying_is_standard = PL_numeric_standard;
504
7e5377f7 505# ifdef HAS_POSIX_2008_LOCALE
e1aa2579
KW
506
507 PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
508 PL_numeric_name,
509 PL_underlying_numeric_obj);
510
511#endif
512
4c68b815
KW
513 if (DEBUG_L_TEST || debug_initialization) {
514 PerlIO_printf(Perl_debug_log, "Called new_numeric with %s, PL_numeric_name=%s\n", newnum, PL_numeric_name);
515 }
516
4c28b29c
KW
517 /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't
518 * have to worry about the radix being a non-dot. (Core operations that
519 * need the underlying locale change to it temporarily). */
520 set_numeric_standard();
521
98994639 522#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 523
98994639
HS
524}
525
526void
527Perl_set_numeric_standard(pTHX)
528{
7d4bcc4a 529
98994639 530#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 531
28c1bf33
KW
532 /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like
533 * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The
534 * macro avoids calling this routine if toggling isn't necessary according
535 * to our records (which could be wrong if some XS code has changed the
536 * locale behind our back) */
0d071d52 537
837ce802 538 do_setlocale_c(LC_NUMERIC, "C");
a9b8c0d8 539 PL_numeric_standard = TRUE;
4c68b815 540 PL_numeric_underlying = PL_numeric_underlying_is_standard;
86799d2d 541 set_numeric_radix(0);
7d4bcc4a
KW
542
543# ifdef DEBUGGING
544
2fcc0ca9
KW
545 if (DEBUG_L_TEST || debug_initialization) {
546 PerlIO_printf(Perl_debug_log,
58e4a467 547 "LC_NUMERIC locale now is standard C\n");
2fcc0ca9 548 }
98994639 549
7d4bcc4a 550# endif
98994639 551#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 552
98994639
HS
553}
554
555void
5792c642 556Perl_set_numeric_underlying(pTHX)
98994639 557{
7d4bcc4a 558
98994639 559#ifdef USE_LOCALE_NUMERIC
7d4bcc4a 560
28c1bf33 561 /* Toggle the LC_NUMERIC locale to the current underlying default. Most
7d4bcc4a
KW
562 * code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h
563 * instead of calling this directly. The macro avoids calling this routine
564 * if toggling isn't necessary according to our records (which could be
565 * wrong if some XS code has changed the locale behind our back) */
a9b8c0d8 566
837ce802 567 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
4c68b815 568 PL_numeric_standard = PL_numeric_underlying_is_standard;
892e6465 569 PL_numeric_underlying = TRUE;
3c62fd46 570 set_numeric_radix(! PL_numeric_standard);
7d4bcc4a
KW
571
572# ifdef DEBUGGING
573
2fcc0ca9
KW
574 if (DEBUG_L_TEST || debug_initialization) {
575 PerlIO_printf(Perl_debug_log,
58e4a467 576 "LC_NUMERIC locale now is %s\n",
2fcc0ca9
KW
577 PL_numeric_name);
578 }
98994639 579
7d4bcc4a 580# endif
98994639 581#endif /* USE_LOCALE_NUMERIC */
7d4bcc4a 582
98994639
HS
583}
584
585/*
586 * Set up for a new ctype locale.
587 */
a4f00dcc
KW
588STATIC void
589S_new_ctype(pTHX_ const char *newctype)
98994639 590{
7d4bcc4a
KW
591
592#ifndef USE_LOCALE_CTYPE
593
594 PERL_ARGS_ASSERT_NEW_CTYPE;
595 PERL_UNUSED_ARG(newctype);
596 PERL_UNUSED_CONTEXT;
597
598#else
0d071d52 599
291a84fb 600 /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
0d071d52
KW
601 * core Perl this and that 'newctype' is the name of the new locale.
602 *
603 * This function sets up the folding arrays for all 256 bytes, assuming
604 * that tofold() is tolc() since fold case is not a concept in POSIX,
605 *
606 * Any code changing the locale (outside this file) should use
607 * POSIX::setlocale, which calls this function. Therefore this function
608 * should be called directly only from this file and from
609 * POSIX::setlocale() */
610
27da23d5 611 dVAR;
68067e4e 612 UV i;
98994639 613
7918f24d
NC
614 PERL_ARGS_ASSERT_NEW_CTYPE;
615
215c5139
KW
616 /* We will replace any bad locale warning with 1) nothing if the new one is
617 * ok; or 2) a new warning for the bad new locale */
618 if (PL_warn_locale) {
619 SvREFCNT_dec_NN(PL_warn_locale);
620 PL_warn_locale = NULL;
621 }
622
c1284011 623 PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
31f05a37
KW
624
625 /* A UTF-8 locale gets standard rules. But note that code still has to
626 * handle this specially because of the three problematic code points */
627 if (PL_in_utf8_CTYPE_locale) {
628 Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
629 }
630 else {
8c6180a9
KW
631 /* Assume enough space for every character being bad. 4 spaces each
632 * for the 94 printable characters that are output like "'x' "; and 5
633 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
634 * NUL */
635 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ];
636
cc9eaeb0
KW
637 /* Don't check for problems if we are suppressing the warnings */
638 bool check_for_problems = ckWARN_d(WARN_LOCALE)
639 || UNLIKELY(DEBUG_L_TEST);
8c6180a9
KW
640 bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
641 to start */
642 unsigned int bad_count = 0; /* Count of bad characters */
643
baa60164 644 for (i = 0; i < 256; i++) {
5a8a8f21
KW
645 if (isupper(i))
646 PL_fold_locale[i] = (U8) tolower(i);
647 else if (islower(i))
648 PL_fold_locale[i] = (U8) toupper(i);
baa60164
KW
649 else
650 PL_fold_locale[i] = (U8) i;
8c6180a9
KW
651
652 /* If checking for locale problems, see if the native ASCII-range
653 * printables plus \n and \t are in their expected categories in
654 * the new locale. If not, this could mean big trouble, upending
655 * Perl's and most programs' assumptions, like having a
656 * metacharacter with special meaning become a \w. Fortunately,
657 * it's very rare to find locales that aren't supersets of ASCII
658 * nowadays. It isn't a problem for most controls to be changed
659 * into something else; we check only \n and \t, though perhaps \r
660 * could be an issue as well. */
7d4bcc4a 661 if ( check_for_problems
8c6180a9
KW
662 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
663 {
ef80cd99 664 if ( cBOOL(isalnum(i)) != cBOOL(isALPHANUMERIC_A(i))
adf947ba
KW
665 || cBOOL(isalpha(i)) != cBOOL(isALPHA_A(i))
666 || cBOOL(isdigit(i)) != cBOOL(isDIGIT_A(i))
667 || cBOOL(isgraph(i)) != cBOOL(isGRAPH_A(i))
668 || cBOOL(islower(i)) != cBOOL(isLOWER_A(i))
669 || cBOOL(isprint(i)) != cBOOL(isPRINT_A(i))
670 || cBOOL(ispunct(i)) != cBOOL(isPUNCT_A(i))
671 || cBOOL(isspace(i)) != cBOOL(isSPACE_A(i))
672 || cBOOL(isupper(i)) != cBOOL(isUPPER_A(i))
673 || cBOOL(isxdigit(i))!= cBOOL(isXDIGIT_A(i))
674 || tolower(i) != (int) toLOWER_A(i)
675 || toupper(i) != (int) toUPPER_A(i)
676 || (i == '\n' && ! isCNTRL_LC(i)))
8c6180a9
KW
677 {
678 if (bad_count) { /* Separate multiple entries with a
679 blank */
680 bad_chars_list[bad_count++] = ' ';
681 }
682 bad_chars_list[bad_count++] = '\'';
683 if (isPRINT_A(i)) {
684 bad_chars_list[bad_count++] = (char) i;
685 }
686 else {
687 bad_chars_list[bad_count++] = '\\';
688 if (i == '\n') {
689 bad_chars_list[bad_count++] = 'n';
690 }
691 else {
692 assert(i == '\t');
693 bad_chars_list[bad_count++] = 't';
694 }
695 }
696 bad_chars_list[bad_count++] = '\'';
697 bad_chars_list[bad_count] = '\0';
698 }
699 }
700 }
701
7d4bcc4a
KW
702# ifdef MB_CUR_MAX
703
8c6180a9 704 /* We only handle single-byte locales (outside of UTF-8 ones; so if
d35fca5f 705 * this locale requires more than one byte, there are going to be
8c6180a9 706 * problems. */
9c8a6dc2
KW
707 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
708 "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
709 __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
710
ba1a4362
KW
711 if (check_for_problems && MB_CUR_MAX > 1
712
713 /* Some platforms return MB_CUR_MAX > 1 for even the "C"
714 * locale. Just assume that the implementation for them (plus
715 * for POSIX) is correct and the > 1 value is spurious. (Since
716 * these are specially handled to never be considered UTF-8
717 * locales, as long as this is the only problem, everything
718 * should work fine */
719 && strNE(newctype, "C") && strNE(newctype, "POSIX"))
720 {
8c6180a9
KW
721 multi_byte_locale = TRUE;
722 }
7d4bcc4a
KW
723
724# endif
8c6180a9
KW
725
726 if (bad_count || multi_byte_locale) {
780fcc9f 727 PL_warn_locale = Perl_newSVpvf(aTHX_
8c6180a9 728 "Locale '%s' may not work well.%s%s%s\n",
780fcc9f 729 newctype,
8c6180a9
KW
730 (multi_byte_locale)
731 ? " Some characters in it are not recognized by"
732 " Perl."
733 : "",
734 (bad_count)
735 ? "\nThe following characters (and maybe others)"
736 " may not have the same meaning as the Perl"
737 " program expects:\n"
738 : "",
739 (bad_count)
740 ? bad_chars_list
741 : ""
742 );
cc9eaeb0 743 /* If we are actually in the scope of the locale or are debugging,
bddebb56
KW
744 * output the message now. If not in that scope, we save the
745 * message to be output at the first operation using this locale,
746 * if that actually happens. Most programs don't use locales, so
747 * they are immune to bad ones. */
cc9eaeb0 748 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
780fcc9f 749
780fcc9f
KW
750 /* The '0' below suppresses a bogus gcc compiler warning */
751 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
bddebb56 752
bddebb56
KW
753 if (IN_LC(LC_CTYPE)) {
754 SvREFCNT_dec_NN(PL_warn_locale);
755 PL_warn_locale = NULL;
756 }
780fcc9f 757 }
baa60164 758 }
31f05a37 759 }
98994639
HS
760
761#endif /* USE_LOCALE_CTYPE */
7d4bcc4a 762
98994639
HS
763}
764
98994639 765void
2726666d
KW
766Perl__warn_problematic_locale()
767{
2726666d
KW
768
769#ifdef USE_LOCALE_CTYPE
770
5f04a188
KW
771 dTHX;
772
773 /* Internal-to-core function that outputs the message in PL_warn_locale,
774 * and then NULLS it. Should be called only through the macro
775 * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */
776
2726666d 777 if (PL_warn_locale) {
2726666d
KW
778 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
779 SvPVX(PL_warn_locale),
780 0 /* dummy to avoid compiler warning */ );
2726666d
KW
781 SvREFCNT_dec_NN(PL_warn_locale);
782 PL_warn_locale = NULL;
783 }
784
785#endif
786
787}
788
a4f00dcc
KW
789STATIC void
790S_new_collate(pTHX_ const char *newcoll)
98994639 791{
7d4bcc4a
KW
792
793#ifndef USE_LOCALE_COLLATE
794
795 PERL_UNUSED_ARG(newcoll);
796 PERL_UNUSED_CONTEXT;
797
798#else
0d071d52 799
291a84fb 800 /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
0d071d52
KW
801 * core Perl this and that 'newcoll' is the name of the new locale.
802 *
d35fca5f
KW
803 * The design of locale collation is that every locale change is given an
804 * index 'PL_collation_ix'. The first time a string particpates in an
805 * operation that requires collation while locale collation is active, it
806 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
807 * magic includes the collation index, and the transformation of the string
808 * by strxfrm(), q.v. That transformation is used when doing comparisons,
809 * instead of the string itself. If a string changes, the magic is
810 * cleared. The next time the locale changes, the index is incremented,
811 * and so we know during a comparison that the transformation is not
812 * necessarily still valid, and so is recomputed. Note that if the locale
813 * changes enough times, the index could wrap (a U32), and it is possible
814 * that a transformation would improperly be considered valid, leading to
815 * an unlikely bug */
0d071d52 816
98994639
HS
817 if (! newcoll) {
818 if (PL_collation_name) {
819 ++PL_collation_ix;
820 Safefree(PL_collation_name);
821 PL_collation_name = NULL;
822 }
823 PL_collation_standard = TRUE;
00bf60ca 824 is_standard_collation:
98994639
HS
825 PL_collxfrm_base = 0;
826 PL_collxfrm_mult = 2;
165a1c52 827 PL_in_utf8_COLLATE_locale = FALSE;
f28f4d2a 828 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 829 PL_strxfrm_max_cp = 0;
98994639
HS
830 return;
831 }
832
d35fca5f 833 /* If this is not the same locale as currently, set the new one up */
98994639
HS
834 if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
835 ++PL_collation_ix;
836 Safefree(PL_collation_name);
837 PL_collation_name = stdize_locale(savepv(newcoll));
a39edc4c 838 PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
00bf60ca
KW
839 if (PL_collation_standard) {
840 goto is_standard_collation;
841 }
98994639 842
165a1c52 843 PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
f28f4d2a 844 PL_strxfrm_NUL_replacement = '\0';
a4a439fb 845 PL_strxfrm_max_cp = 0;
165a1c52 846
59c018b9
KW
847 /* A locale collation definition includes primary, secondary, tertiary,
848 * etc. weights for each character. To sort, the primary weights are
849 * used, and only if they compare equal, then the secondary weights are
850 * used, and only if they compare equal, then the tertiary, etc.
851 *
852 * strxfrm() works by taking the input string, say ABC, and creating an
853 * output transformed string consisting of first the primary weights,
854 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
855 * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
856 * may not have weights at every level. In our example, let's say B
857 * doesn't have a tertiary weight, and A doesn't have a secondary
858 * weight. The constructed string is then going to be
859 * A¹B¹C¹ B²C² A³C³ ....
860 * This has the desired effect that strcmp() will look at the secondary
861 * or tertiary weights only if the strings compare equal at all higher
862 * priority weights. The spaces shown here, like in
c342d20e 863 * "A¹B¹C¹ A²B²C² "
59c018b9
KW
864 * are not just for readability. In the general case, these must
865 * actually be bytes, which we will call here 'separator weights'; and
866 * they must be smaller than any other weight value, but since these
867 * are C strings, only the terminating one can be a NUL (some
868 * implementations may include a non-NUL separator weight just before
869 * the NUL). Implementations tend to reserve 01 for the separator
870 * weights. They are needed so that a shorter string's secondary
871 * weights won't be misconstrued as primary weights of a longer string,
872 * etc. By making them smaller than any other weight, the shorter
873 * string will sort first. (Actually, if all secondary weights are
874 * smaller than all primary ones, there is no need for a separator
875 * weight between those two levels, etc.)
876 *
877 * The length of the transformed string is roughly a linear function of
878 * the input string. It's not exactly linear because some characters
879 * don't have weights at all levels. When we call strxfrm() we have to
880 * allocate some memory to hold the transformed string. The
881 * calculations below try to find coefficients 'm' and 'b' for this
882 * locale so that m*x + b equals how much space we need, given the size
883 * of the input string in 'x'. If we calculate too small, we increase
884 * the size as needed, and call strxfrm() again, but it is better to
885 * get it right the first time to avoid wasted expensive string
886 * transformations. */
887
98994639 888 {
79f120c8
KW
889 /* We use the string below to find how long the tranformation of it
890 * is. Almost all locales are supersets of ASCII, or at least the
891 * ASCII letters. We use all of them, half upper half lower,
892 * because if we used fewer, we might hit just the ones that are
893 * outliers in a particular locale. Most of the strings being
894 * collated will contain a preponderance of letters, and even if
895 * they are above-ASCII, they are likely to have the same number of
896 * weight levels as the ASCII ones. It turns out that digits tend
897 * to have fewer levels, and some punctuation has more, but those
898 * are relatively sparse in text, and khw believes this gives a
899 * reasonable result, but it could be changed if experience so
900 * dictates. */
901 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
902 char * x_longer; /* Transformed 'longer' */
903 Size_t x_len_longer; /* Length of 'x_longer' */
904
905 char * x_shorter; /* We also transform a substring of 'longer' */
906 Size_t x_len_shorter;
907
a4a439fb 908 /* _mem_collxfrm() is used get the transformation (though here we
79f120c8
KW
909 * are interested only in its length). It is used because it has
910 * the intelligence to handle all cases, but to work, it needs some
911 * values of 'm' and 'b' to get it started. For the purposes of
912 * this calculation we use a very conservative estimate of 'm' and
913 * 'b'. This assumes a weight can be multiple bytes, enough to
914 * hold any UV on the platform, and there are 5 levels, 4 weight
915 * bytes, and a trailing NUL. */
916 PL_collxfrm_base = 5;
917 PL_collxfrm_mult = 5 * sizeof(UV);
918
919 /* Find out how long the transformation really is */
a4a439fb
KW
920 x_longer = _mem_collxfrm(longer,
921 sizeof(longer) - 1,
922 &x_len_longer,
923
924 /* We avoid converting to UTF-8 in the
925 * called function by telling it the
926 * string is in UTF-8 if the locale is a
927 * UTF-8 one. Since the string passed
928 * here is invariant under UTF-8, we can
929 * claim it's UTF-8 even though it isn't.
930 * */
931 PL_in_utf8_COLLATE_locale);
79f120c8
KW
932 Safefree(x_longer);
933
934 /* Find out how long the transformation of a substring of 'longer'
935 * is. Together the lengths of these transformations are
936 * sufficient to calculate 'm' and 'b'. The substring is all of
937 * 'longer' except the first character. This minimizes the chances
938 * of being swayed by outliers */
a4a439fb 939 x_shorter = _mem_collxfrm(longer + 1,
79f120c8 940 sizeof(longer) - 2,
a4a439fb
KW
941 &x_len_shorter,
942 PL_in_utf8_COLLATE_locale);
79f120c8
KW
943 Safefree(x_shorter);
944
945 /* If the results are nonsensical for this simple test, the whole
946 * locale definition is suspect. Mark it so that locale collation
947 * is not active at all for it. XXX Should we warn? */
948 if ( x_len_shorter == 0
949 || x_len_longer == 0
950 || x_len_shorter >= x_len_longer)
951 {
952 PL_collxfrm_mult = 0;
953 PL_collxfrm_base = 0;
954 }
955 else {
956 SSize_t base; /* Temporary */
957
958 /* We have both: m * strlen(longer) + b = x_len_longer
959 * m * strlen(shorter) + b = x_len_shorter;
960 * subtracting yields:
961 * m * (strlen(longer) - strlen(shorter))
962 * = x_len_longer - x_len_shorter
963 * But we have set things up so that 'shorter' is 1 byte smaller
964 * than 'longer'. Hence:
965 * m = x_len_longer - x_len_shorter
966 *
967 * But if something went wrong, make sure the multiplier is at
968 * least 1.
969 */
970 if (x_len_longer > x_len_shorter) {
971 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
972 }
973 else {
974 PL_collxfrm_mult = 1;
975 }
976
977 /* mx + b = len
978 * so: b = len - mx
979 * but in case something has gone wrong, make sure it is
980 * non-negative */
981 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
982 if (base < 0) {
983 base = 0;
984 }
985
986 /* Add 1 for the trailing NUL */
987 PL_collxfrm_base = base + 1;
988 }
58eebef2 989
7d4bcc4a
KW
990# ifdef DEBUGGING
991
58eebef2
KW
992 if (DEBUG_L_TEST || debug_initialization) {
993 PerlIO_printf(Perl_debug_log,
b07929e4
KW
994 "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
995 "x_len_longer=%zu,"
996 " collate multipler=%zu, collate base=%zu\n",
58eebef2
KW
997 __FILE__, __LINE__,
998 PL_in_utf8_COLLATE_locale,
999 x_len_shorter, x_len_longer,
1000 PL_collxfrm_mult, PL_collxfrm_base);
1001 }
7d4bcc4a
KW
1002# endif
1003
98994639
HS
1004 }
1005 }
1006
1007#endif /* USE_LOCALE_COLLATE */
7d4bcc4a 1008
98994639
HS
1009}
1010
d2b24094 1011#ifdef WIN32
b8cc575c 1012
a4f00dcc 1013STATIC char *
b8cc575c 1014S_win32_setlocale(pTHX_ int category, const char* locale)
b385bb4d
KW
1015{
1016 /* This, for Windows, emulates POSIX setlocale() behavior. There is no
7d4bcc4a
KW
1017 * difference between the two unless the input locale is "", which normally
1018 * means on Windows to get the machine default, which is set via the
1019 * computer's "Regional and Language Options" (or its current equivalent).
1020 * In POSIX, it instead means to find the locale from the user's
1021 * environment. This routine changes the Windows behavior to first look in
1022 * the environment, and, if anything is found, use that instead of going to
1023 * the machine default. If there is no environment override, the machine
1024 * default is used, by calling the real setlocale() with "".
1025 *
1026 * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
1027 * use the particular category's variable if set; otherwise to use the LANG
1028 * variable. */
b385bb4d 1029
175c4cf9 1030 bool override_LC_ALL = FALSE;
89f7b9aa 1031 char * result;
e5f10d49 1032 unsigned int i;
89f7b9aa 1033
b385bb4d 1034 if (locale && strEQ(locale, "")) {
7d4bcc4a
KW
1035
1036# ifdef LC_ALL
1037
b385bb4d
KW
1038 locale = PerlEnv_getenv("LC_ALL");
1039 if (! locale) {
e5f10d49
KW
1040 if (category == LC_ALL) {
1041 override_LC_ALL = TRUE;
1042 }
1043 else {
7d4bcc4a
KW
1044
1045# endif
7d4bcc4a 1046
e5f10d49
KW
1047 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1048 if (category == categories[i]) {
1049 locale = PerlEnv_getenv(category_names[i]);
1050 goto found_locale;
1051 }
1052 }
7d4bcc4a 1053
b385bb4d 1054 locale = PerlEnv_getenv("LANG");
481465ea 1055 if (! locale) {
b385bb4d
KW
1056 locale = "";
1057 }
e5f10d49
KW
1058
1059 found_locale: ;
7d4bcc4a
KW
1060
1061# ifdef LC_ALL
1062
e5f10d49 1063 }
b385bb4d 1064 }
7d4bcc4a
KW
1065
1066# endif
1067
b385bb4d
KW
1068 }
1069
89f7b9aa 1070 result = setlocale(category, locale);
bbc98134 1071 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
a4f00dcc 1072 setlocale_debug_string(category, locale, result)));
89f7b9aa 1073
481465ea 1074 if (! override_LC_ALL) {
89f7b9aa
KW
1075 return result;
1076 }
1077
dfd77d7a 1078 /* Here the input category was LC_ALL, and we have set it to what is in the
481465ea
KW
1079 * LANG variable or the system default if there is no LANG. But these have
1080 * lower priority than the other LC_foo variables, so override it for each
1081 * one that is set. (If they are set to "", it means to use the same thing
1082 * we just set LC_ALL to, so can skip) */
7d4bcc4a 1083
948523db 1084 for (i = 0; i < LC_ALL_INDEX; i++) {
e5f10d49
KW
1085 result = PerlEnv_getenv(category_names[i]);
1086 if (result && strNE(result, "")) {
1087 setlocale(categories[i], result);
1088 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
1089 __FILE__, __LINE__,
1090 setlocale_debug_string(categories[i], result, "not captured")));
1091 }
89f7b9aa 1092 }
7d4bcc4a 1093
bbc98134
KW
1094 result = setlocale(LC_ALL, NULL);
1095 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
1096 __FILE__, __LINE__,
a4f00dcc 1097 setlocale_debug_string(LC_ALL, NULL, result)));
89f7b9aa 1098
bbc98134 1099 return result;
b385bb4d
KW
1100}
1101
1102#endif
1103
a4f00dcc
KW
1104char *
1105Perl_setlocale(int category, const char * locale)
1106{
1107 /* This wraps POSIX::setlocale() */
1108
1109 char * retval;
1159483a 1110 char * newlocale;
a4f00dcc
KW
1111 dTHX;
1112
a4f00dcc
KW
1113#ifdef USE_LOCALE_NUMERIC
1114
291a84fb
KW
1115 /* A NULL locale means only query what the current one is. We have the
1116 * LC_NUMERIC name saved, because we are normally switched into the C
1117 * locale for it. For an LC_ALL query, switch back to get the correct
1118 * results. All other categories don't require special handling */
a4f00dcc
KW
1119 if (locale == NULL) {
1120 if (category == LC_NUMERIC) {
1121 return savepv(PL_numeric_name);
1122 }
1123
7d4bcc4a 1124# ifdef LC_ALL
a4f00dcc 1125
c0d737a8
KW
1126 else if (category == LC_ALL && ! PL_numeric_underlying) {
1127
a4f00dcc
KW
1128 SET_NUMERIC_UNDERLYING();
1129 }
1130
7d4bcc4a 1131# endif
a4f00dcc
KW
1132
1133 }
1134
1135#endif
1136
e5f10d49
KW
1137 /* Save retval since subsequent setlocale() calls may overwrite it. */
1138 retval = savepv(do_setlocale_r(category, locale));
a4f00dcc
KW
1139
1140 DEBUG_L(PerlIO_printf(Perl_debug_log,
1141 "%s:%d: %s\n", __FILE__, __LINE__,
1142 setlocale_debug_string(category, locale, retval)));
1143 if (! retval) {
1144 /* Should never happen that a query would return an error, but be
1145 * sure and reset to C locale */
1146 if (locale == 0) {
1147 SET_NUMERIC_STANDARD();
1148 }
7d4bcc4a 1149
a4f00dcc
KW
1150 return NULL;
1151 }
1152
a4f00dcc
KW
1153 /* If locale == NULL, we are just querying the state, but may have switched
1154 * to NUMERIC_UNDERLYING. Switch back before returning. */
1155 if (locale == NULL) {
1156 SET_NUMERIC_STANDARD();
1157 return retval;
1158 }
a4f00dcc 1159
1159483a
KW
1160 /* Now that have switched locales, we have to update our records to
1161 * correspond. */
a4f00dcc 1162
1159483a 1163 switch (category) {
a4f00dcc 1164
1159483a 1165#ifdef USE_LOCALE_CTYPE
a4f00dcc 1166
1159483a
KW
1167 case LC_CTYPE:
1168 new_ctype(retval);
1169 break;
a4f00dcc 1170
1159483a 1171#endif
a4f00dcc
KW
1172#ifdef USE_LOCALE_COLLATE
1173
1159483a
KW
1174 case LC_COLLATE:
1175 new_collate(retval);
1176 break;
a4f00dcc 1177
1159483a 1178#endif
a4f00dcc
KW
1179#ifdef USE_LOCALE_NUMERIC
1180
1159483a
KW
1181 case LC_NUMERIC:
1182 new_numeric(retval);
1183 break;
a4f00dcc 1184
1159483a
KW
1185#endif
1186#ifdef LC_ALL
a4f00dcc 1187
1159483a 1188 case LC_ALL:
a4f00dcc 1189
1159483a
KW
1190 /* LC_ALL updates all the things we care about. The values may not
1191 * be the same as 'retval', as the locale "" may have set things
1192 * individually */
a4f00dcc 1193
1159483a 1194# ifdef USE_LOCALE_CTYPE
a4f00dcc 1195
1159483a
KW
1196 newlocale = do_setlocale_c(LC_CTYPE, NULL);
1197 new_ctype(newlocale);
a4f00dcc 1198
1159483a
KW
1199# endif /* USE_LOCALE_CTYPE */
1200# ifdef USE_LOCALE_COLLATE
1201
1202 newlocale = do_setlocale_c(LC_COLLATE, NULL);
1203 new_collate(newlocale);
a4f00dcc 1204
7d4bcc4a 1205# endif
1159483a 1206# ifdef USE_LOCALE_NUMERIC
a4f00dcc 1207
1159483a
KW
1208 newlocale = do_setlocale_c(LC_NUMERIC, NULL);
1209 new_numeric(newlocale);
a4f00dcc 1210
1159483a
KW
1211# endif /* USE_LOCALE_NUMERIC */
1212#endif /* LC_ALL */
a4f00dcc 1213
1159483a
KW
1214 default:
1215 break;
a4f00dcc
KW
1216 }
1217
1218 return retval;
1219
f7416781
KW
1220
1221}
1222
1223PERL_STATIC_INLINE const char *
1224S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset)
1225{
1226 /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size',
1227 * growing it if necessary */
1228
1229 const Size_t string_size = strlen(string) + offset + 1;
1230
1231 PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
1232
1233 if (*buf_size == 0) {
1234 Newx(*buf, string_size, char);
1235 *buf_size = string_size;
1236 }
1237 else if (string_size > *buf_size) {
1238 Renew(*buf, string_size, char);
1239 *buf_size = string_size;
1240 }
1241
1242 Copy(string, *buf + offset, string_size - offset, char);
1243 return *buf;
1244}
1245
1246/*
1247
1248=head1 Locale-related functions and macros
1249
1250=for apidoc Perl_langinfo
1251
7d4bcc4a 1252This is an (almost ª) drop-in replacement for the system C<L<nl_langinfo(3)>>,
f7416781
KW
1253taking the same C<item> parameter values, and returning the same information.
1254But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
1255of Perl's locale handling from your code, and can be used on systems that lack
1256a native C<nl_langinfo>.
1257
1258Expanding on these:
1259
1260=over
1261
1262=item *
1263
1264It delivers the correct results for the C<RADIXCHAR> and C<THOUSESEP> items,
1265without you having to write extra code. The reason for the extra code would be
1266because these are from the C<LC_NUMERIC> locale category, which is normally
1267kept set to the C locale by Perl, no matter what the underlying locale is
1268supposed to be, and so to get the expected results, you have to temporarily
1269toggle into the underlying locale, and later toggle back. (You could use
1270plain C<nl_langinfo> and C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this
1271but then you wouldn't get the other advantages of C<Perl_langinfo()>; not
1272keeping C<LC_NUMERIC> in the C locale would break a lot of CPAN, which is
1273expecting the radix (decimal point) character to be a dot.)
1274
1275=item *
1276
1277Depending on C<item>, it works on systems that don't have C<nl_langinfo>, hence
1278makes your code more portable. Of the fifty-some possible items specified by
1279the POSIX 2008 standard,
1280L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
1281only two are completely unimplemented. It uses various techniques to recover
1282the other items, including calling C<L<localeconv(3)>>, and C<L<strftime(3)>>,
1283both of which are specified in C89, so should be always be available. Later
1284C<strftime()> versions have additional capabilities; C<""> is returned for
1285those not available on your system.
1286
1287The details for those items which may differ from what this emulation returns
1288and what a native C<nl_langinfo()> would return are:
1289
1290=over
1291
1292=item C<CODESET>
1293
1294=item C<ERA>
1295
1296Unimplemented, so returns C<"">.
1297
1298=item C<YESEXPR>
1299
c1566110
KW
1300=item C<YESSTR>
1301
f7416781
KW
1302=item C<NOEXPR>
1303
c1566110
KW
1304=item C<NOSTR>
1305
1306Only the values for English are returned. C<YESSTR> and C<NOSTR> have been
1307removed from POSIX 2008, and are retained for backwards compatibility. Your
1308platform's C<nl_langinfo> may not support them.
f7416781
KW
1309
1310=item C<D_FMT>
1311
1312Always evaluates to C<%x>, the locale's appropriate date representation.
1313
1314=item C<T_FMT>
1315
1316Always evaluates to C<%X>, the locale's appropriate time representation.
1317
1318=item C<D_T_FMT>
1319
1320Always evaluates to C<%c>, the locale's appropriate date and time
1321representation.
1322
1323=item C<CRNCYSTR>
1324
1325The return may be incorrect for those rare locales where the currency symbol
1326replaces the radix character.
1327Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1328to work differently.
1329
1330=item C<ALT_DIGITS>
1331
1332Currently this gives the same results as Linux does.
1333Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1334to work differently.
1335
1336=item C<ERA_D_FMT>
1337
1338=item C<ERA_T_FMT>
1339
1340=item C<ERA_D_T_FMT>
1341
1342=item C<T_FMT_AMPM>
1343
1344These are derived by using C<strftime()>, and not all versions of that function
1345know about them. C<""> is returned for these on such systems.
1346
1347=back
1348
1349When using C<Perl_langinfo> on systems that don't have a native
1350C<nl_langinfo()>, you must
1351
1352 #include "perl_langinfo.h"
1353
1354before the C<perl.h> C<#include>. You can replace your C<langinfo.h>
1355C<#include> with this one. (Doing it this way keeps out the symbols that plain
1356C<langinfo.h> imports into the namespace for code that doesn't need it.)
1357
1358You also should not use the bare C<langinfo.h> item names, but should preface
1359them with C<PERL_>, so use C<PERL_RADIXCHAR> instead of plain C<RADIXCHAR>.
1360The C<PERL_I<foo>> versions will also work for this function on systems that do
1361have a native C<nl_langinfo>.
1362
1363=item *
1364
1365It is thread-friendly, returning its result in a buffer that won't be
1366overwritten by another thread, so you don't have to code for that possibility.
1367The buffer can be overwritten by the next call to C<nl_langinfo> or
1368C<Perl_langinfo> in the same thread.
1369
1370=item *
1371
7d4bcc4a 1372ª It returns S<C<const char *>>, whereas plain C<nl_langinfo()> returns S<C<char
f7416781
KW
1373*>>, but you are (only by documentation) forbidden to write into the buffer.
1374By declaring this C<const>, the compiler enforces this restriction. The extra
1375C<const> is why this isn't an unequivocal drop-in replacement for
1376C<nl_langinfo>.
1377
1378=back
1379
1380The original impetus for C<Perl_langinfo()> was so that code that needs to
1381find out the current currency symbol, floating point radix character, or digit
1382grouping separator can use, on all systems, the simpler and more
1383thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
1384pain to make thread-friendly. For other fields returned by C<localeconv>, it
1385is better to use the methods given in L<perlcall> to call
1386L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
1387
1388=cut
1389
1390*/
1391
1392const char *
1393#ifdef HAS_NL_LANGINFO
1394Perl_langinfo(const nl_item item)
1395#else
1396Perl_langinfo(const int item)
1397#endif
1398{
f61748ac
KW
1399 return my_nl_langinfo(item, TRUE);
1400}
1401
1402const char *
1403#ifdef HAS_NL_LANGINFO
1404S_my_nl_langinfo(const nl_item item, bool toggle)
1405#else
1406S_my_nl_langinfo(const int item, bool toggle)
1407#endif
1408{
ae74815b 1409 dTHX;
f7416781 1410
5a854ab3
KW
1411 /* We only need to toggle into the underlying LC_NUMERIC locale for these
1412 * two items, and only if not already there */
1413 if (toggle && (( item != PERL_RADIXCHAR && item != PERL_THOUSEP)
1414 || PL_numeric_underlying))
1415 {
1416 toggle = FALSE;
1417 }
1418
ab340fff
KW
1419#if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
1420#if ! defined(HAS_POSIX_2008_LOCALE)
f7416781 1421
ab340fff 1422 /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
ae74815b
KW
1423 * for those items dependent on it. This must be copied to a buffer before
1424 * switching back, as some systems destroy the buffer when setlocale() is
1425 * called */
f7416781 1426
038d3702
KW
1427 {
1428 DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
1429
b2fee59c 1430 if (toggle) {
038d3702 1431 STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
b2fee59c 1432 }
f7416781 1433
5acc4454
KW
1434 LOCALE_LOCK; /* Prevent interference from another thread executing
1435 this code section (the only call to nl_langinfo in
1436 the core) */
1437
b2fee59c
KW
1438 save_to_buffer(nl_langinfo(item), &PL_langinfo_buf,
1439 &PL_langinfo_bufsize, 0);
f7416781 1440
5acc4454
KW
1441 LOCALE_UNLOCK;
1442
b2fee59c 1443 if (toggle) {
038d3702 1444 RESTORE_LC_NUMERIC();
b2fee59c 1445 }
f7416781
KW
1446 }
1447
ab340fff
KW
1448# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */
1449
5a854ab3 1450 {
b2fee59c
KW
1451 bool do_free = FALSE;
1452 locale_t cur = uselocale((locale_t) 0);
ab340fff 1453
b2fee59c
KW
1454 if (cur == LC_GLOBAL_LOCALE) {
1455 cur = duplocale(LC_GLOBAL_LOCALE);
1456 do_free = TRUE;
1457 }
ab340fff 1458
b2fee59c 1459 if (toggle) {
e1aa2579
KW
1460 if (PL_underlying_numeric_obj) {
1461 cur = PL_underlying_numeric_obj;
1462 }
1463 else {
1464 cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
1465 do_free = TRUE;
1466 }
b2fee59c 1467 }
ab340fff 1468
b2fee59c
KW
1469 save_to_buffer(nl_langinfo_l(item, cur),
1470 &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1471 if (do_free) {
1472 freelocale(cur);
1473 }
5a854ab3 1474 }
ab340fff 1475
c1566110
KW
1476# endif
1477
1478 if (strEQ(PL_langinfo_buf, "")) {
1479 if (item == PERL_YESSTR) {
1480 return "yes";
1481 }
1482 if (item == PERL_NOSTR) {
1483 return "no";
1484 }
1485 }
1486
ab340fff
KW
1487 return PL_langinfo_buf;
1488
f7416781 1489#else /* Below, emulate nl_langinfo as best we can */
43dd6b15
KW
1490
1491 {
1492
f7416781
KW
1493# ifdef HAS_LOCALECONV
1494
43dd6b15 1495 const struct lconv* lc;
038d3702 1496 DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
f7416781
KW
1497
1498# endif
1499# ifdef HAS_STRFTIME
1500
43dd6b15
KW
1501 struct tm tm;
1502 bool return_format = FALSE; /* Return the %format, not the value */
1503 const char * format;
f7416781
KW
1504
1505# endif
1506
43dd6b15
KW
1507 /* We copy the results to a per-thread buffer, even if not
1508 * multi-threaded. This is in part to simplify this code, and partly
1509 * because we need a buffer anyway for strftime(), and partly because a
1510 * call of localeconv() could otherwise wipe out the buffer, and the
1511 * programmer would not be expecting this, as this is a nl_langinfo()
1512 * substitute after all, so s/he might be thinking their localeconv()
1513 * is safe until another localeconv() call. */
f7416781 1514
43dd6b15
KW
1515 switch (item) {
1516 Size_t len;
1517 const char * retval;
f7416781 1518
43dd6b15
KW
1519 /* These 2 are unimplemented */
1520 case PERL_CODESET:
1521 case PERL_ERA: /* For use with strftime() %E modifier */
f7416781 1522
43dd6b15
KW
1523 default:
1524 return "";
f7416781 1525
43dd6b15
KW
1526 /* We use only an English set, since we don't know any more */
1527 case PERL_YESEXPR: return "^[+1yY]";
1528 case PERL_YESSTR: return "yes";
1529 case PERL_NOEXPR: return "^[-0nN]";
1530 case PERL_NOSTR: return "no";
f7416781
KW
1531
1532# ifdef HAS_LOCALECONV
1533
43dd6b15 1534 case PERL_CRNCYSTR:
f7416781 1535
43dd6b15
KW
1536 /* We don't bother with localeconv_l() because any system that
1537 * has it is likely to also have nl_langinfo() */
291a84fb 1538
5acc4454
KW
1539 LOCALE_LOCK; /* Prevent interference with other threads
1540 using localeconv() */
1541
43dd6b15
KW
1542 lc = localeconv();
1543 if ( ! lc
1544 || ! lc->currency_symbol
1545 || strEQ("", lc->currency_symbol))
1546 {
1547 LOCALE_UNLOCK;
1548 return "";
1549 }
f7416781 1550
43dd6b15
KW
1551 /* Leave the first spot empty to be filled in below */
1552 save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
1553 &PL_langinfo_bufsize, 1);
1554 if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
1555 { /* khw couldn't figure out how the localedef specifications
1556 would show that the $ should replace the radix; this is
1557 just a guess as to how it might work.*/
1558 *PL_langinfo_buf = '.';
1559 }
1560 else if (lc->p_cs_precedes) {
1561 *PL_langinfo_buf = '-';
1562 }
1563 else {
1564 *PL_langinfo_buf = '+';
1565 }
f7416781 1566
43dd6b15
KW
1567 LOCALE_UNLOCK;
1568 break;
f7416781 1569
43dd6b15
KW
1570 case PERL_RADIXCHAR:
1571 case PERL_THOUSEP:
f7416781 1572
43dd6b15 1573 if (toggle) {
038d3702 1574 STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
c0d737a8 1575 }
f7416781 1576
5acc4454
KW
1577 LOCALE_LOCK; /* Prevent interference with other threads
1578 using localeconv() */
1579
43dd6b15
KW
1580 lc = localeconv();
1581 if (! lc) {
33394adc
KW
1582 retval = "";
1583 }
43dd6b15
KW
1584 else {
1585 retval = (item == PERL_RADIXCHAR)
1586 ? lc->decimal_point
1587 : lc->thousands_sep;
1588 if (! retval) {
1589 retval = "";
1590 }
1591 }
f7416781 1592
43dd6b15
KW
1593 save_to_buffer(retval, &PL_langinfo_buf,
1594 &PL_langinfo_bufsize, 0);
f7416781 1595
5acc4454
KW
1596 LOCALE_UNLOCK;
1597
43dd6b15 1598 if (toggle) {
038d3702 1599 RESTORE_LC_NUMERIC();
43dd6b15 1600 }
f7416781 1601
43dd6b15 1602 break;
f7416781
KW
1603
1604# endif
1605# ifdef HAS_STRFTIME
1606
43dd6b15
KW
1607 /* These are defined by C89, so we assume that strftime supports
1608 * them, and so are returned unconditionally; they may not be what
1609 * the locale actually says, but should give good enough results
1610 * for someone using them as formats (as opposed to trying to parse
1611 * them to figure out what the locale says). The other format
1612 * items are actually tested to verify they work on the platform */
1613 case PERL_D_FMT: return "%x";
1614 case PERL_T_FMT: return "%X";
1615 case PERL_D_T_FMT: return "%c";
1616
1617 /* These formats are only available in later strfmtime's */
1618 case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT:
1619 case PERL_T_FMT_AMPM:
1620
1621 /* The rest can be gotten from most versions of strftime(). */
1622 case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3:
1623 case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6:
1624 case PERL_ABDAY_7:
1625 case PERL_ALT_DIGITS:
1626 case PERL_AM_STR: case PERL_PM_STR:
1627 case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3:
1628 case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6:
1629 case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9:
1630 case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12:
1631 case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4:
1632 case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7:
1633 case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4:
1634 case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8:
1635 case PERL_MON_9: case PERL_MON_10: case PERL_MON_11:
1636 case PERL_MON_12:
1637
1638 LOCALE_LOCK;
1639
1640 init_tm(&tm); /* Precaution against core dumps */
1641 tm.tm_sec = 30;
1642 tm.tm_min = 30;
1643 tm.tm_hour = 6;
1644 tm.tm_year = 2017 - 1900;
1645 tm.tm_wday = 0;
1646 tm.tm_mon = 0;
1647 switch (item) {
1648 default:
1649 LOCALE_UNLOCK;
1650 Perl_croak(aTHX_
1651 "panic: %s: %d: switch case: %d problem",
1652 __FILE__, __LINE__, item);
1653 NOT_REACHED; /* NOTREACHED */
1654
1655 case PERL_PM_STR: tm.tm_hour = 18;
1656 case PERL_AM_STR:
1657 format = "%p";
1658 break;
1659
1660 case PERL_ABDAY_7: tm.tm_wday++;
1661 case PERL_ABDAY_6: tm.tm_wday++;
1662 case PERL_ABDAY_5: tm.tm_wday++;
1663 case PERL_ABDAY_4: tm.tm_wday++;
1664 case PERL_ABDAY_3: tm.tm_wday++;
1665 case PERL_ABDAY_2: tm.tm_wday++;
1666 case PERL_ABDAY_1:
1667 format = "%a";
1668 break;
1669
1670 case PERL_DAY_7: tm.tm_wday++;
1671 case PERL_DAY_6: tm.tm_wday++;
1672 case PERL_DAY_5: tm.tm_wday++;
1673 case PERL_DAY_4: tm.tm_wday++;
1674 case PERL_DAY_3: tm.tm_wday++;
1675 case PERL_DAY_2: tm.tm_wday++;
1676 case PERL_DAY_1:
1677 format = "%A";
1678 break;
1679
1680 case PERL_ABMON_12: tm.tm_mon++;
1681 case PERL_ABMON_11: tm.tm_mon++;
1682 case PERL_ABMON_10: tm.tm_mon++;
1683 case PERL_ABMON_9: tm.tm_mon++;
1684 case PERL_ABMON_8: tm.tm_mon++;
1685 case PERL_ABMON_7: tm.tm_mon++;
1686 case PERL_ABMON_6: tm.tm_mon++;
1687 case PERL_ABMON_5: tm.tm_mon++;
1688 case PERL_ABMON_4: tm.tm_mon++;
1689 case PERL_ABMON_3: tm.tm_mon++;
1690 case PERL_ABMON_2: tm.tm_mon++;
1691 case PERL_ABMON_1:
1692 format = "%b";
1693 break;
1694
1695 case PERL_MON_12: tm.tm_mon++;
1696 case PERL_MON_11: tm.tm_mon++;
1697 case PERL_MON_10: tm.tm_mon++;
1698 case PERL_MON_9: tm.tm_mon++;
1699 case PERL_MON_8: tm.tm_mon++;
1700 case PERL_MON_7: tm.tm_mon++;
1701 case PERL_MON_6: tm.tm_mon++;
1702 case PERL_MON_5: tm.tm_mon++;
1703 case PERL_MON_4: tm.tm_mon++;
1704 case PERL_MON_3: tm.tm_mon++;
1705 case PERL_MON_2: tm.tm_mon++;
1706 case PERL_MON_1:
1707 format = "%B";
1708 break;
1709
1710 case PERL_T_FMT_AMPM:
1711 format = "%r";
1712 return_format = TRUE;
1713 break;
1714
1715 case PERL_ERA_D_FMT:
1716 format = "%Ex";
1717 return_format = TRUE;
1718 break;
1719
1720 case PERL_ERA_T_FMT:
1721 format = "%EX";
1722 return_format = TRUE;
1723 break;
1724
1725 case PERL_ERA_D_T_FMT:
1726 format = "%Ec";
1727 return_format = TRUE;
1728 break;
1729
1730 case PERL_ALT_DIGITS:
1731 tm.tm_wday = 0;
1732 format = "%Ow"; /* Find the alternate digit for 0 */
1733 break;
1734 }
f7416781 1735
43dd6b15
KW
1736 /* We can't use my_strftime() because it doesn't look at
1737 * tm_wday */
1738 while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
1739 format, &tm))
1740 {
1741 /* A zero return means one of:
1742 * a) there wasn't enough space in PL_langinfo_buf
1743 * b) the format, like a plain %p, returns empty
1744 * c) it was an illegal format, though some
1745 * implementations of strftime will just return the
1746 * illegal format as a plain character sequence.
1747 *
1748 * To quickly test for case 'b)', try again but precede
1749 * the format with a plain character. If that result is
1750 * still empty, the problem is either 'a)' or 'c)' */
1751
1752 Size_t format_size = strlen(format) + 1;
1753 Size_t mod_size = format_size + 1;
1754 char * mod_format;
1755 char * temp_result;
1756
1757 Newx(mod_format, mod_size, char);
1758 Newx(temp_result, PL_langinfo_bufsize, char);
6873aa47 1759 *mod_format = ' ';
43dd6b15
KW
1760 my_strlcpy(mod_format + 1, format, mod_size);
1761 len = strftime(temp_result,
1762 PL_langinfo_bufsize,
1763 mod_format, &tm);
1764 Safefree(mod_format);
1765 Safefree(temp_result);
1766
1767 /* If 'len' is non-zero, it means that we had a case like
1768 * %p which means the current locale doesn't use a.m. or
1769 * p.m., and that is valid */
1770 if (len == 0) {
1771
1772 /* Here, still didn't work. If we get well beyond a
1773 * reasonable size, bail out to prevent an infinite
1774 * loop. */
1775
1776 if (PL_langinfo_bufsize > 100 * format_size) {
1777 *PL_langinfo_buf = '\0';
1778 }
1779 else {
1780 /* Double the buffer size to retry; Add 1 in case
1781 * original was 0, so we aren't stuck at 0. */
1782 PL_langinfo_bufsize *= 2;
1783 PL_langinfo_bufsize++;
1784 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
1785 continue;
1786 }
1787 }
f7416781 1788
f7416781 1789 break;
43dd6b15 1790 }
f7416781 1791
43dd6b15
KW
1792 /* Here, we got a result.
1793 *
1794 * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
1795 * alternate format for wday 0. If the value is the same as
1796 * the normal 0, there isn't an alternate, so clear the buffer.
1797 * */
1798 if ( item == PERL_ALT_DIGITS
1799 && strEQ(PL_langinfo_buf, "0"))
1800 {
1801 *PL_langinfo_buf = '\0';
1802 }
f7416781 1803
43dd6b15
KW
1804 /* ALT_DIGITS is problematic. Experiments on it showed that
1805 * strftime() did not always work properly when going from
1806 * alt-9 to alt-10. Only a few locales have this item defined,
1807 * and in all of them on Linux that khw was able to find,
1808 * nl_langinfo() merely returned the alt-0 character, possibly
1809 * doubled. Most Unicode digits are in blocks of 10
1810 * consecutive code points, so that is sufficient information
1811 * for those scripts, as we can infer alt-1, alt-2, .... But
1812 * for a Japanese locale, a CJK ideographic 0 is returned, and
1813 * the CJK digits are not in code point order, so you can't
1814 * really infer anything. The localedef for this locale did
1815 * specify the succeeding digits, so that strftime() works
1816 * properly on them, without needing to infer anything. But
1817 * the nl_langinfo() return did not give sufficient information
1818 * for the caller to understand what's going on. So until
1819 * there is evidence that it should work differently, this
1820 * returns the alt-0 string for ALT_DIGITS.
1821 *
1822 * wday was chosen because its range is all a single digit.
1823 * Things like tm_sec have two digits as the minimum: '00' */
f7416781 1824
43dd6b15 1825 LOCALE_UNLOCK;
f7416781 1826
43dd6b15
KW
1827 /* If to return the format, not the value, overwrite the buffer
1828 * with it. But some strftime()s will keep the original format
1829 * if illegal, so change those to "" */
1830 if (return_format) {
1831 if (strEQ(PL_langinfo_buf, format)) {
f7416781
KW
1832 *PL_langinfo_buf = '\0';
1833 }
43dd6b15
KW
1834 else {
1835 save_to_buffer(format, &PL_langinfo_buf,
1836 &PL_langinfo_bufsize, 0);
f7416781
KW
1837 }
1838 }
1839
1840 break;
f7416781
KW
1841
1842# endif
1843
43dd6b15 1844 }
f7416781
KW
1845 }
1846
1847 return PL_langinfo_buf;
1848
1849#endif
1850
a4f00dcc 1851}
b385bb4d 1852
98994639
HS
1853/*
1854 * Initialize locale awareness.
1855 */
1856int
1857Perl_init_i18nl10n(pTHX_ int printwarn)
1858{
0e92a118
KW
1859 /* printwarn is
1860 *
1861 * 0 if not to output warning when setup locale is bad
1862 * 1 if to output warning based on value of PERL_BADLANG
1863 * >1 if to output regardless of PERL_BADLANG
1864 *
1865 * returns
98994639 1866 * 1 = set ok or not applicable,
0e92a118
KW
1867 * 0 = fallback to a locale of lower priority
1868 * -1 = fallback to all locales failed, not even to the C locale
6b058d42
KW
1869 *
1870 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
1871 * set, debugging information is output.
1872 *
1873 * This looks more complicated than it is, mainly due to the #ifdefs.
1874 *
1875 * We try to set LC_ALL to the value determined by the environment. If
1876 * there is no LC_ALL on this platform, we try the individual categories we
1877 * know about. If this works, we are done.
1878 *
1879 * But if it doesn't work, we have to do something else. We search the
1880 * environment variables ourselves instead of relying on the system to do
1881 * it. We look at, in order, LC_ALL, LANG, a system default locale (if we
1882 * think there is one), and the ultimate fallback "C". This is all done in
1883 * the same loop as above to avoid duplicating code, but it makes things
7d4bcc4a
KW
1884 * more complex. The 'trial_locales' array is initialized with just one
1885 * element; it causes the behavior described in the paragraph above this to
1886 * happen. If that fails, we add elements to 'trial_locales', and do extra
1887 * loop iterations to cause the behavior described in this paragraph.
6b058d42
KW
1888 *
1889 * On Ultrix, the locale MUST come from the environment, so there is
1890 * preliminary code to set it. I (khw) am not sure that it is necessary,
1891 * and that this couldn't be folded into the loop, but barring any real
1892 * platforms to test on, it's staying as-is
1893 *
1894 * A slight complication is that in embedded Perls, the locale may already
1895 * be set-up, and we don't want to get it from the normal environment
1896 * variables. This is handled by having a special environment variable
1897 * indicate we're in this situation. We simply set setlocale's 2nd
1898 * parameter to be a NULL instead of "". That indicates to setlocale that
1899 * it is not to change anything, but to return the current value,
1900 * effectively initializing perl's db to what the locale already is.
1901 *
1902 * We play the same trick with NULL if a LC_ALL succeeds. We call
1903 * setlocale() on the individual categores with NULL to get their existing
1904 * values for our db, instead of trying to change them.
1905 * */
98994639 1906
0e92a118
KW
1907 int ok = 1;
1908
7d4bcc4a
KW
1909#ifndef USE_LOCALE
1910
1911 PERL_UNUSED_ARG(printwarn);
1912
1913#else /* USE_LOCALE */
7d4bcc4a
KW
1914# ifdef __GLIBC__
1915
175c4cf9 1916 const char * const language = savepv(PerlEnv_getenv("LANGUAGE"));
7d4bcc4a
KW
1917
1918# endif
65ebb059 1919
ccd65d51
KW
1920 /* NULL uses the existing already set up locale */
1921 const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
1922 ? NULL
1923 : "";
c3fcd832
KW
1924 const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */
1925 unsigned int trial_locales_count;
175c4cf9
KW
1926 const char * const lc_all = savepv(PerlEnv_getenv("LC_ALL"));
1927 const char * const lang = savepv(PerlEnv_getenv("LANG"));
98994639 1928 bool setlocale_failure = FALSE;
65ebb059 1929 unsigned int i;
175c4cf9
KW
1930
1931 /* A later getenv() could zap this, so only use here */
1932 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
1933
1934 const bool locwarn = (printwarn > 1
e5f10d49
KW
1935 || ( printwarn
1936 && ( ! bad_lang_use_once
22ff3130 1937 || (
e5f10d49
KW
1938 /* disallow with "" or "0" */
1939 *bad_lang_use_once
1940 && strNE("0", bad_lang_use_once)))));
ea92aad8 1941
291a84fb 1942 /* setlocale() return vals; not copied so must be looked at immediately */
8de4332b 1943 const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
291a84fb
KW
1944
1945 /* current locale for given category; should have been copied so aren't
1946 * volatile */
8de4332b 1947 const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
291a84fb 1948
7d4bcc4a
KW
1949# ifdef WIN32
1950
6bce99ee
JH
1951 /* In some systems you can find out the system default locale
1952 * and use that as the fallback locale. */
7d4bcc4a
KW
1953# define SYSTEM_DEFAULT_LOCALE
1954# endif
1955# ifdef SYSTEM_DEFAULT_LOCALE
1956
65ebb059 1957 const char *system_default_locale = NULL;
98994639 1958
7d4bcc4a 1959# endif
948523db
KW
1960
1961# ifndef DEBUGGING
1962# define DEBUG_LOCALE_INIT(a,b,c)
1963# else
7d4bcc4a 1964
8298454c 1965 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
7d4bcc4a
KW
1966
1967# define DEBUG_LOCALE_INIT(category, locale, result) \
2fcc0ca9
KW
1968 STMT_START { \
1969 if (debug_initialization) { \
1970 PerlIO_printf(Perl_debug_log, \
1971 "%s:%d: %s\n", \
1972 __FILE__, __LINE__, \
a4f00dcc 1973 setlocale_debug_string(category, \
2fcc0ca9
KW
1974 locale, \
1975 result)); \
1976 } \
1977 } STMT_END
2fcc0ca9 1978
948523db
KW
1979/* Make sure the parallel arrays are properly set up */
1980# ifdef USE_LOCALE_NUMERIC
1981 assert(categories[LC_NUMERIC_INDEX] == LC_NUMERIC);
1982 assert(strEQ(category_names[LC_NUMERIC_INDEX], "LC_NUMERIC"));
1983# endif
1984# ifdef USE_LOCALE_CTYPE
1985 assert(categories[LC_CTYPE_INDEX] == LC_CTYPE);
1986 assert(strEQ(category_names[LC_CTYPE_INDEX], "LC_CTYPE"));
1987# endif
1988# ifdef USE_LOCALE_COLLATE
1989 assert(categories[LC_COLLATE_INDEX] == LC_COLLATE);
1990 assert(strEQ(category_names[LC_COLLATE_INDEX], "LC_COLLATE"));
1991# endif
1992# ifdef USE_LOCALE_TIME
1993 assert(categories[LC_TIME_INDEX] == LC_TIME);
1994 assert(strEQ(category_names[LC_TIME_INDEX], "LC_TIME"));
1995# endif
1996# ifdef USE_LOCALE_MESSAGES
1997 assert(categories[LC_MESSAGES_INDEX] == LC_MESSAGES);
1998 assert(strEQ(category_names[LC_MESSAGES_INDEX], "LC_MESSAGES"));
1999# endif
2000# ifdef USE_LOCALE_MONETARY
2001 assert(categories[LC_MONETARY_INDEX] == LC_MONETARY);
2002 assert(strEQ(category_names[LC_MONETARY_INDEX], "LC_MONETARY"));
2003# endif
9821811f
KW
2004# ifdef USE_LOCALE_ADDRESS
2005 assert(categories[LC_ADDRESS_INDEX] == LC_ADDRESS);
2006 assert(strEQ(category_names[LC_ADDRESS_INDEX], "LC_ADDRESS"));
2007# endif
2008# ifdef USE_LOCALE_IDENTIFICATION
2009 assert(categories[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION);
2010 assert(strEQ(category_names[LC_IDENTIFICATION_INDEX], "LC_IDENTIFICATION"));
2011# endif
2012# ifdef USE_LOCALE_MEASUREMENT
2013 assert(categories[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT);
2014 assert(strEQ(category_names[LC_MEASUREMENT_INDEX], "LC_MEASUREMENT"));
2015# endif
2016# ifdef USE_LOCALE_PAPER
2017 assert(categories[LC_PAPER_INDEX] == LC_PAPER);
2018 assert(strEQ(category_names[LC_PAPER_INDEX], "LC_PAPER"));
2019# endif
2020# ifdef USE_LOCALE_TELEPHONE
2021 assert(categories[LC_TELEPHONE_INDEX] == LC_TELEPHONE);
2022 assert(strEQ(category_names[LC_TELEPHONE_INDEX], "LC_TELEPHONE"));
2023# endif
948523db
KW
2024# ifdef LC_ALL
2025 assert(categories[LC_ALL_INDEX] == LC_ALL);
2026 assert(strEQ(category_names[LC_ALL_INDEX], "LC_ALL"));
2027 assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX);
2028# endif
2029# endif /* DEBUGGING */
47280b20
KW
2030
2031 /* Initialize the cache of the program's UTF-8ness for the always known
2032 * locales C and POSIX */
2033 my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness,
2034 sizeof(PL_locale_utf8ness));
2035
3ca88433
KW
2036 PL_numeric_radix_sv = newSVpvs(".");
2037
b56c4436 2038# ifdef LOCALE_ENVIRON_REQUIRED
98994639
HS
2039
2040 /*
2041 * Ultrix setlocale(..., "") fails if there are no environment
2042 * variables from which to get a locale name.
2043 */
2044
b56c4436
KW
2045# ifndef LC_ALL
2046# error Ultrix without LC_ALL not implemented
2047# else
7d4bcc4a 2048
b56c4436
KW
2049 {
2050 bool done = FALSE;
ea92aad8
KW
2051 if (lang) {
2052 sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, setlocale_init);
2053 DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result[LC_ALL_INDEX]);
2054 if (sl_result[LC_ALL_INDEX])
2055 done = TRUE;
2056 else
e5f10d49 2057 setlocale_failure = TRUE;
ea92aad8
KW
2058 }
2059 if (! setlocale_failure) {
2060 const char * locale_param;
2061 for (i = 0; i < LC_ALL_INDEX; i++) {
2062 locale_param = (! done && (lang || PerlEnv_getenv(category_names[i])))
2063 ? setlocale_init
2064 : NULL;
2065 sl_result[i] = do_setlocale_r(categories[i], locale_param);
2066 if (! sl_result[i]) {
2067 setlocale_failure = TRUE;
2068 }
2069 DEBUG_LOCALE_INIT(categories[i], locale_param, sl_result[i]);
e5f10d49 2070 }
c835d6be 2071 }
7d4bcc4a
KW
2072 }
2073
2074# endif /* LC_ALL */
e5f10d49 2075# endif /* LOCALE_ENVIRON_REQUIRED */
98994639 2076
65ebb059 2077 /* We try each locale in the list until we get one that works, or exhaust
20a240df
KW
2078 * the list. Normally the loop is executed just once. But if setting the
2079 * locale fails, inside the loop we add fallback trials to the array and so
2080 * will execute the loop multiple times */
c3fcd832
KW
2081 trial_locales[0] = setlocale_init;
2082 trial_locales_count = 1;
7d4bcc4a 2083
65ebb059
KW
2084 for (i= 0; i < trial_locales_count; i++) {
2085 const char * trial_locale = trial_locales[i];
2086
2087 if (i > 0) {
2088
2089 /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
2090 * when i==0, but I (khw) don't think that behavior makes much
2091 * sense */
2092 setlocale_failure = FALSE;
2093
7d4bcc4a 2094# ifdef SYSTEM_DEFAULT_LOCALE
291a84fb 2095# ifdef WIN32 /* Note that assumes Win32 has LC_ALL */
7d4bcc4a 2096
65ebb059
KW
2097 /* On Windows machines, an entry of "" after the 0th means to use
2098 * the system default locale, which we now proceed to get. */
2099 if (strEQ(trial_locale, "")) {
2100 unsigned int j;
2101
2102 /* Note that this may change the locale, but we are going to do
2103 * that anyway just below */
837ce802 2104 system_default_locale = do_setlocale_c(LC_ALL, "");
5d1187d1 2105 DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
65ebb059 2106
7d4bcc4a 2107 /* Skip if invalid or if it's already on the list of locales to
65ebb059
KW
2108 * try */
2109 if (! system_default_locale) {
2110 goto next_iteration;
2111 }
2112 for (j = 0; j < trial_locales_count; j++) {
2113 if (strEQ(system_default_locale, trial_locales[j])) {
2114 goto next_iteration;
2115 }
2116 }
2117
2118 trial_locale = system_default_locale;
2119 }
ec0202b5
KW
2120# else
2121# error SYSTEM_DEFAULT_LOCALE only implemented for Win32
2122# endif
7d4bcc4a 2123# endif /* SYSTEM_DEFAULT_LOCALE */
291a84fb
KW
2124
2125 } /* For i > 0 */
65ebb059 2126
7d4bcc4a
KW
2127# ifdef LC_ALL
2128
948523db
KW
2129 sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, trial_locale);
2130 DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result[LC_ALL_INDEX]);
2131 if (! sl_result[LC_ALL_INDEX]) {
49c85077 2132 setlocale_failure = TRUE;
7cd8b568
KW
2133 }
2134 else {
2135 /* Since LC_ALL succeeded, it should have changed all the other
2136 * categories it can to its value; so we massage things so that the
2137 * setlocales below just return their category's current values.
2138 * This adequately handles the case in NetBSD where LC_COLLATE may
2139 * not be defined for a locale, and setting it individually will
7d4bcc4a 2140 * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
7cd8b568
KW
2141 * the POSIX locale. */
2142 trial_locale = NULL;
2143 }
7d4bcc4a
KW
2144
2145# endif /* LC_ALL */
98994639 2146
e5f10d49
KW
2147 if (! setlocale_failure) {
2148 unsigned int j;
2149 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
2150 curlocales[j]
2151 = savepv(do_setlocale_r(categories[j], trial_locale));
2152 if (! curlocales[j]) {
2153 setlocale_failure = TRUE;
2154 }
2155 DEBUG_LOCALE_INIT(categories[j], trial_locale, curlocales[j]);
2156 }
c835d6be 2157
e5f10d49
KW
2158 if (! setlocale_failure) { /* All succeeded */
2159 break; /* Exit trial_locales loop */
49c85077 2160 }
65ebb059 2161 }
98994639 2162
49c85077
KW
2163 /* Here, something failed; will need to try a fallback. */
2164 ok = 0;
65ebb059 2165
49c85077
KW
2166 if (i == 0) {
2167 unsigned int j;
98994639 2168
65ebb059 2169 if (locwarn) { /* Output failure info only on the first one */
7d4bcc4a
KW
2170
2171# ifdef LC_ALL
98994639 2172
49c85077
KW
2173 PerlIO_printf(Perl_error_log,
2174 "perl: warning: Setting locale failed.\n");
98994639 2175
7d4bcc4a 2176# else /* !LC_ALL */
98994639 2177
49c85077
KW
2178 PerlIO_printf(Perl_error_log,
2179 "perl: warning: Setting locale failed for the categories:\n\t");
7d4bcc4a 2180
e5f10d49
KW
2181 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
2182 if (! curlocales[j]) {
2183 PerlIO_printf(Perl_error_log, category_names[j]);
2184 }
2185 else {
2186 Safefree(curlocales[j]);
2187 }
2188 }
7d4bcc4a 2189
7d4bcc4a 2190# endif /* LC_ALL */
98994639 2191
49c85077
KW
2192 PerlIO_printf(Perl_error_log,
2193 "perl: warning: Please check that your locale settings:\n");
98994639 2194
7d4bcc4a
KW
2195# ifdef __GLIBC__
2196
49c85077
KW
2197 PerlIO_printf(Perl_error_log,
2198 "\tLANGUAGE = %c%s%c,\n",
2199 language ? '"' : '(',
2200 language ? language : "unset",
2201 language ? '"' : ')');
7d4bcc4a 2202# endif
98994639 2203
49c85077
KW
2204 PerlIO_printf(Perl_error_log,
2205 "\tLC_ALL = %c%s%c,\n",
2206 lc_all ? '"' : '(',
2207 lc_all ? lc_all : "unset",
2208 lc_all ? '"' : ')');
98994639 2209
7d4bcc4a
KW
2210# if defined(USE_ENVIRON_ARRAY)
2211
49c85077 2212 {
cd999af9 2213 char **e;
d5e32b93
KW
2214
2215 /* Look through the environment for any variables of the
2216 * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
2217 * already handled above. These are assumed to be locale
2218 * settings. Output them and their values. */
cd999af9 2219 for (e = environ; *e; e++) {
d5e32b93
KW
2220 const STRLEN prefix_len = sizeof("LC_") - 1;
2221 STRLEN uppers_len;
2222
cd999af9 2223 if ( strBEGINs(*e, "LC_")
c8b388b0 2224 && ! strBEGINs(*e, "LC_ALL=")
d5e32b93
KW
2225 && (uppers_len = strspn(*e + prefix_len,
2226 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
2227 && ((*e)[prefix_len + uppers_len] == '='))
cd999af9
KW
2228 {
2229 PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
d5e32b93
KW
2230 (int) (prefix_len + uppers_len), *e,
2231 *e + prefix_len + uppers_len + 1);
cd999af9
KW
2232 }
2233 }
49c85077 2234 }
7d4bcc4a
KW
2235
2236# else
2237
49c85077
KW
2238 PerlIO_printf(Perl_error_log,
2239 "\t(possibly more locale environment variables)\n");
7d4bcc4a
KW
2240
2241# endif
98994639 2242
49c85077
KW
2243 PerlIO_printf(Perl_error_log,
2244 "\tLANG = %c%s%c\n",
2245 lang ? '"' : '(',
2246 lang ? lang : "unset",
2247 lang ? '"' : ')');
98994639 2248
49c85077
KW
2249 PerlIO_printf(Perl_error_log,
2250 " are supported and installed on your system.\n");
2251 }
98994639 2252
65ebb059 2253 /* Calculate what fallback locales to try. We have avoided this
f6bab5f6 2254 * until we have to, because failure is quite unlikely. This will
65ebb059
KW
2255 * usually change the upper bound of the loop we are in.
2256 *
2257 * Since the system's default way of setting the locale has not
2258 * found one that works, We use Perl's defined ordering: LC_ALL,
2259 * LANG, and the C locale. We don't try the same locale twice, so
2260 * don't add to the list if already there. (On POSIX systems, the
2261 * LC_ALL element will likely be a repeat of the 0th element "",
6b058d42
KW
2262 * but there's no harm done by doing it explicitly.
2263 *
2264 * Note that this tries the LC_ALL environment variable even on
2265 * systems which have no LC_ALL locale setting. This may or may
2266 * not have been originally intentional, but there's no real need
2267 * to change the behavior. */
65ebb059
KW
2268 if (lc_all) {
2269 for (j = 0; j < trial_locales_count; j++) {
2270 if (strEQ(lc_all, trial_locales[j])) {
2271 goto done_lc_all;
2272 }
2273 }
2274 trial_locales[trial_locales_count++] = lc_all;
2275 }
2276 done_lc_all:
98994639 2277
65ebb059
KW
2278 if (lang) {
2279 for (j = 0; j < trial_locales_count; j++) {
2280 if (strEQ(lang, trial_locales[j])) {
2281 goto done_lang;
2282 }
2283 }
2284 trial_locales[trial_locales_count++] = lang;
2285 }
2286 done_lang:
2287
7d4bcc4a
KW
2288# if defined(WIN32) && defined(LC_ALL)
2289
65ebb059
KW
2290 /* For Windows, we also try the system default locale before "C".
2291 * (If there exists a Windows without LC_ALL we skip this because
2292 * it gets too complicated. For those, the "C" is the next
2293 * fallback possibility). The "" is the same as the 0th element of
2294 * the array, but the code at the loop above knows to treat it
2295 * differently when not the 0th */
2296 trial_locales[trial_locales_count++] = "";
7d4bcc4a
KW
2297
2298# endif
65ebb059
KW
2299
2300 for (j = 0; j < trial_locales_count; j++) {
2301 if (strEQ("C", trial_locales[j])) {
2302 goto done_C;
2303 }
2304 }
2305 trial_locales[trial_locales_count++] = "C";
98994639 2306
65ebb059
KW
2307 done_C: ;
2308 } /* end of first time through the loop */
98994639 2309
7d4bcc4a
KW
2310# ifdef WIN32
2311
65ebb059 2312 next_iteration: ;
7d4bcc4a
KW
2313
2314# endif
65ebb059
KW
2315
2316 } /* end of looping through the trial locales */
2317
2318 if (ok < 1) { /* If we tried to fallback */
2319 const char* msg;
2320 if (! setlocale_failure) { /* fallback succeeded */
2321 msg = "Falling back to";
2322 }
2323 else { /* fallback failed */
e5f10d49 2324 unsigned int j;
98994639 2325
65ebb059
KW
2326 /* We dropped off the end of the loop, so have to decrement i to
2327 * get back to the value the last time through */
2328 i--;
98994639 2329
65ebb059
KW
2330 ok = -1;
2331 msg = "Failed to fall back to";
2332
2333 /* To continue, we should use whatever values we've got */
7d4bcc4a 2334
e5f10d49
KW
2335 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
2336 Safefree(curlocales[j]);
2337 curlocales[j] = savepv(do_setlocale_r(categories[j], NULL));
2338 DEBUG_LOCALE_INIT(categories[j], NULL, curlocales[j]);
2339 }
65ebb059
KW
2340 }
2341
2342 if (locwarn) {
2343 const char * description;
2344 const char * name = "";
2345 if (strEQ(trial_locales[i], "C")) {
2346 description = "the standard locale";
2347 name = "C";
2348 }
7d4bcc4a
KW
2349
2350# ifdef SYSTEM_DEFAULT_LOCALE
2351
65ebb059
KW
2352 else if (strEQ(trial_locales[i], "")) {
2353 description = "the system default locale";
2354 if (system_default_locale) {
2355 name = system_default_locale;
2356 }
2357 }
7d4bcc4a
KW
2358
2359# endif /* SYSTEM_DEFAULT_LOCALE */
2360
65ebb059
KW
2361 else {
2362 description = "a fallback locale";
2363 name = trial_locales[i];
2364 }
2365 if (name && strNE(name, "")) {
2366 PerlIO_printf(Perl_error_log,
2367 "perl: warning: %s %s (\"%s\").\n", msg, description, name);
2368 }
2369 else {
2370 PerlIO_printf(Perl_error_log,
2371 "perl: warning: %s %s.\n", msg, description);
2372 }
2373 }
2374 } /* End of tried to fallback */
98994639 2375
e5f10d49
KW
2376 /* Done with finding the locales; update our records */
2377
7d4bcc4a
KW
2378# ifdef USE_LOCALE_CTYPE
2379
948523db 2380 new_ctype(curlocales[LC_CTYPE_INDEX]);
98994639 2381
e5f10d49 2382# endif
7d4bcc4a
KW
2383# ifdef USE_LOCALE_COLLATE
2384
948523db 2385 new_collate(curlocales[LC_COLLATE_INDEX]);
98994639 2386
e5f10d49 2387# endif
7d4bcc4a
KW
2388# ifdef USE_LOCALE_NUMERIC
2389
948523db 2390 new_numeric(curlocales[LC_NUMERIC_INDEX]);
e5f10d49
KW
2391
2392# endif
2393
948523db 2394 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
47280b20
KW
2395
2396# if defined(USE_ITHREADS)
2397
2398 /* This caches whether each category's locale is UTF-8 or not. This
2399 * may involve changing the locale. It is ok to do this at
2400 * initialization time before any threads have started, but not later.
2401 * Caching means that if the program heeds our dictate not to change
2402 * locales in threaded applications, this data will remain valid, and
2403 * it may get queried without changing locales. If the environment is
2404 * such that all categories have the same locale, this isn't needed, as
2405 * the code will not change the locale; but this handles the uncommon
2406 * case where the environment has disparate locales for the categories
2407 * */
2408 (void) _is_cur_LC_category_utf8(categories[i]);
2409
2410# endif
2411
e5f10d49
KW
2412 Safefree(curlocales[i]);
2413 }
b310b053 2414
7d4bcc4a
KW
2415# if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
2416
49c85077 2417 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
50bf02bd
KW
2418 * locale is UTF-8. The call to new_ctype() just above has already
2419 * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
2420 * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
2421 * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
2422 * STDIN, STDOUT, STDERR, _and_ the default open discipline. */
2423 PL_utf8locale = PL_in_utf8_CTYPE_locale;
49c85077 2424
a05d7ebb 2425 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
fde18df1
JH
2426 This is an alternative to using the -C command line switch
2427 (the -C if present will override this). */
2428 {
dd374669 2429 const char *p = PerlEnv_getenv("PERL_UNICODE");
a05d7ebb 2430 PL_unicode = p ? parse_unicode_opts(&p) : 0;
5a22a2bb
NC
2431 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
2432 PL_utf8cache = -1;
b310b053
JH
2433 }
2434
7d4bcc4a 2435# endif
7d4bcc4a
KW
2436# ifdef __GLIBC__
2437
175c4cf9 2438 Safefree(language);
7d4bcc4a
KW
2439
2440# endif
175c4cf9
KW
2441
2442 Safefree(lc_all);
2443 Safefree(lang);
2444
e3305790 2445#endif /* USE_LOCALE */
2fcc0ca9 2446#ifdef DEBUGGING
7d4bcc4a 2447
2fcc0ca9 2448 /* So won't continue to output stuff */
27cdc72e 2449 DEBUG_INITIALIZATION_set(FALSE);
7d4bcc4a 2450
2fcc0ca9
KW
2451#endif
2452
98994639
HS
2453 return ok;
2454}
2455
98994639
HS
2456#ifdef USE_LOCALE_COLLATE
2457
a4a439fb 2458char *
a4a439fb
KW
2459Perl__mem_collxfrm(pTHX_ const char *input_string,
2460 STRLEN len, /* Length of 'input_string' */
2461 STRLEN *xlen, /* Set to length of returned string
2462 (not including the collation index
2463 prefix) */
2464 bool utf8 /* Is the input in UTF-8? */
6696cfa7 2465 )
98994639 2466{
a4a439fb
KW
2467
2468 /* _mem_collxfrm() is a bit like strxfrm() but with two important
2469 * differences. First, it handles embedded NULs. Second, it allocates a bit
2470 * more memory than needed for the transformed data itself. The real
55e5378d 2471 * transformed data begins at offset COLLXFRM_HDR_LEN. *xlen is set to
a4a439fb
KW
2472 * the length of that, and doesn't include the collation index size.
2473 * Please see sv_collxfrm() to see how this is used. */
2474
55e5378d
KW
2475#define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
2476
6696cfa7
KW
2477 char * s = (char *) input_string;
2478 STRLEN s_strlen = strlen(input_string);
79f120c8 2479 char *xbuf = NULL;
55e5378d 2480 STRLEN xAlloc; /* xalloc is a reserved word in VC */
17f41037 2481 STRLEN length_in_chars;
c664130f 2482 bool first_time = TRUE; /* Cleared after first loop iteration */
98994639 2483
a4a439fb
KW
2484 PERL_ARGS_ASSERT__MEM_COLLXFRM;
2485
2486 /* Must be NUL-terminated */
2487 assert(*(input_string + len) == '\0');
7918f24d 2488
79f120c8
KW
2489 /* If this locale has defective collation, skip */
2490 if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
c7202dee
KW
2491 DEBUG_L(PerlIO_printf(Perl_debug_log,
2492 "_mem_collxfrm: locale's collation is defective\n"));
79f120c8
KW
2493 goto bad;
2494 }
2495
6696cfa7
KW
2496 /* Replace any embedded NULs with the control that sorts before any others.
2497 * This will give as good as possible results on strings that don't
2498 * otherwise contain that character, but otherwise there may be
2499 * less-than-perfect results with that character and NUL. This is
fdc080f3 2500 * unavoidable unless we replace strxfrm with our own implementation. */
fd43f63c
KW
2501 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
2502 NUL */
6696cfa7
KW
2503 char * e = s + len;
2504 char * sans_nuls;
fdc080f3 2505 STRLEN sans_nuls_len;
94762aa0 2506 int try_non_controls;
afc4976f
KW
2507 char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
2508 making sure 2nd byte is NUL.
2509 */
2510 STRLEN this_replacement_len;
2511
1e4c9676
KW
2512 /* If we don't know what non-NUL control character sorts lowest for
2513 * this locale, find it */
f28f4d2a 2514 if (PL_strxfrm_NUL_replacement == '\0') {
6696cfa7 2515 int j;
afc4976f 2516 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
6696cfa7
KW
2517 includes the collation index
2518 prefixed. */
2519
91c0e2e0 2520 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
94762aa0
KW
2521
2522 /* Unlikely, but it may be that no control will work to replace
1e4c9676
KW
2523 * NUL, in which case we instead look for any character. Controls
2524 * are preferred because collation order is, in general, context
2525 * sensitive, with adjoining characters affecting the order, and
2526 * controls are less likely to have such interactions, allowing the
2527 * NUL-replacement to stand on its own. (Another way to look at it
2528 * is to imagine what would happen if the NUL were replaced by a
2529 * combining character; it wouldn't work out all that well.) */
94762aa0
KW
2530 for (try_non_controls = 0;
2531 try_non_controls < 2;
2532 try_non_controls++)
2533 {
d4ff9586
KW
2534 /* Look through all legal code points (NUL isn't) */
2535 for (j = 1; j < 256; j++) {
2536 char * x; /* j's xfrm plus collation index */
2537 STRLEN x_len; /* length of 'x' */
2538 STRLEN trial_len = 1;
736a4fed 2539 char cur_source[] = { '\0', '\0' };
d4ff9586 2540
736a4fed
KW
2541 /* Skip non-controls the first time through the loop. The
2542 * controls in a UTF-8 locale are the L1 ones */
afc4976f
KW
2543 if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
2544 ? ! isCNTRL_L1(j)
2545 : ! isCNTRL_LC(j))
2546 {
d4ff9586 2547 continue;
6696cfa7 2548 }
6696cfa7 2549
736a4fed
KW
2550 /* Create a 1-char string of the current code point */
2551 cur_source[0] = (char) j;
2552
d4ff9586
KW
2553 /* Then transform it */
2554 x = _mem_collxfrm(cur_source, trial_len, &x_len,
afc4976f 2555 0 /* The string is not in UTF-8 */);
6696cfa7 2556
1e4c9676 2557 /* Ignore any character that didn't successfully transform.
d4ff9586
KW
2558 * */
2559 if (! x) {
2560 continue;
2561 }
6696cfa7 2562
d4ff9586
KW
2563 /* If this character's transformation is lower than
2564 * the current lowest, this one becomes the lowest */
2565 if ( cur_min_x == NULL
2566 || strLT(x + COLLXFRM_HDR_LEN,
2567 cur_min_x + COLLXFRM_HDR_LEN))
2568 {
f28f4d2a 2569 PL_strxfrm_NUL_replacement = j;
d4ff9586 2570 cur_min_x = x;
d4ff9586
KW
2571 }
2572 else {
2573 Safefree(x);
2574 }
1e4c9676 2575 } /* end of loop through all 255 characters */
6696cfa7 2576
1e4c9676 2577 /* Stop looking if found */
94762aa0
KW
2578 if (cur_min_x) {
2579 break;
2580 }
2581
2582 /* Unlikely, but possible, if there aren't any controls that
2583 * work in the locale, repeat the loop, looking for any
2584 * character that works */
2585 DEBUG_L(PerlIO_printf(Perl_debug_log,
2586 "_mem_collxfrm: No control worked. Trying non-controls\n"));
1e4c9676 2587 } /* End of loop to try first the controls, then any char */
6696cfa7 2588
94762aa0
KW
2589 if (! cur_min_x) {
2590 DEBUG_L(PerlIO_printf(Perl_debug_log,
2591 "_mem_collxfrm: Couldn't find any character to replace"
2592 " embedded NULs in locale %s with", PL_collation_name));
2593 goto bad;
58eebef2
KW
2594 }
2595
94762aa0
KW
2596 DEBUG_L(PerlIO_printf(Perl_debug_log,
2597 "_mem_collxfrm: Replacing embedded NULs in locale %s with "
f28f4d2a 2598 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
94762aa0 2599
6696cfa7 2600 Safefree(cur_min_x);
1e4c9676 2601 } /* End of determining the character that is to replace NULs */
afc4976f
KW
2602
2603 /* If the replacement is variant under UTF-8, it must match the
291a84fb 2604 * UTF8-ness of the original */
f28f4d2a
KW
2605 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
2606 this_replacement_char[0] =
2607 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
2608 this_replacement_char[1] =
2609 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
afc4976f
KW
2610 this_replacement_len = 2;
2611 }
2612 else {
f28f4d2a 2613 this_replacement_char[0] = PL_strxfrm_NUL_replacement;
afc4976f
KW
2614 /* this_replacement_char[1] = '\0' was done at initialization */
2615 this_replacement_len = 1;
6696cfa7
KW
2616 }
2617
2618 /* The worst case length for the replaced string would be if every
2619 * character in it is NUL. Multiply that by the length of each
2620 * replacement, and allow for a trailing NUL */
afc4976f 2621 sans_nuls_len = (len * this_replacement_len) + 1;
fdc080f3 2622 Newx(sans_nuls, sans_nuls_len, char);
6696cfa7
KW
2623 *sans_nuls = '\0';
2624
6696cfa7
KW
2625 /* Replace each NUL with the lowest collating control. Loop until have
2626 * exhausted all the NULs */
2627 while (s + s_strlen < e) {
6069d6c5 2628 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
2629
2630 /* Do the actual replacement */
6069d6c5 2631 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
6696cfa7
KW
2632
2633 /* Move past the input NUL */
2634 s += s_strlen + 1;
2635 s_strlen = strlen(s);
2636 }
2637
2638 /* And add anything that trails the final NUL */
6069d6c5 2639 my_strlcat(sans_nuls, s, sans_nuls_len);
6696cfa7
KW
2640
2641 /* Switch so below we transform this modified string */
2642 s = sans_nuls;
2643 len = strlen(s);
1e4c9676 2644 } /* End of replacing NULs */
6696cfa7 2645
a4a439fb
KW
2646 /* Make sure the UTF8ness of the string and locale match */
2647 if (utf8 != PL_in_utf8_COLLATE_locale) {
2648 const char * const t = s; /* Temporary so we can later find where the
2649 input was */
2650
2651 /* Here they don't match. Change the string's to be what the locale is
2652 * expecting */
2653
2654 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
2655 s = (char *) bytes_to_utf8((const U8 *) s, &len);
2656 utf8 = TRUE;
2657 }
2658 else { /* locale is not UTF-8; but input is; downgrade the input */
2659
2660 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
2661
2662 /* If the downgrade was successful we are done, but if the input
2663 * contains things that require UTF-8 to represent, have to do
2664 * damage control ... */
2665 if (UNLIKELY(utf8)) {
2666
2667 /* What we do is construct a non-UTF-8 string with
2668 * 1) the characters representable by a single byte converted
2669 * to be so (if necessary);
2670 * 2) and the rest converted to collate the same as the
2671 * highest collating representable character. That makes
2672 * them collate at the end. This is similar to how we
2673 * handle embedded NULs, but we use the highest collating
2674 * code point instead of the smallest. Like the NUL case,
2675 * this isn't perfect, but is the best we can reasonably
2676 * do. Every above-255 code point will sort the same as
2677 * the highest-sorting 0-255 code point. If that code
2678 * point can combine in a sequence with some other code
2679 * points for weight calculations, us changing something to
2680 * be it can adversely affect the results. But in most
2681 * cases, it should work reasonably. And note that this is
2682 * really an illegal situation: using code points above 255
2683 * on a locale where only 0-255 are valid. If two strings
2684 * sort entirely equal, then the sort order for the
2685 * above-255 code points will be in code point order. */
2686
2687 utf8 = FALSE;
2688
2689 /* If we haven't calculated the code point with the maximum
2690 * collating order for this locale, do so now */
2691 if (! PL_strxfrm_max_cp) {
2692 int j;
2693
2694 /* The current transformed string that collates the
2695 * highest (except it also includes the prefixed collation
2696 * index. */
2697 char * cur_max_x = NULL;
2698
2699 /* Look through all legal code points (NUL isn't) */
2700 for (j = 1; j < 256; j++) {
2701 char * x;
2702 STRLEN x_len;
736a4fed 2703 char cur_source[] = { '\0', '\0' };
a4a439fb 2704
736a4fed
KW
2705 /* Create a 1-char string of the current code point */
2706 cur_source[0] = (char) j;
a4a439fb
KW
2707
2708 /* Then transform it */
2709 x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
2710
2711 /* If something went wrong (which it shouldn't), just
2712 * ignore this code point */
94762aa0 2713 if (! x) {
a4a439fb
KW
2714 continue;
2715 }
2716
2717 /* If this character's transformation is higher than
2718 * the current highest, this one becomes the highest */
2719 if ( cur_max_x == NULL
55e5378d
KW
2720 || strGT(x + COLLXFRM_HDR_LEN,
2721 cur_max_x + COLLXFRM_HDR_LEN))
a4a439fb
KW
2722 {
2723 PL_strxfrm_max_cp = j;
2724 cur_max_x = x;
2725 }
2726 else {
2727 Safefree(x);
2728 }
2729 }
2730
94762aa0
KW
2731 if (! cur_max_x) {
2732 DEBUG_L(PerlIO_printf(Perl_debug_log,
2733 "_mem_collxfrm: Couldn't find any character to"
2734 " replace above-Latin1 chars in locale %s with",
2735 PL_collation_name));
2736 goto bad;
2737 }
2738
58eebef2
KW
2739 DEBUG_L(PerlIO_printf(Perl_debug_log,
2740 "_mem_collxfrm: highest 1-byte collating character"
2741 " in locale %s is 0x%02X\n",
2742 PL_collation_name,
2743 PL_strxfrm_max_cp));
58eebef2 2744
a4a439fb
KW
2745 Safefree(cur_max_x);
2746 }
2747
2748 /* Here we know which legal code point collates the highest.
2749 * We are ready to construct the non-UTF-8 string. The length
2750 * will be at least 1 byte smaller than the input string
2751 * (because we changed at least one 2-byte character into a
2752 * single byte), but that is eaten up by the trailing NUL */
2753 Newx(s, len, char);
2754
2755 {
2756 STRLEN i;
2757 STRLEN d= 0;
042d9e50 2758 char * e = (char *) t + len;
a4a439fb
KW
2759
2760 for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
2761 U8 cur_char = t[i];
2762 if (UTF8_IS_INVARIANT(cur_char)) {
2763 s[d++] = cur_char;
2764 }
042d9e50 2765 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
a4a439fb
KW
2766 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
2767 }
2768 else { /* Replace illegal cp with highest collating
2769 one */
2770 s[d++] = PL_strxfrm_max_cp;
2771 }
2772 }
2773 s[d++] = '\0';
2774 Renew(s, d, char); /* Free up unused space */
2775 }
2776 }
2777 }
2778
2779 /* Here, we have constructed a modified version of the input. It could
2780 * be that we already had a modified copy before we did this version.
2781 * If so, that copy is no longer needed */
2782 if (t != input_string) {
2783 Safefree(t);
2784 }
2785 }
2786
17f41037
KW
2787 length_in_chars = (utf8)
2788 ? utf8_length((U8 *) s, (U8 *) s + len)
2789 : len;
2790
59c018b9
KW
2791 /* The first element in the output is the collation id, used by
2792 * sv_collxfrm(); then comes the space for the transformed string. The
2793 * equation should give us a good estimate as to how much is needed */
55e5378d 2794 xAlloc = COLLXFRM_HDR_LEN
a4a439fb 2795 + PL_collxfrm_base
17f41037 2796 + (PL_collxfrm_mult * length_in_chars);
a02a5408 2797 Newx(xbuf, xAlloc, char);
c7202dee
KW
2798 if (UNLIKELY(! xbuf)) {
2799 DEBUG_L(PerlIO_printf(Perl_debug_log,
2800 "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
98994639 2801 goto bad;
c7202dee 2802 }
98994639 2803
d35fca5f 2804 /* Store the collation id */
98994639 2805 *(U32*)xbuf = PL_collation_ix;
d35fca5f
KW
2806
2807 /* Then the transformation of the input. We loop until successful, or we
2808 * give up */
4ebeff16 2809 for (;;) {
1adab0a7 2810
55e5378d 2811 *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
4ebeff16
KW
2812
2813 /* If the transformed string occupies less space than we told strxfrm()
2814 * was available, it means it successfully transformed the whole
2815 * string. */
55e5378d 2816 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
17f41037 2817
1adab0a7
KW
2818 /* Some systems include a trailing NUL in the returned length.
2819 * Ignore it, using a loop in case multiple trailing NULs are
2820 * returned. */
2821 while ( (*xlen) > 0
2822 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
2823 {
2824 (*xlen)--;
2825 }
2826
17f41037
KW
2827 /* If the first try didn't get it, it means our prediction was low.
2828 * Modify the coefficients so that we predict a larger value in any
2829 * future transformations */
2830 if (! first_time) {
2831 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */
2832 STRLEN computed_guess = PL_collxfrm_base
2833 + (PL_collxfrm_mult * length_in_chars);
e1c30f0c
KW
2834
2835 /* On zero-length input, just keep current slope instead of
2836 * dividing by 0 */
2837 const STRLEN new_m = (length_in_chars != 0)
2838 ? needed / length_in_chars
2839 : PL_collxfrm_mult;
17f41037
KW
2840
2841 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
2842 "%s: %d: initial size of %zu bytes for a length "
2843 "%zu string was insufficient, %zu needed\n",
17f41037 2844 __FILE__, __LINE__,
b07929e4 2845 computed_guess, length_in_chars, needed));
17f41037
KW
2846
2847 /* If slope increased, use it, but discard this result for
2848 * length 1 strings, as we can't be sure that it's a real slope
2849 * change */
2850 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
7d4bcc4a
KW
2851
2852# ifdef DEBUGGING
2853
17f41037
KW
2854 STRLEN old_m = PL_collxfrm_mult;
2855 STRLEN old_b = PL_collxfrm_base;
7d4bcc4a
KW
2856
2857# endif
2858
17f41037
KW
2859 PL_collxfrm_mult = new_m;
2860 PL_collxfrm_base = 1; /* +1 For trailing NUL */
2861 computed_guess = PL_collxfrm_base
2862 + (PL_collxfrm_mult * length_in_chars);
2863 if (computed_guess < needed) {
2864 PL_collxfrm_base += needed - computed_guess;
2865 }
2866
2867 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4
KW
2868 "%s: %d: slope is now %zu; was %zu, base "
2869 "is now %zu; was %zu\n",
17f41037 2870 __FILE__, __LINE__,
b07929e4
KW
2871 PL_collxfrm_mult, old_m,
2872 PL_collxfrm_base, old_b));
17f41037
KW
2873 }
2874 else { /* Slope didn't change, but 'b' did */
2875 const STRLEN new_b = needed
2876 - computed_guess
2877 + PL_collxfrm_base;
2878 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
b07929e4 2879 "%s: %d: base is now %zu; was %zu\n",
17f41037 2880 __FILE__, __LINE__,
b07929e4 2881 new_b, PL_collxfrm_base));
17f41037
KW
2882 PL_collxfrm_base = new_b;
2883 }
2884 }
2885
4ebeff16
KW
2886 break;
2887 }
bb0f664e 2888
c7202dee
KW
2889 if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
2890 DEBUG_L(PerlIO_printf(Perl_debug_log,
2891 "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
2892 *xlen, PERL_INT_MAX));
4ebeff16 2893 goto bad;
c7202dee 2894 }
d35fca5f 2895
c664130f 2896 /* A well-behaved strxfrm() returns exactly how much space it needs
1adab0a7
KW
2897 * (usually not including the trailing NUL) when it fails due to not
2898 * enough space being provided. Assume that this is the case unless
2899 * it's been proven otherwise */
c664130f 2900 if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
55e5378d 2901 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
c664130f
KW
2902 }
2903 else { /* Here, either:
2904 * 1) The strxfrm() has previously shown bad behavior; or
2905 * 2) It isn't the first time through the loop, which means
2906 * that the strxfrm() is now showing bad behavior, because
2907 * we gave it what it said was needed in the previous
2908 * iteration, and it came back saying it needed still more.
2909 * (Many versions of cygwin fit this. When the buffer size
2910 * isn't sufficient, they return the input size instead of
2911 * how much is needed.)
d4ff9586
KW
2912 * Increase the buffer size by a fixed percentage and try again.
2913 * */
6ddd902c 2914 xAlloc += (xAlloc / 4) + 1;
c664130f 2915 PL_strxfrm_is_behaved = FALSE;
c664130f 2916
7d4bcc4a
KW
2917# ifdef DEBUGGING
2918
58eebef2
KW
2919 if (DEBUG_Lv_TEST || debug_initialization) {
2920 PerlIO_printf(Perl_debug_log,
2921 "_mem_collxfrm required more space than previously calculated"
b07929e4 2922 " for locale %s, trying again with new guess=%d+%zu\n",
58eebef2 2923 PL_collation_name, (int) COLLXFRM_HDR_LEN,
b07929e4 2924 xAlloc - COLLXFRM_HDR_LEN);
58eebef2 2925 }
7d4bcc4a
KW
2926
2927# endif
2928
58eebef2 2929 }
c664130f 2930
4ebeff16 2931 Renew(xbuf, xAlloc, char);
c7202dee
KW
2932 if (UNLIKELY(! xbuf)) {
2933 DEBUG_L(PerlIO_printf(Perl_debug_log,
2934 "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
4ebeff16 2935 goto bad;
c7202dee 2936 }
c664130f
KW
2937
2938 first_time = FALSE;
4ebeff16 2939 }
98994639 2940
6696cfa7 2941
7d4bcc4a
KW
2942# ifdef DEBUGGING
2943
58eebef2 2944 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee
KW
2945
2946 print_collxfrm_input_and_return(s, s + len, xlen, utf8);
2947 PerlIO_printf(Perl_debug_log, "Its xfrm is:");
7e2f38b2
KW
2948 PerlIO_printf(Perl_debug_log, "%s\n",
2949 _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
2950 *xlen, 1));
58eebef2 2951 }
7d4bcc4a
KW
2952
2953# endif
58eebef2 2954
3c5f993e 2955 /* Free up unneeded space; retain ehough for trailing NUL */
55e5378d 2956 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
98994639 2957
6696cfa7
KW
2958 if (s != input_string) {
2959 Safefree(s);
98994639
HS
2960 }
2961
98994639
HS
2962 return xbuf;
2963
2964 bad:
2965 Safefree(xbuf);
6696cfa7
KW
2966 if (s != input_string) {
2967 Safefree(s);
2968 }
98994639 2969 *xlen = 0;
7d4bcc4a
KW
2970
2971# ifdef DEBUGGING
2972
58eebef2 2973 if (DEBUG_Lv_TEST || debug_initialization) {
c7202dee 2974 print_collxfrm_input_and_return(s, s + len, NULL, utf8);
58eebef2 2975 }
7d4bcc4a
KW
2976
2977# endif
2978
98994639
HS
2979 return NULL;
2980}
2981
7d4bcc4a 2982# ifdef DEBUGGING
c7202dee 2983
4cbaac56 2984STATIC void
c7202dee
KW
2985S_print_collxfrm_input_and_return(pTHX_
2986 const char * const s,
2987 const char * const e,
2988 const STRLEN * const xlen,
2989 const bool is_utf8)
2990{
c7202dee
KW
2991
2992 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
2993
511e4ff7
DM
2994 PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
2995 (UV)PL_collation_ix);
c7202dee 2996 if (xlen) {
08b6dc1d 2997 PerlIO_printf(Perl_debug_log, "%zu", *xlen);
c7202dee
KW
2998 }
2999 else {
3000 PerlIO_printf(Perl_debug_log, "NULL");
3001 }
3002 PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
3003 PL_collation_name);
9c8a6dc2
KW
3004 print_bytes_for_locale(s, e, is_utf8);
3005
3006 PerlIO_printf(Perl_debug_log, "'\n");
3007}
3008
3009STATIC void
3010S_print_bytes_for_locale(pTHX_
3011 const char * const s,
3012 const char * const e,
3013 const bool is_utf8)
3014{
3015 const char * t = s;
3016 bool prev_was_printable = TRUE;
3017 bool first_time = TRUE;
3018
3019 PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
c7202dee
KW
3020
3021 while (t < e) {
3022 UV cp = (is_utf8)
3023 ? utf8_to_uvchr_buf((U8 *) t, e, NULL)
3024 : * (U8 *) t;
3025 if (isPRINT(cp)) {
3026 if (! prev_was_printable) {
3027 PerlIO_printf(Perl_debug_log, " ");
3028 }
3029 PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
3030 prev_was_printable = TRUE;
3031 }
3032 else {
3033 if (! first_time) {
3034 PerlIO_printf(Perl_debug_log, " ");
3035 }
147e3846 3036 PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
c7202dee
KW
3037 prev_was_printable = FALSE;
3038 }
3039 t += (is_utf8) ? UTF8SKIP(t) : 1;
3040 first_time = FALSE;
3041 }
c7202dee
KW
3042}
3043
7d4bcc4a 3044# endif /* #ifdef DEBUGGING */
98994639 3045#endif /* USE_LOCALE_COLLATE */
58eebef2 3046
8ef6e574
KW
3047#ifdef USE_LOCALE
3048
962aa53f
KW
3049STATIC const char *
3050S_switch_category_locale_to_template(pTHX_ const int switch_category, const int template_category, const char * template_locale)
3051{
3052 /* Changes the locale for LC_'switch_category" to that of
3053 * LC_'template_category', if they aren't already the same. If not NULL,
3054 * 'template_locale' is the locale that 'template_category' is in.
3055 *
3056 * Returns the original locale for 'switch_category' so can be switched
3057 * back to with the companion function restore_switched_locale(), (NULL if
3058 * no restoral is necessary.) */
3059
3060 char * restore_to_locale = NULL;
3061
3062 if (switch_category == template_category) { /* No changes needed */
3063 return NULL;
3064 }
3065
3066 /* Find the original locale of the category we may need to change, so that
3067 * it can be restored to later */
d6189047
KW
3068 restore_to_locale = stdize_locale(savepv(do_setlocale_r(switch_category,
3069 NULL)));
962aa53f
KW
3070 if (! restore_to_locale) {
3071 Perl_croak(aTHX_
3072 "panic: %s: %d: Could not find current %s locale, errno=%d\n",
3073 __FILE__, __LINE__, category_name(switch_category), errno);
3074 }
962aa53f
KW
3075
3076 /* If the locale of the template category wasn't passed in, find it now */
3077 if (template_locale == NULL) {
3078 template_locale = do_setlocale_r(template_category, NULL);
3079 if (! template_locale) {
3080 Perl_croak(aTHX_
3081 "panic: %s: %d: Could not find current %s locale, errno=%d\n",
3082 __FILE__, __LINE__, category_name(template_category), errno);
3083 }
3084 }
3085
3086 /* It the locales are the same, there's nothing to do */
3087 if (strEQ(restore_to_locale, template_locale)) {
3088 Safefree(restore_to_locale);
3089
3090 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n",
3091 category_name(switch_category), restore_to_locale));
3092
3093 return NULL;
3094 }
3095
3096 /* Finally, change the locale to the template one */
3097 if (! do_setlocale_r(switch_category, template_locale)) {
3098 Perl_croak(aTHX_
3099 "panic: %s: %d: Could not change %s locale to %s, errno=%d\n",
3100 __FILE__, __LINE__, category_name(switch_category),
3101 template_locale, errno);
3102 }
3103
3104 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale switched to %s\n",
3105 category_name(switch_category), template_locale));
3106
3107 return restore_to_locale;
3108}
3109
3110STATIC void
3111S_restore_switched_locale(pTHX_ const int category, const char * const original_locale)
3112{
3113 /* Restores the locale for LC_'category' to 'original_locale', or do
3114 * nothing if the latter parameter is NULL */
3115
3116 if (original_locale == NULL) {
3117 return;
3118 }
3119
3120 if (! do_setlocale_r(category, original_locale)) {
3121 Perl_croak(aTHX_
3122 "panic: %s: %d: setlocale %s restore to %s failed, errno=%d\n",
3123 __FILE__, __LINE__,
3124 category_name(category), original_locale, errno);
3125 }
3126
3127 Safefree(original_locale);
3128}
3129
c1284011
KW
3130bool
3131Perl__is_cur_LC_category_utf8(pTHX_ int category)
7d74bb61
KW
3132{
3133 /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
3134 * otherwise. 'category' may not be LC_ALL. If the platform doesn't have
119ee68b 3135 * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
609548d2
KW
3136 * could give the wrong result. The result will very likely be correct for
3137 * languages that have commonly used non-ASCII characters, but for notably
3138 * English, it comes down to if the locale's name ends in something like
3139 * "UTF-8". It errs on the side of not being a UTF-8 locale. */
7d74bb61 3140
47280b20 3141 /* Name of current locale corresponding to the input category */
8de4332b 3142 const char *save_input_locale = NULL;
47280b20
KW
3143
3144 bool is_utf8 = FALSE; /* The return value */
7d74bb61 3145
47280b20
KW
3146 /* The variables below are for the cache of previous lookups using this
3147 * function. The cache is a C string, described at the definition for
3148 * 'C_and_POSIX_utf8ness'.
3149 *
3150 * The first part of the cache is fixed, for the C and POSIX locales. The
3151 * varying part starts just after them. */
3152 char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness);
3153
3154 Size_t utf8ness_cache_size; /* Size of the varying portion */
3155 Size_t input_name_len; /* Length in bytes of save_input_locale */
3156 Size_t input_name_len_with_overhead; /* plus extra chars used to store
3157 the name in the cache */
3158 char * delimited; /* The name plus the delimiters used to store
3159 it in the cache */
3160 char * name_pos; /* position of 'delimited' in the cache, or 0
3161 if not there */
3162
3163
7d4bcc4a
KW
3164# ifdef LC_ALL
3165
7d74bb61 3166 assert(category != LC_ALL);
7d4bcc4a
KW
3167
3168# endif
7d74bb61 3169
47280b20 3170 /* Get the desired category's locale */
d6189047 3171 save_input_locale = stdize_locale(savepv(do_setlocale_r(category, NULL)));
7d74bb61 3172 if (! save_input_locale) {
47280b20 3173 Perl_croak(aTHX_
d707d779
KW
3174 "panic: %s: %d: Could not find current %s locale, errno=%d\n",
3175 __FILE__, __LINE__, category_name(category), errno);
7d74bb61 3176 }
47280b20 3177
47280b20
KW
3178 DEBUG_L(PerlIO_printf(Perl_debug_log,
3179 "Current locale for %s is %s\n",
3180 category_name(category), save_input_locale));
3181
3182 input_name_len = strlen(save_input_locale);
3183
3184 /* In our cache, each name is accompanied by two delimiters and a single
3185 * utf8ness digit */
3186 input_name_len_with_overhead = input_name_len + 3;
3187
3188 /* Allocate and populate space for a copy of the name surrounded by the
3189 * delimiters */
3190 Newx(delimited, input_name_len_with_overhead, char);
3191 delimited[0] = UTF8NESS_SEP[0];
3192 Copy(save_input_locale, delimited + 1, input_name_len, char);
3193 delimited[input_name_len+1] = UTF8NESS_PREFIX[0];
3194 delimited[input_name_len+2] = '\0';
3195
3196 /* And see if that is in the cache */
3197 name_pos = instr(PL_locale_utf8ness, delimited);
3198 if (name_pos) {
3199 is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0';
3200
3201# ifdef DEBUGGING
3202
3203 if (DEBUG_Lv_TEST || debug_initialization) {
3204 PerlIO_printf(Perl_debug_log, "UTF8ness for locale %s=%d, \n",
3205 save_input_locale, is_utf8);
3206 }
3207
3208# endif
3209
3210 /* And, if not already in that position, move it to the beginning of
3211 * the non-constant portion of the list, since it is the most recently
3212 * used. (We don't have to worry about overflow, since just moving
3213 * existing names around) */
3214 if (name_pos > utf8ness_cache) {
3215 Move(utf8ness_cache,
3216 utf8ness_cache + input_name_len_with_overhead,
3217 name_pos - utf8ness_cache, char);
3218 Copy(delimited,
3219 utf8ness_cache,
3220 input_name_len_with_overhead - 1, char);
3221 utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
3222 }
3223
3224 Safefree(delimited);
b07fffd1 3225 Safefree(save_input_locale);
47280b20 3226 return is_utf8;
7d74bb61
KW
3227 }
3228
47280b20
KW
3229 /* Here we don't have stored the utf8ness for the input locale. We have to
3230 * calculate it */
3231
94646a69
KW
3232# if defined(USE_LOCALE_CTYPE) \
3233 && ( (defined(HAS_NL_LANGINFO) && defined(CODESET)) \
3234 || (defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)))
7d74bb61 3235
0dec74cd 3236 {
962aa53f
KW
3237 const char *original_ctype_locale
3238 = switch_category_locale_to_template(LC_CTYPE,
3239 category,
3240 save_input_locale);
69014004 3241
7d74bb61 3242 /* Here the current LC_CTYPE is set to the locale of the category whose
0dec74cd 3243 * information is desired. This means that nl_langinfo() and mbtowc()
1d958db2 3244 * should give the correct results */
119ee68b 3245
0dec74cd
KW
3246# ifdef MB_CUR_MAX /* But we can potentially rule out UTF-8ness, avoiding
3247 calling the functions if we have this */
3248
3249 /* Standard UTF-8 needs at least 4 bytes to represent the maximum
3250 * Unicode code point. */
3251
3252 DEBUG_L(PerlIO_printf(Perl_debug_log, "\tMB_CUR_MAX=%d\n",
3253 (int) MB_CUR_MAX));
3254 if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) {
3255 is_utf8 = FALSE;
b5b2847c
KW
3256 restore_switched_locale(LC_CTYPE, original_ctype_locale);
3257 goto finish_and_return;
0dec74cd
KW
3258 }
3259
3260# endif
7d4bcc4a
KW
3261# if defined(HAS_NL_LANGINFO) && defined(CODESET)
3262
0dec74cd
KW
3263 { /* The task is easiest if the platform has this POSIX 2001 function.
3264 Except on some platforms it can wrongly return "", so have to have
3265 a fallback. And it can return that it's UTF-8, even if there are
3266 variances from that. For example, Turkish locales may use the
3267 alternate dotted I rules, and sometimes it appears to be a
3268 defective locale definition. XXX We should probably check for
3269 these in the Latin1 range and warn */
c70a3e68
KW
3270 const char *codeset = my_nl_langinfo(PERL_CODESET, FALSE);
3271 /* FALSE => already in dest locale */
119ee68b 3272
c70a3e68
KW
3273 DEBUG_L(PerlIO_printf(Perl_debug_log,
3274 "\tnllanginfo returned CODESET '%s'\n", codeset));
3275
3276 if (codeset && strNE(codeset, "")) {
1d958db2 3277
1d075173
KW
3278 /* If the implementation of foldEQ() somehow were
3279 * to change to not go byte-by-byte, this could
3280 * read past end of string, as only one length is
3281 * checked. But currently, a premature NUL will
3282 * compare false, and it will stop there */
3283 is_utf8 = cBOOL( foldEQ(codeset, STR_WITH_LEN("UTF-8"))
3284 || foldEQ(codeset, STR_WITH_LEN("UTF8")));
1d958db2 3285
69014004
KW
3286 DEBUG_L(PerlIO_printf(Perl_debug_log,
3287 "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
3288 codeset, is_utf8));
b5b2847c
KW
3289 restore_switched_locale(LC_CTYPE, original_ctype_locale);
3290 goto finish_and_return;
1d958db2 3291 }
119ee68b
KW
3292 }
3293
7d4bcc4a 3294# endif
94646a69 3295# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
0dec74cd
KW
3296 /* We can see if this is a UTF-8-like locale if have mbtowc(). It was a
3297 * late adder to C89, so very likely to have it. However, testing has
3298 * shown that, like nl_langinfo() above, there are locales that are not
3299 * strictly UTF-8 that this will return that they are */
69014004 3300
0dec74cd 3301 {
119ee68b 3302 wchar_t wc;
51fc4b19
KW
3303 int len;
3304
94646a69
KW
3305# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS)
3306
3307 mbstate_t ps;
3308
3309# endif
3310
3311 /* mbrtowc() and mbtowc() convert a byte string to a wide
3312 * character. Feed a byte string to one of them and check that the
3313 * result is the expected Unicode code point */
3314
3315# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS)
3316 /* Prefer this function if available, as it's reentrant */
3317
3318 memset(&ps, 0, sizeof(ps));;
3319 PERL_UNUSED_RESULT(mbrtowc(&wc, NULL, 0, &ps)); /* Reset any shift
3320 state */
3321 errno = 0;
3322 len = mbrtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8), &ps);
3323
3324# else
0dec74cd 3325
856b881c 3326 PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
69014004 3327 errno = 0;
b1d4925c 3328 len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
51fc4b19 3329
94646a69
KW
3330# endif
3331
0dec74cd
KW
3332 DEBUG_L(PerlIO_printf(Perl_debug_log,
3333 "\treturn from mbtowc; len=%d; code_point=%x; errno=%d\n",
3334 len, (unsigned int) wc, errno));
b1d4925c 3335
0dec74cd
KW
3336 is_utf8 = cBOOL( len == STRLENs(REPLACEMENT_CHARACTER_UTF8)
3337 && wc == (wchar_t) UNICODE_REPLACEMENT);
119ee68b 3338 }
7d4bcc4a 3339
962aa53f 3340 restore_switched_locale(LC_CTYPE, original_ctype_locale);
47280b20 3341 goto finish_and_return;
7d74bb61 3342 }
119ee68b 3343
0dec74cd
KW
3344# endif
3345# else
7d74bb61 3346
0dec74cd
KW
3347 /* Here, we must have a C89 compiler that doesn't have mbtowc(). Next
3348 * try looking at the currency symbol to see if it disambiguates
3349 * things. Often that will be in the native script, and if the symbol
3350 * isn't in UTF-8, we know that the locale isn't. If it is non-ASCII
3351 * UTF-8, we infer that the locale is too, as the odds of a non-UTF8
3352 * string being valid UTF-8 are quite small */
fa9b773e 3353
660ee27b
KW
3354# ifdef USE_LOCALE_MONETARY
3355
3356 /* If have LC_MONETARY, we can look at the currency symbol. Often that
3357 * will be in the native script. We do this one first because there is
3358 * just one string to examine, so potentially avoids work */
7d4bcc4a 3359
9db16864
KW
3360 {
3361 const char *original_monetary_locale
660ee27b
KW
3362 = switch_category_locale_to_template(LC_MONETARY,
3363 category,
3364 save_input_locale);
9db16864 3365 bool only_ascii = FALSE;
660ee27b
KW
3366 const U8 * currency_string
3367 = (const U8 *) my_nl_langinfo(PERL_CRNCYSTR, FALSE);
3368 /* 2nd param not relevant for this item */
3369 const U8 * first_variant;
fa9b773e 3370
660ee27b
KW
3371 assert( *currency_string == '-'
3372 || *currency_string == '+'
3373 || *currency_string == '.');
97f4de96 3374
660ee27b 3375 currency_string++;
fa9b773e 3376
660ee27b 3377 if (is_utf8_invariant_string_loc(currency_string, 0, &first_variant))
9db16864
KW
3378 {
3379 DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3380 only_ascii = TRUE;
3381 }
3382 else {
660ee27b 3383 is_utf8 = is_strict_utf8_string(first_variant, 0);
9db16864 3384 }
fa9b773e 3385
9db16864 3386 restore_switched_locale(LC_MONETARY, original_monetary_locale);
fa9b773e 3387
9db16864 3388 if (! only_ascii) {
fa9b773e 3389
9db16864
KW
3390 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
3391 * otherwise assume the locale is UTF-8 if and only if the symbol
3392 * is non-ascii UTF-8. */
3393 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
3394 save_input_locale, is_utf8));
3395 goto finish_and_return;
3396 }
13542a67 3397 }
fa9b773e 3398
660ee27b 3399# endif /* USE_LOCALE_MONETARY */
7d4bcc4a 3400# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
15f7e74e 3401
9db16864
KW
3402 /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
3403 * the names of the months and weekdays, timezone, and am/pm indicator */
3404 {
3405 const char *original_time_locale
3406 = switch_category_locale_to_template(LC_TIME,
3407 category,
3408 save_input_locale);
3409 int hour = 10;
3410 bool is_dst = FALSE;
3411 int dom = 1;
3412 int month = 0;
3413 int i;
3414 char * formatted_time;
3415
3416 /* Here the current LC_TIME is set to the locale of the category
3417 * whose information is desired. Look at all the days of the week and
3418 * month names, and the timezone and am/pm indicator for UTF-8 variant
3419 * characters. The first such a one found will tell us if the locale
3420 * is UTF-8 or not */
3421
3422 for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
3423 formatted_time = my_strftime("%A %B %Z %p",
3424 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
3425 if ( ! formatted_time
3426 || is_utf8_invariant_string((U8 *) formatted_time, 0))
3427 {
15f7e74e 3428
9db16864
KW
3429 /* Here, we didn't find a non-ASCII. Try the next time through
3430 * with the complemented dst and am/pm, and try with the next
3431 * weekday. After we have gotten all weekdays, try the next
3432 * month */
3433 is_dst = ! is_dst;
3434 hour = (hour + 12) % 24;
3435 dom++;
3436 if (i > 6) {
3437 month++;
3438 }
3439 continue;
15f7e74e 3440 }
9db16864
KW
3441
3442 /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
3443 * false otherwise. But first, restore LC_TIME to its original
3444 * locale if we changed it */
3445 restore_switched_locale(LC_TIME, original_time_locale);
3446
3447 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
3448 save_input_locale,
3449 is_utf8_string((U8 *) formatted_time, 0)));
3450 is_utf8 = is_utf8_string((U8 *) formatted_time, 0);
3451 goto finish_and_return;
15f7e74e
KW
3452 }
3453
9db16864
KW
3454 /* Falling off the end of the loop indicates all the names were just
3455 * ASCII. Go on to the next test. If we changed it, restore LC_TIME
3456 * to its original locale */
962aa53f 3457 restore_switched_locale(LC_TIME, original_time_locale);
9db16864 3458 DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
15f7e74e
KW
3459 }
3460
7d4bcc4a 3461# endif
15f7e74e 3462
7d4bcc4a 3463# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
855aeb93 3464
9db16864
KW
3465 /* This code is ifdefd out because it was found to not be necessary in testing
3466 * on our dromedary test machine, which has over 700 locales. There, this
3467 * added no value to looking at the currency symbol and the time strings. I
3468 * left it in so as to avoid rewriting it if real-world experience indicates
3469 * that dromedary is an outlier. Essentially, instead of returning abpve if we
3470 * haven't found illegal utf8, we continue on and examine all the strerror()
3471 * messages on the platform for utf8ness. If all are ASCII, we still don't
3472 * know the answer; but otherwise we have a pretty good indication of the
3473 * utf8ness. The reason this doesn't help much is that the messages may not
3474 * have been translated into the locale. The currency symbol and time strings
3475 * are much more likely to have been translated. */
3476 {
3477 int e;
3478 bool non_ascii = FALSE;
3479 const char *original_messages_locale
3480 = switch_category_locale_to_template(LC_MESSAGES,
3481 category,
3482 save_input_locale);
3483 const char * errmsg = NULL;
3484
3485 /* Here the current LC_MESSAGES is set to the locale of the category
3486 * whose information is desired. Look through all the messages. We
3487 * can't use Strerror() here because it may expand to code that
3488 * segfaults in miniperl */
3489
3490 for (e = 0; e <= sys_nerr; e++) {
3491 errno = 0;
3492 errmsg = sys_errlist[e];
3493 if (errno || !errmsg) {
3494 break;
3495 }
3496 errmsg = savepv(errmsg);
3497 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
3498 non_ascii = TRUE;
3499 is_utf8 = is_utf8_string((U8 *) errmsg, 0);
3500 break;
3501 }
855aeb93 3502 }
9db16864 3503 Safefree(errmsg);
855aeb93 3504
9db16864 3505 restore_switched_locale(LC_MESSAGES, original_messages_locale);
855aeb93 3506
9db16864 3507 if (non_ascii) {
5857e934 3508
9db16864
KW
3509 /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
3510 * any non-ascii means it is one; otherwise we assume it isn't */
3511 DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
3512 save_input_locale,
3513 is_utf8));
3514 goto finish_and_return;
3515 }
855aeb93 3516
9db16864
KW
3517 DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3518 }
855aeb93 3519
7d4bcc4a 3520# endif
0dec74cd 3521# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
92c0a900 3522 UTF-8 locale */
7d4bcc4a 3523
97f4de96
KW
3524 /* As a last resort, look at the locale name to see if it matches
3525 * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
3526 * return of setlocale(), is actually defined to be opaque, so we can't
3527 * really rely on the absence of various substrings in the name to indicate
3528 * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
3529 * be a UTF-8 locale. Similarly for the other common names */
3530
0dec74cd 3531 {
c36d8df8 3532 const Size_t final_pos = strlen(save_input_locale) - 1;
0dec74cd 3533
c36d8df8
KW
3534 if (final_pos >= 3) {
3535 const char *name = save_input_locale;
97f4de96 3536
c36d8df8
KW
3537 /* Find next 'U' or 'u' and look from there */
3538 while ((name += strcspn(name, "Uu") + 1)
3539 <= save_input_locale + final_pos - 2)
97f4de96 3540 {
c36d8df8
KW
3541 if ( isALPHA_FOLD_NE(*name, 't')
3542 || isALPHA_FOLD_NE(*(name + 1), 'f'))
3543 {
3544 continue;
3545 }
3546 name += 2;
3547 if (*(name) == '-') {
3548 if ((name > save_input_locale + final_pos - 1)) {
3549 break;
3550 }
3551 name++;
3552 }
3553 if (*(name) == '8') {
3554 DEBUG_L(PerlIO_printf(Perl_debug_log,
3555 "Locale %s ends with UTF-8 in name\n",
3556 save_input_locale));
3557 is_utf8 = TRUE;
3558 goto finish_and_return;
97f4de96 3559 }
97f4de96 3560 }
c36d8df8
KW
3561 DEBUG_L(PerlIO_printf(Perl_debug_log,
3562 "Locale %s doesn't end with UTF-8 in name\n",
3563 save_input_locale));
97f4de96 3564 }
97f4de96 3565
4d8d465a 3566# ifdef WIN32
7d4bcc4a 3567
c36d8df8
KW
3568 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
3569 if (memENDs(save_input_locale, final_pos, "65001")) {
3570 DEBUG_L(PerlIO_printf(Perl_debug_log,
8a0832a1 3571 "Locale %s ends with 65001 in name, is UTF-8 locale\n",
97f4de96 3572 save_input_locale));
c36d8df8
KW
3573 is_utf8 = TRUE;
3574 goto finish_and_return;
3575 }
0dec74cd 3576 }
7d4bcc4a 3577
4d8d465a
KW
3578# endif
3579# endif
97f4de96
KW
3580
3581 /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
3582 * since we are about to return FALSE anyway, there is no point in doing
3583 * this extra work */
7d4bcc4a 3584
0dec74cd 3585# if 0
97f4de96
KW
3586 if (instr(save_input_locale, "8859")) {
3587 DEBUG_L(PerlIO_printf(Perl_debug_log,
3588 "Locale %s has 8859 in name, not UTF-8 locale\n",
3589 save_input_locale));
47280b20
KW
3590 is_utf8 = FALSE;
3591 goto finish_and_return;
97f4de96 3592 }
0dec74cd 3593# endif
97f4de96 3594
69014004
KW
3595 DEBUG_L(PerlIO_printf(Perl_debug_log,
3596 "Assuming locale %s is not a UTF-8 locale\n",
3597 save_input_locale));
47280b20
KW
3598 is_utf8 = FALSE;
3599
0dec74cd
KW
3600# endif /* the code that is compiled when no modern LC_CTYPE */
3601
47280b20
KW
3602 finish_and_return:
3603
3604 /* Cache this result so we don't have to go through all this next time. */
3605 utf8ness_cache_size = sizeof(PL_locale_utf8ness)
3606 - (utf8ness_cache - PL_locale_utf8ness);
3607
3608 /* But we can't save it if it is too large for the total space available */
3609 if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) {
3610 Size_t utf8ness_cache_len = strlen(utf8ness_cache);
3611
3612 /* Here it can fit, but we may need to clear out the oldest cached
3613 * result(s) to do so. Check */
3614 if (utf8ness_cache_len + input_name_len_with_overhead
3615 >= utf8ness_cache_size)
3616 {
3617 /* Here we have to clear something out to make room for this.
3618 * Start looking at the rightmost place where it could fit and find
3619 * the beginning of the entry that extends past that. */
3620 char * cutoff = (char *) my_memrchr(utf8ness_cache,
3621 UTF8NESS_SEP[0],
3622 utf8ness_cache_size
3623 - input_name_len_with_overhead);
3624
3625 assert(cutoff);
3626 assert(cutoff >= utf8ness_cache);
3627
3628 /* This and all subsequent entries must be removed */
3629 *cutoff = '\0';
3630 utf8ness_cache_len = strlen(utf8ness_cache);
3631 }
3632
3633 /* Make space for the new entry */
3634 Move(utf8ness_cache,
3635 utf8ness_cache + input_name_len_with_overhead,
3636 utf8ness_cache_len + 1 /* Incl. trailing NUL */, char);
3637
3638 /* And insert it */
3639 Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char);
3640 utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
3641
3642 if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1]
3643 & (PERL_UINTMAX_T) ~1) != '0')
3644 {
3645 Perl_croak(aTHX_
7fcf9272
KW
3646 "panic: %s: %d: Corrupt utf8ness_cache=%s\nlen=%zu,"
3647 " inserted_name=%s, its_len=%zu\n",
47280b20
KW
3648 __FILE__, __LINE__,
3649 PL_locale_utf8ness, strlen(PL_locale_utf8ness),
3650 delimited, input_name_len_with_overhead);
3651 }
3652 }
3653
3654# ifdef DEBUGGING
3655
3656 if (DEBUG_Lv_TEST || debug_initialization) {
3657 PerlIO_printf(Perl_debug_log,
3658 "PL_locale_utf8ness is now %s; returning %d\n",
3659 PL_locale_utf8ness, is_utf8);
3660 }
3661
3662# endif
3663
3664 Safefree(delimited);
fa9b773e 3665 Safefree(save_input_locale);
47280b20 3666 return is_utf8;
7d74bb61
KW
3667}
3668
8ef6e574 3669#endif
7d74bb61 3670
d6ded950
KW
3671bool
3672Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
3673{
1a4f13e1 3674 dVAR;
d6ded950
KW
3675 /* Internal function which returns if we are in the scope of a pragma that
3676 * enables the locale category 'category'. 'compiling' should indicate if
3677 * this is during the compilation phase (TRUE) or not (FALSE). */
3678
3679 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
3680
3681 SV *categories = cop_hints_fetch_pvs(cop, "locale", 0);
3682 if (! categories || categories == &PL_sv_placeholder) {
3683 return FALSE;
3684 }
3685
3686 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
3687 * a valid unsigned */
3688 assert(category >= -1);
3689 return cBOOL(SvUV(categories) & (1U << (category + 1)));
3690}
3691
2c6ee1a7 3692char *
6ebbc862
KW
3693Perl_my_strerror(pTHX_ const int errnum)
3694{
3695 /* Returns a mortalized copy of the text of the error message associated
3696 * with 'errnum'. It uses the current locale's text unless the platform
3697 * doesn't have the LC_MESSAGES category or we are not being called from
3698 * within the scope of 'use locale'. In the former case, it uses whatever
3699 * strerror returns; in the latter case it uses the text from the C locale.
3700 *
3701 * The function just calls strerror(), but temporarily switches, if needed,
3702 * to the C locale */
3703
3704 char *errstr;
52770946 3705 dVAR;
6ebbc862 3706
52770946 3707#ifndef USE_LOCALE_MESSAGES
6ebbc862 3708
52770946
KW
3709 /* If platform doesn't have messages category, we don't do any switching to
3710 * the C locale; we just use whatever strerror() returns */
3711
3712 errstr = savepv(Strerror(errnum));
3713
3714#else /* Has locale messages */
3715
3716 const bool within_locale_scope = IN_LC(LC_MESSAGES);
2c6ee1a7 3717
7aaa36b1
KW
3718# if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
3719
43cb6651
KW
3720 /* This function is trivial if we don't have to worry about thread safety
3721 * and have strerror_l(), as it handles the switch of locales so we don't
3722 * have to deal with that. We don't have to worry about thread safety if
3723 * this is an unthreaded build, or if strerror_r() is also available. Both
3724 * it and strerror_l() are thread-safe. Plain strerror() isn't thread
3725 * safe. But on threaded builds when strerror_r() is available, the
3726 * apparent call to strerror() below is actually a macro that
3727 * behind-the-scenes calls strerror_r().
3728 */
3729
3730# if ! defined(USE_ITHREADS) || defined(HAS_STRERROR_R)
7aaa36b1
KW
3731
3732 if (within_locale_scope) {
4eb27fc5 3733 errstr = savepv(strerror(errnum));
7aaa36b1
KW
3734 }
3735 else {
4eb27fc5 3736 errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
7aaa36b1
KW
3737 }
3738
43cb6651
KW
3739# else
3740
3741 /* Here we have strerror_l(), but not strerror_r() and we are on a
3742 * threaded-build. We use strerror_l() for everything, constructing a
3743 * locale to pass to it if necessary */
3744
3745 bool do_free = FALSE;
3746 locale_t locale_to_use;
3747
3748 if (within_locale_scope) {
3749 locale_to_use = uselocale((locale_t) 0);
3750 if (locale_to_use == LC_GLOBAL_LOCALE) {
3751 locale_to_use = duplocale(LC_GLOBAL_LOCALE);
3752 do_free = TRUE;
3753 }
3754 }
3755 else { /* Use C locale if not within 'use locale' scope */
3756 locale_to_use = PL_C_locale_obj;
3757 }
3758
3759