locale.c

   1 /*    locale.c
   2  *
   3  *    Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
   4  *    2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  *      A Elbereth Gilthoniel,
  13  *      silivren penna míriel
  14  *      o menel aglar elenath!
  15  *      Na-chaered palan-díriel
  16  *      o galadhremmin ennorath,
  17  *      Fanuilos, le linnathon
  18  *      nef aear, si nef aearon!
  19  *
  20  *     [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
  21  */
  22
  23 /* utility functions for handling locale-specific stuff like what
  24  * character represents the decimal point.
  25  *
  26  * All C programs have an underlying locale.  Perl code generally doesn't pay
  27  * any attention to it except within the scope of a 'use locale'.  For most
  28  * categories, it accomplishes this by just using different operations if it is
  29  * in such scope than if not.  However, various libc functions called by Perl
  30  * are affected by the LC_NUMERIC category, so there are macros in perl.h that
  31  * are used to toggle between the current locale and the C locale depending on
  32  * the desired behavior of those functions at the moment.  And, LC_MESSAGES is
  33  * switched to the C locale for outputting the message unless within the scope
  34  * of 'use locale'.
  35  *
  36  * This code now has multi-thread-safe locale handling on systems that support
  37  * that.  This is completely transparent to most XS code.  On earlier systems,
  38  * it would be possible to emulate thread-safe locales, but this likely would
  39  * involve a lot of locale switching, and would require XS code changes.
  40  * Macros could be written so that the code wouldn't have to know which type of
  41  * system is being used.
  42  *
  43  * Table-driven code is used for simplicity and clarity, as many operations
  44  * differ only in which category is being worked on.  However the system
  45  * categories need not be small contiguous integers, so do not lend themselves
  46  * to table lookup.  Instead we have created our own equivalent values which
  47  * are all small contiguous non-negative integers, and translation functions
  48  * between the two sets.  For category 'LC_foo', the name of our index is
  49  * LC_foo_INDEX_.  Various parallel tables, indexed by these, are used.
  50  *
  51  * Many of the macros and functions in this file have one of the suffixes '_c',
  52  * '_r', or '_i'.  khw found these useful in remembering what type of locale
  53  * category to use as their parameter.  '_r' takes an int category number as
  54  * passed to setlocale(), like LC_ALL, LC_CTYPE, etc.  The 'r' indicates that
  55  * the value isn't known until runtime.  '_c' also indicates such a category
  56  * number, but its value is known at compile time.  These are both converted
  57  * into unsigned indexes into various tables of category information, where the
  58  * real work is generally done.  The tables are generated at compile-time based
  59  * on platform characteristics and Configure options.  They hide from the code
  60  * many of the vagaries of the different locale implementations out there.  You
  61  * may have already guessed that '_i' indicates the parameter is such an
  62  * unsigned index.  Converting from '_r' to '_i' requires run-time lookup.
  63  * '_c' is used to get cpp to do this at compile time.  To avoid the runtime
  64  * expense, the code is structured to use '_r' at the API level, and once
  65  * converted, everything possible is done using the table indexes.
  66  *
  67  * On unthreaded perls, most operations expand out to just the basic
  68  * setlocale() calls.  The same is true on threaded perls on modern Windows
  69  * systems where the same API, after set up, is used for thread-safe locale
  70  * handling.  On other systems, there is a completely different API, specified
  71  * in POSIX 2008, to do thread-safe locales.  On these systems, our
  72  * emulate_setlocale_i() function is used to hide the different API from the
  73  * outside.  This makes it completely transparent to most XS code.
  74  *
  75  * A huge complicating factor is that the LC_NUMERIC category is normally held
  76  * in the C locale, except during those relatively rare times when it needs to
  77  * be in the underlying locale.  There is a bunch of code to accomplish this,
  78  * and to allow easy switches from one state to the other.
  79  *
  80  * z/OS (os390) is an outlier.  Locales really don't work under threads when
  81  * either the radix character isn't a dot, or attempts are made to change
  82  * locales after the first thread is created.  The reason is that IBM has made
  83  * it thread-safe by refusing to change locales (returning failure if
  84  * attempted) any time after an application has called pthread_create() to
  85  * create another thread.  The expectation is that an application will set up
  86  * its locale information before the first fork, and be stable thereafter.  But
  87  * perl toggles LC_NUMERIC if the locale's radix character isn't a dot, as do
  88  * the other toggles, which are less common.
  89  */
  90
  91 /* If the environment says to, we can output debugging information during
  92  * initialization.  This is done before option parsing, and before any thread
  93  * creation, so can be a file-level static.  (Must come before #including
  94  * perl.h) */
  95 #ifdef DEBUGGING
  96 static int debug_initialization = 0;
  97 #  define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
  98 #  define DEBUG_LOCALE_INITIALIZATION_  debug_initialization
  99 #else
 100 #  define debug_initialization 0
 101 #  define DEBUG_INITIALIZATION_set(v)
 102 #endif
 103
 104 #define DEBUG_PRE_STMTS   dSAVE_ERRNO;                                        \
 105     PerlIO_printf(Perl_debug_log, "%s: %" LINE_Tf ": ", __FILE__, __LINE__);
 106 #define DEBUG_POST_STMTS  RESTORE_ERRNO;
 107
 108 #include "EXTERN.h"
 109 #define PERL_IN_LOCALE_C
 110 #include "perl_langinfo.h"
 111 #include "perl.h"
 112
 113 #include "reentr.h"
 114
 115 #ifdef I_WCHAR
 116 #  include <wchar.h>
 117 #endif
 118 #ifdef I_WCTYPE
 119 #  include <wctype.h>
 120 #endif
 121
 122 PERL_STATIC_INLINE const char *
 123 S_mortalized_pv_copy(pTHX_ const char * const pv)
 124 {
 125     PERL_ARGS_ASSERT_MORTALIZED_PV_COPY;
 126
 127     /* Copies the input pv, and arranges for it to be freed at an unspecified
 128      * later time. */
 129
 130     if (pv == NULL) {
 131         return NULL;
 132     }
 133
 134     const char * copy = savepv(pv);
 135     SAVEFREEPV(copy);
 136     return copy;
 137 }
 138
 139
 140 /* Returns the Unix errno portion; ignoring any others.  This is a macro here
 141  * instead of putting it into perl.h, because unclear to khw what should be
 142  * done generally. */
 143 #define GET_ERRNO   saved_errno
 144
 145 /* Default values come from the C locale */
 146 static const char C_codeset[] = "ANSI_X3.4-1968";
 147 static const char C_decimal_point[] = ".";
 148 static const char C_thousands_sep[] = "";
 149
 150 /* Is the C string input 'name' "C" or "POSIX"?  If so, and 'name' is the
 151  * return of setlocale(), then this is extremely likely to be the C or POSIX
 152  * locale.  However, the output of setlocale() is documented to be opaque, but
 153  * the odds are extremely small that it would return these two strings for some
 154  * other locale.  Note that VMS in these two locales includes many non-ASCII
 155  * characters as controls and punctuation (below are hex bytes):
 156  *   cntrl:  84-97 9B-9F
 157  *   punct:  A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
 158  * Oddly, none there are listed as alphas, though some represent alphabetics
 159  * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
 160 #define isNAME_C_OR_POSIX(name)                                              \
 161                              (   (name) != NULL                              \
 162                               && (( *(name) == 'C' && (*(name + 1)) == '\0') \
 163                                    || strEQ((name), "POSIX")))
 164
 165 #if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO)
 166 #  define HAS_SOME_LANGINFO
 167 #endif
 168 #if defined(HAS_LOCALECONV) || defined(HAS_LOCALECONV_L)
 169 #  define HAS_SOME_LOCALECONV
 170 #endif
 171
 172 #define my_langinfo_c(item, category, locale, retbufp, retbuf_sizep, utf8ness) \
 173             my_langinfo_i(item, category##_INDEX_, locale, retbufp,            \
 174                                                       retbuf_sizep,  utf8ness)
 175
 176 #ifdef USE_LOCALE
 177
 178 /* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far
 179  * looked up.  This is in the form of a C string:  */
 180
 181 #  define UTF8NESS_SEP     "\v"
 182 #  define UTF8NESS_PREFIX  "\f"
 183
 184 /* So, the string looks like:
 185  *
 186  *      \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0
 187  *
 188  * where the digit 0 after the \a indicates that the locale starting just
 189  * after the preceding \v is not UTF-8, and the digit 1 mean it is. */
 190
 191 STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1);
 192 STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1);
 193
 194 #  define C_and_POSIX_utf8ness    UTF8NESS_SEP "C"     UTF8NESS_PREFIX "0"    \
 195                                 UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0"
 196
 197 /* The cache is initialized to C_and_POSIX_utf8ness at start up.  These are
 198  * kept there always.  The remining portion of the cache is LRU, with the
 199  * oldest looked-up locale at the tail end */
 200
 201 #  ifdef DEBUGGING
 202 #    define setlocale_debug_string_c(category, locale, result)              \
 203                 setlocale_debug_string_i(category##_INDEX_, locale, result)
 204 #    define setlocale_debug_string_r(category, locale, result)              \
 205              setlocale_debug_string_i(get_category_index(category, locale), \
 206                                       locale, result)
 207 #  endif
 208
 209 #  define toggle_locale_i(index, locale)                                    \
 210                  S_toggle_locale_i(aTHX_ index, locale, __LINE__)
 211 #  define toggle_locale_c(cat, locale)  toggle_locale_i(cat##_INDEX_, locale)
 212 #  define restore_toggled_locale_i(index, locale)                           \
 213                  S_restore_toggled_locale_i(aTHX_ index, locale, __LINE__)
 214 #  define restore_toggled_locale_c(cat, locale)                             \
 215                              restore_toggled_locale_i(cat##_INDEX_, locale)
 216
 217 /* Two parallel arrays indexed by our mapping of category numbers into small
 218  * non-negative indexes; first the locale categories Perl uses on this system,
 219  * used to do the inverse mapping.  The second array is their names.  These
 220  * arrays are in mostly arbitrary order. */
 221
 222 STATIC const int categories[] = {
 223
 224 #    ifdef USE_LOCALE_CTYPE
 225                              LC_CTYPE,
 226 #    endif
 227 #  ifdef USE_LOCALE_NUMERIC
 228                              LC_NUMERIC,
 229 #  endif
 230 #    ifdef USE_LOCALE_COLLATE
 231                              LC_COLLATE,
 232 #    endif
 233 #    ifdef USE_LOCALE_TIME
 234                              LC_TIME,
 235 #    endif
 236 #    ifdef USE_LOCALE_MESSAGES
 237                              LC_MESSAGES,
 238 #    endif
 239 #    ifdef USE_LOCALE_MONETARY
 240                              LC_MONETARY,
 241 #    endif
 242 #    ifdef USE_LOCALE_ADDRESS
 243                              LC_ADDRESS,
 244 #    endif
 245 #    ifdef USE_LOCALE_IDENTIFICATION
 246                              LC_IDENTIFICATION,
 247 #    endif
 248 #    ifdef USE_LOCALE_MEASUREMENT
 249                              LC_MEASUREMENT,
 250 #    endif
 251 #    ifdef USE_LOCALE_PAPER
 252                              LC_PAPER,
 253 #    endif
 254 #    ifdef USE_LOCALE_TELEPHONE
 255                              LC_TELEPHONE,
 256 #    endif
 257 #    ifdef USE_LOCALE_SYNTAX
 258                              LC_SYNTAX,
 259 #    endif
 260 #    ifdef USE_LOCALE_TOD
 261                              LC_TOD,
 262 #    endif
 263 #    ifdef LC_ALL
 264                              LC_ALL,
 265 #    endif
 266
 267    /* Placeholder as a precaution if code fails to check the return of
 268     * get_category_index(), which returns this element to indicate an error */
 269                             -1
 270 };
 271
 272 /* The top-most real element is LC_ALL */
 273
 274 STATIC const char * const category_names[] = {
 275
 276 #    ifdef USE_LOCALE_CTYPE
 277                                  "LC_CTYPE",
 278 #    endif
 279 #  ifdef USE_LOCALE_NUMERIC
 280                                  "LC_NUMERIC",
 281 #  endif
 282 #    ifdef USE_LOCALE_COLLATE
 283                                  "LC_COLLATE",
 284 #    endif
 285 #    ifdef USE_LOCALE_TIME
 286                                  "LC_TIME",
 287 #    endif
 288 #    ifdef USE_LOCALE_MESSAGES
 289                                  "LC_MESSAGES",
 290 #    endif
 291 #    ifdef USE_LOCALE_MONETARY
 292                                  "LC_MONETARY",
 293 #    endif
 294 #    ifdef USE_LOCALE_ADDRESS
 295                                  "LC_ADDRESS",
 296 #    endif
 297 #    ifdef USE_LOCALE_IDENTIFICATION
 298                                  "LC_IDENTIFICATION",
 299 #    endif
 300 #    ifdef USE_LOCALE_MEASUREMENT
 301                                  "LC_MEASUREMENT",
 302 #    endif
 303 #    ifdef USE_LOCALE_PAPER
 304                                  "LC_PAPER",
 305 #    endif
 306 #    ifdef USE_LOCALE_TELEPHONE
 307                                  "LC_TELEPHONE",
 308 #    endif
 309 #    ifdef USE_LOCALE_SYNTAX
 310                                  "LC_SYNTAX",
 311 #    endif
 312 #    ifdef USE_LOCALE_TOD
 313                                  "LC_TOD",
 314 #    endif
 315 #    ifdef LC_ALL
 316                                  "LC_ALL",
 317 #    endif
 318
 319    /* Placeholder as a precaution if code fails to check the return of
 320     * get_category_index(), which returns this element to indicate an error */
 321                                  NULL
 322 };
 323
 324 /* A few categories require additional setup when they are changed.  This table
 325  * points to the functions that do that setup */
 326 STATIC void (*update_functions[]) (pTHX_ const char *) = {
 327 #  ifdef USE_LOCALE_CTYPE
 328                                 S_new_ctype,
 329 #  endif
 330 #  ifdef USE_LOCALE_NUMERIC
 331                                 S_new_numeric,
 332 #  endif
 333 #  ifdef USE_LOCALE_COLLATE
 334                                 S_new_collate,
 335 #  endif
 336 #  ifdef USE_LOCALE_TIME
 337                                 NULL,
 338 #  endif
 339 #  ifdef USE_LOCALE_MESSAGES
 340                                 NULL,
 341 #  endif
 342 #  ifdef USE_LOCALE_MONETARY
 343                                 NULL,
 344 #  endif
 345 #  ifdef USE_LOCALE_ADDRESS
 346                                 NULL,
 347 #  endif
 348 #  ifdef USE_LOCALE_IDENTIFICATION
 349                                 NULL,
 350 #  endif
 351 #  ifdef USE_LOCALE_MEASUREMENT
 352                                 NULL,
 353 #  endif
 354 #  ifdef USE_LOCALE_PAPER
 355                                 NULL,
 356 #  endif
 357 #  ifdef USE_LOCALE_TELEPHONE
 358                                 NULL,
 359 #  endif
 360 #  ifdef USE_LOCALE_SYNTAX
 361                                 NULL,
 362 #  endif
 363 #  ifdef USE_LOCALE_TOD
 364                                 NULL,
 365 #  endif
 366     /* No harm done to have this even without an LC_ALL */
 367                                 S_new_LC_ALL,
 368
 369    /* Placeholder as a precaution if code fails to check the return of
 370     * get_category_index(), which returns this element to indicate an error */
 371                                 NULL
 372 };
 373
 374 #  ifdef LC_ALL
 375
 376     /* On systems with LC_ALL, it is kept in the highest index position.  (-2
 377      * to account for the final unused placeholder element.) */
 378 #    define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
 379 #  else
 380
 381     /* On systems without LC_ALL, we pretend it is there, one beyond the real
 382      * top element, hence in the unused placeholder element. */
 383 #    define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
 384 #  endif
 385
 386 /* Pretending there is an LC_ALL element just above allows us to avoid most
 387  * special cases.  Most loops through these arrays in the code below are
 388  * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'.  They will work
 389  * on either type of system.  But the code must be written to not access the
 390  * element at 'LC_ALL_INDEX_' except on platforms that have it.  This can be
 391  * checked for at compile time by using the #define LC_ALL_INDEX_ which is only
 392  * defined if we do have LC_ALL. */
 393
 394 STATIC unsigned int
 395 S_get_category_index(const int category, const char * locale)
 396 {
 397     /* Given a category, return the equivalent internal index we generally use
 398      * instead.
 399      *
 400      * 'locale' is for use in any generated diagnostics, and may be NULL
 401      *
 402      * Some sort of hash could be used instead of this loop, but the number of
 403      * elements is so far at most 12 */
 404
 405     unsigned int i;
 406     const char * conditional_warn_text = "; can't set it to ";
 407
 408     PERL_ARGS_ASSERT_GET_CATEGORY_INDEX;
 409
 410 #  ifdef LC_ALL
 411     for (i = 0; i <=         LC_ALL_INDEX_; i++)
 412 #  else
 413     for (i = 0; i <  NOMINAL_LC_ALL_INDEX;  i++)
 414 #  endif
 415     {
 416         if (category == categories[i]) {
 417             dTHX_DEBUGGING;
 418             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
 419                                    "index of category %d (%s) is %d\n",
 420                                    category, category_names[i], i));
 421             return i;
 422         }
 423     }
 424
 425     /* Here, we don't know about this category, so can't handle it. */
 426
 427     if (! locale) {
 428         locale = "";
 429         conditional_warn_text = "";
 430     }
 431
 432     /* diag_listed_as: Unknown locale category %d; can't set it to %s */
 433     Perl_warner_nocontext(packWARN(WARN_LOCALE),
 434                           "Unknown locale category %d%s%s",
 435                           category, conditional_warn_text, locale);
 436
 437 #  ifdef EINVAL
 438
 439     SETERRNO(EINVAL, LIB_INVARG);
 440
 441 #  endif
 442
 443     /* Return an out-of-bounds value */
 444     return NOMINAL_LC_ALL_INDEX + 1;
 445 }
 446
 447 STATIC const char *
 448 S_category_name(const int category)
 449 {
 450     unsigned int index;
 451
 452     index = get_category_index(category, NULL);
 453
 454     if (index <= NOMINAL_LC_ALL_INDEX) {
 455         return category_names[index];
 456     }
 457
 458     return Perl_form_nocontext("%d (unknown)", category);
 459 }
 460
 461 #endif /* ifdef USE_LOCALE */
 462
 463 void
 464 Perl_force_locale_unlock()
 465 {
 466
 467 #if defined(USE_LOCALE_THREADS)
 468
 469     dTHX;
 470 #  ifdef LOCALE_UNLOCK_
 471     LOCALE_UNLOCK_;
 472 #  endif
 473
 474 #endif
 475
 476 }
 477
 478 #ifdef USE_POSIX_2008_LOCALE
 479
 480 STATIC locale_t
 481 S_use_curlocale_scratch(pTHX)
 482 {
 483     /* This function is used to hide from the caller the case where the current
 484      * locale_t object in POSIX 2008 is the global one, which is illegal in
 485      * many of the P2008 API calls.  This checks for that and, if necessary
 486      * creates a proper P2008 object.  Any prior object is deleted, as is any
 487      * remaining object during global destruction. */
 488
 489     locale_t cur = uselocale((locale_t) 0);
 490
 491     if (cur != LC_GLOBAL_LOCALE) {
 492         return cur;
 493     }
 494
 495     if (PL_scratch_locale_obj) {
 496         freelocale(PL_scratch_locale_obj);
 497     }
 498
 499     PL_scratch_locale_obj = duplocale(LC_GLOBAL_LOCALE);
 500     return PL_scratch_locale_obj;
 501 }
 502
 503 #endif
 504
 505 void
 506 Perl_locale_panic(const char * msg,
 507                   const char * file_name,
 508                   const line_t line,
 509                   const int errnum)
 510 {
 511     dTHX;
 512
 513     PERL_ARGS_ASSERT_LOCALE_PANIC;
 514
 515     force_locale_unlock();
 516
 517 #ifdef USE_C_BACKTRACE
 518     dump_c_backtrace(Perl_debug_log, 20, 1);
 519 #endif
 520
 521     /* diag_listed_as: panic: %s */
 522     Perl_croak(aTHX_ "%s: %d: panic: %s; errno=%d\n",
 523                      file_name, line, msg, errnum);
 524 }
 525
 526 #define setlocale_failure_panic_c(                                          \
 527                         cat, current, failed, caller_0_line, caller_1_line) \
 528         setlocale_failure_panic_i(cat##_INDEX_, current, failed,            \
 529                         caller_0_line, caller_1_line)
 530
 531 /* porcelain_setlocale() presents a consistent POSIX-compliant interface to
 532  * setlocale().   Windows requres a customized base-level setlocale() */
 533 #ifdef WIN32
 534 #  define porcelain_setlocale(cat, locale) win32_setlocale(cat, locale)
 535 #else
 536 #  define porcelain_setlocale(cat, locale)                              \
 537                                 ((const char *) setlocale(cat, locale))
 538 #endif
 539
 540 /* The next layer up is to catch vagaries and bugs in the libc setlocale return
 541  * value */
 542 #ifdef stdize_locale
 543 #  define stdized_setlocale(cat, locale)                                       \
 544      stdize_locale(cat, porcelain_setlocale(cat, locale),                      \
 545                    &PL_stdize_locale_buf, &PL_stdize_locale_bufsize, __LINE__)
 546 #else
 547 #  define stdized_setlocale(cat, locale)  porcelain_setlocale(cat, locale)
 548 #endif
 549
 550 /* The next many lines form a layer above the close-to-the-metal 'porcelain'
 551  * and 'stdized' macros.  They are used to present a uniform API to the rest of
 552  * the code in this file in spite of the disparate underlying implementations.
 553  * */
 554
 555 #ifndef USE_POSIX_2008_LOCALE
 556
 557 /* For non-threaded perls (which we are not to use the POSIX 2008 API on), or a
 558  * thread-safe Windows one in which threading is invisible to us, the added
 559  * layer just calls the base-level functions.  See the introductory comments in
 560  * this file for the meaning of the suffixes '_c', '_r', '_i'. */
 561
 562 #  define setlocale_r(cat, locale)        stdized_setlocale(cat, locale)
 563 #  define setlocale_i(i, locale)      setlocale_r(categories[i], locale)
 564 #  define setlocale_c(cat, locale)              setlocale_r(cat, locale)
 565
 566 #  define void_setlocale_i(i, locale)                                       \
 567     STMT_START {                                                            \
 568         if (! porcelain_setlocale(categories[i], locale)) {                 \
 569             setlocale_failure_panic_i(i, NULL, locale, __LINE__, 0);        \
 570             NOT_REACHED; /* NOTREACHED */                                   \
 571         }                                                                   \
 572     } STMT_END
 573 #  define void_setlocale_c(cat, locale)                                     \
 574                                   void_setlocale_i(cat##_INDEX_, locale)
 575 #  define void_setlocale_r(cat, locale)                                     \
 576                void_setlocale_i(get_category_index(cat, locale), locale)
 577
 578 #  define bool_setlocale_r(cat, locale)                                     \
 579                                   cBOOL(porcelain_setlocale(cat, locale))
 580 #  define bool_setlocale_i(i, locale)                                       \
 581                                  bool_setlocale_c(categories[i], locale)
 582 #  define bool_setlocale_c(cat, locale)    bool_setlocale_r(cat, locale)
 583
 584 /* All the querylocale...() forms return a mortalized copy.  If you need
 585  * something stable across calls, you need to savepv() the result yourself */
 586
 587 #  define querylocale_r(cat)        mortalized_pv_copy(setlocale_r(cat, NULL))
 588 #  define querylocale_c(cat)        querylocale_r(cat)
 589 #  define querylocale_i(i)          querylocale_c(categories[i])
 590
 591 #else   /* Below is defined(POSIX 2008) */
 592
 593 /* Here, there is a completely different API to get thread-safe locales.  We
 594  * emulate the setlocale() API with our own function(s).  setlocale categories,
 595  * like LC_NUMERIC, are not valid here for the POSIX 2008 API.  Instead, there
 596  * are equivalents, like LC_NUMERIC_MASK, which we use instead, converting to
 597  * by using get_category_index() followed by table lookup. */
 598
 599 #  define emulate_setlocale_c(cat, locale, recalc_LC_ALL, line)             \
 600            emulate_setlocale_i(cat##_INDEX_, locale, recalc_LC_ALL, line)
 601
 602      /* A wrapper for the macros below. */
 603 #  define common_emulate_setlocale(i, locale)                               \
 604                  emulate_setlocale_i(i, locale, YES_RECALC_LC_ALL, __LINE__)
 605
 606 #  define setlocale_i(i, locale)     common_emulate_setlocale(i, locale)
 607 #  define setlocale_c(cat, locale)     setlocale_i(cat##_INDEX_, locale)
 608 #  define setlocale_r(cat, locale)                                          \
 609                     setlocale_i(get_category_index(cat, locale), locale)
 610
 611 #  define void_setlocale_i(i, locale)     ((void) setlocale_i(i, locale))
 612 #  define void_setlocale_c(cat, locale)                                     \
 613                                   void_setlocale_i(cat##_INDEX_, locale)
 614 #  define void_setlocale_r(cat, locale) ((void) setlocale_r(cat, locale))
 615
 616 #  define bool_setlocale_i(i, locale)       cBOOL(setlocale_i(i, locale))
 617 #  define bool_setlocale_c(cat, locale)                                     \
 618                                   bool_setlocale_i(cat##_INDEX_, locale)
 619 #  define bool_setlocale_r(cat, locale)   cBOOL(setlocale_r(cat, locale))
 620
 621 #  define querylocale_i(i)      mortalized_pv_copy(my_querylocale_i(i))
 622 #  define querylocale_c(cat)    querylocale_i(cat##_INDEX_)
 623 #  define querylocale_r(cat)    querylocale_i(get_category_index(cat,NULL))
 624
 625 #  ifndef USE_QUERYLOCALE
 626 #    define USE_PL_CURLOCALES
 627 #  else
 628 #    define isSINGLE_BIT_SET(mask) isPOWER_OF_2(mask)
 629
 630      /* This code used to think querylocale() was valid on LC_ALL.  Make sure
 631       * all instances of that have been removed */
 632 #    define QUERYLOCALE_ASSERT(index)                                       \
 633                         __ASSERT_(isSINGLE_BIT_SET(category_masks[index]))
 634 #    if ! defined(HAS_QUERYLOCALE) && defined(_NL_LOCALE_NAME)
 635 #      define querylocale_l(index, locale_obj)                              \
 636             (QUERYLOCALE_ASSERT(index)                                      \
 637              mortalized_pv_copy(nl_langinfo_l(                              \
 638                          _NL_LOCALE_NAME(categories[index]), locale_obj)))
 639 #    else
 640 #      define querylocale_l(index, locale_obj)                              \
 641         (QUERYLOCALE_ASSERT(index)                                          \
 642          mortalized_pv_copy(querylocale(category_masks[index], locale_obj)))
 643 #    endif
 644 #  endif
 645 #  if defined(__GLIBC__) && defined(USE_LOCALE_MESSAGES)
 646 #    define HAS_GLIBC_LC_MESSAGES_BUG
 647 #    include <libintl.h>
 648 #  endif
 649
 650 /* A fourth array, parallel to the ones above to map from category to its
 651  * equivalent mask */
 652 STATIC const int category_masks[] = {
 653 #  ifdef USE_LOCALE_CTYPE
 654                                 LC_CTYPE_MASK,
 655 #  endif
 656 #  ifdef USE_LOCALE_NUMERIC
 657                                 LC_NUMERIC_MASK,
 658 #  endif
 659 #  ifdef USE_LOCALE_COLLATE
 660                                 LC_COLLATE_MASK,
 661 #  endif
 662 #  ifdef USE_LOCALE_TIME
 663                                 LC_TIME_MASK,
 664 #  endif
 665 #  ifdef USE_LOCALE_MESSAGES
 666                                 LC_MESSAGES_MASK,
 667 #  endif
 668 #  ifdef USE_LOCALE_MONETARY
 669                                 LC_MONETARY_MASK,
 670 #  endif
 671 #  ifdef USE_LOCALE_ADDRESS
 672                                 LC_ADDRESS_MASK,
 673 #  endif
 674 #  ifdef USE_LOCALE_IDENTIFICATION
 675                                 LC_IDENTIFICATION_MASK,
 676 #  endif
 677 #  ifdef USE_LOCALE_MEASUREMENT
 678                                 LC_MEASUREMENT_MASK,
 679 #  endif
 680 #  ifdef USE_LOCALE_PAPER
 681                                 LC_PAPER_MASK,
 682 #  endif
 683 #  ifdef USE_LOCALE_TELEPHONE
 684                                 LC_TELEPHONE_MASK,
 685 #  endif
 686 #  ifdef USE_LOCALE_SYNTAX
 687                                 LC_SYNTAX_MASK,
 688 #  endif
 689 #  ifdef USE_LOCALE_TOD
 690                                 LC_TOD_MASK,
 691 #  endif
 692                                 /* LC_ALL can't be turned off by a Configure
 693                                  * option, and in Posix 2008, should always be
 694                                  * here, so compile it in unconditionally.
 695                                  * This could catch some glitches at compile
 696                                  * time */
 697                                 LC_ALL_MASK,
 698
 699    /* Placeholder as a precaution if code fails to check the return of
 700     * get_category_index(), which returns this element to indicate an error */
 701                                 0
 702 };
 703
 704 #  define my_querylocale_c(cat) my_querylocale_i(cat##_INDEX_)
 705
 706 STATIC const char *
 707 S_my_querylocale_i(pTHX_ const unsigned int index)
 708 {
 709     /* This function returns the name of the locale category given by the input
 710      * index into our parallel tables of them.
 711      *
 712      * POSIX 2008, for some sick reason, chose not to provide a method to find
 713      * the category name of a locale, discarding a basic linguistic tenet that
 714      * for any object, people will create a name for it.  Some vendors have
 715      * created a querylocale() function to do just that.  This function is a
 716      * lot simpler to implement on systems that have this.  Otherwise, we have
 717      * to keep track of what the locale has been set to, so that we can return
 718      * its name so as to emulate setlocale().  It's also possible for C code in
 719      * some library to change the locale without us knowing it, though as of
 720      * September 2017, there are no occurrences in CPAN of uselocale().  Some
 721      * libraries do use setlocale(), but that changes the global locale, and
 722      * threads using per-thread locales will just ignore those changes. */
 723
 724     int category;
 725     const locale_t cur_obj = uselocale((locale_t) 0);
 726     const char * retval;
 727
 728     PERL_ARGS_ASSERT_MY_QUERYLOCALE_I;
 729     assert(index <= NOMINAL_LC_ALL_INDEX);
 730
 731     category = categories[index];
 732
 733     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "my_querylocale_i(%s) on %p\n",
 734                                            category_names[index], cur_obj));
 735         if (cur_obj == LC_GLOBAL_LOCALE) {
 736         retval = porcelain_setlocale(category, NULL);
 737         }
 738     else {
 739
 740 #  ifdef USE_QUERYLOCALE
 741
 742         /* We don't currently keep records when there is querylocale(), so have
 743          * to get it anew each time */
 744         retval = (index == LC_ALL_INDEX_)
 745                  ? calculate_LC_ALL(cur_obj)
 746                  : querylocale_l(index, cur_obj);
 747
 748 #  else
 749
 750         /* But we do have up-to-date values when we keep our own records */
 751         retval = PL_curlocales[index];
 752
 753 #  endif
 754
 755             }
 756
 757                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
 758                            "my_querylocale_i(%s) returning '%s'\n",
 759                            category_names[index], retval));
 760     return retval;
 761 }
 762
 763 #  ifdef USE_PL_CURLOCALES
 764
 765 STATIC const char *
 766 S_update_PL_curlocales_i(pTHX_
 767                          const unsigned int index,
 768                          const char * new_locale,
 769                          recalc_lc_all_t recalc_LC_ALL)
 770 {
 771     /* This is a helper function for emulate_setlocale_i(), mostly used to
 772      * make that function easier to read. */
 773
 774     PERL_ARGS_ASSERT_UPDATE_PL_CURLOCALES_I;
 775     assert(index <= NOMINAL_LC_ALL_INDEX);
 776
 777     if (index == LC_ALL_INDEX_) {
 778         unsigned int i;
 779
 780         /* For LC_ALL, we change all individual categories to correspond */
 781                          /* PL_curlocales is a parallel array, so has same
 782                           * length as 'categories' */
 783         for (i = 0; i < LC_ALL_INDEX_; i++) {
 784             Safefree(PL_curlocales[i]);
 785             PL_curlocales[i] = savepv(new_locale);
 786         }
 787
 788         recalc_LC_ALL = YES_RECALC_LC_ALL;
 789     }
 790     else {
 791
 792         /* Update the single category's record */
 793         Safefree(PL_curlocales[index]);
 794         PL_curlocales[index] = savepv(new_locale);
 795
 796         if (recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION) {
 797             recalc_LC_ALL = (index == NOMINAL_LC_ALL_INDEX - 1)
 798                             ? YES_RECALC_LC_ALL
 799                             : DONT_RECALC_LC_ALL;
 800         }
 801     }
 802
 803     if (recalc_LC_ALL == YES_RECALC_LC_ALL) {
 804         Safefree(PL_curlocales[LC_ALL_INDEX_]);
 805         PL_curlocales[LC_ALL_INDEX_] =
 806                                     savepv(calculate_LC_ALL(PL_curlocales));
 807     }
 808
 809     return PL_curlocales[index];
 810 }
 811
 812 #  endif  /* Need PL_curlocales[] */
 813
 814 STATIC const char *
 815 S_setlocale_from_aggregate_LC_ALL(pTHX_ const char * locale, const line_t line)
 816 {
 817     /* This function parses the value of the LC_ALL locale, assuming glibc
 818      * syntax, and sets each individual category on the system to the proper
 819      * value.
 820      *
 821      * This is likely to only ever be called from one place, so exists to make
 822      * the calling function easier to read by moving this ancillary code out of
 823      * the main line.
 824      *
 825      * The locale for each category is independent of the other categories.
 826      * Often, they are all the same, but certainly not always.  Perl, in fact,
 827      * usually keeps LC_NUMERIC in the C locale, regardless of the underlying
 828      * locale.  LC_ALL has to be able to represent the case of when there are
 829      * varying locales.  Platforms have differing ways of representing this.
 830      * Because of this, the code in this file goes to lengths to avoid the
 831      * issue, generally looping over the component categories instead of
 832      * referring to them in the aggregate, wherever possible.  However, there
 833      * are cases where we have to parse our own constructed aggregates, which use
 834      * the glibc syntax. */
 835
 836     const char * locale_on_entry = querylocale_c(LC_ALL);
 837
 838     PERL_ARGS_ASSERT_SETLOCALE_FROM_AGGREGATE_LC_ALL;
 839
 840     /* If the string that gives what to set doesn't include all categories,
 841      * the omitted ones get set to "C".  To get this behavior, first set
 842      * all the individual categories to "C", and override the furnished
 843      * ones below.  FALSE => No need to recalculate LC_ALL, as this is a
 844      * temporary state */
 845     if (! emulate_setlocale_c(LC_ALL, "C", DONT_RECALC_LC_ALL, line)) {
 846         setlocale_failure_panic_c(LC_ALL, locale_on_entry,
 847                                   "C", __LINE__, line);
 848         NOT_REACHED; /* NOTREACHED */
 849     }
 850
 851     const char * s = locale;
 852     const char * e = locale + strlen(locale);
 853     while (s < e) {
 854         const char * p = s;
 855
 856         /* Parse through the category */
 857         while (isWORDCHAR(*p)) {
 858             p++;
 859         }
 860
 861         const char * category_end = p;
 862
 863         if (*p++ != '=') {
 864             locale_panic_(Perl_form(aTHX_
 865                   "Unexpected character in locale category name '%02X", *(p-1)));
 866         }
 867
 868         /* Parse through the locale name */
 869         const char * name_start = p;
 870         while (p < e && *p != ';') {
 871             if (! isGRAPH(*p)) {
 872                 locale_panic_(Perl_form(aTHX_
 873                               "Unexpected character in locale name '%02X", *p));
 874             }
 875             p++;
 876         }
 877
 878         const char * name_end = p;
 879
 880         /* Space past the semi-colon */
 881         if (p < e) {
 882             p++;
 883         }
 884
 885         /* Find the index of the category name in our lists */
 886         for (PERL_UINT_FAST8_T i = 0; i < LC_ALL_INDEX_; i++) {
 887
 888             /* Keep going if this index doesn't point to the category being
 889              * parsed.  The strnNE() avoids a Perl_form(), but would fail if
 890              * ever a category name could be a substring of another one, e.g.,
 891              * if there were a "LC_TIME_DATE" */
 892             if strnNE(s, category_names[i], category_end - s) {
 893                 continue;
 894             }
 895
 896             /* Here i points to the category being parsed.  Now isolate the
 897              * locale it is being changed to */
 898             const char * individ_locale = Perl_form(aTHX_ "%.*s",
 899                                 (int) (name_end - name_start), name_start);
 900
 901             /* And do the change.  FALSE => Don't recalculate LC_ALL; we'll do
 902              * it ourselves after the loop */
 903             if (! emulate_setlocale_i(i, individ_locale,
 904                                       DONT_RECALC_LC_ALL, line))
 905             {
 906
 907                 /* But if we have to back out, do fix up LC_ALL */
 908                 if (! emulate_setlocale_c(LC_ALL, locale_on_entry,
 909                                           YES_RECALC_LC_ALL, line))
 910                 {
 911                     setlocale_failure_panic_i(i, individ_locale,
 912                                               locale, __LINE__, line);
 913                     NOT_REACHED; /* NOTREACHED */
 914                 }
 915
 916                 /* Reverting to the entry value succeeded, but the operation
 917                  * failed to go to the requested locale. */
 918                 return NULL;
 919             }
 920
 921             /* Found and handled the desired category.  Quit the inner loop to
 922              * try the next category */
 923             break;
 924         }
 925
 926         /* Finished with this category; iterate to the next one in the input */
 927         s = p;
 928     }
 929
 930 #    ifdef USE_PL_CURLOCALES
 931
 932     /* Here we have set all the individual categories.  Update the LC_ALL entry
 933      * as well.  We can't just use the input 'locale' as the value may omit
 934      * categories whose locale is 'C'.  khw thinks it's better to store a
 935      * complete LC_ALL.  So calculate it. */
 936     const char * retval = savepv(calculate_LC_ALL(PL_curlocales));
 937     Safefree(PL_curlocales[LC_ALL_INDEX_]);
 938     PL_curlocales[LC_ALL_INDEX_] = retval;
 939
 940 #    else
 941
 942     const char * retval = querylocale_c(LC_ALL);
 943
 944 #    endif
 945
 946     return retval;
 947 }
 948
 949 #  ifndef USE_QUERYLOCALE
 950
 951 STATIC const char *
 952 S_find_locale_from_environment(pTHX_ const unsigned int index)
 953 {
 954     /* On systems without querylocale(), it is problematic getting the results
 955      * of the POSIX 2008 equivalent of setlocale(category, "") (which gets the
 956      * locale from the environment).
 957      *
 958      * To ensure that we know exactly what those values are, we do the setting
 959      * ourselves, using the documented algorithm (assuming the documentation is
 960      * correct) rather than use "" as the locale.  This will lead to results
 961      * that differ from native behavior if the native behavior differs from the
 962      * standard documented value, but khw believes it is better to know what's
 963      * going on, even if different from native, than to just guess.
 964      *
 965      * Another option would be, in a critical section, to save the global
 966      * locale's current value, and do a straight setlocale(LC_ALL, "").  That
 967      * would return our desired values, destroying the global locale's, which
 968      * we would then restore.  But that could cause races with any other thread
 969      * that is using the global locale and isn't using the mutex.  And, the
 970      * only reason someone would have done that is because they are calling a
 971      * library function, like in gtk, that calls setlocale(), and which can't
 972      * be changed to use the mutex.  That wouldn't be a problem if this were to
 973      * be done before any threads had switched, say during perl construction
 974      * time.  But this code would still be needed for the general case. */
 975
 976     const char * default_name;
 977     unsigned int i;
 978     const char * locale_names[LC_ALL_INDEX_];
 979
 980     /* We rely on PerlEnv_getenv() returning a mortalized copy */
 981     const char * const lc_all = PerlEnv_getenv("LC_ALL");
 982
 983     /* Use any "LC_ALL" environment variable, as it overrides everything
 984      * else. */
 985     if (lc_all && strNE(lc_all, "")) {
 986         return lc_all;
 987     }
 988
 989     /* Otherwise, we need to dig deeper.  Unless overridden, the default is
 990      * the LANG environment variable; "C" if it doesn't exist. */
 991     default_name = PerlEnv_getenv("LANG");
 992     if (! default_name || strEQ(default_name, "")) {
 993         default_name = "C";
 994     }
 995
 996     /* If setting an individual category, use its corresponding value found in
 997      * the environment, if any; otherwise use the default we already
 998      * calculated. */
 999     if (index != LC_ALL_INDEX_) {
1000         const char * const new_value = PerlEnv_getenv(category_names[index]);
1001
1002         return (new_value && strNE(new_value, ""))
1003                 ? new_value
1004                 : default_name;
1005     }
1006
1007     /* Here, we are getting LC_ALL.  Any categories that don't have a
1008      * corresponding environment variable set should be set to 'default_name'
1009      *
1010      * Simply find the values for all categories, and call the function to
1011      * compute LC_ALL. */
1012     for (i = 0; i < LC_ALL_INDEX_; i++) {
1013         const char * const env_override = PerlEnv_getenv(category_names[i]);
1014
1015         locale_names[i] = (env_override && strNE(env_override, ""))
1016                           ? env_override
1017                           : default_name;
1018
1019         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1020                  "find_locale_from_environment i=%d, name=%s, locale=%s\n",
1021                  i, category_names[i], locale_names[i]));
1022     }
1023
1024     return calculate_LC_ALL(locale_names);
1025 }
1026
1027 #  endif
1028
1029 STATIC const char *
1030 S_emulate_setlocale_i(pTHX_
1031
1032         /* Our internal index of the 'category' setlocale is
1033            called with */
1034         const unsigned int index,
1035
1036         const char * new_locale, /* The locale to set the category to */
1037         const recalc_lc_all_t recalc_LC_ALL,  /* Explained below */
1038         const line_t line     /* Called from this line number */
1039        )
1040 {
1041     PERL_ARGS_ASSERT_EMULATE_SETLOCALE_I;
1042     assert(index <= NOMINAL_LC_ALL_INDEX);
1043
1044     /* This function effectively performs a setlocale() on just the current
1045      * thread; thus it is thread-safe.  It does this by using the POSIX 2008
1046      * locale functions to emulate the behavior of setlocale().  Similar to
1047      * regular setlocale(), the return from this function points to memory that
1048      * can be overwritten by other system calls, so needs to be copied
1049      * immediately if you need to retain it.  The difference here is that
1050      * system calls besides another setlocale() can overwrite it.
1051      *
1052      * By doing this, most locale-sensitive functions become thread-safe.  The
1053      * exceptions are mostly those that return a pointer to static memory.
1054      *
1055      * This function may be called in a tight loop that iterates over all
1056      * categories.  Because LC_ALL is not a "real" category, but merely the sum
1057      * of all the other ones, such loops don't include LC_ALL.  On systems that
1058      * have querylocale() or similar, the current LC_ALL value is immediately
1059      * retrievable; on systems lacking that feature, we have to keep track of
1060      * LC_ALL ourselves.  We could do that on each iteration, only to throw it
1061      * away on the next, but the calculation is more than a trivial amount of
1062      * work.  Instead, the 'recalc_LC_ALL' parameter is set to
1063      * RECALCULATE_LC_ALL_ON_FINAL_INTERATION to only do the calculation once.
1064      * This function calls itself recursively in such a loop.
1065      *
1066      * When not in such a loop, the parameter is set to the other enum values
1067      * DONT_RECALC_LC_ALL or YES_RECALC_LC_ALL. */
1068
1069     int mask = category_masks[index];
1070     const locale_t entry_obj = uselocale((locale_t) 0);
1071     const char * locale_on_entry = querylocale_i(index);
1072
1073     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1074              "emulate_setlocale_i input=%d (%s), mask=0x%x,"
1075              " new locale=\"%s\", current locale=\"%s\","
1076              "index=%d, object=%p\n",
1077              categories[index], category_name(categories[index]), mask,
1078              ((new_locale == NULL) ? "(nil)" : new_locale),
1079              locale_on_entry, index, entry_obj));
1080
1081     /* Return the already-calculated info if just querying what the existing
1082      * locale is */
1083     if (new_locale == NULL) {
1084         return locale_on_entry;
1085     }
1086
1087     /* Here, trying to change the locale, but it is a no-op if the new boss is
1088      * the same as the old boss.  Except this routine is called when converting
1089      * from the global locale, so in that case we will create a per-thread
1090      * locale below (with the current values).  Bitter experience also
1091      * indicates that newlocale() can free up the basis locale memory if we
1092      * call it with the new and old being the same. */
1093     if (   entry_obj != LC_GLOBAL_LOCALE
1094         && locale_on_entry
1095         && strEQ(new_locale, locale_on_entry))
1096     {
1097         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1098                  "(%" LINE_Tf "): emulate_setlocale_i"
1099                  " no-op to change to what it already was\n",
1100                  line));
1101
1102 #  ifdef USE_PL_CURLOCALES
1103
1104        /* On the final iteration of a loop that needs to recalculate LC_ALL, do
1105         * so.  If no iteration changed anything, LC_ALL also doesn't change,
1106         * but khw believes the complexity needed to keep track of that isn't
1107         * worth it. */
1108         if (UNLIKELY(   recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION
1109                      && index == NOMINAL_LC_ALL_INDEX - 1))
1110         {
1111             Safefree(PL_curlocales[LC_ALL_INDEX_]);
1112             PL_curlocales[LC_ALL_INDEX_] =
1113                                         savepv(calculate_LC_ALL(PL_curlocales));
1114         }
1115
1116 #  endif
1117
1118         return locale_on_entry;
1119     }
1120
1121 #  ifndef USE_QUERYLOCALE
1122
1123     /* Without a querylocale() mechanism, we have to figure out ourselves what
1124      * happens with setting a locale to "" */
1125     if (strEQ(new_locale, "")) {
1126         new_locale = find_locale_from_environment(index);
1127     }
1128
1129 #  endif
1130
1131     /* So far, it has worked that a semi-colon in the locale name means that
1132      * the category is LC_ALL and it subsumes categories which don't all have
1133      * the same locale.  This is the glibc syntax. */
1134     if (strchr(new_locale, ';')) {
1135         assert(index == LC_ALL_INDEX_);
1136         return setlocale_from_aggregate_LC_ALL(new_locale, line);
1137     }
1138
1139 #  ifdef HAS_GLIBC_LC_MESSAGES_BUG
1140
1141     /* For this bug, if the LC_MESSAGES locale changes, we have to do an
1142      * expensive workaround.  Save the current value so we can later determine
1143      * if it changed. */
1144     const char * old_messages_locale = NULL;
1145     if (   (index == LC_MESSAGES_INDEX_ || index == LC_ALL_INDEX_)
1146         &&  LIKELY(PL_phase != PERL_PHASE_CONSTRUCT))
1147     {
1148         old_messages_locale = querylocale_c(LC_MESSAGES);
1149     }
1150
1151 #  endif
1152
1153     assert(PL_C_locale_obj);
1154
1155     /* Now ready to switch to the input 'new_locale' */
1156
1157     /* Switching locales generally entails freeing the current one's space (at
1158      * the C library's discretion), hence we can't be using that locale at the
1159      * time of the switch (this wasn't obvious to khw from the man pages).  So
1160      * switch to a known locale object that we don't otherwise mess with. */
1161     if (! uselocale(PL_C_locale_obj)) {
1162
1163         /* Not being able to change to the C locale is severe; don't keep
1164          * going.  */
1165         setlocale_failure_panic_i(index, locale_on_entry, "C", __LINE__, line);
1166         NOT_REACHED; /* NOTREACHED */
1167     }
1168
1169     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1170              "(%" LINE_Tf "): emulate_setlocale_i now using C"
1171              " object=%p\n", line, PL_C_locale_obj));
1172
1173     locale_t new_obj;
1174
1175     /* We created a (never changing) object at start-up for LC_ALL being in the
1176      * C locale.  If this call is to switch to LC_ALL=>C, simply use that
1177      * object.  But in fact, we already have switched to it just above, in
1178      * preparation for the general case.  Since we're already there, no need to
1179      * do further switching. */
1180     if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(new_locale)) {
1181         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "(%" LINE_Tf "):"
1182                                                " emulate_setlocale_i will stay"
1183                                                " in C object\n", line));
1184         new_obj = PL_C_locale_obj;
1185
1186         /* And free the old object if it isn't a special one */
1187         if (entry_obj != LC_GLOBAL_LOCALE && entry_obj != PL_C_locale_obj) {
1188             freelocale(entry_obj);
1189         }
1190     }
1191     else {  /* Here is the general case, not to LC_ALL=>C */
1192         locale_t basis_obj = entry_obj;
1193
1194         /* Specially handle two objects */
1195         if (basis_obj == LC_GLOBAL_LOCALE || basis_obj == PL_C_locale_obj) {
1196
1197             /* For these two objects, we make duplicates to hand to newlocale()
1198              * below.  For LC_GLOBAL_LOCALE, this is because newlocale()
1199              * doesn't necessarily accept it as input (the results are
1200              * undefined).  For PL_C_locale_obj, it is so that it never gets
1201              * modified, as otherwise newlocale() is free to do so */
1202             basis_obj = duplocale(entry_obj);
1203             if (! basis_obj) {
1204                 locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): duplocale failed",
1205                                               line));
1206                 NOT_REACHED; /* NOTREACHED */
1207             }
1208
1209             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1210                                    "(%" LINE_Tf "): emulate_setlocale_i"
1211                                    " created %p by duping the input\n",
1212                                    line, basis_obj));
1213         }
1214
1215         /* Ready to create a new locale by modification of the exising one */
1216         new_obj = newlocale(mask, new_locale, basis_obj);
1217
1218         if (! new_obj) {
1219             DEBUG_L(PerlIO_printf(Perl_debug_log,
1220                                   " (%" LINE_Tf "): emulate_setlocale_i"
1221                                   " creating new object from %p failed:"
1222                                   " errno=%d\n",
1223                                   line, basis_obj, GET_ERRNO));
1224
1225             /* Failed.  Likely this is because the proposed new locale isn't
1226              * valid on this system.  But we earlier switched to the LC_ALL=>C
1227              * locale in anticipation of it succeeding,  Now have to switch
1228              * back to the state upon entry */
1229             if (! uselocale(entry_obj)) {
1230                 setlocale_failure_panic_i(index, "switching back to",
1231                                           locale_on_entry, __LINE__, line);
1232                 NOT_REACHED; /* NOTREACHED */
1233             }
1234
1235 #    ifdef USE_PL_CURLOCALES
1236
1237             if (entry_obj == LC_GLOBAL_LOCALE) {
1238                 /* Here, we are back in the global locale.  We may never have
1239                  * set PL_curlocales.  If the locale change had succeeded, the
1240                  * code would have then set them up, but since it didn't, do so
1241                  * here.  khw isn't sure if this prevents some issues or not,
1242                  * but tis is defensive coding.  The system setlocale() returns
1243                  * the desired information.  This will calculate LC_ALL's entry
1244                  * only on the final iteration */
1245                 for (PERL_UINT_FAST8_T i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1246                     update_PL_curlocales_i(i,
1247                                        porcelain_setlocale(categories[i], NULL),
1248                                        RECALCULATE_LC_ALL_ON_FINAL_INTERATION);
1249                 }
1250             }
1251 #    endif
1252
1253             return NULL;
1254         }
1255
1256         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1257                                "(%" LINE_Tf "): emulate_setlocale_i created %p"
1258                                " while freeing %p\n", line, new_obj, basis_obj));
1259
1260         /* Here, successfully created an object representing the desired
1261          * locale; now switch into it */
1262         if (! uselocale(new_obj)) {
1263             freelocale(new_obj);
1264             locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): emulate_setlocale_i"
1265                                           " switching into new locale failed",
1266                                           line));
1267         }
1268     }
1269
1270     /* Here, we are using 'new_obj' which matches the input 'new_locale'. */
1271     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1272              "(%" LINE_Tf "): emulate_setlocale_i now using %p\n", line, new_obj));
1273
1274     /* We are done, except for updating our records (if the system doesn't keep
1275      * them) and in the case of locale "", we don't actually know what the
1276      * locale that got switched to is, as it came from the environment.  So
1277      * have to find it */
1278
1279 #  ifdef USE_QUERYLOCALE
1280
1281     if (strEQ(new_locale, "")) {
1282         new_locale = querylocale_i(index);
1283     }
1284
1285     PERL_UNUSED_ARG(recalc_LC_ALL);
1286
1287 #  else
1288
1289     new_locale = update_PL_curlocales_i(index, new_locale, recalc_LC_ALL);
1290
1291 #  endif
1292 #  ifdef HAS_GLIBC_LC_MESSAGES_BUG
1293
1294     /* Invalidate the glibc cache of loaded translations if the locale has changed,
1295      * see [perl #134264] */
1296     if (old_messages_locale) {
1297         if (strNE(old_messages_locale, my_querylocale_c(LC_MESSAGES))) {
1298             textdomain(textdomain(NULL));
1299         }
1300     }
1301
1302 #  endif
1303
1304     return new_locale;
1305 }
1306
1307 #endif   /* End of the various implementations of the setlocale and
1308             querylocale macros used in the remainder of this program */
1309
1310 #ifdef USE_LOCALE
1311
1312 /* So far, the locale strings returned by modern 2008-compliant systems have
1313  * been fine */
1314
1315 STATIC const char *
1316 S_stdize_locale(pTHX_ const int category,
1317                       const char *input_locale,
1318                       const char **buf,
1319                       Size_t *buf_size,
1320                       const line_t caller_line)
1321 {
1322     /* The return value of setlocale() is opaque, but is required to be usable
1323      * as input to a future setlocale() to create the same state.
1324      * Unfortunately not all systems are compliant.  But most often they are of
1325      * a very restricted set of forms that this file has been coded to expect.
1326      *
1327      * There are some outliers, though, that this function tries to tame:
1328      *
1329      * 1) A new-line.  This function chomps any \n characters
1330      * 2) foo=bar.     'bar' is what is generally meant, and the foo= part is
1331      *                 stripped.  This form is legal for LC_ALL.  When found in
1332      *                 that category group, the function calls itself
1333      *                 recursively on each possible component category to make
1334      *                 sure the individual categories are ok.
1335      *
1336      * If no changes to the input were made, it is returned; otherwise the
1337      * changed version is stored into memory at *buf, with *buf_size set to its
1338      * new value, and *buf is returned.
1339      */
1340
1341     const char * first_bad;
1342     const char * retval;
1343
1344     PERL_ARGS_ASSERT_STDIZE_LOCALE;
1345
1346     if (input_locale == NULL) {
1347         return NULL;
1348     }
1349
1350     first_bad = strpbrk(input_locale, "=\n");
1351
1352     /* Most likely, there isn't a problem with the input */
1353     if (LIKELY(! first_bad)) {
1354         return input_locale;
1355     }
1356
1357 #    ifdef LC_ALL
1358
1359     /* But if there is, and the category is LC_ALL, we have to look at each
1360      * component category */
1361     if (category == LC_ALL) {
1362         const char * individ_locales[LC_ALL_INDEX_];
1363         bool made_changes = FALSE;
1364         unsigned int i;
1365
1366         for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1367             Size_t this_size = 0;
1368             individ_locales[i] = stdize_locale(categories[i],
1369                                                porcelain_setlocale(categories[i],
1370                                                                    NULL),
1371                                                &individ_locales[i],
1372                                                &this_size,
1373                                                caller_line);
1374
1375             /* If the size didn't change, it means this category did not have
1376              * to be adjusted, and individ_locales[i] points to the buffer
1377              * returned by porcelain_setlocale(); we have to copy that before
1378              * it's called again in the next iteration */
1379             if (this_size == 0) {
1380                 individ_locales[i] = savepv(individ_locales[i]);
1381             }
1382             else {
1383                 made_changes = TRUE;
1384             }
1385         }
1386
1387         /* If all the individual categories were ok as-is, this was a false
1388          * alarm.  We must have seen an '=' which was a legal occurrence in
1389          * this combination locale */
1390         if (! made_changes) {
1391             retval = input_locale;  /* The input can be returned unchanged */
1392         }
1393         else {
1394             retval = save_to_buffer(querylocale_c(LC_ALL), buf, buf_size);
1395         }
1396
1397         for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1398             Safefree(individ_locales[i]);
1399         }
1400
1401         return retval;
1402     }
1403
1404 #    else   /* else no LC_ALL */
1405
1406     PERL_UNUSED_ARG(category);
1407     PERL_UNUSED_ARG(caller_line);
1408
1409 #    endif
1410
1411     /* Here, there was a problem in an individual category.  This means that at
1412      * least one adjustment will be necessary.  Create a modifiable copy */
1413     retval = save_to_buffer(input_locale, buf, buf_size);
1414
1415     if (*first_bad != '=') {
1416
1417         /* Translate the found position into terms of the copy */
1418         first_bad = retval + (first_bad - input_locale);
1419     }
1420     else { /* An '=' */
1421
1422         /* It is unlikely that the return is so screwed-up that it contains
1423          * multiple equals signs, but handle that case by stripping all of
1424          * them.  */
1425         const char * final_equals = strrchr(retval, '=');
1426
1427         /* The length passed here causes the move to include the terminating
1428          * NUL */
1429         Move(final_equals + 1, retval, strlen(final_equals), char);
1430
1431         /* See if there are additional problems; if not, we're good to return.
1432          * */
1433         first_bad = strpbrk(retval, "\n");
1434
1435         if (! first_bad) {
1436             return retval;
1437         }
1438     }
1439
1440     /* Here, the problem must be a \n.  Get rid of it and what follows.
1441      * (Originally, only a trailing \n was stripped.  Unsure what to do if not
1442      * trailing) */
1443     *((char *) first_bad) = '\0';
1444     return retval;
1445 }
1446
1447 #if defined(USE_POSIX_2008_LOCALE)
1448
1449 STATIC
1450 const char *
1451
1452 #  ifdef USE_QUERYLOCALE
1453 S_calculate_LC_ALL(pTHX_ const locale_t cur_obj)
1454 #  else
1455 S_calculate_LC_ALL(pTHX_ const char ** individ_locales)
1456 #  endif
1457
1458 {
1459     /* For POSIX 2008, we have to figure out LC_ALL ourselves when needed.
1460      * querylocale(), on systems that have it, doesn't tend to work for LC_ALL.
1461      * So we have to construct the answer ourselves based on the passed in
1462      * data, which is either a locale_t object, for systems with querylocale(),
1463      * or an array we keep updated to the proper values, otherwise.
1464      *
1465      * This returns a mortalized string containing the locale name(s) of
1466      * LC_ALL.
1467      *
1468      * If all individual categories are the same locale, we can just set LC_ALL
1469      * to that locale.  But if not, we have to create an aggregation of all the
1470      * categories on the system.  Platforms differ as to the syntax they use
1471      * for these non-uniform locales for LC_ALL.  Some use a '/' or other
1472      * delimiter of the locales with a predetermined order of categories; a
1473      * Configure probe would be needed to tell us how to decipher those.  glibc
1474      * uses a series of name=value pairs, like
1475      *      LC_NUMERIC=C;LC_TIME=en_US.UTF-8;...
1476      * The syntax we use for our aggregation doesn't much matter, as we take
1477      * care not to use the native setlocale() function on whatever style is
1478      * chosen.  But, it would be possible for someone to call Perl_setlocale()
1479      * using a native style we don't understand.  So far no one has complained.
1480      *
1481      * For systems that have categories we don't know about, the algorithm
1482      * below won't know about those missing categories, leading to potential
1483      * bugs for code that looks at them.  If there is an environment variable
1484      * that sets that category, we won't know to look for it, and so our use of
1485      * LANG or "C" improperly overrides it.  On the other hand, if we don't do
1486      * what is done here, and there is no environment variable, the category's
1487      * locale should be set to LANG or "C".  So there is no good solution.  khw
1488      * thinks the best is to make sure we have a complete list of possible
1489      * categories, adding new ones as they show up on obscure platforms.
1490      */
1491
1492     unsigned int i;
1493     Size_t names_len = 0;
1494     bool are_all_categories_the_same_locale = TRUE;
1495     char * aggregate_locale;
1496     char * previous_start = NULL;
1497     char * this_start = NULL;
1498     Size_t entry_len = 0;
1499
1500     PERL_ARGS_ASSERT_CALCULATE_LC_ALL;
1501
1502     /* First calculate the needed size for the string listing the categories
1503      * and their locales. */
1504     for (i = 0; i < LC_ALL_INDEX_; i++) {
1505
1506 #  ifdef USE_QUERYLOCALE
1507         const char * entry = querylocale_l(i, cur_obj);
1508 #  else
1509         const char * entry = individ_locales[i];
1510 #  endif
1511
1512         names_len += strlen(category_names[i])
1513                   + 1                           /* '=' */
1514                   + strlen(entry)
1515                   + 1;                          /* ';' */
1516     }
1517
1518     names_len++;    /* Trailing '\0' */
1519
1520     /* Allocate enough space for the aggregated string */
1521     SAVEFREEPV(Newxz(aggregate_locale, names_len, char));
1522
1523     /* Then fill it in */
1524     for (i = 0; i < LC_ALL_INDEX_; i++) {
1525         Size_t new_len;
1526
1527 #  ifdef USE_QUERYLOCALE
1528         const char * entry = querylocale_l(i, cur_obj);
1529 #  else
1530         const char * entry = individ_locales[i];
1531 #  endif
1532
1533         new_len = my_strlcat(aggregate_locale, category_names[i], names_len);
1534         assert(new_len <= names_len);
1535         new_len = my_strlcat(aggregate_locale, "=", names_len);
1536         assert(new_len <= names_len);
1537
1538         this_start = aggregate_locale + strlen(aggregate_locale);
1539         entry_len = strlen(entry);
1540
1541         new_len = my_strlcat(aggregate_locale, entry, names_len);
1542         assert(new_len <= names_len);
1543         new_len = my_strlcat(aggregate_locale, ";", names_len);
1544         assert(new_len <= names_len);
1545         PERL_UNUSED_VAR(new_len);   /* Only used in DEBUGGING */
1546
1547         if (   i > 0
1548             && are_all_categories_the_same_locale
1549             && memNE(previous_start, this_start, entry_len + 1))
1550         {
1551             are_all_categories_the_same_locale = FALSE;
1552         }
1553         else {
1554             previous_start = this_start;
1555         }
1556     }
1557
1558     /* If they are all the same, just return any one of them */
1559     if (are_all_categories_the_same_locale) {
1560         aggregate_locale = this_start;
1561         aggregate_locale[entry_len] = '\0';
1562     }
1563
1564     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1565                            "calculate_LC_ALL returning '%s'\n",
1566                            aggregate_locale));
1567
1568     return aggregate_locale;
1569 }
1570 #endif /*defined(USE_POSIX_2008_LOCALE)*/
1571
1572 STATIC void
1573 S_setlocale_failure_panic_i(pTHX_
1574                             const unsigned int cat_index,
1575                             const char * current,
1576                             const char * failed,
1577                             const line_t caller_0_line,
1578                             const line_t caller_1_line)
1579 {
1580     dSAVE_ERRNO;
1581     const int cat = categories[cat_index];
1582     const char * name = category_names[cat_index];
1583
1584     PERL_ARGS_ASSERT_SETLOCALE_FAILURE_PANIC_I;
1585
1586     if (current == NULL) {
1587         current = querylocale_i(cat_index);
1588     }
1589
1590     Perl_locale_panic(Perl_form(aTHX_ "(%" LINE_Tf
1591                                       "): Can't change locale for %s(%d)"
1592                                       " from '%s' to '%s'",
1593                                       caller_1_line, name, cat,
1594                                       current, failed),
1595                       __FILE__, caller_0_line, GET_ERRNO);
1596     NOT_REACHED; /* NOTREACHED */
1597 }
1598
1599 /* Any of these will allow us to find the RADIX */
1600 #  if defined(USE_LOCALE_NUMERIC) && (   defined(HAS_SOME_LANGINFO)         \
1601                                       || defined(HAS_SOME_LOCALECONV)       \
1602                                       || defined(HAS_SNPRINTF))
1603 #    define CAN_CALCULATE_RADIX
1604 #  endif
1605
1606 STATIC void
1607 S_new_numeric(pTHX_ const char *newnum)
1608 {
1609     PERL_ARGS_ASSERT_NEW_NUMERIC;
1610
1611 #  ifndef USE_LOCALE_NUMERIC
1612
1613     PERL_ARGS_ASSERT_NEW_NUMERIC;
1614     PERL_UNUSED_ARG(newnum);
1615
1616 #  else
1617
1618     /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell
1619      * core Perl this and that 'newnum' is the name of the new locale, and we
1620      * are switched into it.  It installs this locale as the current underlying
1621      * default, and then switches to the C locale, if necessary, so that the
1622      * code that has traditionally expected the radix character to be a dot may
1623      * continue to do so.
1624      *
1625      * The default locale and the C locale can be toggled between by use of the
1626      * set_numeric_underlying() and set_numeric_standard() functions, which
1627      * should probably not be called directly, but only via macros like
1628      * SET_NUMERIC_STANDARD() in perl.h.
1629      *
1630      * The toggling is necessary mainly so that a non-dot radix decimal point
1631      * character can be input and output, while allowing internal calculations
1632      * to use a dot.
1633      *
1634      * This sets several interpreter-level variables:
1635      * PL_numeric_name  The underlying locale's name: a copy of 'newnum'
1636      * PL_numeric_underlying  A boolean indicating if the toggled state is such
1637      *                  that the current locale is the program's underlying
1638      *                  locale
1639      * PL_numeric_standard An int indicating if the toggled state is such
1640      *                  that the current locale is the C locale or
1641      *                  indistinguishable from the C locale.  If non-zero, it
1642      *                  is in C; if > 1, it means it may not be toggled away
1643      *                  from C.
1644      * PL_numeric_underlying_is_standard   A bool kept by this function
1645      *                  indicating that the underlying locale and the standard
1646      *                  C locale are indistinguishable for the purposes of
1647      *                  LC_NUMERIC.  This happens when both of the above two
1648      *                  variables are true at the same time.  (Toggling is a
1649      *                  no-op under these circumstances.)  This variable is
1650      *                  used to avoid having to recalculate.
1651      * PL_numeric_radix_sv  Contains the string that code should use for the
1652      *                  decimal point.  It is set to either a dot or the
1653      *                  program's underlying locale's radix character string,
1654      *                  depending on the situation.
1655      * PL_underlying_radix_sv  Contains the program's underlying locale's radix
1656      *                  character string.  This is copied into
1657      *                  PL_numeric_radix_sv when the situation warrants.  It
1658      *                  exists to avoid having to recalculate it when toggling.
1659      * PL_underlying_numeric_obj = (only on POSIX 2008 platforms)  An object
1660      *                  with everything set up properly so as to avoid work on
1661      *                  such platforms.
1662      */
1663
1664     const char * radix = C_decimal_point;
1665     utf8ness_t utf8ness = UTF8NESS_IMMATERIAL;
1666
1667     DEBUG_L( PerlIO_printf(Perl_debug_log,
1668                            "Called new_numeric with %s, PL_numeric_name=%s\n",
1669                            newnum, PL_numeric_name));
1670
1671     /* If this isn't actually a change, do nothing */
1672     if (strEQ(PL_numeric_name, newnum)) {
1673         return;
1674     }
1675
1676     Safefree(PL_numeric_name);
1677     PL_numeric_name = savepv(newnum);
1678
1679     /* Handle the trivial case */
1680     if (isNAME_C_OR_POSIX(PL_numeric_name)) {
1681         PL_numeric_standard = TRUE;
1682         PL_numeric_underlying_is_standard = TRUE;
1683         PL_numeric_underlying = TRUE;
1684         sv_setpv(PL_numeric_radix_sv, C_decimal_point);
1685         sv_setpv(PL_underlying_radix_sv, C_decimal_point);
1686         return;
1687     }
1688
1689     /* We are in the underlying locale until changed at the end of this
1690      * function */
1691     PL_numeric_underlying = TRUE;
1692
1693 #  ifdef USE_POSIX_2008_LOCALE
1694
1695     /* We keep a special object for easy switching to */
1696     PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
1697                                           PL_numeric_name,
1698                                           PL_underlying_numeric_obj);
1699
1700 #    endif
1701
1702     /* Find and save this locale's radix character. */
1703     my_langinfo_c(RADIXCHAR, LC_NUMERIC, PL_numeric_name,
1704                   &radix, NULL, &utf8ness);
1705     sv_setpv(PL_underlying_radix_sv, radix);
1706
1707     if (utf8ness == UTF8NESS_YES) {
1708         SvUTF8_on(PL_underlying_radix_sv);
1709     }
1710
1711     DEBUG_L(PerlIO_printf(Perl_debug_log,
1712                           "Locale radix is '%s', ?UTF-8=%d\n",
1713                           SvPVX(PL_underlying_radix_sv),
1714                           cBOOL(SvUTF8(PL_underlying_radix_sv))));
1715
1716     /* This locale is indistinguishable from C (for numeric purposes) if both
1717      * the radix character and the thousands separator are the same as C's.
1718      * Start with the radix. */
1719     PL_numeric_underlying_is_standard = strEQ(C_decimal_point, radix);
1720     Safefree(radix);
1721
1722 #    ifndef TS_W32_BROKEN_LOCALECONV
1723
1724     /* If the radix isn't the same as C's, we know it is distinguishable from
1725      * C; otherwise check the thousands separator too.  Only if both are the
1726      * same as C's is the locale indistinguishable from C.
1727      *
1728      * But on earlier Windows versions, there is a potential race.  This code
1729      * knows that localeconv() (elsewhere in this file) will be used to extract
1730      * the needed value, and localeconv() was buggy for quite a while, and that
1731      * code in this file hence uses a workaround.  And that workaround may have
1732      * an (unlikely) race.  Gathering the radix uses a different workaround on
1733      * Windows that doesn't involve a race.  It might be possible to do the
1734      * same for this (patches welcome).
1735      *
1736      * Until then khw doesn't think it's worth even the small risk of a race to
1737      * get this value, which in almost all locales is empty, and doesn't appear
1738      * to be used in any of the Micrsoft library routines anyway. */
1739
1740     const char * scratch_buffer = NULL;
1741     PL_numeric_underlying_is_standard &= strEQ(C_thousands_sep,
1742                                                my_langinfo_c(THOUSEP, LC_NUMERIC,
1743                                                              PL_numeric_name,
1744                                                              &scratch_buffer,
1745                                                              NULL, NULL));
1746     Safefree(scratch_buffer);
1747
1748 #    endif
1749
1750     PL_numeric_standard = PL_numeric_underlying_is_standard;
1751
1752     /* Keep LC_NUMERIC so that it has the C locale radix and thousands
1753      * separator.  This is for XS modules, so they don't have to worry about
1754      * the radix being a non-dot.  (Core operations that need the underlying
1755      * locale change to it temporarily). */
1756     if (! PL_numeric_standard) {
1757         set_numeric_standard();
1758     }
1759
1760 #  endif
1761
1762 }
1763
1764 void
1765 Perl_set_numeric_standard(pTHX)
1766 {
1767
1768 #  ifdef USE_LOCALE_NUMERIC
1769
1770     /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
1771      * default.
1772      *
1773      * Most code should use the macro SET_NUMERIC_STANDARD() in perl.h
1774      * instead of calling this directly.  The macro avoids calling this routine
1775      * if toggling isn't necessary according to our records (which could be
1776      * wrong if some XS code has changed the locale behind our back) */
1777
1778     DEBUG_L(PerlIO_printf(Perl_debug_log,
1779                                   "Setting LC_NUMERIC locale to standard C\n"));
1780
1781     void_setlocale_c(LC_NUMERIC, "C");
1782     PL_numeric_standard = TRUE;
1783     sv_setpv(PL_numeric_radix_sv, C_decimal_point);
1784
1785     PL_numeric_underlying = PL_numeric_underlying_is_standard;
1786
1787 #  endif /* USE_LOCALE_NUMERIC */
1788
1789 }
1790
1791 void
1792 Perl_set_numeric_underlying(pTHX)
1793 {
1794
1795 #  ifdef USE_LOCALE_NUMERIC
1796
1797     /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
1798      * default.
1799      *
1800      * Most code should use the macro SET_NUMERIC_UNDERLYING() in perl.h
1801      * instead of calling this directly.  The macro avoids calling this routine
1802      * if toggling isn't necessary according to our records (which could be
1803      * wrong if some XS code has changed the locale behind our back) */
1804
1805     DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to %s\n",
1806                                           PL_numeric_name));
1807
1808     void_setlocale_c(LC_NUMERIC, PL_numeric_name);
1809     PL_numeric_underlying = TRUE;
1810     sv_setsv_nomg(PL_numeric_radix_sv, PL_underlying_radix_sv);
1811
1812     PL_numeric_standard = PL_numeric_underlying_is_standard;
1813
1814 #  endif /* USE_LOCALE_NUMERIC */
1815
1816 }
1817
1818 /*
1819  * Set up for a new ctype locale.
1820  */
1821 STATIC void
1822 S_new_ctype(pTHX_ const char *newctype)
1823 {
1824
1825 #  ifndef USE_LOCALE_CTYPE
1826
1827     PERL_UNUSED_ARG(newctype);
1828     PERL_UNUSED_CONTEXT;
1829
1830 #  else
1831
1832     /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
1833      * core Perl this and that 'newctype' is the name of the new locale.
1834      *
1835      * This function sets up the folding arrays for all 256 bytes, assuming
1836      * that tofold() is tolc() since fold case is not a concept in POSIX,
1837      *
1838      * Any code changing the locale (outside this file) should use
1839      * Perl_setlocale or POSIX::setlocale, which call this function.  Therefore
1840      * this function should be called directly only from this file and from
1841      * POSIX::setlocale() */
1842
1843     unsigned int i;
1844
1845     /* Don't check for problems if we are suppressing the warnings */
1846     bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
1847     bool maybe_utf8_turkic = FALSE;
1848
1849     PERL_ARGS_ASSERT_NEW_CTYPE;
1850
1851     DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering new_ctype(%s)\n", newctype));
1852
1853     /* No change means no-op */
1854     if (PL_ctype_name && strEQ(PL_ctype_name, newctype)) {
1855         return;
1856     }
1857
1858     /* We will replace any bad locale warning with 1) nothing if the new one is
1859      * ok; or 2) a new warning for the bad new locale */
1860     if (PL_warn_locale) {
1861         SvREFCNT_dec_NN(PL_warn_locale);
1862         PL_warn_locale = NULL;
1863     }
1864
1865     /* Clear cache */
1866     Safefree(PL_ctype_name);
1867     PL_ctype_name = "";
1868
1869     /* Guard against the is_locale_utf8() call potentially zapping newctype.
1870      * This is not extra work as the cache is set to this a few lines down, and
1871      * that needs to be copied anyway */
1872     newctype = savepv(newctype);
1873
1874     /* With cache cleared, this will know to compute a new value */
1875     PL_in_utf8_CTYPE_locale = is_locale_utf8(newctype);
1876
1877     /* Cache new name */
1878     PL_ctype_name = newctype;
1879
1880     PL_in_utf8_turkic_locale = FALSE;
1881
1882     if (isNAME_C_OR_POSIX(PL_ctype_name)) {
1883         Copy(PL_fold, PL_fold_locale, 256, U8);
1884     }
1885     else if (PL_in_utf8_CTYPE_locale) {
1886
1887         /* A UTF-8 locale gets standard rules.  But note that code still has to
1888          * handle this specially because of the three problematic code points
1889          * */
1890         Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
1891
1892         /* UTF-8 locales can have special handling for 'I' and 'i' if they are
1893          * Turkic.  Make sure these two are the only anomalies.  (We don't
1894          * require towupper and towlower because they aren't in C89.) */
1895
1896 #    if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER)
1897
1898         if (towupper('i') == 0x130 && towlower('I') == 0x131)
1899
1900 #    else
1901
1902         if (toU8_UPPER_LC('i') == 'i' && toU8_LOWER_LC('I') == 'I')
1903
1904 #    endif
1905
1906         {
1907             /* This is how we determine it really is Turkic */
1908             check_for_problems = TRUE;
1909             maybe_utf8_turkic = TRUE;
1910         }
1911     }
1912     else {  /* Not a canned locale we know the values for.  Compute them */
1913
1914 #    ifdef DEBUGGING
1915
1916         bool has_non_ascii_fold = FALSE;
1917         bool found_unexpected = FALSE;
1918
1919         if (DEBUG_Lv_TEST) {
1920             for (i = 128; i < 256; i++) {
1921                 int j = LATIN1_TO_NATIVE(i);
1922                 if (toU8_LOWER_LC(j) != j || toU8_UPPER_LC(j) != j) {
1923                     has_non_ascii_fold = TRUE;
1924                     break;
1925                 }
1926             }
1927         }
1928
1929 #    endif
1930
1931         for (i = 0; i < 256; i++) {
1932             if (isU8_UPPER_LC(i))
1933                 PL_fold_locale[i] = (U8) toU8_LOWER_LC(i);
1934             else if (isU8_LOWER_LC(i))
1935                 PL_fold_locale[i] = (U8) toU8_UPPER_LC(i);
1936             else
1937                 PL_fold_locale[i] = (U8) i;
1938
1939 #    ifdef DEBUGGING
1940
1941             if (DEBUG_Lv_TEST) {
1942                 bool unexpected = FALSE;
1943
1944                 if (isUPPER_L1(i)) {
1945                     if (isUPPER_A(i)) {
1946                         if (PL_fold_locale[i] != toLOWER_A(i)) {
1947                             unexpected = TRUE;
1948                         }
1949                     }
1950                     else if (has_non_ascii_fold) {
1951                         if (PL_fold_locale[i] != toLOWER_L1(i)) {
1952                             unexpected = TRUE;
1953                         }
1954                     }
1955                     else if (PL_fold_locale[i] != i) {
1956                         unexpected = TRUE;
1957                     }
1958                 }
1959                 else if (   isLOWER_L1(i)
1960                          && i != LATIN_SMALL_LETTER_SHARP_S
1961                          && i != MICRO_SIGN)
1962                 {
1963                     if (isLOWER_A(i)) {
1964                         if (PL_fold_locale[i] != toUPPER_A(i)) {
1965                             unexpected = TRUE;
1966                         }
1967                     }
1968                     else if (has_non_ascii_fold) {
1969                         if (PL_fold_locale[i] != toUPPER_LATIN1_MOD(i)) {
1970                             unexpected = TRUE;
1971                         }
1972                     }
1973                     else if (PL_fold_locale[i] != i) {
1974                         unexpected = TRUE;
1975                     }
1976                 }
1977                 else if (PL_fold_locale[i] != i) {
1978                     unexpected = TRUE;
1979                 }
1980
1981                 if (unexpected) {
1982                     found_unexpected = TRUE;
1983                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1984                                            "For %s, fold of %02x is %02x\n",
1985                                            newctype, i, PL_fold_locale[i]));
1986                 }
1987             }
1988         }
1989
1990         if (found_unexpected) {
1991             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1992                                "All bytes not mentioned above either fold to"
1993                                " themselves or are the expected ASCII or"
1994                                " Latin1 ones\n"));
1995         }
1996         else {
1997             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1998                                    "No nonstandard folds were found\n"));
1999 #    endif
2000
2001         }
2002     }
2003
2004 #    ifdef MB_CUR_MAX
2005
2006     /* We only handle single-byte locales (outside of UTF-8 ones; so if this
2007      * locale requires more than one byte, there are going to be BIG problems.
2008      * */
2009
2010     if (MB_CUR_MAX > 1 && ! PL_in_utf8_CTYPE_locale
2011
2012             /* Some platforms return MB_CUR_MAX > 1 for even the "C" locale.
2013              * Just assume that the implementation for them (plus for POSIX) is
2014              * correct and the > 1 value is spurious.  (Since these are
2015              * specially handled to never be considered UTF-8 locales, as long
2016              * as this is the only problem, everything should work fine */
2017         && ! isNAME_C_OR_POSIX(newctype))
2018     {
2019         DEBUG_L(PerlIO_printf(Perl_debug_log,
2020                               "Unsupported, MB_CUR_MAX=%d\n", (int) MB_CUR_MAX));
2021
2022         Perl_ck_warner_d(aTHX_ packWARN(WARN_LOCALE),
2023                          "Locale '%s' is unsupported, and may crash the"
2024                          " interpreter.\n",
2025                          newctype);
2026     }
2027
2028 #    endif
2029
2030     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "check_for_problems=%d\n",
2031                                            check_for_problems));
2032
2033     /* We don't populate the other lists if a UTF-8 locale, but do check that
2034      * everything works as expected, unless checking turned off */
2035     if (check_for_problems) {
2036         /* Assume enough space for every character being bad.  4 spaces each
2037          * for the 94 printable characters that are output like "'x' "; and 5
2038          * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
2039          * NUL */
2040         char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
2041         unsigned int bad_count = 0;         /* Count of bad characters */
2042
2043         for (i = 0; i < 256; i++) {
2044
2045             /* If checking for locale problems, see if the native ASCII-range
2046              * printables plus \n and \t are in their expected categories in
2047              * the new locale.  If not, this could mean big trouble, upending
2048              * Perl's and most programs' assumptions, like having a
2049              * metacharacter with special meaning become a \w.  Fortunately,
2050              * it's very rare to find locales that aren't supersets of ASCII
2051              * nowadays.  It isn't a problem for most controls to be changed
2052              * into something else; we check only \n and \t, though perhaps \r
2053              * could be an issue as well. */
2054             if (isGRAPH_A(i) || isBLANK_A(i) || i == '\n') {
2055                 bool is_bad = FALSE;
2056                 char name[4] = { '\0' };
2057
2058                 /* Convert the name into a string */
2059                 if (isGRAPH_A(i)) {
2060                     name[0] = i;
2061                     name[1] = '\0';
2062                 }
2063                 else if (i == '\n') {
2064                     my_strlcpy(name, "\\n", sizeof(name));
2065                 }
2066                 else if (i == '\t') {
2067                     my_strlcpy(name, "\\t", sizeof(name));
2068                 }
2069                 else {
2070                     assert(i == ' ');
2071                     my_strlcpy(name, "' '", sizeof(name));
2072                 }
2073
2074                 /* Check each possibe class */
2075                 if (UNLIKELY(cBOOL(isU8_ALPHANUMERIC_LC(i)) != cBOOL(isALPHANUMERIC_A(i))))  {
2076                     is_bad = TRUE;
2077                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2078                                           "isalnum('%s') unexpectedly is %x\n",
2079                                           name, cBOOL(isU8_ALPHANUMERIC_LC(i))));
2080                 }
2081                 if (UNLIKELY(cBOOL(isU8_ALPHA_LC(i)) != cBOOL(isALPHA_A(i))))  {
2082                     is_bad = TRUE;
2083                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2084                                           "isalpha('%s') unexpectedly is %x\n",
2085                                           name, cBOOL(isU8_ALPHA_LC(i))));
2086                 }
2087                 if (UNLIKELY(cBOOL(isU8_DIGIT_LC(i)) != cBOOL(isDIGIT_A(i))))  {
2088                     is_bad = TRUE;
2089                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2090                                           "isdigit('%s') unexpectedly is %x\n",
2091                                           name, cBOOL(isU8_DIGIT_LC(i))));
2092                 }
2093                 if (UNLIKELY(cBOOL(isU8_GRAPH_LC(i)) != cBOOL(isGRAPH_A(i))))  {
2094                     is_bad = TRUE;
2095                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2096                                           "isgraph('%s') unexpectedly is %x\n",
2097                                           name, cBOOL(isU8_GRAPH_LC(i))));
2098                 }
2099                 if (UNLIKELY(cBOOL(isU8_LOWER_LC(i)) != cBOOL(isLOWER_A(i))))  {
2100                     is_bad = TRUE;
2101                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2102                                           "islower('%s') unexpectedly is %x\n",
2103                                           name, cBOOL(isU8_LOWER_LC(i))));
2104                 }
2105                 if (UNLIKELY(cBOOL(isU8_PRINT_LC(i)) != cBOOL(isPRINT_A(i))))  {
2106                     is_bad = TRUE;
2107                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2108                                           "isprint('%s') unexpectedly is %x\n",
2109                                           name, cBOOL(isU8_PRINT_LC(i))));
2110                 }
2111                 if (UNLIKELY(cBOOL(isU8_PUNCT_LC(i)) != cBOOL(isPUNCT_A(i))))  {
2112                     is_bad = TRUE;
2113                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2114                                           "ispunct('%s') unexpectedly is %x\n",
2115                                           name, cBOOL(isU8_PUNCT_LC(i))));
2116                 }
2117                 if (UNLIKELY(cBOOL(isU8_SPACE_LC(i)) != cBOOL(isSPACE_A(i))))  {
2118                     is_bad = TRUE;
2119                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2120                                           "isspace('%s') unexpectedly is %x\n",
2121                                           name, cBOOL(isU8_SPACE_LC(i))));
2122                 }
2123                 if (UNLIKELY(cBOOL(isU8_UPPER_LC(i)) != cBOOL(isUPPER_A(i))))  {
2124                     is_bad = TRUE;
2125                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2126                                           "isupper('%s') unexpectedly is %x\n",
2127                                           name, cBOOL(isU8_UPPER_LC(i))));
2128                 }
2129                 if (UNLIKELY(cBOOL(isU8_XDIGIT_LC(i))!= cBOOL(isXDIGIT_A(i))))  {
2130                     is_bad = TRUE;
2131                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2132                                           "isxdigit('%s') unexpectedly is %x\n",
2133                                           name, cBOOL(isU8_XDIGIT_LC(i))));
2134                 }
2135                 if (UNLIKELY(toU8_LOWER_LC(i) != (int) toLOWER_A(i))) {
2136                     is_bad = TRUE;
2137                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2138                             "tolower('%s')=0x%x instead of the expected 0x%x\n",
2139                             name, toU8_LOWER_LC(i), (int) toLOWER_A(i)));
2140                 }
2141                 if (UNLIKELY(toU8_UPPER_LC(i) != (int) toUPPER_A(i))) {
2142                     is_bad = TRUE;
2143                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2144                             "toupper('%s')=0x%x instead of the expected 0x%x\n",
2145                             name, toU8_UPPER_LC(i), (int) toUPPER_A(i)));
2146                 }
2147                 if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i))))  {
2148                     is_bad = TRUE;
2149                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2150                                 "'\\n' (=%02X) is not a control\n", (int) i));
2151                 }
2152
2153                 /* Add to the list;  Separate multiple entries with a blank */
2154                 if (is_bad) {
2155                     if (bad_count) {
2156                         my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list));
2157                     }
2158                     my_strlcat(bad_chars_list, name, sizeof(bad_chars_list));
2159                     bad_count++;
2160                 }
2161             }
2162         }
2163
2164         if (bad_count == 2 && maybe_utf8_turkic) {
2165             bad_count = 0;
2166             *bad_chars_list = '\0';
2167             PL_fold_locale['I'] = 'I';
2168             PL_fold_locale['i'] = 'i';
2169             PL_in_utf8_turkic_locale = TRUE;
2170             DEBUG_L(PerlIO_printf(Perl_debug_log, "%s is turkic\n", newctype));
2171         }
2172
2173         /* If we found problems and we want them output, do so */
2174         if (   (UNLIKELY(bad_count))
2175             && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
2176         {
2177             if (UNLIKELY(bad_count) && PL_in_utf8_CTYPE_locale) {
2178                 PL_warn_locale = Perl_newSVpvf(aTHX_
2179                      "Locale '%s' contains (at least) the following characters"
2180                      " which have\nunexpected meanings: %s\nThe Perl program"
2181                      " will use the expected meanings",
2182                       newctype, bad_chars_list);
2183             }
2184             else {
2185                 PL_warn_locale =
2186                     Perl_newSVpvf(aTHX_
2187                                   "\nThe following characters (and maybe"
2188                                   " others) may not have the same meaning as"
2189                                   " the Perl program expects: %s\n",
2190                                   bad_chars_list
2191                             );
2192             }
2193
2194 #    ifdef HAS_SOME_LANGINFO
2195
2196             const char * scratch_buffer = NULL;
2197             Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s",
2198                                  my_langinfo_c(CODESET, LC_CTYPE,
2199                                                newctype,
2200                                                &scratch_buffer, NULL,
2201                                                NULL));
2202             Safefree(scratch_buffer);
2203
2204 #  endif
2205
2206             Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n");
2207
2208             /* If we are actually in the scope of the locale or are debugging,
2209              * output the message now.  If not in that scope, we save the
2210              * message to be output at the first operation using this locale,
2211              * if that actually happens.  Most programs don't use locales, so
2212              * they are immune to bad ones.  */
2213             if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
2214
2215                 /* The '0' below suppresses a bogus gcc compiler warning */
2216                 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale),
2217                                                                             0);
2218
2219                 if (IN_LC(LC_CTYPE)) {
2220                     SvREFCNT_dec_NN(PL_warn_locale);
2221                     PL_warn_locale = NULL;
2222                 }
2223             }
2224         }
2225     }
2226
2227 #  endif /* USE_LOCALE_CTYPE */
2228
2229 }
2230
2231 void
2232 Perl__warn_problematic_locale()
2233 {
2234
2235 #  ifdef USE_LOCALE_CTYPE
2236
2237     dTHX;
2238
2239     /* Internal-to-core function that outputs the message in PL_warn_locale,
2240      * and then NULLS it.  Should be called only through the macro
2241      * CHECK_AND_WARN_PROBLEMATIC_LOCALE_ */
2242
2243     if (PL_warn_locale) {
2244         Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2245                              SvPVX(PL_warn_locale),
2246                              0 /* dummy to avoid compiler warning */ );
2247         SvREFCNT_dec_NN(PL_warn_locale);
2248         PL_warn_locale = NULL;
2249     }
2250
2251 #  endif
2252
2253 }
2254
2255 STATIC void
2256 S_new_LC_ALL(pTHX_ const char *unused)
2257 {
2258     unsigned int i;
2259
2260     /* LC_ALL updates all the things we care about. */
2261
2262     PERL_UNUSED_ARG(unused);
2263
2264     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2265         if (update_functions[i]) {
2266             const char * this_locale = querylocale_i(i);
2267             update_functions[i](aTHX_ this_locale);
2268         }
2269     }
2270 }
2271
2272 STATIC void
2273 S_new_collate(pTHX_ const char *newcoll)
2274 {
2275     PERL_ARGS_ASSERT_NEW_COLLATE;
2276
2277 #  ifndef USE_LOCALE_COLLATE
2278
2279     PERL_UNUSED_ARG(newcoll);
2280     PERL_UNUSED_CONTEXT;
2281
2282 #  else
2283
2284     /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
2285      * core Perl this and that 'newcoll' is the name of the new locale.
2286      *
2287      * The design of locale collation is that every locale change is given an
2288      * index 'PL_collation_ix'.  The first time a string particpates in an
2289      * operation that requires collation while locale collation is active, it
2290      * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()).  That
2291      * magic includes the collation index, and the transformation of the string
2292      * by strxfrm(), q.v.  That transformation is used when doing comparisons,
2293      * instead of the string itself.  If a string changes, the magic is
2294      * cleared.  The next time the locale changes, the index is incremented,
2295      * and so we know during a comparison that the transformation is not
2296      * necessarily still valid, and so is recomputed.  Note that if the locale
2297      * changes enough times, the index could wrap (a U32), and it is possible
2298      * that a transformation would improperly be considered valid, leading to
2299      * an unlikely bug */
2300
2301     /* Return if the locale isn't changing */
2302     if (strEQ(PL_collation_name, newcoll)) {
2303         return;
2304     }
2305
2306     Safefree(PL_collation_name);
2307     PL_collation_name = savepv(newcoll);
2308     ++PL_collation_ix;
2309
2310     /* Set the new one up if trivial */
2311     PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
2312     if (PL_collation_standard) {
2313
2314     /* Do minimal set up now */
2315         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Setting PL_collation name='%s'\n", PL_collation_name));
2316         PL_collxfrm_base = 0;
2317         PL_collxfrm_mult = 2;
2318         PL_in_utf8_COLLATE_locale = FALSE;
2319         PL_strxfrm_NUL_replacement = '\0';
2320         PL_strxfrm_max_cp = 0;
2321         return;
2322     }
2323
2324     /* Flag that the remainder of the set up is being deferred until first need */
2325     PL_collxfrm_mult = 0;
2326     PL_collxfrm_base = 0;
2327
2328 #  endif /* USE_LOCALE_COLLATE */
2329
2330 }
2331
2332 #endif  /* USE_LOCALE */
2333
2334 #ifdef WIN32
2335
2336 wchar_t *
2337 Perl_Win_utf8_string_to_wstring(const char * utf8_string)
2338 {
2339     wchar_t *wstring;
2340
2341     int req_size = MultiByteToWideChar(CP_UTF8, 0, utf8_string, -1, NULL, 0);
2342     if (! req_size) {
2343         errno = EINVAL;
2344         return NULL;
2345     }
2346
2347     Newx(wstring, req_size, wchar_t);
2348
2349     if (! MultiByteToWideChar(CP_UTF8, 0, utf8_string, -1, wstring, req_size))
2350     {
2351         Safefree(wstring);
2352         errno = EINVAL;
2353         return NULL;
2354     }
2355
2356     return wstring;
2357 }
2358
2359 char *
2360 Perl_Win_wstring_to_utf8_string(const wchar_t * wstring)
2361 {
2362     char *utf8_string;
2363
2364     int req_size =
2365               WideCharToMultiByte(CP_UTF8, 0, wstring, -1, NULL, 0, NULL, NULL);
2366
2367     Newx(utf8_string, req_size, char);
2368
2369     if (! WideCharToMultiByte(CP_UTF8, 0, wstring, -1, utf8_string,
2370                                                          req_size, NULL, NULL))
2371     {
2372         Safefree(utf8_string);
2373         errno = EINVAL;
2374         return NULL;
2375     }
2376
2377     return utf8_string;
2378 }
2379
2380 #define USE_WSETLOCALE
2381
2382 #ifdef USE_WSETLOCALE
2383
2384 STATIC char *
2385 S_wrap_wsetlocale(pTHX_ int category, const char *locale) {
2386     wchar_t *wlocale = NULL;
2387     wchar_t *wresult;
2388     char *result;
2389
2390     if (locale) {
2391         wlocale = Win_utf8_string_to_wstring(locale);
2392         if (! wlocale) {
2393             return NULL;
2394         }
2395     }
2396     else {
2397         wlocale = NULL;
2398     }
2399
2400     wresult = _wsetlocale(category, wlocale);
2401     Safefree(wlocale);
2402
2403     if (! wresult) {
2404             return NULL;
2405         }
2406
2407     result = Win_wstring_to_utf8_string(wresult);
2408     SAVEFREEPV(result); /* is there something better we can do here? */
2409
2410     return result;
2411 }
2412
2413 #endif
2414
2415 STATIC char *
2416 S_win32_setlocale(pTHX_ int category, const char* locale)
2417 {
2418     /* This, for Windows, emulates POSIX setlocale() behavior.  There is no
2419      * difference between the two unless the input locale is "", which normally
2420      * means on Windows to get the machine default, which is set via the
2421      * computer's "Regional and Language Options" (or its current equivalent).
2422      * In POSIX, it instead means to find the locale from the user's
2423      * environment.  This routine changes the Windows behavior to first look in
2424      * the environment, and, if anything is found, use that instead of going to
2425      * the machine default.  If there is no environment override, the machine
2426      * default is used, by calling the real setlocale() with "".
2427      *
2428      * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
2429      * use the particular category's variable if set; otherwise to use the LANG
2430      * variable. */
2431
2432     bool override_LC_ALL = FALSE;
2433     char * result;
2434     unsigned int i;
2435
2436     if (locale && strEQ(locale, "")) {
2437
2438 #  ifdef LC_ALL
2439
2440         locale = PerlEnv_getenv("LC_ALL");
2441         if (! locale) {
2442             if (category ==  LC_ALL) {
2443                 override_LC_ALL = TRUE;
2444             }
2445             else {
2446
2447 #  endif
2448
2449                 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2450                     if (category == categories[i]) {
2451                         locale = PerlEnv_getenv(category_names[i]);
2452                         goto found_locale;
2453                     }
2454                 }
2455
2456                 locale = PerlEnv_getenv("LANG");
2457                 if (! locale) {
2458                     locale = "";
2459                 }
2460
2461               found_locale: ;
2462
2463 #  ifdef LC_ALL
2464
2465             }
2466         }
2467
2468 #  endif
2469
2470     }
2471
2472 #ifdef USE_WSETLOCALE
2473     result = S_wrap_wsetlocale(aTHX_ category, locale);
2474 #else
2475     result = setlocale(category, locale);
2476 #endif
2477     DEBUG_L(STMT_START {
2478                 PerlIO_printf(Perl_debug_log, "%s\n",
2479                             setlocale_debug_string_r(category, locale, result));
2480             } STMT_END);
2481
2482     if (! override_LC_ALL)  {
2483         return result;
2484     }
2485
2486     /* Here the input category was LC_ALL, and we have set it to what is in the
2487      * LANG variable or the system default if there is no LANG.  But these have
2488      * lower priority than the other LC_foo variables, so override it for each
2489      * one that is set.  (If they are set to "", it means to use the same thing
2490      * we just set LC_ALL to, so can skip) */
2491
2492     for (i = 0; i < LC_ALL_INDEX_; i++) {
2493         result = PerlEnv_getenv(category_names[i]);
2494         if (result && strNE(result, "")) {
2495 #ifdef USE_WSETLOCALE
2496             S_wrap_wsetlocale(aTHX_ categories[i], result);
2497 #else
2498             setlocale(categories[i], result);
2499 #endif
2500             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s\n",
2501                 setlocale_debug_string_i(i, result, "not captured")));
2502         }
2503     }
2504
2505     result = setlocale(LC_ALL, NULL);
2506     DEBUG_L(STMT_START {
2507                 PerlIO_printf(Perl_debug_log, "%s\n",
2508                                setlocale_debug_string_c(LC_ALL, NULL, result));
2509             } STMT_END);
2510
2511     return result;
2512 }
2513
2514 #endif
2515
2516 /*
2517 =for apidoc Perl_setlocale
2518
2519 This is an (almost) drop-in replacement for the system L<C<setlocale(3)>>,
2520 taking the same parameters, and returning the same information, except that it
2521 returns the correct underlying C<LC_NUMERIC> locale.  Regular C<setlocale> will
2522 instead return C<C> if the underlying locale has a non-dot decimal point
2523 character, or a non-empty thousands separator for displaying floating point
2524 numbers.  This is because perl keeps that locale category such that it has a
2525 dot and empty separator, changing the locale briefly during the operations
2526 where the underlying one is required. C<Perl_setlocale> knows about this, and
2527 compensates; regular C<setlocale> doesn't.
2528
2529 Another reason it isn't completely a drop-in replacement is that it is
2530 declared to return S<C<const char *>>, whereas the system setlocale omits the
2531 C<const> (presumably because its API was specified long ago, and can't be
2532 updated; it is illegal to change the information C<setlocale> returns; doing
2533 so leads to segfaults.)
2534
2535 Finally, C<Perl_setlocale> works under all circumstances, whereas plain
2536 C<setlocale> can be completely ineffective on some platforms under some
2537 configurations.
2538
2539 C<Perl_setlocale> should not be used to change the locale except on systems
2540 where the predefined variable C<${^SAFE_LOCALES}> is 1.  On some such systems,
2541 the system C<setlocale()> is ineffective, returning the wrong information, and
2542 failing to actually change the locale.  C<Perl_setlocale>, however works
2543 properly in all circumstances.
2544
2545 The return points to a per-thread static buffer, which is overwritten the next
2546 time C<Perl_setlocale> is called from the same thread.
2547
2548 =cut
2549
2550 */
2551
2552 #ifndef USE_LOCALE_NUMERIC
2553 #  define affects_LC_NUMERIC(cat) 0
2554 #elif defined(LC_ALL)
2555 #  define affects_LC_NUMERIC(cat) (cat == LC_NUMERIC || cat == LC_ALL)
2556 #else
2557 #  define affects_LC_NUMERIC(cat) (cat == LC_NUMERIC)
2558 #endif
2559
2560 const char *
2561 Perl_setlocale(const int category, const char * locale)
2562 {
2563     /* This wraps POSIX::setlocale() */
2564
2565 #ifndef USE_LOCALE
2566
2567     PERL_UNUSED_ARG(category);
2568     PERL_UNUSED_ARG(locale);
2569
2570     return "C";
2571
2572 #else
2573
2574     const char * retval;
2575     dTHX;
2576
2577     DEBUG_L(PerlIO_printf(Perl_debug_log,
2578                           "Entering Perl_setlocale(%d, \"%s\")\n",
2579                           category, locale));
2580
2581     /* A NULL locale means only query what the current one is. */
2582     if (locale == NULL) {
2583
2584 #  ifndef USE_LOCALE_NUMERIC
2585
2586         /* Without LC_NUMERIC, it's trivial; we just return the value */
2587         return save_to_buffer(querylocale_r(category),
2588                               &PL_setlocale_buf, &PL_setlocale_bufsize);
2589 #  else
2590
2591         /* We have the LC_NUMERIC name saved, because we are normally switched
2592          * into the C locale (or equivalent) for it. */
2593         if (category == LC_NUMERIC) {
2594             DEBUG_L(PerlIO_printf(Perl_debug_log,
2595                     "Perl_setlocale(LC_NUMERIC, NULL) returning stashed '%s'\n",
2596                     PL_numeric_name));
2597
2598             /* We don't have to copy this return value, as it is a per-thread
2599              * variable, and won't change until a future setlocale */
2600             return PL_numeric_name;
2601         }
2602
2603 #    ifndef LC_ALL
2604
2605         /* Without LC_ALL, just return the value */
2606         return save_to_buffer(querylocale_r(category),
2607                               &PL_setlocale_buf, &PL_setlocale_bufsize);
2608
2609 #    else
2610
2611         /* Here, LC_ALL is available on this platform.  It's the one
2612          * complicating category (because it can contain a toggled LC_NUMERIC
2613          * value), for all the remaining ones (we took care of LC_NUMERIC
2614          * above), just return the value */
2615         if (category != LC_ALL) {
2616             return save_to_buffer(querylocale_r(category),
2617                                   &PL_setlocale_buf, &PL_setlocale_bufsize);
2618         }
2619
2620         bool toggled = FALSE;
2621
2622         /* For an LC_ALL query, switch back to the underlying numeric locale
2623          * (if we aren't there already) so as to get the correct results.  Our
2624          * records for all the other categories are valid without switching */
2625         if (! PL_numeric_underlying) {
2626             set_numeric_underlying();
2627             toggled = TRUE;
2628         }
2629
2630         retval = querylocale_c(LC_ALL);
2631
2632         if (toggled) {
2633             set_numeric_standard();
2634         }
2635
2636         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2637                             setlocale_debug_string_r(category, locale, retval)));
2638
2639         return save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
2640
2641 #    endif      /* Has LC_ALL */
2642 #  endif        /* Has LC_NUMERIC */
2643
2644     } /* End of querying the current locale */
2645
2646
2647     /* Here, the input has a locale to change to.  First find the current
2648      * locale */
2649     unsigned int cat_index = get_category_index(category, NULL);
2650     retval = querylocale_i(cat_index);
2651
2652     /* If the new locale is the same as the current one, nothing is actually
2653      * being changed, so do nothing. */
2654     if (      strEQ(retval, locale)
2655         && (   ! affects_LC_NUMERIC(category)
2656
2657 #  ifdef USE_LOCALE_NUMERIC
2658
2659             || strEQ(locale, PL_numeric_name)
2660
2661 #  endif
2662
2663     )) {
2664         DEBUG_L(PerlIO_printf(Perl_debug_log,
2665                               "Already in requested locale: no action taken\n"));
2666         return save_to_buffer(setlocale_i(cat_index, locale),
2667                               &PL_setlocale_buf, &PL_setlocale_bufsize);
2668     }
2669
2670     /* Here, an actual change is being requested.  Do it */
2671     retval = setlocale_i(cat_index, locale);
2672     if (! retval) {
2673         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2674                           setlocale_debug_string_i(cat_index, locale, "NULL")));
2675         return NULL;
2676     }
2677
2678     retval = save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
2679
2680     /* Now that have changed locales, we have to update our records to
2681      * correspond.  Only certain categories have extra work to update. */
2682     if (update_functions[cat_index]) {
2683         update_functions[cat_index](aTHX_ retval);
2684     }
2685
2686     DEBUG_L(PerlIO_printf(Perl_debug_log, "returning '%s'\n", retval));
2687
2688     return retval;
2689
2690 #endif
2691
2692 }
2693
2694 #ifdef USE_LOCALE
2695
2696 STATIC const char *
2697 S_save_to_buffer(const char * string, const char **buf, Size_t *buf_size)
2698 {
2699     /* Copy the NUL-terminated 'string' to a buffer whose address before this
2700      * call began at *buf, and whose available length before this call was
2701      * *buf_size.
2702      *
2703      * If the length of 'string' is greater than the space available, the
2704      * buffer is grown accordingly, which may mean that it gets relocated.
2705      * *buf and *buf_size will be updated to reflect this.
2706      *
2707      * Regardless, the function returns a pointer to where 'string' is now
2708      * stored.
2709      *
2710      * 'string' may be NULL, which means no action gets taken, and NULL is
2711      * returned.
2712      *
2713      * If *buf or 'buf_size' are NULL or *buf_size is 0, the buffer is assumed
2714      * empty, and memory is malloc'd.   'buf-size' being NULL is to be used
2715      * when this is a single use buffer, which will shortly be freed by the
2716      * caller.
2717      */
2718
2719     Size_t string_size;
2720
2721     PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
2722
2723     if (! string) {
2724         return NULL;
2725     }
2726
2727     string_size = strlen(string) + 1;
2728
2729     if (buf_size == NULL) {
2730         Newx(*buf, string_size, char);
2731     }
2732     else if (*buf_size == 0) {
2733         Newx(*buf, string_size, char);
2734         *buf_size = string_size;
2735     }
2736     else if (string_size > *buf_size) {
2737         Renew(*buf, string_size, char);
2738         *buf_size = string_size;
2739     }
2740
2741     {
2742         dTHX_DEBUGGING;
2743         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2744                          "Copying '%s' to %p\n",
2745                          ((is_utf8_string((U8 *) string, 0))
2746                           ? string
2747                           :_byte_dump_string((U8 *) string, strlen(string), 0)),
2748                           *buf));
2749             }
2750
2751     Copy(string, *buf, string_size, char);
2752     return *buf;
2753 }
2754
2755 STATIC utf8ness_t
2756 S_get_locale_string_utf8ness_i(pTHX_ const char * locale,
2757                                      const unsigned cat_index,
2758                                      const char * string,
2759                                      const locale_utf8ness_t known_utf8)
2760 {
2761     /* Return to indicate if 'string' in the locale given by the input
2762      * arguments should be considered UTF-8 or not.
2763      *
2764      * If the input 'locale' is not NULL, use that for the locale; otherwise
2765      * use the current locale for the category specified by 'cat_index'.
2766      */
2767
2768     Size_t len;
2769     const U8 * first_variant = NULL;
2770
2771     PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I;
2772     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
2773
2774     if (string == NULL) {
2775         return UTF8NESS_NO;
2776     }
2777
2778     if (IN_BYTES) { /* respect 'use bytes' */
2779         return UTF8NESS_NO;
2780     }
2781
2782     len = strlen(string);
2783
2784     /* UTF8ness is immaterial if the representation doesn't vary */
2785     if (is_utf8_invariant_string_loc((U8 *) string, len, &first_variant)) {
2786         return UTF8NESS_IMMATERIAL;
2787     }
2788
2789     /* Can't be UTF-8 if invalid */
2790     if (! is_utf8_string((U8 *) first_variant,
2791                          len - ((char *) first_variant - string)))
2792     {
2793         return UTF8NESS_NO;
2794     }
2795
2796     /* Here and below, we know the string is legal UTF-8, containing at least
2797      * one character requiring a sequence of two or more bytes.  It is quite
2798      * likely to be UTF-8.  But it pays to be paranoid and do further checking.
2799      *
2800      * If we already know the UTF-8ness of the locale, then we immediately know
2801      * what the string is */
2802     if (UNLIKELY(known_utf8 != LOCALE_UTF8NESS_UNKNOWN)) {
2803         if (known_utf8 == LOCALE_IS_UTF8) {
2804             return UTF8NESS_YES;
2805         }
2806         else {
2807             return UTF8NESS_NO;
2808         }
2809     }
2810
2811 #  if defined(HAS_SOME_LANGINFO) || defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
2812
2813     /* Here, we have available the libc functions that can be used to
2814      * accurately determine the UTF8ness of the underlying locale.  If it is a
2815      * UTF-8 locale, the string is UTF-8;  otherwise it was coincidental that
2816      * the string is legal UTF-8
2817      *
2818      * However, if the perl is compiled to not pay attention to the category
2819      * being passed in, you might think that that locale is essentially always
2820      * the C locale, so it would make sense to say it isn't UTF-8.  But to get
2821      * here, the string has to contain characters unknown in the C locale.  And
2822      * in fact, Windows boxes are compiled without LC_MESSAGES, as their
2823      * message catalog isn't really a part of the locale system.  But those
2824      * messages really could be UTF-8, and given that the odds are rather small
2825      * of something not being UTF-8 but being syntactically valid UTF-8, khw
2826      * has decided to call such strings as UTF-8. */
2827
2828     if (locale == NULL) {
2829         locale = querylocale_i(cat_index);
2830     }
2831     if (is_locale_utf8(locale)) {
2832         return UTF8NESS_YES;
2833     }
2834
2835     return UTF8NESS_NO;
2836
2837 #  else
2838
2839     /* Here, we have a valid UTF-8 string containing non-ASCII characters, and
2840      * don't have access to functions to check if the locale is UTF-8 or not.
2841      * Assume that it is.  khw tried adding a check that the string is entirely
2842      * in a single Unicode script, but discovered the strftime() timezone is
2843      * user-settable through the environment, which may be in a different
2844      * script than the locale-expected value. */
2845     PERL_UNUSED_ARG(locale);
2846     PERL_UNUSED_ARG(cat_index);
2847
2848     return UTF8NESS_YES;
2849
2850 #  endif
2851
2852 }
2853
2854 #  ifdef WIN32
2855
2856 bool
2857 Perl_get_win32_message_utf8ness(pTHX_ const char * string)
2858 {
2859     /* NULL => locale irrelevant, 0 => category irrelevant
2860      * so returns based on the UTF-8 legality of the input string, ignoring the
2861      * locale and category completely.
2862      *
2863      * This is because Windows doesn't have LC_MESSAGES */
2864     return get_locale_string_utf8ness_i(NULL, 0, string, LOCALE_IS_UTF8);
2865 }
2866
2867 #  endif
2868 #endif  /* USE_LOCALE */
2869
2870
2871 int
2872 Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len)
2873 {
2874
2875 #if ! defined(HAS_MBRTOWC) && ! defined(HAS_MBTOWC)
2876
2877     PERL_UNUSED_ARG(pwc);
2878     PERL_UNUSED_ARG(s);
2879     PERL_UNUSED_ARG(len);
2880     return -1;
2881
2882 #else   /* Below we have some form of mbtowc() */
2883 #   if defined(HAS_MBRTOWC)                                     \
2884    && (defined(USE_LOCALE_THREADS) || ! defined(HAS_MBTOWC))
2885 #    define USE_MBRTOWC
2886 #  else
2887 #    undef USE_MBRTOWC
2888 #  endif
2889
2890     int retval = -1;
2891
2892     if (s == NULL) { /* Initialize the shift state to all zeros in
2893                         PL_mbrtowc_ps. */
2894
2895 #  if defined(USE_MBRTOWC)
2896
2897         memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
2898         return 0;
2899
2900 #  else
2901
2902         MBTOWC_LOCK;
2903         SETERRNO(0, 0);
2904         retval = mbtowc(NULL, NULL, 0);
2905         MBTOWC_UNLOCK;
2906         return retval;
2907
2908 #  endif
2909
2910     }
2911
2912 #  if defined(USE_MBRTOWC)
2913
2914     SETERRNO(0, 0);
2915     retval = (SSize_t) mbrtowc((wchar_t *) pwc, s, len, &PL_mbrtowc_ps);
2916
2917 #  else
2918
2919     /* Locking prevents races, but locales can be switched out without locking,
2920      * so this isn't a cure all */
2921     MBTOWC_LOCK;
2922     SETERRNO(0, 0);
2923     retval = mbtowc((wchar_t *) pwc, s, len);
2924     MBTOWC_UNLOCK;
2925
2926 #  endif
2927
2928     return retval;
2929
2930 #endif
2931
2932 }
2933
2934 /*
2935 =for apidoc Perl_localeconv
2936
2937 This is a thread-safe version of the libc L<localeconv(3)>.  It is the same as
2938 L<POSIX::localeconv|POSIX/localeconv> (returning a hash of the C<localeconv()>
2939 fields), but directly callable from XS code.
2940
2941 =cut
2942 */
2943
2944 HV *
2945 Perl_localeconv(pTHX)
2946 {
2947
2948 #if  ! defined(HAS_SOME_LOCALECONV)                                     \
2949  || (! defined(USE_LOCALE_MONETARY) && ! defined(USE_LOCALE_NUMERIC))
2950
2951     return newHV();
2952
2953 #else
2954
2955     return my_localeconv(0, LOCALE_UTF8NESS_UNKNOWN);
2956
2957 #endif
2958
2959 }
2960
2961 #if  defined(HAS_SOME_LOCALECONV)                                   \
2962  && (defined(USE_LOCALE_MONETARY) || defined(USE_LOCALE_NUMERIC))
2963
2964 HV *
2965 S_my_localeconv(pTHX_ const int item, const locale_utf8ness_t locale_is_utf8)
2966 {
2967     HV * retval;
2968     locale_utf8ness_t numeric_locale_is_utf8  = LOCALE_UTF8NESS_UNKNOWN;
2969     locale_utf8ness_t monetary_locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
2970     HV * (*copy_localeconv)(pTHX_ const struct lconv *,
2971                                   const int,
2972                                   const locale_utf8ness_t,
2973                                   const locale_utf8ness_t);
2974
2975     /* A thread-safe locale_conv().  The locking mechanisms vary greatly
2976      * depending on platform capabilities.  They all share this common set up
2977      * code for the function, and then conditional compilations choose one of
2978      * several terminations.
2979      *
2980      * There are two use cases:
2981      * 1) Called from POSIX::locale_conv().  This returns lconv() copied to
2982      *    a hash, based on the current underlying locale.
2983      * 2) Certain items that nl_langinfo() provides are also derivable from
2984      *    the return of localeconv().  Windows notably doesn't have
2985      *    nl_langinfo(), so on that, and actually any platform lacking it,
2986      *    my_localeconv() is used to emulate it for those particular items.
2987      *    The code to do this is compiled only on such platforms.  Rather than
2988      *    going to the expense of creating a full hash when only one item is
2989      *    needed, just the desired item is returned, in an SV cast to an HV.
2990      *
2991      * There is a helper function to accomplish each of the two tasks.  The
2992      * function pointer just below is set to the appropriate one, and is called
2993      * from each of the various implementations, in the middle of whatever
2994      * necessary locking/locale swapping have been done. */
2995
2996 #  ifdef HAS_SOME_LANGINFO
2997
2998     PERL_UNUSED_ARG(item);
2999     PERL_UNUSED_ARG(locale_is_utf8);
3000
3001 #    ifdef USE_LOCALE_NUMERIC
3002
3003     /* When there is a nl_langinfo, we will only be called for localeconv
3004      * numeric purposes. */
3005     const bool is_localeconv_call = true;
3006
3007 #    endif
3008
3009 #  else
3010
3011     /* Note we use this sentinel; this works because this only gets compiled
3012      * when our perl_langinfo.h is used, and that uses negative numbers for all
3013      * the items */
3014     const bool is_localeconv_call = (item == 0);
3015     if (is_localeconv_call)
3016
3017 #  endif
3018
3019     {
3020         copy_localeconv = S_populate_localeconv;
3021
3022 #    ifdef USE_LOCALE_NUMERIC
3023
3024         /* Get the UTF8ness of the locales now to avoid repeating this for each
3025          * string returned by localeconv() */
3026         numeric_locale_is_utf8 = (is_locale_utf8(PL_numeric_name))
3027                                   ? LOCALE_IS_UTF8
3028                                   : LOCALE_NOT_UTF8;
3029
3030 #    endif
3031 #    ifdef USE_LOCALE_MONETARY
3032
3033         monetary_locale_is_utf8 = (is_locale_utf8(querylocale_c(LC_MONETARY)))
3034                                   ? LOCALE_IS_UTF8
3035                                   : LOCALE_NOT_UTF8;
3036
3037 #  endif
3038
3039     }
3040
3041 #  ifndef HAS_SOME_LANGINFO
3042
3043     else {
3044         copy_localeconv = S_get_nl_item_from_localeconv;
3045         numeric_locale_is_utf8 = locale_is_utf8;
3046     }
3047
3048 #  endif
3049
3050     PERL_ARGS_ASSERT_MY_LOCALECONV;
3051 /*--------------------------------------------------------------------------*/
3052 /* Here, we are done with the common beginning of all the implementations of
3053  * my_localeconv().  Below are the various terminations of the function (except
3054  * the closing '}'.  They are separated out because the preprocessor directives
3055  * were making the simple logic hard to follow.  Each implementation ends with
3056  * the same few lines.  khw decided to keep those separate because he thought
3057  * it was clearer to the reader.
3058  *
3059  * The first distinct termination (of the above common code) are the
3060  * implementations when we have locale_conv_l() and can use it.  These are the
3061  * simplest cases, without any locking needed. */
3062 #  if defined(USE_POSIX_2008_LOCALE) && defined(HAS_LOCALECONV_L)
3063
3064      /* And there are two sub-cases: First (by far the most common) is where we
3065       * are compiled to pay attention to LC_NUMERIC */
3066 #    ifdef USE_LOCALE_NUMERIC
3067
3068     const locale_t cur = use_curlocale_scratch();
3069     locale_t with_numeric = duplocale(cur);
3070
3071     /* Just create a new locale object with what we've got, but using the
3072      * underlying LC_NUMERIC locale */
3073     with_numeric = newlocale(LC_NUMERIC_MASK, PL_numeric_name, with_numeric);
3074
3075     retval = copy_localeconv(aTHX_ localeconv_l(with_numeric),
3076                                    item,
3077                                    numeric_locale_is_utf8,
3078                                    monetary_locale_is_utf8);
3079     freelocale(with_numeric);
3080
3081     return retval;
3082
3083 /*--------------------------------------------------------------------------*/
3084 #    else   /* Below not paying attention to LC_NUMERIC */
3085
3086     const locale_t cur = use_curlocale_scratch();
3087
3088     retval = copy_localeconv(aTHX_ localeconv_l(cur),
3089                                    item,
3090                                    numeric_locale_is_utf8,
3091                                    monetary_locale_is_utf8);
3092     return retval;
3093
3094 #    endif  /* Above, using lconv_l(); below plain lconv() */
3095 /*--------------------------------------------------------------------------*/
3096 #  elif ! defined(TS_W32_BROKEN_LOCALECONV)  /* Next is regular lconv() */
3097
3098     /* There are so many locks because localeconv() deals with two
3099      * categories, and returns in a single global static buffer.  Some
3100      * locks might be no-ops on this platform, but not others.  We need to
3101      * lock if any one isn't a no-op. */
3102
3103 #    ifdef USE_LOCALE_NUMERIC
3104
3105     LC_NUMERIC_LOCK(0);
3106     const char * orig_switched_locale = NULL;
3107
3108     /* When called internally, are already switched into the proper numeric
3109      * locale; otherwise must toggle to it */
3110     if (is_localeconv_call) {
3111         orig_switched_locale = toggle_locale_c(LC_NUMERIC, PL_numeric_name);
3112     }
3113
3114 #    endif
3115
3116     LOCALECONV_LOCK;
3117     retval = copy_localeconv(aTHX_ localeconv(),
3118                                    item,
3119                                    numeric_locale_is_utf8,
3120                                    monetary_locale_is_utf8);
3121     LOCALECONV_UNLOCK;
3122
3123 #    ifdef USE_LOCALE_NUMERIC
3124
3125     if (orig_switched_locale) {
3126         restore_toggled_locale_c(LC_NUMERIC, orig_switched_locale);
3127     }
3128     LC_NUMERIC_UNLOCK;
3129
3130 #    endif
3131
3132     return retval;
3133
3134 /*--------------------------------------------------------------------------*/
3135 #  else /* defined(TS_W32_BROKEN_LOCALECONV) */
3136
3137     /* Last is a workaround for the broken localeconv() on Windows with
3138      * thread-safe locales prior to VS 15.  It looks at the global locale
3139      * instead of the thread one.  As a work-around, we toggle to the global
3140      * locale; populate the return; then toggle back.  We have to use LC_ALL
3141      * instead of the individual categories because of another bug in Windows.
3142      *
3143      * This introduces a potential race with any other thread that has also
3144      * converted to use the global locale, and doesn't protect its locale calls
3145      * with mutexes.  khw can't think of any reason for a thread to do so on
3146      * Windows, as the locale API is the same regardless of thread-safety, except
3147      * if the code is ported from working on another platform where there might
3148      * be some reason to do this.  But this is typically due to some
3149      * alien-to-Perl library that thinks it owns locale setting.  Such a
3150      * library usn't likely to exist on Windows, so such an application is
3151      * unlikely to be run on Windows
3152      */
3153     bool restore_per_thread = FALSE;
3154
3155 #    ifdef USE_LOCALE_NUMERIC
3156
3157     const char * orig_switched_locale = NULL;
3158
3159     LC_NUMERIC_LOCK(0);
3160
3161     /* When called internally, are already switched into the proper numeric
3162      * locale; otherwise must toggle to it */
3163     if (is_localeconv_call) {
3164         orig_switched_locale = toggle_locale_c(LC_NUMERIC, PL_numeric_name);
3165     }
3166
3167 #    endif
3168
3169     /* Save the per-thread locale state */
3170     const char * save_thread = querylocale_c(LC_ALL);
3171
3172     /* Change to the global locale, and note if we already were there */
3173     if (_configthreadlocale(_DISABLE_PER_THREAD_LOCALE)
3174                          != _DISABLE_PER_THREAD_LOCALE)
3175     {
3176         restore_per_thread = TRUE;
3177     }
3178
3179     /* Save the state of the global locale; then convert to our desired
3180      * state.  */
3181     const char * save_global = querylocale_c(LC_ALL);
3182     void_setlocale_c(LC_ALL, save_thread);
3183
3184     /* Safely stash the desired data */
3185     LOCALECONV_LOCK;
3186     retval = copy_localeconv(aTHX_ localeconv(),
3187                                    item,
3188                                    numeric_locale_is_utf8,
3189                                    monetary_locale_is_utf8);
3190     LOCALECONV_UNLOCK;
3191
3192     /* Restore the global locale's prior state */
3193     void_setlocale_c(LC_ALL, save_global);
3194
3195     /* And back to per-thread locales */
3196     if (restore_per_thread) {
3197         _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
3198     }
3199
3200     /* Restore the per-thread locale state */
3201     void_setlocale_c(LC_ALL, save_thread);
3202
3203 #    ifdef USE_LOCALE_NUMERIC
3204
3205     if (orig_switched_locale) {
3206         restore_toggled_locale_c(LC_NUMERIC, orig_switched_locale);
3207     }
3208     LC_NUMERIC_UNLOCK;
3209
3210 #    endif
3211
3212     return retval;
3213
3214 #  endif
3215 /*--------------------------------------------------------------------------*/
3216 }
3217
3218 STATIC HV *
3219 S_populate_localeconv(pTHX_ const struct lconv *lcbuf,
3220                             const int unused,
3221                             const locale_utf8ness_t numeric_locale_is_utf8,
3222                             const locale_utf8ness_t monetary_locale_is_utf8)
3223 {
3224     /* This returns a mortalized hash containing all the elements returned by
3225      * localeconv().  It is used by Perl_localeconv() and POSIX::localeconv()
3226      */
3227     PERL_UNUSED_ARG(unused);
3228
3229     struct lconv_offset {
3230         const char *name;
3231         size_t offset;
3232     };
3233
3234     /* Create e.g.,
3235         {"thousands_sep", STRUCT_OFFSET(struct lconv, thousands_sep)},
3236      */
3237 #  define LCONV_ENTRY(name)                                         \
3238             {STRINGIFY(name), STRUCT_OFFSET(struct lconv, name)}
3239
3240     /* Set up structures containing the documented fields.  One structure for
3241      * LC_NUMERIC-controlled strings; one for LC_MONETARY ones, and a final one
3242      * of just numerics. */
3243 #  ifdef USE_LOCALE_NUMERIC
3244
3245     static const struct lconv_offset lconv_numeric_strings[] = {
3246         LCONV_ENTRY(decimal_point),
3247         LCONV_ENTRY(thousands_sep),
3248 #    ifndef NO_LOCALECONV_GROUPING
3249         LCONV_ENTRY(grouping),
3250 #    endif
3251         {NULL, 0}
3252     };
3253
3254 #  endif
3255 #  ifdef USE_LOCALE_MONETARY
3256
3257     static const struct lconv_offset lconv_monetary_strings[] = {
3258         LCONV_ENTRY(int_curr_symbol),
3259         LCONV_ENTRY(currency_symbol),
3260         LCONV_ENTRY(mon_decimal_point),
3261 #    ifndef NO_LOCALECONV_MON_THOUSANDS_SEP
3262         LCONV_ENTRY(mon_thousands_sep),
3263 #    endif
3264 #    ifndef NO_LOCALECONV_MON_GROUPING
3265         LCONV_ENTRY(mon_grouping),
3266 #    endif
3267         LCONV_ENTRY(positive_sign),
3268         LCONV_ENTRY(negative_sign),
3269         {NULL, 0}
3270     };
3271
3272 #  endif
3273
3274     static const struct lconv_offset lconv_integers[] = {
3275 #  ifdef USE_LOCALE_MONETARY
3276         LCONV_ENTRY(int_frac_digits),
3277         LCONV_ENTRY(frac_digits),
3278         LCONV_ENTRY(p_cs_precedes),
3279         LCONV_ENTRY(p_sep_by_space),
3280         LCONV_ENTRY(n_cs_precedes),
3281         LCONV_ENTRY(n_sep_by_space),
3282         LCONV_ENTRY(p_sign_posn),
3283         LCONV_ENTRY(n_sign_posn),
3284 #    ifdef HAS_LC_MONETARY_2008
3285         LCONV_ENTRY(int_p_cs_precedes),
3286         LCONV_ENTRY(int_p_sep_by_space),
3287         LCONV_ENTRY(int_n_cs_precedes),
3288         LCONV_ENTRY(int_n_sep_by_space),
3289         LCONV_ENTRY(int_p_sign_posn),
3290         LCONV_ENTRY(int_n_sign_posn),
3291 #    endif
3292 #  endif
3293         {NULL, 0}
3294     };
3295
3296     static const unsigned category_indices[] = {
3297 #  ifdef USE_LOCALE_NUMERIC
3298         LC_NUMERIC_INDEX_,
3299 #  endif
3300 #  ifdef USE_LOCALE_MONETARY
3301         LC_MONETARY_INDEX_,
3302 #  endif
3303         (unsigned) -1   /* Just so the previous element can always end with a
3304                            comma => subtract 1 below for the max loop index */
3305     };
3306
3307     const char *ptr = (const char *) lcbuf;
3308     const struct lconv_offset *integers = lconv_integers;
3309
3310     HV * retval = newHV();
3311     sv_2mortal((SV*)retval);
3312
3313     PERL_ARGS_ASSERT_POPULATE_LOCALECONV;
3314
3315     /* For each enabled category ... */
3316     for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(category_indices) - 1; i++) {
3317         const unsigned cat_index = category_indices[i];
3318         locale_utf8ness_t locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
3319         const char *locale;
3320
3321         /* ( = NULL silences a compiler warning; would segfault if it could
3322          * actually happen.) */
3323         const struct lconv_offset *strings = NULL;
3324
3325 #  ifdef USE_LOCALE_NUMERIC
3326         if (cat_index == LC_NUMERIC_INDEX_) {
3327             locale_is_utf8 = numeric_locale_is_utf8;
3328             strings = lconv_numeric_strings;
3329         }
3330 #  else
3331         PERL_UNUSED_ARG(numeric_locale_is_utf8);
3332 #  endif
3333 #  ifdef USE_LOCALE_MONETARY
3334         if (cat_index == LC_MONETARY_INDEX_) {
3335             locale_is_utf8 = monetary_locale_is_utf8;
3336             strings = lconv_monetary_strings;
3337         }
3338 #  else
3339         PERL_UNUSED_ARG(monetary_locale_is_utf8);
3340 #  endif
3341
3342         assert(locale_is_utf8 != LOCALE_UTF8NESS_UNKNOWN);
3343
3344         /* Iterate over the strings structure for this category */
3345         locale = querylocale_i(cat_index);
3346
3347         while (strings->name) {
3348             const char *value = *((const char **)(ptr + strings->offset));
3349             if (value && *value) {
3350                 bool is_utf8 =  /* Only make UTF-8 if required to */
3351                     (UTF8NESS_YES == (get_locale_string_utf8ness_i(locale,
3352                                                               cat_index,
3353                                                               value,
3354                                                               locale_is_utf8)));
3355                 (void) hv_store(retval,
3356                                 strings->name,
3357                                 strlen(strings->name),
3358                                 newSVpvn_utf8(value, strlen(value), is_utf8),
3359                                 0);
3360             }
3361
3362             strings++;
3363         }
3364     }
3365
3366     while (integers->name) {
3367         const char value = *((const char *)(ptr + integers->offset));
3368
3369         if (value != CHAR_MAX)
3370             (void) hv_store(retval, integers->name,
3371                             strlen(integers->name), newSViv(value), 0);
3372         integers++;
3373     }
3374
3375     return retval;
3376 }
3377
3378 #  ifndef HAS_SOME_LANGINFO
3379
3380 STATIC HV *
3381 S_get_nl_item_from_localeconv(pTHX_ const struct lconv *lcbuf,
3382                                     const int item,
3383                                     const locale_utf8ness_t unused1,
3384                                     const locale_utf8ness_t unused2)
3385 {
3386     /* This is a helper function for my_localeconv(), which is called from
3387      * my_langinfo() to emulate the libc nl_langinfo() function on platforms
3388      * that don't have it available.
3389      *
3390      * This function acts as an extension to my_langinfo(), the intermediate
3391      * my_localeconv() call is to set up the locks and switch into the proper
3392      * locale.  That logic exists for other reasons, and by doing it this way,
3393      * it doesn't have to be duplicated.
3394      *
3395      * This function extracts the current value of 'item' in the current locale
3396      * using the localconv() result also passed in, via 'lcbuf'.  The other
3397      * parameter is unused, a placeholder so the signature of this function
3398      * matches another that does need it, and so the two functions can be
3399      * referred to by a single function pointer, to simplify the code below */
3400
3401     const char * prefix = "";
3402     const char * temp = NULL;
3403
3404     PERL_ARGS_ASSERT_GET_NL_ITEM_FROM_LOCALECONV;
3405     PERL_UNUSED_ARG(unused1);
3406     PERL_UNUSED_ARG(unused2);
3407
3408     switch (item) {
3409       case CRNCYSTR:
3410         temp = lcbuf->currency_symbol;
3411
3412         if (lcbuf->p_cs_precedes) {
3413
3414             /* khw couldn't find any documentation that CHAR_MAX is the signal,
3415              * but cygwin uses it thusly */
3416             if (lcbuf->p_cs_precedes == CHAR_MAX) {
3417                 prefix = ".";
3418             }
3419             else {
3420                 prefix = "-";
3421             }
3422         }
3423         else {
3424             prefix = "+";
3425         }
3426
3427         break;
3428
3429       case RADIXCHAR:
3430         temp = lcbuf->decimal_point;
3431         break;
3432
3433       case THOUSEP:
3434         temp = lcbuf->thousands_sep;
3435         break;
3436
3437       default:
3438         locale_panic_(Perl_form(aTHX_
3439                     "Unexpected item passed to populate_localeconv: %d", item));
3440     }
3441
3442     return (HV *) Perl_newSVpvf(aTHX_ "%s%s", prefix, temp);
3443 }
3444
3445 #  endif    /* ! Has some form of langinfo() */
3446 #endif      /*   Has some form of localeconv() and paying attn to a category it
3447                  traffics in */
3448
3449 #ifndef HAS_SOME_LANGINFO
3450
3451 typedef int nl_item;    /* Substitute 'int' for emulated nl_langinfo() */
3452
3453 #endif
3454
3455 /*
3456
3457 =for apidoc      Perl_langinfo
3458 =for apidoc_item Perl_langinfo8
3459
3460 C<Perl_langinfo> is an (almost) drop-in replacement for the system
3461 C<L<nl_langinfo(3)>>, taking the same C<item> parameter values, and returning
3462 the same information.  But it is more thread-safe than regular
3463 C<nl_langinfo()>, and hides the quirks of Perl's locale handling from your
3464 code, and can be used on systems that lack a native C<nl_langinfo>.
3465
3466 However, you should instead use the improved version of this:
3467 L</Perl_langinfo8>, which behaves identically except for an additional
3468 parameter, a pointer to a variable declared as L</C<utf8ness_t>>, into which it
3469 returns to you how you should treat the returned string with regards to it
3470 being encoded in UTF-8 or not.
3471
3472 Concerning the differences between these and plain C<nl_langinfo()>:
3473
3474 =over
3475
3476 =item a.
3477
3478 C<Perl_langinfo8> has an extra parameter, described above.  Besides this, the
3479 other reasons they aren't quite a drop-in replacement is actually an advantage.
3480 The C<const>ness of the return allows the compiler to catch attempts to write
3481 into the returned buffer, which is illegal and could cause run-time crashes.
3482
3483 =item b.
3484
3485 They deliver the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
3486 without you having to write extra code.  The reason for the extra code would be
3487 because these are from the C<LC_NUMERIC> locale category, which is normally
3488 kept set by Perl so that the radix is a dot, and the separator is the empty
3489 string, no matter what the underlying locale is supposed to be, and so to get
3490 the expected results, you have to temporarily toggle into the underlying
3491 locale, and later toggle back.  (You could use plain C<nl_langinfo> and
3492 C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this but then you wouldn't get
3493 the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
3494 (or equivalent) locale would break a lot of CPAN, which is expecting the radix
3495 (decimal point) character to be a dot.)
3496
3497 =item c.
3498
3499 The system function they replace can have its static return buffer trashed,
3500 not only by a subsequent call to that function, but by a C<freelocale>,
3501 C<setlocale>, or other locale change.  The returned buffer of these functions
3502 is not changed until the next call to one or the other, so the buffer is never
3503 in a trashed state.
3504
3505 =item d.
3506
3507 The return buffer is per-thread, so it also is never overwritten by a call to
3508 these functions from another thread;  unlike the function it replaces.
3509
3510 =item e.
3511
3512 But most importantly, they work on systems that don't have C<nl_langinfo>, such
3513 as Windows, hence making your code more portable.  Of the fifty-some possible
3514 items specified by the POSIX 2008 standard,
3515 L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
3516 only one is completely unimplemented, though on non-Windows platforms, another
3517 significant one is not fully implemented).  They use various techniques to
3518 recover the other items, including calling C<L<localeconv(3)>>, and
3519 C<L<strftime(3)>>, both of which are specified in C89, so should be always be
3520 available.  Later C<strftime()> versions have additional capabilities; C<""> is
3521 returned for any item not available on your system.
3522
3523 It is important to note that, when called with an item that is recovered by
3524 using C<localeconv>, the buffer from any previous explicit call to
3525 C<L<localeconv(3)>> will be overwritten.  But you shouldn't be using
3526 C<localeconv> anyway because it is is very much not thread-safe, and suffers
3527 from the same problems outlined in item 'b.' above for the fields it returns that
3528 are controlled by the LC_NUMERIC locale category.  Instead, avoid all of those
3529 problems by calling L</Perl_localeconv>, which is thread-safe; or by using the
3530 methods given in L<perlcall>  to call
3531 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is also thread-safe.
3532
3533 =back
3534
3535 The details for those items which may deviate from what this emulation returns
3536 and what a native C<nl_langinfo()> would return are specified in
3537 L<I18N::Langinfo>.
3538
3539 When using C<Perl_langinfo8> (or plain C<Perl_langinfo>) on systems that don't
3540 have a native C<nl_langinfo()>, you must
3541
3542  #include "perl_langinfo.h"
3543
3544 before the C<perl.h> C<#include>.  You can replace your C<langinfo.h>
3545 C<#include> with this one.  (Doing it this way keeps out the symbols that plain
3546 C<langinfo.h> would try to import into the namespace for code that doesn't need
3547 it.)
3548
3549 =cut
3550
3551 */
3552
3553 const char *
3554 Perl_langinfo(const nl_item item)
3555 {
3556     return Perl_langinfo8(item, NULL);
3557 }
3558
3559 const char *
3560 Perl_langinfo8(const nl_item item, utf8ness_t * utf8ness)
3561 {
3562     dTHX;
3563     unsigned cat_index;
3564
3565     PERL_ARGS_ASSERT_PERL_LANGINFO8;
3566
3567     if (utf8ness) {     /* Assume for now */
3568         *utf8ness = UTF8NESS_IMMATERIAL;
3569     }
3570
3571     /* Find the locale category that controls the input 'item'.  If we are not
3572      * paying attention to that category, instead return a default value.  Also
3573      * return the default value if there is no way for us to figure out the
3574      * correct value.  If we have some form of nl_langinfo(), we can always
3575      * figure it out, but lacking that, there may be alternative methods that
3576      * can be used to recover most of the possible items.  Some of those
3577      * methods need libc functions, which may or may not be available.  If
3578      * unavailable, we can't compute the correct value, so must here return the
3579      * default. */
3580     switch (item) {
3581
3582       case CODESET:
3583
3584 #ifdef USE_LOCALE_CTYPE
3585
3586         cat_index = LC_CTYPE_INDEX_;
3587         break;
3588
3589 #else
3590         return C_codeset;
3591 #endif
3592 #if defined(USE_LOCALE_MESSAGES) && defined(HAS_SOME_LANGINFO)
3593
3594       case YESEXPR: case YESSTR: case NOEXPR: case NOSTR:
3595         cat_index = LC_MESSAGES_INDEX_;
3596         break;
3597 #else
3598       case YESEXPR:   return "^[+1yY]";
3599       case YESSTR:    return "yes";
3600       case NOEXPR:    return "^[-0nN]";
3601       case NOSTR:     return "no";
3602 #endif
3603
3604       case CRNCYSTR:
3605
3606 #if  defined(USE_LOCALE_MONETARY)                                   \
3607  && (defined(HAS_SOME_LANGINFO) || defined(HAS_SOME_LOCALECONV))
3608
3609         cat_index = LC_MONETARY_INDEX_;
3610         break;
3611 #else
3612         return "-";
3613 #endif
3614
3615       case RADIXCHAR:
3616
3617 #ifdef CAN_CALCULATE_RADIX
3618
3619         cat_index = LC_NUMERIC_INDEX_;
3620         break;
3621 #else
3622         return C_decimal_point;
3623 #endif
3624
3625       case THOUSEP:
3626
3627 #if  defined(USE_LOCALE_NUMERIC)                                    \
3628  && (defined(HAS_SOME_LANGINFO) || defined(HAS_SOME_LOCALECONV))
3629
3630         cat_index = LC_NUMERIC_INDEX_;
3631         break;
3632 #else
3633         return C_thousands_sep;
3634 #endif
3635
3636 /* The other possible items are all in LC_TIME. */
3637 #ifdef USE_LOCALE_TIME
3638
3639       default:
3640         cat_index = LC_TIME_INDEX_;
3641         break;
3642
3643 #endif
3644 #if ! defined(USE_LOCALE_TIME) || ! defined(HAS_SOME_LANGINFO)
3645
3646     /* If not using LC_TIME, hard code the rest.  Or, if there is no
3647      * nl_langinfo(), we use strftime() as an alternative, and it is missing
3648      * functionality to get every single one, so hard-code those */
3649
3650       case ERA: return "";  /* Unimplemented; for use with strftime() %E
3651                                modifier */
3652
3653       /* These formats are defined by C89, so we assume that strftime supports
3654        * them, and so are returned unconditionally; they may not be what the
3655        * locale actually says, but should give good enough results for someone
3656        * using them as formats (as opposed to trying to parse them to figure
3657        * out what the locale says).  The other format items are actually tested
3658        * to verify they work on the platform */
3659       case D_FMT:         return "%x";
3660       case T_FMT:         return "%X";
3661       case D_T_FMT:       return "%c";
3662
3663 #  if defined(WIN32) || ! defined(USE_LOCALE_TIME)
3664
3665       /* strftime() on Windows doesn't have the POSIX (beyond C89) extensions
3666        * that would allow it to recover these */
3667       case ERA_D_FMT:     return "%x";
3668       case ERA_T_FMT:     return "%X";
3669       case ERA_D_T_FMT:   return "%c";
3670       case ALT_DIGITS:    return "0";
3671
3672 #  endif
3673 #  ifndef USE_LOCALE_TIME
3674
3675       case T_FMT_AMPM:    return "%r";
3676       case ABDAY_1:       return "Sun";
3677       case ABDAY_2:       return "Mon";
3678       case ABDAY_3:       return "Tue";
3679       case ABDAY_4:       return "Wed";
3680       case ABDAY_5:       return "Thu";
3681       case ABDAY_6:       return "Fri";
3682       case ABDAY_7:       return "Sat";
3683       case AM_STR:        return "AM";
3684       case PM_STR:        return "PM";
3685       case ABMON_1:       return "Jan";
3686       case ABMON_2:       return "Feb";
3687       case ABMON_3:       return "Mar";
3688       case ABMON_4:       return "Apr";
3689       case ABMON_5:       return "May";
3690       case ABMON_6:       return "Jun";
3691       case ABMON_7:       return "Jul";
3692       case ABMON_8:       return "Aug";
3693       case ABMON_9:       return "Sep";
3694       case ABMON_10:      return "Oct";
3695       case ABMON_11:      return "Nov";
3696       case ABMON_12:      return "Dec";
3697       case DAY_1:         return "Sunday";
3698       case DAY_2:         return "Monday";
3699       case DAY_3:         return "Tuesday";
3700       case DAY_4:         return "Wednesday";
3701       case DAY_5:         return "Thursday";
3702       case DAY_6:         return "Friday";
3703       case DAY_7:         return "Saturday";
3704       case MON_1:         return "January";
3705       case MON_2:         return "February";
3706       case MON_3:         return "March";
3707       case MON_4:         return "April";
3708       case MON_5:         return "May";
3709       case MON_6:         return "June";
3710       case MON_7:         return "July";
3711       case MON_8:         return "August";
3712       case MON_9:         return "September";
3713       case MON_10:        return "October";
3714       case MON_11:        return "November";
3715       case MON_12:        return "December";
3716
3717 #  endif
3718 #endif
3719
3720     } /* End of switch on item */
3721
3722 #ifndef USE_LOCALE
3723
3724     Perl_croak_nocontext("panic: Unexpected nl_langinfo() item %d", item);
3725     NOT_REACHED; /* NOTREACHED */
3726     PERL_UNUSED_VAR(cat_index);
3727
3728 #else
3729 #  ifdef USE_LOCALE_NUMERIC
3730
3731     /* Use either the underlying numeric, or the other underlying categories */
3732     if (cat_index == LC_NUMERIC_INDEX_) {
3733         return my_langinfo_c(item, LC_NUMERIC, PL_numeric_name,
3734                              &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
3735     }
3736     else
3737
3738 #  endif
3739
3740     {
3741         return my_langinfo_i(item, cat_index, querylocale_i(cat_index),
3742                              &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
3743     }
3744
3745 #endif
3746
3747 }
3748
3749 #ifdef USE_LOCALE
3750
3751 /* There are several implementations of my_langinfo, depending on the
3752  * Configuration.  They all share the same beginning of the function */
3753 STATIC const char *
3754 S_my_langinfo_i(pTHX_
3755                 const nl_item item,           /* The item to look up */
3756                 const unsigned int cat_index, /* The locale category that
3757                                                  controls it */
3758                 /* The locale to look up 'item' in. */
3759                 const char * locale,
3760
3761                 /* Where to store the result, and where the size of that buffer
3762                  * is stored, updated on exit. retbuf_sizep may be NULL for an
3763                  * empty-on-entry, single use buffer whose size we don't need
3764                  * to keep track of */
3765                 const char ** retbufp,
3766                 Size_t * retbuf_sizep,
3767
3768                 /* If not NULL, the location to store the UTF8-ness of 'item's
3769                  * value, as documented */
3770                 utf8ness_t * utf8ness)
3771 {
3772     const char * retval = NULL;
3773
3774     PERL_ARGS_ASSERT_MY_LANGINFO_I;
3775     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
3776
3777     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3778                            "Entering my_langinfo item=%d, using locale %s\n",
3779                            item, locale));
3780 /*--------------------------------------------------------------------------*/
3781 /* Above is the common beginning to all the implementations of my_langinfo().
3782  * Below are the various completions.
3783  *
3784  * Some platforms don't deal well with non-ASCII strings in locale X when
3785  * LC_CTYPE is not in X.  (Actually it is probably when X is UTF-8 and LC_CTYPE
3786  * isn't, or vice versa).  There is explicit code to bring the categories into
3787  * sync.  This doesn't seem to be a problem with nl_langinfo(), so that
3788  * implementation doesn't currently worry about it.  But it is a problem on
3789  * Windows boxes, which don't have nl_langinfo(). */
3790
3791 #  if defined(HAS_THREAD_SAFE_NL_LANGINFO_L) && defined(USE_POSIX_2008_LOCALE)
3792
3793     /* Simplest is if we can use nl_langinfo_l()
3794      *
3795      * With it, we can change LC_CTYPE in the same call as the other category */
3796 #    ifdef USE_LOCALE_CTYPE
3797 #      define CTYPE_SAFETY_MASK LC_CTYPE_MASK
3798 #    else
3799 #      define CTYPE_SAFETY_MASK 0
3800 #    endif
3801
3802     locale_t cur = newlocale((category_masks[cat_index] | CTYPE_SAFETY_MASK),
3803                              locale, (locale_t) 0);
3804
3805     retval = save_to_buffer(nl_langinfo_l(item, cur), retbufp, retbuf_sizep);
3806     if (utf8ness) {
3807         *utf8ness = get_locale_string_utf8ness_i(locale, cat_index, retval,
3808                                                  LOCALE_UTF8NESS_UNKNOWN);
3809     }
3810
3811     freelocale(cur);
3812
3813     return retval;
3814 /*--------------------------------------------------------------------------*/
3815 #  elif defined(HAS_NL_LANGINFO) /* nl_langinfo() is available.  */
3816
3817     /* The second version of my_langinfo() is if we have plain nl_langinfo() */
3818
3819 #    ifdef USE_LOCALE_CTYPE
3820
3821     /* Ths function sorts out if things actually have to be switched or not,
3822      * for both calls. */
3823     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
3824
3825 #    endif
3826
3827     const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
3828
3829     NL_LANGINFO_LOCK;
3830     retval = save_to_buffer(nl_langinfo(item), retbufp, retbuf_sizep);
3831     NL_LANGINFO_UNLOCK;
3832
3833     if (utf8ness) {
3834         *utf8ness = get_locale_string_utf8ness_i(locale, cat_index,
3835                                                retval, LOCALE_UTF8NESS_UNKNOWN);
3836     }
3837
3838     restore_toggled_locale_i(cat_index, orig_switched_locale);
3839
3840 #    ifdef USE_LOCALE_CTYPE
3841     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
3842 #    endif
3843
3844     return retval;
3845 /*--------------------------------------------------------------------------*/
3846 #  else   /* Below, emulate nl_langinfo as best we can */
3847
3848     /* And the third and final completion is where we have to emulate
3849      * nl_langinfo().  There are various possibilities depending on the
3850      * Configuration */
3851
3852 #    ifdef USE_LOCALE_CTYPE
3853
3854     const char * orig_CTYPE_locale =  toggle_locale_c(LC_CTYPE, locale);
3855
3856 #    endif
3857
3858     const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
3859
3860     /* Here, we are in the locale we want information about */
3861
3862     /* Almost all the items will have ASCII return values.  Set that here, and
3863      * override if necessary */
3864     utf8ness_t is_utf8 = UTF8NESS_IMMATERIAL;
3865
3866     switch (item) {
3867       default:
3868         retval = "";
3869         break;
3870
3871       case RADIXCHAR:
3872
3873 #    if      defined(HAS_SNPRINTF)                                              \
3874        && (! defined(HAS_SOME_LOCALECONV) || defined(TS_W32_BROKEN_LOCALECONV))
3875
3876         {
3877             /* snprintf() can be used to find the radix character by outputting
3878              * a known simple floating point number to a buffer, and parsing
3879              * it, inferring the radix as the bytes separating the integer and
3880              * fractional parts.  But localeconv() is more direct, not
3881              * requiring inference, so use it instead of the code just below,
3882              * if (likely) it is available and works ok */
3883
3884             char * floatbuf = NULL;
3885             const Size_t initial_size = 10;
3886
3887             Newx(floatbuf, initial_size, char);
3888
3889             /* 1.5 is exactly representable on binary computers */
3890             Size_t needed_size = snprintf(floatbuf, initial_size, "%.1f", 1.5);
3891
3892             /* If our guess wasn't big enough, increase and try again, based on
3893              * the real number that strnprintf() is supposed to return */
3894             if (UNLIKELY(needed_size >= initial_size)) {
3895                 needed_size++;  /* insurance */
3896                 Renew(floatbuf, needed_size, char);
3897                 Size_t new_needed = snprintf(floatbuf, needed_size, "%.1f", 1.5);
3898                 assert(new_needed <= needed_size);
3899                 needed_size = new_needed;
3900             }
3901
3902             char * s = floatbuf;
3903             char * e = floatbuf + needed_size;
3904
3905             /* Find the '1' */
3906             while (s < e && *s != '1') {
3907                 s++;
3908             }
3909
3910             if (LIKELY(s < e)) {
3911                 s++;
3912             }
3913
3914             /* Find the '5' */
3915             char * item_start = s;
3916             while (s < e && *s != '5') {
3917                 s++;
3918             }
3919
3920             /* Everything in between is the radix string */
3921             if (LIKELY(s < e)) {
3922                 *s = '\0';
3923                 retval = save_to_buffer(item_start,
3924                                         (const char **) &PL_langinfo_buf,
3925                                         &PL_langinfo_bufsize);
3926                 Safefree(floatbuf);
3927
3928                 if (utf8ness) {
3929                     is_utf8 = get_locale_string_utf8ness_i(locale, cat_index,
3930                                                            retval,
3931                                                        LOCALE_UTF8NESS_UNKNOWN);
3932
3933                 }
3934
3935                 break;
3936             }
3937
3938             Safefree(floatbuf);
3939         }
3940
3941 #      ifdef HAS_SOME_LOCALECONV /* snprintf() failed; drop down to use
3942                                     localeconv() */
3943
3944         /* FALLTHROUGH */                                                           \
3945
3946 #      else                      /* snprintf() failed and no localeconv() */
3947
3948         retval = C_decimal_point;
3949         break;
3950
3951 #      endif
3952 #    endif
3953 #    ifdef HAS_SOME_LOCALECONV
3954
3955     /* These items are available from localeconv().  (To avoid using
3956      * TS_W32_BROKEN_LOCALECONV, one could use GetNumberFormat and
3957      * GetCurrencyFormat; patches welcome) */
3958
3959       case CRNCYSTR:
3960       case THOUSEP:
3961         {
3962             SV * string = (SV *) my_localeconv(item, LOCALE_UTF8NESS_UNKNOWN);
3963
3964             retval = save_to_buffer(SvPV_nolen(string), retbufp, retbuf_sizep);
3965
3966             if (utf8ness) {
3967                 is_utf8 = get_locale_string_utf8ness_i(locale, cat_index, retval,
3968                                                        LOCALE_UTF8NESS_UNKNOWN);
3969             }
3970
3971             SvREFCNT_dec_NN(string);
3972             break;
3973         }
3974
3975 #    endif  /* Some form of localeconv */
3976 #    ifdef HAS_STRFTIME
3977
3978       /* These formats are only available in later strfmtime's */
3979       case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM:
3980
3981       /* The rest can be gotten from most versions of strftime(). */
3982       case ABDAY_1: case ABDAY_2: case ABDAY_3:
3983       case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7:
3984       case ALT_DIGITS:
3985       case AM_STR: case PM_STR:
3986       case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4:
3987       case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8:
3988       case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12:
3989       case DAY_1: case DAY_2: case DAY_3: case DAY_4:
3990       case DAY_5: case DAY_6: case DAY_7:
3991       case MON_1: case MON_2: case MON_3: case MON_4:
3992       case MON_5: case MON_6: case MON_7: case MON_8:
3993       case MON_9: case MON_10: case MON_11: case MON_12:
3994         {
3995             const char * format;
3996             bool return_format = FALSE;
3997             int mon = 0;
3998             int mday = 1;
3999             int hour = 6;
4000
4001             GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
4002
4003             switch (item) {
4004               default:
4005                 locale_panic_(Perl_form(aTHX_ "switch case: %d problem", item));
4006                 NOT_REACHED; /* NOTREACHED */
4007
4008               case PM_STR: hour = 18;
4009               case AM_STR:
4010                 format = "%p";
4011                 break;
4012               case ABDAY_7: mday++;
4013               case ABDAY_6: mday++;
4014               case ABDAY_5: mday++;
4015               case ABDAY_4: mday++;
4016               case ABDAY_3: mday++;
4017               case ABDAY_2: mday++;
4018               case ABDAY_1:
4019                 format = "%a";
4020                 break;
4021               case DAY_7: mday++;
4022               case DAY_6: mday++;
4023               case DAY_5: mday++;
4024               case DAY_4: mday++;
4025               case DAY_3: mday++;
4026               case DAY_2: mday++;
4027               case DAY_1:
4028                 format = "%A";
4029                 break;
4030               case ABMON_12: mon++;
4031               case ABMON_11: mon++;
4032               case ABMON_10: mon++;
4033               case ABMON_9:  mon++;
4034               case ABMON_8:  mon++;
4035               case ABMON_7:  mon++;
4036               case ABMON_6:  mon++;
4037               case ABMON_5:  mon++;
4038               case ABMON_4:  mon++;
4039               case ABMON_3:  mon++;
4040               case ABMON_2:  mon++;
4041               case ABMON_1:
4042                 format = "%b";
4043                 break;
4044               case MON_12: mon++;
4045               case MON_11: mon++;
4046               case MON_10: mon++;
4047               case MON_9:  mon++;
4048               case MON_8:  mon++;
4049               case MON_7:  mon++;
4050               case MON_6:  mon++;
4051               case MON_5:  mon++;
4052               case MON_4:  mon++;
4053               case MON_3:  mon++;
4054               case MON_2:  mon++;
4055               case MON_1:
4056                 format = "%B";
4057                 break;
4058               case T_FMT_AMPM:
4059                 format = "%r";
4060                 return_format = TRUE;
4061                 break;
4062               case ERA_D_FMT:
4063                 format = "%Ex";
4064                 return_format = TRUE;
4065                 break;
4066               case ERA_T_FMT:
4067                 format = "%EX";
4068                 return_format = TRUE;
4069                 break;
4070               case ERA_D_T_FMT:
4071                 format = "%Ec";
4072                 return_format = TRUE;
4073                 break;
4074               case ALT_DIGITS:
4075                 format = "%Ow"; /* Find the alternate digit for 0 */
4076                 break;
4077             }
4078
4079             GCC_DIAG_RESTORE_STMT;
4080
4081             /* The year was deliberately chosen so that January 1 is on the
4082              * first day of the week.  Since we're only getting one thing at a
4083              * time, it all works */
4084             const char * temp = my_strftime8(format, 30, 30, hour, mday, mon,
4085                                              2011, 0, 0, 0, &is_utf8);
4086             retval = save_to_buffer(temp, retbufp, retbuf_sizep);
4087             Safefree(temp);
4088
4089             /* If the item is 'ALT_DIGITS', '*retbuf' contains the alternate
4090              * format for wday 0.  If the value is the same as the normal 0,
4091              * there isn't an alternate, so clear the buffer.
4092              *
4093              * (wday was chosen because its range is all a single digit.
4094              * Things like tm_sec have two digits as the minimum: '00'.) */
4095             if (item == ALT_DIGITS && strEQ(*retbufp, "0")) {
4096                 retval = "";
4097                 break;
4098             }
4099
4100             /* ALT_DIGITS is problematic.  Experiments on it showed that
4101              * strftime() did not always work properly when going from alt-9 to
4102              * alt-10.  Only a few locales have this item defined, and in all
4103              * of them on Linux that khw was able to find, nl_langinfo() merely
4104              * returned the alt-0 character, possibly doubled.  Most Unicode
4105              * digits are in blocks of 10 consecutive code points, so that is
4106              * sufficient information for such scripts, as we can infer alt-1,
4107              * alt-2, ....  But for a Japanese locale, a CJK ideographic 0 is
4108              * returned, and the CJK digits are not in code point order, so you
4109              * can't really infer anything.  The localedef for this locale did
4110              * specify the succeeding digits, so that strftime() works properly
4111              * on them, without needing to infer anything.  But the
4112              * nl_langinfo() return did not give sufficient information for the
4113              * caller to understand what's going on.  So until there is
4114              * evidence that it should work differently, this returns the alt-0
4115              * string for ALT_DIGITS. */
4116
4117             if (return_format) {
4118
4119                 /* If to return the format, not the value, overwrite the buffer
4120                  * with it.  But some strftime()s will keep the original format
4121                  * if illegal, so change those to "" */
4122                 if (strEQ(*retbufp, format)) {
4123                     retval = "";
4124                 }
4125                 else {
4126                     retval = format;
4127                 }
4128
4129                 /* A format is always in ASCII */
4130                 is_utf8 = UTF8NESS_IMMATERIAL;
4131             }
4132
4133             break;
4134         }
4135
4136 #    endif
4137
4138       case CODESET:
4139
4140         /* The trivial case */
4141         if (isNAME_C_OR_POSIX(locale)) {
4142             retval = C_codeset;
4143             break;
4144         }
4145
4146 #    ifdef WIN32
4147
4148         /* This function retrieves the code page.  It is subject to change, but
4149          * is documented and has been stable for many releases */
4150         UINT ___lc_codepage_func(void);
4151
4152         retval = save_to_buffer(Perl_form(aTHX_ "%d", ___lc_codepage_func()),
4153                                 retbufp, retbuf_sizep);
4154         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "locale='%s' cp=%s\n",
4155                                                locale, retval));
4156         break;
4157
4158 #    else
4159
4160         /* The codeset is important, but khw did not figure out a way for it to
4161          * be retrieved on non-Windows boxes without nl_langinfo().  But even
4162          * if we can't get it directly, we can usually determine if it is a
4163          * UTF-8 locale or not.  If it is UTF-8, we (correctly) use that for
4164          * the code set. */
4165
4166 #      if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4167
4168         /* If libc mbtowc() evaluates the bytes that form the REPLACEMENT
4169          * CHARACTER as that Unicode code point, this has to be a UTF-8 locale.
4170          * */
4171
4172         wchar_t wc = 0;
4173         (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0);/* Reset shift state */
4174         int mbtowc_ret = Perl_mbtowc_(aTHX_ &wc,
4175                               STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
4176         if (mbtowc_ret >= 0 && wc == UNICODE_REPLACEMENT) {
4177             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4178                                    "mbtowc returned REPLACEMENT\n"));
4179             retval = "UTF-8";
4180             break;
4181         }
4182
4183         /* Here, it isn't a UTF-8 locale. */
4184
4185 #    else   /* mbtowc() is not available. */
4186
4187         /* Sling together several possibilities, depending on platform
4188          * capabilities and what we found.
4189          *
4190          * For non-English locales or non-dollar currency locales, we likely
4191          * will find out whether a locale is UTF-8 or not */
4192
4193         utf8ness_t is_utf8 = UTF8NESS_UNKNOWN;
4194         const char * scratch_buf = NULL;
4195
4196 #      if defined(USE_LOCALE_MONETARY) && defined(HAS_SOME_LOCALECONV)
4197
4198         /* Can't use this method unless localeconv() is available, as that's
4199          * the way we find out the currency symbol. */
4200
4201         /* First try looking at the currency symbol (via a recursive call) to
4202          * see if it disambiguates things.  Often that will be in the native
4203          * script, and if the symbol isn't legal UTF-8, we know that the locale
4204          * isn't either. */
4205         (void) my_langinfo_c(CRNCYSTR, LC_MONETARY, locale, &scratch_buf, NULL,
4206                              &is_utf8);
4207         Safefree(scratch_buf);
4208
4209 #      endif
4210 #      ifdef USE_LOCALE_TIME
4211
4212         /* If we have ruled out being UTF-8, no point in checking further. */
4213         if (is_utf8 != UTF8NESS_NO) {
4214
4215             /* But otherwise do check more.  This is done even if the currency
4216              * symbol looks to be UTF-8, just in case that's a false positive.
4217              *
4218              * Look at the LC_TIME entries, like the names of the months or
4219              * weekdays.  We quit at the first one that is illegal UTF-8 */
4220
4221             utf8ness_t this_is_utf8 = UTF8NESS_UNKNOWN;
4222             const int times[] = {
4223                 DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7,
4224                 MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8,
4225                                             MON_9, MON_10, MON_11, MON_12,
4226                 ALT_DIGITS, AM_STR, PM_STR,
4227                 ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6,
4228                                                              ABDAY_7,
4229                 ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6,
4230                 ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12
4231             };
4232
4233             /* The code in the recursive call can handle switching the locales,
4234              * but by doing it here, we avoid switching each iteration of the
4235              * loop */
4236             const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
4237
4238             for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(times); i++) {
4239                 scratch_buf = NULL;
4240                 (void) my_langinfo_c(times[i], LC_TIME, locale, &scratch_buf,
4241                                      NULL, &this_is_utf8);
4242                 Safefree(scratch_buf);
4243                 if (this_is_utf8 == UTF8NESS_NO) {
4244                     is_utf8 = UTF8NESS_NO;
4245                     break;
4246                 }
4247
4248                 if (this_is_utf8 == UTF8NESS_YES) {
4249                     is_utf8 = UTF8NESS_YES;
4250                 }
4251             }
4252
4253             /* Here we have gone through all the LC_TIME elements.  is_utf8 has
4254              * been set as follows:
4255              *      UTF8NESS_NO           If any aren't legal UTF-8
4256              *      UTF8NESS_IMMMATERIAL  If all are ASCII
4257              *      UTF8NESS_YES          If all are legal UTF-8 (including
4258              *                            ASCIIi), and at least one isn't
4259              *                            ASCII. */
4260
4261             restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
4262         }
4263
4264 #      endif    /* LC_TIME */
4265
4266         /* If nothing examined above rules out it being UTF-8, and at least one
4267          * thing fits as UTF-8 (and not plain ASCII), assume the codeset is
4268          * UTF-8. */
4269         if (is_utf8 == UTF8NESS_YES) {
4270             retval = "UTF-8";
4271             break;
4272         }
4273
4274         /* Here, nothing examined indicates that the codeset is UTF-8.  But
4275          * what is it?  The other locale categories are not likely to be of
4276          * further help:
4277          *
4278          * LC_NUMERIC   Only a few locales in the world have a non-ASCII radix
4279          *              or group separator.
4280          * LC_CTYPE     This code wouldn't be compiled if mbtowc() existed and
4281          *              was reliable.  This is unlikely in C99.  There are
4282          *              other functions that could be used instead, but are
4283          *              they going to exist, and be able to distinguish between
4284          *              UTF-8 and 8859-1?  Deal with this only if it becomes
4285          *              necessary.
4286          * LC_MESSAGES  The strings returned from strerror() would seem likely
4287          *              candidates, but experience has shown that many systems
4288          *              don't actually have translations installed for them.
4289          *              They are instead always in English, so everything in
4290          *              them is ASCII, which is of no help to us.  A Configure
4291          *              probe could possibly be written to see if this platform
4292          *              has non-ASCII error messages.  But again, wait until it
4293          *              turns out to be an actual problem. */
4294
4295 #    endif    /* ! mbtowc() */
4296
4297         /* Rejoin the mbtowc available/not-available cases.
4298          *
4299          * We got here only because we haven't been able to find the codeset.
4300          * The only other option khw could think of is to see if the codeset is
4301          * part of the locale name.  This is very less than ideal; often there
4302          * is no code set in the name; and at other times they even lie.
4303          *
4304          * Find any dot in the locale name */
4305         retval = (const char *) strchr(locale, '.');
4306         if (! retval) {
4307             retval = "";  /* Alas, no dot */
4308             break;
4309         }
4310
4311         /* Use everything past the dot */
4312         retval++;
4313
4314 #      if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4315
4316         /* When these functions, are available, they were tried earlier and
4317          * indicated that the locale did not act like a proper UTF-8 one.  So
4318          * if it claims to be UTF-8, it is a lie */
4319         if (is_codeset_name_UTF8(retval)) {
4320             retval = "";
4321             break;
4322         }
4323
4324 #      endif
4325
4326         /* Otherwise the code set name is considered to be everything past the
4327          * dot. */
4328         retval = save_to_buffer(retval, retbufp, retbuf_sizep);
4329
4330         break;
4331
4332 #    endif
4333
4334     } /* Giant switch() of nl_langinfo() items */
4335
4336     restore_toggled_locale_i(cat_index, orig_switched_locale);
4337
4338 #    ifdef USE_LOCALE_CTYPE
4339     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
4340 #    endif
4341
4342     if (utf8ness) {
4343         *utf8ness = is_utf8;
4344     }
4345
4346     return retval;
4347
4348 #  endif    /* All the implementations of my_langinfo() */
4349
4350 /*--------------------------------------------------------------------------*/
4351
4352 }   /* my_langinfo() */
4353
4354 #endif      /* USE_LOCALE */
4355
4356 char *
4357 Perl_my_strftime8(pTHX_ const char *fmt, int sec, int min, int hour, int mday,
4358                          int mon, int year, int wday, int yday, int isdst,
4359                          utf8ness_t * utf8ness)
4360 {   /* Documented in util.c */
4361     char * retval = my_strftime(fmt, sec, min, hour, mday, mon, year, wday,
4362                                 yday, isdst);
4363
4364     PERL_ARGS_ASSERT_MY_STRFTIME8;
4365
4366     if (utf8ness) {
4367
4368 #ifdef USE_LOCALE_TIME
4369         *utf8ness = get_locale_string_utf8ness_i(NULL, LC_TIME_INDEX_,
4370                                                retval, LOCALE_UTF8NESS_UNKNOWN);
4371 #else
4372         *utf8ness = UTF8NESS_IMMATERIAL;
4373 #endif
4374
4375     }
4376
4377     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "fmt=%s, retval=%s", fmt,
4378                  ((is_utf8_string((U8 *) retval, 0))
4379                   ? retval
4380                   :_byte_dump_string((U8 *) retval, strlen(retval), 0)));
4381              if (utf8ness) PerlIO_printf(Perl_debug_log, "; utf8ness=%d",
4382                                                          (int) *utf8ness);
4383              PerlIO_printf(Perl_debug_log, "\n");
4384             );
4385
4386     return retval;
4387 }
4388
4389 /*
4390  * Initialize locale awareness.
4391  */
4392 int
4393 Perl_init_i18nl10n(pTHX_ int printwarn)
4394 {
4395     /* printwarn is
4396      *
4397      *    0 if not to output warning when setup locale is bad
4398      *    1 if to output warning based on value of PERL_BADLANG
4399      *    >1 if to output regardless of PERL_BADLANG
4400      *
4401      * returns
4402      *    1 = set ok or not applicable,
4403      *    0 = fallback to a locale of lower priority
4404      *   -1 = fallback to all locales failed, not even to the C locale
4405      *
4406      * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
4407      * set, debugging information is output.
4408      *
4409      * This looks more complicated than it is, mainly due to the #ifdefs and
4410      * error handling.
4411      *
4412      * Besides some asserts, data structure initialization, and specific
4413      * platform complications, this routine is effectively represented by this
4414      * pseudo-code:
4415      *
4416      *      setlocale(LC_ALL, "");                                            x
4417      *      foreach (subcategory) {                                           x
4418      *          curlocales[f(subcategory)] = setlocale(subcategory, NULL);    x
4419      *      }                                                                 x
4420      *      if (platform_so_requires) {
4421      *          foreach (subcategory) {
4422      *            PL_curlocales[f(subcategory)] = curlocales[f(subcategory)]
4423      *          }
4424      *      }
4425      *      foreach (subcategory) {
4426      *          if (needs_special_handling[f(subcategory)] &this_subcat_handler
4427      *      }
4428      *
4429      * This sets all the categories to the values in the current environment,
4430      * saves them temporarily in curlocales[] until they can be handled and/or
4431      * on some platforms saved in a per-thread array PL_curlocales[].
4432      *
4433      * f(foo) is a mapping from the opaque system category numbers to small
4434      * non-negative integers used most everywhere in this file as indices into
4435      * arrays (such as curlocales[]) so the program doesn't have to otherwise
4436      * deal with the opaqueness.
4437      *
4438      * If the platform doesn't have LC_ALL, the lines marked 'x' above are
4439      * effectively replaced by:
4440      *      foreach (subcategory) {                                           y
4441      *          curlocales[f(subcategory)] = setlocale(subcategory, "");      y
4442      *      }                                                                 y
4443      *
4444      * The only differences being the lack of an LC_ALL call, and using ""
4445      * instead of NULL in the setlocale calls.
4446      *
4447      * But there are, of course, complications.
4448      *
4449      * it has to deal with if this is an embedded perl, whose locale doesn't
4450      * come from the environment, but has been set up by the caller.  This is
4451      * pretty simply handled: the "" in the setlocale calls is not a string
4452      * constant, but a variable which is set to NULL in the embedded case.
4453      *
4454      * But the major complication is handling failure and doing fallback.  All
4455      * the code marked 'x' or 'y' above is actually enclosed in an outer loop,
4456      * using the array trial_locales[].  On entry, trial_locales[] is
4457      * initialized to just one entry, containing the NULL or "" locale argument
4458      * shown above.  If, as is almost always the case, everything works, it
4459      * exits after just the one iteration, going on to the next step.
4460      *
4461      * But if there is a failure, the code tries its best to honor the
4462      * environment as much as possible.  It self-modifies trial_locales[] to
4463      * have more elements, one for each of the POSIX-specified settings from
4464      * the environment, such as LANG, ending in the ultimate fallback, the C
4465      * locale.  Thus if there is something bogus with a higher priority
4466      * environment variable, it will try with the next highest, until something
4467      * works.  If everything fails, it limps along with whatever state it got
4468      * to.
4469      *
4470      * A further complication is that Windows has an additional fallback, the
4471      * user-default ANSI code page obtained from the operating system.  This is
4472      * added as yet another loop iteration, just before the final "C"
4473      *
4474      * A slight complication is that in embedded Perls, the locale may already
4475      * be set-up, and we don't want to get it from the normal environment
4476      * variables.  This is handled by having a special environment variable
4477      * indicate we're in this situation.  We simply set setlocale's 2nd
4478      * parameter to be a NULL instead of "".  That indicates to setlocale that
4479      * it is not to change anything, but to return the current value,
4480      * effectively initializing perl's db to what the locale already is.
4481      *
4482      * We play the same trick with NULL if a LC_ALL succeeds.  We call
4483      * setlocale() on the individual categores with NULL to get their existing
4484      * values for our db, instead of trying to change them.
4485      * */
4486
4487     int ok = 1;
4488
4489 #ifndef USE_LOCALE
4490
4491     PERL_UNUSED_ARG(printwarn);
4492
4493 #else  /* USE_LOCALE */
4494 #  ifdef __GLIBC__
4495
4496     const char * const language = PerlEnv_getenv("LANGUAGE");
4497
4498 #  endif
4499
4500     /* NULL uses the existing already set up locale */
4501     const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
4502                                         ? NULL
4503                                         : "";
4504     typedef struct trial_locales_struct_s {
4505         const char* trial_locale;
4506         const char* fallback_desc;
4507         const char* fallback_name;
4508     } trial_locales_struct;
4509     /* 5 = 1 each for "", LC_ALL, LANG, (Win32) system default locale, C */
4510     trial_locales_struct trial_locales[5];
4511     unsigned int trial_locales_count;
4512     const char * const lc_all     = PerlEnv_getenv("LC_ALL");
4513     const char * const lang       = PerlEnv_getenv("LANG");
4514     bool setlocale_failure = FALSE;
4515     unsigned int i;
4516
4517     /* A later getenv() could zap this, so only use here */
4518     const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
4519
4520     const bool locwarn = (printwarn > 1
4521                           || (          printwarn
4522                               && (    ! bad_lang_use_once
4523                                   || (
4524                                          /* disallow with "" or "0" */
4525                                          *bad_lang_use_once
4526                                        && strNE("0", bad_lang_use_once)))));
4527
4528     /* current locale for given category; should have been copied so aren't
4529      * volatile */
4530     const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
4531
4532 #  ifndef DEBUGGING
4533 #    define DEBUG_LOCALE_INIT(a,b,c)
4534 #  else
4535
4536     DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
4537
4538 #    define DEBUG_LOCALE_INIT(cat_index, locale, result)                    \
4539         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",                       \
4540                     setlocale_debug_string_i(cat_index, locale, result)));
4541
4542 /* Make sure the parallel arrays are properly set up */
4543 #    ifdef USE_LOCALE_NUMERIC
4544     assert(categories[LC_NUMERIC_INDEX_] == LC_NUMERIC);
4545     assert(strEQ(category_names[LC_NUMERIC_INDEX_], "LC_NUMERIC"));
4546 #      ifdef USE_POSIX_2008_LOCALE
4547     assert(category_masks[LC_NUMERIC_INDEX_] == LC_NUMERIC_MASK);
4548 #      endif
4549 #    endif
4550 #    ifdef USE_LOCALE_CTYPE
4551     assert(categories[LC_CTYPE_INDEX_] == LC_CTYPE);
4552     assert(strEQ(category_names[LC_CTYPE_INDEX_], "LC_CTYPE"));
4553 #      ifdef USE_POSIX_2008_LOCALE
4554     assert(category_masks[LC_CTYPE_INDEX_] == LC_CTYPE_MASK);
4555 #      endif
4556 #    endif
4557 #    ifdef USE_LOCALE_COLLATE
4558     assert(categories[LC_COLLATE_INDEX_] == LC_COLLATE);
4559     assert(strEQ(category_names[LC_COLLATE_INDEX_], "LC_COLLATE"));
4560 #      ifdef USE_POSIX_2008_LOCALE
4561     assert(category_masks[LC_COLLATE_INDEX_] == LC_COLLATE_MASK);
4562 #      endif
4563 #    endif
4564 #    ifdef USE_LOCALE_TIME
4565     assert(categories[LC_TIME_INDEX_] == LC_TIME);
4566     assert(strEQ(category_names[LC_TIME_INDEX_], "LC_TIME"));
4567 #      ifdef USE_POSIX_2008_LOCALE
4568     assert(category_masks[LC_TIME_INDEX_] == LC_TIME_MASK);
4569 #      endif
4570 #    endif
4571 #    ifdef USE_LOCALE_MESSAGES
4572     assert(categories[LC_MESSAGES_INDEX_] == LC_MESSAGES);
4573     assert(strEQ(category_names[LC_MESSAGES_INDEX_], "LC_MESSAGES"));
4574 #      ifdef USE_POSIX_2008_LOCALE
4575     assert(category_masks[LC_MESSAGES_INDEX_] == LC_MESSAGES_MASK);
4576 #      endif
4577 #    endif
4578 #    ifdef USE_LOCALE_MONETARY
4579     assert(categories[LC_MONETARY_INDEX_] == LC_MONETARY);
4580     assert(strEQ(category_names[LC_MONETARY_INDEX_], "LC_MONETARY"));
4581 #      ifdef USE_POSIX_2008_LOCALE
4582     assert(category_masks[LC_MONETARY_INDEX_] == LC_MONETARY_MASK);
4583 #      endif
4584 #    endif
4585 #    ifdef USE_LOCALE_ADDRESS
4586     assert(categories[LC_ADDRESS_INDEX_] == LC_ADDRESS);
4587     assert(strEQ(category_names[LC_ADDRESS_INDEX_], "LC_ADDRESS"));
4588 #      ifdef USE_POSIX_2008_LOCALE
4589     assert(category_masks[LC_ADDRESS_INDEX_] == LC_ADDRESS_MASK);
4590 #      endif
4591 #    endif
4592 #    ifdef USE_LOCALE_IDENTIFICATION
4593     assert(categories[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION);
4594     assert(strEQ(category_names[LC_IDENTIFICATION_INDEX_], "LC_IDENTIFICATION"));
4595 #      ifdef USE_POSIX_2008_LOCALE
4596     assert(category_masks[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION_MASK);
4597 #      endif
4598 #    endif
4599 #    ifdef USE_LOCALE_MEASUREMENT
4600     assert(categories[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT);
4601     assert(strEQ(category_names[LC_MEASUREMENT_INDEX_], "LC_MEASUREMENT"));
4602 #      ifdef USE_POSIX_2008_LOCALE
4603     assert(category_masks[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT_MASK);
4604 #      endif
4605 #    endif
4606 #    ifdef USE_LOCALE_PAPER
4607     assert(categories[LC_PAPER_INDEX_] == LC_PAPER);
4608     assert(strEQ(category_names[LC_PAPER_INDEX_], "LC_PAPER"));
4609 #      ifdef USE_POSIX_2008_LOCALE
4610     assert(category_masks[LC_PAPER_INDEX_] == LC_PAPER_MASK);
4611 #      endif
4612 #    endif
4613 #    ifdef USE_LOCALE_TELEPHONE
4614     assert(categories[LC_TELEPHONE_INDEX_] == LC_TELEPHONE);
4615     assert(strEQ(category_names[LC_TELEPHONE_INDEX_], "LC_TELEPHONE"));
4616 #      ifdef USE_POSIX_2008_LOCALE
4617     assert(category_masks[LC_TELEPHONE_INDEX_] == LC_TELEPHONE_MASK);
4618 #      endif
4619 #    endif
4620 #    ifdef USE_LOCALE_SYNTAX
4621     assert(categories[LC_SYNTAX_INDEX_] == LC_SYNTAX);
4622     assert(strEQ(category_names[LC_SYNTAX_INDEX_], "LC_SYNTAX"));
4623 #      ifdef USE_POSIX_2008_LOCALE
4624     assert(category_masks[LC_SYNTAX_INDEX_] == LC_SYNTAX_MASK);
4625 #      endif
4626 #    endif
4627 #    ifdef USE_LOCALE_TOD
4628     assert(categories[LC_TOD_INDEX_] == LC_TOD);
4629     assert(strEQ(category_names[LC_TOD_INDEX_], "LC_TOD"));
4630 #      ifdef USE_POSIX_2008_LOCALE
4631     assert(category_masks[LC_TOD_INDEX_] == LC_TOD_MASK);
4632 #      endif
4633 #    endif
4634 #    ifdef LC_ALL
4635     assert(categories[LC_ALL_INDEX_] == LC_ALL);
4636     assert(strEQ(category_names[LC_ALL_INDEX_], "LC_ALL"));
4637     STATIC_ASSERT_STMT(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX_);
4638 #      ifdef USE_POSIX_2008_LOCALE
4639     assert(category_masks[LC_ALL_INDEX_] == LC_ALL_MASK);
4640 #      endif
4641 #    endif
4642 #  endif    /* DEBUGGING */
4643
4644     /* Initialize the per-thread mbrFOO() state variables.  See POSIX.xs for
4645      * why these particular incantations are used. */
4646 #  ifdef HAS_MBRLEN
4647     memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
4648 #  endif
4649 #  ifdef HAS_MBRTOWC
4650     memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
4651 #  endif
4652 #  ifdef HAS_WCTOMBR
4653     wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
4654 #  endif
4655
4656     /* Initialize the cache of the program's UTF-8ness for the always known
4657      * locales C and POSIX */
4658     my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness,
4659                sizeof(PL_locale_utf8ness));
4660
4661     /* See https://github.com/Perl/perl5/issues/17824 */
4662     Zero(curlocales, NOMINAL_LC_ALL_INDEX, char *);
4663
4664 #  ifdef USE_THREAD_SAFE_LOCALE
4665 #    ifdef WIN32
4666
4667     _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
4668
4669 #    endif
4670 #  endif
4671 #  ifdef USE_POSIX_2008_LOCALE
4672
4673     PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0);
4674     if (! PL_C_locale_obj) {
4675         locale_panic_(Perl_form(aTHX_
4676                                 "Cannot create POSIX 2008 C locale object"));
4677     }
4678
4679     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "created C object %p\n",
4680                            PL_C_locale_obj));
4681 #  endif
4682 #  ifdef USE_LOCALE_NUMERIC
4683
4684     PL_numeric_radix_sv    = newSVpvn(C_decimal_point, strlen(C_decimal_point));
4685     PL_underlying_radix_sv = newSVpvn(C_decimal_point, strlen(C_decimal_point));
4686     Newx(PL_numeric_name, 2, char);
4687     Copy("C", PL_numeric_name, 2, char);
4688
4689 #  endif
4690 #  ifdef USE_LOCALE_COLLATE
4691
4692     Newx(PL_collation_name, 2, char);
4693     Copy("C", PL_collation_name, 2, char);
4694
4695 #  endif
4696 #  ifdef USE_PL_CURLOCALES
4697
4698     /* Initialize our records.  If we have POSIX 2008, we have LC_ALL */
4699     void_setlocale_c(LC_ALL, porcelain_setlocale(LC_ALL, NULL));
4700
4701 #  endif
4702
4703     /* We try each locale in the list until we get one that works, or exhaust
4704      * the list.  Normally the loop is executed just once.  But if setting the
4705      * locale fails, inside the loop we add fallback trials to the array and so
4706      * will execute the loop multiple times */
4707     trial_locales[0] = (trial_locales_struct) {
4708         .trial_locale = setlocale_init,
4709         .fallback_desc = NULL,
4710         .fallback_name = NULL,
4711     };
4712     trial_locales_count = 1;
4713
4714     for (i= 0; i < trial_locales_count; i++) {
4715         const char * trial_locale = trial_locales[i].trial_locale;
4716         setlocale_failure = FALSE;
4717
4718 #  ifdef LC_ALL
4719
4720         /* setlocale() return vals; not copied so must be looked at
4721          * immediately. */
4722         const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
4723         sl_result[LC_ALL_INDEX_] = stdized_setlocale(LC_ALL, trial_locale);
4724         DEBUG_LOCALE_INIT(LC_ALL_INDEX_, trial_locale, sl_result[LC_ALL_INDEX_]);
4725         if (! sl_result[LC_ALL_INDEX_]) {
4726             setlocale_failure = TRUE;
4727         }
4728         else {
4729             /* Since LC_ALL succeeded, it should have changed all the other
4730              * categories it can to its value; so we massage things so that the
4731              * setlocales below just return their category's current values.
4732              * This adequately handles the case in NetBSD where LC_COLLATE may
4733              * not be defined for a locale, and setting it individually will
4734              * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
4735              * the POSIX locale. */
4736             trial_locale = NULL;
4737         }
4738
4739 #  endif /* LC_ALL */
4740
4741         if (! setlocale_failure) {
4742             unsigned int j;
4743             for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
4744                 Safefree(curlocales[j]);
4745                 curlocales[j] = stdized_setlocale(categories[j], trial_locale);
4746                 if (! curlocales[j]) {
4747                     setlocale_failure = TRUE;
4748                 }
4749                 curlocales[j] = savepv(curlocales[j]);
4750                 DEBUG_LOCALE_INIT(j, trial_locale, curlocales[j]);
4751             }
4752
4753             if (LIKELY(! setlocale_failure)) {  /* All succeeded */
4754                 break;  /* Exit trial_locales loop */
4755             }
4756         }
4757
4758         /* Here, something failed; will need to try a fallback. */
4759         ok = 0;
4760
4761         if (i == 0) {
4762             unsigned int j;
4763
4764             if (locwarn) { /* Output failure info only on the first one */
4765
4766 #  ifdef LC_ALL
4767
4768                 PerlIO_printf(Perl_error_log,
4769                 "perl: warning: Setting locale failed.\n");
4770
4771 #  else /* !LC_ALL */
4772
4773                 PerlIO_printf(Perl_error_log,
4774                 "perl: warning: Setting locale failed for the categories:\n");
4775
4776                 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
4777                     if (! curlocales[j]) {
4778                         PerlIO_printf(Perl_error_log, "\t%s\n", category_names[j]);
4779                     }
4780                 }
4781
4782 #  endif /* LC_ALL */
4783
4784                 PerlIO_printf(Perl_error_log,
4785                     "perl: warning: Please check that your locale settings:\n");
4786
4787 #  ifdef __GLIBC__
4788
4789                 PerlIO_printf(Perl_error_log,
4790                             "\tLANGUAGE = %c%s%c,\n",
4791                             language ? '"' : '(',
4792                             language ? language : "unset",
4793                             language ? '"' : ')');
4794 #  endif
4795
4796                 PerlIO_printf(Perl_error_log,
4797                             "\tLC_ALL = %c%s%c,\n",
4798                             lc_all ? '"' : '(',
4799                             lc_all ? lc_all : "unset",
4800                             lc_all ? '"' : ')');
4801
4802 #  if defined(USE_ENVIRON_ARRAY)
4803
4804                 {
4805                     char **e;
4806
4807                     /* Look through the environment for any variables of the
4808                      * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
4809                      * already handled above.  These are assumed to be locale
4810                      * settings.  Output them and their values. */
4811                     for (e = environ; *e; e++) {
4812                         const STRLEN prefix_len = sizeof("LC_") - 1;
4813                         STRLEN uppers_len;
4814
4815                         if (     strBEGINs(*e, "LC_")
4816                             && ! strBEGINs(*e, "LC_ALL=")
4817                             && (uppers_len = strspn(*e + prefix_len,
4818                                              "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
4819                             && ((*e)[prefix_len + uppers_len] == '='))
4820                         {
4821                             PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
4822                                 (int) (prefix_len + uppers_len), *e,
4823                                 *e + prefix_len + uppers_len + 1);
4824                         }
4825                     }
4826                 }
4827
4828 #  else
4829
4830                 PerlIO_printf(Perl_error_log,
4831                             "\t(possibly more locale environment variables)\n");
4832
4833 #  endif
4834
4835                 PerlIO_printf(Perl_error_log,
4836                             "\tLANG = %c%s%c\n",
4837                             lang ? '"' : '(',
4838                             lang ? lang : "unset",
4839                             lang ? '"' : ')');
4840
4841                 PerlIO_printf(Perl_error_log,
4842                             "    are supported and installed on your system.\n");
4843             }
4844
4845             /* Calculate what fallback locales to try.  We have avoided this
4846              * until we have to, because failure is quite unlikely.  This will
4847              * usually change the upper bound of the loop we are in.
4848              *
4849              * Since the system's default way of setting the locale has not
4850              * found one that works, We use Perl's defined ordering: LC_ALL,
4851              * LANG, and the C locale.  We don't try the same locale twice, so
4852              * don't add to the list if already there.  (On POSIX systems, the
4853              * LC_ALL element will likely be a repeat of the 0th element "",
4854              * but there's no harm done by doing it explicitly.
4855              *
4856              * Note that this tries the LC_ALL environment variable even on
4857              * systems which have no LC_ALL locale setting.  This may or may
4858              * not have been originally intentional, but there's no real need
4859              * to change the behavior. */
4860             if (lc_all) {
4861                 for (j = 0; j < trial_locales_count; j++) {
4862                     if (strEQ(lc_all, trial_locales[j].trial_locale)) {
4863                         goto done_lc_all;
4864                     }
4865                 }
4866                 trial_locales[trial_locales_count++] = (trial_locales_struct) {
4867                     .trial_locale = lc_all,
4868                     .fallback_desc = (strEQ(lc_all, "C")
4869                                       ? "the standard locale"
4870                                       : "a fallback locale"),
4871                     .fallback_name = lc_all,
4872                 };
4873             }
4874           done_lc_all:
4875
4876             if (lang) {
4877                 for (j = 0; j < trial_locales_count; j++) {
4878                     if (strEQ(lang, trial_locales[j].trial_locale)) {
4879                         goto done_lang;
4880                     }
4881                 }
4882                 trial_locales[trial_locales_count++] = (trial_locales_struct) {
4883                     .trial_locale = lang,
4884                     .fallback_desc = (strEQ(lang, "C")
4885                                       ? "the standard locale"
4886                                       : "a fallback locale"),
4887                     .fallback_name = lang,
4888                 };
4889             }
4890           done_lang:
4891
4892 #  if defined(WIN32) && defined(LC_ALL)
4893
4894             /* For Windows, we also try the system default locale before "C".
4895              * (If there exists a Windows without LC_ALL we skip this because
4896              * it gets too complicated.  For those, the "C" is the next
4897              * fallback possibility). */
4898             {
4899                 /* Note that this may change the locale, but we are going to do
4900                  * that anyway.
4901                  *
4902                  * Our normal Windows setlocale() implementation ignores the
4903                  * system default locale to make things work like POSIX.  This
4904                  * is the only place where we want to consider it, so have to
4905                  * use wrap_wsetlocale(). */
4906                 const char *system_default_locale =
4907                                     stdize_locale(LC_ALL,
4908                                                   S_wrap_wsetlocale(aTHX_ LC_ALL, ""),
4909                                                   &PL_stdize_locale_buf,
4910                                                   &PL_stdize_locale_bufsize,
4911                                                   __LINE__);
4912                 DEBUG_LOCALE_INIT(LC_ALL_INDEX_, "", system_default_locale);
4913
4914                 /* Skip if invalid or if it's already on the list of locales to
4915                  * try */
4916                 if (! system_default_locale) {
4917                     goto done_system_default;
4918                 }
4919                 for (j = 0; j < trial_locales_count; j++) {
4920                     if (strEQ(system_default_locale, trial_locales[j].trial_locale)) {
4921                         goto done_system_default;
4922                     }
4923                 }
4924
4925                 trial_locales[trial_locales_count++] = (trial_locales_struct) {
4926                     .trial_locale = system_default_locale,
4927                     .fallback_desc = (strEQ(system_default_locale, "C")
4928                                       ? "the standard locale"
4929                                       : "the system default locale"),
4930                     .fallback_name = system_default_locale,
4931                 };
4932             }
4933           done_system_default:
4934
4935 #  endif
4936
4937             for (j = 0; j < trial_locales_count; j++) {
4938                 if (strEQ("C", trial_locales[j].trial_locale)) {
4939                     goto done_C;
4940                 }
4941             }
4942             trial_locales[trial_locales_count++] = (trial_locales_struct) {
4943                 .trial_locale = "C",
4944                 .fallback_desc = "the standard locale",
4945                 .fallback_name = "C",
4946             };
4947
4948           done_C: ;
4949         }   /* end of first time through the loop */
4950
4951 #  ifdef WIN32
4952
4953       next_iteration: ;
4954
4955 #  endif
4956
4957     }   /* end of looping through the trial locales */
4958
4959     if (ok < 1) {   /* If we tried to fallback */
4960         const char* msg;
4961         if (! setlocale_failure) {  /* fallback succeeded */
4962            msg = "Falling back to";
4963         }
4964         else {  /* fallback failed */
4965             unsigned int j;
4966
4967             /* We dropped off the end of the loop, so have to decrement i to
4968              * get back to the value the last time through */
4969             i--;
4970
4971             ok = -1;
4972             msg = "Failed to fall back to";
4973
4974             /* To continue, we should use whatever values we've got */
4975
4976             for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
4977                 Safefree(curlocales[j]);
4978                 curlocales[j] = savepv(stdized_setlocale(categories[j], NULL));
4979                 DEBUG_LOCALE_INIT(j, NULL, curlocales[j]);
4980             }
4981         }
4982
4983         if (locwarn) {
4984             const char * description = trial_locales[i].fallback_desc;
4985             const char * name = trial_locales[i].fallback_name;
4986
4987             if (name && strNE(name, "")) {
4988                 PerlIO_printf(Perl_error_log,
4989                     "perl: warning: %s %s (\"%s\").\n", msg, description, name);
4990             }
4991             else {
4992                 PerlIO_printf(Perl_error_log,
4993                                    "perl: warning: %s %s.\n", msg, description);
4994             }
4995         }
4996     } /* End of tried to fallback */
4997
4998 #  ifdef USE_POSIX_2008_LOCALE
4999
5000     /* The stdized setlocales haven't affected the P2008 locales.  Initialize
5001      * them now, calculating LC_ALL only on the final go round, when all have
5002      * been set. */
5003     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
5004         (void) emulate_setlocale_i(i, curlocales[i],
5005                                    RECALCULATE_LC_ALL_ON_FINAL_INTERATION,
5006                                    __LINE__);
5007     }
5008
5009 #  endif
5010
5011     /* Done with finding the locales; update the auxiliary records */
5012     new_LC_ALL(NULL);
5013
5014     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
5015
5016 #  if defined(USE_LOCALE_THREADS) && ! defined(USE_THREAD_SAFE_LOCALE)
5017
5018         /* This caches whether each category's locale is UTF-8 or not.  This
5019          * may involve changing the locale.  It is ok to do this at
5020          * initialization time before any threads have started, but not later
5021          * unless thread-safe operations are used.
5022          * Caching means that if the program heeds our dictate not to change
5023          * locales in threaded applications, this data will remain valid, and
5024          * it may get queried without having to change locales.  If the
5025          * environment is such that all categories have the same locale, this
5026          * isn't needed, as the code will not change the locale; but this
5027          * handles the uncommon case where the environment has disparate
5028          * locales for the categories */
5029         (void) _is_cur_LC_category_utf8(categories[i]);
5030
5031 #  endif
5032
5033         Safefree(curlocales[i]);
5034     }
5035
5036 #  if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
5037
5038     /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
5039      * locale is UTF-8.  The call to new_ctype() just above has already
5040      * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
5041      * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
5042      * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
5043      * STDIN, STDOUT, STDERR, _and_ the default open discipline.  */
5044     PL_utf8locale = PL_in_utf8_CTYPE_locale;
5045
5046     /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
5047        This is an alternative to using the -C command line switch
5048        (the -C if present will override this). */
5049     {
5050          const char *p = PerlEnv_getenv("PERL_UNICODE");
5051          PL_unicode = p ? parse_unicode_opts(&p) : 0;
5052          if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
5053              PL_utf8cache = -1;
5054     }
5055
5056 #  endif
5057 #endif /* USE_LOCALE */
5058
5059     /* So won't continue to output stuff */
5060     DEBUG_INITIALIZATION_set(FALSE);
5061
5062     return ok;
5063 }
5064
5065 #ifdef USE_LOCALE_COLLATE
5066
5067 STATIC void
5068 S_compute_collxfrm_coefficients(pTHX)
5069 {
5070
5071         PL_in_utf8_COLLATE_locale = (PL_collation_standard)
5072                                     ? 0
5073                                     : is_locale_utf8(PL_collation_name);
5074         PL_strxfrm_NUL_replacement = '\0';
5075         PL_strxfrm_max_cp = 0;
5076
5077         /* A locale collation definition includes primary, secondary, tertiary,
5078          * etc. weights for each character.  To sort, the primary weights are
5079          * used, and only if they compare equal, then the secondary weights are
5080          * used, and only if they compare equal, then the tertiary, etc.
5081          *
5082          * strxfrm() works by taking the input string, say ABC, and creating an
5083          * output transformed string consisting of first the primary weights,
5084          * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
5085          * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ ....  Some characters
5086          * may not have weights at every level.  In our example, let's say B
5087          * doesn't have a tertiary weight, and A doesn't have a secondary
5088          * weight.  The constructed string is then going to be
5089          *  A¹B¹C¹ B²C² A³C³ ....
5090          * This has the desired effect that strcmp() will look at the secondary
5091          * or tertiary weights only if the strings compare equal at all higher
5092          * priority weights.  The spaces shown here, like in
5093          *  "A¹B¹C¹ A²B²C² "
5094          * are not just for readability.  In the general case, these must
5095          * actually be bytes, which we will call here 'separator weights'; and
5096          * they must be smaller than any other weight value, but since these
5097          * are C strings, only the terminating one can be a NUL (some
5098          * implementations may include a non-NUL separator weight just before
5099          * the NUL).  Implementations tend to reserve 01 for the separator
5100          * weights.  They are needed so that a shorter string's secondary
5101          * weights won't be misconstrued as primary weights of a longer string,
5102          * etc.  By making them smaller than any other weight, the shorter
5103          * string will sort first.  (Actually, if all secondary weights are
5104          * smaller than all primary ones, there is no need for a separator
5105          * weight between those two levels, etc.)
5106          *
5107          * The length of the transformed string is roughly a linear function of
5108          * the input string.  It's not exactly linear because some characters
5109          * don't have weights at all levels.  When we call strxfrm() we have to
5110          * allocate some memory to hold the transformed string.  The
5111          * calculations below try to find coefficients 'm' and 'b' for this
5112          * locale so that m*x + b equals how much space we need, given the size
5113          * of the input string in 'x'.  If we calculate too small, we increase
5114          * the size as needed, and call strxfrm() again, but it is better to
5115          * get it right the first time to avoid wasted expensive string
5116          * transformations. */
5117
5118         {
5119             /* We use the string below to find how long the tranformation of it
5120              * is.  Almost all locales are supersets of ASCII, or at least the
5121              * ASCII letters.  We use all of them, half upper half lower,
5122              * because if we used fewer, we might hit just the ones that are
5123              * outliers in a particular locale.  Most of the strings being
5124              * collated will contain a preponderance of letters, and even if
5125              * they are above-ASCII, they are likely to have the same number of
5126              * weight levels as the ASCII ones.  It turns out that digits tend
5127              * to have fewer levels, and some punctuation has more, but those
5128              * are relatively sparse in text, and khw believes this gives a
5129              * reasonable result, but it could be changed if experience so
5130              * dictates. */
5131             const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
5132             char * x_longer;        /* Transformed 'longer' */
5133             Size_t x_len_longer;    /* Length of 'x_longer' */
5134
5135             char * x_shorter;   /* We also transform a substring of 'longer' */
5136             Size_t x_len_shorter;
5137
5138             /* mem_collxfrm_() is used get the transformation (though here we
5139              * are interested only in its length).  It is used because it has
5140              * the intelligence to handle all cases, but to work, it needs some
5141              * values of 'm' and 'b' to get it started.  For the purposes of
5142              * this calculation we use a very conservative estimate of 'm' and
5143              * 'b'.  This assumes a weight can be multiple bytes, enough to
5144              * hold any UV on the platform, and there are 5 levels, 4 weight
5145              * bytes, and a trailing NUL.  */
5146             PL_collxfrm_base = 5;
5147             PL_collxfrm_mult = 5 * sizeof(UV);
5148
5149             /* Find out how long the transformation really is */
5150             x_longer = mem_collxfrm_(longer,
5151                                      sizeof(longer) - 1,
5152                                      &x_len_longer,
5153
5154                                      /* We avoid converting to UTF-8 in the
5155                                       * called function by telling it the
5156                                       * string is in UTF-8 if the locale is a
5157                                       * UTF-8 one.  Since the string passed
5158                                       * here is invariant under UTF-8, we can
5159                                       * claim it's UTF-8 even though it isn't.
5160                                       * */
5161                                      PL_in_utf8_COLLATE_locale);
5162             Safefree(x_longer);
5163
5164             /* Find out how long the transformation of a substring of 'longer'
5165              * is.  Together the lengths of these transformations are
5166              * sufficient to calculate 'm' and 'b'.  The substring is all of
5167              * 'longer' except the first character.  This minimizes the chances
5168              * of being swayed by outliers */
5169             x_shorter = mem_collxfrm_(longer + 1,
5170                                       sizeof(longer) - 2,
5171                                       &x_len_shorter,
5172                                       PL_in_utf8_COLLATE_locale);
5173             Safefree(x_shorter);
5174
5175             /* If the results are nonsensical for this simple test, the whole
5176              * locale definition is suspect.  Mark it so that locale collation
5177              * is not active at all for it.  XXX Should we warn? */
5178             if (   x_len_shorter == 0
5179                 || x_len_longer == 0
5180                 || x_len_shorter >= x_len_longer)
5181             {
5182                 PL_collxfrm_mult = 0;
5183                 PL_collxfrm_base = 1;
5184                 DEBUG_L(PerlIO_printf(Perl_debug_log,
5185                         "Disabling locale collation for LC_COLLATE='%s';"
5186                         " length for shorter sample=%zu; longer=%zu\n",
5187                         PL_collation_name, x_len_shorter, x_len_longer));
5188             }
5189             else {
5190                 SSize_t base;       /* Temporary */
5191
5192                 /* We have both:    m * strlen(longer)  + b = x_len_longer
5193                  *                  m * strlen(shorter) + b = x_len_shorter;
5194                  * subtracting yields:
5195                  *          m * (strlen(longer) - strlen(shorter))
5196                  *                             = x_len_longer - x_len_shorter
5197                  * But we have set things up so that 'shorter' is 1 byte smaller
5198                  * than 'longer'.  Hence:
5199                  *          m = x_len_longer - x_len_shorter
5200                  *
5201                  * But if something went wrong, make sure the multiplier is at
5202                  * least 1.
5203                  */
5204                 if (x_len_longer > x_len_shorter) {
5205                     PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
5206                 }
5207                 else {
5208                     PL_collxfrm_mult = 1;
5209                 }
5210
5211                 /*     mx + b = len
5212                  * so:      b = len - mx
5213                  * but in case something has gone wrong, make sure it is
5214                  * non-negative */
5215                 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
5216                 if (base < 0) {
5217                     base = 0;
5218                 }
5219
5220                 /* Add 1 for the trailing NUL */
5221                 PL_collxfrm_base = base + 1;
5222             }
5223
5224             DEBUG_L(PerlIO_printf(Perl_debug_log,
5225                                   "?UTF-8 locale=%d; x_len_shorter=%zu, "
5226                     "x_len_longer=%zu,"
5227                     " collate multipler=%zu, collate base=%zu\n",
5228                     PL_in_utf8_COLLATE_locale,
5229                     x_len_shorter, x_len_longer,
5230                                   PL_collxfrm_mult, PL_collxfrm_base));
5231         }
5232 }
5233
5234 char *
5235 Perl_mem_collxfrm_(pTHX_ const char *input_string,
5236                          STRLEN len,    /* Length of 'input_string' */
5237                          STRLEN *xlen,  /* Set to length of returned string
5238                                            (not including the collation index
5239                                            prefix) */
5240                          bool utf8      /* Is the input in UTF-8? */
5241                    )
5242 {
5243     /* mem_collxfrm_() is like strxfrm() but with two important differences.
5244      * First, it handles embedded NULs. Second, it allocates a bit more memory
5245      * than needed for the transformed data itself.  The real transformed data
5246      * begins at offset COLLXFRM_HDR_LEN.  *xlen is set to the length of that,
5247      * and doesn't include the collation index size.
5248      *
5249      * It is the caller's responsibility to eventually free the memory returned
5250      * by this function.
5251      *
5252      * Please see sv_collxfrm() to see how this is used. */
5253
5254 #  define COLLXFRM_HDR_LEN    sizeof(PL_collation_ix)
5255
5256     char * s = (char *) input_string;
5257     STRLEN s_strlen = strlen(input_string);
5258     char *xbuf = NULL;
5259     STRLEN xAlloc;          /* xalloc is a reserved word in VC */
5260     STRLEN length_in_chars;
5261     bool first_time = TRUE; /* Cleared after first loop iteration */
5262     const char * orig_CTYPE_locale = NULL;
5263
5264 #  if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
5265     locale_t constructed_locale = (locale_t) 0;
5266 #  endif
5267
5268     PERL_ARGS_ASSERT_MEM_COLLXFRM_;
5269
5270     /* Must be NUL-terminated */
5271     assert(*(input_string + len) == '\0');
5272
5273     if (PL_collxfrm_mult == 0) {     /* unknown or bad */
5274         if (PL_collxfrm_base != 0) { /* bad collation => skip */
5275             DEBUG_L(PerlIO_printf(Perl_debug_log,
5276                             "mem_collxfrm_: locale's collation is defective\n"));
5277             goto bad;
5278         }
5279
5280         S_compute_collxfrm_coefficients(aTHX);
5281     }
5282
5283     /* Replace any embedded NULs with the control that sorts before any others.
5284      * This will give as good as possible results on strings that don't
5285      * otherwise contain that character, but otherwise there may be
5286      * less-than-perfect results with that character and NUL.  This is
5287      * unavoidable unless we replace strxfrm with our own implementation. */
5288     if (UNLIKELY(s_strlen < len)) {   /* Only execute if there is an embedded
5289                                          NUL */
5290         char * e = s + len;
5291         char * sans_nuls;
5292         STRLEN sans_nuls_len;
5293         int try_non_controls;
5294         char this_replacement_char[] = "?\0";   /* Room for a two-byte string,
5295                                                    making sure 2nd byte is NUL.
5296                                                  */
5297         STRLEN this_replacement_len;
5298
5299         /* If we don't know what non-NUL control character sorts lowest for
5300          * this locale, find it */
5301         if (PL_strxfrm_NUL_replacement == '\0') {
5302             int j;
5303             char * cur_min_x = NULL;    /* The min_char's xfrm, (except it also
5304                                            includes the collation index
5305                                            prefixed. */
5306
5307             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
5308
5309             /* Unlikely, but it may be that no control will work to replace
5310              * NUL, in which case we instead look for any character.  Controls
5311              * are preferred because collation order is, in general, context
5312              * sensitive, with adjoining characters affecting the order, and
5313              * controls are less likely to have such interactions, allowing the
5314              * NUL-replacement to stand on its own.  (Another way to look at it
5315              * is to imagine what would happen if the NUL were replaced by a
5316              * combining character; it wouldn't work out all that well.) */
5317             for (try_non_controls = 0;
5318                  try_non_controls < 2;
5319                  try_non_controls++)
5320             {
5321
5322 #  ifdef USE_LOCALE_CTYPE
5323
5324                 /* In this case we use isCNTRL_LC() below, which relies on
5325                  * LC_CTYPE, so that must be switched to correspond with the
5326                  * LC_COLLATE locale */
5327                 if (! try_non_controls && ! PL_in_utf8_COLLATE_locale) {
5328                     orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
5329                 }
5330 #  endif
5331                 /* Look through all legal code points (NUL isn't) */
5332                 for (j = 1; j < 256; j++) {
5333                     char * x;       /* j's xfrm plus collation index */
5334                     STRLEN x_len;   /* length of 'x' */
5335                     STRLEN trial_len = 1;
5336                     char cur_source[] = { '\0', '\0' };
5337
5338                     /* Skip non-controls the first time through the loop.  The
5339                      * controls in a UTF-8 locale are the L1 ones */
5340                     if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
5341                                                ? ! isCNTRL_L1(j)
5342                                                : ! isCNTRL_LC(j))
5343                     {
5344                         continue;
5345                     }
5346
5347                     /* Create a 1-char string of the current code point */
5348                     cur_source[0] = (char) j;
5349
5350                     /* Then transform it */
5351                     x = mem_collxfrm_(cur_source, trial_len, &x_len,
5352                                       0 /* The string is not in UTF-8 */);
5353
5354                     /* Ignore any character that didn't successfully transform.
5355                      * */
5356                     if (! x) {
5357                         continue;
5358                     }
5359
5360                     /* If this character's transformation is lower than
5361                      * the current lowest, this one becomes the lowest */
5362                     if (   cur_min_x == NULL
5363                         || strLT(x         + COLLXFRM_HDR_LEN,
5364                                  cur_min_x + COLLXFRM_HDR_LEN))
5365                     {
5366                         PL_strxfrm_NUL_replacement = j;
5367                         Safefree(cur_min_x);
5368                         cur_min_x = x;
5369                     }
5370                     else {
5371                         Safefree(x);
5372                     }
5373                 } /* end of loop through all 255 characters */
5374
5375 #  ifdef USE_LOCALE_CTYPE
5376                 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
5377 #  endif
5378
5379                 /* Stop looking if found */
5380                 if (cur_min_x) {
5381                     break;
5382                 }
5383
5384                 /* Unlikely, but possible, if there aren't any controls that
5385                  * work in the locale, repeat the loop, looking for any
5386                  * character that works */
5387                 DEBUG_L(PerlIO_printf(Perl_debug_log,
5388                 "mem_collxfrm_: No control worked.  Trying non-controls\n"));
5389             } /* End of loop to try first the controls, then any char */
5390
5391             if (! cur_min_x) {
5392                 DEBUG_L(PerlIO_printf(Perl_debug_log,
5393                     "mem_collxfrm_: Couldn't find any character to replace"
5394                     " embedded NULs in locale %s with", PL_collation_name));
5395                 goto bad;
5396             }
5397
5398             DEBUG_L(PerlIO_printf(Perl_debug_log,
5399                     "mem_collxfrm_: Replacing embedded NULs in locale %s with "
5400                     "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
5401
5402             Safefree(cur_min_x);
5403         } /* End of determining the character that is to replace NULs */
5404
5405         /* If the replacement is variant under UTF-8, it must match the
5406          * UTF8-ness of the original */
5407         if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
5408             this_replacement_char[0] =
5409                                 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
5410             this_replacement_char[1] =
5411                                 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
5412             this_replacement_len = 2;
5413         }
5414         else {
5415             this_replacement_char[0] = PL_strxfrm_NUL_replacement;
5416             /* this_replacement_char[1] = '\0' was done at initialization */
5417             this_replacement_len = 1;
5418         }
5419
5420         /* The worst case length for the replaced string would be if every
5421          * character in it is NUL.  Multiply that by the length of each
5422          * replacement, and allow for a trailing NUL */
5423         sans_nuls_len = (len * this_replacement_len) + 1;
5424         Newx(sans_nuls, sans_nuls_len, char);
5425         *sans_nuls = '\0';
5426
5427         /* Replace each NUL with the lowest collating control.  Loop until have
5428          * exhausted all the NULs */
5429         while (s + s_strlen < e) {
5430             my_strlcat(sans_nuls, s, sans_nuls_len);
5431
5432             /* Do the actual replacement */
5433             my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
5434
5435             /* Move past the input NUL */
5436             s += s_strlen + 1;
5437             s_strlen = strlen(s);
5438         }
5439
5440         /* And add anything that trails the final NUL */
5441         my_strlcat(sans_nuls, s, sans_nuls_len);
5442
5443         /* Switch so below we transform this modified string */
5444         s = sans_nuls;
5445         len = strlen(s);
5446     } /* End of replacing NULs */
5447
5448     /* Make sure the UTF8ness of the string and locale match */
5449     if (utf8 != PL_in_utf8_COLLATE_locale) {
5450         /* XXX convert above Unicode to 10FFFF? */
5451         const char * const t = s;   /* Temporary so we can later find where the
5452                                        input was */
5453
5454         /* Here they don't match.  Change the string's to be what the locale is
5455          * expecting */
5456
5457         if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
5458             s = (char *) bytes_to_utf8((const U8 *) s, &len);
5459             utf8 = TRUE;
5460         }
5461         else {   /* locale is not UTF-8; but input is; downgrade the input */
5462
5463             s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
5464
5465             /* If the downgrade was successful we are done, but if the input
5466              * contains things that require UTF-8 to represent, have to do
5467              * damage control ... */
5468             if (UNLIKELY(utf8)) {
5469
5470                 /* What we do is construct a non-UTF-8 string with
5471                  *  1) the characters representable by a single byte converted
5472                  *     to be so (if necessary);
5473                  *  2) and the rest converted to collate the same as the
5474                  *     highest collating representable character.  That makes
5475                  *     them collate at the end.  This is similar to how we
5476                  *     handle embedded NULs, but we use the highest collating
5477                  *     code point instead of the smallest.  Like the NUL case,
5478                  *     this isn't perfect, but is the best we can reasonably
5479                  *     do.  Every above-255 code point will sort the same as
5480                  *     the highest-sorting 0-255 code point.  If that code
5481                  *     point can combine in a sequence with some other code
5482                  *     points for weight calculations, us changing something to
5483                  *     be it can adversely affect the results.  But in most
5484                  *     cases, it should work reasonably.  And note that this is
5485                  *     really an illegal situation: using code points above 255
5486                  *     on a locale where only 0-255 are valid.  If two strings
5487                  *     sort entirely equal, then the sort order for the
5488                  *     above-255 code points will be in code point order. */
5489
5490                 utf8 = FALSE;
5491
5492                 /* If we haven't calculated the code point with the maximum
5493                  * collating order for this locale, do so now */
5494                 if (! PL_strxfrm_max_cp) {
5495                     int j;
5496
5497                     /* The current transformed string that collates the
5498                      * highest (except it also includes the prefixed collation
5499                      * index. */
5500                     char * cur_max_x = NULL;
5501
5502                     /* Look through all legal code points (NUL isn't) */
5503                     for (j = 1; j < 256; j++) {
5504                         char * x;
5505                         STRLEN x_len;
5506                         char cur_source[] = { '\0', '\0' };
5507
5508                         /* Create a 1-char string of the current code point */
5509                         cur_source[0] = (char) j;
5510
5511                         /* Then transform it */
5512                         x = mem_collxfrm_(cur_source, 1, &x_len, FALSE);
5513
5514                         /* If something went wrong (which it shouldn't), just
5515                          * ignore this code point */
5516                         if (! x) {
5517                             continue;
5518                         }
5519
5520                         /* If this character's transformation is higher than
5521                          * the current highest, this one becomes the highest */
5522                         if (   cur_max_x == NULL
5523                             || strGT(x         + COLLXFRM_HDR_LEN,
5524                                      cur_max_x + COLLXFRM_HDR_LEN))
5525                         {
5526                             PL_strxfrm_max_cp = j;
5527                             Safefree(cur_max_x);
5528                             cur_max_x = x;
5529                         }
5530                         else {
5531                             Safefree(x);
5532                         }
5533                     }
5534
5535                     if (! cur_max_x) {
5536                         DEBUG_L(PerlIO_printf(Perl_debug_log,
5537                             "mem_collxfrm_: Couldn't find any character to"
5538                             " replace above-Latin1 chars in locale %s with",
5539                             PL_collation_name));
5540                         goto bad;
5541                     }
5542
5543                     DEBUG_L(PerlIO_printf(Perl_debug_log,
5544                             "mem_collxfrm_: highest 1-byte collating character"
5545                             " in locale %s is 0x%02X\n",
5546                             PL_collation_name,
5547                             PL_strxfrm_max_cp));
5548
5549                     Safefree(cur_max_x);
5550                 }
5551
5552                 /* Here we know which legal code point collates the highest.
5553                  * We are ready to construct the non-UTF-8 string.  The length
5554                  * will be at least 1 byte smaller than the input string
5555                  * (because we changed at least one 2-byte character into a
5556                  * single byte), but that is eaten up by the trailing NUL */
5557                 Newx(s, len, char);
5558
5559                 {
5560                     STRLEN i;
5561                     STRLEN d= 0;
5562                     char * e = (char *) t + len;
5563
5564                     for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
5565                         U8 cur_char = t[i];
5566                         if (UTF8_IS_INVARIANT(cur_char)) {
5567                             s[d++] = cur_char;
5568                         }
5569                         else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
5570                             s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
5571                         }
5572                         else {  /* Replace illegal cp with highest collating
5573                                    one */
5574                             s[d++] = PL_strxfrm_max_cp;
5575                         }
5576                     }
5577                     s[d++] = '\0';
5578                     Renew(s, d, char);   /* Free up unused space */
5579                 }
5580             }
5581         }
5582
5583         /* Here, we have constructed a modified version of the input.  It could
5584          * be that we already had a modified copy before we did this version.
5585          * If so, that copy is no longer needed */
5586         if (t != input_string) {
5587             Safefree(t);
5588         }
5589     }
5590
5591     length_in_chars = (utf8)
5592                       ? utf8_length((U8 *) s, (U8 *) s + len)
5593                       : len;
5594
5595     /* The first element in the output is the collation id, used by
5596      * sv_collxfrm(); then comes the space for the transformed string.  The
5597      * equation should give us a good estimate as to how much is needed */
5598     xAlloc = COLLXFRM_HDR_LEN
5599            + PL_collxfrm_base
5600            + (PL_collxfrm_mult * length_in_chars);
5601     Newx(xbuf, xAlloc, char);
5602     if (UNLIKELY(! xbuf)) {
5603         DEBUG_L(PerlIO_printf(Perl_debug_log,
5604                       "mem_collxfrm_: Couldn't malloc %zu bytes\n", xAlloc));
5605         goto bad;
5606     }
5607
5608     /* Store the collation id */
5609     *(U32*)xbuf = PL_collation_ix;
5610
5611 #  if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
5612 #    ifdef USE_LOCALE_CTYPE
5613
5614     constructed_locale = newlocale(LC_CTYPE_MASK, PL_collation_name,
5615                                    duplocale(use_curlocale_scratch()));
5616 #    else
5617
5618     constructed_locale = duplocale(use_curlocale_scratch());
5619
5620 #    endif
5621 #    define my_strxfrm(dest, src, n)  strxfrm_l(dest, src, n,           \
5622                                                 constructed_locale)
5623 #    define CLEANUP_STRXFRM                                             \
5624         STMT_START {                                                    \
5625             if (constructed_locale != (locale_t) 0)                     \
5626                 freelocale(constructed_locale);                         \
5627         } STMT_END
5628 #  else
5629 #    define my_strxfrm(dest, src, n)  strxfrm(dest, src, n)
5630 #    ifdef USE_LOCALE_CTYPE
5631
5632     orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
5633
5634 #      define CLEANUP_STRXFRM                                           \
5635                 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale)
5636 #    else
5637 #      define CLEANUP_STRXFRM  NOOP
5638 #    endif
5639 #  endif
5640
5641     /* Then the transformation of the input.  We loop until successful, or we
5642      * give up */
5643     for (;;) {
5644
5645         errno = 0;
5646         *xlen = my_strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
5647
5648         /* If the transformed string occupies less space than we told strxfrm()
5649          * was available, it means it transformed the whole string. */
5650         if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
5651
5652             /* But there still could have been a problem */
5653             if (errno != 0) {
5654                 DEBUG_L(PerlIO_printf(Perl_debug_log,
5655                        "strxfrm failed for LC_COLLATE=%s; errno=%d, input=%s\n",
5656                        PL_collation_name, errno,
5657                        _byte_dump_string((U8 *) s, len, 0)));
5658                 goto bad;
5659             }
5660
5661             /* Here, the transformation was successful.  Some systems include a
5662              * trailing NUL in the returned length.  Ignore it, using a loop in
5663              * case multiple trailing NULs are returned. */
5664             while (   (*xlen) > 0
5665                    && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
5666             {
5667                 (*xlen)--;
5668             }
5669
5670             /* If the first try didn't get it, it means our prediction was low.
5671              * Modify the coefficients so that we predict a larger value in any
5672              * future transformations */
5673             if (! first_time) {
5674                 STRLEN needed = *xlen + 1;   /* +1 For trailing NUL */
5675                 STRLEN computed_guess = PL_collxfrm_base
5676                                       + (PL_collxfrm_mult * length_in_chars);
5677
5678                 /* On zero-length input, just keep current slope instead of
5679                  * dividing by 0 */
5680                 const STRLEN new_m = (length_in_chars != 0)
5681                                      ? needed / length_in_chars
5682                                      : PL_collxfrm_mult;
5683
5684                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5685                     "initial size of %zu bytes for a length "
5686                     "%zu string was insufficient, %zu needed\n",
5687                     computed_guess, length_in_chars, needed));
5688
5689                 /* If slope increased, use it, but discard this result for
5690                  * length 1 strings, as we can't be sure that it's a real slope
5691                  * change */
5692                 if (length_in_chars > 1 && new_m  > PL_collxfrm_mult) {
5693
5694 #  ifdef DEBUGGING
5695
5696                     STRLEN old_m = PL_collxfrm_mult;
5697                     STRLEN old_b = PL_collxfrm_base;
5698
5699 #  endif
5700
5701                     PL_collxfrm_mult = new_m;
5702                     PL_collxfrm_base = 1;   /* +1 For trailing NUL */
5703                     computed_guess = PL_collxfrm_base
5704                                     + (PL_collxfrm_mult * length_in_chars);
5705                     if (computed_guess < needed) {
5706                         PL_collxfrm_base += needed - computed_guess;
5707                     }
5708
5709                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5710                                     "slope is now %zu; was %zu, base "
5711                         "is now %zu; was %zu\n",
5712                         PL_collxfrm_mult, old_m,
5713                         PL_collxfrm_base, old_b));
5714                 }
5715                 else {  /* Slope didn't change, but 'b' did */
5716                     const STRLEN new_b = needed
5717                                         - computed_guess
5718                                         + PL_collxfrm_base;
5719                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5720                         "base is now %zu; was %zu\n", new_b, PL_collxfrm_base));
5721                     PL_collxfrm_base = new_b;
5722                 }
5723             }
5724
5725             break;
5726         }
5727
5728         if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
5729             DEBUG_L(PerlIO_printf(Perl_debug_log,
5730                   "mem_collxfrm_: Needed %zu bytes, max permissible is %u\n",
5731                   *xlen, PERL_INT_MAX));
5732             goto bad;
5733         }
5734
5735         /* A well-behaved strxfrm() returns exactly how much space it needs
5736          * (usually not including the trailing NUL) when it fails due to not
5737          * enough space being provided.  Assume that this is the case unless
5738          * it's been proven otherwise */
5739         if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
5740             xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
5741         }
5742         else { /* Here, either:
5743                 *  1)  The strxfrm() has previously shown bad behavior; or
5744                 *  2)  It isn't the first time through the loop, which means
5745                 *      that the strxfrm() is now showing bad behavior, because
5746                 *      we gave it what it said was needed in the previous
5747                 *      iteration, and it came back saying it needed still more.
5748                 *      (Many versions of cygwin fit this.  When the buffer size
5749                 *      isn't sufficient, they return the input size instead of
5750                 *      how much is needed.)
5751                 * Increase the buffer size by a fixed percentage and try again.
5752                 * */
5753             xAlloc += (xAlloc / 4) + 1;
5754             PL_strxfrm_is_behaved = FALSE;
5755
5756             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5757                      "mem_collxfrm_ required more space than previously"
5758                      " calculated for locale %s, trying again with new"
5759                      " guess=%zu+%zu\n",
5760                 PL_collation_name,  COLLXFRM_HDR_LEN,
5761                      xAlloc - COLLXFRM_HDR_LEN));
5762         }
5763
5764         Renew(xbuf, xAlloc, char);
5765         if (UNLIKELY(! xbuf)) {
5766             DEBUG_L(PerlIO_printf(Perl_debug_log,
5767                       "mem_collxfrm_: Couldn't realloc %zu bytes\n", xAlloc));
5768             goto bad;
5769         }
5770
5771         first_time = FALSE;
5772     }
5773
5774     CLEANUP_STRXFRM;
5775
5776     DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8));
5777
5778     /* Free up unneeded space; retain enough for trailing NUL */
5779     Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
5780
5781     if (s != input_string) {
5782         Safefree(s);
5783     }
5784
5785     return xbuf;
5786
5787   bad:
5788
5789     CLEANUP_STRXFRM;
5790     DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8));
5791
5792     Safefree(xbuf);
5793     if (s != input_string) {
5794         Safefree(s);
5795     }
5796     *xlen = 0;
5797
5798     return NULL;
5799 }
5800
5801 #  ifdef DEBUGGING
5802
5803 STATIC void
5804 S_print_collxfrm_input_and_return(pTHX_
5805                                   const char * s,
5806                                   const char * e,
5807                                   const char * xbuf,
5808                                   const STRLEN xlen,
5809                                   const bool is_utf8)
5810 {
5811
5812     PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
5813
5814     PerlIO_printf(Perl_debug_log,
5815                   "mem_collxfrm_[ix %" UVuf "] for locale '%s':\n",
5816                   (UV) PL_collation_ix, PL_collation_name);
5817     PerlIO_printf(Perl_debug_log,  "     input=");
5818     print_bytes_for_locale(s, e, is_utf8);
5819     PerlIO_printf(Perl_debug_log, "\n    return=%s\n    return len=%zu\n",
5820            ((xbuf == NULL)
5821              ? "(null)"
5822              : _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN, xlen, 0)),
5823             xlen);
5824 }
5825
5826 #  endif    /* DEBUGGING */
5827 #endif /* USE_LOCALE_COLLATE */
5828
5829 #ifdef DEBUGGING
5830
5831 STATIC void
5832 S_print_bytes_for_locale(pTHX_
5833                     const char * const s,
5834                     const char * const e,
5835                     const bool is_utf8)
5836 {
5837     const char * t = s;
5838     bool prev_was_printable = TRUE;
5839     bool first_time = TRUE;
5840
5841     PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
5842
5843     while (t < e) {
5844         UV cp = (is_utf8)
5845                 ?  utf8_to_uvchr_buf((U8 *) t, e, NULL)
5846                 : * (U8 *) t;
5847         if (isPRINT(cp)) {
5848             if (! prev_was_printable) {
5849                 PerlIO_printf(Perl_debug_log, " ");
5850             }
5851             PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
5852             prev_was_printable = TRUE;
5853         }
5854         else {
5855             if (! first_time) {
5856                 PerlIO_printf(Perl_debug_log, " ");
5857             }
5858             PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
5859             prev_was_printable = FALSE;
5860         }
5861         t += (is_utf8) ? UTF8SKIP(t) : 1;
5862         first_time = FALSE;
5863     }
5864 }
5865
5866 #endif   /* #ifdef DEBUGGING */
5867 #ifdef USE_LOCALE
5868
5869 STATIC const char *
5870 S_toggle_locale_i(pTHX_ const unsigned cat_index,
5871                         const char * new_locale,
5872                         const line_t caller_line)
5873 {
5874     /* Changes the locale for the category specified by 'index' to 'new_locale,
5875      * if they aren't already the same.
5876      *
5877      * Returns a copy of the name of the original locale for 'cat_index'
5878      * so can be switched back to with the companion function
5879      * restore_toggled_locale_i(),  (NULL if no restoral is necessary.) */
5880
5881     const char * locale_to_restore_to = NULL;
5882
5883     PERL_ARGS_ASSERT_TOGGLE_LOCALE_I;
5884     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
5885
5886     /* Find the original locale of the category we may need to change, so that
5887      * it can be restored to later */
5888
5889     locale_to_restore_to = querylocale_i(cat_index);
5890
5891     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5892              "(%" LINE_Tf "): toggle_locale_i: index=%d(%s), wanted=%s,"
5893              " actual=%s\n",
5894              caller_line, cat_index, category_names[cat_index],
5895              new_locale, locale_to_restore_to));
5896
5897     if (! locale_to_restore_to) {
5898         locale_panic_(Perl_form(aTHX_ "Could not find current %s locale, errno=%d",
5899                                 category_names[cat_index], errno));
5900     }
5901
5902     /* If the locales are the same, there's nothing to do */
5903     if (strEQ(locale_to_restore_to, new_locale)) {
5904         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5905                                "(%d): %s locale unchanged as %s\n",
5906                                caller_line, category_names[cat_index],
5907                                new_locale));
5908
5909         return NULL;
5910     }
5911
5912     /* Finally, change the locale to the new one */
5913     void_setlocale_i(cat_index, new_locale);
5914
5915     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "(%" LINE_Tf "): %s locale switched to %s\n",
5916                            caller_line, category_names[cat_index], new_locale));
5917
5918     return locale_to_restore_to;
5919
5920 #ifndef DEBUGGING
5921     PERL_UNUSED_ARG(caller_line);
5922 #endif
5923
5924 }
5925
5926 STATIC void
5927 S_restore_toggled_locale_i(pTHX_ const unsigned int cat_index,
5928                                  const char * restore_locale,
5929                                  const line_t caller_line)
5930 {
5931     /* Restores the locale for LC_category corresponding to cat_indes to
5932      * 'restore_locale' (which is a copy that will be freed by this function),
5933      * or do nothing if the latter parameter is NULL */
5934
5935     PERL_ARGS_ASSERT_RESTORE_TOGGLED_LOCALE_I;
5936     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
5937
5938     if (restore_locale == NULL) {
5939         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5940                                "(%" LINE_Tf "): No need to restore %s\n",
5941                                caller_line, category_names[cat_index]));
5942         return;
5943     }
5944
5945     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5946                            "(%" LINE_Tf "): %s restoring locale to %s\n",
5947                            caller_line, category_names[cat_index],
5948                            restore_locale));
5949
5950     void_setlocale_i(cat_index, restore_locale);
5951
5952 #ifndef DEBUGGING
5953     PERL_UNUSED_ARG(caller_line);
5954 #endif
5955
5956 }
5957
5958 STATIC const char *
5959 S_switch_category_locale_to_template(pTHX_ const int switch_category,
5960                                      const int template_category,
5961                                      const char * template_locale)
5962 {
5963     /* Changes the locale for LC_'switch_category" to that of
5964      * LC_'template_category', if they aren't already the same.  If not NULL,
5965      * 'template_locale' is the locale that 'template_category' is in.
5966      *
5967      * Returns a copy of the name of the original locale for 'switch_category'
5968      * so can be switched back to with the companion function
5969      * restore_switched_locale(),  (NULL if no restoral is necessary.) */
5970
5971     const char * restore_to_locale = NULL;
5972
5973     if (switch_category == template_category) { /* No changes needed */
5974         return NULL;
5975     }
5976
5977     /* Find the original locale of the category we may need to change, so that
5978      * it can be restored to later */
5979     restore_to_locale = querylocale_r(switch_category);
5980     if (! restore_to_locale) {
5981         locale_panic_(Perl_form(aTHX_ "Could not find current %s locale",
5982                                       category_name(switch_category)));
5983     }
5984     restore_to_locale = savepv(restore_to_locale);
5985
5986     /* If the locale of the template category wasn't passed in, find it now */
5987     if (template_locale == NULL) {
5988         template_locale = querylocale_r(template_category);
5989         if (! template_locale) {
5990             locale_panic_(Perl_form(aTHX_ "Could not find current %s locale\n",
5991                                           category_name(template_category)));
5992         }
5993     }
5994
5995     /* It the locales are the same, there's nothing to do */
5996     if (strEQ(restore_to_locale, template_locale)) {
5997         Safefree(restore_to_locale);
5998
5999         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n",
6000                             category_name(switch_category), template_locale));
6001
6002         return NULL;
6003     }
6004
6005     /* Finally, change the locale to the template one */
6006     if (! bool_setlocale_r(switch_category, template_locale)) {
6007         setlocale_failure_panic_i(get_category_index(switch_category,
6008                                                      NULL),
6009                                   category_name(switch_category),
6010                                   template_locale,
6011                                   __LINE__,
6012                                   __LINE__);
6013     }
6014
6015     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale switched to %s\n",
6016                             category_name(switch_category), template_locale));
6017
6018     return restore_to_locale;
6019 }
6020
6021 STATIC void
6022 S_restore_switched_locale(pTHX_ const int category,
6023                                 const char * const original_locale)
6024 {
6025     /* Restores the locale for LC_'category' to 'original_locale' (which is a
6026      * copy that will be freed by this function), or do nothing if the latter
6027      * parameter is NULL */
6028
6029     if (original_locale == NULL) {
6030         return;
6031     }
6032
6033     if (! bool_setlocale_r(category, original_locale)) {
6034         locale_panic_(Perl_form(aTHX_ "s restoring %s to %s failed",
6035                                       category_name(category), original_locale));
6036     }
6037
6038     Safefree(original_locale);
6039 }
6040
6041 /* is_cur_LC_category_utf8 uses a small char buffer to avoid malloc/free */
6042 #  define CUR_LC_BUFFER_SIZE  64
6043
6044 bool
6045 Perl__is_cur_LC_category_utf8(pTHX_ int category)
6046 {
6047     /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
6048      * otherwise. 'category' may not be LC_ALL.  If the platform doesn't have
6049      * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
6050      * could give the wrong result.  The result will very likely be correct for
6051      * languages that have commonly used non-ASCII characters, but for notably
6052      * English, it comes down to if the locale's name ends in something like
6053      * "UTF-8".  It errs on the side of not being a UTF-8 locale.
6054      *
6055      * If the platform is early C89, not containing mbtowc(), or we are
6056      * compiled to not pay attention to LC_CTYPE, this employs heuristics.
6057      * These work very well for non-Latin locales or those whose currency
6058      * symbol isn't a '$' nor plain ASCII text.  But without LC_CTYPE and at
6059      * least MB_CUR_MAX, English locales with an ASCII currency symbol depend
6060      * on the name containing UTF-8 or not. */
6061
6062     /* Name of current locale corresponding to the input category */
6063     const char *save_input_locale = NULL;
6064
6065     bool is_utf8 = FALSE;                /* The return value */
6066
6067     /* The variables below are for the cache of previous lookups using this
6068      * function.  The cache is a C string, described at the definition for
6069      * 'C_and_POSIX_utf8ness'.
6070      *
6071      * The first part of the cache is fixed, for the C and POSIX locales.  The
6072      * varying part starts just after them. */
6073     char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness);
6074
6075     Size_t utf8ness_cache_size; /* Size of the varying portion */
6076     Size_t input_name_len;      /* Length in bytes of save_input_locale */
6077     Size_t input_name_len_with_overhead;    /* plus extra chars used to store
6078                                                the name in the cache */
6079     char * delimited;           /* The name plus the delimiters used to store
6080                                    it in the cache */
6081     char buffer[CUR_LC_BUFFER_SIZE];        /* small buffer */
6082     char * name_pos;            /* position of 'delimited' in the cache, or 0
6083                                    if not there */
6084
6085
6086 #  ifdef LC_ALL
6087
6088     assert(category != LC_ALL);
6089
6090 #  endif
6091
6092     /* Get the desired category's locale */
6093     save_input_locale = querylocale_r(category);
6094
6095     DEBUG_L(PerlIO_printf(Perl_debug_log,
6096                           "Current locale for %s is %s\n",
6097                           category_name(category), save_input_locale));
6098
6099     input_name_len = strlen(save_input_locale);
6100
6101     /* In our cache, each name is accompanied by two delimiters and a single
6102      * utf8ness digit */
6103     input_name_len_with_overhead = input_name_len + 3;
6104
6105     if ( input_name_len_with_overhead <= CUR_LC_BUFFER_SIZE ) {
6106         /* we can use the buffer, avoid a malloc */
6107         delimited = buffer;
6108     } else { /* need a malloc */
6109         /* Allocate and populate space for a copy of the name surrounded by the
6110          * delimiters */
6111         Newx(delimited, input_name_len_with_overhead, char);
6112     }
6113
6114     delimited[0] = UTF8NESS_SEP[0];
6115     Copy(save_input_locale, delimited + 1, input_name_len, char);
6116     delimited[input_name_len+1] = UTF8NESS_PREFIX[0];
6117     delimited[input_name_len+2] = '\0';
6118
6119     /* And see if that is in the cache */
6120     name_pos = instr(PL_locale_utf8ness, delimited);
6121     if (name_pos) {
6122         is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0';
6123
6124         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6125                  "UTF8ness for locale %s=%d, \n",
6126                  save_input_locale, is_utf8));
6127
6128         /* And, if not already in that position, move it to the beginning of
6129          * the non-constant portion of the list, since it is the most recently
6130          * used.  (We don't have to worry about overflow, since just moving
6131          * existing names around) */
6132         if (name_pos > utf8ness_cache) {
6133             Move(utf8ness_cache,
6134                  utf8ness_cache + input_name_len_with_overhead,
6135                  name_pos - utf8ness_cache, char);
6136             Copy(delimited,
6137                  utf8ness_cache,
6138                  input_name_len_with_overhead - 1, char);
6139             utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
6140         }
6141
6142         /* free only when not using the buffer */
6143         if ( delimited != buffer ) Safefree(delimited);
6144         return is_utf8;
6145     }
6146
6147     /* Here we don't have stored the utf8ness for the input locale.  We have to
6148      * calculate it */
6149
6150 #  if        defined(USE_LOCALE_CTYPE)                                  \
6151      && (    defined(HAS_SOME_LANGINFO)                                 \
6152          || (defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)))
6153
6154     {
6155         const char *original_ctype_locale
6156                         = switch_category_locale_to_template(LC_CTYPE,
6157                                                              category,
6158                                                              save_input_locale);
6159
6160         /* Here the current LC_CTYPE is set to the locale of the category whose
6161          * information is desired.  This means that nl_langinfo() and mbtowc()
6162          * should give the correct results */
6163
6164 #    ifdef MB_CUR_MAX  /* But we can potentially rule out UTF-8ness, avoiding
6165                           calling the functions if we have this */
6166
6167             /* Standard UTF-8 needs at least 4 bytes to represent the maximum
6168              * Unicode code point. */
6169
6170             DEBUG_L(PerlIO_printf(Perl_debug_log, "MB_CUR_MAX=%d\n",
6171                                              (int) MB_CUR_MAX));
6172             if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) {
6173                 is_utf8 = FALSE;
6174                 restore_switched_locale(LC_CTYPE, original_ctype_locale);
6175                 goto finish_and_return;
6176             }
6177
6178 #    endif
6179 #    if defined(HAS_SOME_LANGINFO)
6180
6181         { /* The task is easiest if the platform has this POSIX 2001 function.
6182              Except on some platforms it can wrongly return "", so have to have
6183              a fallback.  And it can return that it's UTF-8, even if there are
6184              variances from that.  For example, Turkish locales may use the
6185              alternate dotted I rules, and sometimes it appears to be a
6186              defective locale definition.  XXX We should probably check for
6187              these in the Latin1 range and warn (but on glibc, requires
6188              iswalnum() etc. due to their not handling 80-FF correctly */
6189             const char * scratch_buffer = NULL;
6190             const char *codeset = my_langinfo_c(CODESET, LC_CTYPE,
6191                                                 save_input_locale,
6192                                                 &scratch_buffer, NULL, NULL);
6193
6194             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6195                             "\tnllanginfo returned CODESET '%s'\n", codeset));
6196
6197             if (codeset && strNE(codeset, "")) {
6198
6199                               /* If the implementation of foldEQ() somehow were
6200                                * to change to not go byte-by-byte, this could
6201                                * read past end of string, as only one length is
6202                                * checked.  But currently, a premature NUL will
6203                                * compare false, and it will stop there */
6204                 is_utf8 = cBOOL(   foldEQ(codeset, "UTF-8", STRLENs("UTF-8"))
6205                                 || foldEQ(codeset, "UTF8",  STRLENs("UTF8")));
6206
6207                 DEBUG_L(PerlIO_printf(Perl_debug_log,
6208                        "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
6209                                                      codeset,         is_utf8));
6210                 restore_switched_locale(LC_CTYPE, original_ctype_locale);
6211                 Safefree(scratch_buffer);
6212                 goto finish_and_return;
6213             }
6214         }
6215
6216 #    endif
6217 #    if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
6218      /* We can see if this is a UTF-8-like locale if have mbtowc().  It was a
6219       * late adder to C89, so very likely to have it.  However, testing has
6220       * shown that, like nl_langinfo() above, there are locales that are not
6221       * strictly UTF-8 that this will return that they are */
6222         {
6223             wchar_t wc = 0;
6224             int len;
6225
6226             PERL_UNUSED_RESULT(mbtowc_(NULL, NULL, 0));
6227             len = mbtowc_(&wc, REPLACEMENT_CHARACTER_UTF8,
6228                        STRLENs(REPLACEMENT_CHARACTER_UTF8));
6229
6230             is_utf8 = cBOOL(   len == STRLENs(REPLACEMENT_CHARACTER_UTF8)
6231                             && wc == (wchar_t) UNICODE_REPLACEMENT);
6232         }
6233
6234 #    endif
6235
6236         restore_switched_locale(LC_CTYPE, original_ctype_locale);
6237         goto finish_and_return;
6238     }
6239
6240 #  else
6241
6242         /* Here, we must have a C89 compiler that doesn't have mbtowc().  Next
6243          * try looking at the currency symbol to see if it disambiguates
6244          * things.  Often that will be in the native script, and if the symbol
6245          * isn't in UTF-8, we know that the locale isn't.  If it is non-ASCII
6246          * UTF-8, we infer that the locale is too, as the odds of a non-UTF8
6247          * string being valid UTF-8 are quite small */
6248
6249 #    ifdef USE_LOCALE_MONETARY
6250
6251         /* If have LC_MONETARY, we can look at the currency symbol.  Often that
6252          * will be in the native script.  We do this one first because there is
6253          * just one string to examine, so potentially avoids work */
6254
6255         {
6256             const char *original_monetary_locale
6257                         = switch_category_locale_to_template(LC_MONETARY,
6258                                                              category,
6259                                                              save_input_locale);
6260             bool only_ascii = FALSE;
6261             const char * scratch_buffer = NULL;
6262             const U8 * currency_string
6263                         = (const U8 *) my_langinfo_c(CRNCYSTR, LC_MONETARY,
6264                                                    save_input_locale,
6265                                                    &scratch_buffer, NULL, NULL);
6266                                       /* 2nd param not relevant for this item */
6267             const U8 * first_variant;
6268
6269             assert(   *currency_string == '-'
6270                    || *currency_string == '+'
6271                    || *currency_string == '.');
6272
6273             currency_string++;
6274
6275             if (is_utf8_invariant_string_loc(currency_string, 0, &first_variant))
6276             {
6277                 DEBUG_L(PerlIO_printf(Perl_debug_log,
6278                         "Couldn't get currency symbol for %s, or contains"
6279                         " only ASCII; can't use for determining if UTF-8"
6280                         " locale\n", save_input_locale));
6281                 only_ascii = TRUE;
6282             }
6283             else {
6284                 is_utf8 = is_strict_utf8_string(first_variant, 0);
6285             }
6286             Safefree(scratch_buffer);
6287
6288             restore_switched_locale(LC_MONETARY, original_monetary_locale);
6289
6290             if (! only_ascii) {
6291
6292                 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
6293                  * otherwise assume the locale is UTF-8 if and only if the symbol
6294                  * is non-ascii UTF-8. */
6295                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6296                                       "\t?Currency symbol for %s is UTF-8=%d\n",
6297                                         save_input_locale, is_utf8));
6298                 goto finish_and_return;
6299             }
6300         }
6301
6302 #    endif /* USE_LOCALE_MONETARY */
6303 #    if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
6304
6305     /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not.  Try
6306      * the names of the months and weekdays, timezone, and am/pm indicator */
6307         {
6308             const char *original_time_locale
6309                             = switch_category_locale_to_template(LC_TIME,
6310                                                                  category,
6311                                                                  save_input_locale);
6312             int hour = 10;
6313             bool is_dst = FALSE;
6314             int dom = 1;
6315             int month = 0;
6316             int i;
6317             char * formatted_time;
6318
6319             /* Here the current LC_TIME is set to the locale of the category
6320              * whose information is desired.  Look at all the days of the week
6321              * and month names, and the timezone and am/pm indicator for UTF-8
6322              * variant characters.  The first such a one found will tell us if
6323              * the locale is UTF-8 or not */
6324
6325             for (i = 0; i < 7 + 12; i++) {  /* 7 days; 12 months */
6326                 formatted_time = my_strftime("%A %B %Z %p",
6327                                 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
6328                 if ( ! formatted_time
6329                     || is_utf8_invariant_string((U8 *) formatted_time, 0))
6330                 {
6331
6332                     /* Here, we didn't find a non-ASCII.  Try the next time
6333                      * through with the complemented dst and am/pm, and try
6334                      * with the next weekday.  After we have gotten all
6335                      * weekdays, try the next month */
6336                     is_dst = ! is_dst;
6337                     hour = (hour + 12) % 24;
6338                     dom++;
6339                     if (i > 6) {
6340                         month++;
6341                     }
6342                     Safefree(formatted_time);
6343                     continue;
6344                 }
6345
6346                 /* Here, we have a non-ASCII.  Return TRUE is it is valid UTF8;
6347                  * false otherwise.  But first, restore LC_TIME to its original
6348                  * locale if we changed it */
6349                 restore_switched_locale(LC_TIME, original_time_locale);
6350
6351                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6352                             "\t?time-related strings for %s are UTF-8=%d\n",
6353                                     save_input_locale,
6354                                     is_utf8_string((U8 *) formatted_time, 0)));
6355                 is_utf8 = is_utf8_string((U8 *) formatted_time, 0);
6356                 Safefree(formatted_time);
6357                 goto finish_and_return;
6358             }
6359
6360             /* Falling off the end of the loop indicates all the names were just
6361              * ASCII.  Go on to the next test.  If we changed it, restore LC_TIME
6362              * to its original locale */
6363             restore_switched_locale(LC_TIME, original_time_locale);
6364             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6365                      "All time-related words for %s contain only ASCII;"
6366                      " can't use for determining if UTF-8 locale\n",
6367                      save_input_locale));
6368         }
6369
6370 #    endif
6371
6372 #    if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
6373
6374     /* This code is ifdefd out because it was found to not be necessary in
6375      * testing on our dromedary test machine, which has over 700 locales.
6376      * There, this added no value to looking at the currency symbol and the
6377      * time strings.  I left it in so as to avoid rewriting it if real-world
6378      * experience indicates that dromedary is an outlier.  Essentially, instead
6379      * of returning abpve if we haven't found illegal utf8, we continue on and
6380      * examine all the strerror() messages on the platform for utf8ness.  If
6381      * all are ASCII, we still don't know the answer; but otherwise we have a
6382      * pretty good indication of the utf8ness.  The reason this doesn't help
6383      * much is that the messages may not have been translated into the locale.
6384      * The currency symbol and time strings are much more likely to have been
6385      * translated.  */
6386         {
6387             int e;
6388             bool non_ascii = FALSE;
6389             const char *original_messages_locale
6390                             = switch_category_locale_to_template(LC_MESSAGES,
6391                                                                  category,
6392                                                                  save_input_locale);
6393             const char * errmsg = NULL;
6394
6395             /* Here the current LC_MESSAGES is set to the locale of the category
6396              * whose information is desired.  Look through all the messages.  We
6397              * can't use Strerror() here because it may expand to code that
6398              * segfaults in miniperl */
6399
6400             for (e = 0; e <= sys_nerr; e++) {
6401                 errno = 0;
6402                 errmsg = sys_errlist[e];
6403                 if (errno || !errmsg) {
6404                     break;
6405                 }
6406                 errmsg = savepv(errmsg);
6407                 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
6408                     non_ascii = TRUE;
6409                     is_utf8 = is_utf8_string((U8 *) errmsg, 0);
6410                     break;
6411                 }
6412             }
6413             Safefree(errmsg);
6414
6415             restore_switched_locale(LC_MESSAGES, original_messages_locale);
6416
6417             if (non_ascii) {
6418
6419                 /* Any non-UTF-8 message means not a UTF-8 locale; if all are
6420                  * valid, any non-ascii means it is one; otherwise we assume it
6421                  * isn't */
6422                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6423                                     "\t?error messages for %s are UTF-8=%d\n",
6424                                     save_input_locale,
6425                                     is_utf8));
6426                 goto finish_and_return;
6427             }
6428
6429             DEBUG_L(PerlIO_printf(Perl_debug_log,
6430                     "All error messages for %s contain only ASCII;"
6431                     " can't use for determining if UTF-8 locale\n",
6432                     save_input_locale));
6433         }
6434
6435 #    endif
6436 #    ifndef EBCDIC  /* On os390, even if the name ends with "UTF-8', it isn't a
6437                    UTF-8 locale */
6438
6439     /* As a last resort, look at the locale name to see if it matches
6440      * qr/UTF -?  * 8 /ix, or some other common locale names.  This "name", the
6441      * return of setlocale(), is actually defined to be opaque, so we can't
6442      * really rely on the absence of various substrings in the name to indicate
6443      * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
6444      * be a UTF-8 locale.  Similarly for the other common names */
6445
6446     {
6447         const Size_t final_pos = strlen(save_input_locale) - 1;
6448
6449         if (final_pos >= 3) {
6450             const char *name = save_input_locale;
6451
6452             /* Find next 'U' or 'u' and look from there */
6453             while ((name += strcspn(name, "Uu") + 1)
6454                                         <= save_input_locale + final_pos - 2)
6455             {
6456                 if (   isALPHA_FOLD_NE(*name, 't')
6457                     || isALPHA_FOLD_NE(*(name + 1), 'f'))
6458                 {
6459                     continue;
6460                 }
6461                 name += 2;
6462                 if (*(name) == '-') {
6463                     if ((name > save_input_locale + final_pos - 1)) {
6464                         break;
6465                     }
6466                     name++;
6467                 }
6468                 if (*(name) == '8') {
6469                     DEBUG_L(PerlIO_printf(Perl_debug_log,
6470                                         "Locale %s ends with UTF-8 in name\n",
6471                                         save_input_locale));
6472                     is_utf8 = TRUE;
6473                     goto finish_and_return;
6474                 }
6475             }
6476             DEBUG_L(PerlIO_printf(Perl_debug_log,
6477                                 "Locale %s doesn't end with UTF-8 in name\n",
6478                                     save_input_locale));
6479         }
6480
6481 #      ifdef WIN32
6482
6483         /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
6484         if (memENDs(save_input_locale, final_pos, "65001")) {
6485             DEBUG_L(PerlIO_printf(Perl_debug_log,
6486                         "Locale %s ends with 65001 in name, is UTF-8 locale\n",
6487                         save_input_locale));
6488             is_utf8 = TRUE;
6489             goto finish_and_return;
6490         }
6491
6492 #      endif
6493     }
6494 #    endif
6495
6496     /* Other common encodings are the ISO 8859 series, which aren't UTF-8.  But
6497      * since we are about to return FALSE anyway, there is no point in doing
6498      * this extra work */
6499
6500 #    if 0
6501     if (instr(save_input_locale, "8859")) {
6502         DEBUG_L(PerlIO_printf(Perl_debug_log,
6503                              "Locale %s has 8859 in name, not UTF-8 locale\n",
6504                              save_input_locale));
6505         is_utf8 = FALSE;
6506         goto finish_and_return;
6507     }
6508 #    endif
6509
6510     DEBUG_L(PerlIO_printf(Perl_debug_log,
6511                           "Assuming locale %s is not a UTF-8 locale\n",
6512                                     save_input_locale));
6513     is_utf8 = FALSE;
6514
6515 #  endif /* the code that is compiled when no modern LC_CTYPE */
6516
6517   finish_and_return:
6518
6519     /* Cache this result so we don't have to go through all this next time. */
6520     utf8ness_cache_size = sizeof(PL_locale_utf8ness)
6521                        - (utf8ness_cache - PL_locale_utf8ness);
6522
6523     /* But we can't save it if it is too large for the total space available */
6524     if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) {
6525         Size_t utf8ness_cache_len = strlen(utf8ness_cache);
6526
6527         /* Here it can fit, but we may need to clear out the oldest cached
6528          * result(s) to do so.  Check */
6529         if (utf8ness_cache_len + input_name_len_with_overhead
6530                                                         >= utf8ness_cache_size)
6531         {
6532             /* Here we have to clear something out to make room for this.
6533              * Start looking at the rightmost place where it could fit and find
6534              * the beginning of the entry that extends past that. */
6535             char * cutoff = (char *) my_memrchr(utf8ness_cache,
6536                                                 UTF8NESS_SEP[0],
6537                                                 utf8ness_cache_size
6538                                               - input_name_len_with_overhead);
6539
6540             assert(cutoff);
6541             assert(cutoff >= utf8ness_cache);
6542
6543             /* This and all subsequent entries must be removed */
6544             *cutoff = '\0';
6545             utf8ness_cache_len = strlen(utf8ness_cache);
6546         }
6547
6548         /* Make space for the new entry */
6549         Move(utf8ness_cache,
6550              utf8ness_cache + input_name_len_with_overhead,
6551              utf8ness_cache_len + 1 /* Incl. trailing NUL */, char);
6552
6553         /* And insert it */
6554         Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char);
6555         utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
6556
6557         if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1] & ~1) != '0') {
6558             locale_panic_(Perl_form(aTHX_
6559                                     "Corrupt utf8ness_cache=%s\nlen=%zu,"
6560                                     " inserted_name=%s, its_len=%zu",
6561                                     PL_locale_utf8ness, strlen(PL_locale_utf8ness),
6562                                     delimited, input_name_len_with_overhead));
6563         }
6564     }
6565
6566 #  ifdef DEBUGGING
6567
6568     if (DEBUG_Lv_TEST) {
6569         const char * s = PL_locale_utf8ness;
6570
6571         /* Audit the structure */
6572         while (s < PL_locale_utf8ness + strlen(PL_locale_utf8ness)) {
6573             const char *e;
6574
6575             if (*s != UTF8NESS_SEP[0]) {
6576                 locale_panic_(Perl_form(aTHX_
6577                                         "Corrupt utf8ness_cache: missing"
6578                                         " separator %.*s<-- HERE %s",
6579                                         (int) (s - PL_locale_utf8ness),
6580                                         PL_locale_utf8ness,
6581                                         s));
6582             }
6583             s++;
6584             e = strchr(s, UTF8NESS_PREFIX[0]);
6585             if (! e) {
6586                 e = PL_locale_utf8ness + strlen(PL_locale_utf8ness);
6587                 locale_panic_(Perl_form(aTHX_
6588                                         "Corrupt utf8ness_cache: missing"
6589                                         " separator %.*s<-- HERE %s",
6590                                         (int) (e - PL_locale_utf8ness),
6591                                         PL_locale_utf8ness,
6592                                         e));
6593             }
6594             e++;
6595             if (*e != '0' && *e != '1') {
6596                 locale_panic_(Perl_form(aTHX_
6597                                         "Corrupt utf8ness_cache: utf8ness"
6598                                         " must be [01] %.*s<-- HERE %s",
6599                                         (int) (e + 1 - PL_locale_utf8ness),
6600                                         PL_locale_utf8ness,
6601                                         e + 1));
6602             }
6603             if (ninstr(PL_locale_utf8ness, s, s-1, e)) {
6604                 locale_panic_(Perl_form(aTHX_
6605                                         "Corrupt utf8ness_cache: entry"
6606                                         " has duplicate %.*s<-- HERE %s",
6607                                         (int) (e - PL_locale_utf8ness),
6608                                         PL_locale_utf8ness,
6609                                         e));
6610             }
6611             s = e + 1;
6612         }
6613     }
6614
6615     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6616                 "PL_locale_utf8ness is now %s; returning %d\n",
6617                            PL_locale_utf8ness, is_utf8));
6618
6619 #  endif
6620
6621     /* free only when not using the buffer */
6622     if ( delimited != buffer ) Safefree(delimited);
6623     return is_utf8;
6624 }
6625
6626 STATIC bool
6627 S_is_codeset_name_UTF8(const char * name)
6628 {
6629     /* Return a boolean as to if the passed-in name indicates it is a UTF-8
6630      * code set.  Several variants are possible */
6631     const Size_t len = strlen(name);
6632
6633     PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8;
6634
6635 #  ifdef WIN32
6636
6637     /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
6638     if (memENDs(name, len, "65001")) {
6639         return TRUE;
6640     }
6641
6642 #  endif
6643                /* 'UTF8' or 'UTF-8' */
6644     return (    inRANGE(len, 4, 5)
6645             &&  name[len-1] == '8'
6646             && (   memBEGINs(name, len, "UTF")
6647                 || memBEGINs(name, len, "utf"))
6648             && (len == 4 || name[3] == '-'));
6649 }
6650
6651 STATIC bool
6652 S_is_locale_utf8(pTHX_ const char * locale)
6653 {
6654     /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise.  It uses
6655      * my_langinfo(), which employs various methods to get this information
6656      * if nl_langinfo() isn't available, using heuristics as a last resort, in
6657      * which case, the result will very likely be correct for locales for
6658      * languages that have commonly used non-ASCII characters, but for notably
6659      * English, it comes down to if the locale's name ends in something like
6660      * "UTF-8".  It errs on the side of not being a UTF-8 locale. */
6661
6662 #  if ! defined(USE_LOCALE_CTYPE)                                             \
6663    ||   defined(EBCDIC) /* There aren't any real UTF-8 locales at this time */
6664
6665     PERL_UNUSED_ARG(locale);
6666
6667     return FALSE;
6668
6669 #  else
6670
6671     const char * scratch_buffer = NULL;
6672     const char * codeset;
6673     bool retval;
6674
6675     PERL_ARGS_ASSERT_IS_LOCALE_UTF8;
6676
6677     if (PL_ctype_name && strEQ(locale, PL_ctype_name)) {
6678         return PL_in_utf8_CTYPE_locale;
6679     }
6680
6681     codeset = my_langinfo_c(CODESET, LC_CTYPE, locale,
6682                             &scratch_buffer, NULL, NULL);
6683     retval = is_codeset_name_UTF8(codeset);
6684
6685     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6686                            "found codeset=%s, is_utf8=%d\n", codeset, retval));
6687
6688     Safefree(scratch_buffer);
6689     return retval;
6690
6691 #  endif
6692
6693 }
6694
6695 #endif  /* USE_LOCALE */
6696
6697 bool
6698 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
6699 {
6700     /* Internal function which returns if we are in the scope of a pragma that
6701      * enables the locale category 'category'.  'compiling' should indicate if
6702      * this is during the compilation phase (TRUE) or not (FALSE). */
6703
6704     const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
6705
6706     SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0);
6707     if (! these_categories || these_categories == &PL_sv_placeholder) {
6708         return FALSE;
6709     }
6710
6711     /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
6712      * a valid unsigned */
6713     assert(category >= -1);
6714     return cBOOL(SvUV(these_categories) & (1U << (category + 1)));
6715 }
6716
6717 /* my_strerror() returns a mortalized copy of the text of the error message
6718  * associated with 'errnum'.
6719  *
6720  * If not called from within the scope of 'use locale', it uses the text from
6721  * the C locale.  If Perl is compiled to not pay attention to LC_CTYPE nor
6722  * LC_MESSAGES, it uses whatever strerror() returns.  Otherwise the text is
6723  * derived from the locale, LC_MESSAGES if we have that; LC_CTYPE if not.
6724  *
6725  * It returns in *utf8ness the result's UTF-8ness
6726  *
6727  * The function just calls strerror(), but temporarily switches locales, if
6728  * needed.  Many platforms require LC_CTYPE and LC_MESSAGES to be in the same
6729  * CODESET in order for the return from strerror() to not contain '?' symbols,
6730  * or worse, mojibaked.  It's cheaper to just use the stricter criteria of
6731  * being in the same locale.  So the code below uses a common locale for both
6732  * categories.  Again, that is C if not within 'use locale' scope; or the
6733  * LC_MESSAGES locale if in scope and we have that category; and LC_CTYPE if we
6734  * don't have LC_MESSAGES; and whatever strerror returns if we don't have
6735  * either category.
6736  *
6737  * There are two sets of implementations.  The first below is if we have
6738  * strerror_l().  This is the simpler.  We just use the already-built C locale
6739  * object if not in locale scope, or build up a custom one otherwise.
6740  *
6741  * When strerror_l() is not available, we may have to swap locales temporarily
6742  * to bring the two categories into sync with each other, and possibly to the C
6743  * locale.
6744  *
6745  * Because the prepropessing directives to conditionally compile this function
6746  * would greatly obscure the logic of the various implementations, the whole
6747  * function is repeated for each configuration, with some common macros. */
6748
6749 /* Used to shorten the definitions of the following implementations of
6750  * my_strerror() */
6751 #define DEBUG_STRERROR_ENTER(errnum, in_locale)                             \
6752     DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                  \
6753                            "my_strerror called with errnum %d;"             \
6754                            " Within locale scope=%d\n",                     \
6755                            errnum, in_locale))
6756 #define DEBUG_STRERROR_RETURN(errstr, utf8ness)                             \
6757     DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                  \
6758                            "Strerror returned; saving a copy: '");          \
6759        print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);          \
6760        PerlIO_printf(Perl_debug_log, "'; utf8ness=%d\n", (int) *utf8ness);)
6761
6762 /* On platforms that have precisely one of these categories (Windows
6763  * qualifies), these yield the correct one */
6764 #if defined(USE_LOCALE_CTYPE)
6765 #  define WHICH_LC_INDEX LC_CTYPE_INDEX_
6766 #elif defined(USE_LOCALE_MESSAGES)
6767 #  define WHICH_LC_INDEX LC_MESSAGES_INDEX_
6768 #endif
6769
6770 /*==========================================================================*/
6771 /* First set of implementations, when have strerror_l() */
6772
6773 #if defined(USE_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
6774
6775 #  if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
6776
6777 /* Here, neither category is defined: use the C locale */
6778 char *
6779 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6780 {
6781     PERL_ARGS_ASSERT_MY_STRERROR;
6782
6783     DEBUG_STRERROR_ENTER(errnum, 0);
6784
6785     char *errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
6786     *utf8ness = UTF8NESS_IMMATERIAL;
6787
6788     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6789
6790     SAVEFREEPV(errstr);
6791     return errstr;
6792 }
6793
6794 #  elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
6795
6796 /*--------------------------------------------------------------------------*/
6797
6798 /* Here one or the other of CTYPE or MESSAGES is defined, but not both.  If we
6799  * are not within 'use locale' scope of the only one defined, we use the C
6800  * locale; otherwise use the current locale object */
6801
6802 char *
6803 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6804 {
6805     PERL_ARGS_ASSERT_MY_STRERROR;
6806
6807     DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
6808
6809     /* Use C if not within locale scope;  Otherwise, use current locale */
6810     const locale_t which_obj = (IN_LC(categories[WHICH_LC_INDEX]))
6811                                ? PL_C_locale_obj
6812                                : use_curlocale_scratch();
6813
6814     char *errstr = savepv(strerror_l(errnum, which_obj));
6815     *utf8ness = get_locale_string_utf8ness_i(NULL, WHICH_LC_INDEX, errstr,
6816                                              LOCALE_UTF8NESS_UNKNOWN);
6817     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6818
6819     SAVEFREEPV(errstr);
6820     return errstr;
6821 }
6822
6823 /*--------------------------------------------------------------------------*/
6824 #  else     /* Are using both categories.  Place them in the same CODESET,
6825              * either C or the LC_MESSAGES locale */
6826
6827 char *
6828 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6829 {
6830     PERL_ARGS_ASSERT_MY_STRERROR;
6831
6832     DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
6833
6834     char *errstr;
6835     if (! IN_LC(LC_MESSAGES)) {    /* Use C if not within locale scope */
6836         errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
6837         *utf8ness = UTF8NESS_IMMATERIAL;
6838     }
6839     else {  /* Otherwise, use the LC_MESSAGES locale, making sure LC_CTYPE
6840                matches */
6841         locale_t cur = duplocale(use_curlocale_scratch());
6842
6843         cur = newlocale(LC_CTYPE_MASK, querylocale_c(LC_MESSAGES), cur);
6844         errstr = savepv(strerror_l(errnum, cur));
6845         *utf8ness = get_locale_string_utf8ness_i(NULL, LC_MESSAGES_INDEX_,
6846                                                errstr, LOCALE_UTF8NESS_UNKNOWN);
6847         freelocale(cur);
6848     }
6849
6850     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6851
6852     SAVEFREEPV(errstr);
6853     return errstr;
6854 }
6855 #  endif    /* Above is using strerror_l */
6856 /*==========================================================================*/
6857 #else       /* Below is not using strerror_l */
6858 #  if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
6859
6860 /* If not using using either of the categories, return plain, unadorned
6861  * strerror */
6862
6863 char *
6864 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6865 {
6866     PERL_ARGS_ASSERT_MY_STRERROR;
6867
6868     DEBUG_STRERROR_ENTER(errnum, 0);
6869
6870     char *errstr = savepv(Strerror(errnum));
6871     *utf8ness = UTF8NESS_IMMATERIAL;
6872
6873     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6874
6875     SAVEFREEPV(errstr);
6876     return errstr;
6877 }
6878
6879 /*--------------------------------------------------------------------------*/
6880 #  elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
6881
6882 /* Here one or the other of CTYPE or MESSAGES is defined, but not both.  If we
6883  * are not within 'use locale' scope of the only one defined, we use the C
6884  * locale; otherwise use the current locale */
6885
6886 char *
6887 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6888 {
6889     PERL_ARGS_ASSERT_MY_STRERROR;
6890
6891     DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
6892
6893     char *errstr;
6894     if (IN_LC(categories[WHICH_LC_INDEX])) {
6895         errstr = savepv(Strerror(errnum));
6896         *utf8ness = get_locale_string_utf8ness_i(NULL, WHICH_LC_INDEX, errstr,
6897                                                  LOCALE_UTF8NESS_UNKNOWN);
6898     }
6899     else {
6900
6901         SETLOCALE_LOCK;
6902
6903         const char * orig_locale = toggle_locale_i(WHICH_LC_INDEX, "C");
6904
6905         errstr = savepv(Strerror(errnum));
6906
6907         restore_toggled_locale_i(WHICH_LC_INDEX, orig_locale);
6908
6909         SETLOCALE_UNLOCK;
6910
6911         *utf8ness = UTF8NESS_IMMATERIAL;
6912
6913     }
6914
6915     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6916
6917     SAVEFREEPV(errstr);
6918     return errstr;
6919 }
6920
6921 /*--------------------------------------------------------------------------*/
6922 #  else
6923
6924 /* Below, have both LC_CTYPE and LC_MESSAGES.  Place them in the same CODESET,
6925  * either C or the LC_MESSAGES locale */
6926
6927 char *
6928 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6929 {
6930     PERL_ARGS_ASSERT_MY_STRERROR;
6931
6932     DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
6933
6934     const char * desired_locale = (IN_LC(LC_MESSAGES))
6935                                   ? querylocale_c(LC_MESSAGES)
6936                                   : "C";
6937     /* XXX Can fail on z/OS */
6938
6939     SETLOCALE_LOCK;
6940
6941     const char* orig_CTYPE_locale    = toggle_locale_c(LC_CTYPE, desired_locale);
6942     const char* orig_MESSAGES_locale = toggle_locale_c(LC_MESSAGES,
6943                                                        desired_locale);
6944     char *errstr = savepv(Strerror(errnum));
6945
6946     restore_toggled_locale_c(LC_MESSAGES, orig_MESSAGES_locale);
6947     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
6948
6949     SETLOCALE_UNLOCK;
6950
6951     *utf8ness = get_locale_string_utf8ness_i(NULL, LC_MESSAGES_INDEX_, errstr,
6952                                              LOCALE_UTF8NESS_UNKNOWN);
6953     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6954
6955     SAVEFREEPV(errstr);
6956     return errstr;
6957 }
6958
6959 /*--------------------------------------------------------------------------*/
6960 #  endif /* end of not using strerror_l() */
6961 #endif   /* end of all the my_strerror() implementations */
6962
6963 /*
6964
6965 =for apidoc switch_to_global_locale
6966
6967 On systems without locale support, or on typical single-threaded builds, or on
6968 platforms that do not support per-thread locale operations, this function does
6969 nothing.  On such systems that do have locale support, only a locale global to
6970 the whole program is available.
6971
6972 On multi-threaded builds on systems that do have per-thread locale operations,
6973 this function converts the thread it is running in to use the global locale.
6974 This is for code that has not yet or cannot be updated to handle multi-threaded
6975 locale operation.  As long as only a single thread is so-converted, everything
6976 works fine, as all the other threads continue to ignore the global one, so only
6977 this thread looks at it.
6978
6979 However, on Windows systems this isn't quite true prior to Visual Studio 15,
6980 at which point Microsoft fixed a bug.  A race can occur if you use the
6981 following operations on earlier Windows platforms:
6982
6983 =over
6984
6985 =item L<POSIX::localeconv|POSIX/localeconv>
6986
6987 =item L<I18N::Langinfo>, items C<CRNCYSTR> and C<THOUSEP>
6988
6989 =item L<perlapi/Perl_langinfo>, items C<CRNCYSTR> and C<THOUSEP>
6990
6991 =back
6992
6993 The first item is not fixable (except by upgrading to a later Visual Studio
6994 release), but it would be possible to work around the latter two items by using
6995 the Windows API functions C<GetNumberFormat> and C<GetCurrencyFormat>; patches
6996 welcome.
6997
6998 Without this function call, threads that use the L<C<setlocale(3)>> system
6999 function will not work properly, as all the locale-sensitive functions will
7000 look at the per-thread locale, and C<setlocale> will have no effect on this
7001 thread.
7002
7003 Perl code should convert to either call
7004 L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in for the system
7005 C<setlocale>) or use the methods given in L<perlcall> to call
7006 L<C<POSIX::setlocale>|POSIX/setlocale>.  Either one will transparently properly
7007 handle all cases of single- vs multi-thread, POSIX 2008-supported or not.
7008
7009 Non-Perl libraries, such as C<gtk>, that call the system C<setlocale> can
7010 continue to work if this function is called before transferring control to the
7011 library.
7012
7013 Upon return from the code that needs to use the global locale,
7014 L<C<sync_locale()>|perlapi/sync_locale> should be called to restore the safe
7015 multi-thread operation.
7016
7017 =cut
7018 */
7019
7020 void
7021 Perl_switch_to_global_locale()
7022 {
7023     dTHX;
7024
7025 #ifdef USE_THREAD_SAFE_LOCALE
7026 #  ifdef WIN32
7027
7028     _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
7029
7030 #  else
7031
7032     {
7033         unsigned int i;
7034
7035         for (i = 0; i < LC_ALL_INDEX_; i++) {
7036             setlocale(categories[i], querylocale_i(i));
7037         }
7038     }
7039
7040     uselocale(LC_GLOBAL_LOCALE);
7041
7042 #  endif
7043 #endif
7044
7045 }
7046
7047 /*
7048
7049 =for apidoc sync_locale
7050
7051 L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or
7052 change the locale (though changing the locale is antisocial and dangerous on
7053 multi-threaded systems that don't have multi-thread safe locale operations.
7054 (See L<perllocale/Multi-threaded operation>).  Using the system
7055 L<C<setlocale(3)>> should be avoided.  Nevertheless, certain non-Perl libraries
7056 called from XS, such as C<Gtk> do so, and this can't be changed.  When the
7057 locale is changed by XS code that didn't use
7058 L<C<Perl_setlocale>|perlapi/Perl_setlocale>, Perl needs to be told that the
7059 locale has changed.  Use this function to do so, before returning to Perl.
7060
7061 The return value is a boolean: TRUE if the global locale at the time of call
7062 was in effect; and FALSE if a per-thread locale was in effect.  This can be
7063 used by the caller that needs to restore things as-they-were to decide whether
7064 or not to call
7065 L<C<Perl_switch_to_global_locale>|perlapi/switch_to_global_locale>.
7066
7067 =cut
7068 */
7069
7070 bool
7071 Perl_sync_locale()
7072 {
7073
7074 #ifndef USE_LOCALE
7075
7076     return TRUE;
7077
7078 #else
7079
7080     const char * newlocale;
7081     dTHX;
7082
7083 #  ifdef USE_POSIX_2008_LOCALE
7084
7085     bool was_in_global_locale = FALSE;
7086     locale_t cur_obj = uselocale((locale_t) 0);
7087
7088     /* On Windows, unless the foreign code has turned off the thread-safe
7089      * locale setting, any plain setlocale() will have affected what we see, so
7090      * no need to worry.  Otherwise, If the foreign code has done a plain
7091      * setlocale(), it will only affect the global locale on POSIX systems, but
7092      * will affect the */
7093     if (cur_obj == LC_GLOBAL_LOCALE) {
7094
7095 #    ifdef HAS_QUERY_LOCALE
7096
7097         void_setlocale_c(LC_ALL, querylocale_c(LC_ALL));
7098
7099 #    else
7100
7101         unsigned int i;
7102
7103         /* We can't trust that we can read the LC_ALL format on the
7104          * platform, so do them individually */
7105         for (i = 0; i < LC_ALL_INDEX_; i++) {
7106             void_setlocale_i(i, querylocale_i(i));
7107         }
7108
7109 #    endif
7110
7111         was_in_global_locale = TRUE;
7112     }
7113
7114 #  else
7115
7116     bool was_in_global_locale = TRUE;
7117
7118 #  endif
7119 #  ifdef USE_LOCALE_CTYPE
7120
7121     newlocale = querylocale_c(LC_CTYPE);
7122     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
7123                   "%s\n", setlocale_debug_string_c(LC_CTYPE, NULL, newlocale)));
7124     new_ctype(newlocale);
7125
7126 #  endif /* USE_LOCALE_CTYPE */
7127 #  ifdef USE_LOCALE_COLLATE
7128
7129     newlocale = querylocale_c(LC_COLLATE);
7130     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
7131                 "%s\n", setlocale_debug_string_c(LC_COLLATE, NULL, newlocale)));
7132     new_collate(newlocale);
7133
7134 #  endif
7135 #  ifdef USE_LOCALE_NUMERIC
7136
7137     newlocale = querylocale_c(LC_NUMERIC);
7138     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
7139                 "%s\n", setlocale_debug_string_c(LC_NUMERIC, NULL, newlocale)));
7140     new_numeric(newlocale);
7141
7142 #  endif /* USE_LOCALE_NUMERIC */
7143
7144     return was_in_global_locale;
7145
7146 #endif
7147
7148 }
7149
7150 #if defined(DEBUGGING) && defined(USE_LOCALE)
7151
7152 STATIC char *
7153 S_setlocale_debug_string_i(const unsigned cat_index,
7154                            const char* const locale, /* Optional locale name */
7155
7156                             /* return value from setlocale() when attempting to
7157                              * set 'category' to 'locale' */
7158                             const char* const retval)
7159 {
7160     /* Returns a pointer to a NUL-terminated string in static storage with
7161      * added text about the info passed in.  This is not thread safe and will
7162      * be overwritten by the next call, so this should be used just to
7163      * formulate a string to immediately print or savepv() on. */
7164
7165     static char ret[1024];
7166     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
7167
7168     my_strlcpy(ret, "setlocale(", sizeof(ret));
7169     my_strlcat(ret, category_names[cat_index], sizeof(ret));
7170     my_strlcat(ret, ", ", sizeof(ret));
7171
7172     if (locale) {
7173         my_strlcat(ret, "\"", sizeof(ret));
7174         my_strlcat(ret, locale, sizeof(ret));
7175         my_strlcat(ret, "\"", sizeof(ret));
7176     }
7177     else {
7178         my_strlcat(ret, "NULL", sizeof(ret));
7179     }
7180
7181     my_strlcat(ret, ") returned ", sizeof(ret));
7182
7183     if (retval) {
7184         my_strlcat(ret, "\"", sizeof(ret));
7185         my_strlcat(ret, retval, sizeof(ret));
7186         my_strlcat(ret, "\"", sizeof(ret));
7187     }
7188     else {
7189         my_strlcat(ret, "NULL", sizeof(ret));
7190     }
7191
7192     assert(strlen(ret) < sizeof(ret));
7193
7194     return ret;
7195 }
7196
7197 #endif
7198
7199 void
7200 Perl_thread_locale_init()
7201 {
7202     /* Called from a thread on startup*/
7203
7204 #ifdef USE_THREAD_SAFE_LOCALE
7205
7206     dTHX_DEBUGGING;
7207
7208
7209      DEBUG_L(PerlIO_printf(Perl_debug_log,
7210             "new thread, initial locale is %s; calling setlocale\n",
7211             setlocale(LC_ALL, NULL)));
7212
7213 #  ifdef WIN32
7214
7215     /* On Windows, make sure new thread has per-thread locales enabled */
7216     _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
7217
7218 #  else
7219
7220     /* This thread starts off in the C locale */
7221     Perl_setlocale(LC_ALL, "C");
7222
7223 #  endif
7224 #endif
7225
7226 }
7227
7228 void
7229 Perl_thread_locale_term()
7230 {
7231     /* Called from a thread as it gets ready to terminate */
7232
7233 #ifdef USE_POSIX_2008_LOCALE
7234
7235     /* C starts the new thread in the global C locale.  If we are thread-safe,
7236      * we want to not be in the global locale */
7237
7238     {   /* Free up */
7239         locale_t cur_obj = uselocale(LC_GLOBAL_LOCALE);
7240         if (cur_obj != LC_GLOBAL_LOCALE && cur_obj != PL_C_locale_obj) {
7241             freelocale(cur_obj);
7242         }
7243     }
7244
7245 #endif
7246
7247 }
7248
7249 /*
7250  * ex: set ts=8 sts=4 sw=4 et:
7251  */