locale.c

   1 /*    locale.c
   2  *
   3  *    Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
   4  *    2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  *      A Elbereth Gilthoniel,
  13  *      silivren penna míriel
  14  *      o menel aglar elenath!
  15  *      Na-chaered palan-díriel
  16  *      o galadhremmin ennorath,
  17  *      Fanuilos, le linnathon
  18  *      nef aear, si nef aearon!
  19  *
  20  *     [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
  21  */
  22
  23 /* utility functions for handling locale-specific stuff like what
  24  * character represents the decimal point.
  25  *
  26  * All C programs have an underlying locale.  Perl code generally doesn't pay
  27  * any attention to it except within the scope of a 'use locale'.  For most
  28  * categories, it accomplishes this by just using different operations if it is
  29  * in such scope than if not.  However, various libc functions called by Perl
  30  * are affected by the LC_NUMERIC category, so there are macros in perl.h that
  31  * are used to toggle between the current locale and the C locale depending on
  32  * the desired behavior of those functions at the moment.  And, LC_MESSAGES is
  33  * switched to the C locale for outputting the message unless within the scope
  34  * of 'use locale'.
  35  *
  36  * This code now has multi-thread-safe locale handling on systems that support
  37  * that.  This is completely transparent to most XS code.  On earlier systems,
  38  * it would be possible to emulate thread-safe locales, but this likely would
  39  * involve a lot of locale switching, and would require XS code changes.
  40  * Macros could be written so that the code wouldn't have to know which type of
  41  * system is being used.
  42  *
  43  * Table-driven code is used for simplicity and clarity, as many operations
  44  * differ only in which category is being worked on.  However the system
  45  * categories need not be small contiguous integers, so do not lend themselves
  46  * to table lookup.  Instead we have created our own equivalent values which
  47  * are all small contiguous non-negative integers, and translation functions
  48  * between the two sets.  For category 'LC_foo', the name of our index is
  49  * LC_foo_INDEX_.  Various parallel tables, indexed by these, are used.
  50  *
  51  * Many of the macros and functions in this file have one of the suffixes '_c',
  52  * '_r', or '_i'.  khw found these useful in remembering what type of locale
  53  * category to use as their parameter.  '_r' takes an int category number as
  54  * passed to setlocale(), like LC_ALL, LC_CTYPE, etc.  The 'r' indicates that
  55  * the value isn't known until runtime.  '_c' also indicates such a category
  56  * number, but its value is known at compile time.  These are both converted
  57  * into unsigned indexes into various tables of category information, where the
  58  * real work is generally done.  The tables are generated at compile-time based
  59  * on platform characteristics and Configure options.  They hide from the code
  60  * many of the vagaries of the different locale implementations out there.  You
  61  * may have already guessed that '_i' indicates the parameter is such an
  62  * unsigned index.  Converting from '_r' to '_i' requires run-time lookup.
  63  * '_c' is used to get cpp to do this at compile time.  To avoid the runtime
  64  * expense, the code is structured to use '_r' at the API level, and once
  65  * converted, everything possible is done using the table indexes.
  66  *
  67  * On unthreaded perls, most operations expand out to just the basic
  68  * setlocale() calls.  The same is true on threaded perls on modern Windows
  69  * systems where the same API, after set up, is used for thread-safe locale
  70  * handling.  On other systems, there is a completely different API, specified
  71  * in POSIX 2008, to do thread-safe locales.  On these systems, our
  72  * emulate_setlocale_i() function is used to hide the different API from the
  73  * outside.  This makes it completely transparent to most XS code.
  74  *
  75  * A huge complicating factor is that the LC_NUMERIC category is normally held
  76  * in the C locale, except during those relatively rare times when it needs to
  77  * be in the underlying locale.  There is a bunch of code to accomplish this,
  78  * and to allow easy switches from one state to the other.
  79  *
  80  * z/OS (os390) is an outlier.  Locales really don't work under threads when
  81  * either the radix character isn't a dot, or attempts are made to change
  82  * locales after the first thread is created.  The reason is that IBM has made
  83  * it thread-safe by refusing to change locales (returning failure if
  84  * attempted) any time after an application has called pthread_create() to
  85  * create another thread.  The expectation is that an application will set up
  86  * its locale information before the first fork, and be stable thereafter.  But
  87  * perl toggles LC_NUMERIC if the locale's radix character isn't a dot, as do
  88  * the other toggles, which are less common.
  89  */
  90
  91 /* If the environment says to, we can output debugging information during
  92  * initialization.  This is done before option parsing, and before any thread
  93  * creation, so can be a file-level static.  (Must come before #including
  94  * perl.h) */
  95 #ifdef DEBUGGING
  96 static int debug_initialization = 0;
  97 #  define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
  98 #  define DEBUG_LOCALE_INITIALIZATION_  debug_initialization
  99 #else
 100 #  define debug_initialization 0
 101 #  define DEBUG_INITIALIZATION_set(v)
 102 #endif
 103
 104 #define DEBUG_PRE_STMTS   dSAVE_ERRNO;                                        \
 105     PerlIO_printf(Perl_debug_log, "%s: %" LINE_Tf ": ", __FILE__, __LINE__);
 106 #define DEBUG_POST_STMTS  RESTORE_ERRNO;
 107
 108 #include "EXTERN.h"
 109 #define PERL_IN_LOCALE_C
 110 #include "perl_langinfo.h"
 111 #include "perl.h"
 112
 113 #include "reentr.h"
 114
 115 #ifdef I_WCHAR
 116 #  include <wchar.h>
 117 #endif
 118 #ifdef I_WCTYPE
 119 #  include <wctype.h>
 120 #endif
 121
 122
 123 /* Returns the Unix errno portion; ignoring any others.  This is a macro here
 124  * instead of putting it into perl.h, because unclear to khw what should be
 125  * done generally. */
 126 #define GET_ERRNO   saved_errno
 127
 128 /* Is the C string input 'name' "C" or "POSIX"?  If so, and 'name' is the
 129  * return of setlocale(), then this is extremely likely to be the C or POSIX
 130  * locale.  However, the output of setlocale() is documented to be opaque, but
 131  * the odds are extremely small that it would return these two strings for some
 132  * other locale.  Note that VMS in these two locales includes many non-ASCII
 133  * characters as controls and punctuation (below are hex bytes):
 134  *   cntrl:  84-97 9B-9F
 135  *   punct:  A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
 136  * Oddly, none there are listed as alphas, though some represent alphabetics
 137  * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
 138 #define isNAME_C_OR_POSIX(name)                                              \
 139                              (   (name) != NULL                              \
 140                               && (( *(name) == 'C' && (*(name + 1)) == '\0') \
 141                                    || strEQ((name), "POSIX")))
 142
 143 #ifdef USE_LOCALE
 144
 145 /* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far
 146  * looked up.  This is in the form of a C string:  */
 147
 148 #  define UTF8NESS_SEP     "\v"
 149 #  define UTF8NESS_PREFIX  "\f"
 150
 151 /* So, the string looks like:
 152  *
 153  *      \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0
 154  *
 155  * where the digit 0 after the \a indicates that the locale starting just
 156  * after the preceding \v is not UTF-8, and the digit 1 mean it is. */
 157
 158 STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1);
 159 STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1);
 160
 161 #  define C_and_POSIX_utf8ness    UTF8NESS_SEP "C"     UTF8NESS_PREFIX "0"    \
 162                                 UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0"
 163
 164 /* The cache is initialized to C_and_POSIX_utf8ness at start up.  These are
 165  * kept there always.  The remining portion of the cache is LRU, with the
 166  * oldest looked-up locale at the tail end */
 167
 168 #  ifdef DEBUGGING
 169 #    define setlocale_debug_string_c(category, locale, result)              \
 170                 setlocale_debug_string_i(category##_INDEX_, locale, result)
 171 #    define setlocale_debug_string_r(category, locale, result)              \
 172              setlocale_debug_string_i(get_category_index(category, locale), \
 173                                       locale, result)
 174 #  endif
 175
 176 /* Two parallel arrays indexed by our mapping of category numbers into small
 177  * non-negative indexes; first the locale categories Perl uses on this system,
 178  * used to do the inverse mapping.  The second array is their names.  These
 179  * arrays are in mostly arbitrary order. */
 180
 181 STATIC const int categories[] = {
 182
 183 #    ifdef USE_LOCALE_CTYPE
 184                              LC_CTYPE,
 185 #    endif
 186 #  ifdef USE_LOCALE_NUMERIC
 187                              LC_NUMERIC,
 188 #  endif
 189 #    ifdef USE_LOCALE_COLLATE
 190                              LC_COLLATE,
 191 #    endif
 192 #    ifdef USE_LOCALE_TIME
 193                              LC_TIME,
 194 #    endif
 195 #    ifdef USE_LOCALE_MESSAGES
 196                              LC_MESSAGES,
 197 #    endif
 198 #    ifdef USE_LOCALE_MONETARY
 199                              LC_MONETARY,
 200 #    endif
 201 #    ifdef USE_LOCALE_ADDRESS
 202                              LC_ADDRESS,
 203 #    endif
 204 #    ifdef USE_LOCALE_IDENTIFICATION
 205                              LC_IDENTIFICATION,
 206 #    endif
 207 #    ifdef USE_LOCALE_MEASUREMENT
 208                              LC_MEASUREMENT,
 209 #    endif
 210 #    ifdef USE_LOCALE_PAPER
 211                              LC_PAPER,
 212 #    endif
 213 #    ifdef USE_LOCALE_TELEPHONE
 214                              LC_TELEPHONE,
 215 #    endif
 216 #    ifdef USE_LOCALE_SYNTAX
 217                              LC_SYNTAX,
 218 #    endif
 219 #    ifdef USE_LOCALE_TOD
 220                              LC_TOD,
 221 #    endif
 222 #    ifdef LC_ALL
 223                              LC_ALL,
 224 #    endif
 225
 226    /* Placeholder as a precaution if code fails to check the return of
 227     * get_category_index(), which returns this element to indicate an error */
 228                             -1
 229 };
 230
 231 /* The top-most real element is LC_ALL */
 232
 233 STATIC const char * const category_names[] = {
 234
 235 #    ifdef USE_LOCALE_CTYPE
 236                                  "LC_CTYPE",
 237 #    endif
 238 #  ifdef USE_LOCALE_NUMERIC
 239                                  "LC_NUMERIC",
 240 #  endif
 241 #    ifdef USE_LOCALE_COLLATE
 242                                  "LC_COLLATE",
 243 #    endif
 244 #    ifdef USE_LOCALE_TIME
 245                                  "LC_TIME",
 246 #    endif
 247 #    ifdef USE_LOCALE_MESSAGES
 248                                  "LC_MESSAGES",
 249 #    endif
 250 #    ifdef USE_LOCALE_MONETARY
 251                                  "LC_MONETARY",
 252 #    endif
 253 #    ifdef USE_LOCALE_ADDRESS
 254                                  "LC_ADDRESS",
 255 #    endif
 256 #    ifdef USE_LOCALE_IDENTIFICATION
 257                                  "LC_IDENTIFICATION",
 258 #    endif
 259 #    ifdef USE_LOCALE_MEASUREMENT
 260                                  "LC_MEASUREMENT",
 261 #    endif
 262 #    ifdef USE_LOCALE_PAPER
 263                                  "LC_PAPER",
 264 #    endif
 265 #    ifdef USE_LOCALE_TELEPHONE
 266                                  "LC_TELEPHONE",
 267 #    endif
 268 #    ifdef USE_LOCALE_SYNTAX
 269                                  "LC_SYNTAX",
 270 #    endif
 271 #    ifdef USE_LOCALE_TOD
 272                                  "LC_TOD",
 273 #    endif
 274 #    ifdef LC_ALL
 275                                  "LC_ALL",
 276 #    endif
 277
 278    /* Placeholder as a precaution if code fails to check the return of
 279     * get_category_index(), which returns this element to indicate an error */
 280                                  NULL
 281 };
 282
 283 /* A few categories require additional setup when they are changed.  This table
 284  * points to the functions that do that setup */
 285 STATIC void (*update_functions[]) (pTHX_ const char *) = {
 286 #  ifdef USE_LOCALE_CTYPE
 287                                 S_new_ctype,
 288 #  endif
 289 #  ifdef USE_LOCALE_NUMERIC
 290                                 S_new_numeric,
 291 #  endif
 292 #  ifdef USE_LOCALE_COLLATE
 293                                 S_new_collate,
 294 #  endif
 295 #  ifdef USE_LOCALE_TIME
 296                                 NULL,
 297 #  endif
 298 #  ifdef USE_LOCALE_MESSAGES
 299                                 NULL,
 300 #  endif
 301 #  ifdef USE_LOCALE_MONETARY
 302                                 NULL,
 303 #  endif
 304 #  ifdef USE_LOCALE_ADDRESS
 305                                 NULL,
 306 #  endif
 307 #  ifdef USE_LOCALE_IDENTIFICATION
 308                                 NULL,
 309 #  endif
 310 #  ifdef USE_LOCALE_MEASUREMENT
 311                                 NULL,
 312 #  endif
 313 #  ifdef USE_LOCALE_PAPER
 314                                 NULL,
 315 #  endif
 316 #  ifdef USE_LOCALE_TELEPHONE
 317                                 NULL,
 318 #  endif
 319 #  ifdef USE_LOCALE_SYNTAX
 320                                 NULL,
 321 #  endif
 322 #  ifdef USE_LOCALE_TOD
 323                                 NULL,
 324 #  endif
 325     /* No harm done to have this even without an LC_ALL */
 326                                 S_new_LC_ALL,
 327
 328    /* Placeholder as a precaution if code fails to check the return of
 329     * get_category_index(), which returns this element to indicate an error */
 330                                 NULL
 331 };
 332
 333 #  ifdef LC_ALL
 334
 335     /* On systems with LC_ALL, it is kept in the highest index position.  (-2
 336      * to account for the final unused placeholder element.) */
 337 #    define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
 338 #  else
 339
 340     /* On systems without LC_ALL, we pretend it is there, one beyond the real
 341      * top element, hence in the unused placeholder element. */
 342 #    define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
 343 #  endif
 344
 345 /* Pretending there is an LC_ALL element just above allows us to avoid most
 346  * special cases.  Most loops through these arrays in the code below are
 347  * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'.  They will work
 348  * on either type of system.  But the code must be written to not access the
 349  * element at 'LC_ALL_INDEX_' except on platforms that have it.  This can be
 350  * checked for at compile time by using the #define LC_ALL_INDEX_ which is only
 351  * defined if we do have LC_ALL. */
 352
 353 STATIC unsigned int
 354 S_get_category_index(const int category, const char * locale)
 355 {
 356     /* Given a category, return the equivalent internal index we generally use
 357      * instead.
 358      *
 359      * 'locale' is for use in any generated diagnostics, and may be NULL
 360      *
 361      * Some sort of hash could be used instead of this loop, but the number of
 362      * elements is so far at most 12 */
 363
 364     unsigned int i;
 365     const char * conditional_warn_text = "; can't set it to ";
 366
 367     PERL_ARGS_ASSERT_GET_CATEGORY_INDEX;
 368
 369 #  ifdef LC_ALL
 370     for (i = 0; i <=         LC_ALL_INDEX_; i++)
 371 #  else
 372     for (i = 0; i <  NOMINAL_LC_ALL_INDEX;  i++)
 373 #  endif
 374     {
 375         if (category == categories[i]) {
 376             dTHX_DEBUGGING;
 377             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
 378                                    "index of category %d (%s) is %d\n",
 379                                    category, category_names[i], i));
 380             return i;
 381         }
 382     }
 383
 384     /* Here, we don't know about this category, so can't handle it. */
 385
 386     if (! locale) {
 387         locale = "";
 388         conditional_warn_text = "";
 389     }
 390
 391     /* diag_listed_as: Unknown locale category %d; can't set it to %s */
 392     Perl_warner_nocontext(packWARN(WARN_LOCALE),
 393                           "Unknown locale category %d%s%s",
 394                           category, conditional_warn_text, locale);
 395
 396 #  ifdef EINVAL
 397
 398     SETERRNO(EINVAL, LIB_INVARG);
 399
 400 #  endif
 401
 402     /* Return an out-of-bounds value */
 403     return NOMINAL_LC_ALL_INDEX + 1;
 404 }
 405
 406 STATIC const char *
 407 S_category_name(const int category)
 408 {
 409     unsigned int index;
 410
 411     index = get_category_index(category, NULL);
 412
 413     if (index <= NOMINAL_LC_ALL_INDEX) {
 414         return category_names[index];
 415     }
 416
 417     return Perl_form_nocontext("%d (unknown)", category);
 418 }
 419
 420 #endif /* ifdef USE_LOCALE */
 421
 422 void
 423 Perl_force_locale_unlock()
 424 {
 425
 426 #if defined(USE_LOCALE_THREADS)
 427
 428     dTHX;
 429 #  ifdef LOCALE_UNLOCK_
 430     LOCALE_UNLOCK_;
 431 #  endif
 432
 433 #endif
 434
 435 }
 436
 437 #ifdef USE_POSIX_2008_LOCALE
 438
 439 STATIC locale_t
 440 S_use_curlocale_scratch(pTHX)
 441 {
 442     /* This function is used to hide from the caller the case where the current
 443      * locale_t object in POSIX 2008 is the global one, which is illegal in
 444      * many of the P2008 API calls.  This checks for that and, if necessary
 445      * creates a proper P2008 object.  Any prior object is deleted, as is any
 446      * remaining object during global destruction. */
 447
 448     locale_t cur = uselocale((locale_t) 0);
 449
 450     if (cur != LC_GLOBAL_LOCALE) {
 451         return cur;
 452     }
 453
 454     if (PL_scratch_locale_obj) {
 455         freelocale(PL_scratch_locale_obj);
 456     }
 457
 458     PL_scratch_locale_obj = duplocale(LC_GLOBAL_LOCALE);
 459     return PL_scratch_locale_obj;
 460 }
 461
 462 #endif
 463
 464 void
 465 Perl_locale_panic(const char * msg,
 466                   const char * file_name,
 467                   const line_t line,
 468                   const int errnum)
 469 {
 470     dTHX;
 471
 472     PERL_ARGS_ASSERT_LOCALE_PANIC;
 473
 474     force_locale_unlock();
 475
 476 #ifdef USE_C_BACKTRACE
 477     dump_c_backtrace(Perl_debug_log, 20, 1);
 478 #endif
 479
 480     /* diag_listed_as: panic: %s */
 481     Perl_croak(aTHX_ "%s: %d: panic: %s; errno=%d\n",
 482                      file_name, line, msg, errnum);
 483 }
 484
 485 #define setlocale_failure_panic_c(                                          \
 486                         cat, current, failed, caller_0_line, caller_1_line) \
 487         setlocale_failure_panic_i(cat##_INDEX_, current, failed,            \
 488                         caller_0_line, caller_1_line)
 489
 490 /* porcelain_setlocale() presents a consistent POSIX-compliant interface to
 491  * setlocale().   Windows requres a customized base-level setlocale() */
 492 #ifdef WIN32
 493 #  define porcelain_setlocale(cat, locale) win32_setlocale(cat, locale)
 494 #else
 495 #  define porcelain_setlocale(cat, locale)                              \
 496                                 ((const char *) setlocale(cat, locale))
 497 #endif
 498
 499 /* The next layer up is to catch vagaries and bugs in the libc setlocale return
 500  * value */
 501 #ifdef stdize_locale
 502 #  define stdized_setlocale(cat, locale)                                       \
 503      stdize_locale(cat, porcelain_setlocale(cat, locale),                      \
 504                    &PL_stdize_locale_buf, &PL_stdize_locale_bufsize, __LINE__)
 505 #else
 506 #  define stdized_setlocale(cat, locale)  porcelain_setlocale(cat, locale)
 507 #endif
 508
 509 /* The next many lines form a layer above the close-to-the-metal 'porcelain'
 510  * and 'stdized' macros.  They are used to present a uniform API to the rest of
 511  * the code in this file in spite of the disparate underlying implementations.
 512  * */
 513
 514 #ifndef USE_POSIX_2008_LOCALE
 515
 516 /* For non-threaded perls (which we are not to use the POSIX 2008 API on), or a
 517  * thread-safe Windows one in which threading is invisible to us, the added
 518  * layer just calls the base-level functions.  See the introductory comments in
 519  * this file for the meaning of the suffixes '_c', '_r', '_i'. */
 520
 521 #  define setlocale_r(cat, locale)        stdized_setlocale(cat, locale)
 522 #  define setlocale_i(i, locale)      setlocale_r(categories[i], locale)
 523 #  define setlocale_c(cat, locale)              setlocale_r(cat, locale)
 524
 525 #  define void_setlocale_i(i, locale)                                       \
 526     STMT_START {                                                            \
 527         if (! porcelain_setlocale(categories[i], locale)) {                 \
 528             setlocale_failure_panic_i(i, NULL, locale, __LINE__, 0);        \
 529             NOT_REACHED; /* NOTREACHED */                                   \
 530         }                                                                   \
 531     } STMT_END
 532 #  define void_setlocale_c(cat, locale)                                     \
 533                                   void_setlocale_i(cat##_INDEX_, locale)
 534 #  define void_setlocale_r(cat, locale)                                     \
 535                void_setlocale_i(get_category_index(cat, locale), locale)
 536
 537 #  define bool_setlocale_r(cat, locale)                                     \
 538                                   cBOOL(porcelain_setlocale(cat, locale))
 539 #  define bool_setlocale_i(i, locale)                                       \
 540                                  bool_setlocale_c(categories[i], locale)
 541 #  define bool_setlocale_c(cat, locale)    bool_setlocale_r(cat, locale)
 542
 543 #  define querylocale_r(cat)        setlocale_r(cat, NULL)
 544 #  define querylocale_c(cat)        querylocale_r(cat)
 545 #  define querylocale_i(i)          querylocale_c(categories[i])
 546
 547 #else   /* Below is defined(POSIX 2008) */
 548
 549 /* Here, there is a completely different API to get thread-safe locales.  We
 550  * emulate the setlocale() API with our own function(s).  setlocale categories,
 551  * like LC_NUMERIC, are not valid here for the POSIX 2008 API.  Instead, there
 552  * are equivalents, like LC_NUMERIC_MASK, which we use instead, converting to
 553  * by using get_category_index() followed by table lookup. */
 554
 555 #  define emulate_setlocale_c(cat, locale, recalc_LC_ALL, line)             \
 556            emulate_setlocale_i(cat##_INDEX_, locale, recalc_LC_ALL, line)
 557
 558      /* A wrapper for the macros below. */
 559 #  define common_emulate_setlocale(i, locale)                               \
 560                  emulate_setlocale_i(i, locale, YES_RECALC_LC_ALL, __LINE__)
 561
 562 #  define setlocale_i(i, locale)     common_emulate_setlocale(i, locale)
 563 #  define setlocale_c(cat, locale)     setlocale_i(cat##_INDEX_, locale)
 564 #  define setlocale_r(cat, locale)                                          \
 565                     setlocale_i(get_category_index(cat, locale), locale)
 566
 567 #  define void_setlocale_i(i, locale)     ((void) setlocale_i(i, locale))
 568 #  define void_setlocale_c(cat, locale)                                     \
 569                                   void_setlocale_i(cat##_INDEX_, locale)
 570 #  define void_setlocale_r(cat, locale) ((void) setlocale_r(cat, locale))
 571
 572 #  define bool_setlocale_i(i, locale)       cBOOL(setlocale_i(i, locale))
 573 #  define bool_setlocale_c(cat, locale)                                     \
 574                                   bool_setlocale_i(cat##_INDEX_, locale)
 575 #  define bool_setlocale_r(cat, locale)   cBOOL(setlocale_r(cat, locale))
 576
 577 #  define querylocale_i(i)      my_querylocale_i(i)
 578 #  define querylocale_c(cat)    querylocale_i(cat##_INDEX_)
 579 #  define querylocale_r(cat)    querylocale_i(get_category_index(cat,NULL))
 580
 581 #  ifndef USE_QUERYLOCALE
 582 #    define USE_PL_CURLOCALES
 583 #  else
 584 #    define isSINGLE_BIT_SET(mask) isPOWER_OF_2(mask)
 585
 586      /* This code used to think querylocale() was valid on LC_ALL.  Make sure
 587       * all instances of that have been removed */
 588 #    define QUERYLOCALE_ASSERT(index)                                       \
 589                         __ASSERT_(isSINGLE_BIT_SET(category_masks[index]))
 590 #    if ! defined(HAS_QUERYLOCALE) && defined(_NL_LOCALE_NAME)
 591 #      define querylocale_l(index, locale_obj)                              \
 592             (QUERYLOCALE_ASSERT(index)                                      \
 593              nl_langinfo_l(_NL_LOCALE_NAME(categories[index]), locale_obj))
 594 #    else
 595 #      define querylocale_l(index, locale_obj)                              \
 596                            (QUERYLOCALE_ASSERT(index)                       \
 597                             querylocale(category_masks[index], locale_obj))
 598 #    endif
 599 #  endif
 600 #  if defined(__GLIBC__) && defined(USE_LOCALE_MESSAGES)
 601 #    define HAS_GLIBC_LC_MESSAGES_BUG
 602 #    include <libintl.h>
 603 #  endif
 604
 605 /* A fourth array, parallel to the ones above to map from category to its
 606  * equivalent mask */
 607 STATIC const int category_masks[] = {
 608 #  ifdef USE_LOCALE_CTYPE
 609                                 LC_CTYPE_MASK,
 610 #  endif
 611 #  ifdef USE_LOCALE_NUMERIC
 612                                 LC_NUMERIC_MASK,
 613 #  endif
 614 #  ifdef USE_LOCALE_COLLATE
 615                                 LC_COLLATE_MASK,
 616 #  endif
 617 #  ifdef USE_LOCALE_TIME
 618                                 LC_TIME_MASK,
 619 #  endif
 620 #  ifdef USE_LOCALE_MESSAGES
 621                                 LC_MESSAGES_MASK,
 622 #  endif
 623 #  ifdef USE_LOCALE_MONETARY
 624                                 LC_MONETARY_MASK,
 625 #  endif
 626 #  ifdef USE_LOCALE_ADDRESS
 627                                 LC_ADDRESS_MASK,
 628 #  endif
 629 #  ifdef USE_LOCALE_IDENTIFICATION
 630                                 LC_IDENTIFICATION_MASK,
 631 #  endif
 632 #  ifdef USE_LOCALE_MEASUREMENT
 633                                 LC_MEASUREMENT_MASK,
 634 #  endif
 635 #  ifdef USE_LOCALE_PAPER
 636                                 LC_PAPER_MASK,
 637 #  endif
 638 #  ifdef USE_LOCALE_TELEPHONE
 639                                 LC_TELEPHONE_MASK,
 640 #  endif
 641 #  ifdef USE_LOCALE_SYNTAX
 642                                 LC_SYNTAX_MASK,
 643 #  endif
 644 #  ifdef USE_LOCALE_TOD
 645                                 LC_TOD_MASK,
 646 #  endif
 647                                 /* LC_ALL can't be turned off by a Configure
 648                                  * option, and in Posix 2008, should always be
 649                                  * here, so compile it in unconditionally.
 650                                  * This could catch some glitches at compile
 651                                  * time */
 652                                 LC_ALL_MASK,
 653
 654    /* Placeholder as a precaution if code fails to check the return of
 655     * get_category_index(), which returns this element to indicate an error */
 656                                 0
 657 };
 658
 659 #  define my_querylocale_c(cat) my_querylocale_i(cat##_INDEX_)
 660
 661 STATIC const char *
 662 S_my_querylocale_i(pTHX_ const unsigned int index)
 663 {
 664     /* This function returns the name of the locale category given by the input
 665      * index into our parallel tables of them.
 666      *
 667      * POSIX 2008, for some sick reason, chose not to provide a method to find
 668      * the category name of a locale, discarding a basic linguistic tenet that
 669      * for any object, people will create a name for it.  Some vendors have
 670      * created a querylocale() function to do just that.  This function is a
 671      * lot simpler to implement on systems that have this.  Otherwise, we have
 672      * to keep track of what the locale has been set to, so that we can return
 673      * its name so as to emulate setlocale().  It's also possible for C code in
 674      * some library to change the locale without us knowing it, though as of
 675      * September 2017, there are no occurrences in CPAN of uselocale().  Some
 676      * libraries do use setlocale(), but that changes the global locale, and
 677      * threads using per-thread locales will just ignore those changes. */
 678
 679     int category;
 680     const locale_t cur_obj = uselocale((locale_t) 0);
 681     const char * retval;
 682
 683     PERL_ARGS_ASSERT_MY_QUERYLOCALE_I;
 684     assert(index <= NOMINAL_LC_ALL_INDEX);
 685
 686     category = categories[index];
 687
 688     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "my_querylocale_i(%s) on %p\n",
 689                                            category_names[index], cur_obj));
 690         if (cur_obj == LC_GLOBAL_LOCALE) {
 691         retval = porcelain_setlocale(category, NULL);
 692         }
 693     else {
 694
 695 #  ifdef USE_QUERYLOCALE
 696
 697         /* We don't currently keep records when there is querylocale(), so have
 698          * to get it anew each time */
 699         retval = (index == LC_ALL_INDEX_)
 700                  ? calculate_LC_ALL(cur_obj)
 701                  : querylocale_l(index, cur_obj);
 702
 703 #  else
 704
 705         /* But we do have up-to-date values when we keep our own records */
 706         retval = PL_curlocales[index];
 707
 708 #  endif
 709
 710             }
 711
 712                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
 713                            "my_querylocale_i(%s) returning '%s'\n",
 714                            category_names[index], retval));
 715     return retval;
 716 }
 717
 718 #  ifdef USE_PL_CURLOCALES
 719
 720 STATIC const char *
 721 S_update_PL_curlocales_i(pTHX_
 722                          const unsigned int index,
 723                          const char * new_locale,
 724                          recalc_lc_all_t recalc_LC_ALL)
 725 {
 726     /* This is a helper function for emulate_setlocale_i(), mostly used to
 727      * make that function easier to read. */
 728
 729     PERL_ARGS_ASSERT_UPDATE_PL_CURLOCALES_I;
 730     assert(index <= NOMINAL_LC_ALL_INDEX);
 731
 732     if (index == LC_ALL_INDEX_) {
 733         unsigned int i;
 734
 735         /* For LC_ALL, we change all individual categories to correspond */
 736                          /* PL_curlocales is a parallel array, so has same
 737                           * length as 'categories' */
 738         for (i = 0; i < LC_ALL_INDEX_; i++) {
 739             Safefree(PL_curlocales[i]);
 740             PL_curlocales[i] = savepv(new_locale);
 741         }
 742
 743         recalc_LC_ALL = YES_RECALC_LC_ALL;
 744     }
 745     else {
 746
 747         /* Update the single category's record */
 748         Safefree(PL_curlocales[index]);
 749         PL_curlocales[index] = savepv(new_locale);
 750
 751         if (recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION) {
 752             recalc_LC_ALL = (index == NOMINAL_LC_ALL_INDEX - 1)
 753                             ? YES_RECALC_LC_ALL
 754                             : DONT_RECALC_LC_ALL;
 755         }
 756     }
 757
 758     if (recalc_LC_ALL == YES_RECALC_LC_ALL) {
 759         Safefree(PL_curlocales[LC_ALL_INDEX_]);
 760         PL_curlocales[LC_ALL_INDEX_] =
 761                                     savepv(calculate_LC_ALL(PL_curlocales));
 762     }
 763
 764     return PL_curlocales[index];
 765 }
 766
 767 #  endif  /* Need PL_curlocales[] */
 768
 769 STATIC const char *
 770 S_setlocale_from_aggregate_LC_ALL(pTHX_ const char * locale, const line_t line)
 771 {
 772     /* This function parses the value of the LC_ALL locale, assuming glibc
 773      * syntax, and sets each individual category on the system to the proper
 774      * value.
 775      *
 776      * This is likely to only ever be called from one place, so exists to make
 777      * the calling function easier to read by moving this ancillary code out of
 778      * the main line.
 779      *
 780      * The locale for each category is independent of the other categories.
 781      * Often, they are all the same, but certainly not always.  Perl, in fact,
 782      * usually keeps LC_NUMERIC in the C locale, regardless of the underlying
 783      * locale.  LC_ALL has to be able to represent the case of when there are
 784      * varying locales.  Platforms have differing ways of representing this.
 785      * Because of this, the code in this file goes to lengths to avoid the
 786      * issue, generally looping over the component categories instead of
 787      * referring to them in the aggregate, wherever possible.  However, there
 788      * are cases where we have to parse our own constructed aggregates, which use
 789      * the glibc syntax. */
 790
 791     const char * locale_on_entry = savepv(querylocale_c(LC_ALL));
 792
 793     PERL_ARGS_ASSERT_SETLOCALE_FROM_AGGREGATE_LC_ALL;
 794
 795     /* If the string that gives what to set doesn't include all categories,
 796      * the omitted ones get set to "C".  To get this behavior, first set
 797      * all the individual categories to "C", and override the furnished
 798      * ones below.  FALSE => No need to recalculate LC_ALL, as this is a
 799      * temporary state */
 800     if (! emulate_setlocale_c(LC_ALL, "C", DONT_RECALC_LC_ALL, line)) {
 801         setlocale_failure_panic_c(LC_ALL, locale_on_entry,
 802                                   "C", __LINE__, line);
 803         NOT_REACHED; /* NOTREACHED */
 804     }
 805
 806     const char * s = locale;
 807     const char * e = locale + strlen(locale);
 808     while (s < e) {
 809         const char * p = s;
 810
 811         /* Parse through the category */
 812         while (isWORDCHAR(*p)) {
 813             p++;
 814         }
 815
 816         const char * category_end = p;
 817
 818         if (*p++ != '=') {
 819             locale_panic_(Perl_form(aTHX_
 820                   "Unexpected character in locale category name '%02X", *(p-1)));
 821         }
 822
 823         /* Parse through the locale name */
 824         const char * name_start = p;
 825         while (p < e && *p != ';') {
 826             if (! isGRAPH(*p)) {
 827                 locale_panic_(Perl_form(aTHX_
 828                               "Unexpected character in locale name '%02X", *p));
 829             }
 830             p++;
 831         }
 832
 833         const char * name_end = p;
 834
 835         /* Space past the semi-colon */
 836         if (p < e) {
 837             p++;
 838         }
 839
 840         /* Find the index of the category name in our lists */
 841         for (PERL_UINT_FAST8_T i = 0; i < LC_ALL_INDEX_; i++) {
 842
 843             /* Keep going if this index doesn't point to the category being
 844              * parsed.  The strnNE() avoids a Perl_form(), but would fail if
 845              * ever a category name could be a substring of another one, e.g.,
 846              * if there were a "LC_TIME_DATE" */
 847             if strnNE(s, category_names[i], category_end - s) {
 848                 continue;
 849             }
 850
 851             /* Here i points to the category being parsed.  Now isolate the
 852              * locale it is being changed to */
 853             const char * individ_locale = Perl_form(aTHX_ "%.*s",
 854                                 (int) (name_end - name_start), name_start);
 855
 856             /* And do the change.  FALSE => Don't recalculate LC_ALL; we'll do
 857              * it ourselves after the loop */
 858             if (! emulate_setlocale_i(i, individ_locale,
 859                                       DONT_RECALC_LC_ALL, line))
 860             {
 861
 862                 /* But if we have to back out, do fix up LC_ALL */
 863                 if (! emulate_setlocale_c(LC_ALL, locale_on_entry,
 864                                           YES_RECALC_LC_ALL, line))
 865                 {
 866                     Safefree(locale_on_entry);
 867                     setlocale_failure_panic_i(i, individ_locale,
 868                                               locale, __LINE__, line);
 869                     NOT_REACHED; /* NOTREACHED */
 870                 }
 871
 872                 Safefree(locale_on_entry);
 873
 874                 /* Reverting to the entry value succeeded, but the operation
 875                  * failed to go to the requested locale. */
 876                 return NULL;
 877             }
 878
 879             /* Found and handled the desired category.  Quit the inner loop to
 880              * try the next category */
 881             break;
 882         }
 883
 884         /* Finished with this category; iterate to the next one in the input */
 885         s = p;
 886     }
 887
 888 #    ifdef USE_PL_CURLOCALES
 889
 890     /* Here we have set all the individual categories.  Update the LC_ALL entry
 891      * as well.  We can't just use the input 'locale' as the value may omit
 892      * categories whose locale is 'C'.  khw thinks it's better to store a
 893      * complete LC_ALL.  So calculate it. */
 894     const char * retval = savepv(calculate_LC_ALL(PL_curlocales));
 895     Safefree(PL_curlocales[LC_ALL_INDEX_]);
 896     PL_curlocales[LC_ALL_INDEX_] = retval;
 897
 898 #    else
 899
 900     const char * retval = querylocale_c(LC_ALL);
 901
 902 #    endif
 903
 904     Safefree(locale_on_entry);
 905     return retval;
 906 }
 907
 908 #  ifndef USE_QUERYLOCALE
 909
 910 STATIC const char *
 911 S_find_locale_from_environment(pTHX_ const unsigned int index)
 912 {
 913     /* On systems without querylocale(), it is problematic getting the results
 914      * of the POSIX 2008 equivalent of setlocale(category, "") (which gets the
 915      * locale from the environment).
 916      *
 917      * To ensure that we know exactly what those values are, we do the setting
 918      * ourselves, using the documented algorithm (assuming the documentation is
 919      * correct) rather than use "" as the locale.  This will lead to results
 920      * that differ from native behavior if the native behavior differs from the
 921      * standard documented value, but khw believes it is better to know what's
 922      * going on, even if different from native, than to just guess.
 923      *
 924      * Another option would be, in a critical section, to save the global
 925      * locale's current value, and do a straight setlocale(LC_ALL, "").  That
 926      * would return our desired values, destroying the global locale's, which
 927      * we would then restore.  But that could cause races with any other thread
 928      * that is using the global locale and isn't using the mutex.  And, the
 929      * only reason someone would have done that is because they are calling a
 930      * library function, like in gtk, that calls setlocale(), and which can't
 931      * be changed to use the mutex.  That wouldn't be a problem if this were to
 932      * be done before any threads had switched, say during perl construction
 933      * time.  But this code would still be needed for the general case. */
 934
 935     const char * default_name;
 936     unsigned int i;
 937     const char * locale_names[LC_ALL_INDEX_];
 938
 939     /* We rely on PerlEnv_getenv() returning a mortalized copy */
 940     const char * const lc_all = PerlEnv_getenv("LC_ALL");
 941
 942     /* Use any "LC_ALL" environment variable, as it overrides everything
 943      * else. */
 944     if (lc_all && strNE(lc_all, "")) {
 945         return lc_all;
 946     }
 947
 948     /* Otherwise, we need to dig deeper.  Unless overridden, the default is
 949      * the LANG environment variable; "C" if it doesn't exist. */
 950     default_name = PerlEnv_getenv("LANG");
 951     if (! default_name || strEQ(default_name, "")) {
 952         default_name = "C";
 953     }
 954
 955     /* If setting an individual category, use its corresponding value found in
 956      * the environment, if any; otherwise use the default we already
 957      * calculated. */
 958     if (index != LC_ALL_INDEX_) {
 959         const char * const new_value = PerlEnv_getenv(category_names[index]);
 960
 961         return (new_value && strNE(new_value, ""))
 962                 ? new_value
 963                 : default_name;
 964     }
 965
 966     /* Here, we are getting LC_ALL.  Any categories that don't have a
 967      * corresponding environment variable set should be set to 'default_name'
 968      *
 969      * Simply find the values for all categories, and call the function to
 970      * compute LC_ALL. */
 971     for (i = 0; i < LC_ALL_INDEX_; i++) {
 972         const char * const env_override = PerlEnv_getenv(category_names[i]);
 973
 974         locale_names[i] = (env_override && strNE(env_override, ""))
 975                           ? env_override
 976                           : default_name;
 977
 978         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
 979                  "find_locale_from_environment i=%d, name=%s, locale=%s\n",
 980                  i, category_names[i], locale_names[i]));
 981     }
 982
 983     return calculate_LC_ALL(locale_names);
 984 }
 985
 986 #  endif
 987
 988 STATIC const char *
 989 S_emulate_setlocale_i(pTHX_
 990
 991         /* Our internal index of the 'category' setlocale is
 992            called with */
 993         const unsigned int index,
 994
 995         const char * new_locale, /* The locale to set the category to */
 996         const recalc_lc_all_t recalc_LC_ALL,  /* Explained below */
 997         const line_t line     /* Called from this line number */
 998        )
 999 {
1000     PERL_ARGS_ASSERT_EMULATE_SETLOCALE_I;
1001     assert(index <= NOMINAL_LC_ALL_INDEX);
1002
1003     /* This function effectively performs a setlocale() on just the current
1004      * thread; thus it is thread-safe.  It does this by using the POSIX 2008
1005      * locale functions to emulate the behavior of setlocale().  Similar to
1006      * regular setlocale(), the return from this function points to memory that
1007      * can be overwritten by other system calls, so needs to be copied
1008      * immediately if you need to retain it.  The difference here is that
1009      * system calls besides another setlocale() can overwrite it.
1010      *
1011      * By doing this, most locale-sensitive functions become thread-safe.  The
1012      * exceptions are mostly those that return a pointer to static memory.
1013      *
1014      * This function may be called in a tight loop that iterates over all
1015      * categories.  Because LC_ALL is not a "real" category, but merely the sum
1016      * of all the other ones, such loops don't include LC_ALL.  On systems that
1017      * have querylocale() or similar, the current LC_ALL value is immediately
1018      * retrievable; on systems lacking that feature, we have to keep track of
1019      * LC_ALL ourselves.  We could do that on each iteration, only to throw it
1020      * away on the next, but the calculation is more than a trivial amount of
1021      * work.  Instead, the 'recalc_LC_ALL' parameter is set to
1022      * RECALCULATE_LC_ALL_ON_FINAL_INTERATION to only do the calculation once.
1023      * This function calls itself recursively in such a loop.
1024      *
1025      * When not in such a loop, the parameter is set to the other enum values
1026      * DONT_RECALC_LC_ALL or YES_RECALC_LC_ALL. */
1027
1028     int mask = category_masks[index];
1029     const locale_t entry_obj = uselocale((locale_t) 0);
1030     const char * locale_on_entry = querylocale_i(index);
1031
1032     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1033              "emulate_setlocale_i input=%d (%s), mask=0x%x,"
1034              " new locale=\"%s\", current locale=\"%s\","
1035              "index=%d, object=%p\n",
1036              categories[index], category_name(categories[index]), mask,
1037              ((new_locale == NULL) ? "(nil)" : new_locale),
1038              locale_on_entry, index, entry_obj));
1039
1040     /* Return the already-calculated info if just querying what the existing
1041      * locale is */
1042     if (new_locale == NULL) {
1043         return locale_on_entry;
1044     }
1045
1046     /* Here, trying to change the locale, but it is a no-op if the new boss is
1047      * the same as the old boss.  Except this routine is called when converting
1048      * from the global locale, so in that case we will create a per-thread
1049      * locale below (with the current values).  Bitter experience also
1050      * indicates that newlocale() can free up the basis locale memory if we
1051      * call it with the new and old being the same. */
1052     if (   entry_obj != LC_GLOBAL_LOCALE
1053         && locale_on_entry
1054         && strEQ(new_locale, locale_on_entry))
1055     {
1056         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1057                  "(%" LINE_Tf "): emulate_setlocale_i"
1058                  " no-op to change to what it already was\n",
1059                  line));
1060
1061 #  ifdef USE_PL_CURLOCALES
1062
1063        /* On the final iteration of a loop that needs to recalculate LC_ALL, do
1064         * so.  If no iteration changed anything, LC_ALL also doesn't change,
1065         * but khw believes the complexity needed to keep track of that isn't
1066         * worth it. */
1067         if (UNLIKELY(   recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION
1068                      && index == NOMINAL_LC_ALL_INDEX - 1))
1069         {
1070             Safefree(PL_curlocales[LC_ALL_INDEX_]);
1071             PL_curlocales[LC_ALL_INDEX_] =
1072                                         savepv(calculate_LC_ALL(PL_curlocales));
1073         }
1074
1075 #  endif
1076
1077         return locale_on_entry;
1078     }
1079
1080 #  ifndef USE_QUERYLOCALE
1081
1082     /* Without a querylocale() mechanism, we have to figure out ourselves what
1083      * happens with setting a locale to "" */
1084     if (strEQ(new_locale, "")) {
1085         new_locale = find_locale_from_environment(index);
1086     }
1087
1088 #  endif
1089
1090     /* So far, it has worked that a semi-colon in the locale name means that
1091      * the category is LC_ALL and it subsumes categories which don't all have
1092      * the same locale.  This is the glibc syntax. */
1093     if (strchr(new_locale, ';')) {
1094         assert(index == LC_ALL_INDEX_);
1095         return setlocale_from_aggregate_LC_ALL(new_locale, line);
1096     }
1097
1098 #  ifdef HAS_GLIBC_LC_MESSAGES_BUG
1099
1100     /* For this bug, if the LC_MESSAGES locale changes, we have to do an
1101      * expensive workaround.  Save the current value so we can later determine
1102      * if it changed. */
1103     const char * old_messages_locale = NULL;
1104     if (   (index == LC_MESSAGES_INDEX_ || index == LC_ALL_INDEX_)
1105         &&  LIKELY(PL_phase != PERL_PHASE_CONSTRUCT))
1106     {
1107         old_messages_locale = savepv(querylocale_c(LC_MESSAGES));
1108     }
1109
1110 #  endif
1111
1112     assert(PL_C_locale_obj);
1113
1114     /* Now ready to switch to the input 'new_locale' */
1115
1116     /* Switching locales generally entails freeing the current one's space (at
1117      * the C library's discretion), hence we can't be using that locale at the
1118      * time of the switch (this wasn't obvious to khw from the man pages).  So
1119      * switch to a known locale object that we don't otherwise mess with. */
1120     if (! uselocale(PL_C_locale_obj)) {
1121
1122         /* Not being able to change to the C locale is severe; don't keep
1123          * going.  */
1124         setlocale_failure_panic_i(index, locale_on_entry, "C", __LINE__, line);
1125         NOT_REACHED; /* NOTREACHED */
1126     }
1127
1128     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1129              "(%" LINE_Tf "): emulate_setlocale_i now using C"
1130              " object=%p\n", line, PL_C_locale_obj));
1131
1132     locale_t new_obj;
1133
1134     /* We created a (never changing) object at start-up for LC_ALL being in the
1135      * C locale.  If this call is to switch to LC_ALL=>C, simply use that
1136      * object.  But in fact, we already have switched to it just above, in
1137      * preparation for the general case.  Since we're already there, no need to
1138      * do further switching. */
1139     if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(new_locale)) {
1140         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "(%" LINE_Tf "):"
1141                                                " emulate_setlocale_i will stay"
1142                                                " in C object\n", line));
1143         new_obj = PL_C_locale_obj;
1144
1145         /* And free the old object if it isn't a special one */
1146         if (entry_obj != LC_GLOBAL_LOCALE && entry_obj != PL_C_locale_obj) {
1147             freelocale(entry_obj);
1148         }
1149     }
1150     else {  /* Here is the general case, not to LC_ALL=>C */
1151         locale_t basis_obj = entry_obj;
1152
1153         /* Specially handle two objects */
1154         if (basis_obj == LC_GLOBAL_LOCALE || basis_obj == PL_C_locale_obj) {
1155
1156             /* For these two objects, we make duplicates to hand to newlocale()
1157              * below.  For LC_GLOBAL_LOCALE, this is because newlocale()
1158              * doesn't necessarily accept it as input (the results are
1159              * undefined).  For PL_C_locale_obj, it is so that it never gets
1160              * modified, as otherwise newlocale() is free to do so */
1161             basis_obj = duplocale(entry_obj);
1162             if (! basis_obj) {
1163                 locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): duplocale failed",
1164                                               line));
1165                 NOT_REACHED; /* NOTREACHED */
1166             }
1167
1168             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1169                                    "(%" LINE_Tf "): emulate_setlocale_i"
1170                                    " created %p by duping the input\n",
1171                                    line, basis_obj));
1172         }
1173
1174         /* Ready to create a new locale by modification of the exising one */
1175         new_obj = newlocale(mask, new_locale, basis_obj);
1176
1177         if (! new_obj) {
1178             DEBUG_L(PerlIO_printf(Perl_debug_log,
1179                                   " (%" LINE_Tf "): emulate_setlocale_i"
1180                                   " creating new object from %p failed:"
1181                                   " errno=%d\n",
1182                                   line, basis_obj, GET_ERRNO));
1183
1184             /* Failed.  Likely this is because the proposed new locale isn't
1185              * valid on this system.  But we earlier switched to the LC_ALL=>C
1186              * locale in anticipation of it succeeding,  Now have to switch
1187              * back to the state upon entry */
1188             if (! uselocale(entry_obj)) {
1189                 setlocale_failure_panic_i(index, "switching back to",
1190                                           locale_on_entry, __LINE__, line);
1191                 NOT_REACHED; /* NOTREACHED */
1192             }
1193
1194 #    ifdef USE_PL_CURLOCALES
1195
1196             if (entry_obj == LC_GLOBAL_LOCALE) {
1197                 /* Here, we are back in the global locale.  We may never have
1198                  * set PL_curlocales.  If the locale change had succeeded, the
1199                  * code would have then set them up, but since it didn't, do so
1200                  * here.  khw isn't sure if this prevents some issues or not,
1201                  * but tis is defensive coding.  The system setlocale() returns
1202                  * the desired information.  This will calculate LC_ALL's entry
1203                  * only on the final iteration */
1204                 for (PERL_UINT_FAST8_T i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1205                     update_PL_curlocales_i(i,
1206                                        porcelain_setlocale(categories[i], NULL),
1207                                        RECALCULATE_LC_ALL_ON_FINAL_INTERATION);
1208                 }
1209             }
1210 #    endif
1211
1212 #    ifdef HAS_GLIBC_LC_MESSAGES_BUG
1213             Safefree(old_messages_locale);
1214 #    endif
1215
1216             return NULL;
1217         }
1218
1219         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1220                                "(%" LINE_Tf "): emulate_setlocale_i created %p"
1221                                " while freeing %p\n", line, new_obj, basis_obj));
1222
1223         /* Here, successfully created an object representing the desired
1224          * locale; now switch into it */
1225         if (! uselocale(new_obj)) {
1226             freelocale(new_obj);
1227             locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): emulate_setlocale_i"
1228                                           " switching into new locale failed",
1229                                           line));
1230         }
1231     }
1232
1233     /* Here, we are using 'new_obj' which matches the input 'new_locale'. */
1234     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1235              "(%" LINE_Tf "): emulate_setlocale_i now using %p\n", line, new_obj));
1236
1237     /* We are done, except for updating our records (if the system doesn't keep
1238      * them) and in the case of locale "", we don't actually know what the
1239      * locale that got switched to is, as it came from the environment.  So
1240      * have to find it */
1241
1242 #  ifdef USE_QUERYLOCALE
1243
1244     if (strEQ(new_locale, "")) {
1245         new_locale = querylocale_i(index);
1246     }
1247
1248     PERL_UNUSED_ARG(recalc_LC_ALL);
1249
1250 #  else
1251
1252     new_locale = update_PL_curlocales_i(index, new_locale, recalc_LC_ALL);
1253
1254 #  endif
1255 #  ifdef HAS_GLIBC_LC_MESSAGES_BUG
1256
1257     /* Invalidate the glibc cache of loaded translations if the locale has changed,
1258      * see [perl #134264] */
1259     if (old_messages_locale) {
1260         if (strNE(old_messages_locale, my_querylocale_c(LC_MESSAGES))) {
1261             textdomain(textdomain(NULL));
1262         }
1263
1264         Safefree(old_messages_locale);
1265     }
1266
1267 #  endif
1268
1269     return new_locale;
1270 }
1271
1272 #endif   /* End of the various implementations of the setlocale and
1273             querylocale macros used in the remainder of this program */
1274
1275 #ifdef USE_LOCALE
1276
1277 /* So far, the locale strings returned by modern 2008-compliant systems have
1278  * been fine */
1279
1280 STATIC const char *
1281 S_stdize_locale(pTHX_ const int category,
1282                       const char *input_locale,
1283                       const char **buf,
1284                       Size_t *buf_size,
1285                       const line_t caller_line)
1286 {
1287     /* The return value of setlocale() is opaque, but is required to be usable
1288      * as input to a future setlocale() to create the same state.
1289      * Unfortunately not all systems are compliant.  But most often they are of
1290      * a very restricted set of forms that this file has been coded to expect.
1291      *
1292      * There are some outliers, though, that this function tries to tame:
1293      *
1294      * 1) A new-line.  This function chomps any \n characters
1295      * 2) foo=bar.     'bar' is what is generally meant, and the foo= part is
1296      *                 stripped.  This form is legal for LC_ALL.  When found in
1297      *                 that category group, the function calls itself
1298      *                 recursively on each possible component category to make
1299      *                 sure the individual categories are ok.
1300      *
1301      * If no changes to the input were made, it is returned; otherwise the
1302      * changed version is stored into memory at *buf, with *buf_size set to its
1303      * new value, and *buf is returned.
1304      */
1305
1306     const char * first_bad;
1307     const char * retval;
1308
1309     PERL_ARGS_ASSERT_STDIZE_LOCALE;
1310
1311     if (input_locale == NULL) {
1312         return NULL;
1313     }
1314
1315     first_bad = strpbrk(input_locale, "=\n");
1316
1317     /* Most likely, there isn't a problem with the input */
1318     if (LIKELY(! first_bad)) {
1319         return input_locale;
1320     }
1321
1322 #    ifdef LC_ALL
1323
1324     /* But if there is, and the category is LC_ALL, we have to look at each
1325      * component category */
1326     if (category == LC_ALL) {
1327         const char * individ_locales[LC_ALL_INDEX_];
1328         bool made_changes = FALSE;
1329         unsigned int i;
1330
1331         for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1332             Size_t this_size = 0;
1333             individ_locales[i] = stdize_locale(categories[i],
1334                                                porcelain_setlocale(categories[i],
1335                                                                    NULL),
1336                                                &individ_locales[i],
1337                                                &this_size,
1338                                                caller_line);
1339
1340             /* If the size didn't change, it means this category did not have
1341              * to be adjusted, and individ_locales[i] points to the buffer
1342              * returned by porcelain_setlocale(); we have to copy that before
1343              * it's called again in the next iteration */
1344             if (this_size == 0) {
1345                 individ_locales[i] = savepv(individ_locales[i]);
1346             }
1347             else {
1348                 made_changes = TRUE;
1349             }
1350         }
1351
1352         /* If all the individual categories were ok as-is, this was a false
1353          * alarm.  We must have seen an '=' which was a legal occurrence in
1354          * this combination locale */
1355         if (! made_changes) {
1356             retval = input_locale;  /* The input can be returned unchanged */
1357         }
1358         else {
1359             retval = save_to_buffer(querylocale_c(LC_ALL), buf, buf_size, 0);
1360         }
1361
1362         for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1363             Safefree(individ_locales[i]);
1364         }
1365
1366         return retval;
1367     }
1368
1369 #    endif
1370
1371     /* Here, there was a problem in an individual category.  This means that at
1372      * least one adjustment will be necessary.  Create a modifiable copy */
1373     retval = save_to_buffer(input_locale, buf, buf_size, 0);
1374
1375     if (*first_bad != '=') {
1376
1377         /* Translate the found position into terms of the copy */
1378         first_bad = retval + (first_bad - input_locale);
1379     }
1380     else { /* An '=' */
1381
1382         /* It is unlikely that the return is so screwed-up that it contains
1383          * multiple equals signs, but handle that case by stripping all of
1384          * them.  */
1385         const char * final_equals = strrchr(retval, '=');
1386
1387         /* The length passed here causes the move to include the terminating
1388          * NUL */
1389         Move(final_equals + 1, retval, strlen(final_equals), char);
1390
1391         /* See if there are additional problems; if not, we're good to return.
1392          * */
1393         first_bad = strpbrk(retval, "\n");
1394
1395         if (! first_bad) {
1396             return retval;
1397         }
1398     }
1399
1400     /* Here, the problem must be a \n.  Get rid of it and what follows.
1401      * (Originally, only a trailing \n was stripped.  Unsure what to do if not
1402      * trailing) */
1403     *((char *) first_bad) = '\0';
1404     return retval;
1405 }
1406
1407 #if defined(USE_POSIX_2008_LOCALE)
1408
1409 STATIC
1410 const char *
1411
1412 #  ifdef USE_QUERYLOCALE
1413 S_calculate_LC_ALL(pTHX_ const locale_t cur_obj)
1414 #  else
1415 S_calculate_LC_ALL(pTHX_ const char ** individ_locales)
1416 #  endif
1417
1418 {
1419     /* For POSIX 2008, we have to figure out LC_ALL ourselves when needed.
1420      * querylocale(), on systems that have it, doesn't tend to work for LC_ALL.
1421      * So we have to construct the answer ourselves based on the passed in
1422      * data, which is either a locale_t object, for systems with querylocale(),
1423      * or an array we keep updated to the proper values, otherwise.
1424      *
1425      * This returns a mortalized string containing the locale name(s) of
1426      * LC_ALL.
1427      *
1428      * If all individual categories are the same locale, we can just set LC_ALL
1429      * to that locale.  But if not, we have to create an aggregation of all the
1430      * categories on the system.  Platforms differ as to the syntax they use
1431      * for these non-uniform locales for LC_ALL.  Some use a '/' or other
1432      * delimiter of the locales with a predetermined order of categories; a
1433      * Configure probe would be needed to tell us how to decipher those.  glibc
1434      * uses a series of name=value pairs, like
1435      *      LC_NUMERIC=C;LC_TIME=en_US.UTF-8;...
1436      * The syntax we use for our aggregation doesn't much matter, as we take
1437      * care not to use the native setlocale() function on whatever style is
1438      * chosen.  But, it would be possible for someone to call Perl_setlocale()
1439      * using a native style we don't understand.  So far no one has complained.
1440      *
1441      * For systems that have categories we don't know about, the algorithm
1442      * below won't know about those missing categories, leading to potential
1443      * bugs for code that looks at them.  If there is an environment variable
1444      * that sets that category, we won't know to look for it, and so our use of
1445      * LANG or "C" improperly overrides it.  On the other hand, if we don't do
1446      * what is done here, and there is no environment variable, the category's
1447      * locale should be set to LANG or "C".  So there is no good solution.  khw
1448      * thinks the best is to make sure we have a complete list of possible
1449      * categories, adding new ones as they show up on obscure platforms.
1450      */
1451
1452     unsigned int i;
1453     Size_t names_len = 0;
1454     bool are_all_categories_the_same_locale = TRUE;
1455     char * aggregate_locale;
1456     char * previous_start = NULL;
1457     char * this_start = NULL;
1458     Size_t entry_len = 0;
1459
1460     PERL_ARGS_ASSERT_CALCULATE_LC_ALL;
1461
1462     /* First calculate the needed size for the string listing the categories
1463      * and their locales. */
1464     for (i = 0; i < LC_ALL_INDEX_; i++) {
1465
1466 #  ifdef USE_QUERYLOCALE
1467         const char * entry = querylocale_l(i, cur_obj);
1468 #  else
1469         const char * entry = individ_locales[i];
1470 #  endif
1471
1472         names_len += strlen(category_names[i])
1473                   + 1                           /* '=' */
1474                   + strlen(entry)
1475                   + 1;                          /* ';' */
1476     }
1477
1478     names_len++;    /* Trailing '\0' */
1479
1480     /* Allocate enough space for the aggregated string */
1481     SAVEFREEPV(Newxz(aggregate_locale, names_len, char));
1482
1483     /* Then fill it in */
1484     for (i = 0; i < LC_ALL_INDEX_; i++) {
1485         Size_t new_len;
1486
1487 #  ifdef USE_QUERYLOCALE
1488         const char * entry = querylocale_l(i, cur_obj);
1489 #  else
1490         const char * entry = individ_locales[i];
1491 #  endif
1492
1493         new_len = my_strlcat(aggregate_locale, category_names[i], names_len);
1494         assert(new_len <= names_len);
1495         new_len = my_strlcat(aggregate_locale, "=", names_len);
1496         assert(new_len <= names_len);
1497
1498         this_start = aggregate_locale + strlen(aggregate_locale);
1499         entry_len = strlen(entry);
1500
1501         new_len = my_strlcat(aggregate_locale, entry, names_len);
1502         assert(new_len <= names_len);
1503         new_len = my_strlcat(aggregate_locale, ";", names_len);
1504         assert(new_len <= names_len);
1505         PERL_UNUSED_VAR(new_len);   /* Only used in DEBUGGING */
1506
1507         if (   i > 0
1508             && are_all_categories_the_same_locale
1509             && memNE(previous_start, this_start, entry_len + 1))
1510         {
1511             are_all_categories_the_same_locale = FALSE;
1512         }
1513         else {
1514             previous_start = this_start;
1515         }
1516     }
1517
1518     /* If they are all the same, just return any one of them */
1519     if (are_all_categories_the_same_locale) {
1520         aggregate_locale = this_start;
1521         aggregate_locale[entry_len] = '\0';
1522     }
1523
1524     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1525                            "calculate_LC_ALL returning '%s'\n",
1526                            aggregate_locale));
1527
1528     return aggregate_locale;
1529 }
1530 #endif /*defined(USE_POSIX_2008_LOCALE)*/
1531
1532 STATIC void
1533 S_setlocale_failure_panic_i(pTHX_
1534                             const unsigned int cat_index,
1535                             const char * current,
1536                             const char * failed,
1537                             const line_t caller_0_line,
1538                             const line_t caller_1_line)
1539 {
1540     dSAVE_ERRNO;
1541     const int cat = categories[cat_index];
1542     const char * name = category_names[cat_index];
1543
1544     PERL_ARGS_ASSERT_SETLOCALE_FAILURE_PANIC_I;
1545
1546     if (current == NULL) {
1547         current = querylocale_i(cat_index);
1548     }
1549
1550     Perl_locale_panic(Perl_form(aTHX_ "(%" LINE_Tf
1551                                       "): Can't change locale for %s(%d)"
1552                                       " from '%s' to '%s'",
1553                                       caller_1_line, name, cat,
1554                                       current, failed),
1555                       __FILE__, caller_0_line, GET_ERRNO);
1556     NOT_REACHED; /* NOTREACHED */
1557 }
1558
1559 STATIC void
1560 S_set_numeric_radix(pTHX_ const bool use_locale)
1561 {
1562     /* If 'use_locale' is FALSE, set to use a dot for the radix character.  If
1563      * TRUE, use the radix character derived from the current locale */
1564
1565 #  if defined(USE_LOCALE_NUMERIC) && (   defined(HAS_LOCALECONV)              \
1566                                     || defined(HAS_NL_LANGINFO))
1567
1568     const char * radix = (use_locale)
1569                          ? my_nl_langinfo(RADIXCHAR, FALSE)
1570                                         /* FALSE => already in dest locale */
1571                          : ".";
1572
1573         sv_setpv(PL_numeric_radix_sv, radix);
1574
1575     /* If this is valid UTF-8 that isn't totally ASCII, and we are in
1576         * a UTF-8 locale, then mark the radix as being in UTF-8 */
1577     if (is_utf8_non_invariant_string((U8 *) SvPVX(PL_numeric_radix_sv),
1578                                             SvCUR(PL_numeric_radix_sv))
1579         && _is_cur_LC_category_utf8(LC_NUMERIC))
1580     {
1581         SvUTF8_on(PL_numeric_radix_sv);
1582     }
1583
1584     DEBUG_L(PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
1585                                            SvPVX(PL_numeric_radix_sv),
1586                                            cBOOL(SvUTF8(PL_numeric_radix_sv))));
1587 #  else
1588
1589     PERL_UNUSED_ARG(use_locale);
1590
1591 #  endif /* USE_LOCALE_NUMERIC and can find the radix char */
1592
1593 }
1594
1595 STATIC void
1596 S_new_numeric(pTHX_ const char *newnum)
1597 {
1598
1599 #  ifndef USE_LOCALE_NUMERIC
1600
1601     PERL_UNUSED_ARG(newnum);
1602
1603 #  else
1604
1605     /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell
1606      * core Perl this and that 'newnum' is the name of the new locale, and we
1607      * are switched into it.  It installs this locale as the current underlying
1608      * default, and then switches to the C locale, if necessary, so that the
1609      * code that has traditionally expected the radix character to be a dot may
1610      * continue to do so.
1611      *
1612      * The default locale and the C locale can be toggled between by use of the
1613      * set_numeric_underlying() and set_numeric_standard() functions, which
1614      * should probably not be called directly, but only via macros like
1615      * SET_NUMERIC_STANDARD() in perl.h.
1616      *
1617      * The toggling is necessary mainly so that a non-dot radix decimal point
1618      * character can be input and output, while allowing internal calculations
1619      * to use a dot.
1620      *
1621      * This sets several interpreter-level variables:
1622      * PL_numeric_name  The underlying locale's name: a copy of 'newnum'
1623      * PL_numeric_underlying  A boolean indicating if the toggled state is such
1624      *                  that the current locale is the program's underlying
1625      *                  locale
1626      * PL_numeric_standard An int indicating if the toggled state is such
1627      *                  that the current locale is the C locale or
1628      *                  indistinguishable from the C locale.  If non-zero, it
1629      *                  is in C; if > 1, it means it may not be toggled away
1630      *                  from C.
1631      * PL_numeric_underlying_is_standard   A bool kept by this function
1632      *                  indicating that the underlying locale and the standard
1633      *                  C locale are indistinguishable for the purposes of
1634      *                  LC_NUMERIC.  This happens when both of the above two
1635      *                  variables are true at the same time.  (Toggling is a
1636      *                  no-op under these circumstances.)  This variable is
1637      *                  used to avoid having to recalculate.
1638      * PL_numeric_radix_sv  Contains the string that code should use for the
1639      *                  decimal point.  It is set to either a dot or the
1640      *                  program's underlying locale's radix character string,
1641      *                  depending on the situation.
1642      * PL_underlying_numeric_obj = (only on POSIX 2008 platforms)  An object
1643      *                  with everything set up properly so as to avoid work on
1644      *                  such platforms.
1645      */
1646
1647     char *save_newnum;
1648
1649     if (! newnum) {
1650         Safefree(PL_numeric_name);
1651         PL_numeric_name = NULL;
1652         PL_numeric_standard = TRUE;
1653         PL_numeric_underlying = TRUE;
1654         PL_numeric_underlying_is_standard = TRUE;
1655         return;
1656     }
1657
1658     save_newnum = savepv(newnum);
1659     PL_numeric_underlying = TRUE;
1660     PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
1661
1662 #    ifndef TS_W32_BROKEN_LOCALECONV
1663
1664     /* If its name isn't C nor POSIX, it could still be indistinguishable from
1665      * them.  But on broken Windows systems calling my_nl_langinfo() for
1666      * THOUSEP can currently (but rarely) cause a race, so avoid doing that,
1667      * and just always change the locale if not C nor POSIX on those systems */
1668     if (! PL_numeric_standard) {
1669         PL_numeric_standard = cBOOL(strEQ(".", my_nl_langinfo(RADIXCHAR,
1670                                             FALSE /* Don't toggle locale */  ))
1671                                  && strEQ("",  my_nl_langinfo(THOUSEP, FALSE)));
1672     }
1673
1674 #    endif
1675
1676     /* Save the new name if it isn't the same as the previous one, if any */
1677     if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
1678     /* Save the locale name for future use */
1679         Safefree(PL_numeric_name);
1680         PL_numeric_name = save_newnum;
1681     }
1682     else {
1683         Safefree(save_newnum);
1684     }
1685
1686     PL_numeric_underlying_is_standard = PL_numeric_standard;
1687
1688 #  ifdef USE_POSIX_2008_LOCALE
1689
1690     /* We keep a special object for easy switching to */
1691     PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
1692                                           PL_numeric_name,
1693                                           PL_underlying_numeric_obj);
1694
1695 #    endif
1696
1697     DEBUG_L( PerlIO_printf(Perl_debug_log,
1698                             "Called new_numeric with %s, PL_numeric_name=%s\n",
1699                             newnum, PL_numeric_name));
1700
1701     /* Keep LC_NUMERIC so that it has the C locale radix and thousands
1702      * separator.  This is for XS modules, so they don't have to worry about
1703      * the radix being a non-dot.  (Core operations that need the underlying
1704      * locale change to it temporarily). */
1705     if (PL_numeric_standard) {
1706         set_numeric_radix(0);
1707     }
1708     else {
1709         set_numeric_standard();
1710     }
1711
1712 #  endif
1713
1714 }
1715
1716 void
1717 Perl_set_numeric_standard(pTHX)
1718 {
1719
1720 #  ifdef USE_LOCALE_NUMERIC
1721
1722     /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
1723      * default.
1724      *
1725      * Most code should use the macro SET_NUMERIC_STANDARD() in perl.h
1726      * instead of calling this directly.  The macro avoids calling this routine
1727      * if toggling isn't necessary according to our records (which could be
1728      * wrong if some XS code has changed the locale behind our back) */
1729
1730     DEBUG_L(PerlIO_printf(Perl_debug_log,
1731                                   "Setting LC_NUMERIC locale to standard C\n"));
1732
1733     void_setlocale_c(LC_NUMERIC, "C");
1734     PL_numeric_standard = TRUE;
1735     PL_numeric_underlying = PL_numeric_underlying_is_standard;
1736     set_numeric_radix(0);
1737
1738 #  endif /* USE_LOCALE_NUMERIC */
1739
1740 }
1741
1742 void
1743 Perl_set_numeric_underlying(pTHX)
1744 {
1745
1746 #  ifdef USE_LOCALE_NUMERIC
1747
1748     /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
1749      * default.
1750      *
1751      * Most code should use the macro SET_NUMERIC_UNDERLYING() in perl.h
1752      * instead of calling this directly.  The macro avoids calling this routine
1753      * if toggling isn't necessary according to our records (which could be
1754      * wrong if some XS code has changed the locale behind our back) */
1755
1756     DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to %s\n",
1757                                           PL_numeric_name));
1758
1759     void_setlocale_c(LC_NUMERIC, PL_numeric_name);
1760     PL_numeric_standard = PL_numeric_underlying_is_standard;
1761     PL_numeric_underlying = TRUE;
1762     set_numeric_radix(! PL_numeric_standard);
1763
1764 #  endif /* USE_LOCALE_NUMERIC */
1765
1766 }
1767
1768 /*
1769  * Set up for a new ctype locale.
1770  */
1771 STATIC void
1772 S_new_ctype(pTHX_ const char *newctype)
1773 {
1774
1775 #  ifndef USE_LOCALE_CTYPE
1776
1777     PERL_UNUSED_ARG(newctype);
1778     PERL_UNUSED_CONTEXT;
1779
1780 #  else
1781
1782     /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
1783      * core Perl this and that 'newctype' is the name of the new locale.
1784      *
1785      * This function sets up the folding arrays for all 256 bytes, assuming
1786      * that tofold() is tolc() since fold case is not a concept in POSIX,
1787      *
1788      * Any code changing the locale (outside this file) should use
1789      * Perl_setlocale or POSIX::setlocale, which call this function.  Therefore
1790      * this function should be called directly only from this file and from
1791      * POSIX::setlocale() */
1792
1793     unsigned int i;
1794
1795     /* Don't check for problems if we are suppressing the warnings */
1796     bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
1797     bool maybe_utf8_turkic = FALSE;
1798
1799     PERL_ARGS_ASSERT_NEW_CTYPE;
1800
1801     DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering new_ctype(%s)\n", newctype));
1802
1803     /* We will replace any bad locale warning with 1) nothing if the new one is
1804      * ok; or 2) a new warning for the bad new locale */
1805     if (PL_warn_locale) {
1806         SvREFCNT_dec_NN(PL_warn_locale);
1807         PL_warn_locale = NULL;
1808     }
1809
1810     PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
1811
1812     /* A UTF-8 locale gets standard rules.  But note that code still has to
1813      * handle this specially because of the three problematic code points */
1814     if (PL_in_utf8_CTYPE_locale) {
1815         Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
1816
1817         /* UTF-8 locales can have special handling for 'I' and 'i' if they are
1818          * Turkic.  Make sure these two are the only anomalies.  (We don't
1819          * require towupper and towlower because they aren't in C89.) */
1820
1821 #    if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER)
1822
1823         if (towupper('i') == 0x130 && towlower('I') == 0x131)
1824
1825 #    else
1826
1827         if (toU8_UPPER_LC('i') == 'i' && toU8_LOWER_LC('I') == 'I')
1828
1829 #    endif
1830
1831         {
1832             /* This is how we determine it really is Turkic */
1833             check_for_problems = TRUE;
1834             maybe_utf8_turkic = TRUE;
1835         }
1836     }
1837
1838     /* We don't populate the other lists if a UTF-8 locale, but do check that
1839      * everything works as expected, unless checking turned off */
1840     if (check_for_problems || ! PL_in_utf8_CTYPE_locale) {
1841         /* Assume enough space for every character being bad.  4 spaces each
1842          * for the 94 printable characters that are output like "'x' "; and 5
1843          * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
1844          * NUL */
1845         char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
1846         bool multi_byte_locale = FALSE;     /* Assume is a single-byte locale
1847                                                to start */
1848         unsigned int bad_count = 0;         /* Count of bad characters */
1849
1850         for (i = 0; i < 256; i++) {
1851             if (! PL_in_utf8_CTYPE_locale) {
1852                 if (isU8_UPPER_LC(i))
1853                     PL_fold_locale[i] = (U8) toU8_LOWER_LC(i);
1854                 else if (isU8_LOWER_LC(i))
1855                     PL_fold_locale[i] = (U8) toU8_UPPER_LC(i);
1856                 else
1857                     PL_fold_locale[i] = (U8) i;
1858             }
1859
1860             /* If checking for locale problems, see if the native ASCII-range
1861              * printables plus \n and \t are in their expected categories in
1862              * the new locale.  If not, this could mean big trouble, upending
1863              * Perl's and most programs' assumptions, like having a
1864              * metacharacter with special meaning become a \w.  Fortunately,
1865              * it's very rare to find locales that aren't supersets of ASCII
1866              * nowadays.  It isn't a problem for most controls to be changed
1867              * into something else; we check only \n and \t, though perhaps \r
1868              * could be an issue as well. */
1869             if (    check_for_problems
1870                 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
1871             {
1872                 bool is_bad = FALSE;
1873                 char name[4] = { '\0' };
1874
1875                 /* Convert the name into a string */
1876                 if (isGRAPH_A(i)) {
1877                     name[0] = i;
1878                     name[1] = '\0';
1879                 }
1880                 else if (i == '\n') {
1881                     my_strlcpy(name, "\\n", sizeof(name));
1882                 }
1883                 else if (i == '\t') {
1884                     my_strlcpy(name, "\\t", sizeof(name));
1885                 }
1886                 else {
1887                     assert(i == ' ');
1888                     my_strlcpy(name, "' '", sizeof(name));
1889                 }
1890
1891                 /* Check each possibe class */
1892                 if (UNLIKELY(cBOOL(isU8_ALPHANUMERIC_LC(i)) != cBOOL(isALPHANUMERIC_A(i))))  {
1893                     is_bad = TRUE;
1894                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1895                                           "isalnum('%s') unexpectedly is %x\n",
1896                                           name, cBOOL(isU8_ALPHANUMERIC_LC(i))));
1897                 }
1898                 if (UNLIKELY(cBOOL(isU8_ALPHA_LC(i)) != cBOOL(isALPHA_A(i))))  {
1899                     is_bad = TRUE;
1900                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1901                                           "isalpha('%s') unexpectedly is %x\n",
1902                                           name, cBOOL(isU8_ALPHA_LC(i))));
1903                 }
1904                 if (UNLIKELY(cBOOL(isU8_DIGIT_LC(i)) != cBOOL(isDIGIT_A(i))))  {
1905                     is_bad = TRUE;
1906                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1907                                           "isdigit('%s') unexpectedly is %x\n",
1908                                           name, cBOOL(isU8_DIGIT_LC(i))));
1909                 }
1910                 if (UNLIKELY(cBOOL(isU8_GRAPH_LC(i)) != cBOOL(isGRAPH_A(i))))  {
1911                     is_bad = TRUE;
1912                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1913                                           "isgraph('%s') unexpectedly is %x\n",
1914                                           name, cBOOL(isU8_GRAPH_LC(i))));
1915                 }
1916                 if (UNLIKELY(cBOOL(isU8_LOWER_LC(i)) != cBOOL(isLOWER_A(i))))  {
1917                     is_bad = TRUE;
1918                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1919                                           "islower('%s') unexpectedly is %x\n",
1920                                           name, cBOOL(isU8_LOWER_LC(i))));
1921                 }
1922                 if (UNLIKELY(cBOOL(isU8_PRINT_LC(i)) != cBOOL(isPRINT_A(i))))  {
1923                     is_bad = TRUE;
1924                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1925                                           "isprint('%s') unexpectedly is %x\n",
1926                                           name, cBOOL(isU8_PRINT_LC(i))));
1927                 }
1928                 if (UNLIKELY(cBOOL(isU8_PUNCT_LC(i)) != cBOOL(isPUNCT_A(i))))  {
1929                     is_bad = TRUE;
1930                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1931                                           "ispunct('%s') unexpectedly is %x\n",
1932                                           name, cBOOL(isU8_PUNCT_LC(i))));
1933                 }
1934                 if (UNLIKELY(cBOOL(isU8_SPACE_LC(i)) != cBOOL(isSPACE_A(i))))  {
1935                     is_bad = TRUE;
1936                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1937                                           "isspace('%s') unexpectedly is %x\n",
1938                                           name, cBOOL(isU8_SPACE_LC(i))));
1939                 }
1940                 if (UNLIKELY(cBOOL(isU8_UPPER_LC(i)) != cBOOL(isUPPER_A(i))))  {
1941                     is_bad = TRUE;
1942                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1943                                           "isupper('%s') unexpectedly is %x\n",
1944                                           name, cBOOL(isU8_UPPER_LC(i))));
1945                 }
1946                 if (UNLIKELY(cBOOL(isU8_XDIGIT_LC(i))!= cBOOL(isXDIGIT_A(i))))  {
1947                     is_bad = TRUE;
1948                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1949                                           "isxdigit('%s') unexpectedly is %x\n",
1950                                           name, cBOOL(isU8_XDIGIT_LC(i))));
1951                 }
1952                 if (UNLIKELY(toU8_LOWER_LC(i) != (int) toLOWER_A(i))) {
1953                     is_bad = TRUE;
1954                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1955                             "tolower('%s')=0x%x instead of the expected 0x%x\n",
1956                             name, toU8_LOWER_LC(i), (int) toLOWER_A(i)));
1957                 }
1958                 if (UNLIKELY(toU8_UPPER_LC(i) != (int) toUPPER_A(i))) {
1959                     is_bad = TRUE;
1960                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1961                             "toupper('%s')=0x%x instead of the expected 0x%x\n",
1962                             name, toU8_UPPER_LC(i), (int) toUPPER_A(i)));
1963                 }
1964                 if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i))))  {
1965                     is_bad = TRUE;
1966                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1967                                 "'\\n' (=%02X) is not a control\n", (int) i));
1968                 }
1969
1970                 /* Add to the list;  Separate multiple entries with a blank */
1971                 if (is_bad) {
1972                     if (bad_count) {
1973                         my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list));
1974                     }
1975                     my_strlcat(bad_chars_list, name, sizeof(bad_chars_list));
1976                     bad_count++;
1977                 }
1978             }
1979         }
1980
1981         if (bad_count == 2 && maybe_utf8_turkic) {
1982             bad_count = 0;
1983             *bad_chars_list = '\0';
1984             PL_fold_locale['I'] = 'I';
1985             PL_fold_locale['i'] = 'i';
1986             PL_in_utf8_turkic_locale = TRUE;
1987             DEBUG_L(PerlIO_printf(Perl_debug_log, "%s is turkic\n", newctype));
1988         }
1989         else {
1990             PL_in_utf8_turkic_locale = FALSE;
1991         }
1992
1993 #  ifdef MB_CUR_MAX
1994
1995         /* We only handle single-byte locales (outside of UTF-8 ones; so if
1996          * this locale requires more than one byte, there are going to be
1997          * problems. */
1998         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1999                  "check_for_problems=%d, MB_CUR_MAX=%d\n",
2000                  check_for_problems, (int) MB_CUR_MAX));
2001
2002         if (   check_for_problems && MB_CUR_MAX > 1
2003             && ! PL_in_utf8_CTYPE_locale
2004
2005                /* Some platforms return MB_CUR_MAX > 1 for even the "C"
2006                 * locale.  Just assume that the implementation for them (plus
2007                 * for POSIX) is correct and the > 1 value is spurious.  (Since
2008                 * these are specially handled to never be considered UTF-8
2009                 * locales, as long as this is the only problem, everything
2010                 * should work fine */
2011             && strNE(newctype, "C") && strNE(newctype, "POSIX"))
2012         {
2013             multi_byte_locale = TRUE;
2014         }
2015
2016 #  endif
2017
2018         /* If we found problems and we want them output, do so */
2019         if (   (UNLIKELY(bad_count) || UNLIKELY(multi_byte_locale))
2020             && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
2021         {
2022             if (UNLIKELY(bad_count) && PL_in_utf8_CTYPE_locale) {
2023                 PL_warn_locale = Perl_newSVpvf(aTHX_
2024                      "Locale '%s' contains (at least) the following characters"
2025                      " which have\nunexpected meanings: %s\nThe Perl program"
2026                      " will use the expected meanings",
2027                       newctype, bad_chars_list);
2028             }
2029             else {
2030                 PL_warn_locale = Perl_newSVpvf(aTHX_
2031                              "Locale '%s' may not work well.%s%s%s\n",
2032                              newctype,
2033                              (multi_byte_locale)
2034                               ? "  Some characters in it are not recognized by"
2035                                 " Perl."
2036                               : "",
2037                              (bad_count)
2038                               ? "\nThe following characters (and maybe others)"
2039                                 " may not have the same meaning as the Perl"
2040                                 " program expects:\n"
2041                               : "",
2042                              (bad_count)
2043                               ? bad_chars_list
2044                               : ""
2045                             );
2046             }
2047
2048 #  ifdef HAS_NL_LANGINFO
2049
2050             Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s",
2051                                     /* parameter FALSE is a don't care here */
2052                                     my_nl_langinfo(CODESET, FALSE));
2053
2054 #  endif
2055
2056             Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n");
2057
2058             /* If we are actually in the scope of the locale or are debugging,
2059              * output the message now.  If not in that scope, we save the
2060              * message to be output at the first operation using this locale,
2061              * if that actually happens.  Most programs don't use locales, so
2062              * they are immune to bad ones.  */
2063             if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
2064
2065                 /* The '0' below suppresses a bogus gcc compiler warning */
2066                 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale),
2067                                                                             0);
2068
2069                 if (IN_LC(LC_CTYPE)) {
2070                     SvREFCNT_dec_NN(PL_warn_locale);
2071                     PL_warn_locale = NULL;
2072                 }
2073             }
2074         }
2075     }
2076
2077 #  endif /* USE_LOCALE_CTYPE */
2078
2079 }
2080
2081 void
2082 Perl__warn_problematic_locale()
2083 {
2084
2085 #  ifdef USE_LOCALE_CTYPE
2086
2087     dTHX;
2088
2089     /* Internal-to-core function that outputs the message in PL_warn_locale,
2090      * and then NULLS it.  Should be called only through the macro
2091      * CHECK_AND_WARN_PROBLEMATIC_LOCALE_ */
2092
2093     if (PL_warn_locale) {
2094         Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2095                              SvPVX(PL_warn_locale),
2096                              0 /* dummy to avoid compiler warning */ );
2097         SvREFCNT_dec_NN(PL_warn_locale);
2098         PL_warn_locale = NULL;
2099     }
2100
2101 #  endif
2102
2103 }
2104
2105 STATIC void
2106 S_new_LC_ALL(pTHX_ const char *unused)
2107 {
2108     unsigned int i;
2109
2110     /* LC_ALL updates all the things we care about. */
2111
2112     PERL_UNUSED_ARG(unused);
2113
2114     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2115         if (update_functions[i]) {
2116             const char * this_locale = savepv(querylocale_i(i));
2117             update_functions[i](aTHX_ this_locale);
2118             Safefree(this_locale);
2119         }
2120     }
2121 }
2122
2123 STATIC void
2124 S_new_collate(pTHX_ const char *newcoll)
2125 {
2126
2127 #  ifndef USE_LOCALE_COLLATE
2128
2129     PERL_UNUSED_ARG(newcoll);
2130     PERL_UNUSED_CONTEXT;
2131
2132 #  else
2133
2134     /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
2135      * core Perl this and that 'newcoll' is the name of the new locale.
2136      *
2137      * The design of locale collation is that every locale change is given an
2138      * index 'PL_collation_ix'.  The first time a string particpates in an
2139      * operation that requires collation while locale collation is active, it
2140      * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()).  That
2141      * magic includes the collation index, and the transformation of the string
2142      * by strxfrm(), q.v.  That transformation is used when doing comparisons,
2143      * instead of the string itself.  If a string changes, the magic is
2144      * cleared.  The next time the locale changes, the index is incremented,
2145      * and so we know during a comparison that the transformation is not
2146      * necessarily still valid, and so is recomputed.  Note that if the locale
2147      * changes enough times, the index could wrap (a U32), and it is possible
2148      * that a transformation would improperly be considered valid, leading to
2149      * an unlikely bug */
2150
2151     if (! newcoll) {
2152         if (PL_collation_name) {
2153             ++PL_collation_ix;
2154             Safefree(PL_collation_name);
2155             PL_collation_name = NULL;
2156         }
2157         PL_collation_standard = TRUE;
2158       is_standard_collation:
2159         PL_collxfrm_base = 0;
2160         PL_collxfrm_mult = 2;
2161         PL_in_utf8_COLLATE_locale = FALSE;
2162         PL_strxfrm_NUL_replacement = '\0';
2163         PL_strxfrm_max_cp = 0;
2164         return;
2165     }
2166
2167     /* If this is not the same locale as currently, set the new one up */
2168     if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
2169         ++PL_collation_ix;
2170         Safefree(PL_collation_name);
2171         PL_collation_name = savepv(newcoll);
2172         PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
2173         if (PL_collation_standard) {
2174             goto is_standard_collation;
2175         }
2176
2177         PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
2178         PL_strxfrm_NUL_replacement = '\0';
2179         PL_strxfrm_max_cp = 0;
2180
2181         /* A locale collation definition includes primary, secondary, tertiary,
2182          * etc. weights for each character.  To sort, the primary weights are
2183          * used, and only if they compare equal, then the secondary weights are
2184          * used, and only if they compare equal, then the tertiary, etc.
2185          *
2186          * strxfrm() works by taking the input string, say ABC, and creating an
2187          * output transformed string consisting of first the primary weights,
2188          * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
2189          * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ ....  Some characters
2190          * may not have weights at every level.  In our example, let's say B
2191          * doesn't have a tertiary weight, and A doesn't have a secondary
2192          * weight.  The constructed string is then going to be
2193          *  A¹B¹C¹ B²C² A³C³ ....
2194          * This has the desired effect that strcmp() will look at the secondary
2195          * or tertiary weights only if the strings compare equal at all higher
2196          * priority weights.  The spaces shown here, like in
2197          *  "A¹B¹C¹ A²B²C² "
2198          * are not just for readability.  In the general case, these must
2199          * actually be bytes, which we will call here 'separator weights'; and
2200          * they must be smaller than any other weight value, but since these
2201          * are C strings, only the terminating one can be a NUL (some
2202          * implementations may include a non-NUL separator weight just before
2203          * the NUL).  Implementations tend to reserve 01 for the separator
2204          * weights.  They are needed so that a shorter string's secondary
2205          * weights won't be misconstrued as primary weights of a longer string,
2206          * etc.  By making them smaller than any other weight, the shorter
2207          * string will sort first.  (Actually, if all secondary weights are
2208          * smaller than all primary ones, there is no need for a separator
2209          * weight between those two levels, etc.)
2210          *
2211          * The length of the transformed string is roughly a linear function of
2212          * the input string.  It's not exactly linear because some characters
2213          * don't have weights at all levels.  When we call strxfrm() we have to
2214          * allocate some memory to hold the transformed string.  The
2215          * calculations below try to find coefficients 'm' and 'b' for this
2216          * locale so that m*x + b equals how much space we need, given the size
2217          * of the input string in 'x'.  If we calculate too small, we increase
2218          * the size as needed, and call strxfrm() again, but it is better to
2219          * get it right the first time to avoid wasted expensive string
2220          * transformations. */
2221
2222         {
2223             /* We use the string below to find how long the tranformation of it
2224              * is.  Almost all locales are supersets of ASCII, or at least the
2225              * ASCII letters.  We use all of them, half upper half lower,
2226              * because if we used fewer, we might hit just the ones that are
2227              * outliers in a particular locale.  Most of the strings being
2228              * collated will contain a preponderance of letters, and even if
2229              * they are above-ASCII, they are likely to have the same number of
2230              * weight levels as the ASCII ones.  It turns out that digits tend
2231              * to have fewer levels, and some punctuation has more, but those
2232              * are relatively sparse in text, and khw believes this gives a
2233              * reasonable result, but it could be changed if experience so
2234              * dictates. */
2235             const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
2236             char * x_longer;        /* Transformed 'longer' */
2237             Size_t x_len_longer;    /* Length of 'x_longer' */
2238
2239             char * x_shorter;   /* We also transform a substring of 'longer' */
2240             Size_t x_len_shorter;
2241
2242             /* _mem_collxfrm() is used get the transformation (though here we
2243              * are interested only in its length).  It is used because it has
2244              * the intelligence to handle all cases, but to work, it needs some
2245              * values of 'm' and 'b' to get it started.  For the purposes of
2246              * this calculation we use a very conservative estimate of 'm' and
2247              * 'b'.  This assumes a weight can be multiple bytes, enough to
2248              * hold any UV on the platform, and there are 5 levels, 4 weight
2249              * bytes, and a trailing NUL.  */
2250             PL_collxfrm_base = 5;
2251             PL_collxfrm_mult = 5 * sizeof(UV);
2252
2253             /* Find out how long the transformation really is */
2254             x_longer = _mem_collxfrm(longer,
2255                                      sizeof(longer) - 1,
2256                                      &x_len_longer,
2257
2258                                      /* We avoid converting to UTF-8 in the
2259                                       * called function by telling it the
2260                                       * string is in UTF-8 if the locale is a
2261                                       * UTF-8 one.  Since the string passed
2262                                       * here is invariant under UTF-8, we can
2263                                       * claim it's UTF-8 even though it isn't.
2264                                       * */
2265                                      PL_in_utf8_COLLATE_locale);
2266             Safefree(x_longer);
2267
2268             /* Find out how long the transformation of a substring of 'longer'
2269              * is.  Together the lengths of these transformations are
2270              * sufficient to calculate 'm' and 'b'.  The substring is all of
2271              * 'longer' except the first character.  This minimizes the chances
2272              * of being swayed by outliers */
2273             x_shorter = _mem_collxfrm(longer + 1,
2274                                       sizeof(longer) - 2,
2275                                       &x_len_shorter,
2276                                       PL_in_utf8_COLLATE_locale);
2277             Safefree(x_shorter);
2278
2279             /* If the results are nonsensical for this simple test, the whole
2280              * locale definition is suspect.  Mark it so that locale collation
2281              * is not active at all for it.  XXX Should we warn? */
2282             if (   x_len_shorter == 0
2283                 || x_len_longer == 0
2284                 || x_len_shorter >= x_len_longer)
2285             {
2286                 PL_collxfrm_mult = 0;
2287                 PL_collxfrm_base = 0;
2288             }
2289             else {
2290                 SSize_t base;       /* Temporary */
2291
2292                 /* We have both:    m * strlen(longer)  + b = x_len_longer
2293                  *                  m * strlen(shorter) + b = x_len_shorter;
2294                  * subtracting yields:
2295                  *          m * (strlen(longer) - strlen(shorter))
2296                  *                             = x_len_longer - x_len_shorter
2297                  * But we have set things up so that 'shorter' is 1 byte smaller
2298                  * than 'longer'.  Hence:
2299                  *          m = x_len_longer - x_len_shorter
2300                  *
2301                  * But if something went wrong, make sure the multiplier is at
2302                  * least 1.
2303                  */
2304                 if (x_len_longer > x_len_shorter) {
2305                     PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
2306                 }
2307                 else {
2308                     PL_collxfrm_mult = 1;
2309                 }
2310
2311                 /*     mx + b = len
2312                  * so:      b = len - mx
2313                  * but in case something has gone wrong, make sure it is
2314                  * non-negative */
2315                 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
2316                 if (base < 0) {
2317                     base = 0;
2318                 }
2319
2320                 /* Add 1 for the trailing NUL */
2321                 PL_collxfrm_base = base + 1;
2322             }
2323
2324             DEBUG_L(PerlIO_printf(Perl_debug_log,
2325                                   "?UTF-8 locale=%d; x_len_shorter=%zu, "
2326                     "x_len_longer=%zu,"
2327                     " collate multipler=%zu, collate base=%zu\n",
2328                     PL_in_utf8_COLLATE_locale,
2329                     x_len_shorter, x_len_longer,
2330                                   PL_collxfrm_mult, PL_collxfrm_base));
2331         }
2332     }
2333
2334 #  endif /* USE_LOCALE_COLLATE */
2335
2336 }
2337
2338 #endif  /* USE_LOCALE */
2339
2340 #ifdef WIN32
2341
2342 #define USE_WSETLOCALE
2343
2344 #ifdef USE_WSETLOCALE
2345
2346 STATIC char *
2347 S_wrap_wsetlocale(pTHX_ int category, const char *locale) {
2348     wchar_t *wlocale;
2349     wchar_t *wresult;
2350     char *result;
2351
2352     if (locale) {
2353         int req_size =
2354             MultiByteToWideChar(CP_UTF8, 0, locale, -1, NULL, 0);
2355
2356         if (!req_size) {
2357             errno = EINVAL;
2358             return NULL;
2359         }
2360
2361         Newx(wlocale, req_size, wchar_t);
2362         if (!MultiByteToWideChar(CP_UTF8, 0, locale, -1, wlocale, req_size)) {
2363             Safefree(wlocale);
2364             errno = EINVAL;
2365             return NULL;
2366         }
2367     }
2368     else {
2369         wlocale = NULL;
2370     }
2371     wresult = _wsetlocale(category, wlocale);
2372     Safefree(wlocale);
2373     if (wresult) {
2374         int req_size =
2375             WideCharToMultiByte(CP_UTF8, 0, wresult, -1, NULL, 0, NULL, NULL);
2376         Newx(result, req_size, char);
2377         SAVEFREEPV(result); /* is there something better we can do here? */
2378         if (!WideCharToMultiByte(CP_UTF8, 0, wresult, -1,
2379                                  result, req_size, NULL, NULL)) {
2380             errno = EINVAL;
2381             return NULL;
2382         }
2383     }
2384     else {
2385         result = NULL;
2386     }
2387
2388     return result;
2389 }
2390
2391 #endif
2392
2393 STATIC char *
2394 S_win32_setlocale(pTHX_ int category, const char* locale)
2395 {
2396     /* This, for Windows, emulates POSIX setlocale() behavior.  There is no
2397      * difference between the two unless the input locale is "", which normally
2398      * means on Windows to get the machine default, which is set via the
2399      * computer's "Regional and Language Options" (or its current equivalent).
2400      * In POSIX, it instead means to find the locale from the user's
2401      * environment.  This routine changes the Windows behavior to first look in
2402      * the environment, and, if anything is found, use that instead of going to
2403      * the machine default.  If there is no environment override, the machine
2404      * default is used, by calling the real setlocale() with "".
2405      *
2406      * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
2407      * use the particular category's variable if set; otherwise to use the LANG
2408      * variable. */
2409
2410     bool override_LC_ALL = FALSE;
2411     char * result;
2412     unsigned int i;
2413
2414     if (locale && strEQ(locale, "")) {
2415
2416 #  ifdef LC_ALL
2417
2418         locale = PerlEnv_getenv("LC_ALL");
2419         if (! locale) {
2420             if (category ==  LC_ALL) {
2421                 override_LC_ALL = TRUE;
2422             }
2423             else {
2424
2425 #  endif
2426
2427                 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2428                     if (category == categories[i]) {
2429                         locale = PerlEnv_getenv(category_names[i]);
2430                         goto found_locale;
2431                     }
2432                 }
2433
2434                 locale = PerlEnv_getenv("LANG");
2435                 if (! locale) {
2436                     locale = "";
2437                 }
2438
2439               found_locale: ;
2440
2441 #  ifdef LC_ALL
2442
2443             }
2444         }
2445
2446 #  endif
2447
2448     }
2449
2450 #ifdef USE_WSETLOCALE
2451     result = S_wrap_wsetlocale(aTHX_ category, locale);
2452 #else
2453     result = setlocale(category, locale);
2454 #endif
2455     DEBUG_L(STMT_START {
2456                 PerlIO_printf(Perl_debug_log, "%s\n",
2457                             setlocale_debug_string_r(category, locale, result));
2458             } STMT_END);
2459
2460     if (! override_LC_ALL)  {
2461         return result;
2462     }
2463
2464     /* Here the input category was LC_ALL, and we have set it to what is in the
2465      * LANG variable or the system default if there is no LANG.  But these have
2466      * lower priority than the other LC_foo variables, so override it for each
2467      * one that is set.  (If they are set to "", it means to use the same thing
2468      * we just set LC_ALL to, so can skip) */
2469
2470     for (i = 0; i < LC_ALL_INDEX_; i++) {
2471         result = PerlEnv_getenv(category_names[i]);
2472         if (result && strNE(result, "")) {
2473 #ifdef USE_WSETLOCALE
2474             S_wrap_wsetlocale(aTHX_ categories[i], result);
2475 #else
2476             setlocale(categories[i], result);
2477 #endif
2478             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s\n",
2479                 setlocale_debug_string_i(i, result, "not captured")));
2480         }
2481     }
2482
2483     result = setlocale(LC_ALL, NULL);
2484     DEBUG_L(STMT_START {
2485                 PerlIO_printf(Perl_debug_log, "%s\n",
2486                                setlocale_debug_string_c(LC_ALL, NULL, result));
2487             } STMT_END);
2488
2489     return result;
2490 }
2491
2492 #endif
2493
2494 /*
2495 =for apidoc Perl_setlocale
2496
2497 This is an (almost) drop-in replacement for the system L<C<setlocale(3)>>,
2498 taking the same parameters, and returning the same information, except that it
2499 returns the correct underlying C<LC_NUMERIC> locale.  Regular C<setlocale> will
2500 instead return C<C> if the underlying locale has a non-dot decimal point
2501 character, or a non-empty thousands separator for displaying floating point
2502 numbers.  This is because perl keeps that locale category such that it has a
2503 dot and empty separator, changing the locale briefly during the operations
2504 where the underlying one is required. C<Perl_setlocale> knows about this, and
2505 compensates; regular C<setlocale> doesn't.
2506
2507 Another reason it isn't completely a drop-in replacement is that it is
2508 declared to return S<C<const char *>>, whereas the system setlocale omits the
2509 C<const> (presumably because its API was specified long ago, and can't be
2510 updated; it is illegal to change the information C<setlocale> returns; doing
2511 so leads to segfaults.)
2512
2513 Finally, C<Perl_setlocale> works under all circumstances, whereas plain
2514 C<setlocale> can be completely ineffective on some platforms under some
2515 configurations.
2516
2517 C<Perl_setlocale> should not be used to change the locale except on systems
2518 where the predefined variable C<${^SAFE_LOCALES}> is 1.  On some such systems,
2519 the system C<setlocale()> is ineffective, returning the wrong information, and
2520 failing to actually change the locale.  C<Perl_setlocale>, however works
2521 properly in all circumstances.
2522
2523 The return points to a per-thread static buffer, which is overwritten the next
2524 time C<Perl_setlocale> is called from the same thread.
2525
2526 =cut
2527
2528 */
2529
2530 const char *
2531 Perl_setlocale(const int category, const char * locale)
2532 {
2533     /* This wraps POSIX::setlocale() */
2534
2535 #ifndef USE_LOCALE
2536
2537     PERL_UNUSED_ARG(category);
2538     PERL_UNUSED_ARG(locale);
2539
2540     return "C";
2541
2542 #else
2543
2544     const char * retval;
2545     dTHX;
2546
2547     DEBUG_L(PerlIO_printf(Perl_debug_log,
2548                           "Entering Perl_setlocale(%d, \"%s\")\n",
2549                           category, locale));
2550
2551     /* A NULL locale means only query what the current one is. */
2552     if (locale == NULL) {
2553
2554 #  ifndef USE_LOCALE_NUMERIC
2555
2556         /* Without LC_NUMERIC, it's trivial; we just return the value */
2557         return querylocale_r(category);
2558
2559 #  else
2560
2561         /* We have the LC_NUMERIC name saved, because we are normally switched
2562          * into the C locale (or equivalent) for it. */
2563         if (category == LC_NUMERIC) {
2564             DEBUG_L(PerlIO_printf(Perl_debug_log,
2565                     "Perl_setlocale(LC_NUMERIC, NULL) returning stashed '%s'\n",
2566                     PL_numeric_name));
2567
2568             /* We don't have to copy this return value, as it is a per-thread
2569              * variable, and won't change until a future setlocale */
2570             return PL_numeric_name;
2571         }
2572
2573 #    ifndef LC_ALL
2574
2575         /* Without LC_ALL, just return the value */
2576         return querylocale_r(category);
2577
2578 #    else
2579
2580         /* Here, LC_ALL is available on this platform.  It's the one
2581          * complicating category (because it can contain a toggled LC_NUMERIC
2582          * value), for all the remaining ones (we took care of LC_NUMERIC
2583          * above), just return the value */
2584         if (category != LC_ALL) {
2585             return querylocale_r(category);
2586         }
2587
2588         bool toggled = FALSE;
2589
2590         /* For an LC_ALL query, switch back to the underlying numeric locale
2591          * (if we aren't there already) so as to get the correct results.  Our
2592          * records for all the other categories are valid without switching */
2593         if (! PL_numeric_underlying) {
2594             set_numeric_underlying();
2595             toggled = TRUE;
2596         }
2597
2598         retval = querylocale_c(LC_ALL);
2599
2600         if (toggled) {
2601
2602             /* This toggling back could destroy 'retval' */
2603             retval = save_to_buffer(retval,
2604                                     &PL_setlocale_buf, &PL_setlocale_bufsize, 0);
2605             set_numeric_standard();
2606         }
2607
2608         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2609                             setlocale_debug_string_r(category, locale, retval)));
2610         return retval;
2611
2612 #    endif      /* Has LC_ALL */
2613 #  endif        /* Has LC_NUMERIC */
2614
2615     } /* End of querying the current locale */
2616
2617     unsigned int cat_index = get_category_index(category, NULL);
2618     retval = save_to_buffer(setlocale_i(cat_index, locale),
2619                             &PL_setlocale_buf, &PL_setlocale_bufsize, 0);
2620     if (! retval) {
2621         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2622                           setlocale_debug_string_i(cat_index, locale, "NULL")));
2623         return NULL;
2624     }
2625
2626     /* Now that have changed locales, we have to update our records to
2627      * correspond.  Only certain categories have extra work to update. */
2628     if (update_functions[cat_index]) {
2629         update_functions[cat_index](aTHX_ retval);
2630     }
2631
2632     DEBUG_L(PerlIO_printf(Perl_debug_log, "returning '%s'\n", retval));
2633
2634     return retval;
2635
2636 #endif
2637
2638 }
2639
2640 PERL_STATIC_INLINE const char *
2641 S_save_to_buffer(const char * string, const char **buf, Size_t *buf_size,
2642                  const Size_t offset)
2643 {
2644     /* Copy the NUL-terminated 'string' to 'buf' + 'offset'.  'buf' has size
2645      * 'buf_size', growing it if necessary */
2646
2647     Size_t string_size;
2648
2649     PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
2650
2651     if (! string) {
2652         return NULL;
2653     }
2654
2655     string_size = strlen(string) + offset + 1;
2656
2657     if (*buf_size == 0) {
2658         Newx(*buf, string_size, char);
2659         *buf_size = string_size;
2660     }
2661     else if (string_size > *buf_size) {
2662         Renew(*buf, string_size, char);
2663         *buf_size = string_size;
2664     }
2665
2666     Copy(string, *buf + offset, string_size - offset, char);
2667     return *buf;
2668 }
2669
2670 /*
2671
2672 =for apidoc Perl_langinfo
2673
2674 This is an (almost) drop-in replacement for the system C<L<nl_langinfo(3)>>,
2675 taking the same C<item> parameter values, and returning the same information.
2676 But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
2677 of Perl's locale handling from your code, and can be used on systems that lack
2678 a native C<nl_langinfo>.
2679
2680 Expanding on these:
2681
2682 =over
2683
2684 =item *
2685
2686 The reason it isn't quite a drop-in replacement is actually an advantage.  The
2687 only difference is that it returns S<C<const char *>>, whereas plain
2688 C<nl_langinfo()> returns S<C<char *>>, but you are (only by documentation)
2689 forbidden to write into the buffer.  By declaring this C<const>, the compiler
2690 enforces this restriction, so if it is violated, you know at compilation time,
2691 rather than getting segfaults at runtime.
2692
2693 =item *
2694
2695 It delivers the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
2696 without you having to write extra code.  The reason for the extra code would be
2697 because these are from the C<LC_NUMERIC> locale category, which is normally
2698 kept set by Perl so that the radix is a dot, and the separator is the empty
2699 string, no matter what the underlying locale is supposed to be, and so to get
2700 the expected results, you have to temporarily toggle into the underlying
2701 locale, and later toggle back.  (You could use plain C<nl_langinfo> and
2702 C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this but then you wouldn't get
2703 the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
2704 (or equivalent) locale would break a lot of CPAN, which is expecting the radix
2705 (decimal point) character to be a dot.)
2706
2707 =item *
2708
2709 The system function it replaces can have its static return buffer trashed,
2710 not only by a subsequent call to that function, but by a C<freelocale>,
2711 C<setlocale>, or other locale change.  The returned buffer of this function is
2712 not changed until the next call to it, so the buffer is never in a trashed
2713 state.
2714
2715 =item *
2716
2717 Its return buffer is per-thread, so it also is never overwritten by a call to
2718 this function from another thread;  unlike the function it replaces.
2719
2720 =item *
2721
2722 But most importantly, it works on systems that don't have C<nl_langinfo>, such
2723 as Windows, hence makes your code more portable.  Of the fifty-some possible
2724 items specified by the POSIX 2008 standard,
2725 L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
2726 only one is completely unimplemented, though on non-Windows platforms, another
2727 significant one is also not implemented).  It uses various techniques to
2728 recover the other items, including calling C<L<localeconv(3)>>, and
2729 C<L<strftime(3)>>, both of which are specified in C89, so should be always be
2730 available.  Later C<strftime()> versions have additional capabilities; C<""> is
2731 returned for those not available on your system.
2732
2733 It is important to note that when called with an item that is recovered by
2734 using C<localeconv>, the buffer from any previous explicit call to
2735 C<localeconv> will be overwritten.  This means you must save that buffer's
2736 contents if you need to access them after a call to this function.  (But note
2737 that you might not want to be using C<localeconv()> directly anyway, because of
2738 issues like the ones listed in the second item of this list (above) for
2739 C<RADIXCHAR> and C<THOUSEP>.  You can use the methods given in L<perlcall> to
2740 call L<POSIX/localeconv> and avoid all the issues, but then you have a hash to
2741 unpack).
2742
2743 The details for those items which may deviate from what this emulation returns
2744 and what a native C<nl_langinfo()> would return are specified in
2745 L<I18N::Langinfo>.
2746
2747 =back
2748
2749 When using C<Perl_langinfo> on systems that don't have a native
2750 C<nl_langinfo()>, you must
2751
2752  #include "perl_langinfo.h"
2753
2754 before the C<perl.h> C<#include>.  You can replace your C<langinfo.h>
2755 C<#include> with this one.  (Doing it this way keeps out the symbols that plain
2756 C<langinfo.h> would try to import into the namespace for code that doesn't need
2757 it.)
2758
2759 The original impetus for C<Perl_langinfo()> was so that code that needs to
2760 find out the current currency symbol, floating point radix character, or digit
2761 grouping separator can use, on all systems, the simpler and more
2762 thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
2763 pain to make thread-friendly.  For other fields returned by C<localeconv>, it
2764 is better to use the methods given in L<perlcall> to call
2765 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
2766
2767 =cut
2768
2769 */
2770
2771 const char *
2772 #ifdef HAS_NL_LANGINFO
2773 Perl_langinfo(const nl_item item)
2774 #else
2775 Perl_langinfo(const int item)
2776 #endif
2777 {
2778     return my_nl_langinfo(item, TRUE);
2779 }
2780
2781 STATIC const char *
2782 #  ifdef HAS_NL_LANGINFO
2783 S_my_nl_langinfo(const nl_item item, bool toggle)
2784 #  else
2785 S_my_nl_langinfo(const int item, bool toggle)
2786 #  endif
2787 {
2788     dTHX;
2789     const char * retval;
2790
2791 #  ifdef USE_LOCALE_NUMERIC
2792
2793     /* We only need to toggle into the underlying LC_NUMERIC locale for these
2794      * two items, and only if not already there */
2795     if (toggle && ((   item != RADIXCHAR && item != THOUSEP)
2796                     || PL_numeric_underlying))
2797
2798 #  endif  /* No toggling needed if not using LC_NUMERIC */
2799
2800         toggle = FALSE;
2801
2802 /*--------------------------------------------------------------------------*/
2803 /* Above is the common beginning to all the implementations of my_langinfo().
2804  * Below are the various completions */
2805 #  if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available.  */
2806 #  if   ! defined(HAS_THREAD_SAFE_NL_LANGINFO_L)      \
2807      || ! defined(USE_POSIX_2008_LOCALE)
2808
2809     /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
2810      * for those items dependent on it.  This must be copied to a buffer before
2811      * switching back, as some systems destroy the buffer when setlocale() is
2812      * called */
2813
2814     {
2815         DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
2816
2817         if (toggle) {
2818             STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
2819         }
2820
2821         /* Prevent interference from another thread executing this code
2822          * section. */
2823         NL_LANGINFO_LOCK;
2824
2825         /* Copy to a per-thread buffer, which is also one that won't be
2826          * destroyed by a subsequent setlocale(), such as the
2827          * RESTORE_LC_NUMERIC may do just below. */
2828         retval = save_to_buffer(nl_langinfo(item),
2829                                 ((const char **) &PL_langinfo_buf),
2830                                 &PL_langinfo_bufsize, 0);
2831         NL_LANGINFO_UNLOCK;
2832
2833         if (toggle) {
2834             RESTORE_LC_NUMERIC();
2835         }
2836     }
2837 /*--------------------------------------------------------------------------*/
2838 #    else /* Use nl_langinfo_l(), avoiding both a mutex and changing the
2839              locale. */
2840
2841     {
2842         locale_t cur = use_curlocale_scratch();
2843
2844 #    ifdef USE_LOCALE_NUMERIC
2845
2846         if (toggle) {
2847             if (PL_underlying_numeric_obj) {
2848                 cur = PL_underlying_numeric_obj;
2849             }
2850             else {
2851                 cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
2852             }
2853         }
2854
2855 #    endif
2856
2857         /* We have to save it to a buffer, because the freelocale() just below
2858          * can invalidate the internal one */
2859         retval = save_to_buffer(nl_langinfo_l(item, cur),
2860                                 ((const char **) &PL_langinfo_buf),
2861                                 &PL_langinfo_bufsize, 0);
2862     }
2863
2864 #  endif
2865
2866     /* We can return 'yes' and 'no' even if we didn't get a result */
2867     if (strEQ(retval, "")) {
2868         if (item == YESSTR) {
2869             return "yes";
2870         }
2871         if (item == NOSTR) {
2872             return "no";
2873         }
2874     }
2875
2876     return retval;
2877 /*--------------------------------------------------------------------------*/
2878 #  else   /* Below, emulate nl_langinfo as best we can */
2879
2880     {
2881
2882 #  ifdef HAS_LOCALECONV
2883
2884         const struct lconv* lc;
2885         const char * temp;
2886         DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
2887
2888 #    ifdef TS_W32_BROKEN_LOCALECONV
2889
2890         const char * save_global;
2891         const char * save_thread;
2892         int needed_size;
2893         char * ptr;
2894         char * e;
2895         char * item_start;
2896
2897 #    endif
2898 #  endif
2899 #  ifdef HAS_STRFTIME
2900
2901         struct tm tm;
2902         bool return_format = FALSE; /* Return the %format, not the value */
2903         const char * format;
2904
2905 #  endif
2906
2907         /* We copy the results to a per-thread buffer, even if not
2908          * multi-threaded.  This is in part to simplify this code, and partly
2909          * because we need a buffer anyway for strftime(), and partly because a
2910          * call of localeconv() could otherwise wipe out the buffer, and the
2911          * programmer would not be expecting this, as this is a nl_langinfo()
2912          * substitute after all, so s/he might be thinking their localeconv()
2913          * is safe until another localeconv() call. */
2914
2915         switch (item) {
2916             Size_t len;
2917
2918             /* This is unimplemented */
2919             case ERA:      /* For use with strftime() %E modifier */
2920
2921             default:
2922                 return "";
2923
2924             /* We use only an English set, since we don't know any more */
2925             case YESEXPR:   return "^[+1yY]";
2926             case YESSTR:    return "yes";
2927             case NOEXPR:    return "^[-0nN]";
2928             case NOSTR:     return "no";
2929
2930             case CODESET:
2931
2932 #  ifndef WIN32
2933
2934                 /* On non-windows, this is unimplemented, in part because of
2935                  * inconsistencies between vendors.  The Darwin native
2936                  * nl_langinfo() implementation simply looks at everything past
2937                  * any dot in the name, but that doesn't work for other
2938                  * vendors.  Many Linux locales that don't have UTF-8 in their
2939                  * names really are UTF-8, for example; z/OS locales that do
2940                  * have UTF-8 in their names, aren't really UTF-8 */
2941                 return "";
2942
2943 #  else
2944
2945                 {   /* But on Windows, the name does seem to be consistent, so
2946                        use that. */
2947                     const char * p;
2948                     const char * first;
2949                     Size_t offset = 0;
2950             const char * name = querylocale_c(LC_CTYPE);
2951
2952                     if (isNAME_C_OR_POSIX(name)) {
2953                         return "ANSI_X3.4-1968";
2954                     }
2955
2956                     /* Find the dot in the locale name */
2957                     first = (const char *) strchr(name, '.');
2958                     if (! first) {
2959                         first = name;
2960                         goto has_nondigit;
2961                     }
2962
2963                     /* Look at everything past the dot */
2964                     first++;
2965                     p = first;
2966
2967                     while (*p) {
2968                         if (! isDIGIT(*p)) {
2969                             goto has_nondigit;
2970                         }
2971
2972                         p++;
2973                     }
2974
2975                     /* Here everything past the dot is a digit.  Treat it as a
2976                      * code page */
2977                     retval = save_to_buffer("CP",
2978                                             ((const char **) &PL_langinfo_buf),
2979                                             &PL_langinfo_bufsize, 0);
2980                     offset = STRLENs("CP");
2981
2982                   has_nondigit:
2983
2984                     retval = save_to_buffer(first,
2985                                             ((const char **) &PL_langinfo_buf),
2986                                             &PL_langinfo_bufsize, offset);
2987                 }
2988
2989                 break;
2990
2991 #  endif
2992 #  ifdef HAS_LOCALECONV
2993
2994             case CRNCYSTR:
2995
2996                 /* We don't bother with localeconv_l() because any system that
2997                  * has it is likely to also have nl_langinfo() */
2998
2999                 LOCALECONV_LOCK;    /* Prevent interference with other threads
3000                                        using localeconv() */
3001
3002 #    ifdef TS_W32_BROKEN_LOCALECONV
3003
3004                 /* This is a workaround for a Windows bug prior to VS 15.
3005                  * What we do here is, while locked, switch to the global
3006                  * locale so localeconv() works; then switch back just before
3007                  * the unlock.  This can screw things up if some thread is
3008                  * already using the global locale while assuming no other is.
3009                  * A different workaround would be to call GetCurrencyFormat on
3010                  * a known value, and parse it; patches welcome
3011                  *
3012                  * We have to use LC_ALL instead of LC_MONETARY because of
3013                  * another bug in Windows */
3014
3015         save_thread = savepv(querylocale_c(LC_ALL));
3016                 _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
3017         save_global= savepv(querylocale_c(LC_ALL));
3018         void_setlocale_c(LC_ALL, save_thread);
3019
3020 #    endif
3021
3022                 lc = localeconv();
3023
3024                 if (   ! lc
3025                     || ! lc->currency_symbol
3026                     || strEQ("", lc->currency_symbol))
3027                 {
3028                     LOCALECONV_UNLOCK;
3029                     return "";
3030                 }
3031
3032                 /* Leave the first spot empty to be filled in below */
3033                 retval = save_to_buffer(lc->currency_symbol,
3034                                         ((const char **) &PL_langinfo_buf),
3035                                         &PL_langinfo_bufsize, 1);
3036                 if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
3037                 { /*  khw couldn't figure out how the localedef specifications
3038                       would show that the $ should replace the radix; this is
3039                       just a guess as to how it might work.*/
3040                     PL_langinfo_buf[0] = '.';
3041                 }
3042                 else if (lc->p_cs_precedes) {
3043                     PL_langinfo_buf[0] = '-';
3044                 }
3045                 else {
3046                     PL_langinfo_buf[0] = '+';
3047                 }
3048
3049 #    ifdef TS_W32_BROKEN_LOCALECONV
3050
3051         void_setlocale_c(LC_ALL, save_global);
3052                 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
3053         void_setlocale_c(LC_ALL, save_thread);
3054                 Safefree(save_global);
3055                 Safefree(save_thread);
3056
3057 #    endif
3058
3059                 LOCALECONV_UNLOCK;
3060                 break;
3061
3062 #    ifdef TS_W32_BROKEN_LOCALECONV
3063
3064             case RADIXCHAR:
3065
3066                 /* For this, we output a known simple floating point number to
3067                  * a buffer, and parse it, looking for the radix */
3068
3069                 if (toggle) {
3070                     STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
3071                 }
3072
3073                 if (PL_langinfo_bufsize < 10) {
3074                     PL_langinfo_bufsize = 10;
3075                     Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
3076                 }
3077
3078                 needed_size = my_snprintf(PL_langinfo_buf, PL_langinfo_bufsize,
3079                                           "%.1f", 1.5);
3080                 if (needed_size >= (int) PL_langinfo_bufsize) {
3081                     PL_langinfo_bufsize = needed_size + 1;
3082                     Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
3083             needed_size
3084                     = my_snprintf(PL_langinfo_buf, PL_langinfo_bufsize,
3085                                              "%.1f", 1.5);
3086                     assert(needed_size < (int) PL_langinfo_bufsize);
3087                 }
3088
3089                 ptr = PL_langinfo_buf;
3090                 e = PL_langinfo_buf + PL_langinfo_bufsize;
3091
3092                 /* Find the '1' */
3093                 while (ptr < e && *ptr != '1') {
3094                     ptr++;
3095                 }
3096                 ptr++;
3097
3098                 /* Find the '5' */
3099                 item_start = ptr;
3100                 while (ptr < e && *ptr != '5') {
3101                     ptr++;
3102                 }
3103
3104                 /* Everything in between is the radix string */
3105                 if (ptr >= e) {
3106                     PL_langinfo_buf[0] = '?';
3107                     PL_langinfo_buf[1] = '\0';
3108                 }
3109                 else {
3110                     *ptr = '\0';
3111             Move(item_start, PL_langinfo_buf, ptr - PL_langinfo_buf,
3112                                                                 char);
3113                 }
3114
3115                 if (toggle) {
3116                     RESTORE_LC_NUMERIC();
3117                 }
3118
3119                 retval = PL_langinfo_buf;
3120                 break;
3121
3122 #    else
3123
3124             case RADIXCHAR:     /* No special handling needed */
3125
3126 #    endif
3127
3128             case THOUSEP:
3129
3130                 if (toggle) {
3131                     STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
3132                 }
3133
3134                 LOCALECONV_LOCK;    /* Prevent interference with other threads
3135                                        using localeconv() */
3136
3137 #    ifdef TS_W32_BROKEN_LOCALECONV
3138
3139                 /* This should only be for the thousands separator.  A
3140                  * different work around would be to use GetNumberFormat on a
3141                  * known value and parse the result to find the separator */
3142         save_thread = savepv(querylocale_c(LC_ALL));
3143                 _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
3144         save_global = savepv(querylocale_c(LC_ALL));
3145         void_setlocale_c(LC_ALL, save_thread);
3146 #      if 0
3147                 /* This is the start of code that for broken Windows replaces
3148                  * the above and below code, and instead calls
3149                  * GetNumberFormat() and then would parse that to find the
3150                  * thousands separator.  It needs to handle UTF-16 vs -8
3151                  * issues. */
3152
3153         needed_size = GetNumberFormatEx(PL_numeric_name, 0, "1234.5",
3154                             NULL, PL_langinfo_buf, PL_langinfo_bufsize);
3155                 DEBUG_L(PerlIO_printf(Perl_debug_log,
3156                     "return from GetNumber, count=%d, val=%s\n",
3157                     needed_size, PL_langinfo_buf));
3158
3159 #      endif
3160 #    endif
3161
3162                 lc = localeconv();
3163                 if (! lc) {
3164                     temp = "";
3165                 }
3166                 else {
3167                     temp = (item == RADIXCHAR)
3168                              ? lc->decimal_point
3169                              : lc->thousands_sep;
3170                     if (! temp) {
3171                         temp = "";
3172                     }
3173                 }
3174
3175                 retval = save_to_buffer(temp,
3176                                         ((const char **) &PL_langinfo_buf),
3177                                         &PL_langinfo_bufsize, 0);
3178
3179 #    ifdef TS_W32_BROKEN_LOCALECONV
3180
3181         void_setlocale_c(LC_ALL, save_global);
3182                 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
3183         void_setlocale_c(LC_ALL, save_thread);
3184                 Safefree(save_global);
3185                 Safefree(save_thread);
3186
3187 #    endif
3188
3189                 LOCALECONV_UNLOCK;
3190
3191                 if (toggle) {
3192                     RESTORE_LC_NUMERIC();
3193                 }
3194
3195                 break;
3196
3197 #  endif
3198 #  ifdef HAS_STRFTIME
3199
3200             /* These are defined by C89, so we assume that strftime supports
3201              * them, and so are returned unconditionally; they may not be what
3202              * the locale actually says, but should give good enough results
3203              * for someone using them as formats (as opposed to trying to parse
3204              * them to figure out what the locale says).  The other format
3205              * items are actually tested to verify they work on the platform */
3206             case D_FMT:         return "%x";
3207             case T_FMT:         return "%X";
3208             case D_T_FMT:       return "%c";
3209
3210             /* These formats are only available in later strfmtime's */
3211             case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM:
3212
3213             /* The rest can be gotten from most versions of strftime(). */
3214             case ABDAY_1: case ABDAY_2: case ABDAY_3:
3215             case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7:
3216             case ALT_DIGITS:
3217             case AM_STR: case PM_STR:
3218             case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4:
3219             case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8:
3220             case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12:
3221             case DAY_1: case DAY_2: case DAY_3: case DAY_4:
3222             case DAY_5: case DAY_6: case DAY_7:
3223             case MON_1: case MON_2: case MON_3: case MON_4:
3224             case MON_5: case MON_6: case MON_7: case MON_8:
3225             case MON_9: case MON_10: case MON_11: case MON_12:
3226
3227                 init_tm(&tm);   /* Precaution against core dumps */
3228                 tm.tm_sec = 30;
3229                 tm.tm_min = 30;
3230                 tm.tm_hour = 6;
3231                 tm.tm_year = 2017 - 1900;
3232                 tm.tm_wday = 0;
3233                 tm.tm_mon = 0;
3234
3235                 GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
3236
3237                 switch (item) {
3238                     default:
3239                 locale_panic_(Perl_form(aTHX_ "switch case: %d problem", item));
3240                         NOT_REACHED; /* NOTREACHED */
3241
3242                     case PM_STR: tm.tm_hour = 18;
3243                     case AM_STR:
3244                         format = "%p";
3245                         break;
3246
3247                     case ABDAY_7: tm.tm_wday++;
3248                     case ABDAY_6: tm.tm_wday++;
3249                     case ABDAY_5: tm.tm_wday++;
3250                     case ABDAY_4: tm.tm_wday++;
3251                     case ABDAY_3: tm.tm_wday++;
3252                     case ABDAY_2: tm.tm_wday++;
3253                     case ABDAY_1:
3254                         format = "%a";
3255                         break;
3256
3257                     case DAY_7: tm.tm_wday++;
3258                     case DAY_6: tm.tm_wday++;
3259                     case DAY_5: tm.tm_wday++;
3260                     case DAY_4: tm.tm_wday++;
3261                     case DAY_3: tm.tm_wday++;
3262                     case DAY_2: tm.tm_wday++;
3263                     case DAY_1:
3264                         format = "%A";
3265                         break;
3266
3267                     case ABMON_12: tm.tm_mon++;
3268                     case ABMON_11: tm.tm_mon++;
3269                     case ABMON_10: tm.tm_mon++;
3270                     case ABMON_9: tm.tm_mon++;
3271                     case ABMON_8: tm.tm_mon++;
3272                     case ABMON_7: tm.tm_mon++;
3273                     case ABMON_6: tm.tm_mon++;
3274                     case ABMON_5: tm.tm_mon++;
3275                     case ABMON_4: tm.tm_mon++;
3276                     case ABMON_3: tm.tm_mon++;
3277                     case ABMON_2: tm.tm_mon++;
3278                     case ABMON_1:
3279                         format = "%b";
3280                         break;
3281
3282                     case MON_12: tm.tm_mon++;
3283                     case MON_11: tm.tm_mon++;
3284                     case MON_10: tm.tm_mon++;
3285                     case MON_9: tm.tm_mon++;
3286                     case MON_8: tm.tm_mon++;
3287                     case MON_7: tm.tm_mon++;
3288                     case MON_6: tm.tm_mon++;
3289                     case MON_5: tm.tm_mon++;
3290                     case MON_4: tm.tm_mon++;
3291                     case MON_3: tm.tm_mon++;
3292                     case MON_2: tm.tm_mon++;
3293                     case MON_1:
3294                         format = "%B";
3295                         break;
3296
3297                     case T_FMT_AMPM:
3298                         format = "%r";
3299                         return_format = TRUE;
3300                         break;
3301
3302                     case ERA_D_FMT:
3303                         format = "%Ex";
3304                         return_format = TRUE;
3305                         break;
3306
3307                     case ERA_T_FMT:
3308                         format = "%EX";
3309                         return_format = TRUE;
3310                         break;
3311
3312                     case ERA_D_T_FMT:
3313                         format = "%Ec";
3314                         return_format = TRUE;
3315                         break;
3316
3317                     case ALT_DIGITS:
3318                         tm.tm_wday = 0;
3319                         format = "%Ow"; /* Find the alternate digit for 0 */
3320                         break;
3321                 }
3322
3323                 GCC_DIAG_RESTORE_STMT;
3324
3325                 /* We can't use my_strftime() because it doesn't look at
3326                  * tm_wday  */
3327                 while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
3328                                      format, &tm))
3329                 {
3330                     /* A zero return means one of:
3331                      *  a)  there wasn't enough space in PL_langinfo_buf
3332                      *  b)  the format, like a plain %p, returns empty
3333                      *  c)  it was an illegal format, though some
3334                      *      implementations of strftime will just return the
3335                      *      illegal format as a plain character sequence.
3336                      *
3337                      *  To quickly test for case 'b)', try again but precede
3338                      *  the format with a plain character.  If that result is
3339                      *  still empty, the problem is either 'a)' or 'c)' */
3340
3341                     Size_t format_size = strlen(format) + 1;
3342                     Size_t mod_size = format_size + 1;
3343                     char * mod_format;
3344                     char * temp_result;
3345
3346                     Newx(mod_format, mod_size, char);
3347                     Newx(temp_result, PL_langinfo_bufsize, char);
3348                     *mod_format = ' ';
3349                     my_strlcpy(mod_format + 1, format, mod_size);
3350                     len = strftime(temp_result,
3351                                    PL_langinfo_bufsize,
3352                                    mod_format, &tm);
3353                     Safefree(mod_format);
3354                     Safefree(temp_result);
3355
3356                     /* If 'len' is non-zero, it means that we had a case like
3357                      * %p which means the current locale doesn't use a.m. or
3358                      * p.m., and that is valid */
3359                     if (len == 0) {
3360
3361                         /* Here, still didn't work.  If we get well beyond a
3362                          * reasonable size, bail out to prevent an infinite
3363                          * loop. */
3364
3365                         if (PL_langinfo_bufsize > 100 * format_size) {
3366                             *PL_langinfo_buf = '\0';
3367                         }
3368                         else {
3369                             /* Double the buffer size to retry;  Add 1 in case
3370                              * original was 0, so we aren't stuck at 0.  */
3371                             PL_langinfo_bufsize *= 2;
3372                             PL_langinfo_bufsize++;
3373                             Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
3374                             continue;
3375                         }
3376                     }
3377
3378                     break;
3379                 }
3380
3381                 /* Here, we got a result.
3382                  *
3383              * If the item is 'ALT_DIGITS', 'PL_langinfo_buf' contains the
3384              * alternate format for wday 0.  If the value is the same as the
3385              * normal 0, there isn't an alternate, so clear the buffer.  */
3386             if (item == ALT_DIGITS && strEQ(PL_langinfo_buf, "0")) {
3387                     *PL_langinfo_buf = '\0';
3388                 }
3389
3390                 /* ALT_DIGITS is problematic.  Experiments on it showed that
3391              * strftime() did not always work properly when going from alt-9 to
3392              * alt-10.  Only a few locales have this item defined, and in all
3393              * of them on Linux that khw was able to find, nl_langinfo() merely
3394              * returned the alt-0 character, possibly doubled.  Most Unicode
3395              * digits are in blocks of 10 consecutive code points, so that is
3396              * sufficient information for such scripts, as we can infer alt-1,
3397              * alt-2, ....  But for a Japanese locale, a CJK ideographic 0 is
3398              * returned, and the CJK digits are not in code point order, so you
3399              * can't really infer anything.  The localedef for this locale did
3400              * specify the succeeding digits, so that strftime() works properly
3401              * on them, without needing to infer anything.  But the
3402              * nl_langinfo() return did not give sufficient information for the
3403              * caller to understand what's going on.  So until there is
3404              * evidence that it should work differently, this returns the alt-0
3405              * string for ALT_DIGITS.
3406                  *
3407                  * wday was chosen because its range is all a single digit.
3408                  * Things like tm_sec have two digits as the minimum: '00' */
3409
3410                 retval = PL_langinfo_buf;
3411
3412                 /* If to return the format, not the value, overwrite the buffer
3413              * with it.  But some strftime()s will keep the original format if
3414              * illegal, so change those to "" */
3415                 if (return_format) {
3416                     if (strEQ(PL_langinfo_buf, format)) {
3417                         *PL_langinfo_buf = '\0';
3418                     }
3419                     else {
3420                         retval = save_to_buffer(format,
3421                                                 ((const char **) &PL_langinfo_buf),
3422                                                 &PL_langinfo_bufsize, 0);
3423                     }
3424                 }
3425
3426                 break;
3427
3428 #  endif
3429
3430         }
3431     }
3432
3433     return retval;
3434
3435 #  endif
3436 /*--------------------------------------------------------------------------*/
3437 }
3438
3439 /*
3440  * Initialize locale awareness.
3441  */
3442 int
3443 Perl_init_i18nl10n(pTHX_ int printwarn)
3444 {
3445     /* printwarn is
3446      *
3447      *    0 if not to output warning when setup locale is bad
3448      *    1 if to output warning based on value of PERL_BADLANG
3449      *    >1 if to output regardless of PERL_BADLANG
3450      *
3451      * returns
3452      *    1 = set ok or not applicable,
3453      *    0 = fallback to a locale of lower priority
3454      *   -1 = fallback to all locales failed, not even to the C locale
3455      *
3456      * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
3457      * set, debugging information is output.
3458      *
3459      * This looks more complicated than it is, mainly due to the #ifdefs and
3460      * error handling.
3461      *
3462      * Besides some asserts, data structure initialization, and specific
3463      * platform complications, this routine is effectively represented by this
3464      * pseudo-code:
3465      *
3466      *      setlocale(LC_ALL, "");                                            x
3467      *      foreach (subcategory) {                                           x
3468      *          curlocales[f(subcategory)] = setlocale(subcategory, NULL);    x
3469      *      }                                                                 x
3470      *      if (platform_so_requires) {
3471      *          foreach (subcategory) {
3472      *            PL_curlocales[f(subcategory)] = curlocales[f(subcategory)]
3473      *          }
3474      *      }
3475      *      foreach (subcategory) {
3476      *          if (needs_special_handling[f(subcategory)] &this_subcat_handler
3477      *      }
3478      *
3479      * This sets all the categories to the values in the current environment,
3480      * saves them temporarily in curlocales[] until they can be handled and/or
3481      * on some platforms saved in a per-thread array PL_curlocales[].
3482      *
3483      * f(foo) is a mapping from the opaque system category numbers to small
3484      * non-negative integers used most everywhere in this file as indices into
3485      * arrays (such as curlocales[]) so the program doesn't have to otherwise
3486      * deal with the opaqueness.
3487      *
3488      * If the platform doesn't have LC_ALL, the lines marked 'x' above are
3489      * effectively replaced by:
3490      *      foreach (subcategory) {                                           y
3491      *          curlocales[f(subcategory)] = setlocale(subcategory, "");      y
3492      *      }                                                                 y
3493      *
3494      * The only differences being the lack of an LC_ALL call, and using ""
3495      * instead of NULL in the setlocale calls.
3496      *
3497      * But there are, of course, complications.
3498      *
3499      * it has to deal with if this is an embedded perl, whose locale doesn't
3500      * come from the environment, but has been set up by the caller.  This is
3501      * pretty simply handled: the "" in the setlocale calls is not a string
3502      * constant, but a variable which is set to NULL in the embedded case.
3503      *
3504      * But the major complication is handling failure and doing fallback.  All
3505      * the code marked 'x' or 'y' above is actually enclosed in an outer loop,
3506      * using the array trial_locales[].  On entry, trial_locales[] is
3507      * initialized to just one entry, containing the NULL or "" locale argument
3508      * shown above.  If, as is almost always the case, everything works, it
3509      * exits after just the one iteration, going on to the next step.
3510      *
3511      * But if there is a failure, the code tries its best to honor the
3512      * environment as much as possible.  It self-modifies trial_locales[] to
3513      * have more elements, one for each of the POSIX-specified settings from
3514      * the environment, such as LANG, ending in the ultimate fallback, the C
3515      * locale.  Thus if there is something bogus with a higher priority
3516      * environment variable, it will try with the next highest, until something
3517      * works.  If everything fails, it limps along with whatever state it got
3518      * to.
3519      *
3520      * A further complication is that Windows has an additional fallback, the
3521      * user-default ANSI code page obtained from the operating system.  This is
3522      * added as yet another loop iteration, just before the final "C"
3523      *
3524      * A slight complication is that in embedded Perls, the locale may already
3525      * be set-up, and we don't want to get it from the normal environment
3526      * variables.  This is handled by having a special environment variable
3527      * indicate we're in this situation.  We simply set setlocale's 2nd
3528      * parameter to be a NULL instead of "".  That indicates to setlocale that
3529      * it is not to change anything, but to return the current value,
3530      * effectively initializing perl's db to what the locale already is.
3531      *
3532      * We play the same trick with NULL if a LC_ALL succeeds.  We call
3533      * setlocale() on the individual categores with NULL to get their existing
3534      * values for our db, instead of trying to change them.
3535      * */
3536
3537     int ok = 1;
3538
3539 #ifndef USE_LOCALE
3540
3541     PERL_UNUSED_ARG(printwarn);
3542
3543 #else  /* USE_LOCALE */
3544 #  ifdef __GLIBC__
3545
3546     const char * const language = PerlEnv_getenv("LANGUAGE");
3547
3548 #  endif
3549
3550     /* NULL uses the existing already set up locale */
3551     const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
3552                                         ? NULL
3553                                         : "";
3554     typedef struct trial_locales_struct_s {
3555         const char* trial_locale;
3556         const char* fallback_desc;
3557         const char* fallback_name;
3558     } trial_locales_struct;
3559     /* 5 = 1 each for "", LC_ALL, LANG, (Win32) system default locale, C */
3560     trial_locales_struct trial_locales[5];
3561     unsigned int trial_locales_count;
3562     const char * const lc_all     = PerlEnv_getenv("LC_ALL");
3563     const char * const lang       = PerlEnv_getenv("LANG");
3564     bool setlocale_failure = FALSE;
3565     unsigned int i;
3566
3567     /* A later getenv() could zap this, so only use here */
3568     const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
3569
3570     const bool locwarn = (printwarn > 1
3571                           || (          printwarn
3572                               && (    ! bad_lang_use_once
3573                                   || (
3574                                          /* disallow with "" or "0" */
3575                                          *bad_lang_use_once
3576                                        && strNE("0", bad_lang_use_once)))));
3577
3578     /* setlocale() return vals; not copied so must be looked at immediately */
3579     const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
3580
3581     /* current locale for given category; should have been copied so aren't
3582      * volatile */
3583     const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
3584
3585 #  ifndef DEBUGGING
3586 #    define DEBUG_LOCALE_INIT(a,b,c)
3587 #  else
3588
3589     DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
3590
3591 #    define DEBUG_LOCALE_INIT(cat_index, locale, result)                    \
3592         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",                       \
3593                     setlocale_debug_string_i(cat_index, locale, result)));
3594
3595 /* Make sure the parallel arrays are properly set up */
3596 #    ifdef USE_LOCALE_NUMERIC
3597     assert(categories[LC_NUMERIC_INDEX_] == LC_NUMERIC);
3598     assert(strEQ(category_names[LC_NUMERIC_INDEX_], "LC_NUMERIC"));
3599 #      ifdef USE_POSIX_2008_LOCALE
3600     assert(category_masks[LC_NUMERIC_INDEX_] == LC_NUMERIC_MASK);
3601 #      endif
3602 #    endif
3603 #    ifdef USE_LOCALE_CTYPE
3604     assert(categories[LC_CTYPE_INDEX_] == LC_CTYPE);
3605     assert(strEQ(category_names[LC_CTYPE_INDEX_], "LC_CTYPE"));
3606 #      ifdef USE_POSIX_2008_LOCALE
3607     assert(category_masks[LC_CTYPE_INDEX_] == LC_CTYPE_MASK);
3608 #      endif
3609 #    endif
3610 #    ifdef USE_LOCALE_COLLATE
3611     assert(categories[LC_COLLATE_INDEX_] == LC_COLLATE);
3612     assert(strEQ(category_names[LC_COLLATE_INDEX_], "LC_COLLATE"));
3613 #      ifdef USE_POSIX_2008_LOCALE
3614     assert(category_masks[LC_COLLATE_INDEX_] == LC_COLLATE_MASK);
3615 #      endif
3616 #    endif
3617 #    ifdef USE_LOCALE_TIME
3618     assert(categories[LC_TIME_INDEX_] == LC_TIME);
3619     assert(strEQ(category_names[LC_TIME_INDEX_], "LC_TIME"));
3620 #      ifdef USE_POSIX_2008_LOCALE
3621     assert(category_masks[LC_TIME_INDEX_] == LC_TIME_MASK);
3622 #      endif
3623 #    endif
3624 #    ifdef USE_LOCALE_MESSAGES
3625     assert(categories[LC_MESSAGES_INDEX_] == LC_MESSAGES);
3626     assert(strEQ(category_names[LC_MESSAGES_INDEX_], "LC_MESSAGES"));
3627 #      ifdef USE_POSIX_2008_LOCALE
3628     assert(category_masks[LC_MESSAGES_INDEX_] == LC_MESSAGES_MASK);
3629 #      endif
3630 #    endif
3631 #    ifdef USE_LOCALE_MONETARY
3632     assert(categories[LC_MONETARY_INDEX_] == LC_MONETARY);
3633     assert(strEQ(category_names[LC_MONETARY_INDEX_], "LC_MONETARY"));
3634 #      ifdef USE_POSIX_2008_LOCALE
3635     assert(category_masks[LC_MONETARY_INDEX_] == LC_MONETARY_MASK);
3636 #      endif
3637 #    endif
3638 #    ifdef USE_LOCALE_ADDRESS
3639     assert(categories[LC_ADDRESS_INDEX_] == LC_ADDRESS);
3640     assert(strEQ(category_names[LC_ADDRESS_INDEX_], "LC_ADDRESS"));
3641 #      ifdef USE_POSIX_2008_LOCALE
3642     assert(category_masks[LC_ADDRESS_INDEX_] == LC_ADDRESS_MASK);
3643 #      endif
3644 #    endif
3645 #    ifdef USE_LOCALE_IDENTIFICATION
3646     assert(categories[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION);
3647     assert(strEQ(category_names[LC_IDENTIFICATION_INDEX_], "LC_IDENTIFICATION"));
3648 #      ifdef USE_POSIX_2008_LOCALE
3649     assert(category_masks[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION_MASK);
3650 #      endif
3651 #    endif
3652 #    ifdef USE_LOCALE_MEASUREMENT
3653     assert(categories[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT);
3654     assert(strEQ(category_names[LC_MEASUREMENT_INDEX_], "LC_MEASUREMENT"));
3655 #      ifdef USE_POSIX_2008_LOCALE
3656     assert(category_masks[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT_MASK);
3657 #      endif
3658 #    endif
3659 #    ifdef USE_LOCALE_PAPER
3660     assert(categories[LC_PAPER_INDEX_] == LC_PAPER);
3661     assert(strEQ(category_names[LC_PAPER_INDEX_], "LC_PAPER"));
3662 #      ifdef USE_POSIX_2008_LOCALE
3663     assert(category_masks[LC_PAPER_INDEX_] == LC_PAPER_MASK);
3664 #      endif
3665 #    endif
3666 #    ifdef USE_LOCALE_TELEPHONE
3667     assert(categories[LC_TELEPHONE_INDEX_] == LC_TELEPHONE);
3668     assert(strEQ(category_names[LC_TELEPHONE_INDEX_], "LC_TELEPHONE"));
3669 #      ifdef USE_POSIX_2008_LOCALE
3670     assert(category_masks[LC_TELEPHONE_INDEX_] == LC_TELEPHONE_MASK);
3671 #      endif
3672 #    endif
3673 #    ifdef USE_LOCALE_SYNTAX
3674     assert(categories[LC_SYNTAX_INDEX_] == LC_SYNTAX);
3675     assert(strEQ(category_names[LC_SYNTAX_INDEX_], "LC_SYNTAX"));
3676 #      ifdef USE_POSIX_2008_LOCALE
3677     assert(category_masks[LC_SYNTAX_INDEX_] == LC_SYNTAX_MASK);
3678 #      endif
3679 #    endif
3680 #    ifdef USE_LOCALE_TOD
3681     assert(categories[LC_TOD_INDEX_] == LC_TOD);
3682     assert(strEQ(category_names[LC_TOD_INDEX_], "LC_TOD"));
3683 #      ifdef USE_POSIX_2008_LOCALE
3684     assert(category_masks[LC_TOD_INDEX_] == LC_TOD_MASK);
3685 #      endif
3686 #    endif
3687 #    ifdef LC_ALL
3688     assert(categories[LC_ALL_INDEX_] == LC_ALL);
3689     assert(strEQ(category_names[LC_ALL_INDEX_], "LC_ALL"));
3690     STATIC_ASSERT_STMT(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX_);
3691 #      ifdef USE_POSIX_2008_LOCALE
3692     assert(category_masks[LC_ALL_INDEX_] == LC_ALL_MASK);
3693 #      endif
3694 #    endif
3695 #  endif    /* DEBUGGING */
3696
3697     /* Initialize the per-thread mbrFOO() state variables.  See POSIX.xs for
3698      * why these particular incantations are used. */
3699 #  ifdef HAS_MBRLEN
3700     memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
3701 #  endif
3702 #  ifdef HAS_MBRTOWC
3703     memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
3704 #  endif
3705 #  ifdef HAS_WCTOMBR
3706     wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
3707 #  endif
3708
3709     /* Initialize the cache of the program's UTF-8ness for the always known
3710      * locales C and POSIX */
3711     my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness,
3712                sizeof(PL_locale_utf8ness));
3713
3714     /* See https://github.com/Perl/perl5/issues/17824 */
3715     Zero(curlocales, NOMINAL_LC_ALL_INDEX, char *);
3716
3717 #  ifdef USE_THREAD_SAFE_LOCALE
3718 #    ifdef WIN32
3719
3720     _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
3721
3722 #    endif
3723 #  endif
3724 #  ifdef USE_POSIX_2008_LOCALE
3725
3726     PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0);
3727     if (! PL_C_locale_obj) {
3728         locale_panic_(Perl_form(aTHX_
3729                                 "Cannot create POSIX 2008 C locale object"));
3730     }
3731
3732     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "created C object %p\n",
3733                            PL_C_locale_obj));
3734 #  endif
3735 #  ifdef USE_LOCALE_NUMERIC
3736
3737     PL_numeric_radix_sv = newSVpvs(".");
3738
3739 #  endif
3740 #  ifdef USE_PL_CURLOCALES
3741
3742     /* Initialize our records.  If we have POSIX 2008, we have LC_ALL */
3743     void_setlocale_c(LC_ALL, porcelain_setlocale(LC_ALL, NULL));
3744
3745 #  endif
3746
3747     /* We try each locale in the list until we get one that works, or exhaust
3748      * the list.  Normally the loop is executed just once.  But if setting the
3749      * locale fails, inside the loop we add fallback trials to the array and so
3750      * will execute the loop multiple times */
3751     trial_locales[0] = (trial_locales_struct) {
3752         .trial_locale = setlocale_init,
3753         .fallback_desc = NULL,
3754         .fallback_name = NULL,
3755     };
3756     trial_locales_count = 1;
3757
3758     for (i= 0; i < trial_locales_count; i++) {
3759         const char * trial_locale = trial_locales[i].trial_locale;
3760         setlocale_failure = FALSE;
3761
3762 #  ifdef LC_ALL
3763
3764         sl_result[LC_ALL_INDEX_] = stdized_setlocale(LC_ALL, trial_locale);
3765         DEBUG_LOCALE_INIT(LC_ALL_INDEX_, trial_locale, sl_result[LC_ALL_INDEX_]);
3766         if (! sl_result[LC_ALL_INDEX_]) {
3767             setlocale_failure = TRUE;
3768         }
3769         else {
3770             /* Since LC_ALL succeeded, it should have changed all the other
3771              * categories it can to its value; so we massage things so that the
3772              * setlocales below just return their category's current values.
3773              * This adequately handles the case in NetBSD where LC_COLLATE may
3774              * not be defined for a locale, and setting it individually will
3775              * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
3776              * the POSIX locale. */
3777             trial_locale = NULL;
3778         }
3779
3780 #  endif /* LC_ALL */
3781
3782         if (! setlocale_failure) {
3783             unsigned int j;
3784             for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
3785                 Safefree(curlocales[j]);
3786                 curlocales[j] = stdized_setlocale(categories[j], trial_locale);
3787                 if (! curlocales[j]) {
3788                     setlocale_failure = TRUE;
3789                 }
3790                 curlocales[j] = savepv(curlocales[j]);
3791                 DEBUG_LOCALE_INIT(j, trial_locale, curlocales[j]);
3792             }
3793
3794             if (LIKELY(! setlocale_failure)) {  /* All succeeded */
3795                 break;  /* Exit trial_locales loop */
3796             }
3797         }
3798
3799         /* Here, something failed; will need to try a fallback. */
3800         ok = 0;
3801
3802         if (i == 0) {
3803             unsigned int j;
3804
3805             if (locwarn) { /* Output failure info only on the first one */
3806
3807 #  ifdef LC_ALL
3808
3809                 PerlIO_printf(Perl_error_log,
3810                 "perl: warning: Setting locale failed.\n");
3811
3812 #  else /* !LC_ALL */
3813
3814                 PerlIO_printf(Perl_error_log,
3815                 "perl: warning: Setting locale failed for the categories:\n");
3816
3817                 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
3818                     if (! curlocales[j]) {
3819                         PerlIO_printf(Perl_error_log, "\t%s\n", category_names[j]);
3820                     }
3821                 }
3822
3823 #  endif /* LC_ALL */
3824
3825                 PerlIO_printf(Perl_error_log,
3826                     "perl: warning: Please check that your locale settings:\n");
3827
3828 #  ifdef __GLIBC__
3829
3830                 PerlIO_printf(Perl_error_log,
3831                             "\tLANGUAGE = %c%s%c,\n",
3832                             language ? '"' : '(',
3833                             language ? language : "unset",
3834                             language ? '"' : ')');
3835 #  endif
3836
3837                 PerlIO_printf(Perl_error_log,
3838                             "\tLC_ALL = %c%s%c,\n",
3839                             lc_all ? '"' : '(',
3840                             lc_all ? lc_all : "unset",
3841                             lc_all ? '"' : ')');
3842
3843 #  if defined(USE_ENVIRON_ARRAY)
3844
3845                 {
3846                     char **e;
3847
3848                     /* Look through the environment for any variables of the
3849                      * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
3850                      * already handled above.  These are assumed to be locale
3851                      * settings.  Output them and their values. */
3852                     for (e = environ; *e; e++) {
3853                         const STRLEN prefix_len = sizeof("LC_") - 1;
3854                         STRLEN uppers_len;
3855
3856                         if (     strBEGINs(*e, "LC_")
3857                             && ! strBEGINs(*e, "LC_ALL=")
3858                             && (uppers_len = strspn(*e + prefix_len,
3859                                              "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
3860                             && ((*e)[prefix_len + uppers_len] == '='))
3861                         {
3862                             PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
3863                                 (int) (prefix_len + uppers_len), *e,
3864                                 *e + prefix_len + uppers_len + 1);
3865                         }
3866                     }
3867                 }
3868
3869 #  else
3870
3871                 PerlIO_printf(Perl_error_log,
3872                             "\t(possibly more locale environment variables)\n");
3873
3874 #  endif
3875
3876                 PerlIO_printf(Perl_error_log,
3877                             "\tLANG = %c%s%c\n",
3878                             lang ? '"' : '(',
3879                             lang ? lang : "unset",
3880                             lang ? '"' : ')');
3881
3882                 PerlIO_printf(Perl_error_log,
3883                             "    are supported and installed on your system.\n");
3884             }
3885
3886             /* Calculate what fallback locales to try.  We have avoided this
3887              * until we have to, because failure is quite unlikely.  This will
3888              * usually change the upper bound of the loop we are in.
3889              *
3890              * Since the system's default way of setting the locale has not
3891              * found one that works, We use Perl's defined ordering: LC_ALL,
3892              * LANG, and the C locale.  We don't try the same locale twice, so
3893              * don't add to the list if already there.  (On POSIX systems, the
3894              * LC_ALL element will likely be a repeat of the 0th element "",
3895              * but there's no harm done by doing it explicitly.
3896              *
3897              * Note that this tries the LC_ALL environment variable even on
3898              * systems which have no LC_ALL locale setting.  This may or may
3899              * not have been originally intentional, but there's no real need
3900              * to change the behavior. */
3901             if (lc_all) {
3902                 for (j = 0; j < trial_locales_count; j++) {
3903                     if (strEQ(lc_all, trial_locales[j].trial_locale)) {
3904                         goto done_lc_all;
3905                     }
3906                 }
3907                 trial_locales[trial_locales_count++] = (trial_locales_struct) {
3908                     .trial_locale = lc_all,
3909                     .fallback_desc = (strEQ(lc_all, "C")
3910                                       ? "the standard locale"
3911                                       : "a fallback locale"),
3912                     .fallback_name = lc_all,
3913                 };
3914             }
3915           done_lc_all:
3916
3917             if (lang) {
3918                 for (j = 0; j < trial_locales_count; j++) {
3919                     if (strEQ(lang, trial_locales[j].trial_locale)) {
3920                         goto done_lang;
3921                     }
3922                 }
3923                 trial_locales[trial_locales_count++] = (trial_locales_struct) {
3924                     .trial_locale = lang,
3925                     .fallback_desc = (strEQ(lang, "C")
3926                                       ? "the standard locale"
3927                                       : "a fallback locale"),
3928                     .fallback_name = lang,
3929                 };
3930             }
3931           done_lang:
3932
3933 #  if defined(WIN32) && defined(LC_ALL)
3934
3935             /* For Windows, we also try the system default locale before "C".
3936              * (If there exists a Windows without LC_ALL we skip this because
3937              * it gets too complicated.  For those, the "C" is the next
3938              * fallback possibility). */
3939             {
3940                 /* Note that this may change the locale, but we are going to do
3941                  * that anyway */
3942                 const char *system_default_locale = stdized_setlocale(LC_ALL, "");
3943                 DEBUG_LOCALE_INIT(LC_ALL_INDEX_, "", system_default_locale);
3944
3945                 /* Skip if invalid or if it's already on the list of locales to
3946                  * try */
3947                 if (! system_default_locale) {
3948                     goto done_system_default;
3949                 }
3950                 for (j = 0; j < trial_locales_count; j++) {
3951                     if (strEQ(system_default_locale, trial_locales[j].trial_locale)) {
3952                         goto done_system_default;
3953                     }
3954                 }
3955
3956                 trial_locales[trial_locales_count++] = (trial_locales_struct) {
3957                     .trial_locale = system_default_locale,
3958                     .fallback_desc = (strEQ(system_default_locale, "C")
3959                                       ? "the standard locale"
3960                                       : "the system default locale"),
3961                     .fallback_name = system_default_locale,
3962                 };
3963             }
3964           done_system_default:
3965
3966 #  endif
3967
3968             for (j = 0; j < trial_locales_count; j++) {
3969                 if (strEQ("C", trial_locales[j].trial_locale)) {
3970                     goto done_C;
3971                 }
3972             }
3973             trial_locales[trial_locales_count++] = (trial_locales_struct) {
3974                 .trial_locale = "C",
3975                 .fallback_desc = "the standard locale",
3976                 .fallback_name = "C",
3977             };
3978
3979           done_C: ;
3980         }   /* end of first time through the loop */
3981
3982 #  ifdef WIN32
3983
3984       next_iteration: ;
3985
3986 #  endif
3987
3988     }   /* end of looping through the trial locales */
3989
3990     if (ok < 1) {   /* If we tried to fallback */
3991         const char* msg;
3992         if (! setlocale_failure) {  /* fallback succeeded */
3993            msg = "Falling back to";
3994         }
3995         else {  /* fallback failed */
3996             unsigned int j;
3997
3998             /* We dropped off the end of the loop, so have to decrement i to
3999              * get back to the value the last time through */
4000             i--;
4001
4002             ok = -1;
4003             msg = "Failed to fall back to";
4004
4005             /* To continue, we should use whatever values we've got */
4006
4007             for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
4008                 Safefree(curlocales[j]);
4009                 curlocales[j] = savepv(stdized_setlocale(categories[j], NULL));
4010                 DEBUG_LOCALE_INIT(j, NULL, curlocales[j]);
4011             }
4012         }
4013
4014         if (locwarn) {
4015             const char * description = trial_locales[i].fallback_desc;
4016             const char * name = trial_locales[i].fallback_name;
4017
4018             if (name && strNE(name, "")) {
4019                 PerlIO_printf(Perl_error_log,
4020                     "perl: warning: %s %s (\"%s\").\n", msg, description, name);
4021             }
4022             else {
4023                 PerlIO_printf(Perl_error_log,
4024                                    "perl: warning: %s %s.\n", msg, description);
4025             }
4026         }
4027     } /* End of tried to fallback */
4028
4029 #  ifdef USE_POSIX_2008_LOCALE
4030
4031     /* The stdized setlocales haven't affected the P2008 locales.  Initialize
4032      * them now, calculating LC_ALL only on the final go round, when all have
4033      * been set. */
4034     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
4035         (void) emulate_setlocale_i(i, curlocales[i],
4036                                    RECALCULATE_LC_ALL_ON_FINAL_INTERATION,
4037                                    __LINE__);
4038     }
4039
4040 #  endif
4041
4042     /* Done with finding the locales; update the auxiliary records */
4043     new_LC_ALL(NULL);
4044
4045     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
4046
4047 #  if defined(USE_LOCALE_THREADS) && ! defined(USE_THREAD_SAFE_LOCALE)
4048
4049         /* This caches whether each category's locale is UTF-8 or not.  This
4050          * may involve changing the locale.  It is ok to do this at
4051          * initialization time before any threads have started, but not later
4052          * unless thread-safe operations are used.
4053          * Caching means that if the program heeds our dictate not to change
4054          * locales in threaded applications, this data will remain valid, and
4055          * it may get queried without having to change locales.  If the
4056          * environment is such that all categories have the same locale, this
4057          * isn't needed, as the code will not change the locale; but this
4058          * handles the uncommon case where the environment has disparate
4059          * locales for the categories */
4060         (void) _is_cur_LC_category_utf8(categories[i]);
4061
4062 #  endif
4063
4064         Safefree(curlocales[i]);
4065     }
4066
4067 #  if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
4068
4069     /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
4070      * locale is UTF-8.  The call to new_ctype() just above has already
4071      * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
4072      * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
4073      * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
4074      * STDIN, STDOUT, STDERR, _and_ the default open discipline.  */
4075     PL_utf8locale = PL_in_utf8_CTYPE_locale;
4076
4077     /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
4078        This is an alternative to using the -C command line switch
4079        (the -C if present will override this). */
4080     {
4081          const char *p = PerlEnv_getenv("PERL_UNICODE");
4082          PL_unicode = p ? parse_unicode_opts(&p) : 0;
4083          if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
4084              PL_utf8cache = -1;
4085     }
4086
4087 #  endif
4088 #endif /* USE_LOCALE */
4089
4090     /* So won't continue to output stuff */
4091     DEBUG_INITIALIZATION_set(FALSE);
4092
4093     return ok;
4094 }
4095
4096 #ifdef USE_LOCALE_COLLATE
4097
4098 char *
4099 Perl__mem_collxfrm(pTHX_ const char *input_string,
4100                          STRLEN len,    /* Length of 'input_string' */
4101                          STRLEN *xlen,  /* Set to length of returned string
4102                                            (not including the collation index
4103                                            prefix) */
4104                          bool utf8      /* Is the input in UTF-8? */
4105                    )
4106 {
4107     /* _mem_collxfrm() is like strxfrm() but with two important differences.
4108      * First, it handles embedded NULs. Second, it allocates a bit more memory
4109      * than needed for the transformed data itself.  The real transformed data
4110      * begins at offset COLLXFRM_HDR_LEN.  *xlen is set to the length of that,
4111      * and doesn't include the collation index size.
4112      *
4113      * It is the caller's responsibility to eventually free the memory returned
4114      * by this function.
4115      *
4116      * Please see sv_collxfrm() to see how this is used. */
4117
4118 #  define COLLXFRM_HDR_LEN    sizeof(PL_collation_ix)
4119
4120     char * s = (char *) input_string;
4121     STRLEN s_strlen = strlen(input_string);
4122     char *xbuf = NULL;
4123     STRLEN xAlloc;          /* xalloc is a reserved word in VC */
4124     STRLEN length_in_chars;
4125     bool first_time = TRUE; /* Cleared after first loop iteration */
4126
4127     PERL_ARGS_ASSERT__MEM_COLLXFRM;
4128
4129     /* Must be NUL-terminated */
4130     assert(*(input_string + len) == '\0');
4131
4132     /* If this locale has defective collation, skip */
4133     if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
4134         DEBUG_L(PerlIO_printf(Perl_debug_log,
4135                       "_mem_collxfrm: locale's collation is defective\n"));
4136         goto bad;
4137     }
4138
4139     /* Replace any embedded NULs with the control that sorts before any others.
4140      * This will give as good as possible results on strings that don't
4141      * otherwise contain that character, but otherwise there may be
4142      * less-than-perfect results with that character and NUL.  This is
4143      * unavoidable unless we replace strxfrm with our own implementation. */
4144     if (UNLIKELY(s_strlen < len)) {   /* Only execute if there is an embedded
4145                                          NUL */
4146         char * e = s + len;
4147         char * sans_nuls;
4148         STRLEN sans_nuls_len;
4149         int try_non_controls;
4150         char this_replacement_char[] = "?\0";   /* Room for a two-byte string,
4151                                                    making sure 2nd byte is NUL.
4152                                                  */
4153         STRLEN this_replacement_len;
4154
4155         /* If we don't know what non-NUL control character sorts lowest for
4156          * this locale, find it */
4157         if (PL_strxfrm_NUL_replacement == '\0') {
4158             int j;
4159             char * cur_min_x = NULL;    /* The min_char's xfrm, (except it also
4160                                            includes the collation index
4161                                            prefixed. */
4162
4163             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
4164
4165             /* Unlikely, but it may be that no control will work to replace
4166              * NUL, in which case we instead look for any character.  Controls
4167              * are preferred because collation order is, in general, context
4168              * sensitive, with adjoining characters affecting the order, and
4169              * controls are less likely to have such interactions, allowing the
4170              * NUL-replacement to stand on its own.  (Another way to look at it
4171              * is to imagine what would happen if the NUL were replaced by a
4172              * combining character; it wouldn't work out all that well.) */
4173             for (try_non_controls = 0;
4174                  try_non_controls < 2;
4175                  try_non_controls++)
4176             {
4177                 /* Look through all legal code points (NUL isn't) */
4178                 for (j = 1; j < 256; j++) {
4179                     char * x;       /* j's xfrm plus collation index */
4180                     STRLEN x_len;   /* length of 'x' */
4181                     STRLEN trial_len = 1;
4182                     char cur_source[] = { '\0', '\0' };
4183
4184                     /* Skip non-controls the first time through the loop.  The
4185                      * controls in a UTF-8 locale are the L1 ones */
4186                     if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
4187                                                ? ! isCNTRL_L1(j)
4188                                                : ! isCNTRL_LC(j))
4189                     {
4190                         continue;
4191                     }
4192
4193                     /* Create a 1-char string of the current code point */
4194                     cur_source[0] = (char) j;
4195
4196                     /* Then transform it */
4197                     x = _mem_collxfrm(cur_source, trial_len, &x_len,
4198                                       0 /* The string is not in UTF-8 */);
4199
4200                     /* Ignore any character that didn't successfully transform.
4201                      * */
4202                     if (! x) {
4203                         continue;
4204                     }
4205
4206                     /* If this character's transformation is lower than
4207                      * the current lowest, this one becomes the lowest */
4208                     if (   cur_min_x == NULL
4209                         || strLT(x         + COLLXFRM_HDR_LEN,
4210                                  cur_min_x + COLLXFRM_HDR_LEN))
4211                     {
4212                         PL_strxfrm_NUL_replacement = j;
4213                         Safefree(cur_min_x);
4214                         cur_min_x = x;
4215                     }
4216                     else {
4217                         Safefree(x);
4218                     }
4219                 } /* end of loop through all 255 characters */
4220
4221                 /* Stop looking if found */
4222                 if (cur_min_x) {
4223                     break;
4224                 }
4225
4226                 /* Unlikely, but possible, if there aren't any controls that
4227                  * work in the locale, repeat the loop, looking for any
4228                  * character that works */
4229                 DEBUG_L(PerlIO_printf(Perl_debug_log,
4230                 "_mem_collxfrm: No control worked.  Trying non-controls\n"));
4231             } /* End of loop to try first the controls, then any char */
4232
4233             if (! cur_min_x) {
4234                 DEBUG_L(PerlIO_printf(Perl_debug_log,
4235                     "_mem_collxfrm: Couldn't find any character to replace"
4236                     " embedded NULs in locale %s with", PL_collation_name));
4237                 goto bad;
4238             }
4239
4240             DEBUG_L(PerlIO_printf(Perl_debug_log,
4241                     "_mem_collxfrm: Replacing embedded NULs in locale %s with "
4242                     "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
4243
4244             Safefree(cur_min_x);
4245         } /* End of determining the character that is to replace NULs */
4246
4247         /* If the replacement is variant under UTF-8, it must match the
4248          * UTF8-ness of the original */
4249         if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
4250             this_replacement_char[0] =
4251                                 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
4252             this_replacement_char[1] =
4253                                 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
4254             this_replacement_len = 2;
4255         }
4256         else {
4257             this_replacement_char[0] = PL_strxfrm_NUL_replacement;
4258             /* this_replacement_char[1] = '\0' was done at initialization */
4259             this_replacement_len = 1;
4260         }
4261
4262         /* The worst case length for the replaced string would be if every
4263          * character in it is NUL.  Multiply that by the length of each
4264          * replacement, and allow for a trailing NUL */
4265         sans_nuls_len = (len * this_replacement_len) + 1;
4266         Newx(sans_nuls, sans_nuls_len, char);
4267         *sans_nuls = '\0';
4268
4269         /* Replace each NUL with the lowest collating control.  Loop until have
4270          * exhausted all the NULs */
4271         while (s + s_strlen < e) {
4272             my_strlcat(sans_nuls, s, sans_nuls_len);
4273
4274             /* Do the actual replacement */
4275             my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
4276
4277             /* Move past the input NUL */
4278             s += s_strlen + 1;
4279             s_strlen = strlen(s);
4280         }
4281
4282         /* And add anything that trails the final NUL */
4283         my_strlcat(sans_nuls, s, sans_nuls_len);
4284
4285         /* Switch so below we transform this modified string */
4286         s = sans_nuls;
4287         len = strlen(s);
4288     } /* End of replacing NULs */
4289
4290     /* Make sure the UTF8ness of the string and locale match */
4291     if (utf8 != PL_in_utf8_COLLATE_locale) {
4292         /* XXX convert above Unicode to 10FFFF? */
4293         const char * const t = s;   /* Temporary so we can later find where the
4294                                        input was */
4295
4296         /* Here they don't match.  Change the string's to be what the locale is
4297          * expecting */
4298
4299         if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
4300             s = (char *) bytes_to_utf8((const U8 *) s, &len);
4301             utf8 = TRUE;
4302         }
4303         else {   /* locale is not UTF-8; but input is; downgrade the input */
4304
4305             s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
4306
4307             /* If the downgrade was successful we are done, but if the input
4308              * contains things that require UTF-8 to represent, have to do
4309              * damage control ... */
4310             if (UNLIKELY(utf8)) {
4311
4312                 /* What we do is construct a non-UTF-8 string with
4313                  *  1) the characters representable by a single byte converted
4314                  *     to be so (if necessary);
4315                  *  2) and the rest converted to collate the same as the
4316                  *     highest collating representable character.  That makes
4317                  *     them collate at the end.  This is similar to how we
4318                  *     handle embedded NULs, but we use the highest collating
4319                  *     code point instead of the smallest.  Like the NUL case,
4320                  *     this isn't perfect, but is the best we can reasonably
4321                  *     do.  Every above-255 code point will sort the same as
4322                  *     the highest-sorting 0-255 code point.  If that code
4323                  *     point can combine in a sequence with some other code
4324                  *     points for weight calculations, us changing something to
4325                  *     be it can adversely affect the results.  But in most
4326                  *     cases, it should work reasonably.  And note that this is
4327                  *     really an illegal situation: using code points above 255
4328                  *     on a locale where only 0-255 are valid.  If two strings
4329                  *     sort entirely equal, then the sort order for the
4330                  *     above-255 code points will be in code point order. */
4331
4332                 utf8 = FALSE;
4333
4334                 /* If we haven't calculated the code point with the maximum
4335                  * collating order for this locale, do so now */
4336                 if (! PL_strxfrm_max_cp) {
4337                     int j;
4338
4339                     /* The current transformed string that collates the
4340                      * highest (except it also includes the prefixed collation
4341                      * index. */
4342                     char * cur_max_x = NULL;
4343
4344                     /* Look through all legal code points (NUL isn't) */
4345                     for (j = 1; j < 256; j++) {
4346                         char * x;
4347                         STRLEN x_len;
4348                         char cur_source[] = { '\0', '\0' };
4349
4350                         /* Create a 1-char string of the current code point */
4351                         cur_source[0] = (char) j;
4352
4353                         /* Then transform it */
4354                         x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
4355
4356                         /* If something went wrong (which it shouldn't), just
4357                          * ignore this code point */
4358                         if (! x) {
4359                             continue;
4360                         }
4361
4362                         /* If this character's transformation is higher than
4363                          * the current highest, this one becomes the highest */
4364                         if (   cur_max_x == NULL
4365                             || strGT(x         + COLLXFRM_HDR_LEN,
4366                                      cur_max_x + COLLXFRM_HDR_LEN))
4367                         {
4368                             PL_strxfrm_max_cp = j;
4369                             Safefree(cur_max_x);
4370                             cur_max_x = x;
4371                         }
4372                         else {
4373                             Safefree(x);
4374                         }
4375                     }
4376
4377                     if (! cur_max_x) {
4378                         DEBUG_L(PerlIO_printf(Perl_debug_log,
4379                             "_mem_collxfrm: Couldn't find any character to"
4380                             " replace above-Latin1 chars in locale %s with",
4381                             PL_collation_name));
4382                         goto bad;
4383                     }
4384
4385                     DEBUG_L(PerlIO_printf(Perl_debug_log,
4386                             "_mem_collxfrm: highest 1-byte collating character"
4387                             " in locale %s is 0x%02X\n",
4388                             PL_collation_name,
4389                             PL_strxfrm_max_cp));
4390
4391                     Safefree(cur_max_x);
4392                 }
4393
4394                 /* Here we know which legal code point collates the highest.
4395                  * We are ready to construct the non-UTF-8 string.  The length
4396                  * will be at least 1 byte smaller than the input string
4397                  * (because we changed at least one 2-byte character into a
4398                  * single byte), but that is eaten up by the trailing NUL */
4399                 Newx(s, len, char);
4400
4401                 {
4402                     STRLEN i;
4403                     STRLEN d= 0;
4404                     char * e = (char *) t + len;
4405
4406                     for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
4407                         U8 cur_char = t[i];
4408                         if (UTF8_IS_INVARIANT(cur_char)) {
4409                             s[d++] = cur_char;
4410                         }
4411                         else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
4412                             s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
4413                         }
4414                         else {  /* Replace illegal cp with highest collating
4415                                    one */
4416                             s[d++] = PL_strxfrm_max_cp;
4417                         }
4418                     }
4419                     s[d++] = '\0';
4420                     Renew(s, d, char);   /* Free up unused space */
4421                 }
4422             }
4423         }
4424
4425         /* Here, we have constructed a modified version of the input.  It could
4426          * be that we already had a modified copy before we did this version.
4427          * If so, that copy is no longer needed */
4428         if (t != input_string) {
4429             Safefree(t);
4430         }
4431     }
4432
4433     length_in_chars = (utf8)
4434                       ? utf8_length((U8 *) s, (U8 *) s + len)
4435                       : len;
4436
4437     /* The first element in the output is the collation id, used by
4438      * sv_collxfrm(); then comes the space for the transformed string.  The
4439      * equation should give us a good estimate as to how much is needed */
4440     xAlloc = COLLXFRM_HDR_LEN
4441            + PL_collxfrm_base
4442            + (PL_collxfrm_mult * length_in_chars);
4443     Newx(xbuf, xAlloc, char);
4444     if (UNLIKELY(! xbuf)) {
4445         DEBUG_L(PerlIO_printf(Perl_debug_log,
4446                       "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
4447         goto bad;
4448     }
4449
4450     /* Store the collation id */
4451     *(U32*)xbuf = PL_collation_ix;
4452
4453     /* Then the transformation of the input.  We loop until successful, or we
4454      * give up */
4455     for (;;) {
4456
4457         errno = 0;
4458         *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
4459
4460         /* If the transformed string occupies less space than we told strxfrm()
4461          * was available, it means it transformed the whole string. */
4462         if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
4463
4464             /* But there still could have been a problem */
4465             if (errno != 0) {
4466                 DEBUG_L(PerlIO_printf(Perl_debug_log,
4467                        "strxfrm failed for LC_COLLATE=%s; errno=%d, input=%s\n",
4468                        PL_collation_name, errno,
4469                        _byte_dump_string((U8 *) s, len, 0)));
4470                 goto bad;
4471             }
4472
4473             /* Here, the transformation was successful.  Some systems include a
4474              * trailing NUL in the returned length.  Ignore it, using a loop in
4475              * case multiple trailing NULs are returned. */
4476             while (   (*xlen) > 0
4477                    && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
4478             {
4479                 (*xlen)--;
4480             }
4481
4482             /* If the first try didn't get it, it means our prediction was low.
4483              * Modify the coefficients so that we predict a larger value in any
4484              * future transformations */
4485             if (! first_time) {
4486                 STRLEN needed = *xlen + 1;   /* +1 For trailing NUL */
4487                 STRLEN computed_guess = PL_collxfrm_base
4488                                       + (PL_collxfrm_mult * length_in_chars);
4489
4490                 /* On zero-length input, just keep current slope instead of
4491                  * dividing by 0 */
4492                 const STRLEN new_m = (length_in_chars != 0)
4493                                      ? needed / length_in_chars
4494                                      : PL_collxfrm_mult;
4495
4496                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4497                     "initial size of %zu bytes for a length "
4498                     "%zu string was insufficient, %zu needed\n",
4499                     computed_guess, length_in_chars, needed));
4500
4501                 /* If slope increased, use it, but discard this result for
4502                  * length 1 strings, as we can't be sure that it's a real slope
4503                  * change */
4504                 if (length_in_chars > 1 && new_m  > PL_collxfrm_mult) {
4505
4506 #  ifdef DEBUGGING
4507
4508                     STRLEN old_m = PL_collxfrm_mult;
4509                     STRLEN old_b = PL_collxfrm_base;
4510
4511 #  endif
4512
4513                     PL_collxfrm_mult = new_m;
4514                     PL_collxfrm_base = 1;   /* +1 For trailing NUL */
4515                     computed_guess = PL_collxfrm_base
4516                                     + (PL_collxfrm_mult * length_in_chars);
4517                     if (computed_guess < needed) {
4518                         PL_collxfrm_base += needed - computed_guess;
4519                     }
4520
4521                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4522                                     "slope is now %zu; was %zu, base "
4523                         "is now %zu; was %zu\n",
4524                         PL_collxfrm_mult, old_m,
4525                         PL_collxfrm_base, old_b));
4526                 }
4527                 else {  /* Slope didn't change, but 'b' did */
4528                     const STRLEN new_b = needed
4529                                         - computed_guess
4530                                         + PL_collxfrm_base;
4531                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4532                         "base is now %zu; was %zu\n", new_b, PL_collxfrm_base));
4533                     PL_collxfrm_base = new_b;
4534                 }
4535             }
4536
4537             break;
4538         }
4539
4540         if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
4541             DEBUG_L(PerlIO_printf(Perl_debug_log,
4542                   "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
4543                   *xlen, PERL_INT_MAX));
4544             goto bad;
4545         }
4546
4547         /* A well-behaved strxfrm() returns exactly how much space it needs
4548          * (usually not including the trailing NUL) when it fails due to not
4549          * enough space being provided.  Assume that this is the case unless
4550          * it's been proven otherwise */
4551         if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
4552             xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
4553         }
4554         else { /* Here, either:
4555                 *  1)  The strxfrm() has previously shown bad behavior; or
4556                 *  2)  It isn't the first time through the loop, which means
4557                 *      that the strxfrm() is now showing bad behavior, because
4558                 *      we gave it what it said was needed in the previous
4559                 *      iteration, and it came back saying it needed still more.
4560                 *      (Many versions of cygwin fit this.  When the buffer size
4561                 *      isn't sufficient, they return the input size instead of
4562                 *      how much is needed.)
4563                 * Increase the buffer size by a fixed percentage and try again.
4564                 * */
4565             xAlloc += (xAlloc / 4) + 1;
4566             PL_strxfrm_is_behaved = FALSE;
4567
4568             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4569                      "_mem_collxfrm required more space than previously"
4570                      " calculated for locale %s, trying again with new"
4571                      " guess=%zu+%zu\n",
4572                 PL_collation_name,  COLLXFRM_HDR_LEN,
4573                      xAlloc - COLLXFRM_HDR_LEN));
4574         }
4575
4576         Renew(xbuf, xAlloc, char);
4577         if (UNLIKELY(! xbuf)) {
4578             DEBUG_L(PerlIO_printf(Perl_debug_log,
4579                       "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
4580             goto bad;
4581         }
4582
4583         first_time = FALSE;
4584     }
4585
4586     DEBUG_Lv((print_collxfrm_input_and_return(s, s + len, xlen, utf8),
4587               PerlIO_printf(Perl_debug_log, "Its xfrm is:"),
4588         PerlIO_printf(Perl_debug_log, "%s\n",
4589                       _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
4590                             *xlen, 1))));
4591
4592     /* Free up unneeded space; retain enough for trailing NUL */
4593     Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
4594
4595     if (s != input_string) {
4596         Safefree(s);
4597     }
4598
4599     return xbuf;
4600
4601   bad:
4602
4603     DEBUG_Lv(print_collxfrm_input_and_return(s, s + len, NULL, utf8));
4604
4605     Safefree(xbuf);
4606     if (s != input_string) {
4607         Safefree(s);
4608     }
4609     *xlen = 0;
4610
4611     return NULL;
4612 }
4613
4614 #  ifdef DEBUGGING
4615
4616 STATIC void
4617 S_print_collxfrm_input_and_return(pTHX_
4618                                   const char * const s,
4619                                   const char * const e,
4620                                   const STRLEN * const xlen,
4621                                   const bool is_utf8)
4622 {
4623
4624     PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
4625
4626     PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
4627                                                         (UV)PL_collation_ix);
4628     if (xlen) {
4629         PerlIO_printf(Perl_debug_log, "%zu", *xlen);
4630     }
4631     else {
4632         PerlIO_printf(Perl_debug_log, "NULL");
4633     }
4634     PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
4635                                                             PL_collation_name);
4636     print_bytes_for_locale(s, e, is_utf8);
4637
4638     PerlIO_printf(Perl_debug_log, "'\n");
4639 }
4640
4641 #  endif    /* DEBUGGING */
4642 #endif /* USE_LOCALE_COLLATE */
4643
4644 #ifdef USE_LOCALE
4645 #  ifdef DEBUGGING
4646
4647 STATIC void
4648 S_print_bytes_for_locale(pTHX_
4649                     const char * const s,
4650                     const char * const e,
4651                     const bool is_utf8)
4652 {
4653     const char * t = s;
4654     bool prev_was_printable = TRUE;
4655     bool first_time = TRUE;
4656
4657     PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
4658
4659     while (t < e) {
4660         UV cp = (is_utf8)
4661                 ?  utf8_to_uvchr_buf((U8 *) t, e, NULL)
4662                 : * (U8 *) t;
4663         if (isPRINT(cp)) {
4664             if (! prev_was_printable) {
4665                 PerlIO_printf(Perl_debug_log, " ");
4666             }
4667             PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
4668             prev_was_printable = TRUE;
4669         }
4670         else {
4671             if (! first_time) {
4672                 PerlIO_printf(Perl_debug_log, " ");
4673             }
4674             PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
4675             prev_was_printable = FALSE;
4676         }
4677         t += (is_utf8) ? UTF8SKIP(t) : 1;
4678         first_time = FALSE;
4679     }
4680 }
4681
4682 #  endif   /* #ifdef DEBUGGING */
4683
4684 STATIC const char *
4685 S_switch_category_locale_to_template(pTHX_ const int switch_category,
4686                                      const int template_category,
4687                                      const char * template_locale)
4688 {
4689     /* Changes the locale for LC_'switch_category" to that of
4690      * LC_'template_category', if they aren't already the same.  If not NULL,
4691      * 'template_locale' is the locale that 'template_category' is in.
4692      *
4693      * Returns a copy of the name of the original locale for 'switch_category'
4694      * so can be switched back to with the companion function
4695      * restore_switched_locale(),  (NULL if no restoral is necessary.) */
4696
4697     const char * restore_to_locale = NULL;
4698
4699     if (switch_category == template_category) { /* No changes needed */
4700         return NULL;
4701     }
4702
4703     /* Find the original locale of the category we may need to change, so that
4704      * it can be restored to later */
4705     restore_to_locale = querylocale_r(switch_category);
4706     if (! restore_to_locale) {
4707         locale_panic_(Perl_form(aTHX_ "Could not find current %s locale",
4708                                       category_name(switch_category)));
4709     }
4710     restore_to_locale = savepv(restore_to_locale);
4711
4712     /* If the locale of the template category wasn't passed in, find it now */
4713     if (template_locale == NULL) {
4714         template_locale = querylocale_r(template_category);
4715         if (! template_locale) {
4716             locale_panic_(Perl_form(aTHX_ "Could not find current %s locale\n",
4717                                           category_name(template_category)));
4718         }
4719     }
4720
4721     /* It the locales are the same, there's nothing to do */
4722     if (strEQ(restore_to_locale, template_locale)) {
4723         Safefree(restore_to_locale);
4724
4725         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n",
4726                             category_name(switch_category), template_locale));
4727
4728         return NULL;
4729     }
4730
4731     /* Finally, change the locale to the template one */
4732     if (! bool_setlocale_r(switch_category, template_locale)) {
4733         setlocale_failure_panic_i(get_category_index(switch_category,
4734                                                      NULL),
4735                                   category_name(switch_category),
4736                                   template_locale,
4737                                   __LINE__,
4738                                   __LINE__);
4739     }
4740
4741     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale switched to %s\n",
4742                             category_name(switch_category), template_locale));
4743
4744     return restore_to_locale;
4745 }
4746
4747 STATIC void
4748 S_restore_switched_locale(pTHX_ const int category,
4749                                 const char * const original_locale)
4750 {
4751     /* Restores the locale for LC_'category' to 'original_locale' (which is a
4752      * copy that will be freed by this function), or do nothing if the latter
4753      * parameter is NULL */
4754
4755     if (original_locale == NULL) {
4756         return;
4757     }
4758
4759     if (! bool_setlocale_r(category, original_locale)) {
4760         locale_panic_(Perl_form(aTHX_ "s restoring %s to %s failed",
4761                                       category_name(category), original_locale));
4762     }
4763
4764     Safefree(original_locale);
4765 }
4766
4767 /* is_cur_LC_category_utf8 uses a small char buffer to avoid malloc/free */
4768 #  define CUR_LC_BUFFER_SIZE  64
4769
4770 bool
4771 Perl__is_cur_LC_category_utf8(pTHX_ int category)
4772 {
4773     /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
4774      * otherwise. 'category' may not be LC_ALL.  If the platform doesn't have
4775      * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
4776      * could give the wrong result.  The result will very likely be correct for
4777      * languages that have commonly used non-ASCII characters, but for notably
4778      * English, it comes down to if the locale's name ends in something like
4779      * "UTF-8".  It errs on the side of not being a UTF-8 locale.
4780      *
4781      * If the platform is early C89, not containing mbtowc(), or we are
4782      * compiled to not pay attention to LC_CTYPE, this employs heuristics.
4783      * These work very well for non-Latin locales or those whose currency
4784      * symbol isn't a '$' nor plain ASCII text.  But without LC_CTYPE and at
4785      * least MB_CUR_MAX, English locales with an ASCII currency symbol depend
4786      * on the name containing UTF-8 or not. */
4787
4788     /* Name of current locale corresponding to the input category */
4789     const char *save_input_locale = NULL;
4790
4791     bool is_utf8 = FALSE;                /* The return value */
4792
4793     /* The variables below are for the cache of previous lookups using this
4794      * function.  The cache is a C string, described at the definition for
4795      * 'C_and_POSIX_utf8ness'.
4796      *
4797      * The first part of the cache is fixed, for the C and POSIX locales.  The
4798      * varying part starts just after them. */
4799     char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness);
4800
4801     Size_t utf8ness_cache_size; /* Size of the varying portion */
4802     Size_t input_name_len;      /* Length in bytes of save_input_locale */
4803     Size_t input_name_len_with_overhead;    /* plus extra chars used to store
4804                                                the name in the cache */
4805     char * delimited;           /* The name plus the delimiters used to store
4806                                    it in the cache */
4807     char buffer[CUR_LC_BUFFER_SIZE];        /* small buffer */
4808     char * name_pos;            /* position of 'delimited' in the cache, or 0
4809                                    if not there */
4810
4811
4812 #  ifdef LC_ALL
4813
4814     assert(category != LC_ALL);
4815
4816 #  endif
4817
4818     /* Get the desired category's locale */
4819     save_input_locale = querylocale_r(category);
4820     if (! save_input_locale) {
4821         locale_panic_(Perl_form(aTHX_ "Could not find current %s locale",
4822                                       category_name(category)));
4823     }
4824     save_input_locale = savepv(save_input_locale);
4825
4826     DEBUG_L(PerlIO_printf(Perl_debug_log,
4827                           "Current locale for %s is %s\n",
4828                           category_name(category), save_input_locale));
4829
4830     input_name_len = strlen(save_input_locale);
4831
4832     /* In our cache, each name is accompanied by two delimiters and a single
4833      * utf8ness digit */
4834     input_name_len_with_overhead = input_name_len + 3;
4835
4836     if ( input_name_len_with_overhead <= CUR_LC_BUFFER_SIZE ) {
4837         /* we can use the buffer, avoid a malloc */
4838         delimited = buffer;
4839     } else { /* need a malloc */
4840         /* Allocate and populate space for a copy of the name surrounded by the
4841          * delimiters */
4842         Newx(delimited, input_name_len_with_overhead, char);
4843     }
4844
4845     delimited[0] = UTF8NESS_SEP[0];
4846     Copy(save_input_locale, delimited + 1, input_name_len, char);
4847     delimited[input_name_len+1] = UTF8NESS_PREFIX[0];
4848     delimited[input_name_len+2] = '\0';
4849
4850     /* And see if that is in the cache */
4851     name_pos = instr(PL_locale_utf8ness, delimited);
4852     if (name_pos) {
4853         is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0';
4854
4855         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4856                  "UTF8ness for locale %s=%d, \n",
4857                  save_input_locale, is_utf8));
4858
4859         /* And, if not already in that position, move it to the beginning of
4860          * the non-constant portion of the list, since it is the most recently
4861          * used.  (We don't have to worry about overflow, since just moving
4862          * existing names around) */
4863         if (name_pos > utf8ness_cache) {
4864             Move(utf8ness_cache,
4865                  utf8ness_cache + input_name_len_with_overhead,
4866                  name_pos - utf8ness_cache, char);
4867             Copy(delimited,
4868                  utf8ness_cache,
4869                  input_name_len_with_overhead - 1, char);
4870             utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
4871         }
4872
4873         /* free only when not using the buffer */
4874         if ( delimited != buffer ) Safefree(delimited);
4875         Safefree(save_input_locale);
4876         return is_utf8;
4877     }
4878
4879     /* Here we don't have stored the utf8ness for the input locale.  We have to
4880      * calculate it */
4881
4882 #  if        defined(USE_LOCALE_CTYPE)                                  \
4883      && (    defined(HAS_NL_LANGINFO)                                   \
4884          || (defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)))
4885
4886     {
4887         const char *original_ctype_locale
4888                         = switch_category_locale_to_template(LC_CTYPE,
4889                                                              category,
4890                                                              save_input_locale);
4891
4892         /* Here the current LC_CTYPE is set to the locale of the category whose
4893          * information is desired.  This means that nl_langinfo() and mbtowc()
4894          * should give the correct results */
4895
4896 #    ifdef MB_CUR_MAX  /* But we can potentially rule out UTF-8ness, avoiding
4897                           calling the functions if we have this */
4898
4899             /* Standard UTF-8 needs at least 4 bytes to represent the maximum
4900              * Unicode code point. */
4901
4902             DEBUG_L(PerlIO_printf(Perl_debug_log, "MB_CUR_MAX=%d\n",
4903                                              (int) MB_CUR_MAX));
4904             if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) {
4905                 is_utf8 = FALSE;
4906                 restore_switched_locale(LC_CTYPE, original_ctype_locale);
4907                 goto finish_and_return;
4908             }
4909
4910 #    endif
4911 #    if defined(HAS_NL_LANGINFO)
4912
4913         { /* The task is easiest if the platform has this POSIX 2001 function.
4914              Except on some platforms it can wrongly return "", so have to have
4915              a fallback.  And it can return that it's UTF-8, even if there are
4916              variances from that.  For example, Turkish locales may use the
4917              alternate dotted I rules, and sometimes it appears to be a
4918              defective locale definition.  XXX We should probably check for
4919              these in the Latin1 range and warn (but on glibc, requires
4920              iswalnum() etc. due to their not handling 80-FF correctly */
4921             const char *codeset = my_nl_langinfo(CODESET, FALSE);
4922                                           /* FALSE => already in dest locale */
4923
4924             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4925                             "\tnllanginfo returned CODESET '%s'\n", codeset));
4926
4927             if (codeset && strNE(codeset, "")) {
4928
4929                               /* If the implementation of foldEQ() somehow were
4930                                * to change to not go byte-by-byte, this could
4931                                * read past end of string, as only one length is
4932                                * checked.  But currently, a premature NUL will
4933                                * compare false, and it will stop there */
4934                 is_utf8 = cBOOL(   foldEQ(codeset, "UTF-8", STRLENs("UTF-8"))
4935                                 || foldEQ(codeset, "UTF8",  STRLENs("UTF8")));
4936
4937                 DEBUG_L(PerlIO_printf(Perl_debug_log,
4938                        "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
4939                                                      codeset,         is_utf8));
4940                 restore_switched_locale(LC_CTYPE, original_ctype_locale);
4941                 goto finish_and_return;
4942             }
4943         }
4944
4945 #    endif
4946 #    if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4947      /* We can see if this is a UTF-8-like locale if have mbtowc().  It was a
4948       * late adder to C89, so very likely to have it.  However, testing has
4949       * shown that, like nl_langinfo() above, there are locales that are not
4950       * strictly UTF-8 that this will return that they are */
4951
4952         {
4953             wchar_t wc;
4954             int len;
4955             dSAVEDERRNO;
4956
4957 #      if defined(HAS_MBRTOWC) && defined(USE_LOCALE_THREADS)
4958
4959             mbstate_t ps;
4960
4961 #      endif
4962
4963             /* mbrtowc() and mbtowc() convert a byte string to a wide
4964              * character.  Feed a byte string to one of them and check that the
4965              * result is the expected Unicode code point */
4966
4967 #      if defined(HAS_MBRTOWC) && defined(USE_LOCALE_THREADS)
4968             /* Prefer this function if available, as it's reentrant */
4969
4970             memzero(&ps, sizeof(ps));;
4971             PERL_UNUSED_RESULT(mbrtowc(&wc, NULL, 0, &ps)); /* Reset any shift
4972                                                                state */
4973             SETERRNO(0, 0);
4974             len = mbrtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8), &ps);
4975             SAVE_ERRNO;
4976
4977 #      else
4978
4979             MBTOWC_LOCK;
4980             PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
4981             SETERRNO(0, 0);
4982             len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
4983             SAVE_ERRNO;
4984             MBTOWC_UNLOCK;
4985
4986 #      endif
4987
4988             RESTORE_ERRNO;
4989             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4990                     "\treturn from mbtowc; len=%d; code_point=%x; errno=%d\n",
4991                                    len,      (unsigned int) wc, GET_ERRNO));
4992
4993             is_utf8 = cBOOL(   len == STRLENs(REPLACEMENT_CHARACTER_UTF8)
4994                             && wc == (wchar_t) UNICODE_REPLACEMENT);
4995         }
4996
4997 #    endif
4998
4999         restore_switched_locale(LC_CTYPE, original_ctype_locale);
5000         goto finish_and_return;
5001     }
5002
5003 #  else
5004
5005         /* Here, we must have a C89 compiler that doesn't have mbtowc().  Next
5006          * try looking at the currency symbol to see if it disambiguates
5007          * things.  Often that will be in the native script, and if the symbol
5008          * isn't in UTF-8, we know that the locale isn't.  If it is non-ASCII
5009          * UTF-8, we infer that the locale is too, as the odds of a non-UTF8
5010          * string being valid UTF-8 are quite small */
5011
5012 #    ifdef USE_LOCALE_MONETARY
5013
5014         /* If have LC_MONETARY, we can look at the currency symbol.  Often that
5015          * will be in the native script.  We do this one first because there is
5016          * just one string to examine, so potentially avoids work */
5017
5018         {
5019             const char *original_monetary_locale
5020                         = switch_category_locale_to_template(LC_MONETARY,
5021                                                              category,
5022                                                              save_input_locale);
5023             bool only_ascii = FALSE;
5024             const U8 * currency_string
5025                             = (const U8 *) my_nl_langinfo(CRNCYSTR, FALSE);
5026                                       /* 2nd param not relevant for this item */
5027             const U8 * first_variant;
5028
5029             assert(   *currency_string == '-'
5030                    || *currency_string == '+'
5031                    || *currency_string == '.');
5032
5033             currency_string++;
5034
5035             if (is_utf8_invariant_string_loc(currency_string, 0, &first_variant))
5036             {
5037                 DEBUG_L(PerlIO_printf(Perl_debug_log,
5038                         "Couldn't get currency symbol for %s, or contains"
5039                         " only ASCII; can't use for determining if UTF-8"
5040                         " locale\n", save_input_locale));
5041                 only_ascii = TRUE;
5042             }
5043             else {
5044                 is_utf8 = is_strict_utf8_string(first_variant, 0);
5045             }
5046
5047             restore_switched_locale(LC_MONETARY, original_monetary_locale);
5048
5049             if (! only_ascii) {
5050
5051                 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
5052                  * otherwise assume the locale is UTF-8 if and only if the symbol
5053                  * is non-ascii UTF-8. */
5054                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5055                                       "\t?Currency symbol for %s is UTF-8=%d\n",
5056                                         save_input_locale, is_utf8));
5057                 goto finish_and_return;
5058             }
5059         }
5060
5061 #    endif /* USE_LOCALE_MONETARY */
5062 #    if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
5063
5064     /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not.  Try
5065      * the names of the months and weekdays, timezone, and am/pm indicator */
5066         {
5067             const char *original_time_locale
5068                             = switch_category_locale_to_template(LC_TIME,
5069                                                                  category,
5070                                                                  save_input_locale);
5071             int hour = 10;
5072             bool is_dst = FALSE;
5073             int dom = 1;
5074             int month = 0;
5075             int i;
5076             char * formatted_time;
5077
5078             /* Here the current LC_TIME is set to the locale of the category
5079              * whose information is desired.  Look at all the days of the week
5080              * and month names, and the timezone and am/pm indicator for UTF-8
5081              * variant characters.  The first such a one found will tell us if
5082              * the locale is UTF-8 or not */
5083
5084             for (i = 0; i < 7 + 12; i++) {  /* 7 days; 12 months */
5085                 formatted_time = my_strftime("%A %B %Z %p",
5086                                 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
5087                 if ( ! formatted_time
5088                     || is_utf8_invariant_string((U8 *) formatted_time, 0))
5089                 {
5090
5091                     /* Here, we didn't find a non-ASCII.  Try the next time
5092                      * through with the complemented dst and am/pm, and try
5093                      * with the next weekday.  After we have gotten all
5094                      * weekdays, try the next month */
5095                     is_dst = ! is_dst;
5096                     hour = (hour + 12) % 24;
5097                     dom++;
5098                     if (i > 6) {
5099                         month++;
5100                     }
5101                     Safefree(formatted_time);
5102                     continue;
5103                 }
5104
5105                 /* Here, we have a non-ASCII.  Return TRUE is it is valid UTF8;
5106                  * false otherwise.  But first, restore LC_TIME to its original
5107                  * locale if we changed it */
5108                 restore_switched_locale(LC_TIME, original_time_locale);
5109
5110                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5111                             "\t?time-related strings for %s are UTF-8=%d\n",
5112                                     save_input_locale,
5113                                     is_utf8_string((U8 *) formatted_time, 0)));
5114                 is_utf8 = is_utf8_string((U8 *) formatted_time, 0);
5115                 Safefree(formatted_time);
5116                 goto finish_and_return;
5117             }
5118
5119             /* Falling off the end of the loop indicates all the names were just
5120              * ASCII.  Go on to the next test.  If we changed it, restore LC_TIME
5121              * to its original locale */
5122             restore_switched_locale(LC_TIME, original_time_locale);
5123             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5124                      "All time-related words for %s contain only ASCII;"
5125                      " can't use for determining if UTF-8 locale\n",
5126                      save_input_locale));
5127         }
5128
5129 #    endif
5130
5131 #    if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
5132
5133     /* This code is ifdefd out because it was found to not be necessary in
5134      * testing on our dromedary test machine, which has over 700 locales.
5135      * There, this added no value to looking at the currency symbol and the
5136      * time strings.  I left it in so as to avoid rewriting it if real-world
5137      * experience indicates that dromedary is an outlier.  Essentially, instead
5138      * of returning abpve if we haven't found illegal utf8, we continue on and
5139      * examine all the strerror() messages on the platform for utf8ness.  If
5140      * all are ASCII, we still don't know the answer; but otherwise we have a
5141      * pretty good indication of the utf8ness.  The reason this doesn't help
5142      * much is that the messages may not have been translated into the locale.
5143      * The currency symbol and time strings are much more likely to have been
5144      * translated.  */
5145         {
5146             int e;
5147             bool non_ascii = FALSE;
5148             const char *original_messages_locale
5149                             = switch_category_locale_to_template(LC_MESSAGES,
5150                                                                  category,
5151                                                                  save_input_locale);
5152             const char * errmsg = NULL;
5153
5154             /* Here the current LC_MESSAGES is set to the locale of the category
5155              * whose information is desired.  Look through all the messages.  We
5156              * can't use Strerror() here because it may expand to code that
5157              * segfaults in miniperl */
5158
5159             for (e = 0; e <= sys_nerr; e++) {
5160                 errno = 0;
5161                 errmsg = sys_errlist[e];
5162                 if (errno || !errmsg) {
5163                     break;
5164                 }
5165                 errmsg = savepv(errmsg);
5166                 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
5167                     non_ascii = TRUE;
5168                     is_utf8 = is_utf8_string((U8 *) errmsg, 0);
5169                     break;
5170                 }
5171             }
5172             Safefree(errmsg);
5173
5174             restore_switched_locale(LC_MESSAGES, original_messages_locale);
5175
5176             if (non_ascii) {
5177
5178                 /* Any non-UTF-8 message means not a UTF-8 locale; if all are
5179                  * valid, any non-ascii means it is one; otherwise we assume it
5180                  * isn't */
5181                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5182                                     "\t?error messages for %s are UTF-8=%d\n",
5183                                     save_input_locale,
5184                                     is_utf8));
5185                 goto finish_and_return;
5186             }
5187
5188             DEBUG_L(PerlIO_printf(Perl_debug_log,
5189                     "All error messages for %s contain only ASCII;"
5190                     " can't use for determining if UTF-8 locale\n",
5191                     save_input_locale));
5192         }
5193
5194 #    endif
5195 #    ifndef EBCDIC  /* On os390, even if the name ends with "UTF-8', it isn't a
5196                    UTF-8 locale */
5197
5198     /* As a last resort, look at the locale name to see if it matches
5199      * qr/UTF -?  * 8 /ix, or some other common locale names.  This "name", the
5200      * return of setlocale(), is actually defined to be opaque, so we can't
5201      * really rely on the absence of various substrings in the name to indicate
5202      * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
5203      * be a UTF-8 locale.  Similarly for the other common names */
5204
5205     {
5206         const Size_t final_pos = strlen(save_input_locale) - 1;
5207
5208         if (final_pos >= 3) {
5209             const char *name = save_input_locale;
5210
5211             /* Find next 'U' or 'u' and look from there */
5212             while ((name += strcspn(name, "Uu") + 1)
5213                                         <= save_input_locale + final_pos - 2)
5214             {
5215                 if (   isALPHA_FOLD_NE(*name, 't')
5216                     || isALPHA_FOLD_NE(*(name + 1), 'f'))
5217                 {
5218                     continue;
5219                 }
5220                 name += 2;
5221                 if (*(name) == '-') {
5222                     if ((name > save_input_locale + final_pos - 1)) {
5223                         break;
5224                     }
5225                     name++;
5226                 }
5227                 if (*(name) == '8') {
5228                     DEBUG_L(PerlIO_printf(Perl_debug_log,
5229                                         "Locale %s ends with UTF-8 in name\n",
5230                                         save_input_locale));
5231                     is_utf8 = TRUE;
5232                     goto finish_and_return;
5233                 }
5234             }
5235             DEBUG_L(PerlIO_printf(Perl_debug_log,
5236                                 "Locale %s doesn't end with UTF-8 in name\n",
5237                                     save_input_locale));
5238         }
5239
5240 #      ifdef WIN32
5241
5242         /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
5243         if (memENDs(save_input_locale, final_pos, "65001")) {
5244             DEBUG_L(PerlIO_printf(Perl_debug_log,
5245                         "Locale %s ends with 65001 in name, is UTF-8 locale\n",
5246                         save_input_locale));
5247             is_utf8 = TRUE;
5248             goto finish_and_return;
5249         }
5250
5251 #      endif
5252     }
5253 #    endif
5254
5255     /* Other common encodings are the ISO 8859 series, which aren't UTF-8.  But
5256      * since we are about to return FALSE anyway, there is no point in doing
5257      * this extra work */
5258
5259 #    if 0
5260     if (instr(save_input_locale, "8859")) {
5261         DEBUG_L(PerlIO_printf(Perl_debug_log,
5262                              "Locale %s has 8859 in name, not UTF-8 locale\n",
5263                              save_input_locale));
5264         is_utf8 = FALSE;
5265         goto finish_and_return;
5266     }
5267 #    endif
5268
5269     DEBUG_L(PerlIO_printf(Perl_debug_log,
5270                           "Assuming locale %s is not a UTF-8 locale\n",
5271                                     save_input_locale));
5272     is_utf8 = FALSE;
5273
5274 #  endif /* the code that is compiled when no modern LC_CTYPE */
5275
5276   finish_and_return:
5277
5278     /* Cache this result so we don't have to go through all this next time. */
5279     utf8ness_cache_size = sizeof(PL_locale_utf8ness)
5280                        - (utf8ness_cache - PL_locale_utf8ness);
5281
5282     /* But we can't save it if it is too large for the total space available */
5283     if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) {
5284         Size_t utf8ness_cache_len = strlen(utf8ness_cache);
5285
5286         /* Here it can fit, but we may need to clear out the oldest cached
5287          * result(s) to do so.  Check */
5288         if (utf8ness_cache_len + input_name_len_with_overhead
5289                                                         >= utf8ness_cache_size)
5290         {
5291             /* Here we have to clear something out to make room for this.
5292              * Start looking at the rightmost place where it could fit and find
5293              * the beginning of the entry that extends past that. */
5294             char * cutoff = (char *) my_memrchr(utf8ness_cache,
5295                                                 UTF8NESS_SEP[0],
5296                                                 utf8ness_cache_size
5297                                               - input_name_len_with_overhead);
5298
5299             assert(cutoff);
5300             assert(cutoff >= utf8ness_cache);
5301
5302             /* This and all subsequent entries must be removed */
5303             *cutoff = '\0';
5304             utf8ness_cache_len = strlen(utf8ness_cache);
5305         }
5306
5307         /* Make space for the new entry */
5308         Move(utf8ness_cache,
5309              utf8ness_cache + input_name_len_with_overhead,
5310              utf8ness_cache_len + 1 /* Incl. trailing NUL */, char);
5311
5312         /* And insert it */
5313         Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char);
5314         utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
5315
5316         if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1] & ~1) != '0') {
5317             locale_panic_(Perl_form(aTHX_
5318                                     "Corrupt utf8ness_cache=%s\nlen=%zu,"
5319                                     " inserted_name=%s, its_len=%zu",
5320                                     PL_locale_utf8ness, strlen(PL_locale_utf8ness),
5321                                     delimited, input_name_len_with_overhead));
5322         }
5323     }
5324
5325 #  ifdef DEBUGGING
5326
5327     if (DEBUG_Lv_TEST) {
5328         const char * s = PL_locale_utf8ness;
5329
5330         /* Audit the structure */
5331         while (s < PL_locale_utf8ness + strlen(PL_locale_utf8ness)) {
5332             const char *e;
5333
5334             if (*s != UTF8NESS_SEP[0]) {
5335                 locale_panic_(Perl_form(aTHX_
5336                                         "Corrupt utf8ness_cache: missing"
5337                                         " separator %.*s<-- HERE %s",
5338                                         (int) (s - PL_locale_utf8ness),
5339                                         PL_locale_utf8ness,
5340                                         s));
5341             }
5342             s++;
5343             e = strchr(s, UTF8NESS_PREFIX[0]);
5344             if (! e) {
5345                 e = PL_locale_utf8ness + strlen(PL_locale_utf8ness);
5346                 locale_panic_(Perl_form(aTHX_
5347                                         "Corrupt utf8ness_cache: missing"
5348                                         " separator %.*s<-- HERE %s",
5349                                         (int) (e - PL_locale_utf8ness),
5350                                         PL_locale_utf8ness,
5351                                         e));
5352             }
5353             e++;
5354             if (*e != '0' && *e != '1') {
5355                 locale_panic_(Perl_form(aTHX_
5356                                         "Corrupt utf8ness_cache: utf8ness"
5357                                         " must be [01] %.*s<-- HERE %s",
5358                                         (int) (e + 1 - PL_locale_utf8ness),
5359                                         PL_locale_utf8ness,
5360                                         e + 1));
5361             }
5362             if (ninstr(PL_locale_utf8ness, s, s-1, e)) {
5363                 locale_panic_(Perl_form(aTHX_
5364                                         "Corrupt utf8ness_cache: entry"
5365                                         " has duplicate %.*s<-- HERE %s",
5366                                         (int) (e - PL_locale_utf8ness),
5367                                         PL_locale_utf8ness,
5368                                         e));
5369             }
5370             s = e + 1;
5371         }
5372     }
5373
5374     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5375                 "PL_locale_utf8ness is now %s; returning %d\n",
5376                            PL_locale_utf8ness, is_utf8));
5377
5378 #  endif
5379
5380     /* free only when not using the buffer */
5381     if ( delimited != buffer ) Safefree(delimited);
5382     Safefree(save_input_locale);
5383     return is_utf8;
5384 }
5385
5386 #endif
5387
5388 bool
5389 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
5390 {
5391     /* Internal function which returns if we are in the scope of a pragma that
5392      * enables the locale category 'category'.  'compiling' should indicate if
5393      * this is during the compilation phase (TRUE) or not (FALSE). */
5394
5395     const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
5396
5397     SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0);
5398     if (! these_categories || these_categories == &PL_sv_placeholder) {
5399         return FALSE;
5400     }
5401
5402     /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
5403      * a valid unsigned */
5404     assert(category >= -1);
5405     return cBOOL(SvUV(these_categories) & (1U << (category + 1)));
5406 }
5407
5408 char *
5409 Perl_my_strerror(pTHX_ const int errnum)
5410 {
5411     /* Returns a mortalized copy of the text of the error message associated
5412      * with 'errnum'.  It uses the current locale's text unless the platform
5413      * doesn't have the LC_MESSAGES category or we are not being called from
5414      * within the scope of 'use locale'.  In the former case, it uses whatever
5415      * strerror returns; in the latter case it uses the text from the C locale.
5416      *
5417      * The function just calls strerror(), but temporarily switches, if needed,
5418      * to the C locale */
5419
5420     char *errstr;
5421
5422 #ifndef USE_LOCALE_MESSAGES
5423
5424     /* If platform doesn't have messages category, we don't do any switching to
5425      * the C locale; we just use whatever strerror() returns */
5426
5427     errstr = savepv(Strerror(errnum));
5428
5429 #else   /* Has locale messages */
5430
5431     const bool within_locale_scope = IN_LC(LC_MESSAGES);
5432
5433 #  ifndef USE_LOCALE_THREADS
5434
5435     /* This function is trivial without threads. */
5436     if (within_locale_scope) {
5437         errstr = savepv(Strerror(errnum));
5438     }
5439     else {
5440         const char * save_locale = savepv(querylocale_c(LC_MESSAGES));
5441
5442         void_setlocale_c(LC_MESSAGES, "C");
5443         errstr = savepv(Strerror(errnum));
5444         void_setlocale_c(LC_MESSAGES, save_locale);
5445         Safefree(save_locale);
5446     }
5447
5448 #  elif defined(USE_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
5449
5450     /* This function is also trivial if we don't have to worry about thread
5451      * safety and have strerror_l(), as it handles the switch of locales so we
5452      * don't have to deal with that.  We don't have to worry about thread
5453      * safety if strerror_r() is also available.  Both it and strerror_l() are
5454      * thread-safe.  Plain strerror() isn't thread safe.  But on threaded
5455      * builds when strerror_r() is available, the apparent call to strerror()
5456      * below is actually a macro that behind-the-scenes calls strerror_r(). */
5457
5458 #    ifdef HAS_STRERROR_R
5459
5460     if (within_locale_scope) {
5461         errstr = savepv(Strerror(errnum));
5462     }
5463     else {
5464         errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
5465     }
5466
5467 #    else
5468
5469     /* Here we have strerror_l(), but not strerror_r() and we are on a
5470      * threaded-build.  We use strerror_l() for everything, constructing a
5471      * locale to pass to it if necessary */
5472
5473     locale_t locale_to_use;
5474
5475     if (within_locale_scope) {
5476         locale_to_use = use_curlocale_scratch();
5477     }
5478     else {  /* Use C locale if not within 'use locale' scope */
5479         locale_to_use = PL_C_locale_obj;
5480     }
5481
5482     errstr = savepv(strerror_l(errnum, locale_to_use));
5483
5484 #    endif
5485 #  else /* Doesn't have strerror_l() */
5486
5487     const char * save_locale = NULL;
5488     bool locale_is_C = FALSE;
5489
5490     /* We have a critical section to prevent another thread from executing this
5491      * same code at the same time.  (On thread-safe perls, the LOCK is a
5492      * no-op.)  Since this is the only place in core that changes LC_MESSAGES
5493      * (unless the user has called setlocale(), this works to prevent races. */
5494     SETLOCALE_LOCK;
5495
5496     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5497                             "my_strerror called with errnum %d\n", errnum));
5498     if (! within_locale_scope) {
5499         save_locale = querylocale_c(LC_MESSAGES);
5500         if (! save_locale) {
5501             SETLOCALE_UNLOCK;
5502             locale_panic_("Could not find current LC_MESSAGES locale");
5503         }
5504         else {
5505             locale_is_C = isNAME_C_OR_POSIX(save_locale);
5506
5507             /* Switch to the C locale if not already in it */
5508             if (! locale_is_C) {
5509
5510                 /* The setlocale() just below likely will zap 'save_locale', so
5511                  * create a copy.  */
5512                 save_locale = savepv(save_locale);
5513                 if (! bool_setlocale_c(LC_MESSAGES, "C")) {
5514
5515                     /* If, for some reason, the locale change failed, we
5516                      * soldier on as best as possible under the circumstances,
5517                      * using the current locale, and clear save_locale, so we
5518                      * don't try to change back.  On z/0S, all setlocale()
5519                      * calls fail after you've created a thread.  This is their
5520                      * way of making sure the entire process is always a single
5521                      * locale.  This means that 'use locale' is always in place
5522                      * for messages under these circumstances. */
5523                     Safefree(save_locale);
5524                     save_locale = NULL;
5525                 }
5526             }
5527         }
5528     }   /* end of ! within_locale_scope */
5529     else {
5530         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "WITHIN locale scope\n"));
5531     }
5532
5533     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5534              "Any locale change has been done; about to call Strerror\n"));
5535     errstr = savepv(Strerror(errnum));
5536
5537     if (! within_locale_scope) {
5538         if (save_locale && ! locale_is_C) {
5539             if (! bool_setlocale_c(LC_MESSAGES, save_locale)) {
5540                 SETLOCALE_UNLOCK;
5541                 locale_panic_(Perl_form(aTHX_
5542                                         "setlocale restore to '%s' failed",
5543                                         save_locale));
5544             }
5545             Safefree(save_locale);
5546         }
5547     }
5548
5549     SETLOCALE_UNLOCK;
5550
5551 #  endif /* End of doesn't have strerror_l */
5552
5553     DEBUG_Lv((PerlIO_printf(Perl_debug_log,
5554               "Strerror returned; saving a copy: '"),
5555               print_bytes_for_locale(errstr, errstr + strlen(errstr), 0),
5556               PerlIO_printf(Perl_debug_log, "'\n")));
5557
5558 #endif   /* End of does have locale messages */
5559
5560     SAVEFREEPV(errstr);
5561     return errstr;
5562 }
5563
5564 /*
5565
5566 =for apidoc switch_to_global_locale
5567
5568 On systems without locale support, or on typical single-threaded builds, or on
5569 platforms that do not support per-thread locale operations, this function does
5570 nothing.  On such systems that do have locale support, only a locale global to
5571 the whole program is available.
5572
5573 On multi-threaded builds on systems that do have per-thread locale operations,
5574 this function converts the thread it is running in to use the global locale.
5575 This is for code that has not yet or cannot be updated to handle multi-threaded
5576 locale operation.  As long as only a single thread is so-converted, everything
5577 works fine, as all the other threads continue to ignore the global one, so only
5578 this thread looks at it.
5579
5580 However, on Windows systems this isn't quite true prior to Visual Studio 15,
5581 at which point Microsoft fixed a bug.  A race can occur if you use the
5582 following operations on earlier Windows platforms:
5583
5584 =over
5585
5586 =item L<POSIX::localeconv|POSIX/localeconv>
5587
5588 =item L<I18N::Langinfo>, items C<CRNCYSTR> and C<THOUSEP>
5589
5590 =item L<perlapi/Perl_langinfo>, items C<CRNCYSTR> and C<THOUSEP>
5591
5592 =back
5593
5594 The first item is not fixable (except by upgrading to a later Visual Studio
5595 release), but it would be possible to work around the latter two items by using
5596 the Windows API functions C<GetNumberFormat> and C<GetCurrencyFormat>; patches
5597 welcome.
5598
5599 Without this function call, threads that use the L<C<setlocale(3)>> system
5600 function will not work properly, as all the locale-sensitive functions will
5601 look at the per-thread locale, and C<setlocale> will have no effect on this
5602 thread.
5603
5604 Perl code should convert to either call
5605 L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in for the system
5606 C<setlocale>) or use the methods given in L<perlcall> to call
5607 L<C<POSIX::setlocale>|POSIX/setlocale>.  Either one will transparently properly
5608 handle all cases of single- vs multi-thread, POSIX 2008-supported or not.
5609
5610 Non-Perl libraries, such as C<gtk>, that call the system C<setlocale> can
5611 continue to work if this function is called before transferring control to the
5612 library.
5613
5614 Upon return from the code that needs to use the global locale,
5615 L<C<sync_locale()>|perlapi/sync_locale> should be called to restore the safe
5616 multi-thread operation.
5617
5618 =cut
5619 */
5620
5621 void
5622 Perl_switch_to_global_locale()
5623 {
5624     dTHX;
5625
5626 #ifdef USE_THREAD_SAFE_LOCALE
5627 #  ifdef WIN32
5628
5629     _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
5630
5631 #  else
5632
5633     {
5634         unsigned int i;
5635
5636         for (i = 0; i < LC_ALL_INDEX_; i++) {
5637             setlocale(categories[i], querylocale_i(i));
5638         }
5639     }
5640
5641     uselocale(LC_GLOBAL_LOCALE);
5642
5643 #  endif
5644 #endif
5645
5646 }
5647
5648 /*
5649
5650 =for apidoc sync_locale
5651
5652 L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or
5653 change the locale (though changing the locale is antisocial and dangerous on
5654 multi-threaded systems that don't have multi-thread safe locale operations.
5655 (See L<perllocale/Multi-threaded operation>).  Using the system
5656 L<C<setlocale(3)>> should be avoided.  Nevertheless, certain non-Perl libraries
5657 called from XS, such as C<Gtk> do so, and this can't be changed.  When the
5658 locale is changed by XS code that didn't use
5659 L<C<Perl_setlocale>|perlapi/Perl_setlocale>, Perl needs to be told that the
5660 locale has changed.  Use this function to do so, before returning to Perl.
5661
5662 The return value is a boolean: TRUE if the global locale at the time of call
5663 was in effect; and FALSE if a per-thread locale was in effect.  This can be
5664 used by the caller that needs to restore things as-they-were to decide whether
5665 or not to call
5666 L<C<Perl_switch_to_global_locale>|perlapi/switch_to_global_locale>.
5667
5668 =cut
5669 */
5670
5671 bool
5672 Perl_sync_locale()
5673 {
5674
5675 #ifndef USE_LOCALE
5676
5677     return TRUE;
5678
5679 #else
5680
5681     const char * newlocale;
5682     dTHX;
5683
5684 #  ifdef USE_POSIX_2008_LOCALE
5685
5686     bool was_in_global_locale = FALSE;
5687     locale_t cur_obj = uselocale((locale_t) 0);
5688
5689     /* On Windows, unless the foreign code has turned off the thread-safe
5690      * locale setting, any plain setlocale() will have affected what we see, so
5691      * no need to worry.  Otherwise, If the foreign code has done a plain
5692      * setlocale(), it will only affect the global locale on POSIX systems, but
5693      * will affect the */
5694     if (cur_obj == LC_GLOBAL_LOCALE) {
5695
5696 #    ifdef HAS_QUERY_LOCALE
5697
5698         void_setlocale_c(LC_ALL, querylocale_c(LC_ALL));
5699
5700 #    else
5701
5702         unsigned int i;
5703
5704         /* We can't trust that we can read the LC_ALL format on the
5705          * platform, so do them individually */
5706         for (i = 0; i < LC_ALL_INDEX_; i++) {
5707             void_setlocale_i(i, querylocale_i(i));
5708         }
5709
5710 #    endif
5711
5712         was_in_global_locale = TRUE;
5713     }
5714
5715 #  else
5716
5717     bool was_in_global_locale = TRUE;
5718
5719 #  endif
5720 #  ifdef USE_LOCALE_CTYPE
5721
5722     newlocale = savepv(querylocale_c(LC_CTYPE));
5723     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5724                   "%s\n", setlocale_debug_string_c(LC_CTYPE, NULL, newlocale)));
5725     new_ctype(newlocale);
5726     Safefree(newlocale);
5727
5728 #  endif /* USE_LOCALE_CTYPE */
5729 #  ifdef USE_LOCALE_COLLATE
5730
5731     newlocale = savepv(querylocale_c(LC_COLLATE));
5732     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5733                 "%s\n", setlocale_debug_string_c(LC_COLLATE, NULL, newlocale)));
5734     new_collate(newlocale);
5735     Safefree(newlocale);
5736
5737 #  endif
5738 #  ifdef USE_LOCALE_NUMERIC
5739
5740     newlocale = savepv(querylocale_c(LC_NUMERIC));
5741     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5742                 "%s\n", setlocale_debug_string_c(LC_NUMERIC, NULL, newlocale)));
5743     new_numeric(newlocale);
5744     Safefree(newlocale);
5745
5746 #  endif /* USE_LOCALE_NUMERIC */
5747
5748     return was_in_global_locale;
5749
5750 #endif
5751
5752 }
5753
5754 #if defined(DEBUGGING) && defined(USE_LOCALE)
5755
5756 STATIC char *
5757 S_setlocale_debug_string_i(const unsigned cat_index,
5758                            const char* const locale, /* Optional locale name */
5759
5760                             /* return value from setlocale() when attempting to
5761                              * set 'category' to 'locale' */
5762                             const char* const retval)
5763 {
5764     /* Returns a pointer to a NUL-terminated string in static storage with
5765      * added text about the info passed in.  This is not thread safe and will
5766      * be overwritten by the next call, so this should be used just to
5767      * formulate a string to immediately print or savepv() on. */
5768
5769     static char ret[1024];
5770     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
5771
5772     my_strlcpy(ret, "setlocale(", sizeof(ret));
5773     my_strlcat(ret, category_names[cat_index], sizeof(ret));
5774     my_strlcat(ret, ", ", sizeof(ret));
5775
5776     if (locale) {
5777         my_strlcat(ret, "\"", sizeof(ret));
5778         my_strlcat(ret, locale, sizeof(ret));
5779         my_strlcat(ret, "\"", sizeof(ret));
5780     }
5781     else {
5782         my_strlcat(ret, "NULL", sizeof(ret));
5783     }
5784
5785     my_strlcat(ret, ") returned ", sizeof(ret));
5786
5787     if (retval) {
5788         my_strlcat(ret, "\"", sizeof(ret));
5789         my_strlcat(ret, retval, sizeof(ret));
5790         my_strlcat(ret, "\"", sizeof(ret));
5791     }
5792     else {
5793         my_strlcat(ret, "NULL", sizeof(ret));
5794     }
5795
5796     assert(strlen(ret) < sizeof(ret));
5797
5798     return ret;
5799 }
5800
5801 #endif
5802
5803 void
5804 Perl_thread_locale_init()
5805 {
5806     /* Called from a thread on startup*/
5807
5808 #ifdef USE_THREAD_SAFE_LOCALE
5809
5810     dTHX_DEBUGGING;
5811
5812
5813      DEBUG_L(PerlIO_printf(Perl_debug_log,
5814             "new thread, initial locale is %s; calling setlocale\n",
5815             setlocale(LC_ALL, NULL)));
5816
5817 #  ifdef WIN32
5818
5819     /* On Windows, make sure new thread has per-thread locales enabled */
5820     _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
5821
5822 #  else
5823
5824     /* This thread starts off in the C locale */
5825     Perl_setlocale(LC_ALL, "C");
5826
5827 #  endif
5828 #endif
5829
5830 }
5831
5832 void
5833 Perl_thread_locale_term()
5834 {
5835     /* Called from a thread as it gets ready to terminate */
5836
5837 #ifdef USE_POSIX_2008_LOCALE
5838
5839     /* C starts the new thread in the global C locale.  If we are thread-safe,
5840      * we want to not be in the global locale */
5841
5842     {   /* Free up */
5843         locale_t cur_obj = uselocale(LC_GLOBAL_LOCALE);
5844         if (cur_obj != LC_GLOBAL_LOCALE && cur_obj != PL_C_locale_obj) {
5845             freelocale(cur_obj);
5846         }
5847     }
5848
5849 #endif
5850
5851 }
5852
5853 /*
5854  * ex: set ts=8 sts=4 sw=4 et:
5855  */