locale.c

   1 /*    locale.c
   2  *
   3  *    Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
   4  *    2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  *      A Elbereth Gilthoniel,
  13  *      silivren penna míriel
  14  *      o menel aglar elenath!
  15  *      Na-chaered palan-díriel
  16  *      o galadhremmin ennorath,
  17  *      Fanuilos, le linnathon
  18  *      nef aear, si nef aearon!
  19  *
  20  *     [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
  21  */
  22
  23 /* utility functions for handling locale-specific stuff like what
  24  * character represents the decimal point.
  25  *
  26  * All C programs have an underlying locale.  Perl code generally doesn't pay
  27  * any attention to it except within the scope of a 'use locale'.  For most
  28  * categories, it accomplishes this by just using different operations if it is
  29  * in such scope than if not.  However, various libc functions called by Perl
  30  * are affected by the LC_NUMERIC category, so there are macros in perl.h that
  31  * are used to toggle between the current locale and the C locale depending on
  32  * the desired behavior of those functions at the moment.  And, LC_MESSAGES is
  33  * switched to the C locale for outputting the message unless within the scope
  34  * of 'use locale'.
  35  *
  36  * There is more than the typical amount of variation between platforms with
  37  * regard to locale handling.  At the end of these introductory comments, are
  38  * listed various relevent Configuration options, including some that can be
  39  * used to pretend to some extent that this is being developed on a different
  40  * platform than it actually is.  This allows you to make changes and catch
  41  * some errors without having access to those other platforms.
  42  *
  43  * This code now has multi-thread-safe locale handling on systems that support
  44  * that.  This is completely transparent to most XS code.  On earlier systems,
  45  * it would be possible to emulate thread-safe locales, but this likely would
  46  * involve a lot of locale switching, and would require XS code changes.
  47  * Macros could be written so that the code wouldn't have to know which type of
  48  * system is being used.
  49  *
  50  * Table-driven code is used for simplicity and clarity, as many operations
  51  * differ only in which category is being worked on.  However the system
  52  * categories need not be small contiguous integers, so do not lend themselves
  53  * to table lookup.  Instead we have created our own equivalent values which
  54  * are all small contiguous non-negative integers, and translation functions
  55  * between the two sets.  For category 'LC_foo', the name of our index is
  56  * LC_foo_INDEX_.  Various parallel tables, indexed by these, are used for the
  57  * translation.  The tables are generated at compile-time based on platform
  58  * characteristics and Configure options.  They hide from the code many of the
  59  * vagaries of the different locale implementations out there.
  60  *
  61  * On unthreaded perls, most operations expand out to just the basic
  62  * setlocale() calls.  That sort of is true on threaded perls on modern Windows
  63  * systems where the same API, after set up, is used for thread-safe locale
  64  * handling.  (But there are complications on Windows due to internal character
  65  * set issues.)  On other systems, there is a completely different API,
  66  * specified in POSIX 2008, to do thread-safe locales.  On these systems, our
  67  * bool_setlocale_2008_i() function is used to hide the different API from the
  68  * outside.  This makes it completely transparent to most XS code.
  69  *
  70  * A huge complicating factor is that the LC_NUMERIC category is normally held
  71  * in the C locale, except during those relatively rare times when it needs to
  72  * be in the underlying locale.  There is a bunch of code to accomplish this,
  73  * and to allow easy switches from one state to the other.
  74  *
  75  * In addition, the setlocale equivalents have versions for the return context,
  76  * 'void' and 'bool', besides the full return value.  This can present
  77  * opportunities for avoiding work.  We don't have to necessarily create a safe
  78  * copy to return if no return is desired.
  79  *
  80  * There are 3.5 major implementations here; which one chosen depends on what
  81  * the platform has available, and Configuration options.
  82  *
  83  * 1) Raw posix_setlocale().  This implementation is basically the libc
  84  *    setlocale(), with possibly minor tweaks.  This is used for startup, and
  85  *    always for unthreaded perls, and when the API for safe locale threading
  86  *    is identical to the unsafe API (Windows, currently).
  87  *
  88  *    This implementation is composed of two layers:
  89  *      a)  posix_setlocale() implements the libc setlocale().  In most cases,
  90  *          it is just an alias for the libc version.  But Windows doesn't
  91  *          fully conform to the POSIX standard, and this is a layer on top of
  92  *          libc to bring it more into conformance.  And in Configurations
  93  *          where perl is to ignore some locale categories that the libc
  94  *          setlocale() knows about, there is a layer to cope with that.
  95  *      b)  stdized_setlocale() is a layer above a) that fixes some vagaries in
  96  *          the return value of the libc setlocale().  On most platforms this
  97  *          layer is empty; in order to be activated, it requires perl to be
  98  *          Configured with a parameter indicating the platform's defect.  The
  99  *          current ones are listed at the definition of the macro.
 100  *
 101  * 2) An implementation that adds a minimal layer above implementation 1),
 102  *    making that implementation uninterruptible and returning a
 103  *    per-thread/per-category value.
 104  *
 105  * 3a and 3b) An implementation of POSIX 2008 thread-safe locale handling,
 106  *    hiding from the programmer the completely different API for this.
 107  *    This automatically makes almost all code thread-safe without need for
 108  *    changes.  This implementation is chosen on threaded perls when the
 109  *    platform properly supports the POSIX 2008 functions, and when there is no
 110  *    manual override to the contrary passed to Configure.
 111  *
 112  *    3a) is when the platform has a documented reliable querylocale() function
 113  *        or equivalent that is selected to be used.
 114  *    3b) is when we have to emulate that functionality.
 115  *
 116  *    Unfortunately, it seems that some platforms that claim to support these
 117  *    are buggy, in one way or another.  There are workarounds encoded here,
 118  *    where feasible, for platforms where the bugs are amenable to that
 119  *    (glibc, for example).  But other platforms instead don't use this
 120  *    implementation.
 121  *
 122  * z/OS (os390) is an outlier.  Locales really don't work under threads when
 123  * either the radix character isn't a dot, or attempts are made to change
 124  * locales after the first thread is created.  The reason is that IBM has made
 125  * it thread-safe by refusing to change locales (returning failure if
 126  * attempted) any time after an application has called pthread_create() to
 127  * create another thread.  The expectation is that an application will set up
 128  * its locale information before the first fork, and be stable thereafter.  But
 129  * perl toggles LC_NUMERIC if the locale's radix character isn't a dot, as do
 130  * the other toggles, which are less common.
 131  *
 132  * Associated with each implementation are three sets of macros that translate
 133  * a consistent API into what that implementation needs.  Each set consists of
 134  * three macros with the suffixes:
 135  *  _c  Means the argument is a locale category number known at compile time.
 136  *          An example would be LC_TIME.  This token is a compile-time constant
 137  *          and can be passed to a '_c' macro.
 138  *  _r  Means the argument is a locale category number whose value might not be
 139  *      known until runtime
 140  *  _i  Means the argument is our internal index of a locale category
 141  *
 142  * The three sets are:    ('_X'  means one of '_c', '_r', '_i')
 143  * 1) bool_setlocale_X()
 144  *      This calls the appropriate setlocale()-equivalent for the
 145  *      implementation, with the category and new locale.  The input locale is
 146  *      not necessarily valid, so the return is true or false depending on
 147  *      whether or not the setlocale() succeeded.  This is not used for
 148  *      querying the locale, so the input locale must not be NULL.
 149  *
 150  *      This macro is suitable for toggling the locale back and forth during an
 151  *      operation.  For example, the names of days and months under LC_TIME are
 152  *      strings that are also subject to LC_CTYPE.  If the locales of these two
 153  *      categories differ, mojibake can result on many platforms.  The code
 154  *      here will toggle LC_CTYPE into the locale of LC_TIME temporarily to
 155  *      avoid this.
 156  *
 157  *      Several categories require extra work when their locale is changed.
 158  *      LC_CTYPE, for example, requires the calculation of the table of which
 159  *      characters fold to which others under /i pattern matching or fc(), as
 160  *      folding is not a concept in POSIX.  This table isn't needed when the
 161  *      LC_CTYPE locale gets toggled during an operation, and will be toggled
 162  *      back before return to the caller.  To save work that would be
 163  *      discarded, the bool_setlocale_X() implementations don't do this extra
 164  *      work.  Instead, there is a separate function for just this purpose to
 165  *      be done before control is transferred back to the external caller.  All
 166  *      categories that have such requirements have such a function.  The
 167  *      update_functions[] array contains pointers to them (or NULL for
 168  *      categories which don't need a function).
 169  *
 170  *      Care must be taken to remember to call the separate function before
 171  *      returning to an external caller, and to not use things it updates
 172  *      before its call.  An alternative approach would be to have
 173  *      bool_setlocale_X() always call the update, which would return
 174  *      immediately if a flag wasn't set indicating it was time to actually
 175  *      perform it.
 176  *
 177  * 2) void_setlocale_X()
 178  *      This is like bool_setlocale_X(), but it is used only when it is
 179  *      expected that the call must succeed, or something is seriously wrong.
 180  *      A panic is issued if it fails.  The caller uses this form when it just
 181  *      wants to assume things worked.
 182  *
 183  * 3) querylocale_X()
 184  *      This returns a string that specifies the current locale for the given
 185  *      category given by the input argument.  The string is safe from other
 186  *      threads zapping it, and the caller need not worry about freeing it, but
 187  *      it may be mortalized, so must be copied if you need to preserve it
 188  *      across calls, or long term.  This returns the actual current locale,
 189  *      not the nominal.  These differ, for example, when LC_NUMERIC is
 190  *      supposed to be a locale whose decimal radix character is a comma.  As
 191  *      mentioned above, Perl actually keeps this category set to C in such
 192  *      circumstances so that XS code can just assume a dot radix character.
 193  *      querylocale_X() returns the locale that libc has stored at this moment,
 194  *      so most of the time will return a locale whose radix character is a
 195  *      dot.  The macro query_nominal_locale_i() can be used to get the nominal
 196  *      locale that an external caller would expect, for all categories except
 197  *      LC_ALL.  For that, you can use the function
 198  *      S_calculate_LC_ALL_string().  Or S_native_querylocale_i() will operate
 199  *      on any category.
 200  *
 201  * The underlying C API that this implements uses category numbers, hence the
 202  * code is structured to use '_r' at the API level to convert to indexes, which
 203  * are then used internally with the '_i' forms.
 204  *
 205  * The splitting apart into setting vs querying means that the return value of
 206  * the bool macros is not subject to potential clashes with other threads,
 207  * eliminating any need for the calling code to worry about that and get it
 208  * wrong.  Whereas, you do have to think about thread interactions when using a
 209  * query.
 210  *
 211  * Additionally, for the implementations where there aren't any complications,
 212  * a setlocale_i() is defined that is like plain setlocale(), returning the new
 213  * locale.  Thus it combines a bool_setlocale_X() with a querylocale_X().  It
 214  * is used only for performance on implementations that allow it, such as
 215  * non-threaded perls.
 216  *
 217  * There are also a few other macros herein that use this naming convention to
 218  * describe their category parameter.
 219  *
 220  * Relevant Configure options
 221  *
 222  *      -Accflags=-DNO_LOCALE
 223  *          This compiles perl to always use the C locale, ignoring any
 224  *          attempts to change it.  This could be useful on platforms with a
 225  *          crippled locale implementation.
 226  *
 227  *      -Accflags=-DNO_THREAD_SAFE_LOCALE
 228  *          Even if thread-safe operations are available on this platform and
 229  *          would otherwise be used (because this is a perl with multiplicity),
 230  *          perl is compiled to not use them.  This could be useful on
 231  *          platforms where the libc is buggy.
 232  *
 233  *      -Accflags=-DNO_POSIX_2008_LOCALE
 234  *          Even if the libc locale operations specified by the Posix 2008
 235  *          Standard are available on this platform and would otherwise be used
 236  *          (because this is a perl with multiplicity), perl is compiled to not
 237  *          use them.  This could be useful on platforms where the libc is
 238  *          buggy.  This is like NO_THREAD_SAFE_LOCALE, but has no effect on
 239  *          platforms that don't have these functions.
 240  *
 241  *      -Accflags=-DUSE_POSIX_2008_LOCALE
 242  *          Normally, setlocale() is used for locale operations on perls
 243  *          compiled without multiplicity.  This option causes the locale
 244  *          operations defined by the Posix 2008 Standard to always be used
 245  *          instead.  This could be useful on platforms where the libc
 246  *          setlocale() is buggy.
 247  *
 248  *      -Accflags=-DNO_THREAD_SAFE_QUERYLOCALE
 249  *          This applies only to platforms that have a querylocale() libc
 250  *          function.  perl assumes that that function is thread-safe, unless
 251  *          overridden by this, typically in a hints file.  When overridden,
 252  *          querylocale() is called only while the locale mutex is locked, and
 253  *          the result is copied to a per-thread place before unlocking.
 254  *
 255  *      -Accflags=-DUSE_NL_LOCALE_NAME
 256  *          glibc has an undocumented equivalent function to querylocale().  It
 257  *          currently isn't used by default because it is undocumented.  But
 258  *          testing hasn't found any problems with it.  Using this Configure
 259  *          option enables it on systems that have it (with no effect on
 260  *          systems lacking it).  Enabling this removes the need for perl
 261  *          to keep its own records, hence is more efficient and guaranteed to
 262  *          be accurate.
 263  *
 264  *      -Accflags=-DNO_LOCALE_CTYPE
 265  *      -Accflags=-DNO_LOCALE_NUMERIC
 266  *          etc.
 267  *
 268  *          If the named category(ies) does(do) not exist on this platform,
 269  *          these have no effect.  Otherwise they cause perl to be compiled to
 270  *          always keep the named category(ies) in the C locale.
 271  *
 272  *      -Accflags=-DHAS_BROKEN_SETLOCALE_QUERY_LC_ALL
 273  *          This would be set in a hints file to tell perl that doing a libc
 274  *              setlocale(LC_ALL, NULL)
 275  *          can give erroneous results, and perl will compensate to get the
 276  *          correct results.  This is known to be a problem in earlier AIX
 277  *          versions
 278  *
 279  *      -Accflags=-DHAS_LF_IN_SETLOCALE_RETURN
 280  *          This would be set in a hints file to tell perl that a libc
 281  *          setlocale() can return results containing \n characters that need
 282  *          to be stripped off.  khw believes there aren't any such platforms
 283  *          still in existence.
 284  *
 285  *      -Accflags=-DLIBC_HANDLES_MISMATCHED_CTYPE
 286  *          Consider the name of a month in some language, Chinese for example.
 287  *          If LC_TIME has been set to a Chinese locale, strftime() can be used
 288  *          to generate the Chinese month name for any given date, by using the
 289  *          %B format.  But also suppose that LC_CTYPE is set to, say, "C".
 290  *          The return from strftime() on many platforms will be mojibake given
 291  *          that no Chinese month name is composed of just ASCII characters.
 292  *          Perl handles this for you by automatically toggling LC_CTYPE to
 293  *          whatever LC_TIME is during the execution of strftime(), and
 294  *          afterwards restoring it to its prior value.  But the strftime()
 295  *          (and similar functions) in some libc implementations already do
 296  *          this toggle, meaning perl's action is redundant.  You can tell perl
 297  *          that a libc does this by setting this Configure option, and it will
 298  *          skip its syncing LC_CTYPE and whatever the other locale is.
 299  *          Currently, perl ignores this Configuration option and  syncs anyway
 300  *          for LC_COLLATE-related operations, due to perl's internal needs.
 301  *
 302  *      -Accflags=USE_FAKE_LC_ALL_POSITIONAL_NOTATION
 303  *          This is used when developing Perl on a platform that uses
 304  *          'name=value;' notation to represent LC_ALL when not all categories
 305  *          are the same.  When so compiled, much of the code gets compiled
 306  *          and exercised that applies to platforms that instead use positional
 307  *          notation.  This allows for finding many bugs in that portion of the
 308  *          implementation, without having to access such a platform.
 309  *
 310  *      -Accflags=-DWIN32_USE_FAKE_OLD_MINGW_LOCALES
 311  *          This is used when developing Perl on a non-Windows platform to
 312  *          compile and exercise much of the locale-related code that instead
 313  *          applies to MingW platforms that don't use the more modern UCRT
 314  *          library.  This allows for finding many bugs in that portion of the
 315  *          implementation, without having to access such a platform.
 316  */
 317
 318 /* If the environment says to, we can output debugging information during
 319  * initialization.  This is done before option parsing, and before any thread
 320  * creation, so can be a file-level static.  (Must come before #including
 321  * perl.h) */
 322 #include "config.h"
 323
 324 /* Returns the Unix errno portion; ignoring any others.  This is a macro here
 325  * instead of putting it into perl.h, because unclear to khw what should be
 326  * done generally. */
 327 #define GET_ERRNO   saved_errno
 328
 329 #ifdef DEBUGGING
 330 static int debug_initialization = 0;
 331 #  define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
 332 #  define DEBUG_LOCALE_INITIALIZATION_  debug_initialization
 333
 334 #  ifdef HAS_EXTENDED_OS_ERRNO
 335      /* Output the non-zero errno and/or the non-zero extended errno */
 336 #    define DEBUG_ERRNO                                                     \
 337         dSAVE_ERRNO; dTHX;                                                  \
 338         int extended = get_extended_os_errno();                             \
 339         const char * errno_string;                                          \
 340         if (GET_ERRNO == 0) { /* Skip output if both errno types are 0 */   \
 341             if (LIKELY(extended == 0)) errno_string = "";                   \
 342             else errno_string = Perl_form(aTHX_ "; $^E=%d", extended);      \
 343         }                                                                   \
 344         else if (LIKELY(extended == GET_ERRNO))                             \
 345             errno_string = Perl_form(aTHX_ "; $!=%d", GET_ERRNO);           \
 346         else errno_string = Perl_form(aTHX_ "; $!=%d, $^E=%d",              \
 347                                                     GET_ERRNO, extended);
 348 #  else
 349      /* Output the errno, if non-zero */
 350 #    define DEBUG_ERRNO                                                     \
 351         dSAVE_ERRNO;                                                        \
 352         const char * errno_string = "";                                     \
 353         if (GET_ERRNO != 0) {                                               \
 354             dTHX;                                                           \
 355             errno_string = Perl_form(aTHX_ "; $!=%d", GET_ERRNO);           \
 356         }
 357 #  endif
 358
 359 /* Automatically include the caller's file, and line number in debugging output;
 360  * and the errno (and/or extended errno) if non-zero.  On threaded perls add
 361  * the aTHX too. */
 362 #  if defined(USE_ITHREADS) && ! defined(NO_LOCALE_THREADS)
 363 #    define DEBUG_PRE_STMTS                                                 \
 364         DEBUG_ERRNO;                                                        \
 365         PerlIO_printf(Perl_debug_log, "\n%s: %" LINE_Tf ": 0x%p%s: ",       \
 366                                       __FILE__, (line_t)__LINE__, aTHX_     \
 367                                       errno_string);
 368 #  else
 369 #    define DEBUG_PRE_STMTS                                                 \
 370         DEBUG_ERRNO;                                                        \
 371         PerlIO_printf(Perl_debug_log, "\n%s: %" LINE_Tf "%s: ",             \
 372                                       __FILE__, (line_t)__LINE__,           \
 373                                       errno_string);
 374 #  endif
 375 #  define DEBUG_POST_STMTS  RESTORE_ERRNO;
 376 #else
 377 #  define debug_initialization 0
 378 #  define DEBUG_INITIALIZATION_set(v)
 379 #  define DEBUG_PRE_STMTS
 380 #  define DEBUG_POST_STMTS
 381 #endif
 382
 383 #include "EXTERN.h"
 384 #define PERL_IN_LOCALE_C
 385 #include "perl.h"
 386
 387 /* Some platforms require LC_CTYPE to be congruent with the category we are
 388  * looking for.  XXX This still presumes that we have to match COLLATE and
 389  * CTYPE even on platforms that apparently handle this. */
 390 #if defined(USE_LOCALE_CTYPE) && ! defined(LIBC_HANDLES_MISMATCHED_CTYPE)
 391 #  define WE_MUST_DEAL_WITH_MISMATCHED_CTYPE
 392 #  define start_DEALING_WITH_MISMATCHED_CTYPE(locale)                          \
 393         const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale)
 394 #  define end_DEALING_WITH_MISMATCHED_CTYPE(locale)                            \
 395         restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
 396 #else
 397 #  define start_DEALING_WITH_MISMATCHED_CTYPE(locale)
 398 #  define end_DEALING_WITH_MISMATCHED_CTYPE(locale)
 399 #endif
 400
 401 #if PERL_VERSION_GT(5,39,9)
 402 #  error Revert the commit that added this line
 403 #endif
 404
 405 #ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES
 406
 407    /* Use -Accflags=-DWIN32_USE_FAKE_OLD_MINGW_LOCALES on a POSIX or *nix box
 408     * to get a semblance of pretending the locale handling is that of a MingW
 409     * that doesn't use UCRT (hence 'OLD' in the name).  This exercizes code
 410     * paths that are not compiled on non-Windows boxes, and allows for ASAN
 411     * and PERL_MEMLOG.  This is thus a way to see if locale.c on Windows is
 412     * likely going to compile, without having to use a real Win32 box.  And
 413     * running the test suite will verify to a large extent our logic and memory
 414     * allocation handling for such boxes.  Of course the underlying calls are
 415     * to the POSIX libc, so any differences in implementation between those and
 416     * the Windows versions will not be caught by this. */
 417
 418 #  define WIN32
 419 #  undef P_CS_PRECEDES
 420 #  undef CURRENCY_SYMBOL
 421 #  define CP_UTF8 -1
 422 #  undef _configthreadlocale
 423 #  define _configthreadlocale(arg) NOOP
 424
 425 #  define MultiByteToWideChar(cp, flags, byte_string, m1, wstring, req_size) \
 426                     (PERL_UNUSED_ARG(cp),                                    \
 427                      mbsrtowcs(wstring, &(byte_string), req_size, NULL) + 1)
 428 #  define WideCharToMultiByte(cp, flags, wstring, m1, byte_string,          \
 429                               req_size, default_char, found_default_char)   \
 430                     (PERL_UNUSED_ARG(cp),                                   \
 431                      wcsrtombs(byte_string, &(wstring), req_size, NULL) + 1)
 432
 433 #  ifdef USE_LOCALE
 434
 435 static const wchar_t * wsetlocale_buf = NULL;
 436 static Size_t wsetlocale_buf_size = 0;
 437
 438 #    ifdef MULTIPLICITY
 439
 440 static PerlInterpreter * wsetlocale_buf_aTHX = NULL;
 441
 442 #    endif
 443
 444 STATIC
 445 const wchar_t *
 446 S_wsetlocale(const int category, const wchar_t * wlocale)
 447 {
 448     /* Windows uses a setlocale that takes a wchar_t* locale.  Other boxes
 449      * don't have this, so this Windows replacement converts the wchar_t input
 450      * to plain 'char*', calls plain setlocale(), and converts the result back
 451      * to 'wchar_t*' */
 452
 453     const char * byte_locale = NULL;
 454     if (wlocale) {
 455         byte_locale = Win_wstring_to_byte_string(CP_UTF8, wlocale);
 456     }
 457
 458     const char * byte_result = setlocale(category, byte_locale);
 459     Safefree(byte_locale);
 460     if (byte_result == NULL) {
 461         return NULL;
 462     }
 463
 464     const wchar_t * wresult = Win_byte_string_to_wstring(CP_UTF8, byte_result);
 465
 466     if (! wresult) {
 467         return NULL;
 468     }
 469
 470     /* Emulate a global static memory return from wsetlocale().  This currently
 471      * leaks at process end; would require changing LOCALE_TERM to fix that */
 472     Size_t string_size = wcslen(wresult) + 1;
 473
 474     if (wsetlocale_buf_size == 0) {
 475         Newx(wsetlocale_buf, string_size, wchar_t);
 476         wsetlocale_buf_size = string_size;
 477
 478 #  ifdef MULTIPLICITY
 479
 480         dTHX;
 481         wsetlocale_buf_aTHX = aTHX;
 482
 483 #  endif
 484
 485     }
 486     else if (string_size > wsetlocale_buf_size) {
 487         Renew(wsetlocale_buf, string_size, wchar_t);
 488         wsetlocale_buf_size = string_size;
 489     }
 490
 491     Copy(wresult, wsetlocale_buf, string_size, wchar_t);
 492     Safefree(wresult);
 493
 494     return wsetlocale_buf;
 495 }
 496
 497 #  define _wsetlocale(category, wlocale)  S_wsetlocale(category, wlocale)
 498 #  endif
 499 #endif  /* WIN32_USE_FAKE_OLD_MINGW_LOCALES */
 500
 501 /* 'for' loop headers to hide the necessary casts */
 502 #define for_all_individual_category_indexes(i)                              \
 503     for (locale_category_index i = (locale_category_index) 0;               \
 504          i < LC_ALL_INDEX_;                                                 \
 505          i = (locale_category_index) ((int) i + 1))
 506
 507 #define for_all_but_0th_individual_category_indexes(i)                      \
 508     for (locale_category_index i = (locale_category_index) 1;               \
 509          i < LC_ALL_INDEX_;                                                 \
 510          i = (locale_category_index) ((int) i + 1))
 511
 512 #define for_all_category_indexes(i)                                         \
 513     for (locale_category_index i = (locale_category_index) 0;               \
 514          i <= LC_ALL_INDEX_;                                                \
 515          i = (locale_category_index) ((int) i + 1))
 516
 517 #ifdef USE_LOCALE
 518 #  if defined(USE_FAKE_LC_ALL_POSITIONAL_NOTATION) && defined(LC_ALL)
 519
 520 /* This simulates an underlying positional notation for LC_ALL when compiled on
 521  * a system that uses name=value notation.  Use this to develop on Linux and
 522  * make a quick check that things have some chance of working on a positional
 523  * box.  Enable by adding to the Congfigure parameters:
 524  *      -Accflags=USE_FAKE_LC_ALL_POSITIONAL_NOTATION
 525  *
 526  * NOTE it redefines setlocale() and usequerylocale()
 527  * */
 528
 529 STATIC const char *
 530 S_positional_name_value_xlation(const char * locale, bool direction)
 531 {   /* direction == 1 is from name=value to positional
 532        direction == 0 is from positional to name=value */
 533     assert(locale);
 534
 535     dTHX;
 536     const char * individ_locales[LC_ALL_INDEX_] = { NULL };
 537
 538     /* This parses either notation */
 539     switch (parse_LC_ALL_string(locale,
 540                                 (const char **) &individ_locales,
 541                                 no_override,  /* Handled by other code */
 542                                 false,      /* Return only [0] if suffices */
 543                                 false,      /* Don't panic on error */
 544                                 __LINE__))
 545     {
 546       default:      /* Some compilers don't realize that below is the complete
 547                        list of the available enum values */
 548       case invalid:
 549         return NULL;
 550
 551       case no_array:
 552         return locale;
 553       case only_element_0:
 554         SAVEFREEPV(individ_locales[0]);
 555         return individ_locales[0];
 556       case full_array:
 557        {
 558         calc_LC_ALL_format  format = (direction)
 559                                      ? EXTERNAL_FORMAT_FOR_SET
 560                                      : INTERNAL_FORMAT;
 561         const char * retval = calculate_LC_ALL_string(individ_locales,
 562                                                       format,
 563                                                       WANT_TEMP_PV,
 564                                                       __LINE__);
 565
 566         for_all_individual_category_indexes(i) {
 567             Safefree(individ_locales[i]);
 568         }
 569
 570         return retval;
 571        }
 572     }
 573 }
 574
 575 STATIC const char *
 576 S_positional_setlocale(int cat, const char * locale)
 577 {
 578     if (cat != LC_ALL) return setlocale(cat, locale);
 579
 580     if (locale && strNE(locale, "")) {
 581         locale = S_positional_name_value_xlation(locale, 0);
 582         if (! locale) return NULL;
 583     }
 584
 585     locale = setlocale(cat, locale);
 586     if (locale == NULL) return NULL;
 587     return S_positional_name_value_xlation(locale, 1);
 588 }
 589
 590 #    undef setlocale
 591 #    define setlocale(a,b)  S_positional_setlocale(a,b)
 592 #    ifdef USE_POSIX_2008_LOCALE
 593
 594 STATIC locale_t
 595 S_positional_newlocale(int mask, const char * locale, locale_t base)
 596 {
 597     assert(locale);
 598
 599     if (mask != LC_ALL_MASK) return newlocale(mask, locale, base);
 600
 601     if (strNE(locale, "")) locale = S_positional_name_value_xlation(locale, 0);
 602     if (locale == NULL) return NULL;
 603     return newlocale(LC_ALL_MASK, locale, base);
 604 }
 605
 606 #    undef newlocale
 607 #    define newlocale(a,b,c)  S_positional_newlocale(a,b,c)
 608 #    endif
 609 #  endif
 610 #endif  /* End of fake positional notation */
 611
 612 #include "reentr.h"
 613
 614 #ifdef I_WCHAR
 615 #  include <wchar.h>
 616 #endif
 617 #ifdef I_WCTYPE
 618 #  include <wctype.h>
 619 #endif
 620
 621  /* The main errno that gets used is this one, on platforms that support it */
 622 #ifdef EINVAL
 623 #  define SET_EINVAL  SETERRNO(EINVAL, LIB_INVARG)
 624 #else
 625 #  define SET_EINVAL
 626 #endif
 627
 628 /* This is a starting guess as to when this is true.  It definititely isn't
 629  * true on *BSD where positional LC_ALL notation is used.  Likely this will end
 630  * up being defined in hints files. */
 631 #ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
 632 #  define NEWLOCALE_HANDLES_DISPARATE_LC_ALL
 633 #endif
 634
 635 /* But regardless, we have to look at individual categories if some are
 636  * ignored.  */
 637 #ifdef HAS_IGNORED_LOCALE_CATEGORIES_
 638 #  undef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
 639 #endif
 640 #ifdef USE_LOCALE
 641
 642 /* Not all categories need be set to the same locale.  This macro determines if
 643  * 'name' which represents LC_ALL is uniform or disparate.  There are two
 644  * situations: 1) the platform uses unordered name=value pairs; 2) the platform
 645  * uses ordered positional values, with a separator string between them */
 646 #  ifdef PERL_LC_ALL_SEPARATOR   /* positional */
 647 #    define is_disparate_LC_ALL(name)  cBOOL(instr(name, PERL_LC_ALL_SEPARATOR))
 648 #  else     /* name=value */
 649
 650     /* In the, hopefully never occurring, event that the platform doesn't use
 651      * either mechanism for disparate LC_ALL's, assume the name=value pairs
 652      * form, rather than taking the extreme step of refusing to compile.  Many
 653      * programs won't have disparate locales, so will generally work */
 654 #    define PERL_LC_ALL_SEPARATOR  ";"
 655 #    define is_disparate_LC_ALL(name)  cBOOL(   strchr(name, ';')   \
 656                                              && strchr(name, '='))
 657 #  endif
 658
 659 /* It is possible to compile perl to always keep any individual category in the
 660  * C locale.  This would be done where the implementation on a platform is
 661  * flawed or incomplete.  At the time of this writing, for example, OpenBSD has
 662  * not implemented LC_COLLATE beyond the C locale.  The 'category_available[]'
 663  * table is a bool that says whether a category is changeable, or must be kept
 664  * in C.  This macro substitutes C for the locale appropriately, expanding to
 665  * nothing on the more typical case where all possible categories present on
 666  * the platform are handled. */
 667 #  ifdef HAS_IGNORED_LOCALE_CATEGORIES_
 668 #    define need_to_override_category(i)  (! category_available[i])
 669 #    define override_ignored_category(i, new_locale)                        \
 670                     ((need_to_override_category(i)) ? "C" : (new_locale))
 671 #  else
 672 #    define need_to_override_category(i)  0
 673 #    define override_ignored_category(i, new_locale)  (new_locale)
 674 #  endif
 675
 676 PERL_STATIC_INLINE const char *
 677 S_mortalized_pv_copy(pTHX_ const char * const pv)
 678 {
 679     PERL_ARGS_ASSERT_MORTALIZED_PV_COPY;
 680
 681     /* Copies the input pv, and arranges for it to be freed at an unspecified
 682      * later time. */
 683
 684     if (pv == NULL) {
 685         return NULL;
 686     }
 687
 688     const char * copy = savepv(pv);
 689     SAVEFREEPV(copy);
 690     return copy;
 691 }
 692
 693 #endif
 694
 695 /* Default values come from the C locale */
 696 #define C_codeset "ANSI_X3.4-1968" /* Only in some Configurations, and usually
 697                                       a single instance, so is a #define */
 698 static const char C_decimal_point[] = ".";
 699
 700 #if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO)
 701 #  define HAS_SOME_LANGINFO
 702 #endif
 703
 704 #if (defined(USE_LOCALE_NUMERIC) && ! defined(TS_W32_BROKEN_LOCALECONV))    \
 705  || ! (   defined(USE_LOCALE_NUMERIC)                                       \
 706        && (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV)))
 707 static const char C_thousands_sep[] = "";
 708 #endif
 709
 710 /* Is the C string input 'name' "C" or "POSIX"?  If so, and 'name' is the
 711  * return of setlocale(), then this is extremely likely to be the C or POSIX
 712  * locale.  However, the output of setlocale() is documented to be opaque, but
 713  * the odds are extremely small that it would return these two strings for some
 714  * other locale.  Note that VMS includes many non-ASCII characters in these two
 715  * locales as controls and punctuation (below are hex bytes):
 716  *   cntrl:  84-97 9B-9F
 717  *   punct:  A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
 718  * Oddly, none there are listed as alphas, though some represent alphabetics
 719  * https://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
 720 #define isNAME_C_OR_POSIX(name)                                              \
 721                              (   (name) != NULL                              \
 722                               && (( *(name) == 'C' && (*(name + 1)) == '\0') \
 723                                    || strEQ((name), "POSIX")))
 724
 725 #ifndef my_langinfo_i
 726 #  define my_langinfo_i(i, c, l, b, s, u)                                   \
 727                     (PERL_UNUSED_VAR(c), emulate_langinfo(i, l, b, s, u))
 728 #endif
 729 #define my_langinfo_c(item, category, locale, retbufp, retbuf_sizep, utf8ness) \
 730             my_langinfo_i(item, category##_INDEX_, locale, retbufp,            \
 731                                                       retbuf_sizep,  utf8ness)
 732 #ifndef USE_LOCALE  /* A no-op unless locales are enabled */
 733 #  define toggle_locale_i(index, locale)    NULL
 734 #  define restore_toggled_locale_i(index, locale)  PERL_UNUSED_VAR(locale)
 735 #else
 736 #  define toggle_locale_i(index, locale)                                    \
 737                  S_toggle_locale_i(aTHX_ index, locale, __LINE__)
 738 #  define restore_toggled_locale_i(index, locale)                           \
 739                  S_restore_toggled_locale_i(aTHX_ index, locale, __LINE__)
 740 #endif
 741
 742 #  define toggle_locale_c(cat, locale)  toggle_locale_i(cat##_INDEX_, locale)
 743 #  define restore_toggled_locale_c(cat, locale)                             \
 744                              restore_toggled_locale_i(cat##_INDEX_, locale)
 745 #ifdef USE_LOCALE
 746 #  ifdef DEBUGGING
 747 #    define setlocale_debug_string_i(index, locale, result)                 \
 748             my_setlocale_debug_string_i(index, locale, result, __LINE__)
 749 #    define setlocale_debug_string_c(category, locale, result)              \
 750                 setlocale_debug_string_i(category##_INDEX_, locale, result)
 751 #    define setlocale_debug_string_r(category, locale, result)              \
 752              setlocale_debug_string_i(get_category_index(category),         \
 753                                       locale, result)
 754 #  endif
 755
 756 /* On systems without LC_ALL, pretending it exists anyway simplifies things.
 757  * Choose a value for it that is very unlikely to clash with any actual
 758  * category */
 759 #  define FAKE_LC_ALL  PERL_INT_MIN
 760
 761 /* Below are parallel arrays for locale information indexed by our mapping of
 762  * category numbers into small non-negative indexes.  locale_table.h contains
 763  * an entry like this for each individual category used on this system:
 764  *      PERL_LOCALE_TABLE_ENTRY(CTYPE, S_new_ctype)
 765  *
 766  * Each array redefines PERL_LOCALE_TABLE_ENTRY to generate the information
 767  * needed for that array, and #includes locale_table.h to get the valid
 768  * categories.
 769  *
 770  * An entry for the conglomerate category LC_ALL is added here, immediately
 771  * following the individual categories.  (The treatment for it varies, so can't
 772  * be in locale_table.h.)
 773  *
 774  * Following this, each array ends with an entry for illegal categories.  All
 775  * category numbers unknown to perl get mapped to this entry.  This is likely
 776  * to be a parameter error from the calling program; but it could be that this
 777  * platform has a category we don't know about, in which case it needs to be
 778  * added, using the paradigm of one of the existing categories. */
 779
 780 /* The first array is the locale categories perl uses on this system, used to
 781  * map our index back to the system's category number. */
 782 STATIC const int categories[] = {
 783
 784 #  undef PERL_LOCALE_TABLE_ENTRY
 785 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)  LC_ ## name,
 786 #  include "locale_table.h"
 787
 788 #  ifdef LC_ALL
 789     LC_ALL,
 790 #  else
 791     FAKE_LC_ALL,
 792 #  endif
 793
 794    (FAKE_LC_ALL + 1)    /* Entry for unknown category; this number is unlikely
 795                            to clash with a real category */
 796 };
 797
 798 # define GET_NAME_AS_STRING(token)  # token
 799 # define GET_LC_NAME_AS_STRING(token) GET_NAME_AS_STRING(LC_ ## token)
 800
 801 /* The second array is the category names. */
 802 STATIC const char * const category_names[] = {
 803
 804 #  undef PERL_LOCALE_TABLE_ENTRY
 805 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)  GET_LC_NAME_AS_STRING(name),
 806 #  include "locale_table.h"
 807
 808 #  ifdef LC_ALL
 809 #    define LC_ALL_STRING  "LC_ALL"
 810 #  else
 811 #    define LC_ALL_STRING  "If you see this, it is a bug in perl;"      \
 812                            " please report it via perlbug"
 813 #  endif
 814
 815     LC_ALL_STRING,
 816
 817 #  define LC_UNKNOWN_STRING  "Locale category unknown to Perl; if you see"  \
 818                              " this, it is a bug in perl; please report it" \
 819                              " via perlbug"
 820     LC_UNKNOWN_STRING
 821 };
 822
 823 STATIC const Size_t category_name_lengths[] = {
 824
 825 #  undef PERL_LOCALE_TABLE_ENTRY
 826 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)                          \
 827                                     STRLENs(GET_LC_NAME_AS_STRING(name)),
 828 #  include "locale_table.h"
 829
 830     STRLENs(LC_ALL_STRING),
 831     STRLENs(LC_UNKNOWN_STRING)
 832 };
 833
 834 /* Each entry includes space for the '=' and ';' */
 835 #  undef PERL_LOCALE_TABLE_ENTRY
 836 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)                          \
 837                                 + STRLENs(GET_LC_NAME_AS_STRING(name)) + 2
 838
 839 STATIC const Size_t lc_all_boiler_plate_length = 1  /* space for trailing NUL */
 840 #  include "locale_table.h"
 841 ;
 842
 843 /* A few categories require additional setup when they are changed.  This table
 844  * points to the functions that do that setup */
 845 STATIC void (*update_functions[]) (pTHX_ const char *, bool force) = {
 846
 847 #  undef PERL_LOCALE_TABLE_ENTRY
 848 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)  call_back,
 849 #  include "locale_table.h"
 850
 851     S_new_LC_ALL,
 852     NULL,   /* No update for unknown category */
 853 };
 854
 855 #  if defined(HAS_IGNORED_LOCALE_CATEGORIES_)
 856
 857 /* Indicates if each category on this platform is available to use not in
 858  * the C locale */
 859 STATIC const bool category_available[] = {
 860
 861 #  undef PERL_LOCALE_TABLE_ENTRY
 862 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)  LC_ ## name ## _AVAIL_,
 863 #  include "locale_table.h"
 864
 865 #  ifdef LC_ALL
 866     true,
 867 #  else
 868     false,
 869 #  endif
 870
 871     false   /* LC_UNKNOWN_AVAIL_ */
 872 };
 873
 874 #  endif
 875 #  if defined(USE_POSIX_2008_LOCALE)
 876
 877 STATIC const int category_masks[] = {
 878
 879 #    undef PERL_LOCALE_TABLE_ENTRY
 880 #    define PERL_LOCALE_TABLE_ENTRY(name, call_back)  LC_ ## name ## _MASK,
 881 #    include "locale_table.h"
 882
 883     LC_ALL_MASK,    /* Will rightly refuse to compile unless this is defined */
 884     0               /* Empty mask for unknown category */
 885 };
 886
 887 #  endif
 888 #  if ! defined(PERL_LC_ALL_USES_NAME_VALUE_PAIRS)
 889
 890 /* On platforms that use positional notation for expressing LC_ALL, this maps
 891  * the position of each category to our corresponding internal index for it.
 892  * This is initialized at run time if needed.  LC_ALL_INDEX_ is not legal for
 893  * an individual locale, hence marks the elements here as not actually
 894  * initialized. */
 895 STATIC
 896 unsigned int
 897 map_LC_ALL_position_to_index[LC_ALL_INDEX_] = { LC_ALL_INDEX_ };
 898
 899 #  endif
 900 #endif
 901 #if defined(USE_LOCALE) || defined(DEBUGGING)
 902
 903 STATIC const char *
 904 S_get_displayable_string(pTHX_
 905                          const char * const s,
 906                          const char * const e,
 907                          const bool is_utf8)
 908 {
 909     PERL_ARGS_ASSERT_GET_DISPLAYABLE_STRING;
 910
 911     if (e <= s) {
 912         return "";
 913     }
 914
 915     const char * t = s;
 916     bool prev_was_printable = TRUE;
 917     bool first_time = TRUE;
 918     char * ret;
 919
 920     /* Worst case scenario: All are non-printable so have a blank between each.
 921      * If UTF-8, all are the largest possible code point; otherwise all are a
 922      * single byte.  '(2 + 1)'  is from each byte takes 2 characters to
 923      * display, and a blank (or NUL for the final one) after it */
 924     const Size_t size = (e - s) * (2 + 1) * ((is_utf8) ? UVSIZE : 1);
 925     Newxz(ret, size, char);
 926     SAVEFREEPV(ret);
 927
 928     while (t < e) {
 929         UV cp = (is_utf8)
 930                 ?  utf8_to_uvchr_buf((U8 *) t, e, NULL)
 931                 : * (U8 *) t;
 932         if (isPRINT(cp)) {
 933             if (! prev_was_printable) {
 934                 my_strlcat(ret, " ", size);
 935             }
 936
 937             /* Escape these to avoid any ambiguity */
 938             if (cp == ' ' || cp == '\\') {
 939                 my_strlcat(ret, "\\", size);
 940             }
 941             my_strlcat(ret, Perl_form(aTHX_ "%c", (U8) cp), size);
 942             prev_was_printable = TRUE;
 943         }
 944         else {
 945             if (! first_time) {
 946                 my_strlcat(ret, " ", size);
 947             }
 948             my_strlcat(ret, Perl_form(aTHX_ "%02" UVXf, cp), size);
 949             prev_was_printable = FALSE;
 950         }
 951         t += (is_utf8) ? UTF8SKIP(t) : 1;
 952         first_time = FALSE;
 953     }
 954
 955     return ret;
 956 }
 957
 958 #endif
 959 #ifdef USE_LOCALE
 960
 961 # define get_category_index(cat) get_category_index_helper(cat, NULL, __LINE__)
 962
 963 STATIC locale_category_index
 964 S_get_category_index_helper(pTHX_ const int category, bool * succeeded,
 965                                   const line_t caller_line)
 966 {
 967     PERL_ARGS_ASSERT_GET_CATEGORY_INDEX_HELPER;
 968
 969     /* Given a category, return the equivalent internal index we generally use
 970      * instead, warn or panic if not found. */
 971
 972     locale_category_index i;
 973
 974 #  undef PERL_LOCALE_TABLE_ENTRY
 975 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)                          \
 976                     case LC_ ## name: i =  LC_ ## name ## _INDEX_; break;
 977
 978     switch (category) {
 979
 980 #  include "locale_table.h"
 981 #  ifdef LC_ALL
 982       case LC_ALL: i =  LC_ALL_INDEX_; break;
 983 #  endif
 984
 985       default: goto unknown_locale;
 986     }
 987
 988     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
 989                            "index of category %d (%s) is %d;"
 990                            " called from %" LINE_Tf "\n",
 991                            category, category_names[i], i, caller_line));
 992
 993     if (succeeded) {
 994         *succeeded = true;
 995     }
 996
 997     return i;
 998
 999   unknown_locale:
1000
1001     if (succeeded) {
1002         *succeeded = false;
1003         return LC_ALL_INDEX_;   /* Arbitrary */
1004     }
1005
1006     locale_panic_via_(Perl_form(aTHX_ "Unknown locale category %d", category),
1007                       __FILE__, caller_line);
1008     NOT_REACHED; /* NOTREACHED */
1009 }
1010
1011 #endif /* ifdef USE_LOCALE */
1012
1013 void
1014 Perl_force_locale_unlock(pTHX)
1015 {
1016     /* Remove any locale mutex, in preperation for an inglorious termination,
1017      * typically a  panic */
1018
1019 #if defined(USE_LOCALE_THREADS)
1020
1021     /* If recursively locked, clear all at once */
1022     if (PL_locale_mutex_depth > 1) {
1023         PL_locale_mutex_depth = 1;
1024     }
1025
1026     if (PL_locale_mutex_depth > 0) {
1027         LOCALE_UNLOCK_;
1028     }
1029
1030 #endif
1031
1032 }
1033
1034 #ifdef USE_POSIX_2008_LOCALE
1035
1036 STATIC locale_t
1037 S_use_curlocale_scratch(pTHX)
1038 {
1039     /* This function is used to hide from the caller the case where the current
1040      * locale_t object in POSIX 2008 is the global one, which is illegal in
1041      * many of the P2008 API calls.  This checks for that and, if necessary
1042      * creates a proper P2008 object.  Any prior object is deleted, as is any
1043      * remaining object during global destruction. */
1044
1045     locale_t cur = uselocale((locale_t) 0);
1046
1047     if (cur != LC_GLOBAL_LOCALE) {
1048         return cur;
1049     }
1050
1051     if (PL_scratch_locale_obj) {
1052         freelocale(PL_scratch_locale_obj);
1053     }
1054
1055     PL_scratch_locale_obj = duplocale(LC_GLOBAL_LOCALE);
1056     return PL_scratch_locale_obj;
1057 }
1058
1059 #endif
1060
1061 void
1062 Perl_locale_panic(const char * msg,
1063                   const line_t immediate_caller_line,
1064                   const char * const higher_caller_file,
1065                   const line_t higher_caller_line)
1066 {
1067     PERL_ARGS_ASSERT_LOCALE_PANIC;
1068     dTHX;
1069     dSAVE_ERRNO;
1070
1071     force_locale_unlock();
1072
1073 #ifdef USE_C_BACKTRACE
1074     dump_c_backtrace(Perl_debug_log, 20, 1);
1075 #endif
1076
1077     const char * called_by = "";
1078     if (   strNE(__FILE__, higher_caller_file)
1079         || immediate_caller_line != higher_caller_line)
1080     {
1081         called_by = Perl_form(aTHX_ "\nCalled by %s: %" LINE_Tf "\n",
1082                                     higher_caller_file, higher_caller_line);
1083     }
1084
1085     RESTORE_ERRNO;
1086
1087     const char * errno_text;
1088
1089 #ifdef HAS_EXTENDED_OS_ERRNO
1090
1091     const int extended_errnum = get_extended_os_errno();
1092     if (errno != extended_errnum) {
1093         errno_text = Perl_form(aTHX_ "; errno=%d, $^E=%d",
1094                                      errno, extended_errnum);
1095     }
1096     else
1097
1098 #endif
1099
1100     {
1101         errno_text = Perl_form(aTHX_ "; errno=%d", errno);
1102     }
1103
1104     /* diag_listed_as: panic: %s */
1105     Perl_croak(aTHX_ "%s: %" LINE_Tf ": panic: %s%s%s\n",
1106                      __FILE__, immediate_caller_line,
1107                      msg, errno_text, called_by);
1108 }
1109
1110 /* Macros to report and croak on an unexpected failure to set the locale.  The
1111  * via version has more stack trace information */
1112 #define setlocale_failure_panic_i(i, cur, fail, line, higher_line)          \
1113     setlocale_failure_panic_via_i(i, cur, fail, __LINE__, line,             \
1114                                   __FILE__, higher_line)
1115
1116 #define setlocale_failure_panic_c(cat, cur, fail, line, higher_line)        \
1117    setlocale_failure_panic_i(cat##_INDEX_, cur, fail, line, higher_line)
1118
1119 #if defined(USE_LOCALE)
1120
1121 /* Expands to the code to
1122  *      result = savepvn(s, len)
1123  * if the category whose internal index is 'i' doesn't need to be kept in the C
1124  * locale on this system, or if 'action is 'no_override'.  Otherwise it expands
1125  * to
1126  *      result = savepv("C")
1127  * unless 'action' isn't 'check_that_overridden', in which case if the string
1128  * 's' isn't already "C" it panics */
1129 #  ifndef HAS_IGNORED_LOCALE_CATEGORIES_
1130 #    define OVERRIDE_AND_SAVEPV(s, len, result, i, action)                  \
1131                                                   result = savepvn(s, len)
1132 #  else
1133 #    define OVERRIDE_AND_SAVEPV(s, len, result, i, action)                  \
1134         STMT_START {                                                        \
1135             if (LIKELY(   ! need_to_override_category(i)                    \
1136                        || action == no_override)) {                         \
1137                 result = savepvn(s, len);                                   \
1138             }                                                               \
1139             else {                                                          \
1140                 const char * temp = savepvn(s, len);                        \
1141                 result = savepv(override_ignored_category(i, temp));        \
1142                 if (action == check_that_overridden && strNE(result, temp)) { \
1143                     locale_panic_(Perl_form(aTHX_                           \
1144                                 "%s expected to be '%s', instead is '%s'",  \
1145                                 category_names[i], result, temp));          \
1146                 }                                                           \
1147                 Safefree(temp);                                             \
1148             }                                                               \
1149         } STMT_END
1150 #  endif
1151
1152 STATIC parse_LC_ALL_string_return
1153 S_parse_LC_ALL_string(pTHX_ const char * string,
1154                             const char ** output,
1155                             const parse_LC_ALL_STRING_action  override,
1156                             bool always_use_full_array,
1157                             const bool panic_on_error,
1158                             const line_t caller_line)
1159 {
1160     /* This function parses the value of the input 'string' which is expected
1161      * to be the representation of an LC_ALL locale, and splits the result into
1162      * the values for the individual component categories, returning those in
1163      * the 'output' array.  Each array value will be a savepv() copy that is
1164      * the responsibility of the caller to make sure gets freed
1165      *
1166      * The locale for each category is independent of the other categories.
1167      * Often, they are all the same, but certainly not always.  Perl, in fact,
1168      * usually keeps LC_NUMERIC in the C locale, regardless of the underlying
1169      * locale.  LC_ALL has to be able to represent the case of when not all
1170      * categories have the same locale.  Platforms have differing ways of
1171      * representing this.  Internally, this file uses the 'name=value;'
1172      * representation found on some platforms, so this function always looks
1173      * for and parses that.  Other platforms use a positional notation.  On
1174      * those platforms, this function also parses that form.  It examines the
1175      * input to see which form is being parsed.
1176      *
1177      * Often, all categories will have the same locale.  This is special cased
1178      * if 'always_use_full_array' is false on input:
1179      *      1) If the input 'string' is a single value, this function doesn't
1180      *         store anything into 'output', and returns 'no_array'
1181      *      2) Some platforms will return multiple occurrences of the same
1182      *         value rather than coalescing them down to a single one.  HP-UX
1183      *         is such a one.  This function will do that collapsing for you,
1184      *         returning 'only_element_0' and saving the single value in
1185      *         output[0], which the caller will need to arrange to be freed.
1186      *         The rest of output[] is undefined, and does not need to be
1187      *         freed.
1188      *
1189      * Otherwise, the input 'string' may not be valid.  This function looks
1190      * mainly for syntactic errors, and if found, returns 'invalid'.  'output'
1191      * will not be filled in in that case, but the input state of it isn't
1192      * necessarily preserved.  Turning on -DL debugging will give details as to
1193      * the error.  If 'panic_on_error' is 'true', the function panics instead
1194      * of returning on error, with a message giving the details.
1195      *
1196      * Otherwise, output[] will be filled with the individual locale names for
1197      * all categories on the system, 'full_array' will be returned, and the
1198      * caller needs to arrange for each to be freed.  This means that either at
1199      * least one category differed from the others, or 'always_use_full_array' was
1200      * true on input.
1201      *
1202      * perl may be configured to ignore changes to a category's locale to
1203      * non-C.  The parameter 'override' tells this function what to do when
1204      * encountering such an illegal combination:
1205      *
1206      *      no_override             indicates to take no special action
1207      *      override_if_ignored,    indicates to return 'C' instead of what the
1208      *                              input string actually says.
1209      *      check_that_overridden   indicates to panic if the string says the
1210      *                              category is not 'C'.  This is used when
1211      *                              non-C is very unexpected behavior.
1212      * */
1213
1214     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1215                            "Entering parse_LC_ALL_string; called from %"    \
1216                            LINE_Tf "\nnew='%s'\n", caller_line, string));
1217
1218 #  ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
1219
1220     const char separator[] = ";";
1221     const Size_t separator_len = 1;
1222     const bool single_component = (strchr(string, ';') == NULL);
1223
1224 #  else
1225
1226     /* It's possible (but quite unlikely) that the separator string is an '='
1227      * or a ';'.  Requiring both to be present for using the 'name=value;' form
1228      * properly handles those possibilities */
1229     const bool name_value = strchr(string, '=') && strchr(string, ';');
1230     const char * separator;
1231     Size_t separator_len;
1232     bool single_component;
1233     if (name_value) {
1234         separator = ";";
1235         separator_len = 1;
1236         single_component = false;   /* Since has both [;=], must be multi */
1237     }
1238     else {
1239         separator = PERL_LC_ALL_SEPARATOR;
1240         separator_len = STRLENs(PERL_LC_ALL_SEPARATOR);
1241         single_component = instr(string, separator) == NULL;
1242     }
1243
1244     Size_t component_number = 0;    /* Position in the parsing loop below */
1245
1246 #  endif
1247 #  ifndef HAS_IGNORED_LOCALE_CATEGORIES_
1248     PERL_UNUSED_ARG(override);
1249 #  else
1250
1251     /* Any ignored categories are to be set to "C", so if this single-component
1252      * LC_ALL isn't to C, it has both "C" and non-C, so isn't really a single
1253      * component.  All the non-ignored categories are set to the input
1254      * component, but the ignored ones are overridden to be C.
1255      *
1256      * This incidentally handles the case where the string is "".  The return
1257      * will be C for each ignored category and "" for the others.  Then the
1258      * caller can individually set each category, and get the right answer. */
1259     if (single_component && ! isNAME_C_OR_POSIX(string)) {
1260         for_all_individual_category_indexes(i) {
1261            OVERRIDE_AND_SAVEPV(string, strlen(string), output[i], i, override);
1262         }
1263
1264         return full_array;
1265     }
1266
1267 #  endif
1268
1269     if (single_component) {
1270         if (! always_use_full_array) {
1271             return no_array;
1272         }
1273
1274         for_all_individual_category_indexes(i) {
1275             output[i] = savepv(string);
1276         }
1277
1278         return full_array;
1279     }
1280
1281     /* Here the input is multiple components.  Parse through them.  (It is
1282      * possible that these components are all the same, so we check, and if so,
1283      * return just the 0th component (unless 'always_use_full_array' is true)
1284      *
1285      * This enum notes the possible errors findable in parsing */
1286     enum {
1287             incomplete,
1288             no_equals,
1289             unknown_category,
1290             contains_LC_ALL_element
1291     } error;
1292
1293     /* Keep track of the categories we have encountered so far */
1294     bool seen[LC_ALL_INDEX_] = { false };
1295
1296     Size_t index;           /* Our internal index for the current category */
1297     const char * s = string;
1298     const char * e = s + strlen(string);
1299     const char * category_end = NULL;
1300     const char * saved_first = NULL;
1301
1302     /* Parse the input locale string */
1303     while (s < e) {
1304
1305         /* 'separator' has been set up to delimit the components */
1306         const char * next_sep = instr(s, separator);
1307         if (! next_sep) {   /* At the end of the input */
1308             next_sep = e;
1309         }
1310
1311 #  ifndef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
1312
1313         if (! name_value) {
1314             /* Get the index of the category in this position */
1315             index = map_LC_ALL_position_to_index[component_number++];
1316         }
1317         else
1318
1319 #  endif
1320
1321         {   /* Get the category part when each component is the
1322              * 'category=locale' form */
1323
1324             category_end = strchr(s, '=');
1325
1326             /* The '=' terminates the category name.  If no '=', is improper
1327              * form */
1328             if (! category_end) {
1329                 error = no_equals;
1330                 goto failure;
1331             }
1332
1333             /* Find our internal index of the category name; uses a linear
1334              * search.  (XXX This could be avoided by various means, but the
1335              * maximum likely search is 6 items, and khw doesn't think the
1336              * added complexity would save very much at all.) */
1337             const unsigned int name_len = (unsigned int) (category_end - s);
1338             for (index = 0; index < C_ARRAY_LENGTH(category_names); index++) {
1339                 if (   name_len == category_name_lengths[index]
1340                     && memEQ(s, category_names[index], name_len))
1341                 {
1342                     goto found_category;
1343                 }
1344             }
1345
1346             /* Here, the category is not in our list. */
1347             error = unknown_category;
1348             goto failure;
1349
1350           found_category:   /* The system knows about this category. */
1351
1352             if (index == LC_ALL_INDEX_) {
1353                 error = contains_LC_ALL_element;
1354                 goto failure;
1355             }
1356
1357             /* The locale name starts just beyond the '=' */
1358             s = category_end + 1;
1359
1360             /* Linux (and maybe others) doesn't treat a duplicate category in
1361              * the string as an error.  Instead it uses the final occurrence as
1362              * the intended value.  So if this is a duplicate, free the former
1363              * value before setting the new one */
1364             if (seen[index]) {
1365                 Safefree(output[index]);
1366             }
1367             else {
1368                 seen[index] = true;
1369             }
1370         }
1371
1372         /* Here, 'index' contains our internal index number for the current
1373          * category, and 's' points to the beginning of the locale name for
1374          * that category. */
1375         OVERRIDE_AND_SAVEPV(s, next_sep - s, output[index], index, override);
1376
1377         if (! always_use_full_array) {
1378             if (! saved_first) {
1379                 saved_first = output[index];
1380             }
1381             else {
1382                 if (strNE(saved_first, output[index])) {
1383                     always_use_full_array = true;
1384                 }
1385             }
1386         }
1387
1388         /* Next time start from the new position */
1389         s = next_sep + separator_len;
1390     }
1391
1392     /* Finished looping through all the categories
1393      *
1394      * Check if the input was incomplete. */
1395
1396 #  ifndef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
1397
1398     if (! name_value) {     /* Positional notation */
1399         if (component_number != LC_ALL_INDEX_) {
1400             error = incomplete;
1401             goto failure;
1402         }
1403     }
1404     else
1405
1406 #  endif
1407
1408     {   /* Here is the name=value notation */
1409         for_all_individual_category_indexes(i) {
1410             if (! seen[i]) {
1411                 error = incomplete;
1412                 goto failure;
1413             }
1414         }
1415     }
1416
1417     /* In the loop above, we changed 'always_use_full_array' to true iff not all
1418      * categories have the same locale.  Hence, if it is still 'false', all of
1419      * them are the same. */
1420     if (always_use_full_array) {
1421         return full_array;
1422     }
1423
1424     /* Free the dangling ones */
1425     for_all_but_0th_individual_category_indexes(i) {
1426         Safefree(output[i]);
1427         output[i] = NULL;
1428     }
1429
1430     return only_element_0;
1431
1432   failure:
1433
1434     /* Don't leave memory dangling that we allocated before the failure */
1435     for_all_individual_category_indexes(i) {
1436         if (seen[i]) {
1437             Safefree(output[i]);
1438             output[i] = NULL;
1439         }
1440     }
1441
1442     const char * msg;
1443     const char * display_start = s;
1444     const char * display_end = e;
1445
1446     switch (error) {
1447         case incomplete:
1448             msg = "doesn't list every locale category";
1449             display_start = string;
1450             break;
1451         case no_equals:
1452             msg = "needs an '=' to split name=value";
1453             break;
1454         case unknown_category:
1455             msg = "is an unknown category";
1456             display_end = (category_end && category_end > display_start)
1457                           ? category_end
1458                           : e;
1459             break;
1460         case contains_LC_ALL_element:
1461             msg = "has LC_ALL, which is illegal here";
1462             break;
1463     }
1464
1465     msg = Perl_form(aTHX_ "'%.*s' %s\n",
1466                           (int) (display_end - display_start),
1467                           display_start, msg);
1468
1469     DEBUG_L(PerlIO_printf(Perl_debug_log, "%s", msg));
1470
1471     if (panic_on_error) {
1472         locale_panic_via_(msg, __FILE__, caller_line);
1473     }
1474
1475     return invalid;
1476 }
1477
1478 #  undef OVERRIDE_AND_SAVEPV
1479 #endif
1480
1481 /*==========================================================================
1482  * Here starts the code that gives a uniform interface to its callers, hiding
1483  * the differences between platforms.
1484  *
1485  * base_posix_setlocale_() presents a consistent POSIX-compliant interface to
1486  * setlocale().   Windows requres a customized base-level setlocale().  This
1487  * layer should only be used by the next level up: the plain posix_setlocale
1488  * layer.  Any necessary mutex locking needs to be done at a higher level.  The
1489  * return may be overwritten by the next call to this function */
1490 #ifdef WIN32
1491 #  define base_posix_setlocale_(cat, locale) win32_setlocale(cat, locale)
1492 #else
1493 #  define base_posix_setlocale_(cat, locale)                                \
1494                                     ((const char *) setlocale(cat, locale))
1495 #endif
1496
1497 /*==========================================================================
1498  * Here is the main posix layer.  It is the same as the base one unless the
1499  * system is lacking LC_ALL, or there are categories that we ignore, but that
1500  * the system libc knows about */
1501
1502 #if ! defined(USE_LOCALE)                                                   \
1503  ||  (defined(LC_ALL) && ! defined(HAS_IGNORED_LOCALE_CATEGORIES_))
1504 #  define posix_setlocale(cat, locale) base_posix_setlocale_(cat, locale)
1505 #else
1506 #  define posix_setlocale(cat, locale)                                      \
1507         S_posix_setlocale_with_complications(aTHX_ cat, locale, __LINE__)
1508
1509 STATIC const char *
1510 S_posix_setlocale_with_complications(pTHX_ const int cat,
1511                                            const char * new_locale,
1512                                            const line_t caller_line)
1513 {
1514     /* This implements the posix layer above the base posix layer.
1515      * It is needed to reconcile our internal records that reflect only a
1516      * proper subset of the categories known by the system. */
1517
1518     /* Querying the current locale returns the real value */
1519     if (new_locale == NULL) {
1520         new_locale = base_posix_setlocale_(cat, NULL);
1521         assert(new_locale);
1522         return new_locale;
1523     }
1524
1525     const char * locale_on_entry = NULL;
1526
1527     /* If setting from the environment, actually do the set to get the system's
1528      * idea of what that means; we may have to override later. */
1529     if (strEQ(new_locale, "")) {
1530         locale_on_entry = base_posix_setlocale_(cat, NULL);
1531         assert(locale_on_entry);
1532         new_locale = base_posix_setlocale_(cat, "");
1533         if (! new_locale) {
1534             SET_EINVAL;
1535             return NULL;
1536         }
1537     }
1538
1539 #  ifdef LC_ALL
1540
1541     const char * new_locales[LC_ALL_INDEX_] = { NULL };
1542
1543     if (cat == LC_ALL) {
1544         switch (parse_LC_ALL_string(new_locale,
1545                                     (const char **) &new_locales,
1546                                     override_if_ignored,   /* Override any
1547                                                               ignored
1548                                                               categories */
1549                                     false,    /* Return only [0] if suffices */
1550                                     false,    /* Don't panic on error */
1551                                     caller_line))
1552         {
1553           case invalid:
1554             SET_EINVAL;
1555             return NULL;
1556
1557           case no_array:
1558             break;
1559
1560           case only_element_0:
1561             new_locale = new_locales[0];
1562             SAVEFREEPV(new_locale);
1563             break;
1564
1565           case full_array:
1566
1567             /* Turn the array into a string that the libc setlocale() should
1568              * understand.   (Another option would be to loop, setting the
1569              * individual locales, and then return base(cat, NULL) */
1570             new_locale = calculate_LC_ALL_string(new_locales,
1571                                                  EXTERNAL_FORMAT_FOR_SET,
1572                                                  WANT_TEMP_PV,
1573                                                  caller_line);
1574
1575             for_all_individual_category_indexes(i) {
1576                 Safefree(new_locales[i]);
1577             }
1578
1579             /* And call the libc setlocale.  We could avoid this call if
1580              * locale_on_entry is set and eq the new_locale.  But that would be
1581              * only for the relatively rare case of the desired locale being
1582              * "", and the time spent in doing the string compare might be more
1583              * than that of just setting it unconditionally */
1584             new_locale = base_posix_setlocale_(cat, new_locale);
1585             if (! new_locale) {
1586                  goto failure;
1587             }
1588
1589             return new_locale;
1590         }
1591     }
1592
1593 #  endif
1594
1595     /* Here, 'new_locale' is a single value, not an aggregation.  Just set it.
1596      * */
1597     new_locale =
1598         base_posix_setlocale_(cat,
1599                               override_ignored_category(
1600                                           get_category_index(cat), new_locale));
1601     if (! new_locale) {
1602         goto failure;
1603     }
1604
1605     return new_locale;
1606
1607  failure:
1608
1609     /* 'locale_on_entry' being set indicates there has likely been a change in
1610      * locale which needs to be restored */
1611     if (locale_on_entry) {
1612         if (! base_posix_setlocale_(cat, locale_on_entry)) {
1613             setlocale_failure_panic_i(get_category_index(cat),
1614                                       NULL, locale_on_entry,
1615                                       __LINE__, caller_line);
1616         }
1617     }
1618
1619     SET_EINVAL;
1620     return NULL;
1621 }
1622
1623 #endif
1624
1625 /* End of posix layer
1626  *==========================================================================
1627  *
1628  * The next layer up is to catch vagaries and bugs in the libc setlocale return
1629  * value.  The return is not guaranteed to be stable.
1630  *
1631  * Any necessary mutex locking needs to be done at a higher level.
1632  *
1633  * On most platforms this layer is empty, expanding to just the layer
1634  * below.   To enable it, call Configure with either or both:
1635  * -Accflags=-DHAS_LF_IN_SETLOCALE_RETURN
1636  *                  to indicate that extraneous \n characters can be returned
1637  *                  by setlocale()
1638  * -Accflags=-DHAS_BROKEN_SETLOCALE_QUERY_LC_ALL
1639  *                  to indicate that setlocale(LC_ALL, NULL) cannot be relied
1640  *                  on
1641  */
1642
1643 #define STDIZED_SETLOCALE_LOCK    POSIX_SETLOCALE_LOCK
1644 #define STDIZED_SETLOCALE_UNLOCK  POSIX_SETLOCALE_UNLOCK
1645 #if ! defined(USE_LOCALE)                                                   \
1646  || ! (   defined(HAS_LF_IN_SETLOCALE_RETURN)                               \
1647        || defined(HAS_BROKEN_SETLOCALE_QUERY_LC_ALL))
1648 #  define stdized_setlocale(cat, locale)  posix_setlocale(cat, locale)
1649 #  define stdize_locale(cat, locale)  (locale)
1650 #else
1651 #  define stdized_setlocale(cat, locale)                                    \
1652         S_stdize_locale(aTHX_ cat, posix_setlocale(cat, locale), __LINE__)
1653
1654 STATIC const char *
1655 S_stdize_locale(pTHX_ const int category,
1656                       const char *input_locale,
1657                       const line_t caller_line)
1658 {
1659     /* The return value of setlocale() is opaque, but is required to be usable
1660      * as input to a future setlocale() to create the same state.
1661      * Unfortunately not all systems are compliant.  This function brings those
1662      * outliers into conformance.  It is based on what problems have arisen in
1663      * the field.
1664      *
1665      * This has similar constraints as the posix layer.  You need to lock
1666      * around it until its return is safely copied or no longer needed. (The
1667      * return may point to a global static buffer or may be mortalized.)
1668      *
1669      * The current things this corrects are:
1670      * 1) A new-line.  This function chops any \n characters
1671      * 2) A broken 'setlocale(LC_ALL, foo)'  This constructs a proper returned
1672      *                 string from the constituent categories
1673      *
1674      * If no changes were made, the input is returned as-is */
1675
1676     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1677                           "Entering stdize_locale(%d, '%s');"
1678                           " called from %" LINE_Tf "\n",
1679                           category, input_locale, caller_line));
1680
1681     if (input_locale == NULL) {
1682         SET_EINVAL;
1683         return NULL;
1684     }
1685
1686     char * retval = (char *) input_locale;
1687
1688 #  if defined(LC_ALL) && defined(HAS_BROKEN_SETLOCALE_QUERY_LC_ALL)
1689
1690         /* If setlocale(LC_ALL, NULL) is broken, compute what the system
1691          * actually thinks it should be from its individual components */
1692     if (category == LC_ALL) {
1693         retval = (char *) calculate_LC_ALL_string(
1694                                           NULL,  /* query each individ locale */
1695                                           EXTERNAL_FORMAT_FOR_SET,
1696                                           WANT_TEMP_PV,
1697                                           caller_line);
1698     }
1699
1700 #  endif
1701 #  ifdef HAS_NL_IN_SETLOCALE_RETURN
1702
1703     char * first_bad = NULL;
1704
1705 #    ifndef LC_ALL
1706
1707     PERL_UNUSED_ARG(category);
1708     PERL_UNUSED_ARG(caller_line);
1709
1710 #      define INPUT_LOCALE  retval
1711 #      define MARK_CHANGED
1712 #    else
1713
1714     char * individ_locales[LC_ALL_INDEX_] = { NULL };
1715     bool made_changes = false;
1716     Size_t upper;
1717     if (category != LC_ALL) {
1718         individ_locales[0] = retval;
1719         upper = 0;
1720     }
1721     else {
1722
1723         /* And parse the locale string, splitting into its individual
1724          * components. */
1725         switch (parse_LC_ALL_string(retval,
1726                                     (const char **) &individ_locales,
1727                                     check_that_overridden, /* ignored
1728                                                               categories should
1729                                                               already have been
1730                                                               overridden */
1731                                     false,    /* Return only [0] if suffices */
1732                                     false,    /* Don't panic on error */
1733                                     caller_line))
1734         {
1735           case invalid:
1736             SET_EINVAL;
1737             return NULL;
1738
1739           case full_array: /* Loop below through all the component categories.
1740                             */
1741             upper = LC_ALL_INDEX_ - 1;
1742             break;
1743
1744           case no_array:
1745             /* All categories here are set to the same locale, and the parse
1746              * didn't fill in any of 'individ_locales'.  Set the 0th element to
1747              * that locale. */
1748             individ_locales[0] = retval;
1749             /* FALLTHROUGH */
1750
1751           case only_element_0: /* Element 0 is the only element we need to look
1752                                   at */
1753             upper = 0;
1754             break;
1755         }
1756     }
1757
1758     for (unsigned int i = 0; i <= upper; i++)
1759
1760 #      define INPUT_LOCALE  individ_locales[i]
1761 #      define MARK_CHANGED  made_changes = true;
1762 #    endif    /* Has LC_ALL */
1763
1764     {
1765         first_bad = (char *) strchr(INPUT_LOCALE, '\n');
1766
1767         /* Most likely, there isn't a problem with the input */
1768         if (UNLIKELY(first_bad)) {
1769
1770             /* This element will need to be adjusted.  Create a modifiable
1771              * copy. */
1772             MARK_CHANGED
1773             retval = savepv(INPUT_LOCALE);
1774             SAVEFREEPV(retval);
1775
1776             /* Translate the found position into terms of the copy */
1777             first_bad = retval + (first_bad - INPUT_LOCALE);
1778
1779             /* Get rid of the \n and what follows.  (Originally, only a
1780              * trailing \n was stripped.  Unsure what to do if not trailing) */
1781             *((char *) first_bad) = '\0';
1782         }   /* End of needs adjusting */
1783     }   /* End of looking for problems */
1784
1785 #    ifdef LC_ALL
1786
1787     /* If we had multiple elements, extra work is required */
1788     if (upper != 0) {
1789
1790         /* If no changes were made to the input, 'retval' already contains it
1791          * */
1792         if (made_changes) {
1793
1794             /* But if did make changes, need to calculate the new value */
1795             retval = (char *) calculate_LC_ALL_string(
1796                                             (const char **) &individ_locales,
1797                                             EXTERNAL_FORMAT_FOR_SET,
1798                                             WANT_TEMP_PV,
1799                                             caller_line);
1800         }
1801
1802         /* And free the no-longer needed memory */
1803         for (unsigned int i = 0; i <= upper; i++) {
1804             Safefree(individ_locales[i]);
1805         }
1806     }
1807
1808 #    endif
1809 #    undef INPUT_LOCALE
1810 #    undef MARK_CHANGED
1811 #  endif    /* HAS_NL_IN_SETLOCALE_RETURN */
1812
1813     return (const char *) retval;
1814 }
1815
1816 #endif  /* USE_LOCALE */
1817
1818 /* End of stdize_locale layer
1819  *
1820  * ==========================================================================
1821  *
1822  * The next many lines form several implementations of a layer above the
1823  * close-to-the-metal 'posix' and 'stdized' macros.  They are used to present a
1824  * uniform API to the rest of the code in this file in spite of the disparate
1825  * underlying implementations.  Which implementation gets compiled depends on
1826  * the platform capabilities (and some user choice) as determined by Configure.
1827  *
1828  * As more fully described in the introductory comments in this file, the
1829  * API of each implementation consists of three sets of macros.  Each set has
1830  * three variants with suffixes '_c', '_r', and '_i'.  In the list below '_X'
1831  * is to be replaced by any of these suffixes.
1832  *
1833  * 1) bool_setlocale_X  attempts to set the given category's locale to the
1834  *                      given value, returning if it worked or not.
1835  * 2) void_setlocale_X  is like the corresponding bool_setlocale, but used when
1836  *                      success is the only sane outcome, so failure causes it
1837  *                      to panic.
1838  * 3) querylocale_X     to see what the given category's locale is
1839  *
1840  * 4) setlocale_i()     is defined only in those implementations where the bool
1841  *                      and query forms are essentially the same, and can be
1842  *                      combined to save CPU time.
1843  *
1844  * Each implementation below is separated by ==== lines, and includes bool,
1845  * void, and query macros.  The query macros are first, followed by any
1846  * functions needed to implement them.  Then come the bool, again followed by
1847  * any implementing functions  Then are the void macros; next is setlocale_i if
1848  * present on this implementation.  Finally are any helper functions.  The sets
1849  * in each implementation are separated by ---- lines.
1850  *
1851  * The returned strings from all the querylocale...() forms in all
1852  * implementations are thread-safe, and the caller should not free them,
1853  * but each may be a mortalized copy.  If you need something stable across
1854  * calls, you need to savepv() the result yourself.
1855  *
1856  *===========================================================================*/
1857
1858 #if    (! defined(USE_LOCALE_THREADS) && ! defined(USE_POSIX_2008_LOCALE))    \
1859     || (  defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE))
1860
1861 /* For non-threaded perls, the implementation just expands to the base-level
1862  * functions (except if we are Configured to nonetheless use the POSIX 2008
1863  * interface) This implementation is also used on threaded perls where
1864  * threading is invisible to us.  Currently this is only on later Windows
1865  * versions. */
1866
1867 #  define querylocale_r(cat)  mortalized_pv_copy(stdized_setlocale(cat, NULL))
1868 #  define querylocale_c(cat)  querylocale_r(cat)
1869 #  define querylocale_i(i)    querylocale_c(categories[i])
1870
1871 /*---------------------------------------------------------------------------*/
1872
1873 #  define bool_setlocale_r(cat, locale) cBOOL(posix_setlocale(cat, locale))
1874 #  define bool_setlocale_i(i, locale)                                       \
1875                                    bool_setlocale_c(categories[i], locale)
1876 #  define bool_setlocale_c(cat, locale)      bool_setlocale_r(cat, locale)
1877
1878 /*---------------------------------------------------------------------------*/
1879
1880 #  define void_setlocale_r_with_caller(cat, locale, file, line)             \
1881      STMT_START {                                                           \
1882         if (! bool_setlocale_r(cat, locale))                                \
1883             setlocale_failure_panic_via_i(get_category_index(cat),          \
1884                                           NULL, locale, __LINE__, 0,        \
1885                                           file, line);                      \
1886      } STMT_END
1887
1888 #  define void_setlocale_c_with_caller(cat, locale, file, line)             \
1889                     void_setlocale_r_with_caller(cat, locale, file, line)
1890
1891 #  define void_setlocale_i_with_caller(i, locale, file, line)               \
1892           void_setlocale_r_with_caller(categories[i], locale, file, line)
1893
1894 #  define void_setlocale_r(cat, locale)                                     \
1895             void_setlocale_r_with_caller(cat, locale, __FILE__, __LINE__)
1896 #  define void_setlocale_c(cat, locale) void_setlocale_r(cat, locale)
1897 #  define void_setlocale_i(i, locale)   void_setlocale_r(categories[i], locale)
1898
1899 /*---------------------------------------------------------------------------*/
1900
1901 /* setlocale_i is only defined for Configurations where the libc setlocale()
1902  * doesn't need any tweaking.  It allows for some shortcuts */
1903 #  ifndef USE_LOCALE_THREADS
1904 #    define setlocale_i(i, locale)   stdized_setlocale(categories[i], locale)
1905
1906 #  elif defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE)
1907
1908 /* On Windows, we don't know at compile time if we are in thread-safe mode or
1909  * not.  If we are, we can just return the result of the layer below us.  If we
1910  * are in unsafe mode, we need to first copy that result to a safe place while
1911  * in a critical section */
1912
1913 #    define setlocale_i(i, locale)   S_setlocale_i(aTHX_ categories[i], locale)
1914
1915 STATIC const char *
1916 S_setlocale_i(pTHX_ const int category, const char * locale)
1917 {
1918     if (LIKELY(_configthreadlocale(0) == _ENABLE_PER_THREAD_LOCALE)) {
1919         return stdized_setlocale(category, locale);
1920     }
1921
1922     gwLOCALE_LOCK;
1923     const char * retval = save_to_buffer(stdized_setlocale(category, locale),
1924                                          &PL_setlocale_buf,
1925                                          &PL_setlocale_bufsize);
1926     gwLOCALE_UNLOCK;
1927
1928     return retval;
1929 }
1930
1931 #  endif
1932
1933 /*===========================================================================*/
1934 #elif   defined(USE_LOCALE_THREADS)                 \
1935    && ! defined(USE_THREAD_SAFE_LOCALE)
1936
1937    /* Here, there are threads, and there is no support for thread-safe
1938     * operation.  This is a dangerous situation, which perl is documented as
1939     * not supporting, but it arises in practice.  We can do a modicum of
1940     * automatic mitigation by making sure there is a per-thread return from
1941     * setlocale(), and that a mutex protects it from races */
1942
1943 #  define querylocale_r(cat)                                                \
1944                       mortalized_pv_copy(less_dicey_setlocale_r(cat, NULL))
1945 #  define querylocale_c(cat)  querylocale_r(cat)
1946 #  define querylocale_i(i)    querylocale_r(categories[i])
1947
1948 STATIC const char *
1949 S_less_dicey_setlocale_r(pTHX_ const int category, const char * locale)
1950 {
1951     const char * retval;
1952
1953     PERL_ARGS_ASSERT_LESS_DICEY_SETLOCALE_R;
1954
1955     STDIZED_SETLOCALE_LOCK;
1956
1957     retval = save_to_buffer(stdized_setlocale(category, locale),
1958                             &PL_less_dicey_locale_buf,
1959                             &PL_less_dicey_locale_bufsize);
1960
1961     STDIZED_SETLOCALE_UNLOCK;
1962
1963     return retval;
1964 }
1965
1966 /*---------------------------------------------------------------------------*/
1967
1968 #  define bool_setlocale_r(cat, locale)                                     \
1969                                less_dicey_bool_setlocale_r(cat, locale)
1970 #  define bool_setlocale_i(i, locale)                                       \
1971                                 bool_setlocale_r(categories[i], locale)
1972 #  define bool_setlocale_c(cat, locale) bool_setlocale_r(cat, locale)
1973
1974 STATIC bool
1975 S_less_dicey_bool_setlocale_r(pTHX_ const int cat, const char * locale)
1976 {
1977     bool retval;
1978
1979     PERL_ARGS_ASSERT_LESS_DICEY_BOOL_SETLOCALE_R;
1980
1981     /* Unlikely, but potentially possible that another thread could zap the
1982      * buffer from true to false or vice-versa, so need to lock here */
1983     POSIX_SETLOCALE_LOCK;
1984     retval = cBOOL(posix_setlocale(cat, locale));
1985     POSIX_SETLOCALE_UNLOCK;
1986
1987     return retval;
1988 }
1989
1990 /*---------------------------------------------------------------------------*/
1991
1992 #  define void_setlocale_r_with_caller(cat, locale, file, line)             \
1993      STMT_START {                                                           \
1994         if (! bool_setlocale_r(cat, locale))                                \
1995             setlocale_failure_panic_via_i(get_category_index(cat),          \
1996                                           NULL, locale, __LINE__, 0,        \
1997                                           file, line);                      \
1998      } STMT_END
1999
2000 #  define void_setlocale_c_with_caller(cat, locale, file, line)             \
2001                     void_setlocale_r_with_caller(cat, locale, file, line)
2002
2003 #  define void_setlocale_i_with_caller(i, locale, file, line)               \
2004           void_setlocale_r_with_caller(categories[i], locale, file, line)
2005
2006 #  define void_setlocale_r(cat, locale)                                     \
2007             void_setlocale_r_with_caller(cat, locale, __FILE__, __LINE__)
2008 #  define void_setlocale_c(cat, locale) void_setlocale_r(cat, locale)
2009 #  define void_setlocale_i(i, locale)   void_setlocale_r(categories[i], locale)
2010
2011 /*---------------------------------------------------------------------------*/
2012
2013 /* setlocale_i is only defined for Configurations where the libc setlocale()
2014  * suffices for both querying and setting the locale.  It allows for some
2015  * shortcuts */
2016 #  define setlocale_i(i, locale)  less_dicey_setlocale_r(categories[i], locale)
2017
2018 /*===========================================================================*/
2019
2020 #elif defined(USE_POSIX_2008_LOCALE)
2021 #  ifndef LC_ALL
2022 #    error This code assumes that LC_ALL is available on a system modern enough to have POSIX 2008
2023 #  endif
2024
2025 /* Here, there is a completely different API to get thread-safe locales.  We
2026  * emulate the setlocale() API with our own function(s).  setlocale categories,
2027  * like LC_NUMERIC, are not valid here for the POSIX 2008 API.  Instead, there
2028  * are equivalents, like LC_NUMERIC_MASK, which we use instead, which we find
2029  * by table lookup. */
2030
2031 #  if defined(__GLIBC__) && defined(USE_LOCALE_MESSAGES)
2032             /* https://sourceware.org/bugzilla/show_bug.cgi?id=24936 */
2033 #    define HAS_GLIBC_LC_MESSAGES_BUG
2034 #    include <libintl.h>
2035 #  endif
2036
2037 #  define querylocale_i(i)    querylocale_2008_i(i, __LINE__)
2038 #  define querylocale_c(cat)  querylocale_i(cat##_INDEX_)
2039 #  define querylocale_r(cat)  querylocale_i(get_category_index(cat))
2040
2041 STATIC const char *
2042 S_querylocale_2008_i(pTHX_ const locale_category_index index,
2043                            const line_t caller_line)
2044 {
2045     PERL_ARGS_ASSERT_QUERYLOCALE_2008_I;
2046     assert(index <= LC_ALL_INDEX_);
2047
2048     /* This function returns the name of the locale category given by the input
2049      * 'index' into our parallel tables of them.
2050      *
2051      * POSIX 2008, for some sick reason, chose not to provide a method to find
2052      * the category name of a locale, disregarding a basic linguistic tenet
2053      * that for any object, people will create a name for it.  (The next
2054      * version of the POSIX standard is proposed to fix this.)  Some vendors
2055      * have created a querylocale() function to do this in the meantime.  On
2056      * systems without querylocale(), we have to keep track of what the locale
2057      * has been set to, so that we can return its name so as to emulate
2058      * setlocale().  There are potential problems with this:
2059      *
2060      *  1)  We don't know what calling newlocale() with the locale argument ""
2061      *      actually does.  It gets its values from the program's environment.
2062      *      find_locale_from_environment() is used to work around this.  But it
2063      *      isn't fool-proof.  See the comments for that function for details.
2064      *  2)  It's possible for C code in some library to change the locale
2065      *      without us knowing it, and thus our records become wrong;
2066      *      querylocale() would catch this.  But as of September 2017, there
2067      *      are no occurrences in CPAN of uselocale().  Some libraries do use
2068      *      setlocale(), but that changes the global locale, and threads using
2069      *      per-thread locales will just ignore those changes.
2070      *  3)  Many systems have multiple names for the same locale.  Generally,
2071      *      there is an underlying base name, with aliases that evaluate to it.
2072      *      On some systems, if you set the locale to an alias, and then
2073      *      retrieve the name, you get the alias as expected; but on others you
2074      *      get the base name, not the alias you used.  And sometimes the
2075      *      charade is incomplete.  See
2076      *      https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=269375.
2077      *
2078      *      The code is structured so that the returned locale name when the
2079      *      locale is changed is whatever the result of querylocale() on the
2080      *      new locale is.  This effectively gives the result the system
2081      *      expects.  Without querylocale, the name returned is always the
2082      *      input name.  Theoretically this could cause problems, but khw knows
2083      *      of none so far, but mentions it here in case you are trying to
2084      *      debug something.  (This could be worked around by messing with the
2085      *      global locale temporarily, using setlocale() to get the base name;
2086      *      but that could cause a race.  The comments for
2087      *      find_locale_from_environment() give details on the potential race.)
2088      */
2089
2090     const locale_t cur_obj = uselocale((locale_t) 0);
2091     const char * retval;
2092
2093     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "querylocale_2008_i(%s) on %p;"
2094                                            " called from %" LINE_Tf "\n",
2095                                            category_names[index], cur_obj,
2096                                            caller_line));
2097
2098     if (UNLIKELY(cur_obj == LC_GLOBAL_LOCALE)) {
2099
2100         /* Even on platforms that have querylocale(), it is unclear if they
2101          * work in the global locale, and we have the means to get the correct
2102          * answer anyway.  khw is unsure this situation even comes up these
2103          * days, hence the branch prediction */
2104         POSIX_SETLOCALE_LOCK;
2105         retval = mortalized_pv_copy(posix_setlocale(categories[index], NULL));
2106         POSIX_SETLOCALE_UNLOCK;
2107     }
2108
2109     /* Here we have handled the case of the current locale being the global
2110      * one.  Below is the 'else' case of that.  There are two different
2111      * implementations, depending on USE_PL_CURLOCALES */
2112
2113 #  ifdef USE_PL_CURLOCALES
2114
2115     else {
2116
2117         /* PL_curlocales[] is kept up-to-date for all categories except LC_ALL,
2118          * which may have been invalidated by setting it to NULL, and if so,
2119          * should now be calculated.  (The called function updates that
2120          * element.) */
2121         if (index == LC_ALL_INDEX_ && PL_curlocales[LC_ALL_INDEX_] == NULL) {
2122             calculate_LC_ALL_string((const char **) &PL_curlocales,
2123                                     INTERNAL_FORMAT,
2124                                     WANT_VOID,
2125                                     caller_line);
2126         }
2127
2128         if (cur_obj == PL_C_locale_obj) {
2129
2130             /* If the current locale object is the C object, then the answer is
2131              * "C" or POSIX, regardless of the category.  Handling this
2132              * reasonably likely case specially shortcuts extra effort, and
2133              * hides some bugs from us in OS's that alias other locales to C,
2134              * but do so incompletely.  If our records say it is POSIX, use
2135              * that; otherwise use C.  See
2136              * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=269375 */
2137             retval = mortalized_pv_copy((strEQ(PL_curlocales[index], "POSIX"))
2138                                         ? "POSIX"
2139                                         : "C");
2140         }
2141         else {
2142             retval = mortalized_pv_copy(PL_curlocales[index]);
2143         }
2144     }
2145
2146 #  else
2147
2148     /* Below is the implementation of the 'else' clause which handles the case
2149      * of the current locale not being the global one on platforms where
2150      * USE_PL_CURLOCALES is NOT in effect.  That means the system must have
2151      * some form of querylocale.  But these have varying characteristics, so
2152      * first create some #defines to make the actual 'else' clause uniform.
2153      *
2154      * First, glibc has a function that implements querylocale(), but is called
2155      * something else, and takes the category number; the others take the mask.
2156      * */
2157 #    if defined(USE_QUERYLOCALE) && (   defined(_NL_LOCALE_NAME)            \
2158                                      && defined(HAS_NL_LANGINFO_L))
2159 #      define my_querylocale(index, cur_obj)                                \
2160                 nl_langinfo_l(_NL_LOCALE_NAME(categories[index]), cur_obj)
2161
2162        /* Experience so far shows it is thread-safe, as well as glibc's
2163         * nl_langinfo_l(), so unless overridden, mark it so */
2164 #      ifdef NO_THREAD_SAFE_QUERYLOCALE
2165 #        undef HAS_THREAD_SAFE_QUERYLOCALE
2166 #      else
2167 #        define HAS_THREAD_SAFE_QUERYLOCALE
2168 #      endif
2169 #    else   /* below, ! glibc */
2170
2171        /* Otherwise, use the system's querylocale(). */
2172 #      define my_querylocale(index, cur_obj)                                \
2173                                querylocale(category_masks[index], cur_obj)
2174
2175        /* There is no standard for this function, and khw has never seen
2176         * anything beyond minimal vendor documentation, lacking important
2177         * details.  Experience has shown that some implementations have race
2178         * condiions, and their returns may not be thread safe.  It would be
2179         * unreliable to test for complete thread safety in Configure.  What we
2180         * do instead is to assume that it is thread-safe, unless overriden by,
2181         * say, a hints file specifying
2182         * -Accflags='-DNO_THREAD_SAFE_QUERYLOCALE */
2183 #      ifdef NO_THREAD_SAFE_QUERYLOCALE
2184 #        undef HAS_THREAD_SAFE_QUERYLOCALE
2185 #      else
2186 #        define HAS_THREAD_SAFE_QUERYLOCALE
2187 #      endif
2188 #    endif
2189
2190      /* Here, we have set up enough information to know if this querylocale()
2191       * is thread-safe, or needs to use a mutex */
2192 #    ifdef HAS_THREAD_SAFE_QUERYLOCALE
2193 #      define QUERYLOCALE_LOCK
2194 #      define QUERYLOCALE_UNLOCK
2195 #    else
2196 #      define QUERYLOCALE_LOCK    gwLOCALE_LOCK
2197 #      define QUERYLOCALE_UNLOCK  gwLOCALE_UNLOCK
2198 #    endif
2199
2200     /* Finally, everything is ready, so here is the 'else' clause to implement
2201      * the case of the current locale not being the global one on systems that
2202      * have some form of querylocale().  (POSIX will presumably eventually
2203      * publish their next version in their pipeline, which will define a
2204      * precisely specified querylocale equivalent, and there can be a new
2205      * #ifdef to use it without having to guess at its characteristics) */
2206
2207     else {
2208         /* We don't keep records when there is querylocale(), so as to avoid the
2209          * pitfalls mentioned at the beginning of this function.
2210          *
2211          * That means LC_ALL has to be calculated from all its constituent
2212          * categories each time, since the querylocale() forms on many (if not
2213          * all) platforms only work on individual categories */
2214         if (index == LC_ALL_INDEX_) {
2215             retval = calculate_LC_ALL_string(NULL, INTERNAL_FORMAT,
2216                                              WANT_TEMP_PV,
2217                                              caller_line);
2218         }
2219         else {
2220
2221             QUERYLOCALE_LOCK;
2222             retval = savepv(my_querylocale(index, cur_obj));
2223             QUERYLOCALE_UNLOCK;
2224
2225             /* querylocale() may conflate the C locale with something that
2226              * isn't exactly the same.  See for example
2227              * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=269375
2228              * We know that if the locale object is the C one, we
2229              * are in the C locale, which may go by the name POSIX, as both, by
2230              * definition, are equivalent.  But we consider any other name
2231              * spurious, so override with "C".  As in the PL_CURLOCALES case
2232              * above, this hides those glitches, for the most part, from the
2233              * rest of our code.  (The code is ordered this way so that if the
2234              * system distinugishes "C" from "POSIX", we do too.) */
2235             if (cur_obj == PL_C_locale_obj && ! isNAME_C_OR_POSIX(retval)) {
2236                 Safefree(retval);
2237                 retval = savepv("C");
2238             }
2239
2240             SAVEFREEPV(retval);
2241         }
2242     }
2243
2244 #    undef QUERYLOCALE_LOCK
2245 #    undef QUERYLOCALE_UNLOCK
2246 #  endif
2247
2248     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2249                            "querylocale_2008_i(%s) returning '%s'\n",
2250                            category_names[index], retval));
2251     assert(strNE(retval, ""));
2252     return retval;
2253 }
2254
2255 /*---------------------------------------------------------------------------*/
2256
2257 #  define bool_setlocale_i(i, locale)                                       \
2258                               bool_setlocale_2008_i(i, locale, __LINE__)
2259 #  define bool_setlocale_c(cat, locale)                                     \
2260                                   bool_setlocale_i(cat##_INDEX_, locale)
2261 #  define bool_setlocale_r(cat, locale)                                     \
2262                        bool_setlocale_i(get_category_index(cat), locale)
2263
2264 /* If this doesn't exist on this platform, make it a no-op (to save #ifdefs) */
2265 #  ifndef update_PL_curlocales_i
2266 #    define update_PL_curlocales_i(index, new_locale, caller_line)
2267 #  endif
2268
2269 STATIC bool
2270 S_bool_setlocale_2008_i(pTHX_
2271
2272         /* Our internal index of the 'category' setlocale is called with */
2273         const locale_category_index  index,
2274         const char * new_locale,    /* The locale to set the category to */
2275         const line_t caller_line    /* Called from this line number */
2276        )
2277 {
2278     PERL_ARGS_ASSERT_BOOL_SETLOCALE_2008_I;
2279     assert(index <= LC_ALL_INDEX_);
2280
2281     /* This function effectively performs a setlocale() on just the current
2282      * thread; thus it is thread-safe.  It does this by using the POSIX 2008
2283      * locale functions to emulate the behavior of setlocale().  Similar to
2284      * regular setlocale(), the return from this function points to memory that
2285      * can be overwritten by other system calls, so needs to be copied
2286      * immediately if you need to retain it.  The difference here is that
2287      * system calls besides another setlocale() can overwrite it.
2288      *
2289      * By doing this, most locale-sensitive functions become thread-safe.  The
2290      * exceptions are mostly those that return a pointer to static memory.
2291      */
2292
2293     int mask = category_masks[index];
2294     const locale_t entry_obj = uselocale((locale_t) 0);
2295     const char * locale_on_entry = querylocale_i(index);
2296
2297     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2298                            "bool_setlocale_2008_i: input=%d (%s), mask=0x%x,"
2299                            " new locale=\"%s\", current locale=\"%s\","
2300                            " index=%d, entry object=%p;"
2301                            " called from %" LINE_Tf "\n",
2302                            categories[index], category_names[index], mask,
2303                            ((new_locale == NULL) ? "(nil)" : new_locale),
2304                            locale_on_entry, index, entry_obj, caller_line));
2305
2306     /* Here, trying to change the locale, but it is a no-op if the new boss is
2307      * the same as the old boss.  Except this routine is called when converting
2308      * from the global locale, so in that case we will create a per-thread
2309      * locale below (with the current values).  It also seemed that newlocale()
2310      * could free up the basis locale memory if we called it with the new and
2311      * old being the same, but khw now thinks that this was due to some other
2312      * bug, since fixed, as there are other places where newlocale() gets
2313      * similarly called without problems. */
2314     if (   entry_obj != LC_GLOBAL_LOCALE
2315         && locale_on_entry
2316         && strEQ(new_locale, locale_on_entry))
2317     {
2318         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2319                                "bool_setlocale_2008_i: no-op to change to"
2320                                " what it already was\n"));
2321         return true;
2322     }
2323
2324 #  ifndef USE_QUERYLOCALE
2325
2326     /* Without a querylocale() mechanism, we have to figure out ourselves what
2327      * happens with setting a locale to "" */
2328
2329     if (strEQ(new_locale, "")) {
2330         new_locale = find_locale_from_environment(index);
2331         if (! new_locale) {
2332             SET_EINVAL;
2333             return false;
2334         }
2335     }
2336
2337 #  endif
2338 #  ifdef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
2339
2340     const bool need_loop = false;
2341
2342 #  else
2343
2344     bool need_loop = false;
2345     const char * new_locales[LC_ALL_INDEX_] = { NULL };
2346
2347     /* If we're going to have to parse the LC_ALL string, might as well do it
2348      * now before we have made changes that we would have to back out of if the
2349      * parse fails */
2350     if (index == LC_ALL_INDEX_) {
2351         switch (parse_LC_ALL_string(new_locale,
2352                                     (const char **) &new_locales,
2353                                     override_if_ignored,
2354                                     false,    /* Return only [0] if suffices */
2355                                     false,    /* Don't panic on error */
2356                                     caller_line))
2357         {
2358           case invalid:
2359             SET_EINVAL;
2360             return false;
2361
2362           case no_array:
2363             need_loop = false;
2364             break;
2365
2366           case only_element_0:
2367             SAVEFREEPV(new_locales[0]);
2368             new_locale = new_locales[0];
2369             need_loop = false;
2370             break;
2371
2372           case full_array:
2373             need_loop = true;
2374             break;
2375         }
2376     }
2377
2378 #  endif
2379 #  ifdef HAS_GLIBC_LC_MESSAGES_BUG
2380
2381     /* For this bug, if the LC_MESSAGES locale changes, we have to do an
2382      * expensive workaround.  Save the current value so we can later determine
2383      * if it changed. */
2384     const char * old_messages_locale = NULL;
2385     if (   (index == LC_MESSAGES_INDEX_ || index == LC_ALL_INDEX_)
2386         &&  LIKELY(PL_phase != PERL_PHASE_CONSTRUCT))
2387     {
2388         old_messages_locale = querylocale_c(LC_MESSAGES);
2389     }
2390
2391 #  endif
2392
2393     assert(PL_C_locale_obj);
2394
2395     /* Now ready to switch to the input 'new_locale' */
2396
2397     /* Switching locales generally entails freeing the current one's space (at
2398      * the C library's discretion), hence we can't be using that locale at the
2399      * time of the switch (this wasn't obvious to khw from the man pages).  So
2400      * switch to a known locale object that we don't otherwise mess with. */
2401     if (! uselocale(PL_C_locale_obj)) {
2402
2403         /* Not being able to change to the C locale is severe; don't keep
2404          * going.  */
2405         setlocale_failure_panic_i(index, locale_on_entry, "C",
2406                                   __LINE__, caller_line);
2407         NOT_REACHED; /* NOTREACHED */
2408     }
2409
2410     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2411                            "bool_setlocale_2008_i: now using C"
2412                            " object=%p\n", PL_C_locale_obj));
2413
2414     /* These two objects are special:
2415      *  LC_GLOBAL_LOCALE    because it is undefined behavior to call
2416      *                      newlocale() with it as a parameter.
2417      *  PL_C_locale_obj     because newlocale() generally destroys its locale
2418      *                      object parameter when it succeeds; and we don't
2419      *                      want that happening to this immutable object.
2420      * Copies will be made for them to use instead if we get so far as to call
2421      * newlocale(). */
2422     bool entry_obj_is_special = (   entry_obj == LC_GLOBAL_LOCALE
2423                                  || entry_obj == PL_C_locale_obj);
2424     locale_t new_obj;
2425
2426     /* PL_C_locale_obj is LC_ALL set to the C locale.  If this call is to
2427      * switch to LC_ALL => C, simply use that object.  But in fact, we already
2428      * have switched to it just above, in preparation for the general case.
2429      * Since we're already there, no need to do further switching. */
2430     if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(new_locale)) {
2431         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2432                                "bool_setlocale_2008_i: will stay in C"
2433                                " object\n"));
2434         new_obj = PL_C_locale_obj;
2435
2436         /* 'entry_obj' is now dangling, of no further use to anyone (unless it
2437          * is one of the special ones).  Free it to avoid a leak */
2438         if (! entry_obj_is_special) {
2439             freelocale(entry_obj);
2440         }
2441
2442         update_PL_curlocales_i(index, new_locale, caller_line);
2443     }
2444     else {  /* Here is the general case, not to LC_ALL => C */
2445
2446         /* The newlocale() call(s) below take a basis object to build upon to
2447          * create the changed locale, trashing it iff successful.
2448          *
2449          * For the objects that are not to be modified by this function, we
2450          * create a duplicate that gets trashed instead.
2451          *
2452          * Also if we will have to loop doing multiple newlocale()s, there is a
2453          * chance we will succeed for the first few, and then fail, having to
2454          * back out.  We need to duplicate 'entry_obj' in this case as well, so
2455          * it remains valid as something to back out to. */
2456         locale_t basis_obj = entry_obj;
2457
2458         if (entry_obj_is_special || need_loop) {
2459             basis_obj = duplocale(basis_obj);
2460             if (! basis_obj) {
2461                 locale_panic_via_("duplocale failed", __FILE__, caller_line);
2462                 NOT_REACHED; /* NOTREACHED */
2463             }
2464
2465             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2466                                    "bool_setlocale_2008_i created %p by"
2467                                    " duping the input\n", basis_obj));
2468         }
2469
2470 #  define DEBUG_NEW_OBJECT_CREATED(category, locale, new, old, caller_line) \
2471       DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                \
2472                              "bool_setlocale_2008_i(%s, %s): created %p"    \
2473                              " while freeing %p; called from %" LINE_Tf     \
2474                              " via %" LINE_Tf "\n",                         \
2475                              category, locale, new, old,                    \
2476                              caller_line, __LINE__))
2477 #  define DEBUG_NEW_OBJECT_FAILED(category, locale, basis_obj)              \
2478       DEBUG_L(PerlIO_printf(Perl_debug_log,                                 \
2479                             "bool_setlocale_2008_i: creating new object"    \
2480                             " for (%s '%s') from %p failed; called from %"  \
2481                             LINE_Tf " via %" LINE_Tf "\n",                  \
2482                             category, locale, basis_obj,                    \
2483                             caller_line, __LINE__));
2484
2485         /* Ready to create a new locale by modification of the existing one.
2486          *
2487          * NOTE: This code may incorrectly show up as a leak under the address
2488          * sanitizer. We do not free this object under normal teardown, however
2489          * you can set PERL_DESTRUCT_LEVEL=2 to cause it to be freed.
2490          */
2491
2492 #  ifdef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
2493
2494         /* Some platforms have a newlocale() that can handle disparate LC_ALL
2495          * input, so on these a single call to newlocale() always works */
2496 #  else
2497
2498         /* If a single call to newlocale() will do */
2499         if (! need_loop)
2500
2501 #  endif
2502
2503         {
2504             new_obj = newlocale(mask,
2505                                 override_ignored_category(index, new_locale),
2506                                 basis_obj);
2507             if (! new_obj) {
2508                 DEBUG_NEW_OBJECT_FAILED(category_names[index], new_locale,
2509                                         basis_obj);
2510
2511                 /* Since the call failed, it didn't trash 'basis_obj', which is
2512                  * a dup for these objects, and hence would leak if we don't
2513                  * free it.  XXX However, something is seriously wrong if we
2514                  * can't switch to C or the global locale, so maybe should
2515                  * panic instead */
2516                 if (entry_obj_is_special) {
2517                     freelocale(basis_obj);
2518                 }
2519
2520                 goto must_restore_state;
2521             }
2522
2523             DEBUG_NEW_OBJECT_CREATED(category_names[index], new_locale,
2524                                      new_obj, basis_obj, caller_line);
2525
2526             update_PL_curlocales_i(index, new_locale, caller_line);
2527         }
2528
2529 #  ifndef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
2530
2531         else {  /* Need multiple newlocale() calls */
2532
2533             /* Loop through the individual categories, setting the locale of
2534              * each to the corresponding name previously populated into
2535              * newlocales[].  Each iteration builds on the previous one, adding
2536              * its category to what's already been calculated, and taking as a
2537              * basis for what's been calculated 'basis_obj', which is updated
2538              * each iteration to be the result of the previous one.  Upon
2539              * success, newlocale() trashes the 'basis_obj' parameter to it.
2540              * If any iteration fails, we immediately give up, restore the
2541              * locale to what it was at the time this function was called
2542              * (saved in 'entry_obj'), and return failure. */
2543
2544             /* Loop, using the previous iteration's result as the basis for the
2545              * next one.  (The first time we effectively use the locale in
2546              * force upon entry to this function.) */
2547             for_all_individual_category_indexes(i) {
2548                 new_obj = newlocale(category_masks[i],
2549                                     new_locales[i],
2550                                     basis_obj);
2551                 if (new_obj) {
2552                     DEBUG_NEW_OBJECT_CREATED(category_names[i],
2553                                              new_locales[i],
2554                                              new_obj, basis_obj,
2555                                              caller_line);
2556                     basis_obj = new_obj;
2557                     continue;
2558                 }
2559
2560                 /* Failed.  Likely this is because the proposed new locale
2561                  * isn't valid on this system. */
2562
2563                 DEBUG_NEW_OBJECT_FAILED(category_names[i],
2564                                         new_locales[i],
2565                                         basis_obj);
2566
2567                 /* newlocale() didn't trash this, since the function call
2568                  * failed */
2569                 freelocale(basis_obj);
2570
2571                 for_all_individual_category_indexes(j) {
2572                     Safefree(new_locales[j]);
2573                 }
2574
2575                 goto must_restore_state;
2576             }
2577
2578             /* Success for all categories. */
2579             for_all_individual_category_indexes(i) {
2580                 update_PL_curlocales_i(i, new_locales[i], caller_line);
2581                 Safefree(new_locales[i]);
2582             }
2583
2584             /* We dup'd entry_obj in case we had to fall back to it.  The
2585              * newlocale() above destroyed the dup when it first succeeded, but
2586              * entry_obj itself is left dangling, so free it */
2587             if (! entry_obj_is_special) {
2588                 freelocale(entry_obj);
2589             }
2590         }
2591
2592 #  endif    /* End of newlocale can't handle disparate LC_ALL input */
2593
2594     }
2595
2596 #  undef DEBUG_NEW_OBJECT_CREATED
2597 #  undef DEBUG_NEW_OBJECT_FAILED
2598
2599     /* Here, successfully created an object representing the desired locale;
2600      * now switch into it */
2601     if (! uselocale(new_obj)) {
2602         freelocale(new_obj);
2603         locale_panic_(Perl_form(aTHX_ "(called from %" LINE_Tf "):"
2604                                       " bool_setlocale_2008_i: switching"
2605                                       " into new locale failed",
2606                                       caller_line));
2607     }
2608
2609     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2610                            "bool_setlocale_2008_i: now using %p\n", new_obj));
2611
2612 #  ifdef MULTIPLICITY   /* Unlikely, but POSIX 2008 functions could be
2613                            Configured to be used on unthreaded perls, in which
2614                            case this object doesn't exist */
2615
2616     if (DEBUG_Lv_TEST) {
2617         if (PL_cur_locale_obj != new_obj) {
2618             PerlIO_printf(Perl_debug_log,
2619                           "bool_setlocale_2008_i: PL_cur_locale_obj"
2620                           " was %p, now is %p\n",
2621                           PL_cur_locale_obj, new_obj);
2622         }
2623     }
2624
2625     /* Update the current object */
2626     PL_cur_locale_obj = new_obj;
2627
2628 #  endif
2629 #  ifdef HAS_GLIBC_LC_MESSAGES_BUG
2630
2631     /* Invalidate the glibc cache of loaded translations if the locale has
2632      * changed, see [perl #134264] and
2633      * https://sourceware.org/bugzilla/show_bug.cgi?id=24936 */
2634     if (old_messages_locale) {
2635         if (strNE(old_messages_locale, querylocale_c(LC_MESSAGES))) {
2636             textdomain(textdomain(NULL));
2637         }
2638     }
2639
2640 #  endif
2641
2642     return true;
2643
2644   must_restore_state:
2645
2646     /* We earlier switched to the LC_ALL => C locale in anticipation of it
2647      * succeeding,  Now have to switch back to the state upon entry.  */
2648     if (! uselocale(entry_obj)) {
2649         setlocale_failure_panic_i(index, "switching back to",
2650                                   locale_on_entry, __LINE__, caller_line);
2651     }
2652
2653     return false;
2654 }
2655
2656 /*---------------------------------------------------------------------------*/
2657
2658 #  define void_setlocale_i_with_caller(i, locale, file, line)               \
2659      STMT_START {                                                           \
2660         if (! bool_setlocale_i(i, locale))                                  \
2661             setlocale_failure_panic_via_i(i, NULL, locale, __LINE__, 0,     \
2662                                           file, line);                      \
2663      } STMT_END
2664
2665 #  define void_setlocale_r_with_caller(cat, locale, file, line)             \
2666         void_setlocale_i_with_caller(get_category_index(cat), locale,       \
2667                                      file, line)
2668
2669 #  define void_setlocale_c_with_caller(cat, locale, file, line)             \
2670             void_setlocale_i_with_caller(cat##_INDEX_, locale, file, line)
2671
2672 #  define void_setlocale_i(i, locale)                                       \
2673                 void_setlocale_i_with_caller(i, locale, __FILE__, __LINE__)
2674 #  define void_setlocale_c(cat, locale)                                     \
2675                                   void_setlocale_i(cat##_INDEX_, locale)
2676 #  define void_setlocale_r(cat, locale)                                     \
2677                   void_setlocale_i(get_category_index(cat), locale)
2678
2679 /*===========================================================================*/
2680
2681 #else
2682 #  error Unexpected Configuration
2683 #endif   /* End of the various implementations of the setlocale and
2684             querylocale macros used in the remainder of this program */
2685
2686 /* query_nominal_locale_i() is used when the caller needs the locale that an
2687  * external caller would be expecting, and not what we're secretly using
2688  * behind the scenes.  It deliberately doesn't handle LC_ALL; use
2689  * calculate_LC_ALL_string() for that. */
2690 #ifdef USE_LOCALE_NUMERIC
2691 #  define query_nominal_locale_i(i)                                         \
2692       (__ASSERT_(i != LC_ALL_INDEX_)                                        \
2693        ((i == LC_NUMERIC_INDEX_) ? PL_numeric_name : querylocale_i(i)))
2694 #elif defined(USE_LOCALE)
2695 #  define query_nominal_locale_i(i)                                         \
2696       (__ASSERT_(i != LC_ALL_INDEX_) querylocale_i(i))
2697 #else
2698 #  define query_nominal_locale_i(i)  "C"
2699 #endif
2700
2701 #ifdef USE_PL_CURLOCALES
2702
2703 STATIC void
2704 S_update_PL_curlocales_i(pTHX_
2705                          const locale_category_index index,
2706                          const char * new_locale,
2707                          const line_t caller_line)
2708 {
2709     /* Update PL_curlocales[], which is parallel to the other ones indexed by
2710      * our mapping of libc category number to our internal equivalents. */
2711
2712     PERL_ARGS_ASSERT_UPDATE_PL_CURLOCALES_I;
2713     assert(index <= LC_ALL_INDEX_);
2714
2715     if (index == LC_ALL_INDEX_) {
2716
2717         /* For LC_ALL, we change all individual categories to correspond,
2718          * including the LC_ALL element */
2719         for (unsigned int i = 0; i <= LC_ALL_INDEX_; i++) {
2720             Safefree(PL_curlocales[i]);
2721             PL_curlocales[i] = NULL;
2722         }
2723
2724         switch (parse_LC_ALL_string(new_locale,
2725                                     (const char **) &PL_curlocales,
2726                                     check_that_overridden,  /* things should
2727                                                                have already
2728                                                                been overridden
2729                                                                */
2730                                     true,   /* Always fill array */
2731                                     true,   /* Panic if fails, as to get here
2732                                                it earlier had to have succeeded
2733                                                */
2734                                    caller_line))
2735         {
2736           case invalid:
2737           case no_array:
2738           case only_element_0:
2739             locale_panic_via_("Unexpected return from parse_LC_ALL_string",
2740                               __FILE__, caller_line);
2741
2742           case full_array:
2743             /* parse_LC_ALL_string() has already filled PL_curlocales properly,
2744              * except for the LC_ALL element, which should be set to
2745              * 'new_locale'. */
2746             PL_curlocales[LC_ALL_INDEX_] = savepv(new_locale);
2747         }
2748     }
2749     else {  /* Not LC_ALL */
2750
2751         /* Update the single category's record */
2752         Safefree(PL_curlocales[index]);
2753         PL_curlocales[index] = savepv(new_locale);
2754
2755         /* Invalidate LC_ALL */
2756         Safefree(PL_curlocales[LC_ALL_INDEX_]);
2757         PL_curlocales[LC_ALL_INDEX_] = NULL;
2758     }
2759 }
2760
2761 #  endif  /* Need PL_curlocales[] */
2762
2763 /*===========================================================================*/
2764
2765 #if defined(USE_LOCALE)
2766
2767 /* This paradigm is needed in several places in the function below.  We have to
2768  * substitute the nominal locale for LC_NUMERIC when returning a value for
2769  * external consumption */
2770 #  ifndef USE_LOCALE_NUMERIC
2771 #    define ENTRY(i, array, format)  array[i]
2772 #  else
2773 #    define ENTRY(i, array, format)                         \
2774        (UNLIKELY(   format == EXTERNAL_FORMAT_FOR_QUERY     \
2775                  && i == LC_NUMERIC_INDEX_)                 \
2776         ? PL_numeric_name                                   \
2777         : array[i])
2778 #  endif
2779
2780 STATIC
2781 const char *
2782 S_calculate_LC_ALL_string(pTHX_ const char ** category_locales_list,
2783                                 const calc_LC_ALL_format format,
2784                                 const calc_LC_ALL_return returning,
2785                                 const line_t caller_line)
2786 {
2787     PERL_ARGS_ASSERT_CALCULATE_LC_ALL_STRING;
2788
2789     /* NOTE: On Configurations that have PL_curlocales[], this function has the
2790      * side effect of updating the LC_ALL_INDEX_ element with its result.
2791      *
2792      * This function calculates a string that defines the locale(s) LC_ALL is
2793      * set to, in either:
2794      *  1)  Our internal format if 'format' is set to INTERNAL_FORMAT.
2795      *  2)  The external format returned by Perl_setlocale() if 'format' is set
2796      *      to EXTERNAL_FORMAT_FOR_QUERY or EXTERNAL_FORMAT_FOR_SET.
2797      *
2798      *      These two are distinguished by:
2799      *       a) EXTERNAL_FORMAT_FOR_SET returns the actual locale currently in
2800      *          effect.
2801      *       b) EXTERNAL_FORMAT_FOR_QUERY returns the nominal locale.
2802      *          Currently this can differ only from the actual locale in the
2803      *          LC_NUMERIC category when it is set to a locale whose radix is
2804      *          not a dot.  (The actual locale is kept as a dot to accommodate
2805      *          the large corpus of XS code that expects it to be that;
2806      *          switched to a non-dot temporarily during certain operations
2807      *          that require the actual radix.)
2808      *
2809      * In both 1) and 2), LC_ALL's values are passed to this function by
2810      * 'category_locales_list' which is either:
2811      *  1) a pointer to an array of strings with up-to-date values of all the
2812      *     individual categories; or
2813      *  2) NULL, to indicate to use querylocale_i() to get each individual
2814      *     value.
2815      *
2816      * The caller sets 'returning' to
2817      *      WANT_TEMP_PV        the function returns the calculated string
2818      *                              as a mortalized temporary, so the caller
2819      *                              doesn't have to worry about it being
2820      *                              per-thread, nor needs to arrange for its
2821      *                              clean-up.
2822      *      WANT_PL_setlocale_buf  the function stores the calculated string
2823      *                              into the per-thread buffer PL_setlocale_buf
2824      *                              and returns a pointer to that.  The buffer
2825      *                              is cleaned up automatically in process
2826      *                              destruction.  This return method avoids
2827      *                              extra copies in some circumstances.
2828      *      WANT_VOID           NULL is returned.  This is used when the
2829      *                              function is being called only for its side
2830      *                              effect of updating
2831      *                              PL_curlocales[LC_ALL_INDEX_]
2832      *
2833      * querylocale(), on systems that have it, doesn't tend to work for LC_ALL.
2834      * So we have to construct the answer ourselves based on the passed in
2835      * data.
2836      *
2837      * If all individual categories are the same locale, we can just set LC_ALL
2838      * to that locale.  But if not, we have to create an aggregation of all the
2839      * categories on the system.  Platforms differ as to the syntax they use
2840      * for these non-uniform locales for LC_ALL.  Some, like glibc and Windows,
2841      * use an unordered series of name=value pairs, like
2842      *      LC_NUMERIC=C;LC_TIME=en_US.UTF-8;...
2843      * to specify LC_ALL; others, like *BSD, use a positional notation with a
2844      * delimitter, typically a single '/' character:
2845      *      C/en_UK.UTF-8/...
2846      *
2847      * When the external format is desired, this function returns whatever the
2848      * system expects.  The internal format is always name=value pairs.
2849      *
2850      * For systems that have categories we don't know about, the algorithm
2851      * below won't know about those missing categories, leading to potential
2852      * bugs for code that looks at them.  If there is an environment variable
2853      * that sets that category, we won't know to look for it, and so our use of
2854      * LANG or "C" improperly overrides it.  On the other hand, if we don't do
2855      * what is done here, and there is no environment variable, the category's
2856      * locale should be set to LANG or "C".  So there is no good solution.  khw
2857      * thinks the best is to make sure we have a complete list of possible
2858      * categories, adding new ones as they show up on obscure platforms.
2859      */
2860
2861     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2862                            "Entering calculate_LC_ALL_string(%s);"
2863                            " called from %" LINE_Tf "\n",
2864                            ((format == EXTERNAL_FORMAT_FOR_QUERY)
2865                             ? "EXTERNAL_FORMAT_FOR_QUERY"
2866                             : ((format == EXTERNAL_FORMAT_FOR_SET)
2867                                ? "EXTERNAL_FORMAT_FOR_SET"
2868                                : "INTERNAL_FORMAT")),
2869                            caller_line));
2870
2871     bool input_list_was_NULL = (category_locales_list == NULL);
2872
2873     /* If there was no input category list, construct a temporary one
2874      * ourselves. */
2875     const char * my_category_locales_list[LC_ALL_INDEX_];
2876     const char ** locales_list = category_locales_list;
2877     if (locales_list == NULL) {
2878         locales_list = my_category_locales_list;
2879
2880         if (format == EXTERNAL_FORMAT_FOR_QUERY) {
2881             for_all_individual_category_indexes(i) {
2882                 locales_list[i] = query_nominal_locale_i(i);
2883             }
2884         }
2885         else {
2886             for_all_individual_category_indexes(i) {
2887                 locales_list[i] = querylocale_i(i);
2888             }
2889         }
2890     }
2891
2892     /* While we are calculating LC_ALL, we see if every category's locale is
2893      * the same as every other's or not. */
2894 #  ifndef HAS_IGNORED_LOCALE_CATEGORIES_
2895
2896     /* When we pay attention to all categories, we assume they are all the same
2897      * until proven different */
2898     bool disparate = false;
2899
2900 #  else
2901
2902     /* But if there are ignored categories, those will be set to "C", so try an
2903      * arbitrary category, and if it isn't C, we know immediately that the
2904      * locales are disparate.  (The #if conditionals are to handle the case
2905      * where LC_NUMERIC_INDEX_ is 0.  We don't want to use LC_NUMERIC to
2906      * compare, as that may be different between external and internal forms.)
2907      * */
2908 #    if ! defined(USE_LOCALE_NUMERIC)
2909
2910     bool disparate = ! isNAME_C_OR_POSIX(locales_list[0]);
2911
2912 #    elif LC_NUMERIC_INDEX_ != 0
2913
2914     bool disparate = ! isNAME_C_OR_POSIX(locales_list[0]);
2915
2916 #    else
2917
2918     /* Would need revision to handle the very unlikely case where only a single
2919      * category, LC_NUMERIC, is defined */
2920     assert(LOCALE_CATEGORIES_COUNT_ > 0);
2921
2922     bool disparate = ! isNAME_C_OR_POSIX(locales_list[1]);
2923
2924 #    endif
2925 #  endif
2926
2927     /* Calculate the needed size for the string listing the individual locales.
2928      * Initialize with values known at compile time. */
2929     Size_t total_len;
2930     const char *separator;
2931
2932 #  ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS  /* Positional formatted LC_ALL */
2933     PERL_UNUSED_ARG(format);
2934 #  else
2935
2936     if (format != INTERNAL_FORMAT) {
2937
2938         /* Here, we will be using positional notation.  it includes n-1
2939          * separators */
2940         total_len = (  LOCALE_CATEGORIES_COUNT_ - 1)
2941                      * STRLENs(PERL_LC_ALL_SEPARATOR)
2942                   + 1;   /* And a trailing NUL */
2943         separator = PERL_LC_ALL_SEPARATOR;
2944     }
2945     else
2946
2947 #  endif
2948
2949     {
2950         /* name=value output is always used in internal format, and when
2951          * positional isn't available on the platform. */
2952         total_len = lc_all_boiler_plate_length;
2953         separator = ";";
2954     }
2955
2956     /* The total length then is just the sum of the above boiler-plate plus the
2957      * total strlen()s of the locale name of each individual category. */
2958     for_all_individual_category_indexes(i) {
2959         const char * entry = ENTRY(i, locales_list, format);
2960
2961         total_len += strlen(entry);
2962         if (! disparate && strNE(entry, locales_list[0])) {
2963             disparate = true;
2964         }
2965     }
2966
2967     bool free_if_void_return = false;
2968     const char * retval;
2969
2970     /* If all categories have the same locale, we already know the answer */
2971     if (! disparate) {
2972         if (returning == WANT_PL_setlocale_buf) {
2973             save_to_buffer(locales_list[0],
2974                            &PL_setlocale_buf,
2975                            &PL_setlocale_bufsize);
2976             retval = PL_setlocale_buf;
2977         }
2978         else {
2979
2980             retval = locales_list[0];
2981
2982             /* If a temporary is wanted for the return, and we had to create
2983              * the input list ourselves, we created it into such a temporary,
2984              * so no further work is needed; but otherwise, make a mortal copy
2985              * of this passed-in list element */
2986             if (returning == WANT_TEMP_PV && ! input_list_was_NULL) {
2987                 retval = savepv(retval);
2988                 SAVEFREEPV(retval);
2989             }
2990
2991             /* In all cases here, there's nothing we create that needs to be
2992              * freed, so leave 'free_if_void_return' set to the default
2993              * 'false'. */
2994         }
2995     }
2996     else {  /* Here, not all categories have the same locale */
2997
2998         char * constructed;
2999
3000         /* If returning to PL_setlocale_buf, set up to write directly to it,
3001          * being sure it is resized to be large enough */
3002         if (returning == WANT_PL_setlocale_buf) {
3003             set_save_buffer_min_size(total_len,
3004                                      &PL_setlocale_buf,
3005                                      &PL_setlocale_bufsize);
3006             constructed = PL_setlocale_buf;
3007         }
3008         else {  /* Otherwise we need new memory to hold the calculated value. */
3009
3010             Newx(constructed, total_len, char);
3011
3012             /* If returning the new memory, it must be set up to be freed
3013              * later; otherwise at the end of this function */
3014             if (returning == WANT_TEMP_PV) {
3015                 SAVEFREEPV(constructed);
3016             }
3017             else {
3018                 free_if_void_return = true;
3019             }
3020         }
3021
3022         constructed[0] = '\0';
3023
3024         /* Loop through all the categories */
3025         for_all_individual_category_indexes(j) {
3026
3027             /* Add a separator, except before the first one */
3028             if (j != 0) {
3029                 my_strlcat(constructed, separator, total_len);
3030             }
3031
3032             const char * entry;
3033             Size_t needed_len;
3034             unsigned int i = j;
3035
3036 #  ifndef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
3037
3038             if (UNLIKELY(format != INTERNAL_FORMAT)) {
3039
3040                 /* In positional notation 'j' means the position, and we have
3041                  * to convert to the index 'i' */
3042                 i = map_LC_ALL_position_to_index[j];
3043
3044                 entry = ENTRY(i, locales_list, format);
3045                 needed_len = my_strlcat(constructed, entry, total_len);
3046             }
3047             else
3048
3049 #  endif
3050             {
3051                 /* Below, we are to use name=value notation, either because
3052                  * that's what the platform uses, or because this is the
3053                  * internal format, which uses that notation regardless of the
3054                  * external form */
3055
3056                 entry = ENTRY(i, locales_list, format);
3057
3058                 /* "name=locale;" */
3059                 my_strlcat(constructed, category_names[i], total_len);
3060                 my_strlcat(constructed, "=", total_len);
3061                 needed_len = my_strlcat(constructed, entry, total_len);
3062             }
3063
3064             if (LIKELY(needed_len <= total_len)) {
3065                 continue;
3066             }
3067
3068             /* If would have overflowed, panic */
3069             locale_panic_via_(Perl_form(aTHX_
3070                                         "Internal length calculation wrong.\n"
3071                                         "\"%s\" was not entirely added to"
3072                                         " \"%.*s\"; needed=%zu, had=%zu",
3073                                         entry, (int) total_len,
3074                                         constructed,
3075                                         needed_len, total_len),
3076                                 __FILE__,
3077                                 caller_line);
3078         } /* End of loop through the categories */
3079
3080         retval = constructed;
3081     } /* End of the categories' locales are displarate */
3082
3083 #  if defined(USE_PL_CURLOCALES) && defined(LC_ALL)
3084
3085     if (format == INTERNAL_FORMAT) {
3086
3087         /* PL_curlocales[LC_ALL_INDEX_] is updated as a side-effect of this
3088          * function for internal format. */
3089         Safefree(PL_curlocales[LC_ALL_INDEX_]);
3090         PL_curlocales[LC_ALL_INDEX_] = savepv(retval);
3091     }
3092
3093 #  endif
3094
3095     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3096                            "calculate_LC_ALL_string calculated '%s'\n",
3097                            retval));
3098
3099     if (returning == WANT_VOID) {
3100         if (free_if_void_return) {
3101             Safefree(retval);
3102         }
3103
3104         return NULL;
3105     }
3106
3107     return retval;
3108 }
3109
3110 #  if defined(WIN32) || (     defined(USE_POSIX_2008_LOCALE)        \
3111                          && ! defined(USE_QUERYLOCALE))
3112
3113 STATIC const char *
3114 S_find_locale_from_environment(pTHX_ const locale_category_index index)
3115 {
3116     /* NB: This function may actually change the locale on Windows.  It
3117      * currently is designed to be called only from setting the locale on
3118      * Windows, and POSIX 2008
3119      *
3120      * This function returns the locale specified by the program's environment
3121      * for the category specified by our internal index number 'index'.  It
3122      * therefore simulates:
3123      *      setlocale(cat, "")
3124      * but, except for some cases in Windows, doesn't actually change the
3125      * locale; merely returns it.
3126      *
3127      * The return need not be freed by the caller.  This
3128      * promise relies on PerlEnv_getenv() returning a mortalized copy to us.
3129      *
3130      * The simulation is needed only on certain platforms; otherwise, libc is
3131      * called with "" to get the actual value(s).  The simulation is needed
3132      * for:
3133      *
3134      *  1)  On Windows systems, the concept of the POSIX ordering of
3135      *      environment variables is missing.  To increase portability of
3136      *      programs across platforms, the POSIX ordering is emulated on
3137      *      Windows.
3138      *
3139      *  2)  On POSIX 2008 systems without querylocale(), it is problematic
3140      *      getting the results of the POSIX 2008 equivalent of
3141      *
3142      *          setlocale(category, "")
3143      *
3144      *      To ensure that we know exactly what those values are, we do the
3145      *      setting ourselves, using the documented algorithm specified by the
3146      *      POSIX standard (assuming the platform follows the Standard) rather
3147      *      than use "" as the locale.  This will lead to results that differ
3148      *      from native behavior if the native behavior differs from the
3149      *      Standard's documented value, but khw believes it is better to know
3150      *      what's going on, even if different from native, than to just guess.
3151      *
3152      *      glibc systems differ from this standard in having a LANGUAGE
3153      *      environment variable used for just LC_MESSAGES.  This function does
3154      *      NOT handle that.
3155      *
3156      *      Another option for the POSIX 2008 case would be, in a critical
3157      *      section, to save the global locale's current value, and do a
3158      *      straight setlocale(LC_ALL, "").  That would return our desired
3159      *      values, destroying the global locale's, which we would then
3160      *      restore.  But that could cause races with any other thread that is
3161      *      using the global locale and isn't using the mutex.  And, the only
3162      *      reason someone would have done that is because they are calling a
3163      *      library function, like in gtk, that calls setlocale(), and which
3164      *      can't be changed to use the mutex.  That wouldn't be a problem if
3165      *      this were to be done before any threads had switched, say during
3166      *      perl construction time.  But this code would still be needed for
3167      *      the general case.
3168      *
3169      * The Windows and POSIX 2008 differ in that the ultimate fallback is "C"
3170      * in POSIX, and is the system default locale in Windows.  To get that
3171      * system default value, we actually have to call setlocale() on Windows.
3172      */
3173
3174     const char * const lc_all = PerlEnv_getenv("LC_ALL");
3175     const char * locale_names[LC_ALL_INDEX_] = { NULL };
3176
3177     /* Use any "LC_ALL" environment variable, as it overrides everything else.
3178      * */
3179     if (lc_all && strNE(lc_all, "")) {
3180         return lc_all;
3181     }
3182
3183     /* Here, no usable LC_ALL environment variable.  We have to handle each
3184      * category separately.  If all categories are desired, we loop through
3185      * them all.  If only an individual category is desired, to avoid
3186      * duplicating logic, we use the same loop, but set up the limits so it is
3187      * only executed once, for that particular category. */
3188     locale_category_index lower, upper, offset;
3189     if (index == LC_ALL_INDEX_) {
3190         lower = (locale_category_index) 0;
3191         upper = (locale_category_index) ((int) LC_ALL_INDEX_ - 1);
3192         offset = (locale_category_index) 0;
3193     }
3194     else {
3195         lower = index;
3196         upper = index;
3197
3198         /* 'offset' is used so that the result of the single loop iteration is
3199          * stored into output[0] */
3200         offset = lower;
3201     }
3202
3203     /* When no LC_ALL environment variable, LANG is used as a default, but
3204      * overridden for individual categories that have corresponding environment
3205      * variables.  If no LANG exists, the default is "C" on POSIX 2008, or the
3206      * system default for the category on Windows. */
3207     const char * env_lang = NULL;
3208
3209     /* For each desired category, use any corresponding environment variable;
3210      * or the default if none such exists. */
3211     bool is_disparate = false;  /* Assume is uniform until proven otherwise */
3212     for (unsigned i = lower; i <= upper; i++) {
3213         const char * const env_override = PerlEnv_getenv(category_names[i]);
3214         unsigned int j = i - offset;
3215
3216         if (env_override && strNE(env_override, "")) {
3217             locale_names[j] = env_override;
3218         }
3219         else { /* Here, no corresponding environment variable, see if LANG
3220                   exists and is usable.  Done this way to avoid fetching LANG
3221                   unless it is actually needed */
3222             if (env_lang == NULL) {
3223                 env_lang = PerlEnv_getenv("LANG");
3224
3225                 /* If not usable, set it to a non-NULL illegal value so won't
3226                  * try to use it below */
3227                 if (env_lang == NULL || strEQ(env_lang, "")) {
3228                     env_lang = (const char *) 1;
3229                 }
3230             }
3231
3232             /* If a usable LANG exists, use it. */
3233             if (env_lang != NULL && env_lang != (const char *) 1) {
3234                 locale_names[j] = env_lang;
3235             }
3236             else {
3237
3238 #    ifdef WIN32
3239                 /* If no LANG, use the system default on Windows. */
3240                 locale_names[j] = wrap_wsetlocale(categories[i], ".ACP");
3241                 if (locale_names[j]) {
3242                     SAVEFREEPV(locale_names[j]);
3243                 }
3244                 else
3245 #    endif
3246                 {   /* If nothing was found or worked, use C */
3247                     locale_names[j] = "C";
3248                 }
3249             }
3250         }
3251
3252         if (j > 0 && ! is_disparate && strNE(locale_names[0], locale_names[j]))
3253         {
3254             is_disparate = true;
3255         }
3256
3257         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3258                  "find_locale_from_environment i=%u, j=%u, name=%s,"
3259                  " locale=%s, locale of 0th category=%s, disparate=%d\n",
3260                  i, j, category_names[i],
3261                  locale_names[j], locale_names[0], is_disparate));
3262     }
3263
3264     if (! is_disparate) {
3265         return locale_names[0];
3266     }
3267
3268     return calculate_LC_ALL_string(locale_names, INTERNAL_FORMAT,
3269                                    WANT_TEMP_PV,
3270                                    __LINE__);
3271 }
3272
3273 #  endif
3274 #  if defined(DEBUGGING) || defined(USE_PERL_SWITCH_LOCALE_CONTEXT)
3275
3276 STATIC const char *
3277 S_get_LC_ALL_display(pTHX)
3278 {
3279     return calculate_LC_ALL_string(NULL, INTERNAL_FORMAT,
3280                                    WANT_TEMP_PV,
3281                                    __LINE__);
3282 }
3283
3284 #  endif
3285
3286 STATIC void
3287 S_setlocale_failure_panic_via_i(pTHX_
3288                                 const locale_category_index cat_index,
3289                                 const char * current,
3290                                 const char * failed,
3291                                 const line_t proxy_caller_line,
3292                                 const line_t immediate_caller_line,
3293                                 const char * const higher_caller_file,
3294                                 const line_t higher_caller_line)
3295 {
3296     PERL_ARGS_ASSERT_SETLOCALE_FAILURE_PANIC_VIA_I;
3297
3298     /* Called to panic when a setlocale form unexpectedly failed for the
3299      * category determined by 'cat_index', and the locale that was in effect
3300      * (and likely still is) is 'current'.  'current' may be NULL, which causes
3301      * this function to query what it is.
3302      *
3303      * The extra caller information is used for when a function acts as a
3304      * stand-in for another function, which a typical reader would more likely
3305      * think would be the caller
3306      *
3307      * If a line number is 0, its stack (sort-of) frame is omitted; same if
3308      * it's the same line number as the next higher caller. */
3309
3310     const int cat = categories[cat_index];
3311     const char * name = category_names[cat_index];
3312
3313     dSAVE_ERRNO;
3314
3315     if (current == NULL) {
3316         current = querylocale_i(cat_index);
3317     }
3318
3319     const char * proxy_text = "";
3320     if (proxy_caller_line != 0 && proxy_caller_line != immediate_caller_line)
3321     {
3322         proxy_text = Perl_form(aTHX_ "\nCalled via %s: %" LINE_Tf,
3323                                       __FILE__, proxy_caller_line);
3324     }
3325     if (   strNE(__FILE__, higher_caller_file)
3326         || (   immediate_caller_line != 0
3327             && immediate_caller_line != higher_caller_line))
3328     {
3329         proxy_text = Perl_form(aTHX_ "%s\nCalled via %s: %" LINE_Tf,
3330                                       proxy_text, __FILE__,
3331                                       immediate_caller_line);
3332     }
3333
3334     /* 'false' in the get_displayable_string() calls makes it not think the
3335      * locale is UTF-8, so just dumps bytes.  Actually figuring it out can be
3336      * too complicated for a panic situation. */
3337     const char * msg = Perl_form(aTHX_
3338                             "Can't change locale for %s (%d) from '%s' to '%s'"
3339                             " %s",
3340                             name, cat,
3341                             get_displayable_string(current,
3342                                                    current + strlen(current),
3343                                                    false),
3344                             get_displayable_string(failed,
3345                                                    failed + strlen(failed),
3346                                                    false),
3347                             proxy_text);
3348     RESTORE_ERRNO;
3349
3350     Perl_locale_panic(msg, __LINE__, higher_caller_file, higher_caller_line);
3351     NOT_REACHED; /* NOTREACHED */
3352 }
3353
3354 #  ifdef USE_LOCALE_NUMERIC
3355
3356 STATIC void
3357 S_new_numeric(pTHX_ const char *newnum, bool force)
3358 {
3359     PERL_ARGS_ASSERT_NEW_NUMERIC;
3360
3361     /* Called after each libc setlocale() or uselocale() call affecting
3362      * LC_NUMERIC, to tell core Perl this and that 'newnum' is the name of the
3363      * new locale, and we are switched into it.  It installs this locale as the
3364      * current underlying default, and then switches to the C locale, if
3365      * necessary, so that the code that has traditionally expected the radix
3366      * character to be a dot may continue to do so.
3367      *
3368      * The default locale and the C locale can be toggled between by use of the
3369      * set_numeric_underlying() and set_numeric_standard() functions, which
3370      * should probably not be called directly, but only via macros like
3371      * SET_NUMERIC_STANDARD() in perl.h.
3372      *
3373      * The toggling is necessary mainly so that a non-dot radix decimal point
3374      * character can be input and output, while allowing internal calculations
3375      * to use a dot.
3376      *
3377      * This sets several interpreter-level variables:
3378      * PL_numeric_name  The underlying locale's name: a copy of 'newnum'
3379      * PL_numeric_underlying   A boolean indicating if the toggled state is
3380      *                  such that the current locale is the program's
3381      *                  underlying locale
3382      * PL_numeric_standard   An int indicating if the toggled state is such
3383      *                  that the current locale is the C locale or
3384      *                  indistinguishable from the C locale.  If non-zero, it
3385      *                  is in C; if > 1, it means it may not be toggled away
3386      *                  from C.
3387      * PL_numeric_underlying_is_standard   A bool kept by this function
3388      *                  indicating that the underlying locale and the standard
3389      *                  C locale are indistinguishable for the purposes of
3390      *                  LC_NUMERIC.  This happens when both of the above two
3391      *                  variables are true at the same time.  (Toggling is a
3392      *                  no-op under these circumstances.)  This variable is
3393      *                  used to avoid having to recalculate.
3394      * PL_numeric_radix_sv   Contains the string that code should use for the
3395      *                  decimal point.  It is set to either a dot or the
3396      *                  program's underlying locale's radix character string,
3397      *                  depending on the situation.
3398      * PL_underlying_radix_sv   Contains the program's underlying locale's
3399      *                  radix character string.  This is copied into
3400      *                  PL_numeric_radix_sv when the situation warrants.  It
3401      *                  exists to avoid having to recalculate it when toggling.
3402      */
3403
3404     DEBUG_L( PerlIO_printf(Perl_debug_log,
3405                            "Called new_numeric with %s, PL_numeric_name=%s\n",
3406                            newnum, PL_numeric_name));
3407
3408     /* We keep records comparing the characteristics of the LC_NUMERIC catetory
3409      * of the current locale vs the standard C locale.  If the new locale that
3410      * has just been changed to is the same as the one our records are for,
3411      * they are still valid, and we don't have to recalculate them.  'force' is
3412      * true if the caller suspects that the records are out-of-date, so do go
3413      * ahead and recalculate them.  (This can happen when an external library
3414      * has had control and now perl is reestablishing control; we have to
3415      * assume that that library changed the locale in unknown ways.)
3416      *
3417      * Even if our records are valid, the new locale will likely have been
3418      * switched to before this function gets called, and we must toggle into
3419      * one indistinguishable from the C locale with regards to LC_NUMERIC
3420      * handling, so that all the libc functions that are affected by LC_NUMERIC
3421      * will work as expected.  This can be skipped if we already know that the
3422      * locale is indistinguishable from the C locale. */
3423     if (! force && strEQ(PL_numeric_name, newnum)) {
3424         if (! PL_numeric_underlying_is_standard) {
3425             set_numeric_standard(__FILE__, __LINE__);
3426         }
3427
3428         return;
3429     }
3430
3431     Safefree(PL_numeric_name);
3432     PL_numeric_name = savepv(newnum);
3433
3434     /* Handle the trivial case.  Since this is called at process
3435      * initialization, be aware that this bit can't rely on much being
3436      * available. */
3437     if (isNAME_C_OR_POSIX(PL_numeric_name)) {
3438         PL_numeric_standard = TRUE;
3439         PL_numeric_underlying_is_standard = TRUE;
3440         PL_numeric_underlying = TRUE;
3441         sv_setpv(PL_numeric_radix_sv, C_decimal_point);
3442         SvUTF8_off(PL_numeric_radix_sv);
3443         sv_setpv(PL_underlying_radix_sv, C_decimal_point);
3444         SvUTF8_off(PL_underlying_radix_sv);
3445         return;
3446     }
3447
3448     /* We are in the underlying locale until changed at the end of this
3449      * function */
3450     PL_numeric_underlying = TRUE;
3451
3452     char * radix = NULL;
3453     utf8ness_t utf8ness = UTF8NESS_IMMATERIAL;
3454
3455     /* Find and save this locale's radix character. */
3456     my_langinfo_c(RADIXCHAR, LC_NUMERIC, PL_numeric_name,
3457                   &radix, NULL, &utf8ness);
3458     sv_setpv(PL_underlying_radix_sv, radix);
3459
3460     if (utf8ness == UTF8NESS_YES) {
3461         SvUTF8_on(PL_underlying_radix_sv);
3462     }
3463     else {
3464         SvUTF8_off(PL_underlying_radix_sv);
3465     }
3466
3467     DEBUG_L(PerlIO_printf(Perl_debug_log,
3468                           "Locale radix is '%s', ?UTF-8=%d\n",
3469                           SvPVX(PL_underlying_radix_sv),
3470                           cBOOL(SvUTF8(PL_underlying_radix_sv))));
3471
3472     /* This locale is indistinguishable from C (for numeric purposes) if both
3473      * the radix character and the thousands separator are the same as C's.
3474      * Start with the radix. */
3475     PL_numeric_underlying_is_standard = strEQ(C_decimal_point, radix);
3476     Safefree(radix);
3477
3478 #    ifndef TS_W32_BROKEN_LOCALECONV
3479
3480     /* If the radix isn't the same as C's, we know it is distinguishable from
3481      * C; otherwise check the thousands separator too.  Only if both are the
3482      * same as C's is the locale indistinguishable from C.
3483      *
3484      * But on earlier Windows versions, there is a potential race.  This code
3485      * knows that localeconv() (elsewhere in this file) will be used to extract
3486      * the needed value, and localeconv() was buggy for quite a while, and that
3487      * code in this file hence uses a workaround.  And that workaround may have
3488      * an (unlikely) race.  Gathering the radix uses a different workaround on
3489      * Windows that doesn't involve a race.  It might be possible to do the
3490      * same for this (patches welcome).
3491      *
3492      * Until then khw doesn't think it's worth even the small risk of a race to
3493      * get this value, which doesn't appear to be used in any of the Microsoft
3494      * library routines anyway. */
3495
3496     if (PL_numeric_underlying_is_standard) {
3497         char * scratch_buffer = NULL;
3498         PL_numeric_underlying_is_standard = strEQ(C_thousands_sep,
3499                                              my_langinfo_c(THOUSEP, LC_NUMERIC,
3500                                                            PL_numeric_name,
3501                                                            &scratch_buffer,
3502                                                            NULL, NULL));
3503         Safefree(scratch_buffer);
3504     }
3505
3506 #    endif
3507
3508     PL_numeric_standard = PL_numeric_underlying_is_standard;
3509
3510     /* Keep LC_NUMERIC so that it has the C locale radix and thousands
3511      * separator.  This is for XS modules, so they don't have to worry about
3512      * the radix being a non-dot.  (Core operations that need the underlying
3513      * locale change to it temporarily). */
3514     if (! PL_numeric_standard) {
3515         set_numeric_standard(__FILE__, __LINE__);
3516     }
3517 }
3518
3519 #  endif
3520
3521 void
3522 Perl_set_numeric_standard(pTHX_ const char * const file, const line_t line)
3523 {
3524     PERL_ARGS_ASSERT_SET_NUMERIC_STANDARD;
3525     PERL_UNUSED_ARG(file);      /* Some Configurations ignore these */
3526     PERL_UNUSED_ARG(line);
3527
3528 #  ifdef USE_LOCALE_NUMERIC
3529
3530     /* Unconditionally toggle the LC_NUMERIC locale to the C locale
3531      *
3532      * Most code should use the macro SET_NUMERIC_STANDARD() in perl.h
3533      * instead of calling this directly.  The macro avoids calling this routine
3534      * if toggling isn't necessary according to our records (which could be
3535      * wrong if some XS code has changed the locale behind our back) */
3536
3537     DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to"
3538                                           " standard C; called from %s: %"
3539                                           LINE_Tf "\n", file, line));
3540
3541     void_setlocale_c_with_caller(LC_NUMERIC, "C", file, line);
3542     PL_numeric_standard = TRUE;
3543     sv_setpv(PL_numeric_radix_sv, C_decimal_point);
3544     SvUTF8_off(PL_numeric_radix_sv);
3545
3546     PL_numeric_underlying = PL_numeric_underlying_is_standard;
3547
3548 #  endif /* USE_LOCALE_NUMERIC */
3549
3550 }
3551
3552 void
3553 Perl_set_numeric_underlying(pTHX_ const char * const file, const line_t line)
3554 {
3555     PERL_ARGS_ASSERT_SET_NUMERIC_UNDERLYING;
3556     PERL_UNUSED_ARG(file);      /* Some Configurations ignore these */
3557     PERL_UNUSED_ARG(line);
3558
3559 #  ifdef USE_LOCALE_NUMERIC
3560
3561     /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
3562      * default.
3563      *
3564      * Most code should use the macro SET_NUMERIC_UNDERLYING() in perl.h
3565      * instead of calling this directly.  The macro avoids calling this routine
3566      * if toggling isn't necessary according to our records (which could be
3567      * wrong if some XS code has changed the locale behind our back) */
3568
3569     DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to %s;"
3570                                           " called from %s: %" LINE_Tf "\n",
3571                                           PL_numeric_name, file, line));
3572     /* Maybe not in init? assert(PL_locale_mutex_depth > 0);*/
3573
3574     void_setlocale_c_with_caller(LC_NUMERIC, PL_numeric_name, file, line);
3575     PL_numeric_underlying = TRUE;
3576     sv_setsv_nomg(PL_numeric_radix_sv, PL_underlying_radix_sv);
3577
3578     PL_numeric_standard = PL_numeric_underlying_is_standard;
3579
3580 #  endif /* USE_LOCALE_NUMERIC */
3581
3582 }
3583
3584 #  ifdef USE_LOCALE_CTYPE
3585
3586 STATIC void
3587 S_new_ctype(pTHX_ const char *newctype, bool force)
3588 {
3589     PERL_ARGS_ASSERT_NEW_CTYPE;
3590     PERL_UNUSED_ARG(force);
3591
3592     /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
3593      * core Perl this and that 'newctype' is the name of the new locale.
3594      *
3595      * This function sets up the folding arrays for all 256 bytes, assuming
3596      * that tofold() is tolc() since fold case is not a concept in POSIX,
3597      */
3598
3599     DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering new_ctype(%s)\n",
3600                                           newctype));
3601
3602     /* No change means no-op */
3603     if (strEQ(PL_ctype_name, newctype)) {
3604         return;
3605     }
3606
3607     /* We will replace any bad locale warning with
3608      *  1)  nothing if the new one is ok; or
3609      *  2)  a new warning for the bad new locale */
3610     if (PL_warn_locale) {
3611         SvREFCNT_dec_NN(PL_warn_locale);
3612         PL_warn_locale = NULL;
3613     }
3614
3615     /* Clear cache */
3616     Safefree(PL_ctype_name);
3617     PL_ctype_name = "";
3618
3619     PL_in_utf8_turkic_locale = FALSE;
3620
3621     /* For the C locale, just use the standard folds, and we know there are no
3622      * glitches possible, so return early.  Since this is called at process
3623      * initialization, be aware that this bit can't rely on much being
3624      * available. */
3625     if (isNAME_C_OR_POSIX(newctype)) {
3626         Copy(PL_fold, PL_fold_locale, 256, U8);
3627         PL_ctype_name = savepv(newctype);
3628         PL_in_utf8_CTYPE_locale = FALSE;
3629         return;
3630     }
3631
3632     /* The cache being cleared signals the called function to compute a new
3633      * value */
3634     PL_in_utf8_CTYPE_locale = is_locale_utf8(newctype);
3635
3636     PL_ctype_name = savepv(newctype);
3637     bool maybe_utf8_turkic = FALSE;
3638
3639     /* Don't check for problems if we are suppressing the warnings */
3640     bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
3641
3642     if (PL_in_utf8_CTYPE_locale) {
3643
3644         /* A UTF-8 locale gets standard rules.  But note that code still has to
3645          * handle this specially because of the three problematic code points
3646          * */
3647         Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
3648
3649         /* UTF-8 locales can have special handling for 'I' and 'i' if they are
3650          * Turkic.  Make sure these two are the only anomalies.  (We don't
3651          * require towupper and towlower because they aren't in C89.) */
3652
3653 #    if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER)
3654
3655         if (towupper('i') == 0x130 && towlower('I') == 0x131)
3656
3657 #    else
3658
3659         if (toU8_UPPER_LC('i') == 'i' && toU8_LOWER_LC('I') == 'I')
3660
3661 #    endif
3662
3663         {
3664             /* This is how we determine it really is Turkic */
3665             check_for_problems = TRUE;
3666             maybe_utf8_turkic = TRUE;
3667         }
3668     }
3669     else {  /* Not a canned locale we know the values for.  Compute them */
3670
3671 #    ifdef DEBUGGING
3672
3673         bool has_non_ascii_fold = FALSE;
3674         bool found_unexpected = FALSE;
3675
3676         /* Under -DLv, see if there are any folds outside the ASCII range.
3677          * This factoid is used below */
3678         if (DEBUG_Lv_TEST) {
3679             for (unsigned i = 128; i < 256; i++) {
3680                 int j = LATIN1_TO_NATIVE(i);
3681                 if (toU8_LOWER_LC(j) != j || toU8_UPPER_LC(j) != j) {
3682                     has_non_ascii_fold = TRUE;
3683                     break;
3684                 }
3685             }
3686         }
3687
3688 #    endif
3689
3690         for (unsigned i = 0; i < 256; i++) {
3691             if (isU8_UPPER_LC(i))
3692                 PL_fold_locale[i] = (U8) toU8_LOWER_LC(i);
3693             else if (isU8_LOWER_LC(i))
3694                 PL_fold_locale[i] = (U8) toU8_UPPER_LC(i);
3695             else
3696                 PL_fold_locale[i] = (U8) i;
3697
3698 #    ifdef DEBUGGING
3699
3700             /* Most locales these days are supersets of ASCII.  When debugging,
3701              * it is helpful to know what the exceptions to that are in this
3702              * locale */
3703             if (DEBUG_L_TEST) {
3704                 bool unexpected = FALSE;
3705
3706                 if (isUPPER_L1(i)) {
3707                     if (isUPPER_A(i)) {
3708                         if (PL_fold_locale[i] != toLOWER_A(i)) {
3709                             unexpected = TRUE;
3710                         }
3711                     }
3712                     else if (has_non_ascii_fold) {
3713                         if (PL_fold_locale[i] != toLOWER_L1(i)) {
3714                             unexpected = TRUE;
3715                         }
3716                     }
3717                     else if (PL_fold_locale[i] != i) {
3718                         unexpected = TRUE;
3719                     }
3720                 }
3721                 else if (   isLOWER_L1(i)
3722                          && i != LATIN_SMALL_LETTER_SHARP_S
3723                          && i != MICRO_SIGN)
3724                 {
3725                     if (isLOWER_A(i)) {
3726                         if (PL_fold_locale[i] != toUPPER_A(i)) {
3727                             unexpected = TRUE;
3728                         }
3729                     }
3730                     else if (has_non_ascii_fold) {
3731                         if (PL_fold_locale[i] != toUPPER_LATIN1_MOD(i)) {
3732                             unexpected = TRUE;
3733                         }
3734                     }
3735                     else if (PL_fold_locale[i] != i) {
3736                         unexpected = TRUE;
3737                     }
3738                 }
3739                 else if (PL_fold_locale[i] != i) {
3740                     unexpected = TRUE;
3741                 }
3742
3743                 if (unexpected) {
3744                     found_unexpected = TRUE;
3745                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3746                                            "For %s, fold of %02x is %02x\n",
3747                                            newctype, i, PL_fold_locale[i]));
3748                 }
3749             }
3750         }
3751
3752         if (found_unexpected) {
3753             DEBUG_L(PerlIO_printf(Perl_debug_log,
3754                                "All bytes not mentioned above either fold to"
3755                                " themselves or are the expected ASCII or"
3756                                " Latin1 ones\n"));
3757         }
3758         else {
3759             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3760                                    "No nonstandard folds were found\n"));
3761 #    endif
3762
3763         }
3764     }
3765
3766 #    ifdef MB_CUR_MAX
3767
3768     /* We only handle single-byte locales (outside of UTF-8 ones); so if this
3769      * locale requires more than one byte, there are going to be BIG problems.
3770      * */
3771
3772     const int mb_cur_max = MB_CUR_MAX;
3773
3774     if (mb_cur_max > 1 && ! PL_in_utf8_CTYPE_locale
3775
3776             /* Some platforms return MB_CUR_MAX > 1 for even the "C" locale.
3777              * Just assume that the implementation for them (plus for POSIX) is
3778              * correct and the > 1 value is spurious.  (Since these are
3779              * specially handled to never be considered UTF-8 locales, as long
3780              * as this is the only problem, everything should work fine */
3781         && ! isNAME_C_OR_POSIX(newctype))
3782     {
3783         DEBUG_L(PerlIO_printf(Perl_debug_log,
3784                               "Unsupported, MB_CUR_MAX=%d\n", mb_cur_max));
3785
3786         Perl_ck_warner_d(aTHX_ packWARN(WARN_LOCALE),
3787                          "Locale '%s' is unsupported, and may crash the"
3788                          " interpreter",
3789                          newctype);
3790     }
3791
3792 #    endif
3793
3794     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "check_for_problems=%d\n",
3795                                            check_for_problems));
3796
3797     /* We don't populate the other lists if a UTF-8 locale, but do check that
3798      * everything works as expected, unless checking turned off */
3799     if (check_for_problems) {
3800         /* Assume enough space for every character being bad.  4 spaces each
3801          * for the 94 printable characters that are output like "'x' "; and 5
3802          * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
3803          * NUL */
3804         char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
3805         unsigned int bad_count = 0;         /* Count of bad characters */
3806
3807         for (unsigned i = 0; i < 256; i++) {
3808
3809             /* If checking for locale problems, see if the native ASCII-range
3810              * printables plus \n and \t are in their expected categories in
3811              * the new locale.  If not, this could mean big trouble, upending
3812              * Perl's and most programs' assumptions, like having a
3813              * metacharacter with special meaning become a \w.  Fortunately,
3814              * it's very rare to find locales that aren't supersets of ASCII
3815              * nowadays.  It isn't a problem for most controls to be changed
3816              * into something else; we check only \n and \t, though perhaps \r
3817              * could be an issue as well. */
3818             if (isGRAPH_A(i) || isBLANK_A(i) || i == '\n') {
3819                 bool is_bad = FALSE;
3820                 char name[4] = { '\0' };
3821
3822                 /* Convert the name into a string */
3823                 if (isGRAPH_A(i)) {
3824                     name[0] = i;
3825                     name[1] = '\0';
3826                 }
3827                 else if (i == '\n') {
3828                     my_strlcpy(name, "\\n", sizeof(name));
3829                 }
3830                 else if (i == '\t') {
3831                     my_strlcpy(name, "\\t", sizeof(name));
3832                 }
3833                 else {
3834                     assert(i == ' ');
3835                     my_strlcpy(name, "' '", sizeof(name));
3836                 }
3837
3838                 /* Check each possibe class */
3839                 if (UNLIKELY(cBOOL(isU8_ALPHANUMERIC_LC(i)) !=
3840                                                     cBOOL(isALPHANUMERIC_A(i))))
3841                 {
3842                     is_bad = TRUE;
3843                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3844                                         "isalnum('%s') unexpectedly is %x\n",
3845                                         name, cBOOL(isU8_ALPHANUMERIC_LC(i))));
3846                 }
3847                 if (UNLIKELY(cBOOL(isU8_ALPHA_LC(i)) != cBOOL(isALPHA_A(i))))  {
3848                     is_bad = TRUE;
3849                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3850                                           "isalpha('%s') unexpectedly is %x\n",
3851                                           name, cBOOL(isU8_ALPHA_LC(i))));
3852                 }
3853                 if (UNLIKELY(cBOOL(isU8_DIGIT_LC(i)) != cBOOL(isDIGIT_A(i))))  {
3854                     is_bad = TRUE;
3855                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3856                                           "isdigit('%s') unexpectedly is %x\n",
3857                                           name, cBOOL(isU8_DIGIT_LC(i))));
3858                 }
3859                 if (UNLIKELY(cBOOL(isU8_GRAPH_LC(i)) != cBOOL(isGRAPH_A(i))))  {
3860                     is_bad = TRUE;
3861                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3862                                           "isgraph('%s') unexpectedly is %x\n",
3863                                           name, cBOOL(isU8_GRAPH_LC(i))));
3864                 }
3865                 if (UNLIKELY(cBOOL(isU8_LOWER_LC(i)) != cBOOL(isLOWER_A(i))))  {
3866                     is_bad = TRUE;
3867                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3868                                           "islower('%s') unexpectedly is %x\n",
3869                                           name, cBOOL(isU8_LOWER_LC(i))));
3870                 }
3871                 if (UNLIKELY(cBOOL(isU8_PRINT_LC(i)) != cBOOL(isPRINT_A(i))))  {
3872                     is_bad = TRUE;
3873                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3874                                           "isprint('%s') unexpectedly is %x\n",
3875                                           name, cBOOL(isU8_PRINT_LC(i))));
3876                 }
3877                 if (UNLIKELY(cBOOL(isU8_PUNCT_LC(i)) != cBOOL(isPUNCT_A(i))))  {
3878                     is_bad = TRUE;
3879                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3880                                           "ispunct('%s') unexpectedly is %x\n",
3881                                           name, cBOOL(isU8_PUNCT_LC(i))));
3882                 }
3883                 if (UNLIKELY(cBOOL(isU8_SPACE_LC(i)) != cBOOL(isSPACE_A(i))))  {
3884                     is_bad = TRUE;
3885                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3886                                           "isspace('%s') unexpectedly is %x\n",
3887                                           name, cBOOL(isU8_SPACE_LC(i))));
3888                 }
3889                 if (UNLIKELY(cBOOL(isU8_UPPER_LC(i)) != cBOOL(isUPPER_A(i))))  {
3890                     is_bad = TRUE;
3891                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3892                                           "isupper('%s') unexpectedly is %x\n",
3893                                           name, cBOOL(isU8_UPPER_LC(i))));
3894                 }
3895                 if (UNLIKELY(cBOOL(isU8_XDIGIT_LC(i))!= cBOOL(isXDIGIT_A(i)))) {
3896                     is_bad = TRUE;
3897                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3898                                           "isxdigit('%s') unexpectedly is %x\n",
3899                                           name, cBOOL(isU8_XDIGIT_LC(i))));
3900                 }
3901                 if (UNLIKELY(toU8_LOWER_LC(i) != (int) toLOWER_A(i))) {
3902                     is_bad = TRUE;
3903                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3904                             "tolower('%s')=0x%x instead of the expected 0x%x\n",
3905                             name, toU8_LOWER_LC(i), (int) toLOWER_A(i)));
3906                 }
3907                 if (UNLIKELY(toU8_UPPER_LC(i) != (int) toUPPER_A(i))) {
3908                     is_bad = TRUE;
3909                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3910                             "toupper('%s')=0x%x instead of the expected 0x%x\n",
3911                             name, toU8_UPPER_LC(i), (int) toUPPER_A(i)));
3912                 }
3913                 if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i))))  {
3914                     is_bad = TRUE;
3915                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3916                                 "'\\n' (=%02X) is not a control\n", (int) i));
3917                 }
3918
3919                 /* Add to the list;  Separate multiple entries with a blank */
3920                 if (is_bad) {
3921                     if (bad_count) {
3922                         my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list));
3923                     }
3924                     my_strlcat(bad_chars_list, name, sizeof(bad_chars_list));
3925                     bad_count++;
3926                 }
3927             }
3928         }
3929
3930         if (bad_count == 2 && maybe_utf8_turkic) {
3931             bad_count = 0;
3932             *bad_chars_list = '\0';
3933
3934             /* The casts are because otherwise some compilers warn:
3935                 gcc.gnu.org/bugzilla/show_bug.cgi?id=99950
3936                 gcc.gnu.org/bugzilla/show_bug.cgi?id=94182
3937              */
3938             PL_fold_locale[ (U8) 'I' ] = 'I';
3939             PL_fold_locale[ (U8) 'i' ] = 'i';
3940             PL_in_utf8_turkic_locale = TRUE;
3941             DEBUG_L(PerlIO_printf(Perl_debug_log, "%s is turkic\n", newctype));
3942         }
3943
3944         /* If we found problems and we want them output, do so */
3945         if (   (UNLIKELY(bad_count))
3946             && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
3947         {
3948             /* WARNING.  If you change the wording of these; be sure to update
3949              * t/loc_tools.pl correspondingly */
3950
3951             if (PL_in_utf8_CTYPE_locale) {
3952                 PL_warn_locale = Perl_newSVpvf(aTHX_
3953                      "Locale '%s' contains (at least) the following characters"
3954                      " which have\nunexpected meanings: %s\nThe Perl program"
3955                      " will use the expected meanings",
3956                       newctype, bad_chars_list);
3957             }
3958             else {
3959                 PL_warn_locale =
3960                     Perl_newSVpvf(aTHX_
3961                                   "\nThe following characters (and maybe"
3962                                   " others) may not have the same meaning as"
3963                                   " the Perl program expects: %s\n",
3964                                   bad_chars_list
3965                             );
3966             }
3967
3968 #    if defined(HAS_SOME_LANGINFO) || defined(WIN32)
3969
3970             char * scratch_buffer = NULL;
3971             Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s",
3972                                  my_langinfo_c(CODESET, LC_CTYPE,
3973                                                newctype,
3974                                                &scratch_buffer, NULL,
3975                                                NULL));
3976             Safefree(scratch_buffer);
3977
3978 #    endif
3979
3980             Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n");
3981
3982             /* If we are actually in the scope of the locale or are debugging,
3983              * output the message now.  If not in that scope, we save the
3984              * message to be output at the first operation using this locale,
3985              * if that actually happens.  Most programs don't use locales, so
3986              * they are immune to bad ones.  */
3987             if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
3988
3989                 /* The '0' below suppresses a bogus gcc compiler warning */
3990                 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale),
3991                                                                             0);
3992                 if (IN_LC(LC_CTYPE)) {
3993                     SvREFCNT_dec_NN(PL_warn_locale);
3994                     PL_warn_locale = NULL;
3995                 }
3996             }
3997         }
3998     }
3999 }
4000
4001 void
4002 Perl_warn_problematic_locale()
4003 {
4004     dTHX;
4005
4006     /* Core-only function that outputs the message in PL_warn_locale,
4007      * and then NULLS it.  Should be called only through the macro
4008      * CHECK_AND_WARN_PROBLEMATIC_LOCALE_ */
4009
4010     if (PL_warn_locale) {
4011         Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
4012                              SvPVX(PL_warn_locale),
4013                              0 /* dummy to avoid compiler warning */ );
4014         SvREFCNT_dec_NN(PL_warn_locale);
4015         PL_warn_locale = NULL;
4016     }
4017 }
4018
4019 #  endif /* USE_LOCALE_CTYPE */
4020
4021 STATIC void
4022 S_new_LC_ALL(pTHX_ const char *lc_all, bool force)
4023 {
4024     PERL_ARGS_ASSERT_NEW_LC_ALL;
4025
4026     /* new_LC_ALL() updates all the things we care about.  Note that this is
4027      * called just after a change, so uses the actual underlying locale just
4028      * set, and not the nominal one (should they differ, as they may in
4029      * LC_NUMERIC). */
4030
4031     const char * individ_locales[LC_ALL_INDEX_] = { NULL };
4032
4033     switch (parse_LC_ALL_string(lc_all,
4034                                 individ_locales,
4035                                 override_if_ignored,   /* Override any ignored
4036                                                           categories */
4037                                 true,   /* Always fill array */
4038                                 true,   /* Panic if fails, as to get here it
4039                                            earlier had to have succeeded */
4040                                 __LINE__))
4041     {
4042       case invalid:
4043       case no_array:
4044       case only_element_0:
4045         locale_panic_("Unexpected return from parse_LC_ALL_string");
4046
4047       case full_array:
4048         break;
4049     }
4050
4051     for_all_individual_category_indexes(i) {
4052         if (update_functions[i]) {
4053             const char * this_locale = individ_locales[i];
4054             update_functions[i](aTHX_ this_locale, force);
4055         }
4056
4057         Safefree(individ_locales[i]);
4058     }
4059 }
4060
4061 #  ifdef USE_LOCALE_COLLATE
4062
4063 STATIC void
4064 S_new_collate(pTHX_ const char *newcoll, bool force)
4065 {
4066     PERL_ARGS_ASSERT_NEW_COLLATE;
4067     PERL_UNUSED_ARG(force);
4068
4069     /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
4070      * core Perl this and that 'newcoll' is the name of the new locale.
4071      *
4072      * The design of locale collation is that every locale change is given an
4073      * index 'PL_collation_ix'.  The first time a string participates in an
4074      * operation that requires collation while locale collation is active, it
4075      * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()).  That
4076      * magic includes the collation index, and the transformation of the string
4077      * by strxfrm(), q.v.  That transformation is used when doing comparisons,
4078      * instead of the string itself.  If a string changes, the magic is
4079      * cleared.  The next time the locale changes, the index is incremented,
4080      * and so we know during a comparison that the transformation is not
4081      * necessarily still valid, and so is recomputed.  Note that if the locale
4082      * changes enough times, the index could wrap, and it is possible that a
4083      * transformation would improperly be considered valid, leading to an
4084      * unlikely bug.  The value is declared to the widest possible type on this
4085      * platform. */
4086
4087     /* Return if the locale isn't changing */
4088     if (strEQ(PL_collation_name, newcoll)) {
4089         return;
4090     }
4091
4092     Safefree(PL_collation_name);
4093     PL_collation_name = savepv(newcoll);
4094     ++PL_collation_ix;
4095
4096     /* Set the new one up if trivial.  Since this is called at process
4097      * initialization, be aware that this bit can't rely on much being
4098      * available. */
4099     PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
4100     if (PL_collation_standard) {
4101         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4102                                "Setting PL_collation name='%s'\n",
4103                                PL_collation_name));
4104         PL_collxfrm_base = 0;
4105         PL_collxfrm_mult = 2;
4106         PL_in_utf8_COLLATE_locale = FALSE;
4107         PL_strxfrm_NUL_replacement = '\0';
4108         PL_strxfrm_max_cp = 0;
4109         return;
4110     }
4111
4112     /* Flag that the remainder of the set up is being deferred until first
4113      * need. */
4114     PL_collxfrm_mult = 0;
4115     PL_collxfrm_base = 0;
4116
4117 }
4118
4119 #  endif /* USE_LOCALE_COLLATE */
4120
4121 #  ifdef WIN32
4122
4123 STATIC wchar_t *
4124 S_Win_byte_string_to_wstring(const UINT code_page, const char * byte_string)
4125 {
4126     /* Caller must arrange to free the returned string */
4127
4128     int req_size = MultiByteToWideChar(code_page, 0, byte_string, -1, NULL, 0);
4129     if (! req_size) {
4130         SET_EINVAL;
4131         return NULL;
4132     }
4133
4134     wchar_t *wstring;
4135     Newx(wstring, req_size, wchar_t);
4136
4137     if (! MultiByteToWideChar(code_page, 0, byte_string, -1, wstring, req_size))
4138     {
4139         Safefree(wstring);
4140         SET_EINVAL;
4141         return NULL;
4142     }
4143
4144     return wstring;
4145 }
4146
4147 #    define Win_utf8_string_to_wstring(s)                                   \
4148                                     Win_byte_string_to_wstring(CP_UTF8, (s))
4149
4150 STATIC char *
4151 S_Win_wstring_to_byte_string(const UINT code_page, const wchar_t * wstring)
4152 {
4153     /* Caller must arrange to free the returned string */
4154
4155     int req_size =
4156             WideCharToMultiByte(code_page, 0, wstring, -1, NULL, 0, NULL, NULL);
4157
4158     char *byte_string;
4159     Newx(byte_string, req_size, char);
4160
4161     if (! WideCharToMultiByte(code_page, 0, wstring, -1, byte_string,
4162                                                          req_size, NULL, NULL))
4163     {
4164         Safefree(byte_string);
4165         SET_EINVAL;
4166         return NULL;
4167     }
4168
4169     return byte_string;
4170 }
4171
4172 #    define Win_wstring_to_utf8_string(ws)                                  \
4173                                    Win_wstring_to_byte_string(CP_UTF8, (ws))
4174
4175 STATIC const char *
4176 S_wrap_wsetlocale(pTHX_ const int category, const char *locale)
4177 {
4178     PERL_ARGS_ASSERT_WRAP_WSETLOCALE;
4179
4180     /* Calls _wsetlocale(), converting the parameters/return to/from
4181      * Perl-expected forms as if plain setlocale() were being called instead.
4182      *
4183      * Caller must arrange for the returned PV to be freed.
4184      */
4185
4186     const wchar_t * wlocale = NULL;
4187
4188     if (locale) {
4189         wlocale = Win_utf8_string_to_wstring(locale);
4190         if (! wlocale) {
4191             return NULL;
4192         }
4193     }
4194
4195     WSETLOCALE_LOCK;
4196     const wchar_t * wresult = _wsetlocale(category, wlocale);
4197
4198     if (! wresult) {
4199         WSETLOCALE_UNLOCK;
4200         Safefree(wlocale);
4201         return NULL;
4202     }
4203
4204     const char * result = Win_wstring_to_utf8_string(wresult);
4205     WSETLOCALE_UNLOCK;
4206
4207     Safefree(wlocale);
4208     return result;
4209 }
4210
4211 STATIC const char *
4212 S_win32_setlocale(pTHX_ int category, const char* locale)
4213 {
4214     /* This, for Windows, emulates POSIX setlocale() behavior.  There is no
4215      * difference between the two unless the input locale is "", which normally
4216      * means on Windows to get the machine default, which is set via the
4217      * computer's "Regional and Language Options" (or its current equivalent).
4218      * In POSIX, it instead means to find the locale from the user's
4219      * environment.  This routine changes the Windows behavior to try the POSIX
4220      * behavior first.  Further details are in the called function
4221      * find_locale_from_environment().
4222      */
4223
4224     if (locale != NULL && strEQ(locale, "")) {
4225         /* Note this function may change the locale, but that's ok because we
4226          * are about to change it anyway */
4227         locale = find_locale_from_environment(get_category_index(category));
4228         if (locale == NULL) {
4229             SET_EINVAL;
4230             return NULL;
4231         }
4232     }
4233
4234     const char * result = wrap_wsetlocale(category, locale);
4235     DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
4236                           setlocale_debug_string_r(category, locale, result)));
4237
4238     if (! result) {
4239         SET_EINVAL;
4240         return NULL;
4241     }
4242
4243     save_to_buffer(result, &PL_setlocale_buf, &PL_setlocale_bufsize);
4244
4245 #    ifndef USE_PL_CUR_LC_ALL
4246
4247     Safefree(result);
4248
4249 #  else
4250
4251     /* Here, we need to keep track of LC_ALL, so store the new value.  but if
4252      * the input locale is NULL, we were just querying, so the original value
4253      * hasn't changed */
4254     if (locale == NULL) {
4255         Safefree(result);
4256     }
4257     else {
4258
4259         /* If we set LC_ALL directly above, we already know its new value; but
4260          * if we changed just an individual category, find the new LC_ALL */
4261         if (category != LC_ALL) {
4262             Safefree(result);
4263             result = wrap_wsetlocale(LC_ALL, NULL);
4264         }
4265
4266         Safefree(PL_cur_LC_ALL);
4267         PL_cur_LC_ALL = result;
4268     }
4269
4270     DEBUG_L(PerlIO_printf(Perl_debug_log, "new PL_cur_LC_ALL=%s\n",
4271                                           PL_cur_LC_ALL));
4272 #    endif
4273
4274     return PL_setlocale_buf;
4275 }
4276
4277 #  endif
4278
4279 STATIC const char *
4280 S_native_querylocale_i(pTHX_ const locale_category_index cat_index)
4281 {
4282     /* Determine the current locale and return it in the form the platform's
4283      * native locale handling understands.  This is different only from our
4284      * internal form for the LC_ALL category, as platforms differ in how they
4285      * represent that.
4286      *
4287      * This is only called from Perl_setlocale().  As such it returns in
4288      * PL_setlocale_buf */
4289
4290 #  ifdef USE_LOCALE_NUMERIC
4291
4292     /* We have the LC_NUMERIC name saved, because we are normally switched into
4293      * the C locale (or equivalent) for it. */
4294     if (cat_index == LC_NUMERIC_INDEX_) {
4295
4296         /* We don't have to copy this return value, as it is a per-thread
4297          * variable, and won't change until a future setlocale */
4298         return PL_numeric_name;
4299     }
4300
4301 #  endif
4302 #  ifdef LC_ALL
4303
4304     if (cat_index != LC_ALL_INDEX_)
4305
4306 #  endif
4307
4308     {
4309         /* Here, not LC_ALL, and not LC_NUMERIC: the actual and native values
4310          * match */
4311
4312 #  ifdef setlocale_i    /* Can shortcut if this is defined */
4313
4314         return setlocale_i(cat_index, NULL);
4315
4316 #  else
4317
4318         return save_to_buffer(querylocale_i(cat_index),
4319                               &PL_setlocale_buf, &PL_setlocale_bufsize);
4320 #  endif
4321
4322     }
4323
4324     /* Below, querying LC_ALL */
4325
4326 #  ifdef LC_ALL
4327 #    ifdef USE_PL_CURLOCALES
4328 #      define LC_ALL_ARG  PL_curlocales
4329 #    else
4330 #      define LC_ALL_ARG  NULL  /* Causes calculate_LC_ALL_string() to find the
4331                                    locale using a querylocale function */
4332 #    endif
4333
4334     return calculate_LC_ALL_string(LC_ALL_ARG, EXTERNAL_FORMAT_FOR_QUERY,
4335                                    WANT_PL_setlocale_buf,
4336                                    __LINE__);
4337 #    undef LC_ALL_ARG
4338 #  endif    /* has LC_ALL */
4339
4340 }
4341
4342 #endif      /* USE_LOCALE */
4343
4344 /*
4345 =for apidoc Perl_setlocale
4346
4347 This is an (almost) drop-in replacement for the system L<C<setlocale(3)>>,
4348 taking the same parameters, and returning the same information, except that it
4349 returns the correct underlying C<LC_NUMERIC> locale.  Regular C<setlocale> will
4350 instead return C<C> if the underlying locale has a non-dot decimal point
4351 character, or a non-empty thousands separator for displaying floating point
4352 numbers.  This is because perl keeps that locale category such that it has a
4353 dot and empty separator, changing the locale briefly during the operations
4354 where the underlying one is required. C<Perl_setlocale> knows about this, and
4355 compensates; regular C<setlocale> doesn't.
4356
4357 Another reason it isn't completely a drop-in replacement is that it is
4358 declared to return S<C<const char *>>, whereas the system setlocale omits the
4359 C<const> (presumably because its API was specified long ago, and can't be
4360 updated; it is illegal to change the information C<setlocale> returns; doing
4361 so leads to segfaults.)
4362
4363 Finally, C<Perl_setlocale> works under all circumstances, whereas plain
4364 C<setlocale> can be completely ineffective on some platforms under some
4365 configurations.
4366
4367 Changing the locale is not a good idea when more than one thread is running,
4368 except on systems where the predefined variable C<${^SAFE_LOCALES}> is
4369 non-zero.  This is because on such systems the locale is global to the whole
4370 process and not local to just the thread calling the function.  So changing it
4371 in one thread instantaneously changes it in all.  On some such systems, the
4372 system C<setlocale()> is ineffective, returning the wrong information, and
4373 failing to actually change the locale.  z/OS refuses to try to change the
4374 locale once a second thread is created.  C<Perl_setlocale>, should give you
4375 accurate results of what actually happened on these problematic platforms,
4376 returning NULL if the system forbade the locale change.
4377
4378 The return points to a per-thread static buffer, which is overwritten the next
4379 time C<Perl_setlocale> is called from the same thread.
4380
4381 =cut
4382
4383 */
4384
4385 const char *
4386 Perl_setlocale(const int category, const char * locale)
4387 {
4388     /* This wraps POSIX::setlocale() */
4389
4390 #ifndef USE_LOCALE
4391
4392     PERL_UNUSED_ARG(category);
4393     PERL_UNUSED_ARG(locale);
4394
4395     return "C";
4396
4397 #else
4398
4399     dTHX;
4400
4401     DEBUG_L(PerlIO_printf(Perl_debug_log,
4402                           "Entering Perl_setlocale(%d, \"%s\")\n",
4403                           category, locale));
4404
4405     bool valid_category;
4406     locale_category_index cat_index = get_category_index_helper(category,
4407                                                                 &valid_category,
4408                                                                 __LINE__);
4409     if (! valid_category) {
4410         if (ckWARN(WARN_LOCALE)) {
4411             const char * conditional_warn_text;
4412             if (locale == NULL) {
4413                 conditional_warn_text = "";
4414                 locale = "";
4415             }
4416             else {
4417                 conditional_warn_text = "; can't set it to ";
4418             }
4419
4420             /* diag_listed_as: Unknown locale category %d; can't set it to %s */
4421             Perl_warner(aTHX_
4422                            packWARN(WARN_LOCALE),
4423                            "Unknown locale category %d%s%s",
4424                            category, conditional_warn_text, locale);
4425         }
4426
4427         SET_EINVAL;
4428         return NULL;
4429     }
4430
4431 #  ifdef setlocale_i
4432
4433     /* setlocale_i() gets defined only on Configurations that use setlocale()
4434      * in a simple manner that adequately handles all cases.  If this category
4435      * doesn't have any perl complications, just do that. */
4436     if (! update_functions[cat_index]) {
4437         return setlocale_i(cat_index, locale);
4438     }
4439
4440 #  endif
4441
4442     /* Get current locale */
4443     const char * current_locale = native_querylocale_i(cat_index);
4444
4445     /* A NULL locale means only query what the current one is. */
4446     if (locale == NULL) {
4447         return current_locale;
4448     }
4449
4450     if (strEQ(current_locale, locale)) {
4451         DEBUG_L(PerlIO_printf(Perl_debug_log,
4452                              "Already in requested locale: no action taken\n"));
4453         return current_locale;
4454     }
4455
4456     /* Here, an actual change is being requested.  Do it */
4457     if (! bool_setlocale_i(cat_index, locale)) {
4458         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
4459                           setlocale_debug_string_i(cat_index, locale, "NULL")));
4460         return NULL;
4461     }
4462
4463     /* At this point, the locale has been changed based on the requested value,
4464      * and the querylocale_i() will return the actual new value that the system
4465      * has for the category.  That may not be the same as the input, as libc
4466      * may have returned a synonymous locale name instead of the input one; or,
4467      * if there are locale categories that we are compiled to ignore, any
4468      * attempt to change them away from "C" is overruled */
4469     current_locale = querylocale_i(cat_index);
4470
4471     /* But certain categories need further work.  For example we may need to
4472      * calculate new folding or collation rules.  And for LC_NUMERIC, we have
4473      * to switch into a locale that has a dot radix. */
4474     if (update_functions[cat_index]) {
4475         update_functions[cat_index](aTHX_ current_locale,
4476                                           /* No need to force recalculation, as
4477                                            * aren't coming from a situation
4478                                            * where Perl hasn't been controlling
4479                                            * the locale, so has accurate
4480                                            * records. */
4481                                           false);
4482     }
4483
4484     /* Make sure the result is in a stable buffer for the caller's use, and is
4485      * in the expected format */
4486     current_locale = native_querylocale_i(cat_index);
4487
4488     DEBUG_L(PerlIO_printf(Perl_debug_log, "returning '%s'\n", current_locale));
4489
4490     return current_locale;
4491
4492 #endif
4493
4494 }
4495
4496 #ifdef USE_LOCALE
4497
4498 STATIC const char *
4499 S_toggle_locale_i(pTHX_ const locale_category_index cat_index,
4500                         const char * new_locale,
4501                         const line_t caller_line)
4502 {
4503     PERL_ARGS_ASSERT_TOGGLE_LOCALE_I;
4504     assert(cat_index <= LC_ALL_INDEX_);
4505
4506     /* Changes the locale for the category specified by 'index' to 'new_locale,
4507      * if they aren't already the same.
4508      *
4509      * Returns a copy of the name of the original locale for 'cat_index'
4510      * so can be switched back to with the companion function
4511      * restore_toggled_locale_i(),  (NULL if no restoral is necessary.) */
4512
4513     /* Find the original locale of the category we may need to change, so that
4514      * it can be restored to later */
4515     const char * locale_to_restore_to = querylocale_i(cat_index);
4516
4517     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4518                            "Entering toggle_locale_i: index=%d(%s),"        \
4519                            " wanted=%s, actual=%s; called from %" LINE_Tf   \
4520                            "\n", cat_index, category_names[cat_index],
4521                            new_locale, locale_to_restore_to, caller_line));
4522
4523     if (! locale_to_restore_to) {
4524         locale_panic_via_(Perl_form(aTHX_
4525                                     "Could not find current %s locale",
4526                                     category_names[cat_index]),
4527                          __FILE__, caller_line);
4528     }
4529
4530     /* If the locales are the same, there's nothing to do */
4531     if (strEQ(locale_to_restore_to, new_locale)) {
4532         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n",
4533                                                category_names[cat_index],
4534                                                new_locale));
4535
4536         return NULL;
4537     }
4538
4539     /* Finally, change the locale to the new one */
4540     void_setlocale_i_with_caller(cat_index, new_locale, __FILE__, caller_line);
4541
4542     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4543                            "%s locale switched to %s\n",
4544                            category_names[cat_index], new_locale));
4545
4546     return locale_to_restore_to;
4547
4548 #  ifndef DEBUGGING
4549     PERL_UNUSED_ARG(caller_line);
4550 #  endif
4551
4552 }
4553
4554 STATIC void
4555 S_restore_toggled_locale_i(pTHX_ const locale_category_index cat_index,
4556                                  const char * restore_locale,
4557                                  const line_t caller_line)
4558 {
4559     /* Restores the locale for LC_category corresponding to cat_index to
4560      * 'restore_locale' (which is a copy that will be freed by this function),
4561      * or do nothing if the latter parameter is NULL */
4562
4563     PERL_ARGS_ASSERT_RESTORE_TOGGLED_LOCALE_I;
4564     assert(cat_index <= LC_ALL_INDEX_);
4565
4566     if (restore_locale == NULL) {
4567         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4568                                "restore_toggled_locale_i: No need to"       \
4569                                " restore %s; called from %" LINE_Tf "\n",   \
4570                                category_names[cat_index], caller_line));
4571         return;
4572     }
4573
4574     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4575                            "restore_toggled_locale_i: restoring locale for" \
4576                            " %s to  %s; called from %" LINE_Tf "\n",        \
4577                            category_names[cat_index], restore_locale,
4578                            caller_line));
4579
4580     void_setlocale_i_with_caller(cat_index, restore_locale,
4581                                   __FILE__, caller_line);
4582
4583 #  ifndef DEBUGGING
4584     PERL_UNUSED_ARG(caller_line);
4585 #  endif
4586
4587 }
4588
4589 #endif
4590 #if defined(USE_LOCALE) || defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV)
4591
4592 STATIC utf8ness_t
4593 S_get_locale_string_utf8ness_i(pTHX_ const char * string,
4594                                      const locale_utf8ness_t known_utf8,
4595                                      const char * locale,
4596                                      const locale_category_index cat_index)
4597 {
4598     PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I;
4599
4600 #  ifndef USE_LOCALE
4601
4602     return UTF8NESS_NO;
4603     PERL_UNUSED_ARG(string);
4604     PERL_UNUSED_ARG(known_utf8);
4605     PERL_UNUSED_ARG(locale);
4606     PERL_UNUSED_ARG(cat_index);
4607
4608 #  else
4609
4610     assert(cat_index <= LC_ALL_INDEX_);
4611
4612     /* Return to indicate if 'string' in the locale given by the input
4613      * arguments should be considered UTF-8 or not.
4614      *
4615      * If the input 'locale' is not NULL, use that for the locale; otherwise
4616      * use the current locale for the category specified by 'cat_index'.
4617      */
4618
4619     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4620                            "Entering get_locale_string_utf8ness_i; locale=%s,"
4621                            " index=%u(%s), string=%s, known_utf8=%d\n",
4622                            locale, cat_index, category_names[cat_index],
4623                            ((string)
4624                             ?  _byte_dump_string((U8 *) string,
4625                                                  strlen(string),
4626                                                  0)
4627                             : "nil"),
4628                            known_utf8));
4629     if (string == NULL) {
4630         return UTF8NESS_IMMATERIAL;
4631     }
4632
4633     if (IN_BYTES) { /* respect 'use bytes' */
4634         return UTF8NESS_NO;
4635     }
4636
4637     Size_t len = strlen(string);
4638
4639     /* UTF8ness is immaterial if the representation doesn't vary */
4640     const U8 * first_variant = NULL;
4641     if (is_utf8_invariant_string_loc((U8 *) string, len, &first_variant)) {
4642         return UTF8NESS_IMMATERIAL;
4643     }
4644
4645     /* Can't be UTF-8 if invalid */
4646     if (! is_utf8_string((U8 *) first_variant,
4647                          len - ((char *) first_variant - string)))
4648     {
4649         return UTF8NESS_NO;
4650     }
4651
4652     /* Here and below, we know the string is legal UTF-8, containing at least
4653      * one character requiring a sequence of two or more bytes.  It is quite
4654      * likely to be UTF-8.  But it pays to be paranoid and do further checking.
4655      *
4656      * If we already know the UTF-8ness of the locale, then we immediately know
4657      * what the string is */
4658     if (UNLIKELY(known_utf8 != LOCALE_UTF8NESS_UNKNOWN)) {
4659         return (known_utf8 == LOCALE_IS_UTF8) ? UTF8NESS_YES : UTF8NESS_NO;
4660     }
4661
4662     if (locale == NULL) {
4663         locale = querylocale_i(cat_index);
4664     }
4665
4666     /* If the locale is UTF-8, the string is UTF-8;  otherwise it was
4667      * coincidental that the string is legal UTF-8
4668      *
4669      * However, if the perl is compiled to not pay attention to the category
4670      * being passed in, you might think that that locale is essentially always
4671      * the C locale, so it would make sense to say it isn't UTF-8.  But to get
4672      * here, the string has to contain characters unknown in the C locale.  And
4673      * in fact, Windows boxes are compiled without LC_MESSAGES, as their
4674      * message catalog isn't really a part of the locale system.  But those
4675      * messages really could be UTF-8, and given that the odds are rather small
4676      * of something not being UTF-8 but being syntactically valid UTF-8, khw
4677      * has decided to call such strings as UTF-8. */
4678     return (is_locale_utf8(locale)) ? UTF8NESS_YES : UTF8NESS_NO;
4679
4680 #  endif
4681
4682 }
4683
4684 STATIC bool
4685 S_is_locale_utf8(pTHX_ const char * locale)
4686 {
4687     PERL_ARGS_ASSERT_IS_LOCALE_UTF8;
4688
4689     /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise. */
4690
4691 #  if ! defined(USE_LOCALE)                                                   \
4692    || ! defined(USE_LOCALE_CTYPE)                                             \
4693    ||   defined(EBCDIC) /* There aren't any real UTF-8 locales at this time */
4694
4695     PERL_UNUSED_ARG(locale);
4696
4697     return FALSE;
4698
4699      /* Definitively, can't be UTF-8 */
4700 #    define HAS_DEFINITIVE_UTF8NESS_DETERMINATION
4701 #  else
4702
4703     /* If the input happens to be the same locale as we are currently setup
4704      * for, the answer has already been cached. */
4705     if (strEQ(locale, PL_ctype_name)) {
4706         return PL_in_utf8_CTYPE_locale;
4707     }
4708
4709     if (isNAME_C_OR_POSIX(locale)) {
4710         return false;
4711     }
4712
4713 #    if ! defined(HAS_SOME_LANGINFO) && ! defined(WIN32)
4714
4715     /* On non-Windows without nl_langinfo(), we have to do some digging to get
4716      * the answer.  First, toggle to the desired locale so can query its state
4717      * */
4718     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
4719
4720 #      define TEARDOWN_FOR_IS_LOCALE_UTF8                                   \
4721                       restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale)
4722
4723 #      ifdef MB_CUR_MAX
4724
4725     /* If there are fewer bytes available in this locale than are required
4726      * to represent the largest legal UTF-8 code point, this isn't a UTF-8
4727      * locale. */
4728     const int mb_cur_max = MB_CUR_MAX;
4729     if (mb_cur_max < (int) UNISKIP(PERL_UNICODE_MAX)) {
4730         TEARDOWN_FOR_IS_LOCALE_UTF8;
4731         return false;
4732     }
4733
4734 #      endif
4735 #      if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4736
4737          /* With these functions, we can definitively determine a locale's
4738           * UTF-8ness */
4739 #        define HAS_DEFINITIVE_UTF8NESS_DETERMINATION
4740
4741     /* If libc mbtowc() evaluates the bytes that form the REPLACEMENT CHARACTER
4742      * as that Unicode code point, this has to be a UTF-8 locale; otherwise it
4743      * can't be  */
4744     wchar_t wc = 0;
4745     (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0);/* Reset shift state */
4746     int mbtowc_ret = Perl_mbtowc_(aTHX_ &wc,
4747                                   STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
4748     TEARDOWN_FOR_IS_LOCALE_UTF8;
4749     return (   mbtowc_ret == STRLENs(REPLACEMENT_CHARACTER_UTF8)
4750             && wc == UNICODE_REPLACEMENT);
4751
4752 #      else
4753
4754         /* If the above two C99 functions aren't working, you could try some
4755          * different methods.  It seems likely that the obvious choices,
4756          * wctomb() and wcrtomb(), wouldn't be working either.  But you could
4757          * choose one of the dozen-ish Unicode titlecase triples and verify
4758          * that towupper/towlower work as expected.
4759          *
4760          * But, our emulation of nl_langinfo() works quite well, so avoid the
4761          * extra code until forced to by some weird non-conforming platform. */
4762 #        define USE_LANGINFO_FOR_UTF8NESS
4763 #        undef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
4764 #      endif
4765 #    else
4766
4767      /* On Windows or on platforms with nl_langinfo(), there is a direct way to
4768       * get the locale's codeset, which will be some form of 'UTF-8' for a
4769       * UTF-8 locale.  my_langinfo_i() handles this, and we will call that
4770       * below */
4771 #      define HAS_DEFINITIVE_UTF8NESS_DETERMINATION
4772 #      define USE_LANGINFO_FOR_UTF8NESS
4773 #      define TEARDOWN_FOR_IS_LOCALE_UTF8
4774 #    endif  /* USE_LANGINFO_FOR_UTF8NESS */
4775
4776      /* If the above compiled into code, it found the locale's UTF-8ness,
4777       * nothing more to do; if it didn't get compiled,
4778       * USE_LANGINFO_FOR_UTF8NESS is defined.  There are two possible reasons:
4779       *   1)  it is the preferred method because it knows directly for sure
4780       *       what the codeset is because the platform has libc functions that
4781       *       return this; or
4782       *   2)  the functions the above code section would compile to use don't
4783       *       exist or are unreliable on this platform; we are less sure of the
4784       *       my_langinfo() result, though it is very unlikely to be wrong
4785       *       about if it is UTF-8 or not */
4786 #    ifdef USE_LANGINFO_FOR_UTF8NESS
4787
4788     char * scratch_buffer = NULL;
4789     const char * codeset = my_langinfo_c(CODESET, LC_CTYPE, locale,
4790                                          &scratch_buffer, NULL, NULL);
4791     bool retval = is_codeset_name_UTF8(codeset);
4792
4793     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4794                            "found codeset=%s, is_utf8=%d\n", codeset, retval));
4795
4796     Safefree(scratch_buffer);
4797
4798     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "is_locale_utf8(%s) returning %d\n",
4799                                                             locale, retval));
4800     TEARDOWN_FOR_IS_LOCALE_UTF8;
4801     return retval;
4802
4803 #    endif
4804 #  endif      /* End of the #else clause, for the non-trivial case */
4805
4806 }
4807
4808 #endif
4809
4810 STATIC void
4811 S_set_save_buffer_min_size(pTHX_ Size_t min_len,
4812                                  char **buf,
4813                                  Size_t * buf_cursize)
4814 {
4815     /* Make sure the buffer pointed to by *buf is at least as large 'min_len';
4816      * *buf_cursize is the size of 'buf' upon entry; it will be updated to the
4817      * new size on exit.  'buf_cursize' being NULL is to be used when this is a
4818      * single use buffer, which will shortly be freed by the caller. */
4819
4820     if (buf_cursize == NULL) {
4821         Newx(*buf, min_len, char);
4822     }
4823     else if (*buf_cursize == 0) {
4824         Newx(*buf, min_len, char);
4825         *buf_cursize = min_len;
4826     }
4827     else if (min_len > *buf_cursize) {
4828         Renew(*buf, min_len, char);
4829         *buf_cursize = min_len;
4830     }
4831 }
4832
4833 STATIC const char *
4834 S_save_to_buffer(pTHX_ const char * string, char **buf, Size_t *buf_size)
4835 {
4836     PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
4837
4838     /* Copy the NUL-terminated 'string' to a buffer whose address before this
4839      * call began at *buf, and whose available length before this call was
4840      * *buf_size.
4841      *
4842      * If the length of 'string' is greater than the space available, the
4843      * buffer is grown accordingly, which may mean that it gets relocated.
4844      * *buf and *buf_size will be updated to reflect this.
4845      *
4846      * Regardless, the function returns a pointer to where 'string' is now
4847      * stored.
4848      *
4849      * 'string' may be NULL, which means no action gets taken, and NULL is
4850      * returned.
4851      *
4852      * 'buf_size' being NULL is to be used when this is a single use buffer,
4853      * which will shortly be freed by the caller.
4854      *
4855      * If *buf or 'buf_size' are NULL or *buf_size is 0, the buffer is assumed
4856      * empty, and memory is malloc'd.
4857      */
4858
4859     if (! string) {
4860         return NULL;
4861     }
4862
4863     /* No-op to copy over oneself */
4864     if (string == *buf) {
4865         return string;
4866     }
4867
4868     Size_t string_size = strlen(string) + 1;
4869     set_save_buffer_min_size(string_size, buf, buf_size);
4870
4871 #  ifdef DEBUGGING
4872
4873     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4874                          "Copying '%s' to %p\n",
4875                          ((is_utf8_string((U8 *) string, 0))
4876                           ? string
4877                           :_byte_dump_string((U8 *) string, strlen(string), 0)),
4878                           *buf));
4879
4880 #    ifdef USE_LOCALE_CTYPE
4881
4882     /* Catch glitches.  Usually this is because LC_CTYPE needs to be the same
4883      * locale as whatever is being worked on */
4884     if (UNLIKELY(instr(string, REPLACEMENT_CHARACTER_UTF8))) {
4885         locale_panic_(Perl_form(aTHX_
4886                                 "Unexpected REPLACEMENT_CHARACTER in '%s'\n%s",
4887                                 string, get_LC_ALL_display()));
4888     }
4889
4890 #    endif
4891 #  endif
4892
4893     Copy(string, *buf, string_size, char);
4894     return *buf;
4895 }
4896
4897 #ifdef USE_LOCALE
4898 #  ifdef WIN32
4899
4900 bool
4901 Perl_get_win32_message_utf8ness(pTHX_ const char * string)
4902 {
4903     /* This is because Windows doesn't have LC_MESSAGES. */
4904
4905 #    ifdef USE_LOCALE_CTYPE
4906
4907     /* We don't know the locale utf8ness here, and not even the locale itself.
4908      * Since Windows uses a different mechanism to specify message language
4909      * output than the locale system, it is going to be problematic deciding
4910      * if we are to store it as UTF-8 or not.  By specifying LOCALE_IS_UTF8, we
4911      * are telling the called function to return true iff the string has
4912      * non-ASCII characters in it that are all syntactically UTF-8.  We are
4913      * thus relying on the fact that a string that is syntactically valid UTF-8
4914      * is likely to be UTF-8.  Should this ever cause problems, this function
4915      * could be replaced by something more Windows-specific */
4916     return get_locale_string_utf8ness_i(string, LOCALE_IS_UTF8,
4917                                         NULL, LC_CTYPE_INDEX_);
4918 #    else
4919
4920     PERL_UNUSED_ARG(string);
4921     return false;
4922
4923 #    endif
4924
4925 }
4926
4927 #  endif
4928 #endif  /* USE_LOCALE */
4929
4930 int
4931 Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len)
4932 {
4933
4934 #if ! defined(HAS_MBRTOWC) && ! defined(HAS_MBTOWC)
4935
4936     PERL_UNUSED_ARG(pwc);
4937     PERL_UNUSED_ARG(s);
4938     PERL_UNUSED_ARG(len);
4939     return -1;
4940
4941 #else   /* Below we have some form of mbtowc() */
4942 #  if defined(HAS_MBRTOWC)                                      \
4943    && (defined(USE_LOCALE_THREADS) || ! defined(HAS_MBTOWC))
4944 #    define USE_MBRTOWC
4945 #  else
4946 #    undef USE_MBRTOWC
4947 #  endif
4948
4949     int retval = -1;
4950
4951     if (s == NULL) { /* Initialize the shift state to all zeros in
4952                         PL_mbrtowc_ps. */
4953
4954 #  if defined(USE_MBRTOWC)
4955
4956         memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
4957         return 0;
4958
4959 #  else
4960
4961         SETERRNO(0, 0);
4962         MBTOWC_LOCK_;
4963         retval = mbtowc(NULL, NULL, 0);
4964         MBTOWC_UNLOCK_;
4965         return retval;
4966
4967 #  endif
4968
4969     }
4970
4971 #  if defined(USE_MBRTOWC)
4972
4973     SETERRNO(0, 0);
4974     MBRTOWC_LOCK_;
4975     retval = (SSize_t) mbrtowc((wchar_t *) pwc, s, len, &PL_mbrtowc_ps);
4976     MBRTOWC_UNLOCK_;
4977
4978 #  else
4979
4980     /* Locking prevents races, but locales can be switched out without locking,
4981      * so this isn't a cure all */
4982     SETERRNO(0, 0);
4983     MBTOWC_LOCK_;
4984     retval = mbtowc((wchar_t *) pwc, s, len);
4985     MBTOWC_UNLOCK_;
4986
4987 #  endif
4988
4989     return retval;
4990
4991 #endif
4992
4993 }
4994
4995 /*
4996 =for apidoc Perl_localeconv
4997
4998 This is a thread-safe version of the libc L<localeconv(3)>.  It is the same as
4999 L<POSIX::localeconv|POSIX/localeconv> (returning a hash of the C<localeconv()>
5000 fields), but directly callable from XS code.
5001
5002 =cut
5003 */
5004
5005 HV *
5006 Perl_localeconv(pTHX)
5007 {
5008
5009 #if  ! defined(HAS_LOCALECONV)
5010
5011     return newHV();
5012
5013 #else
5014
5015     return my_localeconv(0);
5016
5017 #endif
5018
5019 }
5020
5021 #if defined(HAS_LOCALECONV)
5022
5023 HV *
5024 S_my_localeconv(pTHX_ const int item)
5025 {
5026     PERL_ARGS_ASSERT_MY_LOCALECONV;
5027
5028     /* This returns a mortalized hash containing all or certain elements
5029      * returned by localeconv(). */
5030     HV * hv = newHV();      /* The returned hash, initially empty */
5031     sv_2mortal((SV*)hv);
5032
5033     /* The function is used by Perl_localeconv() and POSIX::localeconv(), or
5034      * internally from this file, and is thread-safe.
5035      *
5036      * localeconv() returns items from two different locale categories,
5037      * LC_MONETARY and LC_NUMERIC.  Various data structures in this function
5038      * are arrays with two elements, one for each category, and these indexes
5039      * indicate which array element applies to which category */
5040 #define NUMERIC_OFFSET   0
5041 #define MONETARY_OFFSET  1
5042
5043     /* Some operations apply to one or the other category, or both.  A mask
5044      * is used to specify all the possibilities.  This macro converts from the
5045      * category offset to its bit position in the mask. */
5046 #define OFFSET_TO_BIT(i)  (1 << (i))
5047
5048     /* There are two use cases for this function:
5049      * 1) Called as Perl_localeconv(), or from POSIX::locale_conv().  This
5050      *    returns the lconv structure copied to a hash, based on the current
5051      *    underlying locales for LC_NUMERIC and LC_MONETARY. An input item==0
5052      *    signifies this case, or on many platforms it is the only use case
5053      *    compiled.
5054      * 2) Certain items that nl_langinfo() provides are also derivable from
5055      *    the return of localeconv().  Windows notably doesn't have
5056      *    nl_langinfo(), so on that, and actually any platform lacking it,
5057      *    my_localeconv() is used also to emulate it for those particular
5058      *    items.  The code to do this is compiled only on such platforms.
5059      *    Rather than going to the expense of creating a full hash when only
5060      *    one item is needed, the returned hash has just the desired item in
5061      *    it.
5062      *
5063      * To access all the localeconv() struct lconv fields, there is a data
5064      * structure that contains every commonly documented field in it.  (Maybe
5065      * some minority platforms have extra fields.  Those could be added here
5066      * without harm; they would just be ignored on platforms lacking them.)
5067      *
5068      * Our structure is compiled to make looping through the fields easier by
5069      * pointing each name to its value's offset within lconv, e.g.,
5070         { "thousands_sep", STRUCT_OFFSET(struct lconv, thousands_sep) }
5071      */
5072 #  define LCONV_ENTRY(name)                                                 \
5073                        {STRINGIFY(name), STRUCT_OFFSET(struct lconv, name)}
5074
5075     /* These synonyms are just for clarity, and to make it easier in case
5076      * something needs to change in the future */
5077 #  define LCONV_NUMERIC_ENTRY(name)  LCONV_ENTRY(name)
5078 #  define LCONV_MONETARY_ENTRY(name) LCONV_ENTRY(name)
5079
5080     /* There are just a few fields for NUMERIC strings */
5081     const lconv_offset_t lconv_numeric_strings[] = {
5082 #  ifndef NO_LOCALECONV_GROUPING
5083         LCONV_NUMERIC_ENTRY(grouping),
5084 #   endif
5085         LCONV_NUMERIC_ENTRY(thousands_sep),
5086 #   define THOUSANDS_SEP_LITERAL  "thousands_sep"
5087         LCONV_NUMERIC_ENTRY(decimal_point),
5088 #   define DECIMAL_POINT_LITERAL "decimal_point"
5089         {NULL, 0}
5090     };
5091
5092     /* When used to implement nl_langinfo(), we save time by only populating
5093      * the hash with the field(s) needed.  Thus we would need a data structure
5094      * of just:
5095      *  LCONV_NUMERIC_ENTRY(decimal_point),
5096      *  {NULL, 0}
5097      *
5098      * By placing the decimal_point field last in the full structure, we can
5099      * use just the tail for this bit of it, saving space.  This macro yields
5100      * the address of the sub structure. */
5101 #  define DECIMAL_POINT_ADDRESS                                             \
5102         &lconv_numeric_strings[(C_ARRAY_LENGTH(lconv_numeric_strings) - 2)]
5103
5104     /* And the MONETARY string fields */
5105     const lconv_offset_t lconv_monetary_strings[] = {
5106         LCONV_MONETARY_ENTRY(int_curr_symbol),
5107         LCONV_MONETARY_ENTRY(mon_decimal_point),
5108 #  ifndef NO_LOCALECONV_MON_THOUSANDS_SEP
5109         LCONV_MONETARY_ENTRY(mon_thousands_sep),
5110 #  endif
5111 #  ifndef NO_LOCALECONV_MON_GROUPING
5112         LCONV_MONETARY_ENTRY(mon_grouping),
5113 #  endif
5114         LCONV_MONETARY_ENTRY(positive_sign),
5115         LCONV_MONETARY_ENTRY(negative_sign),
5116         LCONV_MONETARY_ENTRY(currency_symbol),
5117 #  define CURRENCY_SYMBOL_LITERAL  "currency_symbol"
5118         {NULL, 0}
5119     };
5120
5121     /* Like above, this field being last can be used as a sub structure */
5122 #  define CURRENCY_SYMBOL_ADDRESS                                            \
5123       &lconv_monetary_strings[(C_ARRAY_LENGTH(lconv_monetary_strings) - 2)]
5124
5125     /* Finally there are integer fields, all are for monetary purposes */
5126     const lconv_offset_t lconv_integers[] = {
5127         LCONV_ENTRY(int_frac_digits),
5128         LCONV_ENTRY(frac_digits),
5129         LCONV_ENTRY(p_sep_by_space),
5130         LCONV_ENTRY(n_cs_precedes),
5131         LCONV_ENTRY(n_sep_by_space),
5132         LCONV_ENTRY(p_sign_posn),
5133         LCONV_ENTRY(n_sign_posn),
5134 #  ifdef HAS_LC_MONETARY_2008
5135         LCONV_ENTRY(int_p_cs_precedes),
5136         LCONV_ENTRY(int_p_sep_by_space),
5137         LCONV_ENTRY(int_n_cs_precedes),
5138         LCONV_ENTRY(int_n_sep_by_space),
5139         LCONV_ENTRY(int_p_sign_posn),
5140         LCONV_ENTRY(int_n_sign_posn),
5141 #  endif
5142 #      define P_CS_PRECEDES_LITERAL    "p_cs_precedes"
5143         LCONV_ENTRY(p_cs_precedes),
5144         {NULL, 0}
5145     };
5146
5147     /* Like above, this field being last can be used as a sub structure */
5148 #  define P_CS_PRECEDES_ADDRESS                                       \
5149       &lconv_integers[(C_ARRAY_LENGTH(lconv_integers) - 2)]
5150
5151     /* The actual populating of the hash is done by two sub functions that get
5152      * passed an array of length two containing the data structure they are
5153      * supposed to use to get the key names to fill the hash with.  One element
5154      * is always for the NUMERIC strings (or NULL if none to use), and the
5155      * other element similarly for the MONETARY ones. */
5156     const lconv_offset_t * strings[2] = { lconv_numeric_strings,
5157                                           lconv_monetary_strings
5158                                         };
5159
5160     /* The LC_MONETARY category also has some integer-valued fields, whose
5161      * information is kept in a separate parallel array to 'strings' */
5162     const lconv_offset_t * integers[2] = {
5163                                            NULL,
5164                                            lconv_integers
5165                                          };
5166
5167 #  if ! defined(USE_LOCALE_NUMERIC) && ! defined(USE_LOCALE_MONETARY)
5168
5169     /* If both NUMERIC and MONETARY must be the "C" locale, simply populate the
5170      * hash using the function that works on just that locale. */
5171     populate_hash_from_C_localeconv(hv,
5172                                     "C",
5173                                     (  OFFSET_TO_BIT(NUMERIC_OFFSET)
5174                                      | OFFSET_TO_BIT(MONETARY_OFFSET)),
5175                                      strings, integers);
5176
5177     /* We shouldn't get to here for the case of an individual item, as
5178      * preprocessor directives elsewhere in this file should have filled in the
5179      * correct values at a higher level */
5180     assert(item == 0);
5181     PERL_UNUSED_ARG(item);
5182
5183     return hv;
5184
5185 #  else
5186
5187     /* From here to the end of this function, at least one of NUMERIC or
5188      * MONETARY can be non-C */
5189
5190     /* This is a mask, with one bit to tell the populate functions to populate
5191      * the NUMERIC items; another bit for the MONETARY ones.  This way they can
5192      * choose which (or both) to populate from */
5193     U32 index_bits = 0;
5194
5195     /* Some platforms, for correct non-mojibake results, require LC_CTYPE's
5196      * locale to match LC_NUMERIC's for the numeric fields, and LC_MONETARY's
5197      * for the monetary ones.  What happens if LC_NUMERIC and LC_MONETARY
5198      * aren't compatible?  Wrong results.  To avoid that, we call localeconv()
5199      * twice, once for each locale, setting LC_CTYPE to match the category.
5200      * But if the locales of both categories are the same, there is no need for
5201      * a second call.  Assume this is the case unless overridden below */
5202     bool requires_2nd_localeconv = false;
5203
5204     /* The actual hash populating is done by one of the two populate functions.
5205      * Which one is appropriate for either the MONETARY_OFFSET or the
5206      * NUMERIC_OFFSET is calculated and then stored in this table */
5207     void (*populate[2]) (pTHX_
5208                          HV * ,
5209                          const char *,
5210                          const U32,
5211                          const lconv_offset_t **,
5212                          const lconv_offset_t **);
5213
5214     /* This gives the locale to use for the corresponding OFFSET, like the
5215      * 'populate' array above */
5216     const char * locales[2];
5217
5218 #  ifdef HAS_SOME_LANGINFO
5219
5220     /* If the only use-case for this is the full localeconv(), the 'item'
5221      * parameter is ignored. */
5222     PERL_UNUSED_ARG(item);
5223
5224 #  else     /* This only gets compiled for the use-case of using localeconv()
5225                to emulate nl_langinfo() when missing from the platform. */
5226
5227 #      ifdef USE_LOCALE_NUMERIC
5228
5229     /* We need this substructure to only return this field for the THOUSEP
5230      * item.  The other items also need substructures, but they were handled
5231      * above by placing the substructure's item at the end of the full one, so
5232      * the data structure could do double duty.  However, both this and
5233      * RADIXCHAR would need to be in the final position of the same full
5234      * structure; an impossibility.  So make this into a separate structure */
5235     const lconv_offset_t  thousands_sep_string[] = {
5236         LCONV_NUMERIC_ENTRY(thousands_sep),
5237         {NULL, 0}
5238     };
5239
5240 #      endif
5241
5242     /* End of all the initialization of data structures.  Now for actual code.
5243      *
5244      * Without nl_langinfo(), the call to my_localeconv() could be for all of
5245      * the localeconv() items or for just one of the following 3 items to
5246      * emulate nl_langinfo().
5247      *
5248      * This is compiled only when using perl_langinfo.h, which we control, and
5249      * it has been constructed so that no item is numbered 0.
5250      *
5251      * For each individual item, either return the known value if the current
5252      * locale is "C", or set up the appropriate parameters for the call below
5253      * to the populate function */
5254     if (item != 0) {
5255         const char *locale;
5256
5257         switch (item) {
5258           default:
5259             locale_panic_(Perl_form(aTHX_
5260                           "Unexpected item passed to my_localeconv: %d", item));
5261             break;
5262
5263 #      ifdef USE_LOCALE_NUMERIC
5264
5265           case RADIXCHAR:
5266             if (isNAME_C_OR_POSIX(PL_numeric_name)) {
5267                 (void) hv_stores(hv, DECIMAL_POINT_LITERAL, newSVpvs("."));
5268                 return hv;
5269             }
5270
5271             strings[NUMERIC_OFFSET] = DECIMAL_POINT_ADDRESS;
5272             goto numeric_common;
5273
5274           case THOUSEP:
5275             if (isNAME_C_OR_POSIX(PL_numeric_name)) {
5276                 (void) hv_stores(hv, THOUSANDS_SEP_LITERAL, newSVpvs(""));
5277                 return hv;
5278             }
5279
5280             strings[NUMERIC_OFFSET] = thousands_sep_string;
5281
5282           numeric_common:
5283             index_bits = OFFSET_TO_BIT(NUMERIC_OFFSET);
5284             locale = PL_numeric_name;
5285             break;
5286
5287 #      endif
5288 #      ifdef USE_LOCALE_MONETARY
5289
5290           case CRNCYSTR:    /* This item needs the values for both the currency
5291                                symbol, and another one used to construct the
5292                                nl_langino()-compatible return. */
5293
5294             locale = querylocale_c(LC_MONETARY);
5295             if (isNAME_C_OR_POSIX(locale)) {
5296                 (void) hv_stores(hv, CURRENCY_SYMBOL_LITERAL, newSVpvs(""));
5297                 (void) hv_stores(hv, P_CS_PRECEDES_LITERAL, newSViv(-1));
5298                 return hv;
5299             }
5300
5301             strings[MONETARY_OFFSET] = CURRENCY_SYMBOL_ADDRESS;
5302             integers[MONETARY_OFFSET] = P_CS_PRECEDES_ADDRESS;
5303
5304             index_bits = OFFSET_TO_BIT(MONETARY_OFFSET);
5305             break;
5306
5307 #      endif
5308
5309         } /* End of switch() */
5310
5311         /* There's only one item, so only one of each of these will get used,
5312          * but cheap to initialize both */
5313         populate[MONETARY_OFFSET] =
5314         populate[NUMERIC_OFFSET]  = S_populate_hash_from_localeconv;
5315         locales[MONETARY_OFFSET] = locales[NUMERIC_OFFSET]  = locale;
5316     }
5317     else   /* End of for just one item to emulate nl_langinfo() */
5318
5319 #    endif
5320
5321     {
5322         /* Here, the call is for all of localeconv().  It has a bunch of
5323          * items.  The first function call always gets the MONETARY values */
5324         index_bits = OFFSET_TO_BIT(MONETARY_OFFSET);
5325
5326 #    ifdef USE_LOCALE_MONETARY
5327
5328         locales[MONETARY_OFFSET] = querylocale_c(LC_MONETARY);
5329         populate[MONETARY_OFFSET] =
5330                                 (isNAME_C_OR_POSIX(locales[MONETARY_OFFSET]))
5331                                 ?  S_populate_hash_from_C_localeconv
5332                                 :  S_populate_hash_from_localeconv;
5333
5334 #    else
5335
5336         locales[MONETARY_OFFSET] = "C";
5337         populate[MONETARY_OFFSET] = S_populate_hash_from_C_localeconv;
5338
5339 #    endif
5340 #    ifdef USE_LOCALE_NUMERIC
5341
5342         /* And if the locales for the two categories are the same, we can also
5343          * do the NUMERIC values in the same call */
5344         if (strEQ(PL_numeric_name, locales[MONETARY_OFFSET])) {
5345             index_bits |= OFFSET_TO_BIT(NUMERIC_OFFSET);
5346             locales[NUMERIC_OFFSET] = locales[MONETARY_OFFSET];
5347             populate[NUMERIC_OFFSET] = populate[MONETARY_OFFSET];
5348         }
5349         else {
5350             requires_2nd_localeconv = true;
5351             locales[NUMERIC_OFFSET] = PL_numeric_name;
5352             populate[NUMERIC_OFFSET] = (isNAME_C_OR_POSIX(PL_numeric_name))
5353                                        ?  S_populate_hash_from_C_localeconv
5354                                        :  S_populate_hash_from_localeconv;
5355         }
5356
5357 #    else
5358
5359         /* When LC_NUMERIC is confined to "C", the two locales are the same
5360            iff LC_MONETARY in this case is also "C".  We set up the function
5361            for that case above, so fastest to test just its address */
5362         locales[NUMERIC_OFFSET] = "C";
5363         if (populate[MONETARY_OFFSET] == S_populate_hash_from_C_localeconv) {
5364             index_bits |= OFFSET_TO_BIT(NUMERIC_OFFSET);
5365             populate[NUMERIC_OFFSET] = populate[MONETARY_OFFSET];
5366         }
5367         else {
5368             requires_2nd_localeconv = true;
5369             populate[NUMERIC_OFFSET] = S_populate_hash_from_C_localeconv;
5370         }
5371
5372 #    endif
5373
5374     }   /* End of call is for localeconv() */
5375
5376     /* Call the proper populate function (which may call localeconv()) and copy
5377      * its results into the hash.  All the parameters have been initialized
5378      * above */
5379     (*populate[MONETARY_OFFSET])(aTHX_
5380                                  hv, locales[MONETARY_OFFSET],
5381                                  index_bits, strings, integers);
5382
5383 #    ifndef HAS_SOME_LANGINFO  /* Could be using this function to emulate
5384                                 nl_langinfo() */
5385
5386     /* We are done when called with an individual item.  There are no integer
5387      * items to adjust, and it's best for the caller to determine if this
5388      * string item is UTF-8 or not.  This is because the locale's UTF-8ness is
5389      * calculated below, and in some Configurations, that can lead to a
5390      * recursive call to here, which could recurse infinitely. */
5391     if (item != 0) {
5392         return hv;
5393     }
5394
5395 #    endif
5396
5397     /* The above call may have done all the hash fields, but not always, as
5398      * already explained.  If we need a second call it is always for the
5399      * NUMERIC fields */
5400     if (requires_2nd_localeconv) {
5401         (*populate[NUMERIC_OFFSET])(aTHX_
5402                                     hv,
5403                                     locales[NUMERIC_OFFSET],
5404                                     OFFSET_TO_BIT(NUMERIC_OFFSET),
5405                                     strings, integers);
5406     }
5407
5408     /* Here, the hash has been completely populated.
5409      *
5410      * Now go through all the items and:
5411      *  a) For string items, see if they should be marked as UTF-8 or not.
5412      *     This would have been more convenient and faster to do while
5413      *     populating the hash in the first place, but that operation has to be
5414      *     done within a critical section, keeping other threads from
5415      *     executing, so only the minimal amount of work necessary is done at
5416      *     that time.
5417      *  b) For integer items, convert the C CHAR_MAX value into -1.  Again,
5418      *     this could have been done in the critical section, but was deferred
5419      *     to here to keep to the bare minimum amount the time spent owning the
5420      *     processor. CHAR_MAX is a C concept for an 8-bit character type.
5421      *     Perl has no such type; the closest fit is a -1.
5422      *
5423      * XXX On unthreaded perls, this code could be #ifdef'd out, and the
5424      * corrections determined at hash population time, at an extra maintenance
5425      * cost which khw doesn't think is worth it
5426      */
5427
5428     for (unsigned int i = 0; i < 2; i++) {  /* Try both types of strings */
5429
5430         /* The return from this function is already adjusted */
5431         if (populate[i] == S_populate_hash_from_C_localeconv) {
5432             continue;
5433         }
5434
5435         /* Examine each string */
5436         for (const lconv_offset_t *strp = strings[i]; strp->name; strp++) {
5437             const char * name = strp->name;
5438
5439             /* 'value' will contain the string that may need to be marked as
5440              * UTF-8 */
5441             SV ** value = hv_fetch(hv, name, strlen(name), true);
5442             if (value == NULL) {
5443                 continue;
5444             }
5445
5446             /* Determine if the string should be marked as UTF-8. */
5447             if (UTF8NESS_YES == (get_locale_string_utf8ness_i(SvPVX(*value),
5448                                                   LOCALE_UTF8NESS_UNKNOWN,
5449                                                   locales[i],
5450                                                   LC_ALL_INDEX_ /* OOB */)))
5451             {
5452                 SvUTF8_on(*value);
5453             }
5454         }
5455
5456         if (integers[i] == NULL) {
5457             continue;
5458         }
5459
5460         /* And each integer */
5461         for (const lconv_offset_t *intp = integers[i]; intp->name; intp++) {
5462             const char * name = intp->name;
5463
5464         if (! name) {   /* Reached the end */
5465             break;
5466         }
5467
5468         SV ** value = hv_fetch(hv, name, strlen(name), true);
5469         if (! value) {
5470             continue;
5471         }
5472
5473         /* Change CHAR_MAX to -1 */
5474         if (SvIV(*value) == CHAR_MAX) {
5475             sv_setiv(*value, -1);
5476         }
5477         }
5478     }
5479
5480     return hv;
5481
5482 #  endif    /* End of must have one or both USE_MONETARY, USE_NUMERIC */
5483
5484 }
5485
5486 STATIC void
5487 S_populate_hash_from_C_localeconv(pTHX_ HV * hv,
5488                                         const char * locale,  /* Unused */
5489
5490                                         /* bit mask of which categories to
5491                                          * populate */
5492                                         const U32 which_mask,
5493
5494                                         /* The string type values to return;
5495                                          * one element for numeric; the other
5496                                          * for monetary */
5497                                         const lconv_offset_t * strings[2],
5498
5499                                         /* And the integer fields */
5500                                         const lconv_offset_t * integers[2])
5501 {
5502     PERL_ARGS_ASSERT_POPULATE_HASH_FROM_C_LOCALECONV;
5503     PERL_UNUSED_ARG(locale);
5504     assert(isNAME_C_OR_POSIX(locale));
5505
5506     /* Fill hv with the values that localeconv() is supposed to return for
5507      * the C locale */
5508
5509     U32 working_mask = which_mask;
5510     while (working_mask) {
5511
5512         /* Get the bit position of the next lowest set bit.  That is the
5513          * index into the 'strings' array of the category we use in this loop
5514          * iteration.  Turn the bit off so we don't work on this category
5515          * again in this function call. */
5516         const PERL_UINT_FAST8_T i = lsbit_pos(working_mask);
5517         working_mask &= ~ (1 << i);
5518
5519         /* This category's string fields */
5520         const lconv_offset_t * category_strings = strings[i];
5521
5522 #  ifndef HAS_SOME_LANGINFO /* This doesn't work properly if called on a single
5523                                item, which could only happen when there isn't
5524                                nl_langinfo on the platform */
5525         assert(category_strings[1].name != NULL);
5526 #  endif
5527
5528         /* All string fields are empty except for one NUMERIC one.  That one
5529          * has been initialized to be the final one in the NUMERIC strings, so
5530          * stop the loop early in that case.  Otherwise, we would store an
5531          * empty string to the hash, and immediately overwrite it with the
5532          * correct value */
5533         const unsigned int stop_early = (i == NUMERIC_OFFSET) ? 1 : 0;
5534
5535         /* A NULL element terminates the list */
5536         while ((category_strings + stop_early)->name) {
5537             (void) hv_store(hv,
5538                             category_strings->name,
5539                             strlen(category_strings->name),
5540                             newSVpvs(""),
5541                             0);
5542
5543             category_strings++;
5544         }
5545
5546         /* And fill in the NUMERIC exception */
5547         if (i == NUMERIC_OFFSET) {
5548             (void) hv_stores(hv, "decimal_point", newSVpvs("."));
5549             category_strings++;
5550         }
5551
5552         /* Add any int fields.  In the C locale, all are -1 */
5553         if (integers[i]) {
5554             const lconv_offset_t * current = integers[i];
5555             while (current->name) {
5556                 (void) hv_store(hv,
5557                                 current->name, strlen(current->name),
5558                                 newSViv(-1),
5559                                 0);
5560                 current++;
5561             }
5562         }
5563     }
5564 }
5565
5566 #  if defined(USE_LOCALE_NUMERIC) || defined(USE_LOCALE_MONETARY)
5567
5568 STATIC void
5569 S_populate_hash_from_localeconv(pTHX_ HV * hv,
5570
5571                                       /* Switch to this locale to run
5572                                        * localeconv() from */
5573                                       const char * locale,
5574
5575                                       /* bit mask of which categories to
5576                                        * populate */
5577                                       const U32 which_mask,
5578
5579                                       /* The string type values to return; one
5580                                        * element for numeric; the other for
5581                                        * monetary */
5582                                       const lconv_offset_t * strings[2],
5583
5584                                       /* And similarly the integer fields */
5585                                       const lconv_offset_t * integers[2])
5586 {
5587     PERL_ARGS_ASSERT_POPULATE_HASH_FROM_LOCALECONV;
5588
5589     /* Run localeconv() and copy some or all of its results to the input 'hv'
5590      * hash.  Most localeconv() implementations return the values in a global
5591      * static buffer, so the operation must be performed in a critical section,
5592      * ending only after the copy is completed.  There are so many locks
5593      * because localeconv() deals with two categories, and returns in a single
5594      * global static buffer.  Some locks might be no-ops on this platform, but
5595      * not others.  We need to lock if any one isn't a no-op. */
5596
5597     /* If the call could be for either or both of the two categories, we need
5598      * to test which one; but if the Configuration is such that we will never
5599      * be called with one of them, the code for that one will be #ifdef'd out
5600      * below, leaving code for just the other category.  That code will always
5601      * want to be executed, no conditional required.  Create a macro that
5602      * replaces the condition with an always-true value so the compiler will
5603      * omit the conditional */
5604 #    if defined(USE_LOCALE_NUMERIC) && defined(USE_LOCALE_MONETARY)
5605 #      define CALL_IS_FOR(x)  (which_mask & OFFSET_TO_BIT(x ## _OFFSET))
5606 #    else
5607 #      define CALL_IS_FOR(x) 1
5608 #    endif
5609
5610     start_DEALING_WITH_MISMATCHED_CTYPE(locale);
5611
5612 #    ifdef USE_LOCALE_NUMERIC
5613
5614     /* We need to toggle to the underlying NUMERIC locale if we are getting
5615      * NUMERIC strings */
5616     const char * orig_NUMERIC_locale = NULL;
5617     if (CALL_IS_FOR(NUMERIC)) {
5618         LC_NUMERIC_LOCK(0);
5619
5620 #      if defined(WIN32)
5621
5622         /* There is a bug in Windows in which setting LC_CTYPE after the others
5623          * doesn't actually take effect for localeconv().  See commit
5624          * 418efacd1950763f74ed3cc22f8cf9206661b892 for details.  Thus we have
5625          * to make sure that the locale we want is set after LC_CTYPE.  We
5626          * unconditionally toggle away from and back to the current locale
5627          * prior to calling localeconv(). */
5628         orig_NUMERIC_locale = toggle_locale_c(LC_NUMERIC, "C");
5629         toggle_locale_c(LC_NUMERIC, locale);
5630
5631 #      else
5632
5633         /* No need for the extra toggle when not on Windows */
5634         orig_NUMERIC_locale = toggle_locale_c(LC_NUMERIC, locale);
5635
5636 #      endif
5637
5638     }
5639
5640 #    endif
5641 #    if defined(USE_LOCALE_MONETARY) && defined(WIN32)
5642
5643     /* Same Windows bug as described just above for NUMERIC.  Otherwise, no
5644      * need to toggle LC_MONETARY, as it is kept in the underlying locale */
5645     const char * orig_MONETARY_locale = NULL;
5646     if (CALL_IS_FOR(MONETARY)) {
5647         orig_MONETARY_locale = toggle_locale_c(LC_MONETARY, "C");
5648         toggle_locale_c(LC_MONETARY, locale);
5649     }
5650
5651 #    endif
5652
5653     /* Finally ready to do the actual localeconv().  Lock to prevent other
5654      * accesses until we have made a copy of its returned static buffer */
5655     gwLOCALE_LOCK;
5656
5657 #    if defined(TS_W32_BROKEN_LOCALECONV) && defined(USE_THREAD_SAFE_LOCALE)
5658
5659     /* This is a workaround for another bug in Windows.  localeconv() was
5660      * broken with thread-safe locales prior to VS 15.  It looks at the global
5661      * locale instead of the thread one.  As a work-around, we toggle to the
5662      * global locale; populate the return; then toggle back.  We have to use
5663      * LC_ALL instead of the individual categories because of yet another bug
5664      * in Windows.  And this all has to be done in a critical section.
5665      *
5666      * This introduces a potential race with any other thread that has also
5667      * converted to use the global locale, and doesn't protect its locale calls
5668      * with mutexes.  khw can't think of any reason for a thread to do so on
5669      * Windows, as the locale API is the same regardless of thread-safety,
5670      * except if the code is ported from working on another platform where
5671      * there might be some reason to do this.  But this is typically due to
5672      * some alien-to-Perl library that thinks it owns locale setting.  Such a
5673      * library isn't likely to exist on Windows, so such an application is
5674      * unlikely to be run on Windows
5675      */
5676     bool restore_per_thread = FALSE;
5677
5678     /* Save the per-thread locale state */
5679     const char * save_thread = querylocale_c(LC_ALL);
5680
5681     /* Change to the global locale, and note if we already were there */
5682     int config_return = _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
5683     if (config_return != _DISABLE_PER_THREAD_LOCALE) {
5684         if (config_return == -1) {
5685             locale_panic_("_configthreadlocale returned an error");
5686         }
5687
5688         restore_per_thread = TRUE;
5689     }
5690
5691     /* Save the state of the global locale; then convert to our desired
5692      * state.  */
5693     const char * save_global = querylocale_c(LC_ALL);
5694     void_setlocale_c(LC_ALL, save_thread);
5695
5696 #    endif  /* TS_W32_BROKEN_LOCALECONV */
5697
5698     /* Finally, do the actual localeconv */
5699     const char *lcbuf_as_string = (const char *) localeconv();
5700
5701     /* Copy its results for each desired category as determined by
5702      * 'which_mask' */
5703     U32 working_mask = which_mask;
5704     while (working_mask) {
5705
5706         /* Get the bit position of the next lowest set bit.  That is the
5707          * index into the 'strings' array of the category we use in this loop
5708          * iteration.  Turn the bit off so we don't work on this category
5709          * again in this function call. */
5710         const PERL_UINT_FAST8_T i = lsbit_pos32(working_mask);
5711         working_mask &= ~ (1 << i);
5712
5713         /* Point to the string field list for the given category ... */
5714         const lconv_offset_t * category_strings = strings[i];
5715         while (category_strings->name) {
5716
5717             /* We have set things up so that we know where in the returned
5718              * structure, when viewed as a string, the corresponding value is.
5719              * */
5720             const char *value = *((const char **)(  lcbuf_as_string
5721                                                   + category_strings->offset));
5722             if (value) {    /* Copy to the hash */
5723                 (void) hv_store(hv,
5724                                 category_strings->name,
5725                                 strlen(category_strings->name),
5726                                 newSVpv(value, strlen(value)),
5727                                 0);
5728             }
5729
5730             category_strings++;
5731         }
5732
5733         /* Add any int fields to the HV*. */
5734         if (integers[i]) {
5735             const lconv_offset_t * current = integers[i];
5736             while (current->name) {
5737                 const char value = *((const char *)(  lcbuf_as_string
5738                                                     + current->offset));
5739                 (void) hv_store(hv,
5740                                 current->name, strlen(current->name),
5741                                 newSViv(value),
5742                                 0);
5743                 current++;
5744             }
5745         }
5746     }   /* End of loop through the fields */
5747
5748     /* Done with copying to the hash.  Can unwind the critical section locks */
5749
5750 #    if defined(TS_W32_BROKEN_LOCALECONV) && defined(USE_THREAD_SAFE_LOCALE)
5751
5752     /* Restore the global locale's prior state */
5753     void_setlocale_c(LC_ALL, save_global);
5754
5755     /* And back to per-thread locales */
5756     if (restore_per_thread) {
5757         if (_configthreadlocale(_ENABLE_PER_THREAD_LOCALE) == -1) {
5758             locale_panic_("_configthreadlocale returned an error");
5759         }
5760     }
5761
5762     /* Restore the per-thread locale state */
5763     void_setlocale_c(LC_ALL, save_thread);
5764
5765 #    endif  /* TS_W32_BROKEN_LOCALECONV */
5766
5767     gwLOCALE_UNLOCK;    /* Finished with the critical section of a
5768                            globally-accessible buffer */
5769
5770 #    if defined(USE_LOCALE_MONETARY) && defined(WIN32)
5771
5772     restore_toggled_locale_c(LC_MONETARY, orig_MONETARY_locale);
5773
5774 #    endif
5775 #    ifdef USE_LOCALE_NUMERIC
5776
5777     restore_toggled_locale_c(LC_NUMERIC, orig_NUMERIC_locale);
5778     if (CALL_IS_FOR(NUMERIC)) {
5779         LC_NUMERIC_UNLOCK;
5780     }
5781
5782 #    endif
5783
5784     end_DEALING_WITH_MISMATCHED_CTYPE(locale);
5785
5786 #    undef CALL_IS_FOR
5787
5788 }
5789
5790 #  endif    /* defined(USE_LOCALE_NUMERIC) || defined(USE_LOCALE_MONETARY) */
5791 #endif /* defined(HAS_LOCALECONV) */
5792
5793 /*
5794
5795 =for apidoc      Perl_langinfo
5796 =for apidoc_item Perl_langinfo8
5797
5798 C<Perl_langinfo> is an (almost) drop-in replacement for the system
5799 C<L<nl_langinfo(3)>>, taking the same C<item> parameter values, and returning
5800 the same information.  But it is more thread-safe than regular
5801 C<nl_langinfo()>, and hides the quirks of Perl's locale handling from your
5802 code, and can be used on systems that lack a native C<nl_langinfo>.
5803
5804 However, you should instead use the improved version of this:
5805 L</Perl_langinfo8>, which behaves identically except for an additional
5806 parameter, a pointer to a variable declared as L</C<utf8ness_t>>, into which it
5807 returns to you how you should treat the returned string with regards to it
5808 being encoded in UTF-8 or not.
5809
5810 Concerning the differences between these and plain C<nl_langinfo()>:
5811
5812 =over
5813
5814 =item a.
5815
5816 C<Perl_langinfo8> has an extra parameter, described above.  Besides this, the
5817 other reason they aren't quite a drop-in replacement is actually an advantage.
5818 The C<const>ness of the return allows the compiler to catch attempts to write
5819 into the returned buffer, which is illegal and could cause run-time crashes.
5820
5821 =item b.
5822
5823 They deliver the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
5824 without you having to write extra code.  The reason for the extra code would be
5825 because these are from the C<LC_NUMERIC> locale category, which is normally
5826 kept set by Perl so that the radix is a dot, and the separator is the empty
5827 string, no matter what the underlying locale is supposed to be, and so to get
5828 the expected results, you have to temporarily toggle into the underlying
5829 locale, and later toggle back.  (You could use plain C<nl_langinfo> and
5830 C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this but then you wouldn't get
5831 the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
5832 (or equivalent) locale would break a lot of CPAN, which is expecting the radix
5833 (decimal point) character to be a dot.)
5834
5835 =item c.
5836
5837 The system function they replace can have its static return buffer trashed,
5838 not only by a subsequent call to that function, but by a C<freelocale>,
5839 C<setlocale>, or other locale change.  The returned buffer of these functions
5840 is not changed until the next call to one or the other, so the buffer is never
5841 in a trashed state.
5842
5843 =item d.
5844
5845 The return buffer is per-thread, so it also is never overwritten by a call to
5846 these functions from another thread;  unlike the function it replaces.
5847
5848 =item e.
5849
5850 But most importantly, they work on systems that don't have C<nl_langinfo>, such
5851 as Windows, hence making your code more portable.  Of the fifty-some possible
5852 items specified by the POSIX 2008 standard,
5853 L<https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
5854 only one is completely unimplemented, though on non-Windows platforms, another
5855 significant one is not fully implemented).  They use various techniques to
5856 recover the other items, including calling C<L<localeconv(3)>>, and
5857 C<L<strftime(3)>>, both of which are specified in C89, so should be always be
5858 available.  Later C<strftime()> versions have additional capabilities.
5859 If an item is not available on your system, this returns either the value
5860 associated with the C locale, or simply C<"">, whichever is more appropriate.
5861
5862 It is important to note that, when called with an item that is recovered by
5863 using C<localeconv>, the buffer from any previous explicit call to
5864 C<L<localeconv(3)>> will be overwritten.  But you shouldn't be using
5865 C<localeconv> anyway because it is is very much not thread-safe, and suffers
5866 from the same problems outlined in item 'b.' above for the fields it returns
5867 that are controlled by the LC_NUMERIC locale category.  Instead, avoid all of
5868 those problems by calling L</Perl_localeconv>, which is thread-safe; or by
5869 using the methods given in L<perlcall>  to call
5870 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is also thread-safe.
5871
5872 =back
5873
5874 The details for those items which may deviate from what this emulation returns
5875 and what a native C<nl_langinfo()> would return are specified in
5876 L<I18N::Langinfo>.
5877
5878 =cut
5879
5880 */
5881
5882 /* external_call_langinfo() is an interface to callers from outside this file to
5883  * my_langinfo_i(), calculating a necessary value for it.  If those functions
5884  * aren't defined, the fallback function is emulate_langinfo(), which doesn't
5885  * use that value (as everything in this situation takes place in the "C"
5886  * locale), and so we define this macro to transparently hide the absence of
5887  * the missing functions */
5888 #ifndef external_call_langinfo
5889 #  define external_call_langinfo(item, utf8p, bufp, bufsizep)               \
5890                         emulate_langinfo(item, "C", bufp, bufsizep, utf8p)
5891 #endif
5892
5893 const char *
5894 Perl_langinfo(const nl_item item)
5895 {
5896     dTHX;
5897
5898     return external_call_langinfo(item,
5899                                   NULL,   /* Don't want UTF-8ness */
5900                                   &PL_langinfo_buf, &PL_langinfo_bufsize);
5901 }
5902
5903 const char *
5904 Perl_langinfo8(const nl_item item, utf8ness_t * utf8ness)
5905 {
5906     PERL_ARGS_ASSERT_PERL_LANGINFO8;
5907     dTHX;
5908
5909     if (utf8ness) {     /* Assume for now */
5910         *utf8ness = UTF8NESS_IMMATERIAL;
5911     }
5912
5913     return external_call_langinfo(item, utf8ness,
5914                                   &PL_langinfo_buf, &PL_langinfo_bufsize);
5915 }
5916
5917 #ifdef USE_LOCALE
5918
5919 const char *
5920 S_external_call_langinfo(pTHX_ const nl_item item, utf8ness_t * utf8ness,
5921                                char ** retbufp, Size_t * retbuf_sizep)
5922 {
5923     PERL_ARGS_ASSERT_EXTERNAL_CALL_LANGINFO;
5924
5925     /* Find the locale category that controls the input 'item', and call
5926      * my_langinfo_i() including that value.
5927      *
5928      * If we are not paying attention to that category, instead call
5929      * emulate_langinfo(), which knows how to handle this situation. */
5930     locale_category_index  cat_index = LC_ALL_INDEX_;  /* Out-of-bounds */
5931
5932     switch (item) {
5933       case CODESET:
5934
5935 #  ifdef USE_LOCALE_CTYPE
5936         cat_index = LC_CTYPE_INDEX_;
5937 #  endif
5938         break;
5939
5940
5941       case YESEXPR: case YESSTR: case NOEXPR: case NOSTR:
5942
5943 #  ifdef USE_LOCALE_MESSAGES
5944         cat_index = LC_MESSAGES_INDEX_;
5945 #  endif
5946         break;
5947
5948
5949       case CRNCYSTR:
5950
5951 #  ifdef USE_LOCALE_MONETARY
5952         cat_index = LC_MONETARY_INDEX_;
5953 #  endif
5954         break;
5955
5956
5957       case RADIXCHAR: case THOUSEP:
5958
5959 #  ifdef USE_LOCALE_NUMERIC
5960         cat_index = LC_NUMERIC_INDEX_;
5961 #  endif
5962         break;
5963
5964
5965       default:  /* The other possible items are all in LC_TIME. */
5966 #  ifdef USE_LOCALE_TIME
5967         cat_index = LC_TIME_INDEX_;
5968 #  endif
5969         break;
5970
5971     } /* End of switch on item */
5972
5973 #  if defined(HAS_IGNORED_LOCALE_CATEGORIES_) || ! defined(LC_MESSAGES)
5974
5975     /* If the above didn't find the category's index, it has to be because the
5976      * item is unknown to us (and the callee will handle that), or the category
5977      * is confined to the "C" locale on this platform, which the callee also
5978      * handles.  (LC_MESSAGES is not required by the C Standard (the others
5979      * above are), so we have to emulate it on platforms lacking it (such as
5980      * Windows).) */
5981     if (cat_index == LC_ALL_INDEX_) {
5982         return emulate_langinfo(item, "C",
5983                                 retbufp, retbuf_sizep,
5984                                 utf8ness);
5985     }
5986
5987 #  endif
5988
5989     /* And get the value for this 'item', whose category has now been
5990      * calculated.  We need to find the current corresponding locale, and pass
5991      * that as well. */
5992     return my_langinfo_i(item,
5993                          cat_index,
5994                          query_nominal_locale_i(cat_index),
5995                          retbufp, retbuf_sizep,
5996                          utf8ness);
5997 }
5998
5999 #endif
6000 #if defined(USE_LOCALE) && defined(HAS_NL_LANGINFO)
6001
6002 STATIC const char *
6003 S_my_langinfo_i(pTHX_
6004                 const nl_item item,           /* The item to look up */
6005
6006                 /* The locale category that controls it */
6007                 locale_category_index cat_index,
6008
6009                 /* The locale to look up 'item' in. */
6010                 const char * locale,
6011
6012                 /* Where to store the result, and where the size of that buffer
6013                  * is stored, updated on exit. retbuf_sizep may be NULL for an
6014                  * empty-on-entry, single use buffer whose size we don't need
6015                  * to keep track of */
6016                 char ** retbufp,
6017                 Size_t * retbuf_sizep,
6018
6019                 /* If not NULL, the location to store the UTF8-ness of 'item's
6020                  * value, as documented */
6021                 utf8ness_t * utf8ness)
6022 {
6023     PERL_ARGS_ASSERT_MY_LANGINFO_I;
6024     assert(cat_index < LC_ALL_INDEX_);
6025
6026     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6027                            "Entering my_langinfo item=%ld, using locale %s\n",
6028                            (long) item, locale));
6029
6030 #    ifdef HAS_IGNORED_LOCALE_CATEGORIES
6031
6032     if (! category_available[cat_index]) {
6033         return emulate_langinfo(item, locale,
6034                                 retbufp, retbuf_sizep,
6035                                 utf8ness);
6036     }
6037
6038 #    endif
6039
6040     /* One might be tempted to avoid any toggling by instead using
6041      * nl_langinfo_l() on platforms that have it.  This would entail creating a
6042      * locale object with newlocale() and freeing it afterwards.  But doing so
6043      * runs significantly slower than just doing the toggle ourselves.
6044      * lib/locale_threads.t was slowed down by 25% on Ubuntu 22.04 */
6045
6046     start_DEALING_WITH_MISMATCHED_CTYPE(locale);
6047
6048     const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
6049
6050         gwLOCALE_LOCK;
6051         const char * retval = save_to_buffer(nl_langinfo(item),
6052                                              retbufp, retbuf_sizep);
6053         gwLOCALE_UNLOCK;
6054
6055     restore_toggled_locale_i(cat_index, orig_switched_locale);
6056     end_DEALING_WITH_MISMATCHED_CTYPE(locale)
6057
6058     if (utf8ness) {
6059         *utf8ness = get_locale_string_utf8ness_i(retval,
6060                                                  LOCALE_UTF8NESS_UNKNOWN,
6061                                                  locale, cat_index);
6062     }
6063
6064     return retval;
6065 }
6066
6067 #endif
6068 #ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
6069
6070 /* Forward declaration of function that we don't put into embed.fnc so as to
6071  * make its removal easier, as there may not be any extant platforms that need
6072  * it; and the function is located after emulate_langinfo() because it's easier
6073  * to understand when placed in the context of that code */
6074 STATIC const char * S_override_codeset_if_utf8_found(pTHX_
6075                                                      const char *codeset,
6076                                                      const char *locale);
6077 #endif
6078 #if ! defined(HAS_NL_LANGINFO)                      \
6079  ||   defined(HAS_IGNORED_LOCALE_CATEGORIES_)       \
6080  || ! defined(LC_MESSAGES)
6081
6082 STATIC const char *
6083 S_emulate_langinfo(pTHX_ const int item,
6084                          const char * locale,
6085                          char ** retbufp,
6086                          Size_t * retbuf_sizep,
6087                          utf8ness_t * utf8ness)
6088 {
6089     PERL_ARGS_ASSERT_EMULATE_LANGINFO;
6090 #  ifndef USE_LOCALE
6091     PERL_UNUSED_ARG(locale);
6092 #  endif
6093
6094     /* This emulates nl_langinfo() on platforms:
6095      *   1) where it doesn't exist; or
6096      *   2) where it does exist, but there are categories that it shouldn't be
6097      *      called on because they don't exist on the platform or we are
6098      *      supposed to always stay in the C locale for them.  This function
6099      *      has hard-coded in the results for those for the C locale.
6100      *
6101      * The major platform lacking nl_langinfo() is Windows.  It does have
6102      * GetLocaleInfoEx() that could be used to get most of the items, but it
6103      * (and other similar Windows API functions) use what MS calls "locale
6104      * names", whereas the C functions use what MS calls "locale strings".  The
6105      * locale string "English_United_States.1252" is equivalent to the locale
6106      * name "en_US".  There are tables inside Windows that translate between
6107      * the two forms, but they are not exposed.  Also calling setlocale(), then
6108      * calling GetThreadLocale() doesn't work, as the former doesn't change the
6109      * latter's return.  Therefore we are stuck using the mechanisms below. */
6110
6111     /* Almost all the items will have ASCII return values.  Set that here, and
6112      * override if necessary */
6113     utf8ness_t is_utf8 = UTF8NESS_IMMATERIAL;
6114     const char * retval = NULL;
6115     bool retval_saved = false;
6116
6117     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6118                         "Entering emulate_langinfo item=%ld, using locale %s\n",
6119                         (long) item, locale));
6120
6121 #  if defined(HAS_LOCALECONV) && (   defined(USE_LOCALE_NUMERIC)      \
6122                                   || defined(USE_LOCALE_MONETARY))
6123
6124     locale_category_index  cat_index;
6125
6126 #  endif
6127
6128     GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
6129
6130     switch (item) {
6131
6132       /* The following items have no way khw could figure out how to get except
6133        * via nl_langinfo() */
6134       case YESEXPR:   retval = "^[+1yY]"; break;
6135       case YESSTR:    retval = "yes";     break;
6136       case NOEXPR:    retval = "^[-0nN]"; break;
6137       case NOSTR:     retval = "no";      break;
6138
6139 #  if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
6140
6141       case CRNCYSTR:
6142         cat_index = LC_MONETARY_INDEX_;
6143         goto use_localeconv;
6144
6145 #  else
6146
6147       case CRNCYSTR:
6148
6149         /* The locale's currency symbol may be empty.  But if not, the return
6150          * from nl_langinfo() prefixes it with a character that indicates where
6151          * in the monetary value the symbol is to be placed
6152          *  a) before, like $9.99);
6153          *  b) middle, rare, but would like be 9$99; or
6154          *  c) after,  like 9.99USD
6155          *
6156          * The POSIX Standard permits an implementation to choose whether or
6157          * not to omit the prefix character if the symbol is empty (the
6158          * placement position is meaningless if there is nothing to place).
6159          * glibc has chosen to always prefix an empty symbol by a minus (which
6160          * is the prefix for 'before' positioning).  FreeBSD has chosen to
6161          * return an empty string for an empty symbol.  Perl has always
6162          * emulated the glibc way (probably with little thought). */
6163         retval = "-";
6164         break;
6165
6166 #  endif
6167 #  if defined(USE_LOCALE_NUMERIC) && defined(HAS_LOCALECONV)
6168
6169       case THOUSEP:
6170         cat_index = LC_NUMERIC_INDEX_;
6171         goto use_localeconv;
6172
6173 #  else
6174
6175       case THOUSEP:
6176         retval = C_thousands_sep;
6177         break;
6178
6179 #  endif
6180
6181       case RADIXCHAR:
6182
6183 #  if    defined(USE_LOCALE_NUMERIC) && defined(HAS_SNPRINTF)           \
6184    && (! defined(HAS_LOCALECONV) || defined(TS_W32_BROKEN_LOCALECONV))
6185
6186         {   /* snprintf() can be used to find the radix character by outputting
6187              * a known simple floating point number to a buffer, and parsing
6188              * it, inferring the radix as the bytes separating the integer and
6189              * fractional parts.  But localeconv() is more direct, not
6190              * requiring inference, so use it instead of the code just below,
6191              * if (likely) it is available and works ok */
6192
6193             char * floatbuf = NULL;
6194             const Size_t initial_size = 10;
6195
6196             Newx(floatbuf, initial_size, char);
6197
6198             start_DEALING_WITH_MISMATCHED_CTYPE(locale);
6199             const char * orig_NUMERIC_locale = toggle_locale_c(LC_NUMERIC,
6200                                                                locale);
6201             /* 1.5 is exactly representable on binary computers */
6202             Size_t needed_size = snprintf(floatbuf, initial_size, "%.1f", 1.5);
6203
6204             /* If our guess wasn't big enough, increase and try again, based on
6205              * the real number that snprintf() is supposed to return */
6206             if (UNLIKELY(needed_size >= initial_size)) {
6207                 needed_size++;  /* insurance */
6208                 Renew(floatbuf, needed_size, char);
6209                 Size_t new_needed = snprintf(floatbuf, needed_size, "%.1f",
6210                                              1.5);
6211                 assert(new_needed <= needed_size);
6212                 needed_size = new_needed;
6213             }
6214
6215             restore_toggled_locale_c(LC_NUMERIC, orig_NUMERIC_locale);
6216             end_DEALING_WITH_MISMATCHED_CTYPE(locale);
6217
6218             char * s = floatbuf;
6219             char * e = floatbuf + needed_size;
6220
6221             /* Find the '1' */
6222             while (s < e && *s != '1') {
6223                 s++;
6224             }
6225
6226             if (LIKELY(s < e)) {
6227                 s++;
6228             }
6229
6230             /* Find the '5' */
6231             char * item_start = s;
6232             while (s < e && *s != '5') {
6233                 s++;
6234             }
6235
6236             /* Everything in between is the radix string */
6237             if (LIKELY(s < e)) {
6238                 *s = '\0';
6239                 retval = save_to_buffer(item_start, retbufp, retbuf_sizep);
6240                 retval_saved = true;
6241                 Safefree(floatbuf);
6242
6243                 if (utf8ness) {
6244                     is_utf8 = get_locale_string_utf8ness_i(retval,
6245                                                         LOCALE_UTF8NESS_UNKNOWN,
6246                                                         locale,
6247                                                         LC_NUMERIC_INDEX_);
6248                 }
6249
6250                 break;
6251             }
6252
6253             Safefree(floatbuf);
6254         }
6255
6256 #  endif  /* Trying snprintf() */
6257
6258         /* Here snprintf() was not compiled, or failed */
6259
6260 #  if ! defined(USE_LOCALE_NUMERIC) || ! defined(HAS_LOCALECONV)
6261
6262         retval = C_decimal_point;
6263         break;
6264
6265 #  else         /* snprintf() failed; drop down to use localeconv() */
6266
6267         cat_index = LC_NUMERIC_INDEX_;
6268
6269 #  endif
6270 #  if defined(HAS_LOCALECONV) && (   defined(USE_LOCALE_NUMERIC)      \
6271                                   || defined(USE_LOCALE_MONETARY))
6272
6273     /* These items are available from localeconv(). */
6274
6275    /* case RADIXCHAR:   // May drop down to here in some configurations
6276       case THOUSEP:     // Jumps to here
6277       case CRNCYSTR:    // Jumps to here */
6278       use_localeconv:
6279        {
6280
6281         /* The hash gets populated with just the field(s) related to 'item'. */
6282         HV * result_hv = my_localeconv(item);
6283
6284         SV* string;
6285         if (item != CRNCYSTR) {
6286
6287             /* These items have been populated with just one key => value */
6288             (void) hv_iterinit(result_hv);
6289             HE * entry = hv_iternext(result_hv);
6290             string = hv_iterval(result_hv, entry);
6291         }
6292         else {
6293
6294             /* But CRNCYSTR localeconv() returns a slightly different value
6295              * than the nl_langinfo() API calls for, so have to modify this one
6296              * to conform.  We need another value from localeconv() to know
6297              * what to change it to.  my_localeconv() has populated the hash
6298              * with exactly both fields.  Delete this one, leaving just the
6299              * CRNCYSTR one in the hash */
6300             SV* precedes = hv_delete(result_hv,
6301                                      P_CS_PRECEDES_LITERAL,
6302                                      STRLENs(P_CS_PRECEDES_LITERAL),
6303                                      0);
6304             if (! precedes) {
6305                 locale_panic_("my_localeconv() unexpectedly didn't return"
6306                               " a value for " P_CS_PRECEDES_LITERAL);
6307             }
6308
6309             /* The modification is to prefix the localeconv() return with a
6310              * single byte, calculated as follows: */
6311             const char * prefix = (LIKELY(SvIV(precedes) != -1))
6312                                    ? ((precedes != 0) ?  "-" : "+")
6313                                    : ".";
6314             /* (khw couldn't find any documentation that the dot is signalled
6315              * by CHAR_MAX (which we modify to -1), but cygwin uses it thusly,
6316              * and it makes sense given that CHAR_MAX indicates the value isn't
6317              * used, so it neither precedes nor succeeds) */
6318
6319             /* Now get CRNCYSTR */
6320             (void) hv_iterinit(result_hv);
6321             HE * entry = hv_iternext(result_hv);
6322             string = hv_iterval(result_hv, entry);
6323
6324             /* And perform the modification */
6325             sv_insert(string, 0, 0, prefix, 1);
6326         }
6327
6328         /* Here, 'string' contains the value we want to return */
6329         retval = save_to_buffer(SvPV_nolen(string), retbufp, retbuf_sizep);
6330         retval_saved = true;
6331
6332         if (utf8ness) {
6333             is_utf8 = get_locale_string_utf8ness_i(retval,
6334                                                    LOCALE_UTF8NESS_UNKNOWN,
6335                                                    locale,
6336                                                    cat_index);
6337         }
6338
6339         break;
6340
6341        }
6342
6343 #  endif  /* Using localeconv() for something or other */
6344 #  ifndef USE_LOCALE_CTYPE
6345
6346       case CODESET:
6347         retval = C_codeset;
6348         break;
6349
6350 #  else
6351
6352       case CODESET:
6353
6354         /* The trivial case */
6355         if (isNAME_C_OR_POSIX(locale)) {
6356             retval = C_codeset;
6357             break;
6358         }
6359
6360         /* If this happens to match our cached value */
6361         if (PL_in_utf8_CTYPE_locale && strEQ(locale, PL_ctype_name)) {
6362             retval = "UTF-8";
6363             break;
6364         }
6365
6366 #    ifdef WIN32
6367 #      ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES
6368 #        define CODE_PAGE_FORMAT  "%s"
6369 #        define CODE_PAGE_FUNCTION  nl_langinfo(CODESET)
6370 #      else
6371 #        define CODE_PAGE_FORMAT  "%d"
6372
6373          /* This Windows function retrieves the code page.  It is subject to
6374           * change, but is documented, and has been stable for many releases */
6375 #        define CODE_PAGE_FUNCTION  ___lc_codepage_func()
6376 #      endif
6377
6378         const char * orig_CTYPE_locale;
6379         orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
6380         retval = save_to_buffer(Perl_form(aTHX_ CODE_PAGE_FORMAT,
6381                                                 CODE_PAGE_FUNCTION),
6382                                 retbufp, retbuf_sizep);
6383         retval_saved = true;
6384         restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
6385
6386         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "locale='%s' cp=%s\n",
6387                                                locale, retval));
6388         break;
6389
6390 #    else   /* Below is ! Win32 */
6391
6392         /* The codeset is important, but khw did not figure out a way for it to
6393          * be retrieved on non-Windows boxes without nl_langinfo().  But even
6394          * if we can't get it directly, we can usually determine if it is a
6395          * UTF-8 locale or not.  If it is UTF-8, we (correctly) use that for
6396          * the code set. */
6397
6398 #      ifdef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
6399
6400         if (is_locale_utf8(locale)) {
6401             retval = "UTF-8";
6402             break;
6403         }
6404
6405 #      endif
6406
6407         /* Here, the code set has not been found.  The only other option khw
6408          * could think of is to see if the codeset is part of the locale name.
6409          * This is very less than ideal; often there is no code set in the
6410          * name; and at other times they even lie.
6411          *
6412          * But there is an XPG standard syntax, which many locales follow:
6413          *
6414          *    language[_territory[.codeset]][@modifier]
6415          *
6416          * So we take the part between the dot and any '@' */
6417         retval = strchr(locale, '.');
6418         if (! retval) {
6419             retval = "";  /* Alas, no dot */
6420         }
6421         else {
6422
6423             /* Don't include the dot */
6424             retval++;
6425
6426             /* And stop before any '@' */
6427             const char * modifier = strchr(retval, '@');
6428             if (modifier) {
6429                 char * code_set_name;
6430                 const Size_t name_len = modifier - retval;
6431                 Newx(code_set_name, name_len + 1, char);    /* +1 for NUL */
6432                 my_strlcpy(code_set_name, retval, name_len + 1);
6433                 SAVEFREEPV(code_set_name);
6434                 retval = code_set_name;
6435             }
6436
6437             /* The code set name is considered to be everything between the dot
6438              * and the '@' */
6439             retval = save_to_buffer(retval, retbufp, retbuf_sizep);
6440             retval_saved = true;
6441         }
6442
6443 #      ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
6444
6445         /* Here, 'retval' contains any codeset name derived from the locale
6446          * name.  That derived name may be empty or not necessarily indicative
6447          * of the real codeset.  But we can often determine if it should be
6448          * UTF-8, regardless of what the name is.  On most platforms, that
6449          * determination is definitive, and was already done.  But for this
6450          * code to be compiled, this platform is not one of them.  However,
6451          * there are typically tools available to make a very good guess, and
6452          * knowing the derived codeset name improves the quality of that guess.
6453          * The following function overrides the derived codeset name when it
6454          * guesses that it actually should be UTF-8.  It could be inlined here,
6455          * but was moved out of this switch() so as to make the switch()
6456          * control flow easier to follow */
6457         retval = S_override_codeset_if_utf8_found(aTHX_ retval, locale);
6458
6459 #      endif
6460
6461         break;
6462
6463 #    endif    /* ! WIN32 */
6464 #  endif      /* USE_LOCALE_CTYPE */
6465
6466       default:  /* Anything else that is legal is LC_TIME-related */
6467        {
6468
6469         const char * format = NULL;
6470         retval = NULL;
6471
6472 #  ifdef HAS_STRFTIME
6473
6474         bool return_format = FALSE;
6475
6476         /* Without strftime(), default compiled-in values are returned.
6477          * Otherwise, we generally compute a date as explained below.
6478          * Initialize default values for that computation */
6479         int mon = 0;
6480         int mday = 1;
6481         int hour = 6;
6482
6483 #  endif
6484
6485         /* Nested switch for LC_TIME items, plus the default: case is for
6486          * unknown items */
6487         switch (item) {
6488
6489           default:
6490
6491             /* On systems with langinfo.h, 'item' is an enum.  If we don't
6492              * handle one of those, the code needs to change to be able to do
6493              * so.  But otherwise, the parameter can be any int, and so could
6494              * be a garbage value and all we can do is to return that it is
6495              * invalid. */;
6496 #  if defined(I_LANGINFO)
6497
6498             Perl_croak_nocontext("panic: Unexpected nl_langinfo() item %ld",
6499                                  (long) item);
6500
6501 #  else
6502             assert(item < 0);   /* Make sure using perl_langinfo.h */
6503             SET_EINVAL;
6504             retval = "";
6505             break;
6506 #  endif
6507
6508             /* The case: statments in this switch are all for LC_TIME related
6509              * values.  There are four types of values returned.  One type is
6510              * "Give me the name in this locale of the 3rd month of the year"
6511              * (March in an English locale).  The second main type is "Give me
6512              * the best format string understood by strftime(), like '%c', for
6513              * formatting the date and time in this locale."  The other two
6514              * types are for ERA and ALT_DIGITS, and are explained at the case
6515              * statements for them.
6516              *
6517              * For the first type, suppose we want to find the name of the 3rd
6518              * month of the year.  We pass a date/time to strftime() that is
6519              * known to evaluate to sometime in March, along with a format that
6520              * tells strftime() to return the month's name.  We then return
6521              * that to our caller.  Similarly for the names of the days of the
6522              * week, like "Tuesday".  There are also abbreviated versions for
6523              * each of these.
6524              *
6525              * To implement the second type (returning to the caller a string
6526              * containing a format suitable for passing to strftime() ) we
6527              * guess a format, pass that to strftime, and examine its return to
6528              * see if that format is known on this platform.  If so, we return
6529              * that guess.  Otherwise we return the empty string "".  There are
6530              * no second guesses, as there don't seem to be alternatives
6531              * lurking out there.  For some formats that are supposed to be
6532              * known to all strftime()s since C89, we just assume that they are
6533              * valid, not bothering to check.  The guesses may not be the best
6534              * available for this locale on this platform, but should be good
6535              * enough, so that a native speaker would find them understandable.
6536              * */
6537
6538             /* Unimplemented by perl; for use with strftime() %E modifier */
6539           case ERA: retval = ""; break;
6540
6541 #  if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
6542
6543           case AM_STR: retval = "AM"; break;
6544           case PM_STR: retval = "PM"; break;
6545 #  else
6546           case PM_STR: hour = 18;
6547           case AM_STR:
6548             format = "%p";
6549             break;
6550 #  endif
6551 #  if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
6552
6553           case ABDAY_1: retval = "Sun"; break;
6554           case ABDAY_2: retval = "Mon"; break;
6555           case ABDAY_3: retval = "Tue"; break;
6556           case ABDAY_4: retval = "Wed"; break;
6557           case ABDAY_5: retval = "Thu"; break;
6558           case ABDAY_6: retval = "Fri"; break;
6559           case ABDAY_7: retval = "Sat"; break;
6560 #  else
6561           case ABDAY_7: mday++;
6562           case ABDAY_6: mday++;
6563           case ABDAY_5: mday++;
6564           case ABDAY_4: mday++;
6565           case ABDAY_3: mday++;
6566           case ABDAY_2: mday++;
6567           case ABDAY_1:
6568             format = "%a";
6569             break;
6570 #  endif
6571 #  if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
6572
6573           case DAY_1: retval = "Sunday";    break;
6574           case DAY_2: retval = "Monday";    break;
6575           case DAY_3: retval = "Tuesday";   break;
6576           case DAY_4: retval = "Wednesday"; break;
6577           case DAY_5: retval = "Thursday";  break;
6578           case DAY_6: retval = "Friday";    break;
6579           case DAY_7: retval = "Saturday";  break;
6580 #  else
6581           case DAY_7: mday++;
6582           case DAY_6: mday++;
6583           case DAY_5: mday++;
6584           case DAY_4: mday++;
6585           case DAY_3: mday++;
6586           case DAY_2: mday++;
6587           case DAY_1:
6588             format = "%A";
6589             break;
6590 #  endif
6591 #  if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
6592           case ABMON_1:  retval = "Jan"; break;
6593           case ABMON_2:  retval = "Feb"; break;
6594           case ABMON_3:  retval = "Mar"; break;
6595           case ABMON_4:  retval = "Apr"; break;
6596           case ABMON_5:  retval = "May"; break;
6597           case ABMON_6:  retval = "Jun"; break;
6598           case ABMON_7:  retval = "Jul"; break;
6599           case ABMON_8:  retval = "Aug"; break;
6600           case ABMON_9:  retval = "Sep"; break;
6601           case ABMON_10: retval = "Oct"; break;
6602           case ABMON_11: retval = "Nov"; break;
6603           case ABMON_12: retval = "Dec"; break;
6604 #  else
6605           case ABMON_12: mon++;
6606           case ABMON_11: mon++;
6607           case ABMON_10: mon++;
6608           case ABMON_9:  mon++;
6609           case ABMON_8:  mon++;
6610           case ABMON_7:  mon++;
6611           case ABMON_6:  mon++;
6612           case ABMON_5:  mon++;
6613           case ABMON_4:  mon++;
6614           case ABMON_3:  mon++;
6615           case ABMON_2:  mon++;
6616           case ABMON_1:
6617             format = "%b";
6618             break;
6619 #  endif
6620 #  if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
6621
6622           case MON_1:  retval = "January";  break;
6623           case MON_2:  retval = "February"; break;
6624           case MON_3:  retval = "March";    break;
6625           case MON_4:  retval = "April";    break;
6626           case MON_5:  retval = "May";      break;
6627           case MON_6:  retval = "June";     break;
6628           case MON_7:  retval = "July";     break;
6629           case MON_8:  retval = "August";   break;
6630           case MON_9:  retval = "September";break;
6631           case MON_10: retval = "October";  break;
6632           case MON_11: retval = "November"; break;
6633           case MON_12: retval = "December"; break;
6634 #  else
6635           case MON_12: mon++;
6636           case MON_11: mon++;
6637           case MON_10: mon++;
6638           case MON_9:  mon++;
6639           case MON_8:  mon++;
6640           case MON_7:  mon++;
6641           case MON_6:  mon++;
6642           case MON_5:  mon++;
6643           case MON_4:  mon++;
6644           case MON_3:  mon++;
6645           case MON_2:  mon++;
6646           case MON_1:
6647             format = "%B";
6648             break;
6649 #  endif
6650 #  ifndef HAS_STRFTIME
6651
6652           /* If no strftime() on this system, no format will be recognized, so
6653            * return empty */
6654           case D_FMT:  case T_FMT:  case D_T_FMT:
6655           case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT:
6656           case T_FMT_AMPM:
6657             retval = "";
6658             break;
6659 #  else
6660           /* These strftime formats are defined by C89, so we assume that
6661            * strftime supports them, and so are returned unconditionally; they
6662            * may not be what the locale actually says, but should give good
6663            * enough results for someone using them as formats (as opposed to
6664            * trying to parse them to figure out what the locale says).  The
6665            * other format items are actually tested to verify they work on the
6666            * platform */
6667           case D_FMT:   retval = "%x"; break;
6668           case T_FMT:   retval = "%X"; break;
6669           case D_T_FMT: retval = "%c"; break;
6670
6671           /* This format isn't in C89; test that it actually works on the
6672            * platform */
6673           case T_FMT_AMPM:
6674             format = "%r";
6675             return_format = TRUE;
6676             break;
6677
6678 #    if defined(WIN32) || ! defined(USE_LOCALE_TIME)
6679
6680           /* strftime() on Windows doesn't have the POSIX (beyond C89)
6681            * extensions that would allow it to recover these, so use the plain
6682            * non-ERA formats.  Also, when LC_TIME is constrained to the C
6683            * locale, the %E modifier is useless, so don't return it. */
6684           case ERA_D_FMT:   retval = "%x"; break;
6685           case ERA_T_FMT:   retval = "%X"; break;
6686           case ERA_D_T_FMT: retval = "%c"; break;
6687 #    else
6688           case ERA_D_FMT:
6689             format = "%Ex";
6690             return_format = TRUE;   /* Test that this works on the platform */
6691             break;
6692
6693           case ERA_T_FMT:
6694             format = "%EX";
6695             return_format = TRUE;
6696             break;
6697
6698           case ERA_D_T_FMT:
6699             format = "%Ec";
6700             return_format = TRUE;
6701             break;
6702 #    endif
6703 #  endif
6704 #  if defined(WIN32) || ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
6705
6706           case ALT_DIGITS: retval = "0"; break;
6707 #  else
6708           case ALT_DIGITS:
6709             format = "%Ow"; /* Find the alternate digit for 0 */
6710             break;
6711 #  endif
6712
6713         } /* End of inner switch() */
6714
6715         /* The inner switch() above has set 'retval' iff that is the final
6716          * answer */
6717         if (retval) {
6718             break;
6719         }
6720
6721         /* And it hasn't set 'format' iff it can't figure out a good value on
6722          * this platform. */
6723         if (! format) {
6724             retval = "";
6725             break;
6726         }
6727
6728 #  ifdef HAS_STRFTIME
6729
6730         /* Here we have figured out what to call strftime() with */
6731
6732         struct tm  mytm;
6733         const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
6734
6735         /* The year was deliberately chosen so that January 1 is on the
6736         * first day of the week.  Since we're only getting one thing at a
6737         * time, it all works */
6738         ints_to_tm(&mytm, 30, 30, hour, mday, mon, 2011, 0, 0, 0);
6739         char * temp;
6740         if (utf8ness) {
6741             temp = strftime8(format,
6742                              &mytm,
6743                              UTF8NESS_IMMATERIAL, /* All possible formats
6744                                                      specified above are
6745                                                      entirely ASCII */
6746                              &is_utf8,
6747                              false      /* not calling from sv_strftime */
6748                             );
6749         }
6750         else {
6751             temp = strftime_tm(format, &mytm);
6752         }
6753
6754         restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
6755
6756         /* If the item is 'ALT_DIGITS', '*retbuf' contains the alternate
6757         * format for wday 0.  If the value is the same as the normal 0,
6758         * there isn't an alternate, so clear the buffer.
6759         *
6760         * (wday was chosen because its range is all a single digit.
6761         * Things like tm_sec have two digits as the minimum: '00'.) */
6762         if (item == ALT_DIGITS && strEQ(*retbufp, "0")) {
6763             retval = "";
6764             Safefree(temp);
6765             break;
6766         }
6767
6768         /* ALT_DIGITS is problematic.  Experiments on it showed that
6769         * strftime() did not always work properly when going from alt-9 to
6770         * alt-10.  Only a few locales have this item defined, and in all
6771         * of them on Linux that khw was able to find, nl_langinfo() merely
6772         * returned the alt-0 character, possibly doubled.  Most Unicode
6773         * digits are in blocks of 10 consecutive code points, so that is
6774         * sufficient information for such scripts, as we can infer alt-1,
6775         * alt-2, ....  But for a Japanese locale, a CJK ideographic 0 is
6776         * returned, and the CJK digits are not in code point order, so you
6777         * can't really infer anything.  The localedef for this locale did
6778         * specify the succeeding digits, so that strftime() works properly
6779         * on them, without needing to infer anything.  But the
6780         * nl_langinfo() return did not give sufficient information for the
6781         * caller to understand what's going on.  So until there is
6782         * evidence that it should work differently, this returns the alt-0
6783         * string for ALT_DIGITS. */
6784
6785             /* If to return what strftime() returns, are done */
6786             if (! return_format) {
6787                 retval = save_to_buffer(temp, retbufp, retbuf_sizep);
6788                 retval_saved = true;
6789                 Safefree(temp);
6790                 break;
6791             }
6792
6793             /* Here are to return the format, not the value.  This is used when
6794              * we are testing if the format we expect to return is legal on
6795              * this platform.  We have passed the format, say "%r, to
6796              * strftime(), and now have in 'retval' what strftime processed it
6797              * to be.  But the caller doesnt't want that; it wants the actual
6798              * %r, if it is understood on this platform, and "" if it isn't.
6799              * Some strftime()s return "" for an unknown format.  (None of the
6800              * formats exposed by langinfo can have "" be a legal result.)
6801              * Other strftime()s return the format unchanged if not understood.
6802              * So if we pass "%r" to strftime(), and it's illegal, we will get
6803              * back either "" or "%r", and we return "" to our caller.  If the
6804              * strftime() return is anything else, we conclude that "%r" is
6805              * understood by the platform, and return "%r". */
6806             if (*temp == '\0' || strEQ(temp, format)) {
6807                 retval = "";
6808             }
6809             else {
6810                 retval = format;
6811             }
6812
6813             /* A format is always in ASCII */
6814             is_utf8 = UTF8NESS_IMMATERIAL;
6815
6816             Safefree(temp);
6817             break;
6818 #  endif
6819
6820        }    /* End of braced group for outer switch 'default:' case */
6821     } /* Giant switch() of nl_langinfo() items */
6822
6823     GCC_DIAG_RESTORE_STMT;
6824
6825     if (utf8ness) {
6826         *utf8ness = is_utf8;
6827     }
6828
6829     if (! retval_saved) {
6830         retval = save_to_buffer(retval, retbufp, retbuf_sizep);
6831     }
6832
6833     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6834                          "Leaving emulate_langinfo item=%ld, using locale %s\n",
6835                          (long) item, locale));
6836     return retval;
6837 }
6838
6839 #endif      /* Needs emulate_langinfo() */
6840 #ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
6841
6842 STATIC const char *
6843 S_override_codeset_if_utf8_found(pTHX_ const char * codeset,
6844                                        const char * locale)
6845 {
6846 #  define NAME_INDICATES_UTF8       0x1
6847 #  define MB_CUR_MAX_SUGGESTS_UTF8  0x2
6848
6849     /* Override 'codeset' with UTF-8 if this routine guesses that it should be.
6850      * Conversely (but rarely), "UTF-8" in the locale name might be wrong.  We
6851      * return "" as the code set name if we find that to be the case.  */
6852
6853     unsigned int lean_towards_being_utf8 = 0;
6854     if (is_codeset_name_UTF8(codeset)) {
6855         lean_towards_being_utf8 |= NAME_INDICATES_UTF8;
6856     }
6857
6858     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
6859
6860     /* For this portion of the file to compile, some C99 functions aren't
6861      * available to us, even though we now require C99.  So, something must be
6862      * wrong with them.  The code here should be good enough to work around
6863      * this issue, but should the need arise, comments in S_is_locale_utf8()
6864      * list some alternative C99 functions that could be tried.
6865      *
6866      * But MB_CUR_MAX is a C89 construct that helps a lot, is simple for a
6867      * vendor to implement, and our experience with it is that it works well on
6868      * a variety of platforms.  We have found that it returns a too-large
6869      * number on some platforms for the C locale, but for no others.  That
6870      * locale was already ruled out in the code that called this function.  (If
6871      * MB_CUR_MAX returned too small a number, that would break a lot of
6872      * things, and likely would be quickly corrected by the vendor.)  khw has
6873      * some confidence that it doesn't return >1 when 1 is meant, as that would
6874      * trigger a Perl warning, and we've had no reports of invalid occurrences
6875      * of such. */
6876 #  ifdef MB_CUR_MAX
6877
6878     /* If there are fewer bytes available in this locale than are required to
6879      * represent the largest legal UTF-8 code point, this definitely isn't a
6880      * UTF-8 locale, even if the locale name says it is. */
6881     const int mb_cur_max = MB_CUR_MAX;
6882     if (mb_cur_max < (int) UNISKIP(PERL_UNICODE_MAX)) {
6883         if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
6884             return "";    /* The name is wrong; override */
6885         }
6886
6887         restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
6888         return codeset;
6889     }
6890
6891     /* But if the locale could be UTF-8, and also the name corroborates this,
6892      * assume it is so */
6893     if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
6894         restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
6895         return codeset;
6896     }
6897
6898     /* Here, the name doesn't indicate UTF-8, but MB_CUR_MAX indicates it could
6899      * be.  khw knows of only two other locales in the world, EUC-TW and GB
6900      * 18030, that legitimately require this many bytes (4).  So, if the name
6901      * is one of those, MB_CUR_MAX has corroborated that. */
6902     bool name_implies_non_utf8 = false;
6903     if (foldEQ(codeset, "GB", 2)) {
6904         const char * s = codeset + 2;
6905         if (*s == '-' || *s == '_') {
6906             s++;
6907         }
6908
6909         if strEQ(s, "18030") {
6910             name_implies_non_utf8 = true;
6911         }
6912     }
6913     else if (foldEQ(codeset, "EUC", 3)) {
6914         const char * s = codeset + 3;
6915         if (*s == '-' || *s == '_') {
6916             s++;
6917         }
6918
6919         if (foldEQ(s, "TW", 2)) {
6920             name_implies_non_utf8 = true;
6921         }
6922     }
6923
6924     /* Otherwise, the locale is likely UTF-8 */
6925     if (! name_implies_non_utf8) {
6926         lean_towards_being_utf8 |= MB_CUR_MAX_SUGGESTS_UTF8;
6927     }
6928
6929     /* (In both those two other multibyte locales, the single byte characters
6930      * are the same as ASCII.  No multi-byte character in EUC-TW is legal UTF-8
6931      * (since the first byte of each is a continuation).  GB 18030 has no three
6932      * byte sequences, and none of the four byte ones is legal UTF-8 (as the
6933      * second byte for these is a non-continuation).  But every legal UTF-8 two
6934      * byte sequence is also legal in GB 18030, though none have the same
6935      * meaning, and no Han code point expressed in UTF-8 is two byte.  So the
6936      * further tests below which look for native expressions of currency and
6937      * time will not return two byte sequences, hence they will reliably rule
6938      * out such a locale as being UTF-8, even if the code set name checked
6939      * above isn't correct.) */
6940
6941 #  endif    /* has MB_CUR_MAX */
6942
6943     /* Here, MB_CUR_MAX is not available, or was inconclusive.  What we do is
6944      * to look at various strings associated with the locale:
6945      *  1)  If any are illegal UTF-8, the locale can't be UTF-8.
6946      *  2)  If all are legal UTF-8, and some non-ASCII characters are present,
6947      *      it is likely to be UTF-8, because of the strictness of UTF-8
6948      *      syntax. So assume it is UTF-8
6949      *  3)  If all are ASCII and the locale name and/or MB_CUR_MAX indicate
6950      *      UTF-8, assume the locale is UTF-8.
6951      *  4)  Otherwise, assume the locale isn't UTF-8
6952      *
6953      * To save cycles, if the locale name indicates it is a UTF-8 locale, we
6954      * stop looking at the first instance with legal non-ASCII UTF-8.  It is
6955      * very unlikely this combination is coincidental. */
6956
6957     utf8ness_t strings_utf8ness = UTF8NESS_UNKNOWN;
6958     char * scratch_buf = NULL;
6959     Size_t scratch_buf_size = 0;
6960
6961     /* List of strings to look at */
6962     const int trials[] = {
6963
6964 #  if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
6965
6966         /* The first string tried is the locale currency name.  Often that will
6967          * be in the native script.
6968          *
6969          * But this is usable only if localeconv() is available, as that's the
6970          * way we find out the currency symbol. */
6971
6972         CRNCYSTR,
6973
6974 #  endif
6975 #  ifdef USE_LOCALE_TIME
6976
6977     /* We can also try various strings associated with LC_TIME, like the names
6978      * of months or days of the week */
6979
6980         DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7,
6981         MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8,
6982                                     MON_9, MON_10, MON_11, MON_12,
6983         ALT_DIGITS, AM_STR, PM_STR,
6984         ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6, ABDAY_7,
6985         ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6,
6986         ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12
6987
6988 #  endif
6989
6990     };
6991
6992 #  ifdef USE_LOCALE_TIME
6993
6994     /* The code in the recursive call below can handle switching the locales,
6995      * but by doing it now here, that code will check and discover that there
6996      * is no need to switch then restore, avoiding those each loop iteration */
6997     const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
6998
6999 #  endif
7000
7001     /* The trials array may consist of strings from two different locale
7002      * categories.  The call to my_langinfo_i() below needs to pass the proper
7003      * category for each string.  There is a max of 1 trial for LC_MONETARY;
7004      * the rest are LC_TIME.  So the array is arranged so the LC_MONETARY item
7005      * (if any) is first, and all subsequent iterations will use LC_TIME.
7006      * These #ifdefs set up the values for all possible combinations. */
7007 #  if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
7008
7009     locale_category_index  cat_index = LC_MONETARY_INDEX_;
7010
7011 #    ifdef USE_LOCALE_TIME
7012
7013     const locale_category_index  follow_on_cat_index = LC_TIME_INDEX_;
7014     assert(trials[1] == DAY_1); /* Make sure only a single non-time entry */
7015
7016 #    else
7017
7018     /* Effectively out-of-bounds, as there is only the monetary entry */
7019     const locale_category_index  follow_on_cat_index = LC_ALL_INDEX_;
7020
7021 #    endif
7022 #  elif defined(USE_LOCALE_TIME)
7023
7024     locale_category_index  cat_index = LC_TIME_INDEX_;
7025     const locale_category_index  follow_on_cat_index = LC_TIME_INDEX_;
7026
7027 #  else
7028
7029     /* Effectively out-of-bounds, as here there are no trial entries at all.
7030      * This allows this code to compile, but there are no strings to test, and
7031      * so the answer will always be non-UTF-8. */
7032     locale_category_index  cat_index = LC_ALL_INDEX_;
7033     const locale_category_index  follow_on_cat_index = LC_ALL_INDEX_;
7034
7035 #  endif
7036
7037     /* Everything set up; look through all the strings */
7038     for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(trials); i++) {
7039         (void) my_langinfo_i(trials[i], cat_index, locale,
7040                              &scratch_buf, &scratch_buf_size, NULL);
7041         cat_index = follow_on_cat_index;
7042
7043         /* To prevent infinite recursive calls, we don't ask for the UTF-8ness
7044          * of the string (in 'trials[i]') above.  Instead we examine the
7045          * returned string here */
7046         const Size_t len = strlen(scratch_buf);
7047         const U8 * first_variant;
7048
7049         /* If the string is identical whether or not it is encoded as UTF-8, it
7050          * isn't helpful in determining UTF8ness. */
7051         if (is_utf8_invariant_string_loc((U8 *) scratch_buf, len,
7052                                          &first_variant))
7053         {
7054             continue;
7055         }
7056
7057         /* Here, has non-ASCII.  If not legal UTF-8, isn't a UTF-8 locale */
7058         if (! is_utf8_string(first_variant,
7059                              len - (first_variant - (U8 *) scratch_buf)))
7060         {
7061             strings_utf8ness = UTF8NESS_NO;
7062             break;
7063         }
7064
7065         /* Here, is a legal non-ASCII UTF-8 string; tentatively set the return
7066          * to YES; possibly overridden by later iterations */
7067         strings_utf8ness = UTF8NESS_YES;
7068
7069         /* But if this corroborates our expectation, quit now */
7070         if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
7071             break;
7072         }
7073     }
7074
7075 #  ifdef USE_LOCALE_TIME
7076
7077     restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
7078
7079 #  endif
7080
7081     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
7082
7083     Safefree(scratch_buf);
7084     scratch_buf = NULL;
7085
7086     if (strings_utf8ness == UTF8NESS_NO) {
7087         return codeset;     /* No override */
7088     }
7089
7090     /* Here all tested strings are legal UTF-8.
7091      *
7092      * Above we set UTF8NESS_YES if any string wasn't ASCII.  But even if they
7093      * are all ascii, and the locale name indicates it is a UTF-8 locale,
7094      * assume the locale is UTF-8. */
7095     if (lean_towards_being_utf8) {
7096         strings_utf8ness = UTF8NESS_YES;
7097     }
7098
7099     if (strings_utf8ness == UTF8NESS_YES) {
7100         return "UTF-8";
7101     }
7102
7103     /* Here, nothing examined indicates that the codeset is or isn't UTF-8.
7104      * But what is it?  The other locale categories are not likely to be of
7105      * further help:
7106      *
7107      * LC_NUMERIC   Only a few locales in the world have a non-ASCII radix or
7108      *              group separator.
7109      * LC_CTYPE     This code wouldn't be compiled if mbtowc() existed and was
7110      *              reliable.  This is unlikely in C99.  There are other
7111      *              functions that could be used instead, but are they going to
7112      *              exist, and be able to distinguish between UTF-8 and 8859-1?
7113      *              Deal with this only if it becomes necessary.
7114      * LC_MESSAGES  The strings returned from strerror() would seem likely
7115      *              candidates, but experience has shown that many systems
7116      *              don't actually have translations installed for them.  They
7117      *              are instead always in English, so everything in them is
7118      *              ASCII, which is of no help to us.  A Configure probe could
7119      *              possibly be written to see if this platform has non-ASCII
7120      *              error messages.  But again, wait until it turns out to be
7121      *              an actual problem.
7122      *
7123      *              Things like YESSTR, NOSTR, might not be in ASCII, but need
7124      *              nl_langinfo() to access, which we don't have.
7125      */
7126
7127     /* Otherwise, assume the locale isn't UTF-8.  This can be wrong if we don't
7128      * have MB_CUR_MAX, and the locale is English without UTF-8 in its name,
7129      * and with a dollar currency symbol. */
7130     return codeset;     /* No override */
7131 }
7132
7133 #  endif /* ! HAS_DEFINITIVE_UTF8NESS_DETERMINATION */
7134
7135 /*
7136 =for apidoc_section $time
7137 =for apidoc      sv_strftime_tm
7138 =for apidoc_item sv_strftime_ints
7139 =for apidoc_item my_strftime
7140
7141 These implement the libc strftime(), but with a different API so that the return
7142 value is a pointer to the formatted result (which MUST be arranged to be FREED
7143 BY THE CALLER).  This allows these functions to increase the buffer size as
7144 needed, so that the caller doesn't have to worry about that.
7145
7146 On failure, they return NULL, and set C<errno> to C<EINVAL>.
7147
7148 C<sv_strftime_tm> and C<sv_strftime_ints> are preferred, as they transparently
7149 handle the UTF-8ness of the current locale, the input C<fmt>, and the returned
7150 result.  Only if the current C<LC_TIME> locale is a UTF-8 one (and S<C<use
7151 bytes>> is not in effect) will the result be marked as UTF-8.  These differ
7152 only in the form of their inputs.  C<sv_strftime_tm> takes a filled-in
7153 S<C<struct tm>> parameter.  C<sv_strftime_ints> takes a bunch of integer
7154 parameters that together completely define a given time.
7155
7156 C<my_strftime> is kept for backwards compatibility.  Knowing if its result
7157 should be considered UTF-8 or not requires significant extra logic.
7158
7159 Note that C<yday> and C<wday> effectively are ignored by C<sv_strftime_ints>
7160 and C<my_strftime>, as mini_mktime() overwrites them
7161
7162 Also note that all three functions are always executed in the underlying
7163 C<LC_TIME> locale of the program, giving results based on that locale.
7164
7165 =cut
7166  */
7167
7168 char *
7169 Perl_my_strftime(pTHX_ const char *fmt, int sec, int min, int hour,
7170                        int mday, int mon, int year, int wday, int yday,
7171                        int isdst)
7172 {   /* Documented above */
7173     PERL_ARGS_ASSERT_MY_STRFTIME;
7174
7175     struct tm  mytm;
7176     ints_to_tm(&mytm, sec, min, hour, mday, mon, year, wday, yday, isdst);
7177     char * ret = strftime_tm(fmt, &mytm);
7178     return ret;
7179 }
7180
7181 SV *
7182 Perl_sv_strftime_tm(pTHX_ SV * fmt, const struct tm * mytm)
7183 {   /* Documented above */
7184     PERL_ARGS_ASSERT_SV_STRFTIME_TM;
7185
7186     utf8ness_t fmt_utf8ness = (SvUTF8(fmt) && LIKELY(! IN_BYTES))
7187                               ? UTF8NESS_YES
7188                               : UTF8NESS_UNKNOWN;
7189
7190     utf8ness_t result_utf8ness;
7191     char * retval = strftime8(SvPV_nolen(fmt),
7192                               mytm,
7193                               fmt_utf8ness,
7194                               &result_utf8ness,
7195                               true  /* calling from sv_strftime */
7196                              );
7197     SV * sv = NULL;
7198     if (retval) {
7199         sv = newSV_type(SVt_PV);
7200         sv_usepvn_flags(sv, retval, strlen(retval), SV_HAS_TRAILING_NUL);
7201
7202         if (result_utf8ness == UTF8NESS_YES) {
7203             SvUTF8_on(sv);
7204         }
7205     }
7206
7207     return sv;
7208 }
7209
7210 SV *
7211 Perl_sv_strftime_ints(pTHX_ SV * fmt, int sec, int min, int hour,
7212                             int mday, int mon, int year, int wday,
7213                             int yday, int isdst)
7214 {   /* Documented above */
7215     PERL_ARGS_ASSERT_SV_STRFTIME_INTS;
7216
7217     struct tm  mytm;
7218     ints_to_tm(&mytm, sec, min, hour, mday, mon, year, wday, yday, isdst);
7219     SV * ret = sv_strftime_tm(fmt, &mytm);
7220     return ret;
7221 }
7222
7223 STATIC void
7224 S_ints_to_tm(pTHX_ struct tm * mytm,
7225                    int sec, int min, int hour, int mday, int mon, int year,
7226                    int wday, int yday, int isdst)
7227 {
7228     /* Create a struct tm structure from the input time-related integer
7229      * variables */
7230
7231     /* Override with the passed-in values */
7232     Zero(mytm, 1, struct tm);
7233     mytm->tm_sec = sec;
7234     mytm->tm_min = min;
7235     mytm->tm_hour = hour;
7236     mytm->tm_mday = mday;
7237     mytm->tm_mon = mon;
7238     mytm->tm_year = year;
7239     mytm->tm_wday = wday;
7240     mytm->tm_yday = yday;
7241     mytm->tm_isdst = isdst;
7242     mini_mktime(mytm);
7243
7244     /* use libc to get the values for tm_gmtoff and tm_zone on platforms that
7245      * have them [perl #18238] */
7246 #if  defined(HAS_MKTIME)                                      \
7247  && (defined(HAS_TM_TM_GMTOFF) || defined(HAS_TM_TM_ZONE))
7248     struct tm mytm2 = *mytm;
7249     MKTIME_LOCK;
7250     mktime(&mytm2);
7251     MKTIME_UNLOCK;
7252 #  ifdef HAS_TM_TM_GMTOFF
7253     mytm->tm_gmtoff = mytm2.tm_gmtoff;
7254 #  endif
7255 #  ifdef HAS_TM_TM_ZONE
7256     mytm->tm_zone = mytm2.tm_zone;
7257 #  endif
7258 #endif
7259
7260     return;
7261 }
7262
7263 STATIC char *
7264 S_strftime_tm(pTHX_ const char *fmt, const struct tm *mytm)
7265 {
7266     PERL_ARGS_ASSERT_STRFTIME_TM;
7267
7268     /* Execute strftime() based on the input struct tm */
7269
7270     /* An empty format yields an empty result */
7271     const int fmtlen = strlen(fmt);
7272     if (fmtlen == 0) {
7273         char *ret;
7274         Newxz (ret, 1, char);
7275         return ret;
7276     }
7277
7278 #ifndef HAS_STRFTIME
7279     Perl_croak(aTHX_ "panic: no strftime");
7280 #else
7281 #  if defined(WE_MUST_DEAL_WITH_MISMATCHED_CTYPE) && defined(USE_LOCALE_TIME)
7282
7283     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE,
7284                                                      querylocale_c(LC_TIME));
7285 #  endif
7286
7287     /* Guess an initial size for the returned string based on an expansion
7288      * factor of the input format, but with a minimum that should handle most
7289      * common cases.  If this guess is too small, we will try again with a
7290      * larger one */
7291     int bufsize = MAX(fmtlen * 2, 64);
7292
7293     char *buf = NULL;   /* Makes Renew() act as Newx() on the first iteration */
7294     do {
7295         Renew(buf, bufsize, char);
7296
7297         /* allowing user-supplied (rather than literal) formats is normally
7298          * frowned upon as a potential security risk; but this is part of the
7299          * API so we have to allow it (and the available formats have a much
7300          * lower chance of doing something bad than the ones for printf etc. */
7301         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
7302
7303 #ifdef WIN32    /* Windows will tell you if the input is invalid */
7304
7305         /* Needed because the LOCK might (or might not) save/restore errno */
7306         bool strftime_failed = false;
7307
7308         STRFTIME_LOCK;
7309         dSAVE_ERRNO;
7310         errno = 0;
7311
7312         int len = strftime(buf, bufsize, fmt, mytm);
7313         if (errno == EINVAL) {
7314             strftime_failed = true;
7315         }
7316
7317         RESTORE_ERRNO;
7318         STRFTIME_UNLOCK;
7319
7320         if (strftime_failed) {
7321             goto strftime_failed;
7322         }
7323
7324 #else
7325         STRFTIME_LOCK;
7326         int len = strftime(buf, bufsize, fmt, mytm);
7327         STRFTIME_UNLOCK;
7328 #endif
7329
7330         GCC_DIAG_RESTORE_STMT;
7331
7332         /* A non-zero return indicates success.  But to make sure we're not
7333          * dealing with some rogue strftime that returns how much space it
7334          * needs instead of 0 when there isn't enough, check that the return
7335          * indicates we have at least one byte of spare space (which will be
7336          * used for the terminating NUL). */
7337         if (inRANGE(len, 1, bufsize - 1)) {
7338             goto strftime_return;
7339         }
7340
7341         /* There are several possible reasons for a 0 return code for a
7342          * non-empty format, and they are not trivial to tease apart.  This
7343          * issue is a known bug in the strftime() API.  What we do to cope is
7344          * to assume that the reason is not enough space in the buffer, so
7345          * increase it and try again. */
7346         bufsize *= 2;
7347
7348         /* But don't just keep increasing the size indefinitely.  Stop when it
7349          * becomes obvious that the reason for failure is something besides not
7350          * enough space.  The most likely largest expanding format is %c.  On
7351          * khw's Linux box, the maximum result of this is 67 characters, in the
7352          * km_KH locale.  If a new script comes along that uses 4 UTF-8 bytes
7353          * per character, and with a similar expansion factor, that would be a
7354          * 268:2 byte ratio, or a bit more than 128:1 = 2**7:1.  Some strftime
7355          * implementations allow you to say %1000c to pad to 1000 bytes.  This
7356          * shows that it is impossible to implement this without a heuristic
7357          * (which can fail).  But it indicates we need to be generous in the
7358          * upper limit before failing.  The previous heuristic used was too
7359          * stingy.  Since the size doubles per iteration, it doesn't take many
7360          * to reach the limit */
7361     } while (bufsize < ((1 << 11) + 1) * fmtlen);
7362
7363     /* Here, strftime() returned 0, and it likely wasn't for lack of space.
7364      * There are two possible reasons:
7365      *
7366      * First is that the result is legitimately 0 length.  This can happen
7367      * when the format is precisely "%p".  That is the only documented format
7368      * that can have an empty result. */
7369     if (strEQ(fmt, "%p")) {
7370         Renew(buf, 1, char);
7371         *buf = '\0';
7372         goto strftime_return;
7373     }
7374
7375     /* The other reason is that the format string is malformed.  Probably it is
7376      * that the string is syntactically invalid for the locale.  On some
7377      * platforms an invalid conversion specifier '%?' (for all illegal '?') is
7378      * treated as a literal, but others may fail when '?' is illegal */
7379
7380 #  ifdef WIN32
7381   strftime_failed:
7382 #  endif
7383
7384     SET_EINVAL;
7385
7386     Safefree(buf);
7387     buf = NULL;
7388
7389   strftime_return:
7390
7391 #  if defined(WE_MUST_DEAL_WITH_MISMATCHED_CTYPE) && defined(USE_LOCALE_TIME)
7392
7393     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
7394
7395 #  endif
7396
7397     return buf;
7398
7399 #endif
7400
7401 }
7402
7403 STATIC char *
7404 S_strftime8(pTHX_ const char * fmt,
7405                   const struct tm * mytm,
7406                   const utf8ness_t fmt_utf8ness,
7407                   utf8ness_t * result_utf8ness,
7408                   const bool came_from_sv)
7409 {
7410     PERL_ARGS_ASSERT_STRFTIME8;
7411
7412     /* Wrap strftime_tm, taking into account the input and output UTF-8ness */
7413
7414 #ifdef USE_LOCALE_TIME
7415 #  define INDEX_TO_USE  LC_TIME_INDEX_
7416
7417     const char * locale = querylocale_c(LC_TIME);
7418     locale_utf8ness_t locale_utf8ness = LOCALE_UTF8NESS_UNKNOWN;
7419
7420 #else
7421 #  define INDEX_TO_USE  LC_ALL_INDEX_   /* Effectively out of bounds */
7422
7423     const char * locale = "C";
7424     locale_utf8ness_t locale_utf8ness = LOCALE_NOT_UTF8;
7425
7426 #endif
7427
7428     switch (fmt_utf8ness) {
7429       case UTF8NESS_IMMATERIAL:
7430         break;
7431
7432       case UTF8NESS_NO: /* Known not to be UTF-8; must not be UTF-8 locale */
7433         if (is_locale_utf8(locale)) {
7434             SET_EINVAL;
7435             return NULL;
7436         }
7437
7438         locale_utf8ness = LOCALE_NOT_UTF8;
7439         break;
7440
7441       case UTF8NESS_YES:    /* Known to be UTF-8; must be UTF-8 locale if can't
7442                                downgrade. */
7443         if (! is_locale_utf8(locale)) {
7444             locale_utf8ness = LOCALE_NOT_UTF8;
7445
7446             bool is_utf8 = true;
7447             Size_t fmt_len = strlen(fmt);
7448             fmt = (char *) bytes_from_utf8((U8 *) fmt, &fmt_len, &is_utf8);
7449             if (is_utf8) {
7450                 SET_EINVAL;
7451                 return NULL;
7452             }
7453
7454             SAVEFREEPV(fmt);
7455         }
7456         else {
7457             locale_utf8ness = LOCALE_IS_UTF8;
7458         }
7459
7460         break;
7461
7462       case UTF8NESS_UNKNOWN:
7463         if (! is_locale_utf8(locale)) {
7464             locale_utf8ness = LOCALE_NOT_UTF8;
7465         }
7466         else {
7467             locale_utf8ness = LOCALE_IS_UTF8;
7468             if (came_from_sv) {
7469
7470                 /* Upgrade 'fmt' to UTF-8 for a UTF-8 locale.  Otherwise the
7471                  * locale would find any UTF-8 variant characters to be
7472                  * malformed */
7473                 Size_t fmt_len = strlen(fmt);
7474                 fmt = (char *) bytes_to_utf8((U8 *) fmt, &fmt_len);
7475                 SAVEFREEPV(fmt);
7476             }
7477         }
7478
7479         break;
7480     }
7481
7482     char * retval = strftime_tm(fmt, mytm);
7483     *result_utf8ness = get_locale_string_utf8ness_i(retval,
7484                                                     locale_utf8ness,
7485                                                     locale,
7486                                                     INDEX_TO_USE);
7487     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
7488                           "fmt=%s, retval=%s; utf8ness=%d",
7489                           fmt,
7490                           ((is_utf8_string((U8 *) retval, 0))
7491                            ? retval
7492                            :_byte_dump_string((U8 *) retval, strlen(retval),0)),
7493                           *result_utf8ness));
7494     return retval;
7495
7496 #undef INDEX_TO_USE
7497
7498 }
7499
7500 #ifdef USE_LOCALE
7501
7502 STATIC void
7503 S_give_perl_locale_control(pTHX_
7504 #  ifdef LC_ALL
7505                            const char * lc_all_string,
7506 #  else
7507                            const char ** locales,
7508 #  endif
7509                            const line_t caller_line)
7510 {
7511     PERL_UNUSED_ARG(caller_line);
7512
7513     /* This is called when the program is in the global locale and are
7514      * switching to per-thread (if available).  And it is called at
7515      * initialization time to do the same.
7516      */
7517
7518 #  if defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE)
7519
7520     /* On Windows, convert to per-thread behavior.  This isn't necessary in
7521      * POSIX 2008, as the conversion gets done automatically in the
7522      * void_setlocale_i() calls below. */
7523     if (_configthreadlocale(_ENABLE_PER_THREAD_LOCALE) == -1) {
7524         locale_panic_("_configthreadlocale returned an error");
7525     }
7526
7527 #  endif
7528 #  if ! defined(USE_THREAD_SAFE_LOCALE)                               \
7529    && ! defined(USE_POSIX_2008_LOCALE)
7530 #    if defined(LC_ALL)
7531     PERL_UNUSED_ARG(lc_all_string);
7532 #    else
7533     PERL_UNUSED_ARG(locales);
7534 #    endif
7535 #  else
7536
7537     /* This platform has per-thread locale handling.  Do the conversion. */
7538
7539 #    if defined(LC_ALL)
7540
7541     void_setlocale_c_with_caller(LC_ALL, lc_all_string, __FILE__, caller_line);
7542
7543 #    else
7544
7545     for_all_individual_category_indexes(i) {
7546         void_setlocale_i_with_caller(i, locales[i], __FILE__, caller_line);
7547     }
7548
7549 #    endif
7550 #  endif
7551
7552     /* Finally, update our remaining records.  'true' => force recalculation.
7553      * This is needed because we don't know what's happened while Perl hasn't
7554      * had control, so we need to figure out the current state */
7555
7556 #  if defined(LC_ALL)
7557
7558     new_LC_ALL(lc_all_string, true);
7559
7560 #    else
7561
7562     new_LC_ALL(calculate_LC_ALL_string(locales,
7563                                        INTERNAL_FORMAT,
7564                                        WANT_TEMP_PV,
7565                                        caller_line),
7566                true);
7567 #    endif
7568
7569 }
7570
7571 STATIC void
7572 S_output_check_environment_warning(pTHX_ const char * const language,
7573                                          const char * const lc_all,
7574                                          const char * const lang)
7575 {
7576     PerlIO_printf(Perl_error_log,
7577                   "perl: warning: Please check that your locale settings:\n");
7578
7579 #  ifdef __GLIBC__
7580
7581     PerlIO_printf(Perl_error_log, "\tLANGUAGE = %c%s%c,\n",
7582                                   language ? '"' : '(',
7583                                   language ? language : "unset",
7584                                   language ? '"' : ')');
7585 #  else
7586     PERL_UNUSED_ARG(language);
7587 #  endif
7588
7589     PerlIO_printf(Perl_error_log, "\tLC_ALL = %c%s%c,\n",
7590                                   lc_all ? '"' : '(',
7591                                   lc_all ? lc_all : "unset",
7592                                   lc_all ? '"' : ')');
7593
7594     for_all_individual_category_indexes(i) {
7595         const char * value = PerlEnv_getenv(category_names[i]);
7596         PerlIO_printf(Perl_error_log,
7597                       "\t%s = %c%s%c,\n",
7598                       category_names[i],
7599                       value ? '"' : '(',
7600                       value ? value : "unset",
7601                       value ? '"' : ')');
7602     }
7603
7604     PerlIO_printf(Perl_error_log, "\tLANG = %c%s%c\n",
7605                                   lang ? '"' : '(',
7606                                   lang ? lang : "unset",
7607                                   lang ? '"' : ')');
7608     PerlIO_printf(Perl_error_log,
7609                   "    are supported and installed on your system.\n");
7610 }
7611
7612 #endif
7613
7614 /* A helper macro for the next function.  Needed because would be called in two
7615  * places.  Knows about the internal workings of the function */
7616 #define GET_DESCRIPTION(trial, name)                                    \
7617     ((isNAME_C_OR_POSIX(name))                                          \
7618      ? "the standard locale"                                            \
7619      : ((trial == (system_default_trial)                                \
7620                   ? "the system default locale"                         \
7621                   : "a fallback locale")))
7622
7623 /*
7624  * Initialize locale awareness.
7625  */
7626 int
7627 Perl_init_i18nl10n(pTHX_ int printwarn)
7628 {
7629     /* printwarn is:
7630      *    0 if not to output warning when setup locale is bad
7631      *    1 if to output warning based on value of PERL_BADLANG
7632      *    >1 if to output regardless of PERL_BADLANG
7633      *
7634      * returns
7635      *    1 = set ok or not applicable,
7636      *    0 = fallback to a locale of lower priority
7637      *   -1 = fallback to all locales failed, not even to the C locale
7638      *
7639      * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
7640      * set, debugging information is output.
7641      *
7642      * This routine effectively does the following in most cases:
7643      *
7644      *      basic initialization;
7645      *      asserts that the compiled tables are consistent;
7646      *      initialize data structures;
7647      *      make sure we are in the global locale;
7648      *      setlocale(LC_ALL, "");
7649      *      switch to per-thread locale if applicable;
7650      *
7651      * The "" causes the locale to be set to what the environment variables at
7652      * the time say it should be.
7653      *
7654      * To handle possible failures, the setlocale is expanded to be like:
7655      *
7656      *      trial_locale = pre-first-trial;
7657      *      while (has_another_trial()) {
7658      *          trial_locale = next_trial();
7659      *          if setlocale(LC_ALL, trial_locale) {
7660      *              ok = true;
7661      *              break;
7662      *          }
7663      *
7664      *          had_failure = true;
7665      *          warn();
7666      *      }
7667      *
7668      *      if (had_failure) {
7669      *          warn_even_more();
7670      *          if (! ok) warn_still_more();
7671      *      }
7672      *
7673      * The first trial is either:
7674      *      ""      to examine the environment variables for the locale
7675      *      NULL    to use the values already set for the locale by the program
7676      *              embedding this perl instantiation.
7677      *
7678      * Something is wrong if this trial fails, but there is a sequence of
7679      * fallbacks to try should that happen.  They are given in the enum below.
7680
7681      * If there is no LC_ALL defined on the system, the setlocale() above is
7682      * replaced by a loop setting each individual category separately.
7683      *
7684      * In a non-embeded environment, this code is executed exactly once.  It
7685      * sets up the global locale environment.  At the end, if some sort of
7686      * thread-safety is in effect, it will turn thread 0 into using that, with
7687      * the same locale as the global initially.  thread 0 can then change its
7688      * locale at will without affecting the global one.
7689      *
7690      * At destruction time, thread 0 will revert to the global locale as the
7691      * other threads die.
7692      *
7693      * Care must be taken in an embedded environment.  This code will be
7694      * executed for each instantiation.  Since it changes the global locale, it
7695      * could clash with another running instantiation that isn't using
7696      * per-thread locales.  perlembed suggests having the controlling program
7697      * set each instantiation's locale and set PERL_SKIP_LOCALE_INIT so this
7698      * code uses that without actually changing anything.  Then the onus is on
7699      * the controlling program to prevent any races.  The code below does
7700      * enough locking so as to prevent system calls from overwriting data
7701      * before it is safely copied here, but that isn't a general solution.
7702      */
7703
7704 #ifndef USE_LOCALE
7705
7706     PERL_UNUSED_ARG(printwarn);
7707     const int ok = 1;
7708
7709 #else  /* USE_LOCALE to near the end of the routine */
7710
7711     int ok = 0;
7712
7713 #  ifdef __GLIBC__
7714
7715     const char * const language = PerlEnv_getenv("LANGUAGE");
7716
7717 #  else
7718     const char * const language = NULL;     /* Unused placeholder */
7719 #  endif
7720
7721     /* A later getenv() could zap this, so only use here */
7722     const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
7723
7724     const bool locwarn = (printwarn > 1
7725                           || (          printwarn
7726                               && (    ! bad_lang_use_once
7727                                   || (
7728                                          /* disallow with "" or "0" */
7729                                          *bad_lang_use_once
7730                                        && strNE("0", bad_lang_use_once)))));
7731
7732 #  ifndef DEBUGGING
7733 #    define DEBUG_LOCALE_INIT(a,b,c)
7734 #  else
7735
7736     DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
7737
7738 #    define DEBUG_LOCALE_INIT(cat_index, locale, result)                    \
7739         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",                       \
7740                     setlocale_debug_string_i(cat_index, locale, result)));
7741
7742 #    ifdef LC_ALL
7743     assert(categories[LC_ALL_INDEX_] == LC_ALL);
7744     assert(strEQ(category_names[LC_ALL_INDEX_], "LC_ALL"));
7745 #      ifdef USE_POSIX_2008_LOCALE
7746     assert(category_masks[LC_ALL_INDEX_] == LC_ALL_MASK);
7747 #      endif
7748 #    endif
7749
7750     for_all_individual_category_indexes(i) {
7751         assert(category_name_lengths[i] == strlen(category_names[i]));
7752     }
7753
7754 #  endif    /* DEBUGGING */
7755
7756     /* Initialize the per-thread mbrFOO() state variables.  See POSIX.xs for
7757      * why these particular incantations are used. */
7758 #  ifdef HAS_MBRLEN
7759     memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
7760 #  endif
7761 #  ifdef HAS_MBRTOWC
7762     memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
7763 #  endif
7764 #  ifdef HAS_WCTOMBR
7765     wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
7766 #  endif
7767 #  ifdef USE_PL_CURLOCALES
7768
7769     for (unsigned int i = 0; i <= LC_ALL_INDEX_; i++) {
7770         PL_curlocales[i] = savepv("C");
7771     }
7772
7773 #  endif
7774 #  ifdef USE_PL_CUR_LC_ALL
7775
7776     PL_cur_LC_ALL = savepv("C");
7777
7778 #  endif
7779 #  if ! defined(PERL_LC_ALL_USES_NAME_VALUE_PAIRS) && defined(LC_ALL)
7780
7781     LOCALE_LOCK;
7782
7783     /* If we haven't done so already, translate the LC_ALL positions of
7784      * categories into our internal indices. */
7785     if (map_LC_ALL_position_to_index[0] == LC_ALL_INDEX_) {
7786
7787         /* Use this array, initialized by a config.h constant */
7788         int lc_all_category_positions[] = PERL_LC_ALL_CATEGORY_POSITIONS_INIT;
7789         STATIC_ASSERT_STMT(   C_ARRAY_LENGTH(lc_all_category_positions)
7790                            == LC_ALL_INDEX_);
7791
7792         for (unsigned int i = 0;
7793              i < C_ARRAY_LENGTH(lc_all_category_positions);
7794              i++)
7795         {
7796             map_LC_ALL_position_to_index[i] =
7797                               get_category_index(lc_all_category_positions[i]);
7798         }
7799     }
7800
7801     LOCALE_UNLOCK;
7802
7803 #  endif
7804 #  ifdef USE_POSIX_2008_LOCALE
7805
7806     /* This is a global, so be sure to keep another instance from zapping it */
7807     LOCALE_LOCK;
7808     if (PL_C_locale_obj) {
7809         LOCALE_UNLOCK;
7810     }
7811     else {
7812         PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0);
7813         if (! PL_C_locale_obj) {
7814             LOCALE_UNLOCK;
7815             locale_panic_(Perl_form(aTHX_
7816                                 "Cannot create POSIX 2008 C locale object"));
7817         }
7818         LOCALE_UNLOCK;
7819
7820         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "created C object %p\n",
7821                                                PL_C_locale_obj));
7822     }
7823
7824     /* Switch to using the POSIX 2008 interface now.  This would happen below
7825      * anyway, but deferring it can lead to leaks of memory that would also get
7826      * malloc'd in the interim.  We arbitrarily switch to the C locale,
7827      * overridden below  */
7828     if (! uselocale(PL_C_locale_obj)) {
7829         locale_panic_(Perl_form(aTHX_
7830                                 "Can't uselocale(%p), LC_ALL supposed to"
7831                                 " be 'C'",
7832                                 PL_C_locale_obj));
7833     }
7834
7835 #    ifdef MULTIPLICITY
7836
7837     PL_cur_locale_obj = PL_C_locale_obj;
7838
7839 #    endif
7840 #  endif
7841
7842     /* Now initialize some data structures.  This is entirely so that
7843      * later-executed code doesn't have to concern itself with things not being
7844      * initialized.  Arbitrarily use the C locale (which we know has to exist
7845      * on the system). */
7846
7847 #  ifdef USE_LOCALE_NUMERIC
7848
7849     PL_numeric_radix_sv    = newSV(1);
7850     PL_underlying_radix_sv = newSV(1);
7851     Newxz(PL_numeric_name, 1, char);    /* Single NUL character */
7852
7853 #  endif
7854 #  ifdef USE_LOCALE_COLLATE
7855
7856     Newxz(PL_collation_name, 1, char);
7857
7858 #  endif
7859 #  ifdef USE_LOCALE_CTYPE
7860
7861     Newxz(PL_ctype_name, 1, char);
7862
7863 #  endif
7864
7865     new_LC_ALL("C", true /* Don't shortcut */);
7866
7867 /*===========================================================================*/
7868
7869     /* Now ready to override the initialization with the values that the user
7870      * wants.  This is done in the global locale as explained in the
7871      * introductory comments to this function */
7872     switch_to_global_locale();
7873
7874     const char * const lc_all     = PerlEnv_getenv("LC_ALL");
7875     const char * const lang       = PerlEnv_getenv("LANG");
7876
7877     /* We try each locale in the enum, in order, until we get one that works,
7878      * or exhaust the list.  Normally the loop is executed just once.
7879      *
7880      * Each enum value is +1 from the previous */
7881     typedef enum {
7882             dummy_trial       = -1,
7883             environment_trial =  0,     /* "" or NULL; code below assumes value
7884                                            0 is the first real trial */
7885             LC_ALL_trial,               /* ENV{LC_ALL} */
7886             LANG_trial,                 /* ENV{LANG} */
7887             system_default_trial,       /* Windows .ACP */
7888             C_trial,                    /* C locale */
7889             beyond_final_trial,
7890     } trials;
7891
7892     trials trial;
7893     unsigned int already_checked = 0;
7894     const char * checked[C_trial];
7895
7896 #  ifdef LC_ALL
7897     const char * lc_all_string;
7898 #  else
7899     const char * curlocales[LC_ALL_INDEX_];
7900 #  endif
7901
7902     /* Loop through the initial setting and all the possible fallbacks,
7903      * breaking out of the loop on success */
7904     trial = dummy_trial;
7905     while (trial != beyond_final_trial) {
7906
7907         /* Each time through compute the next trial to use based on the one in
7908          * the previous iteration and switch to the new one.  This enforces the
7909          * order in which the fallbacks are applied */
7910       next_trial:
7911         trial = (trials) ((int) trial + 1);     /* Casts are needed for g++ */
7912
7913         const char * locale = NULL;
7914
7915         /* Set up the parameters for this trial */
7916         switch (trial) {
7917           case dummy_trial:
7918             locale_panic_("Unexpectedly got 'dummy_trial");
7919             break;
7920
7921           case environment_trial:
7922             /* This is either "" to get the values from the environment, or
7923              * NULL if the calling program has initialized the values already.
7924              * */
7925             locale = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
7926                      ? NULL
7927                      : "";
7928             break;
7929
7930           case LC_ALL_trial:
7931             if (! lc_all || strEQ(lc_all, "")) {
7932                 continue;   /* No-op */
7933             }
7934
7935             locale = lc_all;
7936             break;
7937
7938           case LANG_trial:
7939             if (! lang || strEQ(lang, "")) {
7940                 continue;   /* No-op */
7941             }
7942
7943             locale = lang;
7944             break;
7945
7946           case system_default_trial:
7947
7948 #  if ! defined(WIN32) || ! defined(LC_ALL)
7949
7950             continue;   /* No-op */
7951
7952 #  else
7953             /* For Windows, we also try the system default locale before "C".
7954              * (If there exists a Windows without LC_ALL we skip this because
7955              * it gets too complicated.  For those, "C" is the next fallback
7956              * possibility). */
7957             locale = ".ACP";
7958 #  endif
7959             break;
7960
7961           case C_trial:
7962             locale = "C";
7963             break;
7964
7965           case beyond_final_trial:
7966             continue;     /* No-op, causes loop to exit */
7967         }
7968
7969         /* If the locale is a substantive name, don't try the same locale
7970          * twice. */
7971         if (locale && strNE(locale, "")) {
7972             for (unsigned int i = 0; i < already_checked; i++) {
7973                 if (strEQ(checked[i], locale)) {
7974                     goto next_trial;
7975                 }
7976             }
7977
7978             /* And, for future iterations, indicate we've tried this locale */
7979             assert(already_checked < C_ARRAY_LENGTH(checked));
7980             checked[already_checked] = savepv(locale);
7981             SAVEFREEPV(checked[already_checked]);
7982             already_checked++;
7983         }
7984
7985 #  ifdef LC_ALL
7986
7987         STDIZED_SETLOCALE_LOCK;
7988         lc_all_string = savepv(stdized_setlocale(LC_ALL, locale));
7989         STDIZED_SETLOCALE_UNLOCK;
7990
7991         DEBUG_LOCALE_INIT(LC_ALL_INDEX_, locale, lc_all_string);
7992
7993         if (LIKELY(lc_all_string)) {     /* Succeeded */
7994             ok = 1;
7995             break;
7996         }
7997
7998         if (trial == 0 && locwarn) {
7999             PerlIO_printf(Perl_error_log,
8000                                   "perl: warning: Setting locale failed.\n");
8001             output_check_environment_warning(language, lc_all, lang);
8002         }
8003
8004 #  else /* Below is ! LC_ALL */
8005
8006         bool setlocale_failure = FALSE;  /* This trial hasn't failed so far */
8007         bool dowarn = trial == 0 && locwarn;
8008
8009         for_all_individual_category_indexes(j) {
8010             STDIZED_SETLOCALE_LOCK;
8011             curlocales[j] = savepv(stdized_setlocale(categories[j], locale));
8012             STDIZED_SETLOCALE_UNLOCK;
8013
8014             DEBUG_LOCALE_INIT(j, locale, curlocales[j]);
8015
8016             if (UNLIKELY(! curlocales[j])) {
8017                 setlocale_failure = TRUE;
8018
8019                 /* If are going to warn below, continue to loop so all failures
8020                  * are included in the message */
8021                 if (! dowarn) {
8022                     break;
8023                 }
8024             }
8025         }
8026
8027         if (LIKELY(! setlocale_failure)) {  /* All succeeded */
8028             ok = 1;
8029             break;  /* Exit trial_locales loop */
8030         }
8031
8032         /* Here, this trial failed */
8033
8034         if (dowarn) {
8035             PerlIO_printf(Perl_error_log,
8036                 "perl: warning: Setting locale failed for the categories:\n");
8037
8038             for_all_individual_category_indexes(j) {
8039                 if (! curlocales[j]) {
8040                     PerlIO_printf(Perl_error_log, "\t%s\n", category_names[j]);
8041                 }
8042             }
8043
8044             output_check_environment_warning(language, lc_all, lang);
8045         }   /* end of warning on first failure */
8046
8047 #  endif /* LC_ALL */
8048
8049     }   /* end of looping through the trial locales */
8050
8051     /* If we had to do more than the first trial, it means that one failed, and
8052      * we may need to output a warning, and, if none worked, do more */
8053     if (UNLIKELY(trial != 0)) {
8054         if (locwarn) {
8055             const char * description = "a fallback locale";
8056             const char * name = NULL;;
8057
8058             /* If we didn't find a good fallback, list all we tried */
8059             if (! ok && already_checked > 0) {
8060                 PerlIO_printf(Perl_error_log, "perl: warning: Failed to fall"
8061                                               " back to ");
8062                 if (already_checked > 1) {  /* more than one was tried */
8063                     PerlIO_printf(Perl_error_log, "any of:\n");
8064                 }
8065
8066                 while (already_checked > 0) {
8067                     name = checked[--already_checked];
8068                     description = GET_DESCRIPTION(trial, name);
8069                     PerlIO_printf(Perl_error_log, "%s (\"%s\")\n",
8070                                                   description, name);
8071                 }
8072             }
8073
8074             if (ok) {
8075
8076                 /* Here, a fallback worked.  So we have saved its name, and the
8077                  * trial that succeeded is still valid */
8078 #  ifdef LC_ALL
8079                 const char * individ_locales[LC_ALL_INDEX_] = { NULL };
8080
8081                 /* Even though we know the valid string for LC_ALL that worked,
8082                  * translate it into our internal format, which is the
8083                  * name=value pairs notation.  This is easier for a human to
8084                  * decipher than the positional notation.  Some platforms
8085                  * can return "C C C C C C" for LC_ALL.  This code also
8086                  * standardizes that result into plain "C". */
8087                 switch (parse_LC_ALL_string(lc_all_string,
8088                                             (const char **) &individ_locales,
8089                                             no_override,
8090                                             false,   /* Return only [0] if
8091                                                         suffices */
8092                                             false,   /* Don't panic on error */
8093                                             __LINE__))
8094                 {
8095                   case invalid:
8096
8097                     /* Here, the parse failed, which shouldn't happen, but if
8098                      * it does, we have an easy fallback that allows us to keep
8099                      * going. */
8100                     name = lc_all_string;
8101                     break;
8102
8103                   case no_array:    /* The original is a single locale */
8104                     name = lc_all_string;
8105                     break;
8106
8107                   case only_element_0:  /* element[0] is a single locale valid
8108                                            for all categories */
8109                     SAVEFREEPV(individ_locales[0]);
8110                     name = individ_locales[0];
8111                     break;
8112
8113                   case full_array:
8114                     name = calculate_LC_ALL_string(individ_locales,
8115                                                    INTERNAL_FORMAT,
8116                                                    WANT_TEMP_PV,
8117                                                    __LINE__);
8118                     for_all_individual_category_indexes(j) {
8119                         Safefree(individ_locales[j]);
8120                     }
8121                 }
8122 #  else
8123                 name = calculate_LC_ALL_string(curlocales,
8124                                                INTERNAL_FORMAT,
8125                                                WANT_TEMP_PV,
8126                                                __LINE__);
8127 #  endif
8128                 description = GET_DESCRIPTION(trial, name);
8129             }
8130             else {
8131
8132                 /* Nothing seems to be working, yet we want to continue
8133                  * executing.  It may well be that locales are mostly
8134                  * irrelevant to this particular program, and there must be
8135                  * some locale underlying the program.  Figure it out as best
8136                  * we can, by querying the system's current locale */
8137
8138 #  ifdef LC_ALL
8139
8140                 STDIZED_SETLOCALE_LOCK;
8141                 name = stdized_setlocale(LC_ALL, NULL);
8142                 STDIZED_SETLOCALE_UNLOCK;
8143
8144                 if (UNLIKELY(! name)) {
8145                     name = "locale name not determinable";
8146                 }
8147
8148 #  else /* Below is ! LC_ALL */
8149
8150                 const char * system_locales[LC_ALL_INDEX_] = { NULL };
8151
8152                 for_all_individual_category_indexes(j) {
8153                     STDIZED_SETLOCALE_LOCK;
8154                     system_locales[j] = savepv(stdized_setlocale(categories[j],
8155                                                                  NULL));
8156                     STDIZED_SETLOCALE_UNLOCK;
8157
8158                     if (UNLIKELY(! system_locales[j])) {
8159                         system_locales[j] = "not determinable";
8160                     }
8161                 }
8162
8163                 /* We use the name=value form for the string, as that is more
8164                  * human readable than the positional notation */
8165                 name = calculate_LC_ALL_string(system_locales,
8166                                                INTERNAL_FORMAT,
8167                                                WANT_TEMP_PV,
8168                                                __LINE__);
8169                 description = "what the system says";
8170
8171                 for_all_individual_category_indexes(j) {
8172                     Safefree(system_locales[j]);
8173                 }
8174 #  endif
8175             }
8176
8177             PerlIO_printf(Perl_error_log,
8178                           "perl: warning: Falling back to %s (\"%s\").\n",
8179                           description, name);
8180
8181             /* Here, ok being true indicates that the first attempt failed, but
8182              * a fallback succeeded; false => nothing working.  Translate to
8183              * API return values. */
8184             ok = (ok) ? 0 : -1;
8185         }
8186     }
8187
8188 #  ifdef LC_ALL
8189
8190     give_perl_locale_control(lc_all_string, __LINE__);
8191     Safefree(lc_all_string);
8192
8193 #  else
8194
8195     give_perl_locale_control((const char **) &curlocales, __LINE__);
8196
8197     for_all_individual_category_indexes(j) {
8198         Safefree(curlocales[j]);
8199     }
8200
8201 #  endif
8202 #  if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
8203
8204     /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
8205      * locale is UTF-8.  give_perl_locale_control() just above has already
8206      * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
8207      * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
8208      * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
8209      * STDIN, STDOUT, STDERR, _and_ the default open discipline.  */
8210     PL_utf8locale = PL_in_utf8_CTYPE_locale;
8211
8212     /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
8213        This is an alternative to using the -C command line switch
8214        (the -C if present will override this). */
8215     {
8216          const char *p = PerlEnv_getenv("PERL_UNICODE");
8217          PL_unicode = p ? parse_unicode_opts(&p) : 0;
8218          if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
8219              PL_utf8cache = -1;
8220     }
8221
8222 #  endif
8223 #  if defined(USE_POSIX_2008_LOCALE) && defined(MULTIPLICITY)
8224     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8225                            "finished Perl_init_i18nl10n; actual obj=%p,"
8226                            " expected obj=%p, initial=%s\n",
8227                            uselocale(0), PL_cur_locale_obj,
8228                            get_LC_ALL_display()));
8229 #  endif
8230
8231     /* So won't continue to output stuff */
8232     DEBUG_INITIALIZATION_set(FALSE);
8233
8234 #endif /* USE_LOCALE */
8235
8236     return ok;
8237 }
8238
8239 #undef GET_DESCRIPTION
8240 #ifdef USE_LOCALE_COLLATE
8241
8242 STATIC void
8243 S_compute_collxfrm_coefficients(pTHX)
8244 {
8245
8246     /* A locale collation definition includes primary, secondary, tertiary,
8247      * etc. weights for each character.  To sort, the primary weights are used,
8248      * and only if they compare equal, then the secondary weights are used, and
8249      * only if they compare equal, then the tertiary, etc.
8250      *
8251      * strxfrm() works by taking the input string, say ABC, and creating an
8252      * output transformed string consisting of first the primary weights,
8253      * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the tertiary,
8254      * etc, yielding A¹B¹C¹ A²B²C² A³B³C³ ....  Some characters may not have
8255      * weights at every level.  In our example, let's say B doesn't have a
8256      * tertiary weight, and A doesn't have a secondary weight.  The constructed
8257      * string is then going to be
8258      *  A¹B¹C¹ B²C² A³C³ ....
8259      * This has the desired effect that strcmp() will look at the secondary or
8260      * tertiary weights only if the strings compare equal at all higher
8261      * priority weights.  The spaces shown here, like in
8262      *  "A¹B¹C¹ A²B²C² "
8263      * are not just for readability.  In the general case, these must actually
8264      * be bytes, which we will call here 'separator weights'; and they must be
8265      * smaller than any other weight value, but since these are C strings, only
8266      * the terminating one can be a NUL (some implementations may include a
8267      * non-NUL separator weight just before the NUL).  Implementations tend to
8268      * reserve 01 for the separator weights.  They are needed so that a shorter
8269      * string's secondary weights won't be misconstrued as primary weights of a
8270      * longer string, etc.  By making them smaller than any other weight, the
8271      * shorter string will sort first.  (Actually, if all secondary weights are
8272      * smaller than all primary ones, there is no need for a separator weight
8273      * between those two levels, etc.)
8274      *
8275      * The length of the transformed string is roughly a linear function of the
8276      * input string.  It's not exactly linear because some characters don't
8277      * have weights at all levels.  When we call strxfrm() we have to allocate
8278      * some memory to hold the transformed string.  The calculations below try
8279      * to find coefficients 'm' and 'b' for this locale so that m*x + b equals
8280      * how much space we need, given the size of the input string in 'x'.  If
8281      * we calculate too small, we increase the size as needed, and call
8282      * strxfrm() again, but it is better to get it right the first time to
8283      * avoid wasted expensive string transformations.
8284      *
8285      * We use the string below to find how long the transformation of it is.
8286      * Almost all locales are supersets of ASCII, or at least the ASCII
8287      * letters.  We use all of them, half upper half lower, because if we used
8288      * fewer, we might hit just the ones that are outliers in a particular
8289      * locale.  Most of the strings being collated will contain a preponderance
8290      * of letters, and even if they are above-ASCII, they are likely to have
8291      * the same number of weight levels as the ASCII ones.  It turns out that
8292      * digits tend to have fewer levels, and some punctuation has more, but
8293      * those are relatively sparse in text, and khw believes this gives a
8294      * reasonable result, but it could be changed if experience so dictates. */
8295     const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
8296     char * x_longer;        /* Transformed 'longer' */
8297     Size_t x_len_longer;    /* Length of 'x_longer' */
8298
8299     char * x_shorter;   /* We also transform a substring of 'longer' */
8300     Size_t x_len_shorter;
8301
8302     PL_in_utf8_COLLATE_locale = (PL_collation_standard)
8303                                 ? 0
8304                                 : is_locale_utf8(PL_collation_name);
8305     PL_strxfrm_NUL_replacement = '\0';
8306     PL_strxfrm_max_cp = 0;
8307
8308     /* mem_collxfrm_() is used get the transformation (though here we are
8309      * interested only in its length).  It is used because it has the
8310      * intelligence to handle all cases, but to work, it needs some values of
8311      * 'm' and 'b' to get it started.  For the purposes of this calculation we
8312      * use a very conservative estimate of 'm' and 'b'.  This assumes a weight
8313      * can be multiple bytes, enough to hold any UV on the platform, and there
8314      * are 5 levels, 4 weight bytes, and a trailing NUL.  */
8315     PL_collxfrm_base = 5;
8316     PL_collxfrm_mult = 5 * sizeof(UV);
8317
8318     /* Find out how long the transformation really is */
8319     x_longer = mem_collxfrm_(longer,
8320                              sizeof(longer) - 1,
8321                              &x_len_longer,
8322
8323                              /* We avoid converting to UTF-8 in the called
8324                               * function by telling it the string is in UTF-8
8325                               * if the locale is a UTF-8 one.  Since the string
8326                               * passed here is invariant under UTF-8, we can
8327                               * claim it's UTF-8 even if it isn't.  */
8328                               PL_in_utf8_COLLATE_locale);
8329     Safefree(x_longer);
8330
8331     /* Find out how long the transformation of a substring of 'longer' is.
8332      * Together the lengths of these transformations are sufficient to
8333      * calculate 'm' and 'b'.  The substring is all of 'longer' except the
8334      * first character.  This minimizes the chances of being swayed by outliers
8335      * */
8336     x_shorter = mem_collxfrm_(longer + 1,
8337                               sizeof(longer) - 2,
8338                               &x_len_shorter,
8339                               PL_in_utf8_COLLATE_locale);
8340     Safefree(x_shorter);
8341
8342     /* If the results are nonsensical for this simple test, the whole locale
8343      * definition is suspect.  Mark it so that locale collation is not active
8344      * at all for it.  XXX Should we warn? */
8345     if (   x_len_shorter == 0
8346         || x_len_longer == 0
8347         || x_len_shorter >= x_len_longer)
8348     {
8349         PL_collxfrm_mult = 0;
8350         PL_collxfrm_base = 1;
8351         DEBUG_L(PerlIO_printf(Perl_debug_log,
8352                 "Disabling locale collation for LC_COLLATE='%s';"
8353                 " length for shorter sample=%zu; longer=%zu\n",
8354                 PL_collation_name, x_len_shorter, x_len_longer));
8355     }
8356     else {
8357         SSize_t base;       /* Temporary */
8358
8359         /* We have both: m * strlen(longer)  + b = x_len_longer
8360          *               m * strlen(shorter) + b = x_len_shorter;
8361          * subtracting yields:
8362          *          m * (strlen(longer) - strlen(shorter))
8363          *                             = x_len_longer - x_len_shorter
8364          * But we have set things up so that 'shorter' is 1 byte smaller than
8365          * 'longer'.  Hence:
8366          *          m = x_len_longer - x_len_shorter
8367          *
8368          * But if something went wrong, make sure the multiplier is at least 1.
8369          */
8370         if (x_len_longer > x_len_shorter) {
8371             PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
8372         }
8373         else {
8374             PL_collxfrm_mult = 1;
8375         }
8376
8377         /*     mx + b = len
8378          * so:      b = len - mx
8379          * but in case something has gone wrong, make sure it is non-negative
8380          * */
8381         base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
8382         if (base < 0) {
8383             base = 0;
8384         }
8385
8386         /* Add 1 for the trailing NUL */
8387         PL_collxfrm_base = base + 1;
8388     }
8389
8390     DEBUG_L(PerlIO_printf(Perl_debug_log,
8391                           "?UTF-8 locale=%d; x_len_shorter=%zu, "
8392                           "x_len_longer=%zu,"
8393                           " collate multipler=%zu, collate base=%zu\n",
8394                           PL_in_utf8_COLLATE_locale,
8395                           x_len_shorter, x_len_longer,
8396                           PL_collxfrm_mult, PL_collxfrm_base));
8397 }
8398
8399 char *
8400 Perl_mem_collxfrm_(pTHX_ const char *input_string,
8401                          STRLEN len,    /* Length of 'input_string' */
8402                          STRLEN *xlen,  /* Set to length of returned string
8403                                            (not including the collation index
8404                                            prefix) */
8405                          bool utf8      /* Is the input in UTF-8? */
8406                    )
8407 {
8408     /* mem_collxfrm_() is like strxfrm() but with two important differences.
8409      * First, it handles embedded NULs. Second, it allocates a bit more memory
8410      * than needed for the transformed data itself.  The real transformed data
8411      * begins at offset COLLXFRM_HDR_LEN.  *xlen is set to the length of that,
8412      * and doesn't include the collation index size.
8413      *
8414      * It is the caller's responsibility to eventually free the memory returned
8415      * by this function.
8416      *
8417      * Please see sv_collxfrm() to see how this is used. */
8418
8419 #  define COLLXFRM_HDR_LEN    sizeof(PL_collation_ix)
8420
8421     char * s = (char *) input_string;
8422     STRLEN s_strlen = strlen(input_string);
8423     char *xbuf = NULL;
8424     STRLEN xAlloc;          /* xalloc is a reserved word in VC */
8425     STRLEN length_in_chars;
8426     bool first_time = TRUE; /* Cleared after first loop iteration */
8427
8428 #  ifdef USE_LOCALE_CTYPE
8429         const char * orig_CTYPE_locale = NULL;
8430 #  endif
8431
8432 #  if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
8433     locale_t constructed_locale = (locale_t) 0;
8434 #  endif
8435
8436     PERL_ARGS_ASSERT_MEM_COLLXFRM_;
8437
8438     /* Must be NUL-terminated */
8439     assert(*(input_string + len) == '\0');
8440
8441     if (PL_collxfrm_mult == 0) {     /* unknown or bad */
8442         if (PL_collxfrm_base != 0) { /* bad collation => skip */
8443             DEBUG_L(PerlIO_printf(Perl_debug_log,
8444                           "mem_collxfrm_: locale's collation is defective\n"));
8445             goto bad;
8446         }
8447
8448         /* (mult, base) == (0,0) means we need to calculate mult and base
8449          * before proceeding */
8450         S_compute_collxfrm_coefficients(aTHX);
8451     }
8452
8453     /* Replace any embedded NULs with the control that sorts before any others.
8454      * This will give as good as possible results on strings that don't
8455      * otherwise contain that character, but otherwise there may be
8456      * less-than-perfect results with that character and NUL.  This is
8457      * unavoidable unless we replace strxfrm with our own implementation. */
8458     if (UNLIKELY(s_strlen < len)) {   /* Only execute if there is an embedded
8459                                          NUL */
8460         char * e = s + len;
8461         char * sans_nuls;
8462         STRLEN sans_nuls_len;
8463         int try_non_controls;
8464         char this_replacement_char[] = "?\0";   /* Room for a two-byte string,
8465                                                    making sure 2nd byte is NUL.
8466                                                  */
8467         STRLEN this_replacement_len;
8468
8469         /* If we don't know what non-NUL control character sorts lowest for
8470          * this locale, find it */
8471         if (PL_strxfrm_NUL_replacement == '\0') {
8472             int j;
8473             char * cur_min_x = NULL;    /* The min_char's xfrm, (except it also
8474                                            includes the collation index
8475                                            prefixed. */
8476
8477             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
8478
8479             /* Unlikely, but it may be that no control will work to replace
8480              * NUL, in which case we instead look for any character.  Controls
8481              * are preferred because collation order is, in general, context
8482              * sensitive, with adjoining characters affecting the order, and
8483              * controls are less likely to have such interactions, allowing the
8484              * NUL-replacement to stand on its own.  (Another way to look at it
8485              * is to imagine what would happen if the NUL were replaced by a
8486              * combining character; it wouldn't work out all that well.) */
8487             for (try_non_controls = 0;
8488                  try_non_controls < 2;
8489                  try_non_controls++)
8490             {
8491
8492 #  ifdef USE_LOCALE_CTYPE
8493
8494                 /* In this case we use isCNTRL_LC() below, which relies on
8495                  * LC_CTYPE, so that must be switched to correspond with the
8496                  * LC_COLLATE locale */
8497                 if (! try_non_controls && ! PL_in_utf8_COLLATE_locale) {
8498                     orig_CTYPE_locale = toggle_locale_c(LC_CTYPE,
8499                                                         PL_collation_name);
8500                 }
8501 #  endif
8502                 /* Look through all legal code points (NUL isn't) */
8503                 for (j = 1; j < 256; j++) {
8504                     char * x;       /* j's xfrm plus collation index */
8505                     STRLEN x_len;   /* length of 'x' */
8506                     STRLEN trial_len = 1;
8507                     char cur_source[] = { '\0', '\0' };
8508
8509                     /* Skip non-controls the first time through the loop.  The
8510                      * controls in a UTF-8 locale are the L1 ones */
8511                     if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
8512                                                ? ! isCNTRL_L1(j)
8513                                                : ! isCNTRL_LC(j))
8514                     {
8515                         continue;
8516                     }
8517
8518                     /* Create a 1-char string of the current code point */
8519                     cur_source[0] = (char) j;
8520
8521                     /* Then transform it */
8522                     x = mem_collxfrm_(cur_source, trial_len, &x_len,
8523                                       0 /* The string is not in UTF-8 */);
8524
8525                     /* Ignore any character that didn't successfully transform.
8526                      * */
8527                     if (! x) {
8528                         continue;
8529                     }
8530
8531                     /* If this character's transformation is lower than
8532                      * the current lowest, this one becomes the lowest */
8533                     if (   cur_min_x == NULL
8534                         || strLT(x         + COLLXFRM_HDR_LEN,
8535                                  cur_min_x + COLLXFRM_HDR_LEN))
8536                     {
8537                         PL_strxfrm_NUL_replacement = j;
8538                         Safefree(cur_min_x);
8539                         cur_min_x = x;
8540                     }
8541                     else {
8542                         Safefree(x);
8543                     }
8544                 } /* end of loop through all 255 characters */
8545
8546 #  ifdef USE_LOCALE_CTYPE
8547                 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
8548 #  endif
8549
8550                 /* Stop looking if found */
8551                 if (cur_min_x) {
8552                     break;
8553                 }
8554
8555                 /* Unlikely, but possible, if there aren't any controls that
8556                  * work in the locale, repeat the loop, looking for any
8557                  * character that works */
8558                 DEBUG_L(PerlIO_printf(Perl_debug_log,
8559                 "mem_collxfrm_: No control worked.  Trying non-controls\n"));
8560             } /* End of loop to try first the controls, then any char */
8561
8562             if (! cur_min_x) {
8563                 DEBUG_L(PerlIO_printf(Perl_debug_log,
8564                     "mem_collxfrm_: Couldn't find any character to replace"
8565                     " embedded NULs in locale %s with", PL_collation_name));
8566                 goto bad;
8567             }
8568
8569             DEBUG_L(PerlIO_printf(Perl_debug_log,
8570                     "mem_collxfrm_: Replacing embedded NULs in locale %s with "
8571                     "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
8572
8573             Safefree(cur_min_x);
8574         } /* End of determining the character that is to replace NULs */
8575
8576         /* If the replacement is variant under UTF-8, it must match the
8577          * UTF8-ness of the original */
8578         if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
8579             this_replacement_char[0] =
8580                                 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
8581             this_replacement_char[1] =
8582                                 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
8583             this_replacement_len = 2;
8584         }
8585         else {
8586             this_replacement_char[0] = PL_strxfrm_NUL_replacement;
8587             /* this_replacement_char[1] = '\0' was done at initialization */
8588             this_replacement_len = 1;
8589         }
8590
8591         /* The worst case length for the replaced string would be if every
8592          * character in it is NUL.  Multiply that by the length of each
8593          * replacement, and allow for a trailing NUL */
8594         sans_nuls_len = (len * this_replacement_len) + 1;
8595         Newx(sans_nuls, sans_nuls_len, char);
8596         *sans_nuls = '\0';
8597
8598         /* Replace each NUL with the lowest collating control.  Loop until have
8599          * exhausted all the NULs */
8600         while (s + s_strlen < e) {
8601             my_strlcat(sans_nuls, s, sans_nuls_len);
8602
8603             /* Do the actual replacement */
8604             my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
8605
8606             /* Move past the input NUL */
8607             s += s_strlen + 1;
8608             s_strlen = strlen(s);
8609         }
8610
8611         /* And add anything that trails the final NUL */
8612         my_strlcat(sans_nuls, s, sans_nuls_len);
8613
8614         /* Switch so below we transform this modified string */
8615         s = sans_nuls;
8616         len = strlen(s);
8617     } /* End of replacing NULs */
8618
8619     /* Make sure the UTF8ness of the string and locale match */
8620     if (utf8 != PL_in_utf8_COLLATE_locale) {
8621         /* XXX convert above Unicode to 10FFFF? */
8622         const char * const t = s;   /* Temporary so we can later find where the
8623                                        input was */
8624
8625         /* Here they don't match.  Change the string's to be what the locale is
8626          * expecting */
8627
8628         if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
8629             s = (char *) bytes_to_utf8((const U8 *) s, &len);
8630             utf8 = TRUE;
8631         }
8632         else {   /* locale is not UTF-8; but input is; downgrade the input */
8633
8634             s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
8635
8636             /* If the downgrade was successful we are done, but if the input
8637              * contains things that require UTF-8 to represent, have to do
8638              * damage control ... */
8639             if (UNLIKELY(utf8)) {
8640
8641                 /* What we do is construct a non-UTF-8 string with
8642                  *  1) the characters representable by a single byte converted
8643                  *     to be so (if necessary);
8644                  *  2) and the rest converted to collate the same as the
8645                  *     highest collating representable character.  That makes
8646                  *     them collate at the end.  This is similar to how we
8647                  *     handle embedded NULs, but we use the highest collating
8648                  *     code point instead of the smallest.  Like the NUL case,
8649                  *     this isn't perfect, but is the best we can reasonably
8650                  *     do.  Every above-255 code point will sort the same as
8651                  *     the highest-sorting 0-255 code point.  If that code
8652                  *     point can combine in a sequence with some other code
8653                  *     points for weight calculations, us changing something to
8654                  *     be it can adversely affect the results.  But in most
8655                  *     cases, it should work reasonably.  And note that this is
8656                  *     really an illegal situation: using code points above 255
8657                  *     on a locale where only 0-255 are valid.  If two strings
8658                  *     sort entirely equal, then the sort order for the
8659                  *     above-255 code points will be in code point order. */
8660
8661                 utf8 = FALSE;
8662
8663                 /* If we haven't calculated the code point with the maximum
8664                  * collating order for this locale, do so now */
8665                 if (! PL_strxfrm_max_cp) {
8666                     int j;
8667
8668                     /* The current transformed string that collates the
8669                      * highest (except it also includes the prefixed collation
8670                      * index. */
8671                     char * cur_max_x = NULL;
8672
8673                     /* Look through all legal code points (NUL isn't) */
8674                     for (j = 1; j < 256; j++) {
8675                         char * x;
8676                         STRLEN x_len;
8677                         char cur_source[] = { '\0', '\0' };
8678
8679                         /* Create a 1-char string of the current code point */
8680                         cur_source[0] = (char) j;
8681
8682                         /* Then transform it */
8683                         x = mem_collxfrm_(cur_source, 1, &x_len, FALSE);
8684
8685                         /* If something went wrong (which it shouldn't), just
8686                          * ignore this code point */
8687                         if (! x) {
8688                             continue;
8689                         }
8690
8691                         /* If this character's transformation is higher than
8692                          * the current highest, this one becomes the highest */
8693                         if (   cur_max_x == NULL
8694                             || strGT(x         + COLLXFRM_HDR_LEN,
8695                                      cur_max_x + COLLXFRM_HDR_LEN))
8696                         {
8697                             PL_strxfrm_max_cp = j;
8698                             Safefree(cur_max_x);
8699                             cur_max_x = x;
8700                         }
8701                         else {
8702                             Safefree(x);
8703                         }
8704                     }
8705
8706                     if (! cur_max_x) {
8707                         DEBUG_L(PerlIO_printf(Perl_debug_log,
8708                             "mem_collxfrm_: Couldn't find any character to"
8709                             " replace above-Latin1 chars in locale %s with",
8710                             PL_collation_name));
8711                         goto bad;
8712                     }
8713
8714                     DEBUG_L(PerlIO_printf(Perl_debug_log,
8715                             "mem_collxfrm_: highest 1-byte collating character"
8716                             " in locale %s is 0x%02X\n",
8717                             PL_collation_name,
8718                             PL_strxfrm_max_cp));
8719
8720                     Safefree(cur_max_x);
8721                 }
8722
8723                 /* Here we know which legal code point collates the highest.
8724                  * We are ready to construct the non-UTF-8 string.  The length
8725                  * will be at least 1 byte smaller than the input string
8726                  * (because we changed at least one 2-byte character into a
8727                  * single byte), but that is eaten up by the trailing NUL */
8728                 Newx(s, len, char);
8729
8730                 {
8731                     STRLEN i;
8732                     STRLEN d= 0;
8733                     char * e = (char *) t + len;
8734
8735                     for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
8736                         U8 cur_char = t[i];
8737                         if (UTF8_IS_INVARIANT(cur_char)) {
8738                             s[d++] = cur_char;
8739                         }
8740                         else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
8741                             s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
8742                         }
8743                         else {  /* Replace illegal cp with highest collating
8744                                    one */
8745                             s[d++] = PL_strxfrm_max_cp;
8746                         }
8747                     }
8748                     s[d++] = '\0';
8749                     Renew(s, d, char);   /* Free up unused space */
8750                 }
8751             }
8752         }
8753
8754         /* Here, we have constructed a modified version of the input.  It could
8755          * be that we already had a modified copy before we did this version.
8756          * If so, that copy is no longer needed */
8757         if (t != input_string) {
8758             Safefree(t);
8759         }
8760     }
8761
8762     length_in_chars = (utf8)
8763                       ? utf8_length((U8 *) s, (U8 *) s + len)
8764                       : len;
8765
8766     /* The first element in the output is the collation id, used by
8767      * sv_collxfrm(); then comes the space for the transformed string.  The
8768      * equation should give us a good estimate as to how much is needed */
8769     xAlloc = COLLXFRM_HDR_LEN
8770            + PL_collxfrm_base
8771            + (PL_collxfrm_mult * length_in_chars);
8772     Newx(xbuf, xAlloc, char);
8773     if (UNLIKELY(! xbuf)) {
8774         DEBUG_L(PerlIO_printf(Perl_debug_log,
8775                       "mem_collxfrm_: Couldn't malloc %zu bytes\n", xAlloc));
8776         goto bad;
8777     }
8778
8779     /* Store the collation id */
8780     *(PERL_UINTMAX_T *)xbuf = PL_collation_ix;
8781
8782 #  if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
8783 #    ifdef USE_LOCALE_CTYPE
8784
8785     constructed_locale = newlocale(LC_CTYPE_MASK, PL_collation_name,
8786                                    duplocale(use_curlocale_scratch()));
8787 #    else
8788
8789     constructed_locale = duplocale(use_curlocale_scratch());
8790
8791 #    endif
8792 #    define my_strxfrm(dest, src, n)  strxfrm_l(dest, src, n,           \
8793                                                 constructed_locale)
8794 #    define CLEANUP_STRXFRM                                             \
8795         STMT_START {                                                    \
8796             if (constructed_locale != (locale_t) 0)                     \
8797                 freelocale(constructed_locale);                         \
8798         } STMT_END
8799 #  else
8800 #    define my_strxfrm(dest, src, n)  strxfrm(dest, src, n)
8801 #    ifdef USE_LOCALE_CTYPE
8802
8803     orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
8804
8805 #      define CLEANUP_STRXFRM                                           \
8806                 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale)
8807 #    else
8808 #      define CLEANUP_STRXFRM  NOOP
8809 #    endif
8810 #  endif
8811
8812     /* Then the transformation of the input.  We loop until successful, or we
8813      * give up */
8814     for (;;) {
8815
8816         errno = 0;
8817         *xlen = my_strxfrm(xbuf + COLLXFRM_HDR_LEN,
8818                            s,
8819                            xAlloc - COLLXFRM_HDR_LEN);
8820
8821
8822         /* If the transformed string occupies less space than we told strxfrm()
8823          * was available, it means it transformed the whole string. */
8824         if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
8825
8826             /* But there still could have been a problem */
8827             if (errno != 0) {
8828                 DEBUG_L(PerlIO_printf(Perl_debug_log,
8829                        "strxfrm failed for LC_COLLATE=%s; errno=%d, input=%s\n",
8830                        PL_collation_name, errno,
8831                        _byte_dump_string((U8 *) s, len, 0)));
8832                 goto bad;
8833             }
8834
8835             /* Here, the transformation was successful.  Some systems include a
8836              * trailing NUL in the returned length.  Ignore it, using a loop in
8837              * case multiple trailing NULs are returned. */
8838             while (   (*xlen) > 0
8839                    && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
8840             {
8841                 (*xlen)--;
8842             }
8843
8844             /* If the first try didn't get it, it means our prediction was low.
8845              * Modify the coefficients so that we predict a larger value in any
8846              * future transformations */
8847             if (! first_time) {
8848                 STRLEN needed = *xlen + 1;   /* +1 For trailing NUL */
8849                 STRLEN computed_guess = PL_collxfrm_base
8850                                       + (PL_collxfrm_mult * length_in_chars);
8851
8852                 /* On zero-length input, just keep current slope instead of
8853                  * dividing by 0 */
8854                 const STRLEN new_m = (length_in_chars != 0)
8855                                      ? needed / length_in_chars
8856                                      : PL_collxfrm_mult;
8857
8858                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8859                     "initial size of %zu bytes for a length "
8860                     "%zu string was insufficient, %zu needed\n",
8861                     computed_guess, length_in_chars, needed));
8862
8863                 /* If slope increased, use it, but discard this result for
8864                  * length 1 strings, as we can't be sure that it's a real slope
8865                  * change */
8866                 if (length_in_chars > 1 && new_m  > PL_collxfrm_mult) {
8867
8868 #  ifdef DEBUGGING
8869
8870                     STRLEN old_m = PL_collxfrm_mult;
8871                     STRLEN old_b = PL_collxfrm_base;
8872
8873 #  endif
8874
8875                     PL_collxfrm_mult = new_m;
8876                     PL_collxfrm_base = 1;   /* +1 For trailing NUL */
8877                     computed_guess = PL_collxfrm_base
8878                                     + (PL_collxfrm_mult * length_in_chars);
8879                     if (computed_guess < needed) {
8880                         PL_collxfrm_base += needed - computed_guess;
8881                     }
8882
8883                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8884                                     "slope is now %zu; was %zu, base "
8885                         "is now %zu; was %zu\n",
8886                         PL_collxfrm_mult, old_m,
8887                         PL_collxfrm_base, old_b));
8888                 }
8889                 else {  /* Slope didn't change, but 'b' did */
8890                     const STRLEN new_b = needed
8891                                         - computed_guess
8892                                         + PL_collxfrm_base;
8893                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8894                         "base is now %zu; was %zu\n", new_b, PL_collxfrm_base));
8895                     PL_collxfrm_base = new_b;
8896                 }
8897             }
8898
8899             break;
8900         }
8901
8902         if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
8903             DEBUG_L(PerlIO_printf(Perl_debug_log,
8904                   "mem_collxfrm_: Needed %zu bytes, max permissible is %u\n",
8905                   *xlen, PERL_INT_MAX));
8906             goto bad;
8907         }
8908
8909         /* A well-behaved strxfrm() returns exactly how much space it needs
8910          * (usually not including the trailing NUL) when it fails due to not
8911          * enough space being provided.  Assume that this is the case unless
8912          * it's been proven otherwise */
8913         if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
8914             xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
8915         }
8916         else { /* Here, either:
8917                 *  1)  The strxfrm() has previously shown bad behavior; or
8918                 *  2)  It isn't the first time through the loop, which means
8919                 *      that the strxfrm() is now showing bad behavior, because
8920                 *      we gave it what it said was needed in the previous
8921                 *      iteration, and it came back saying it needed still more.
8922                 *      (Many versions of cygwin fit this.  When the buffer size
8923                 *      isn't sufficient, they return the input size instead of
8924                 *      how much is needed.)
8925                 * Increase the buffer size by a fixed percentage and try again.
8926                 * */
8927             xAlloc += (xAlloc / 4) + 1;
8928             PL_strxfrm_is_behaved = FALSE;
8929
8930             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8931                      "mem_collxfrm_ required more space than previously"
8932                      " calculated for locale %s, trying again with new"
8933                      " guess=%zu+%zu\n",
8934                 PL_collation_name,  COLLXFRM_HDR_LEN,
8935                      xAlloc - COLLXFRM_HDR_LEN));
8936         }
8937
8938         Renew(xbuf, xAlloc, char);
8939         if (UNLIKELY(! xbuf)) {
8940             DEBUG_L(PerlIO_printf(Perl_debug_log,
8941                       "mem_collxfrm_: Couldn't realloc %zu bytes\n", xAlloc));
8942             goto bad;
8943         }
8944
8945         first_time = FALSE;
8946     }
8947
8948     CLEANUP_STRXFRM;
8949
8950     DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8));
8951
8952     /* Free up unneeded space; retain enough for trailing NUL */
8953     Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
8954
8955     if (s != input_string) {
8956         Safefree(s);
8957     }
8958
8959     return xbuf;
8960
8961   bad:
8962
8963     CLEANUP_STRXFRM;
8964     DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8));
8965
8966     Safefree(xbuf);
8967     if (s != input_string) {
8968         Safefree(s);
8969     }
8970     *xlen = 0;
8971
8972     return NULL;
8973 }
8974
8975 #  ifdef DEBUGGING
8976
8977 STATIC void
8978 S_print_collxfrm_input_and_return(pTHX_
8979                                   const char * s,
8980                                   const char * e,
8981                                   const char * xbuf,
8982                                   const STRLEN xlen,
8983                                   const bool is_utf8)
8984 {
8985
8986     PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
8987
8988     PerlIO_printf(Perl_debug_log,
8989                   "mem_collxfrm_[ix %" UVuf "] for locale '%s':\n"
8990                   "     input=%s\n    return=%s\n    return len=%zu\n",
8991                   (UV) PL_collation_ix, PL_collation_name,
8992                   get_displayable_string(s, e, is_utf8),
8993                   ((xbuf == NULL)
8994                    ? "(null)"
8995                    : ((xlen == 0)
8996                       ? "(empty)"
8997                       : _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
8998                                           xlen, 0))),
8999                   xlen);
9000 }
9001
9002 #  endif    /* DEBUGGING */
9003
9004 SV *
9005 Perl_strxfrm(pTHX_ SV * src)
9006 {
9007     PERL_ARGS_ASSERT_STRXFRM;
9008
9009     /* For use by POSIX::strxfrm().  If they differ, toggle LC_CTYPE to
9010      * LC_COLLATE to avoid potential mojibake.
9011      *
9012      * If we can't calculate a collation, 'src' is instead returned, so that
9013      * future comparisons will be by code point order */
9014
9015 #  ifdef USE_LOCALE_CTYPE
9016
9017     const char * orig_ctype = toggle_locale_c(LC_CTYPE,
9018                                               querylocale_c(LC_COLLATE));
9019 #  endif
9020
9021     SV * dst = src;
9022     STRLEN dstlen;
9023     STRLEN srclen;
9024     const char *p = SvPV_const(src, srclen);
9025     const U32 utf8_flag = SvUTF8(src);
9026     char *d = mem_collxfrm_(p, srclen, &dstlen, cBOOL(utf8_flag));
9027
9028     assert(utf8_flag == 0 || utf8_flag == SVf_UTF8);
9029
9030     if (d != NULL) {
9031         assert(dstlen > 0);
9032         dst =newSVpvn_flags(d + COLLXFRM_HDR_LEN,
9033                             dstlen, SVs_TEMP|utf8_flag);
9034         Safefree(d);
9035     }
9036
9037 #  ifdef USE_LOCALE_CTYPE
9038
9039     restore_toggled_locale_c(LC_CTYPE, orig_ctype);
9040
9041 #  endif
9042
9043     return dst;
9044 }
9045
9046 #endif /* USE_LOCALE_COLLATE */
9047
9048 #ifdef USE_LOCALE
9049 #  ifdef USE_LOCALE_CTYPE
9050
9051 STATIC bool
9052 S_is_codeset_name_UTF8(const char * name)
9053 {
9054     /* Return a boolean as to if the passed-in name indicates it is a UTF-8
9055      * code set.  Several variants are possible */
9056     const Size_t len = strlen(name);
9057
9058     PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8;
9059
9060 #    ifdef WIN32
9061
9062     /* https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers */
9063     if (memENDs(name, len, "65001")) {
9064         return TRUE;
9065     }
9066
9067 #    endif
9068                /* 'UTF8' or 'UTF-8' */
9069     return (    inRANGE(len, 4, 5)
9070             &&  name[len-1] == '8'
9071             && (   memBEGINs(name, len, "UTF")
9072                 || memBEGINs(name, len, "utf"))
9073             && (len == 4 || name[3] == '-'));
9074 }
9075
9076 #  endif
9077 #endif  /* USE_LOCALE */
9078
9079 bool
9080 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
9081 {
9082     /* Internal function which returns if we are in the scope of a pragma that
9083      * enables the locale category 'category'.  'compiling' should indicate if
9084      * this is during the compilation phase (TRUE) or not (FALSE). */
9085
9086     const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
9087
9088     SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0);
9089     if (! these_categories || these_categories == &PL_sv_placeholder) {
9090         return FALSE;
9091     }
9092
9093     /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
9094      * a valid unsigned */
9095     assert(category >= -1);
9096     return cBOOL(SvUV(these_categories) & (1U << (category + 1)));
9097 }
9098
9099 /* my_strerror() returns a mortalized copy of the text of the error message
9100  * associated with 'errnum'.
9101  *
9102  * If not called from within the scope of 'use locale', it uses the text from
9103  * the C locale.  If Perl is compiled to not pay attention to LC_CTYPE nor
9104  * LC_MESSAGES, it uses whatever strerror() returns.  Otherwise the text is
9105  * derived from the locale, LC_MESSAGES if we have that; LC_CTYPE if not.
9106  *
9107  * It returns in *utf8ness the result's UTF-8ness
9108  *
9109  * The function just calls strerror(), but temporarily switches locales, if
9110  * needed.  Many platforms require LC_CTYPE and LC_MESSAGES to be in the same
9111  * CODESET in order for the return from strerror() to not contain '?' symbols,
9112  * or worse, mojibaked.  It's cheaper to just use the stricter criteria of
9113  * being in the same locale.  So the code below uses a common locale for both
9114  * categories.  Again, that is C if not within 'use locale' scope; or the
9115  * LC_MESSAGES locale if in scope and we have that category; and LC_CTYPE if we
9116  * don't have LC_MESSAGES; and whatever strerror returns if we don't have
9117  * either category.
9118  *
9119  * There are two sets of implementations.  The first below is if we have
9120  * strerror_l().  This is the simpler.  We just use the already-built C locale
9121  * object if not in locale scope, or build up a custom one otherwise.
9122  *
9123  * When strerror_l() is not available, we may have to swap locales temporarily
9124  * to bring the two categories into sync with each other, and possibly to the C
9125  * locale.
9126  *
9127  * Because the prepropessing directives to conditionally compile this function
9128  * would greatly obscure the logic of the various implementations, the whole
9129  * function is repeated for each configuration, with some common macros. */
9130
9131 /* Used to shorten the definitions of the following implementations of
9132  * my_strerror() */
9133 #define DEBUG_STRERROR_ENTER(errnum, in_locale)                             \
9134     DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                  \
9135                            "my_strerror called with errnum %d;"             \
9136                            " Within locale scope=%d\n",                     \
9137                            errnum, in_locale))
9138
9139 #define DEBUG_STRERROR_RETURN(errstr, utf8ness)                             \
9140     DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                  \
9141                            "Strerror returned; saving a copy: '%s';"        \
9142                            " utf8ness=%d\n",                                \
9143                            get_displayable_string(errstr,                   \
9144                                                   errstr + strlen(errstr),  \
9145                                                   *utf8ness),               \
9146                            (int) *utf8ness))
9147
9148 /* On platforms that have precisely one of these categories (Windows
9149  * qualifies), these yield the correct one */
9150 #if defined(USE_LOCALE_CTYPE)
9151 #  define WHICH_LC_INDEX LC_CTYPE_INDEX_
9152 #elif defined(USE_LOCALE_MESSAGES)
9153 #  define WHICH_LC_INDEX LC_MESSAGES_INDEX_
9154 #endif
9155
9156 /*===========================================================================*/
9157 /* First set of implementations, when have strerror_l() */
9158
9159 #if defined(USE_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
9160
9161 #  if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
9162
9163 /* Here, neither category is defined: use the C locale */
9164 const char *
9165 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
9166 {
9167     PERL_ARGS_ASSERT_MY_STRERROR;
9168
9169     DEBUG_STRERROR_ENTER(errnum, 0);
9170
9171     const char *errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
9172     *utf8ness = UTF8NESS_IMMATERIAL;
9173
9174     DEBUG_STRERROR_RETURN(errstr, utf8ness);
9175
9176     SAVEFREEPV(errstr);
9177     return errstr;
9178 }
9179
9180 #  elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
9181
9182 /*--------------------------------------------------------------------------*/
9183
9184 /* Here one or the other of CTYPE or MESSAGES is defined, but not both.  If we
9185  * are not within 'use locale' scope of the only one defined, we use the C
9186  * locale; otherwise use the current locale object */
9187
9188 const char *
9189 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
9190 {
9191     PERL_ARGS_ASSERT_MY_STRERROR;
9192
9193     DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
9194
9195     /* Use C if not within locale scope;  Otherwise, use current locale */
9196     const locale_t which_obj = (IN_LC(categories[WHICH_LC_INDEX]))
9197                                ? PL_C_locale_obj
9198                                : use_curlocale_scratch();
9199
9200     const char *errstr = savepv(strerror_l(errnum, which_obj));
9201     *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN,
9202                                              NULL, WHICH_LC_INDEX);
9203     DEBUG_STRERROR_RETURN(errstr, utf8ness);
9204
9205     SAVEFREEPV(errstr);
9206     return errstr;
9207 }
9208
9209 /*--------------------------------------------------------------------------*/
9210 #  else     /* Are using both categories.  Place them in the same CODESET,
9211              * either C or the LC_MESSAGES locale */
9212
9213 const char *
9214 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
9215 {
9216     PERL_ARGS_ASSERT_MY_STRERROR;
9217
9218     DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
9219
9220     const char *errstr;
9221     if (! IN_LC(LC_MESSAGES)) {    /* Use C if not within locale scope */
9222         errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
9223         *utf8ness = UTF8NESS_IMMATERIAL;
9224     }
9225     else {  /* Otherwise, use the LC_MESSAGES locale, making sure LC_CTYPE
9226                matches */
9227         locale_t cur = duplocale(use_curlocale_scratch());
9228
9229         cur = newlocale(LC_CTYPE_MASK, querylocale_c(LC_MESSAGES), cur);
9230         errstr = savepv(strerror_l(errnum, cur));
9231         *utf8ness = get_locale_string_utf8ness_i(errstr,
9232                                                  LOCALE_UTF8NESS_UNKNOWN,
9233                                                  NULL, LC_MESSAGES_INDEX_);
9234         freelocale(cur);
9235     }
9236
9237     DEBUG_STRERROR_RETURN(errstr, utf8ness);
9238
9239     SAVEFREEPV(errstr);
9240     return errstr;
9241 }
9242 #  endif    /* Above is using strerror_l */
9243 /*===========================================================================*/
9244 #else       /* Below is not using strerror_l */
9245 #  if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
9246
9247 /* If not using using either of the categories, return plain, unadorned
9248  * strerror */
9249
9250 const char *
9251 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
9252 {
9253     PERL_ARGS_ASSERT_MY_STRERROR;
9254
9255     DEBUG_STRERROR_ENTER(errnum, 0);
9256
9257     const char *errstr = savepv(Strerror(errnum));
9258     *utf8ness = UTF8NESS_IMMATERIAL;
9259
9260     DEBUG_STRERROR_RETURN(errstr, utf8ness);
9261
9262     SAVEFREEPV(errstr);
9263     return errstr;
9264 }
9265
9266 /*--------------------------------------------------------------------------*/
9267 #  elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
9268
9269 /* Here one or the other of CTYPE or MESSAGES is defined, but not both.  If we
9270  * are not within 'use locale' scope of the only one defined, we use the C
9271  * locale; otherwise use the current locale */
9272
9273 const char *
9274 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
9275 {
9276     PERL_ARGS_ASSERT_MY_STRERROR;
9277
9278     DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
9279
9280     const char *errstr;
9281     if (IN_LC(categories[WHICH_LC_INDEX])) {
9282         errstr = savepv(Strerror(errnum));
9283         *utf8ness = get_locale_string_utf8ness_i(errstr,
9284                                                  LOCALE_UTF8NESS_UNKNOWN,
9285                                                  NULL, WHICH_LC_INDEX);
9286     }
9287     else {
9288
9289         LOCALE_LOCK;
9290
9291         const char * orig_locale = toggle_locale_i(WHICH_LC_INDEX, "C");
9292
9293         errstr = savepv(Strerror(errnum));
9294
9295         restore_toggled_locale_i(WHICH_LC_INDEX, orig_locale);
9296
9297         LOCALE_UNLOCK;
9298
9299         *utf8ness = UTF8NESS_IMMATERIAL;
9300     }
9301
9302     DEBUG_STRERROR_RETURN(errstr, utf8ness);
9303
9304     SAVEFREEPV(errstr);
9305     return errstr;
9306 }
9307
9308 /*--------------------------------------------------------------------------*/
9309 #  else
9310
9311 /* Below, have both LC_CTYPE and LC_MESSAGES.  Place them in the same CODESET,
9312  * either C or the LC_MESSAGES locale */
9313
9314 const char *
9315 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
9316 {
9317     PERL_ARGS_ASSERT_MY_STRERROR;
9318
9319     DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
9320
9321     const char * desired_locale = (IN_LC(LC_MESSAGES))
9322                                   ? querylocale_c(LC_MESSAGES)
9323                                   : "C";
9324     /* XXX Can fail on z/OS */
9325
9326     LOCALE_LOCK;
9327
9328     const char* orig_CTYPE_locale    = toggle_locale_c(LC_CTYPE,
9329                                                        desired_locale);
9330     const char* orig_MESSAGES_locale = toggle_locale_c(LC_MESSAGES,
9331                                                        desired_locale);
9332     const char *errstr = savepv(Strerror(errnum));
9333
9334     restore_toggled_locale_c(LC_MESSAGES, orig_MESSAGES_locale);
9335     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
9336
9337     LOCALE_UNLOCK;
9338
9339     *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN,
9340                                              NULL, LC_MESSAGES_INDEX_);
9341     DEBUG_STRERROR_RETURN(errstr, utf8ness);
9342
9343     SAVEFREEPV(errstr);
9344     return errstr;
9345 }
9346
9347 /*--------------------------------------------------------------------------*/
9348 #  endif /* end of not using strerror_l() */
9349 #endif   /* end of all the my_strerror() implementations */
9350
9351 /*
9352
9353 =for apidoc switch_to_global_locale
9354
9355 This function copies the locale state of the calling thread into the program's
9356 global locale, and converts the thread to use that global locale.
9357
9358 It is intended so that Perl can safely be used with C libraries that access the
9359 global locale and which can't be converted to not access it.  Effectively, this
9360 means libraries that call C<L<setlocale(3)>> on non-Windows systems.  (For
9361 portability, it is a good idea to use it on Windows as well.)
9362
9363 A downside of using it is that it disables the services that Perl provides to
9364 hide locale gotchas from your code.  The service you most likely will miss
9365 regards the radix character (decimal point) in floating point numbers.  Code
9366 executed after this function is called can no longer just assume that this
9367 character is correct for the current circumstances.
9368
9369 To return to Perl control, and restart the gotcha prevention services, call
9370 C<L</sync_locale>>.  Behavior is undefined for any pure Perl code that executes
9371 while the switch is in effect.
9372
9373 The global locale and the per-thread locales are independent.  As long as just
9374 one thread converts to the global locale, everything works smoothly.  But if
9375 more than one does, they can easily interfere with each other, and races are
9376 likely.  On Windows systems prior to Visual Studio 15 (at which point Microsoft
9377 fixed a bug), races can occur (even if only one thread has been converted to
9378 the global locale), but only if you use the following operations:
9379
9380 =over
9381
9382 =item L<POSIX::localeconv|POSIX/localeconv>
9383
9384 =item L<I18N::Langinfo>, items C<CRNCYSTR> and C<THOUSEP>
9385
9386 =item L<perlapi/Perl_langinfo>, items C<CRNCYSTR> and C<THOUSEP>
9387
9388 =back
9389
9390 The first item is not fixable (except by upgrading to a later Visual Studio
9391 release), but it would be possible to work around the latter two items by
9392 having Perl change its algorithm for calculating these to use Windows API
9393 functions (likely C<GetNumberFormat> and C<GetCurrencyFormat>); patches
9394 welcome.
9395
9396 XS code should never call plain C<setlocale>, but should instead be converted
9397 to either call L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in
9398 for the system C<setlocale>) or use the methods given in L<perlcall> to call
9399 L<C<POSIX::setlocale>|POSIX/setlocale>.  Either one will transparently properly
9400 handle all cases of single- vs multi-thread, POSIX 2008-supported or not.
9401
9402 =cut
9403 */
9404
9405 #if defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE)
9406 #  define CHANGE_SYSTEM_LOCALE_TO_GLOBAL                                \
9407     STMT_START {                                                        \
9408         if (_configthreadlocale(_DISABLE_PER_THREAD_LOCALE) == -1) {    \
9409             locale_panic_("_configthreadlocale returned an error");     \
9410         }                                                               \
9411     } STMT_END
9412 #elif defined(USE_POSIX_2008_LOCALE)
9413 #  define CHANGE_SYSTEM_LOCALE_TO_GLOBAL                                \
9414     STMT_START {                                                        \
9415         locale_t old_locale = uselocale(LC_GLOBAL_LOCALE);              \
9416         if (! old_locale) {                                             \
9417             locale_panic_("Could not change to global locale");         \
9418         }                                                               \
9419                                                                         \
9420         /* Free the per-thread memory */                                \
9421         if (   old_locale != LC_GLOBAL_LOCALE                           \
9422             && old_locale != PL_C_locale_obj)                           \
9423         {                                                               \
9424             freelocale(old_locale);                                     \
9425         }                                                               \
9426     } STMT_END
9427 #else
9428 #  define CHANGE_SYSTEM_LOCALE_TO_GLOBAL
9429 #endif
9430
9431 void
9432 Perl_switch_to_global_locale(pTHX)
9433 {
9434
9435 #ifdef USE_LOCALE
9436
9437     DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering switch_to_global; %s\n",
9438                                           get_LC_ALL_display()));
9439
9440    /* In these cases, we use the system state to determine if we are in the
9441     * global locale or not. */
9442 #  ifdef USE_POSIX_2008_LOCALE
9443
9444     const bool perl_controls = (LC_GLOBAL_LOCALE != uselocale((locale_t) 0));
9445
9446 #  elif defined(USE_THREAD_SAFE_LOCALE) && defined(WIN32)
9447
9448     int config_return = _configthreadlocale(0);
9449     if (config_return == -1) {
9450         locale_panic_("_configthreadlocale returned an error");
9451     }
9452     const bool perl_controls = (config_return == _ENABLE_PER_THREAD_LOCALE);
9453
9454 #  else
9455
9456     const bool perl_controls = false;
9457
9458 #  endif
9459
9460     /* No-op if already in global */
9461     if (! perl_controls) {
9462         return;
9463     }
9464
9465 #  ifdef LC_ALL
9466
9467     const char * thread_locale = calculate_LC_ALL_string(NULL,
9468                                                          EXTERNAL_FORMAT_FOR_SET,
9469                                                          WANT_TEMP_PV,
9470                                                          __LINE__);
9471     CHANGE_SYSTEM_LOCALE_TO_GLOBAL;
9472     posix_setlocale(LC_ALL, thread_locale);
9473
9474 #  else   /* Must be USE_POSIX_2008_LOCALE) */
9475
9476     const char * cur_thread_locales[LC_ALL_INDEX_];
9477
9478     /* Save each category's current per-thread state */
9479     for_all_individual_category_indexes(i) {
9480         cur_thread_locales[i] = querylocale_i(i);
9481     }
9482
9483     CHANGE_SYSTEM_LOCALE_TO_GLOBAL;
9484
9485     /* Set the global to what was our per-thread state */
9486     POSIX_SETLOCALE_LOCK;
9487     for_all_individual_category_indexes(i) {
9488         posix_setlocale(categories[i], cur_thread_locales[i]);
9489     }
9490     POSIX_SETLOCALE_UNLOCK;
9491
9492 #  endif
9493 #  ifdef USE_LOCALE_NUMERIC
9494
9495     /* Switch to the underlying C numeric locale; the application is on its
9496      * own. */
9497     POSIX_SETLOCALE_LOCK;
9498     posix_setlocale(LC_NUMERIC, PL_numeric_name);
9499     POSIX_SETLOCALE_UNLOCK;
9500
9501 #  endif
9502 #endif
9503
9504 }
9505
9506 /*
9507
9508 =for apidoc sync_locale
9509
9510 This function copies the state of the program global locale into the calling
9511 thread, and converts that thread to using per-thread locales, if it wasn't
9512 already, and the platform supports them.  The LC_NUMERIC locale is toggled into
9513 the standard state (using the C locale's conventions), if not within the
9514 lexical scope of S<C<use locale>>.
9515
9516 Perl will now consider itself to have control of the locale.
9517
9518 Since unthreaded perls have only a global locale, this function is a no-op
9519 without threads.
9520
9521 This function is intended for use with C libraries that do locale manipulation.
9522 It allows Perl to accommodate the use of them.  Call this function before
9523 transferring back to Perl space so that it knows what state the C code has left
9524 things in.
9525
9526 XS code should not manipulate the locale on its own.  Instead,
9527 L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or
9528 change the locale (though changing the locale is antisocial and dangerous on
9529 multi-threaded systems that don't have multi-thread safe locale operations.
9530 (See L<perllocale/Multi-threaded operation>).
9531
9532 Using the libc L<C<setlocale(3)>> function should be avoided.  Nevertheless,
9533 certain non-Perl libraries called from XS, do call it, and their behavior may
9534 not be able to be changed.  This function, along with
9535 C<L</switch_to_global_locale>>, can be used to get seamless behavior in these
9536 circumstances, as long as only one thread is involved.
9537
9538 If the library has an option to turn off its locale manipulation, doing that is
9539 preferable to using this mechanism.  C<Gtk> is such a library.
9540
9541 The return value is a boolean: TRUE if the global locale at the time of call
9542 was in effect for the caller; and FALSE if a per-thread locale was in effect.
9543
9544 =cut
9545 */
9546
9547 bool
9548 Perl_sync_locale(pTHX)
9549 {
9550
9551 #ifndef USE_LOCALE
9552
9553     return TRUE;
9554
9555 #else
9556
9557     bool was_in_global = TRUE;
9558
9559 #  ifdef USE_THREAD_SAFE_LOCALE
9560 #    if defined(WIN32)
9561
9562     int config_return = _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
9563     if (config_return == -1) {
9564         locale_panic_("_configthreadlocale returned an error");
9565     }
9566     was_in_global = (config_return == _DISABLE_PER_THREAD_LOCALE);
9567
9568 #    elif defined(USE_POSIX_2008_LOCALE)
9569
9570     was_in_global = (LC_GLOBAL_LOCALE == uselocale(LC_GLOBAL_LOCALE));
9571
9572 #    else
9573 #      error Unexpected Configuration
9574 #    endif
9575 #  endif    /* USE_THREAD_SAFE_LOCALE */
9576
9577     /* Here, we are in the global locale.  Get and save the values for each
9578      * category, and convert the current thread to use them */
9579
9580 #  ifdef LC_ALL
9581
9582     STDIZED_SETLOCALE_LOCK;
9583     const char * lc_all_string = savepv(stdized_setlocale(LC_ALL, NULL));
9584     STDIZED_SETLOCALE_UNLOCK;
9585
9586     give_perl_locale_control(lc_all_string, __LINE__);
9587     Safefree(lc_all_string);
9588
9589 #  else
9590
9591     const char * current_globals[LC_ALL_INDEX_];
9592     for_all_individual_category_indexes(i) {
9593         STDIZED_SETLOCALE_LOCK;
9594         current_globals[i] = savepv(stdized_setlocale(categories[i], NULL));
9595         STDIZED_SETLOCALE_UNLOCK;
9596     }
9597
9598     give_perl_locale_control((const char **) &current_globals, __LINE__);
9599
9600     for_all_individual_category_indexes(i) {
9601         Safefree(current_globals[i]);
9602     }
9603
9604 #  endif
9605
9606     return was_in_global;
9607
9608 #endif
9609
9610 }
9611
9612 #if defined(DEBUGGING) && defined(USE_LOCALE)
9613
9614 STATIC char *
9615 S_my_setlocale_debug_string_i(pTHX_
9616                               const locale_category_index cat_index,
9617                               const char* locale, /* Optional locale name */
9618
9619                               /* return value from setlocale() when attempting
9620                                * to set 'category' to 'locale' */
9621                               const char* retval,
9622
9623                               const line_t line)
9624 {
9625     /* Returns a pointer to a NUL-terminated string in static storage with
9626      * added text about the info passed in.  This is not thread safe and will
9627      * be overwritten by the next call, so this should be used just to
9628      * formulate a string to immediately print or savepv() on. */
9629
9630     const char * locale_quote;
9631     const char * retval_quote;
9632
9633     assert(cat_index <= LC_ALL_INDEX_);
9634
9635     if (locale == NULL) {
9636         locale_quote = "";
9637         locale = "NULL";
9638     }
9639     else {
9640         locale_quote = "\"";
9641     }
9642
9643     if (retval == NULL) {
9644         retval_quote = "";
9645         retval = "NULL";
9646     }
9647     else {
9648         retval_quote = "\"";
9649     }
9650
9651 #  ifdef USE_LOCALE_THREADS
9652 #    define THREAD_FORMAT "%p:"
9653 #    define THREAD_ARGUMENT aTHX_
9654 #  else
9655 #    define THREAD_FORMAT
9656 #    define THREAD_ARGUMENT
9657 #  endif
9658
9659     return Perl_form(aTHX_
9660                      "%s:%" LINE_Tf ": " THREAD_FORMAT
9661                      " setlocale(%s[%d], %s%s%s) returned %s%s%s\n",
9662
9663                      __FILE__, line, THREAD_ARGUMENT
9664                      category_names[cat_index], categories[cat_index],
9665                      locale_quote, locale, locale_quote,
9666                      retval_quote, retval, retval_quote);
9667 }
9668
9669 #endif
9670 #ifdef USE_PERL_SWITCH_LOCALE_CONTEXT
9671
9672 void
9673 Perl_switch_locale_context(pTHX)
9674 {
9675     /* libc keeps per-thread locale status information in some configurations.
9676      * So, we can't just switch out aTHX to switch to a new thread.  libc has
9677      * to follow along.  This routine does that based on per-interpreter
9678      * variables we keep just for this purpose.
9679      *
9680      * There are two implementations where this is an issue.  For the other
9681      * implementations, it doesn't matter because libc is using global values
9682      * that all threads know about.
9683      *
9684      * The two implementations are where libc keeps thread-specific information
9685      * on its own.  These are
9686      *
9687      * POSIX 2008:  The current locale is kept by libc as an object.  We save
9688      *              a copy of that in the per-thread PL_cur_locale_obj, and so
9689      *              this routine uses that copy to tell the thread it should be
9690      *              operating with that object
9691      * Windows thread-safe locales:  A given thread in Windows can be being run
9692      *              with per-thread locales, or not.  When the thread context
9693      *              changes, libc doesn't automatically know if the thread is
9694      *              using per-thread locales, nor does it know what the new
9695      *              thread's locale is.  We keep that information in the
9696      *              per-thread variables:
9697      *                  PL_controls_locale  indicates if this thread is using
9698      *                                      per-thread locales or not
9699      *                  PL_cur_LC_ALL       indicates what the locale should be
9700      *                                      if it is a per-thread locale.
9701      */
9702
9703     if (UNLIKELY(   PL_veto_switch_non_tTHX_context
9704                  || PL_phase == PERL_PHASE_CONSTRUCT))
9705     {
9706         return;
9707     }
9708
9709 #  ifdef USE_POSIX_2008_LOCALE
9710
9711     if (! uselocale(PL_cur_locale_obj)) {
9712         locale_panic_(Perl_form(aTHX_
9713                                 "Can't uselocale(%p), LC_ALL supposed to"
9714                                 " be '%s'",
9715                                 PL_cur_locale_obj, get_LC_ALL_display()));
9716     }
9717
9718 #  elif defined(WIN32)
9719
9720     if (! bool_setlocale_c(LC_ALL, PL_cur_LC_ALL)) {
9721         locale_panic_(Perl_form(aTHX_ "Can't setlocale(%s)", PL_cur_LC_ALL));
9722     }
9723
9724 #  endif
9725
9726 }
9727
9728 #endif
9729
9730 void
9731 Perl_thread_locale_init(pTHX)
9732 {
9733
9734 #ifdef USE_THREAD_SAFE_LOCALE
9735 #  ifdef USE_POSIX_2008_LOCALE
9736
9737     /* Called from a thread on startup.
9738      *
9739      * The operations here have to be done from within the calling thread, as
9740      * they affect libc's knowledge of the thread; libc has no knowledge of
9741      * aTHX */
9742
9743      DEBUG_L(PerlIO_printf(Perl_debug_log,
9744                            "new thread, initial locale is %s;"
9745                            " calling setlocale(LC_ALL, \"C\")\n",
9746                            get_LC_ALL_display()));
9747
9748     if (! uselocale(PL_C_locale_obj)) {
9749
9750         /* Not being able to change to the C locale is severe; don't keep
9751          * going.  */
9752         locale_panic_(Perl_form(aTHX_
9753                                 "Can't uselocale(%p), 'C'", PL_C_locale_obj));
9754         NOT_REACHED; /* NOTREACHED */
9755     }
9756
9757 #    ifdef MULTIPLICITY
9758
9759     PL_cur_locale_obj = PL_C_locale_obj;
9760
9761 #    endif
9762 #  elif defined(WIN32)
9763
9764     /* On Windows, make sure new thread has per-thread locales enabled */
9765     if (_configthreadlocale(_ENABLE_PER_THREAD_LOCALE) == -1) {
9766         locale_panic_("_configthreadlocale returned an error");
9767     }
9768     void_setlocale_c(LC_ALL, "C");
9769
9770 #  endif
9771 #endif
9772
9773 }
9774
9775 void
9776 Perl_thread_locale_term(pTHX)
9777 {
9778     /* Called from a thread as it gets ready to terminate.
9779      *
9780      * The operations here have to be done from within the calling thread, as
9781      * they affect libc's knowledge of the thread; libc has no knowledge of
9782      * aTHX */
9783
9784 #if defined(USE_POSIX_2008_LOCALE) && defined(USE_THREADS)
9785
9786     /* Switch to the global locale, so can free up the per-thread object */
9787     locale_t actual_obj = uselocale(LC_GLOBAL_LOCALE);
9788     if (actual_obj != LC_GLOBAL_LOCALE && actual_obj != PL_C_locale_obj) {
9789         freelocale(actual_obj);
9790     }
9791
9792     /* Prevent leaks even if something has gone wrong */
9793     locale_t expected_obj = PL_cur_locale_obj;
9794     if (UNLIKELY(   expected_obj != actual_obj
9795                  && expected_obj != LC_GLOBAL_LOCALE
9796                  && expected_obj != PL_C_locale_obj))
9797     {
9798         freelocale(expected_obj);
9799     }
9800
9801     PL_cur_locale_obj = LC_GLOBAL_LOCALE;
9802
9803 #endif
9804 #ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES
9805
9806     /* When faking the mingw implementation, we coerce this function into doing
9807      * something completely different from its intent -- namely to free up our
9808      * static buffer to avoid a leak.  This function gets called for each
9809      * thread that is terminating, so will give us a chance to free the buffer
9810      * from the appropriate pool.  On unthreaded systems, it gets called by the
9811      * mutex termination code. */
9812
9813 #  ifdef MULTIPLICITY
9814
9815     if (aTHX != wsetlocale_buf_aTHX) {
9816         return;
9817     }
9818
9819 #  endif
9820
9821     if (wsetlocale_buf_size > 0) {
9822         Safefree(wsetlocale_buf);
9823         wsetlocale_buf_size = 0;
9824     }
9825
9826 #endif
9827
9828 }
9829
9830 /*
9831  * ex: set ts=8 sts=4 sw=4 et:
9832  */