locale.c

   1 /*    locale.c
   2  *
   3  *    Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
   4  *    2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  *      A Elbereth Gilthoniel,
  13  *      silivren penna míriel
  14  *      o menel aglar elenath!
  15  *      Na-chaered palan-díriel
  16  *      o galadhremmin ennorath,
  17  *      Fanuilos, le linnathon
  18  *      nef aear, si nef aearon!
  19  *
  20  *     [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
  21  */
  22
  23 /* utility functions for handling locale-specific stuff like what
  24  * character represents the decimal point.
  25  *
  26  * All C programs have an underlying locale.  Perl code generally doesn't pay
  27  * any attention to it except within the scope of a 'use locale'.  For most
  28  * categories, it accomplishes this by just using different operations if it is
  29  * in such scope than if not.  However, various libc functions called by Perl
  30  * are affected by the LC_NUMERIC category, so there are macros in perl.h that
  31  * are used to toggle between the current locale and the C locale depending on
  32  * the desired behavior of those functions at the moment.  And, LC_MESSAGES is
  33  * switched to the C locale for outputting the message unless within the scope
  34  * of 'use locale'.
  35  *
  36  * There is more than the typical amount of variation between platforms with
  37  * regard to locale handling.  At the end of these introductory comments, are
  38  * listed various relevent Configuration options, including some that can be
  39  * used to pretend to some extent that this is being developed on a different
  40  * platform than it actually is.  This allows you to make changes and catch
  41  * some errors without having access to those other platforms.
  42  *
  43  * This code now has multi-thread-safe locale handling on systems that support
  44  * that.  This is completely transparent to most XS code.  On earlier systems,
  45  * it would be possible to emulate thread-safe locales, but this likely would
  46  * involve a lot of locale switching, and would require XS code changes.
  47  * Macros could be written so that the code wouldn't have to know which type of
  48  * system is being used.
  49  *
  50  * Table-driven code is used for simplicity and clarity, as many operations
  51  * differ only in which category is being worked on.  However the system
  52  * categories need not be small contiguous integers, so do not lend themselves
  53  * to table lookup.  Instead we have created our own equivalent values which
  54  * are all small contiguous non-negative integers, and translation functions
  55  * between the two sets.  For category 'LC_foo', the name of our index is
  56  * LC_foo_INDEX_.  Various parallel tables, indexed by these, are used for the
  57  * translation.  The tables are generated at compile-time based on platform
  58  * characteristics and Configure options.  They hide from the code many of the
  59  * vagaries of the different locale implementations out there.
  60  *
  61  * On unthreaded perls, most operations expand out to just the basic
  62  * setlocale() calls.  That sort of is true on threaded perls on modern Windows
  63  * systems where the same API, after set up, is used for thread-safe locale
  64  * handling.  (But there are complications on Windows due to internal character
  65  * set issues.)  On other systems, there is a completely different API,
  66  * specified in POSIX 2008, to do thread-safe locales.  On these systems, our
  67  * bool_setlocale_2008_i() function is used to hide the different API from the
  68  * outside.  This makes it completely transparent to most XS code.
  69  *
  70  * A huge complicating factor is that the LC_NUMERIC category is normally held
  71  * in the C locale, except during those relatively rare times when it needs to
  72  * be in the underlying locale.  There is a bunch of code to accomplish this,
  73  * and to allow easy switches from one state to the other.
  74  *
  75  * In addition, the setlocale equivalents have versions for the return context,
  76  * 'void' and 'bool', besides the full return value.  This can present
  77  * opportunities for avoiding work.  We don't have to necessarily create a safe
  78  * copy to return if no return is desired.
  79  *
  80  * There are 3.5 major implementations here; which one chosen depends on what
  81  * the platform has available, and Configuration options.
  82  *
  83  * 1) Raw posix_setlocale().  This implementation is basically the libc
  84  *    setlocale(), with possibly minor tweaks.  This is used for startup, and
  85  *    always for unthreaded perls, and when the API for safe locale threading
  86  *    is identical to the unsafe API (Windows, currently).
  87  *
  88  *    This implementation is composed of two layers:
  89  *      a)  posix_setlocale() implements the libc setlocale().  In most cases,
  90  *          it is just an alias for the libc version.  But Windows doesn't
  91  *          fully conform to the POSIX standard, and this is a layer on top of
  92  *          libc to bring it more into conformance.  And in Configurations
  93  *          where perl is to ignore some locale categories that the libc
  94  *          setlocale() knows about, there is a layer to cope with that.
  95  *      b)  stdized_setlocale() is a layer above a) that fixes some vagaries in
  96  *          the return value of the libc setlocale().  On most platforms this
  97  *          layer is empty; in order to be activated, it requires perl to be
  98  *          Configured with a parameter indicating the platform's defect.  The
  99  *          current ones are listed at the definition of the macro.
 100  *
 101  * 2) An implementation that adds a minimal layer above implementation 1),
 102  *    making that implementation uninterruptible and returning a
 103  *    per-thread/per-category value.
 104  *
 105  * 3a and 3b) An implementation of POSIX 2008 thread-safe locale handling,
 106  *    hiding from the programmer the completely different API for this.
 107  *    This automatically makes almost all code thread-safe without need for
 108  *    changes.  This implementation is chosen on threaded perls when the
 109  *    platform properly supports the POSIX 2008 functions, and when there is no
 110  *    manual override to the contrary passed to Configure.
 111  *
 112  *    3a) is when the platform has a documented reliable querylocale() function
 113  *        or equivalent that is selected to be used.
 114  *    3b) is when we have to emulate that functionality.
 115  *
 116  *    Unfortunately, it seems that some platforms that claim to support these
 117  *    are buggy, in one way or another.  There are workarounds encoded here,
 118  *    where feasible, for platforms where the bugs are amenable to that
 119  *    (glibc, for example).  But other platforms instead don't use this
 120  *    implementation.
 121  *
 122  * z/OS (os390) is an outlier.  Locales really don't work under threads when
 123  * either the radix character isn't a dot, or attempts are made to change
 124  * locales after the first thread is created.  The reason is that IBM has made
 125  * it thread-safe by refusing to change locales (returning failure if
 126  * attempted) any time after an application has called pthread_create() to
 127  * create another thread.  The expectation is that an application will set up
 128  * its locale information before the first fork, and be stable thereafter.  But
 129  * perl toggles LC_NUMERIC if the locale's radix character isn't a dot, as do
 130  * the other toggles, which are less common.
 131  *
 132  * Associated with each implementation are three sets of macros that translate
 133  * a consistent API into what that implementation needs.  Each set consists of
 134  * three macros with the suffixes:
 135  *  _c  Means the argument is a locale category number known at compile time.
 136  *          An example would be LC_TIME.  This token is a compile-time constant
 137  *          and can be passed to a '_c' macro.
 138  *  _r  Means the argument is a locale category number whose value might not be
 139  *      known until runtime
 140  *  _i  Means the argument is our internal index of a locale category
 141  *
 142  * The three sets are:    ('_X'  means one of '_c', '_r', '_i')
 143  * 1) bool_setlocale_X()
 144  *      This calls the appropriate setlocale()-equivalent for the
 145  *      implementation, with the category and new locale.  The input locale is
 146  *      not necessarily valid, so the return is true or false depending on
 147  *      whether or not the setlocale() succeeded.  This is not used for
 148  *      querying the locale, so the input locale must not be NULL.
 149  *
 150  *      This macro is suitable for toggling the locale back and forth during an
 151  *      operation.  For example, the names of days and months under LC_TIME are
 152  *      strings that are also subject to LC_CTYPE.  If the locales of these two
 153  *      categories differ, mojibake can result on many platforms.  The code
 154  *      here will toggle LC_CTYPE into the locale of LC_TIME temporarily to
 155  *      avoid this.
 156  *
 157  *      Several categories require extra work when their locale is changed.
 158  *      LC_CTYPE, for example, requires the calculation of the table of which
 159  *      characters fold to which others under /i pattern matching or fc(), as
 160  *      folding is not a concept in POSIX.  This table isn't needed when the
 161  *      LC_CTYPE locale gets toggled during an operation, and will be toggled
 162  *      back before return to the caller.  To save work that would be
 163  *      discarded, the bool_setlocale_X() implementations don't do this extra
 164  *      work.  Instead, there is a separate function for just this purpose to
 165  *      be done before control is transferred back to the external caller.  All
 166  *      categories that have such requirements have such a function.  The
 167  *      update_functions[] array contains pointers to them (or NULL for
 168  *      categories which don't need a function).
 169  *
 170  *      Care must be taken to remember to call the separate function before
 171  *      returning to an external caller, and to not use things it updates
 172  *      before its call.  An alternative approach would be to have
 173  *      bool_setlocale_X() always call the update, which would return
 174  *      immediately if a flag wasn't set indicating it was time to actually
 175  *      perform it.
 176  *
 177  * 2) void_setlocale_X()
 178  *      This is like bool_setlocale_X(), but it is used only when it is
 179  *      expected that the call must succeed, or something is seriously wrong.
 180  *      A panic is issued if it fails.  The caller uses this form when it just
 181  *      wants to assume things worked.
 182  *
 183  * 3) querylocale_X()
 184  *      This returns a string that specifies the current locale for the given
 185  *      category given by the input argument.  The string is safe from other
 186  *      threads zapping it, and the caller need not worry about freeing it, but
 187  *      it may be mortalized, so must be copied if you need to preserve it
 188  *      across calls, or long term.  This returns the actual current locale,
 189  *      not the nominal.  These differ, for example, when LC_NUMERIC is
 190  *      supposed to be a locale whose decimal radix character is a comma.  As
 191  *      mentioned above, Perl actually keeps this category set to C in such
 192  *      circumstances so that XS code can just assume a dot radix character.
 193  *      querylocale_X() returns the locale that libc has stored at this moment,
 194  *      so most of the time will return a locale whose radix character is a
 195  *      dot.  The macro query_nominal_locale_i() can be used to get the nominal
 196  *      locale that an external caller would expect, for all categories except
 197  *      LC_ALL.  For that, you can use the function
 198  *      S_calculate_LC_ALL_string().  Or S_native_querylocale_i() will operate
 199  *      on any category.
 200  *
 201  * The underlying C API that this implements uses category numbers, hence the
 202  * code is structured to use '_r' at the API level to convert to indexes, which
 203  * are then used internally with the '_i' forms.
 204  *
 205  * The splitting apart into setting vs querying means that the return value of
 206  * the bool macros is not subject to potential clashes with other threads,
 207  * eliminating any need for the calling code to worry about that and get it
 208  * wrong.  Whereas, you do have to think about thread interactions when using a
 209  * query.
 210  *
 211  * Additionally, for the implementations where there aren't any complications,
 212  * a setlocale_i() is defined that is like plain setlocale(), returning the new
 213  * locale.  Thus it combines a bool_setlocale_X() with a querylocale_X().  It
 214  * is used only for performance on implementations that allow it, such as
 215  * non-threaded perls.
 216  *
 217  * There are also a few other macros herein that use this naming convention to
 218  * describe their category parameter.
 219  *
 220  * Relevant Configure options
 221  *
 222  *      -Accflags=-DNO_LOCALE
 223  *          This compiles perl to always use the C locale, ignoring any
 224  *          attempts to change it.  This could be useful on platforms with a
 225  *          crippled locale implementation.
 226  *
 227  *      -Accflags=-DNO_THREAD_SAFE_LOCALE
 228  *          Even if thread-safe operations are available on this platform and
 229  *          would otherwise be used (because this is a perl with multiplicity),
 230  *          perl is compiled to not use them.  This could be useful on
 231  *          platforms where the libc is buggy.
 232  *
 233  *      -Accflags=-DNO_POSIX_2008_LOCALE
 234  *          Even if the libc locale operations specified by the Posix 2008
 235  *          Standard are available on this platform and would otherwise be used
 236  *          (because this is a perl with multiplicity), perl is compiled to not
 237  *          use them.  This could be useful on platforms where the libc is
 238  *          buggy.  This is like NO_THREAD_SAFE_LOCALE, but has no effect on
 239  *          platforms that don't have these functions.
 240  *
 241  *      -Accflags=-DUSE_POSIX_2008_LOCALE
 242  *          Normally, setlocale() is used for locale operations on perls
 243  *          compiled without multiplicity.  This option causes the locale
 244  *          operations defined by the Posix 2008 Standard to always be used
 245  *          instead.  This could be useful on platforms where the libc
 246  *          setlocale() is buggy.
 247  *
 248  *      -Accflags=-DNO_THREAD_SAFE_QUERYLOCALE
 249  *          This applies only to platforms that have a querylocale() libc
 250  *          function.  perl assumes that that function is thread-safe, unless
 251  *          overridden by this, typically in a hints file.  When overridden,
 252  *          querylocale() is called only while the locale mutex is locked, and
 253  *          the result is copied to a per-thread place before unlocking.
 254  *
 255  *      -Accflags=-DUSE_NL_LOCALE_NAME
 256  *          glibc has an undocumented equivalent function to querylocale().  It
 257  *          currently isn't used by default because it is undocumented.  But
 258  *          testing hasn't found any problems with it.  Using this Configure
 259  *          option enables it on systems that have it (with no effect on
 260  *          systems lacking it).  Enabling this removes the need for perl
 261  *          to keep its own records, hence is more efficient and guaranteed to
 262  *          be accurate.
 263  *
 264  *      -Accflags=-DNO_LOCALE_CTYPE
 265  *      -Accflags=-DNO_LOCALE_NUMERIC
 266  *          etc.
 267  *
 268  *          If the named category(ies) does(do) not exist on this platform,
 269  *          these have no effect.  Otherwise they cause perl to be compiled to
 270  *          always keep the named category(ies) in the C locale.
 271  *
 272  *      -Accflags=-DHAS_BROKEN_SETLOCALE_QUERY_LC_ALL
 273  *          This would be set in a hints file to tell perl that doing a libc
 274  *              setlocale(LC_ALL, NULL)
 275  *          can give erroneous results, and perl will compensate to get the
 276  *          correct results.  This is known to be a problem in earlier AIX
 277  *          versions
 278  *
 279  *      -Accflags=-DHAS_LF_IN_SETLOCALE_RETURN
 280  *          This would be set in a hints file to tell perl that a libc
 281  *          setlocale() can return results containing \n characters that need
 282  *          to be stripped off.  khw believes there aren't any such platforms
 283  *          still in existence.
 284  *
 285  *      -Accflags=-DLIBC_HANDLES_MISMATCHED_CTYPE
 286  *          Consider the name of a month in some language, Chinese for example.
 287  *          If LC_TIME has been set to a Chinese locale, strftime() can be used
 288  *          to generate the Chinese month name for any given date, by using the
 289  *          %B format.  But also suppose that LC_CTYPE is set to, say, "C".
 290  *          The return from strftime() on many platforms will be mojibake given
 291  *          that no Chinese month name is composed of just ASCII characters.
 292  *          Perl handles this for you by automatically toggling LC_CTYPE to
 293  *          whatever LC_TIME is during the execution of strftime(), and
 294  *          afterwards restoring it to its prior value.  But the strftime()
 295  *          (and similar functions) in some libc implementations already do
 296  *          this toggle, meaning perl's action is redundant.  You can tell perl
 297  *          that a libc does this by setting this Configure option, and it will
 298  *          skip its syncing LC_CTYPE and whatever the other locale is.
 299  *          Currently, perl ignores this Configuration option and  syncs anyway
 300  *          for LC_COLLATE-related operations, due to perl's internal needs.
 301  *
 302  *      -Accflags=USE_FAKE_LC_ALL_POSITIONAL_NOTATION
 303  *          This is used when developing Perl on a platform that uses
 304  *          'name=value;' notation to represent LC_ALL when not all categories
 305  *          are the same.  When so compiled, much of the code gets compiled
 306  *          and exercised that applies to platforms that instead use positional
 307  *          notation.  This allows for finding many bugs in that portion of the
 308  *          implementation, without having to access such a platform.
 309  *
 310  *      -Accflags=-DWIN32_USE_FAKE_OLD_MINGW_LOCALES
 311  *          This is used when developing Perl on a non-Windows platform to
 312  *          compile and exercise much of the locale-related code that instead
 313  *          applies to MingW platforms that don't use the more modern UCRT
 314  *          library.  This allows for finding many bugs in that portion of the
 315  *          implementation, without having to access such a platform.
 316  */
 317
 318 /* If the environment says to, we can output debugging information during
 319  * initialization.  This is done before option parsing, and before any thread
 320  * creation, so can be a file-level static.  (Must come before #including
 321  * perl.h) */
 322 #include "config.h"
 323
 324 /* Returns the Unix errno portion; ignoring any others.  This is a macro here
 325  * instead of putting it into perl.h, because unclear to khw what should be
 326  * done generally. */
 327 #define GET_ERRNO   saved_errno
 328
 329 #ifdef DEBUGGING
 330 static int debug_initialization = 0;
 331 #  define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
 332 #  define DEBUG_LOCALE_INITIALIZATION_  debug_initialization
 333
 334 #  ifdef HAS_EXTENDED_OS_ERRNO
 335      /* Output the non-zero errno and/or the non-zero extended errno */
 336 #    define DEBUG_ERRNO                                                     \
 337         dSAVE_ERRNO; dTHX;                                                  \
 338         int extended = get_extended_os_errno();                             \
 339         const char * errno_string;                                          \
 340         if (GET_ERRNO == 0) { /* Skip output if both errno types are 0 */   \
 341             if (LIKELY(extended == 0)) errno_string = "";                   \
 342             else errno_string = Perl_form(aTHX_ "; $^E=%d", extended);      \
 343         }                                                                   \
 344         else if (LIKELY(extended == GET_ERRNO))                             \
 345             errno_string = Perl_form(aTHX_ "; $!=%d", GET_ERRNO);           \
 346         else errno_string = Perl_form(aTHX_ "; $!=%d, $^E=%d",              \
 347                                                     GET_ERRNO, extended);
 348 #  else
 349      /* Output the errno, if non-zero */
 350 #    define DEBUG_ERRNO                                                     \
 351         dSAVE_ERRNO;                                                        \
 352         const char * errno_string = "";                                     \
 353         if (GET_ERRNO != 0) {                                               \
 354             dTHX;                                                           \
 355             errno_string = Perl_form(aTHX_ "; $!=%d", GET_ERRNO);           \
 356         }
 357 #  endif
 358
 359 /* Automatically include the caller's file, and line number in debugging output;
 360  * and the errno (and/or extended errno) if non-zero.  On threaded perls add
 361  * the aTHX too. */
 362 #  if defined(USE_ITHREADS) && ! defined(NO_LOCALE_THREADS)
 363 #    define DEBUG_PRE_STMTS                                                 \
 364         DEBUG_ERRNO;                                                        \
 365         PerlIO_printf(Perl_debug_log, "\n%s: %" LINE_Tf ": 0x%p%s: ",       \
 366                                       __FILE__, (line_t)__LINE__, aTHX_     \
 367                                       errno_string);
 368 #  else
 369 #    define DEBUG_PRE_STMTS                                                 \
 370         DEBUG_ERRNO;                                                        \
 371         PerlIO_printf(Perl_debug_log, "\n%s: %" LINE_Tf "%s: ",             \
 372                                       __FILE__, (line_t)__LINE__,           \
 373                                       errno_string);
 374 #  endif
 375 #  define DEBUG_POST_STMTS  RESTORE_ERRNO;
 376 #else
 377 #  define debug_initialization 0
 378 #  define DEBUG_INITIALIZATION_set(v)
 379 #  define DEBUG_PRE_STMTS
 380 #  define DEBUG_POST_STMTS
 381 #endif
 382
 383 #include "EXTERN.h"
 384 #define PERL_IN_LOCALE_C
 385 #include "perl.h"
 386
 387 /* Some platforms require LC_CTYPE to be congruent with the category we are
 388  * looking for.  XXX This still presumes that we have to match COLLATE and
 389  * CTYPE even on platforms that apparently handle this. */
 390 #if defined(USE_LOCALE_CTYPE) && ! defined(LIBC_HANDLES_MISMATCHED_CTYPE)
 391 #  define WE_MUST_DEAL_WITH_MISMATCHED_CTYPE
 392 #endif
 393
 394 #if PERL_VERSION_GT(5,39,9)
 395 #  error Revert the commit that added this line
 396 #endif
 397
 398 #ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES
 399
 400    /* Use -Accflags=-DWIN32_USE_FAKE_OLD_MINGW_LOCALES on a POSIX or *nix box
 401     * to get a semblance of pretending the locale handling is that of a MingW
 402     * that doesn't use UCRT (hence 'OLD' in the name).  This exercizes code
 403     * paths that are not compiled on non-Windows boxes, and allows for ASAN
 404     * and PERL_MEMLOG.  This is thus a way to see if locale.c on Windows is
 405     * likely going to compile, without having to use a real Win32 box.  And
 406     * running the test suite will verify to a large extent our logic and memory
 407     * allocation handling for such boxes.  Of course the underlying calls are
 408     * to the POSIX libc, so any differences in implementation between those and
 409     * the Windows versions will not be caught by this. */
 410
 411 #  define WIN32
 412 #  undef P_CS_PRECEDES
 413 #  undef CURRENCY_SYMBOL
 414 #  define CP_UTF8 -1
 415 #  undef _configthreadlocale
 416 #  define _configthreadlocale(arg) NOOP
 417
 418 #  define MultiByteToWideChar(cp, flags, byte_string, m1, wstring, req_size) \
 419                     (PERL_UNUSED_ARG(cp),                                    \
 420                      mbsrtowcs(wstring, &(byte_string), req_size, NULL) + 1)
 421 #  define WideCharToMultiByte(cp, flags, wstring, m1, byte_string,          \
 422                               req_size, default_char, found_default_char)   \
 423                     (PERL_UNUSED_ARG(cp),                                   \
 424                      wcsrtombs(byte_string, &(wstring), req_size, NULL) + 1)
 425
 426 #  ifdef USE_LOCALE
 427
 428 static const wchar_t * wsetlocale_buf = NULL;
 429 static Size_t wsetlocale_buf_size = 0;
 430
 431 #    ifdef MULTIPLICITY
 432
 433 static PerlInterpreter * wsetlocale_buf_aTHX = NULL;
 434
 435 #    endif
 436
 437 STATIC
 438 const wchar_t *
 439 S_wsetlocale(const int category, const wchar_t * wlocale)
 440 {
 441     /* Windows uses a setlocale that takes a wchar_t* locale.  Other boxes
 442      * don't have this, so this Windows replacement converts the wchar_t input
 443      * to plain 'char*', calls plain setlocale(), and converts the result back
 444      * to 'wchar_t*' */
 445
 446     const char * byte_locale = NULL;
 447     if (wlocale) {
 448         byte_locale = Win_wstring_to_byte_string(CP_UTF8, wlocale);
 449     }
 450
 451     const char * byte_result = setlocale(category, byte_locale);
 452     Safefree(byte_locale);
 453     if (byte_result == NULL) {
 454         return NULL;
 455     }
 456
 457     const wchar_t * wresult = Win_byte_string_to_wstring(CP_UTF8, byte_result);
 458
 459     if (! wresult) {
 460         return NULL;
 461     }
 462
 463     /* Emulate a global static memory return from wsetlocale().  This currently
 464      * leaks at process end; would require changing LOCALE_TERM to fix that */
 465     Size_t string_size = wcslen(wresult) + 1;
 466
 467     if (wsetlocale_buf_size == 0) {
 468         Newx(wsetlocale_buf, string_size, wchar_t);
 469         wsetlocale_buf_size = string_size;
 470
 471 #  ifdef MULTIPLICITY
 472
 473         dTHX;
 474         wsetlocale_buf_aTHX = aTHX;
 475
 476 #  endif
 477
 478     }
 479     else if (string_size > wsetlocale_buf_size) {
 480         Renew(wsetlocale_buf, string_size, wchar_t);
 481         wsetlocale_buf_size = string_size;
 482     }
 483
 484     Copy(wresult, wsetlocale_buf, string_size, wchar_t);
 485     Safefree(wresult);
 486
 487     return wsetlocale_buf;
 488 }
 489
 490 #  define _wsetlocale(category, wlocale)  S_wsetlocale(category, wlocale)
 491 #  endif
 492 #endif  /* WIN32_USE_FAKE_OLD_MINGW_LOCALES */
 493
 494 /* 'for' loop headers to hide the necessary casts */
 495 #define for_all_individual_category_indexes(i)                              \
 496     for (locale_category_index i = (locale_category_index) 0;               \
 497          i < LC_ALL_INDEX_;                                                 \
 498          i = (locale_category_index) ((int) i + 1))
 499
 500 #define for_all_but_0th_individual_category_indexes(i)                      \
 501     for (locale_category_index i = (locale_category_index) 1;               \
 502          i < LC_ALL_INDEX_;                                                 \
 503          i = (locale_category_index) ((int) i + 1))
 504
 505 #define for_all_category_indexes(i)                                         \
 506     for (locale_category_index i = (locale_category_index) 0;               \
 507          i <= LC_ALL_INDEX_;                                                \
 508          i = (locale_category_index) ((int) i + 1))
 509
 510 #ifdef USE_LOCALE
 511 #  if defined(USE_FAKE_LC_ALL_POSITIONAL_NOTATION) && defined(LC_ALL)
 512
 513 /* This simulates an underlying positional notation for LC_ALL when compiled on
 514  * a system that uses name=value notation.  Use this to develop on Linux and
 515  * make a quick check that things have some chance of working on a positional
 516  * box.  Enable by adding to the Congfigure parameters:
 517  *      -Accflags=USE_FAKE_LC_ALL_POSITIONAL_NOTATION
 518  *
 519  * NOTE it redefines setlocale() and usequerylocale()
 520  * */
 521
 522 STATIC const char *
 523 S_positional_name_value_xlation(const char * locale, bool direction)
 524 {   /* direction == 1 is from name=value to positional
 525        direction == 0 is from positional to name=value */
 526     assert(locale);
 527
 528     dTHX;
 529     const char * individ_locales[LC_ALL_INDEX_] = { NULL };
 530
 531     /* This parses either notation */
 532     switch (parse_LC_ALL_string(locale,
 533                                 (const char **) &individ_locales,
 534                                 no_override,  /* Handled by other code */
 535                                 false,      /* Return only [0] if suffices */
 536                                 false,      /* Don't panic on error */
 537                                 __LINE__))
 538     {
 539       default:      /* Some compilers don't realize that below is the complete
 540                        list of the available enum values */
 541       case invalid:
 542         return NULL;
 543
 544       case no_array:
 545         return locale;
 546       case only_element_0:
 547         SAVEFREEPV(individ_locales[0]);
 548         return individ_locales[0];
 549       case full_array:
 550        {
 551         calc_LC_ALL_format  format = (direction)
 552                                      ? EXTERNAL_FORMAT_FOR_SET
 553                                      : INTERNAL_FORMAT;
 554         const char * retval = calculate_LC_ALL_string(individ_locales,
 555                                                       format,
 556                                                       WANT_TEMP_PV,
 557                                                       __LINE__);
 558
 559         for_all_individual_category_indexes(i) {
 560             Safefree(individ_locales[i]);
 561         }
 562
 563         return retval;
 564        }
 565     }
 566 }
 567
 568 STATIC const char *
 569 S_positional_setlocale(int cat, const char * locale)
 570 {
 571     if (cat != LC_ALL) return setlocale(cat, locale);
 572
 573     if (locale && strNE(locale, "")) {
 574         locale = S_positional_name_value_xlation(locale, 0);
 575         if (! locale) return NULL;
 576     }
 577
 578     locale = setlocale(cat, locale);
 579     if (locale == NULL) return NULL;
 580     return S_positional_name_value_xlation(locale, 1);
 581 }
 582
 583 #    undef setlocale
 584 #    define setlocale(a,b)  S_positional_setlocale(a,b)
 585 #    ifdef USE_POSIX_2008_LOCALE
 586
 587 STATIC locale_t
 588 S_positional_newlocale(int mask, const char * locale, locale_t base)
 589 {
 590     assert(locale);
 591
 592     if (mask != LC_ALL_MASK) return newlocale(mask, locale, base);
 593
 594     if (strNE(locale, "")) locale = S_positional_name_value_xlation(locale, 0);
 595     if (locale == NULL) return NULL;
 596     return newlocale(LC_ALL_MASK, locale, base);
 597 }
 598
 599 #    undef newlocale
 600 #    define newlocale(a,b,c)  S_positional_newlocale(a,b,c)
 601 #    endif
 602 #  endif
 603 #endif  /* End of fake positional notation */
 604
 605 #include "reentr.h"
 606
 607 #ifdef I_WCHAR
 608 #  include <wchar.h>
 609 #endif
 610 #ifdef I_WCTYPE
 611 #  include <wctype.h>
 612 #endif
 613
 614  /* The main errno that gets used is this one, on platforms that support it */
 615 #ifdef EINVAL
 616 #  define SET_EINVAL  SETERRNO(EINVAL, LIB_INVARG)
 617 #else
 618 #  define SET_EINVAL
 619 #endif
 620
 621 /* This is a starting guess as to when this is true.  It definititely isn't
 622  * true on *BSD where positional LC_ALL notation is used.  Likely this will end
 623  * up being defined in hints files. */
 624 #ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
 625 #  define NEWLOCALE_HANDLES_DISPARATE_LC_ALL
 626 #endif
 627
 628 /* But regardless, we have to look at individual categories if some are
 629  * ignored.  */
 630 #ifdef HAS_IGNORED_LOCALE_CATEGORIES_
 631 #  undef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
 632 #endif
 633 #ifdef USE_LOCALE
 634
 635 /* Not all categories need be set to the same locale.  This macro determines if
 636  * 'name' which represents LC_ALL is uniform or disparate.  There are two
 637  * situations: 1) the platform uses unordered name=value pairs; 2) the platform
 638  * uses ordered positional values, with a separator string between them */
 639 #  ifdef PERL_LC_ALL_SEPARATOR   /* positional */
 640 #    define is_disparate_LC_ALL(name)  cBOOL(instr(name, PERL_LC_ALL_SEPARATOR))
 641 #  else     /* name=value */
 642
 643     /* In the, hopefully never occurring, event that the platform doesn't use
 644      * either mechanism for disparate LC_ALL's, assume the name=value pairs
 645      * form, rather than taking the extreme step of refusing to compile.  Many
 646      * programs won't have disparate locales, so will generally work */
 647 #    define PERL_LC_ALL_SEPARATOR  ";"
 648 #    define is_disparate_LC_ALL(name)  cBOOL(   strchr(name, ';')   \
 649                                              && strchr(name, '='))
 650 #  endif
 651
 652 /* It is possible to compile perl to always keep any individual category in the
 653  * C locale.  This would be done where the implementation on a platform is
 654  * flawed or incomplete.  At the time of this writing, for example, OpenBSD has
 655  * not implemented LC_COLLATE beyond the C locale.  The 'category_available[]'
 656  * table is a bool that says whether a category is changeable, or must be kept
 657  * in C.  This macro substitutes C for the locale appropriately, expanding to
 658  * nothing on the more typical case where all possible categories present on
 659  * the platform are handled. */
 660 #  ifdef HAS_IGNORED_LOCALE_CATEGORIES_
 661 #    define need_to_override_category(i)  (! category_available[i])
 662 #    define override_ignored_category(i, new_locale)                        \
 663                     ((need_to_override_category(i)) ? "C" : (new_locale))
 664 #  else
 665 #    define need_to_override_category(i)  0
 666 #    define override_ignored_category(i, new_locale)  (new_locale)
 667 #  endif
 668
 669 PERL_STATIC_INLINE const char *
 670 S_mortalized_pv_copy(pTHX_ const char * const pv)
 671 {
 672     PERL_ARGS_ASSERT_MORTALIZED_PV_COPY;
 673
 674     /* Copies the input pv, and arranges for it to be freed at an unspecified
 675      * later time. */
 676
 677     if (pv == NULL) {
 678         return NULL;
 679     }
 680
 681     const char * copy = savepv(pv);
 682     SAVEFREEPV(copy);
 683     return copy;
 684 }
 685
 686 #endif
 687
 688 /* Default values come from the C locale */
 689 #define C_codeset "ANSI_X3.4-1968" /* Only in some Configurations, and usually
 690                                       a single instance, so is a #define */
 691 static const char C_decimal_point[] = ".";
 692
 693 #if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO)
 694 #  define HAS_SOME_LANGINFO
 695 #endif
 696
 697 #if (defined(USE_LOCALE_NUMERIC) && ! defined(TS_W32_BROKEN_LOCALECONV))    \
 698  || ! (   defined(USE_LOCALE_NUMERIC)                                       \
 699        && (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV)))
 700 static const char C_thousands_sep[] = "";
 701 #endif
 702
 703 /* Is the C string input 'name' "C" or "POSIX"?  If so, and 'name' is the
 704  * return of setlocale(), then this is extremely likely to be the C or POSIX
 705  * locale.  However, the output of setlocale() is documented to be opaque, but
 706  * the odds are extremely small that it would return these two strings for some
 707  * other locale.  Note that VMS includes many non-ASCII characters in these two
 708  * locales as controls and punctuation (below are hex bytes):
 709  *   cntrl:  84-97 9B-9F
 710  *   punct:  A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
 711  * Oddly, none there are listed as alphas, though some represent alphabetics
 712  * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
 713 #define isNAME_C_OR_POSIX(name)                                              \
 714                              (   (name) != NULL                              \
 715                               && (( *(name) == 'C' && (*(name + 1)) == '\0') \
 716                                    || strEQ((name), "POSIX")))
 717
 718 #ifndef my_langinfo_i
 719 #  define my_langinfo_i(i, c, l, b, s, u)                                   \
 720                     (PERL_UNUSED_VAR(c), emulate_langinfo(i, l, b, s, u))
 721 #endif
 722 #define my_langinfo_c(item, category, locale, retbufp, retbuf_sizep, utf8ness) \
 723             my_langinfo_i(item, category##_INDEX_, locale, retbufp,            \
 724                                                       retbuf_sizep,  utf8ness)
 725 #ifndef USE_LOCALE  /* A no-op unless locales are enabled */
 726 #  define toggle_locale_i(index, locale)    NULL
 727 #  define restore_toggled_locale_i(index, locale)  PERL_UNUSED_VAR(locale)
 728 #else
 729 #  define toggle_locale_i(index, locale)                                    \
 730                  S_toggle_locale_i(aTHX_ index, locale, __LINE__)
 731 #  define restore_toggled_locale_i(index, locale)                           \
 732                  S_restore_toggled_locale_i(aTHX_ index, locale, __LINE__)
 733 #endif
 734
 735 #  define toggle_locale_c(cat, locale)  toggle_locale_i(cat##_INDEX_, locale)
 736 #  define restore_toggled_locale_c(cat, locale)                             \
 737                              restore_toggled_locale_i(cat##_INDEX_, locale)
 738 #ifdef USE_LOCALE
 739 #  ifdef DEBUGGING
 740 #    define setlocale_debug_string_i(index, locale, result)                 \
 741             my_setlocale_debug_string_i(index, locale, result, __LINE__)
 742 #    define setlocale_debug_string_c(category, locale, result)              \
 743                 setlocale_debug_string_i(category##_INDEX_, locale, result)
 744 #    define setlocale_debug_string_r(category, locale, result)              \
 745              setlocale_debug_string_i(get_category_index(category),         \
 746                                       locale, result)
 747 #  endif
 748
 749 /* On systems without LC_ALL, pretending it exists anyway simplifies things.
 750  * Choose a value for it that is very unlikely to clash with any actual
 751  * category */
 752 #  define FAKE_LC_ALL  PERL_INT_MIN
 753
 754 /* Below are parallel arrays for locale information indexed by our mapping of
 755  * category numbers into small non-negative indexes.  locale_table.h contains
 756  * an entry like this for each individual category used on this system:
 757  *      PERL_LOCALE_TABLE_ENTRY(CTYPE, S_new_ctype)
 758  *
 759  * Each array redefines PERL_LOCALE_TABLE_ENTRY to generate the information
 760  * needed for that array, and #includes locale_table.h to get the valid
 761  * categories.
 762  *
 763  * An entry for the conglomerate category LC_ALL is added here, immediately
 764  * following the individual categories.  (The treatment for it varies, so can't
 765  * be in locale_table.h.)
 766  *
 767  * Following this, each array ends with an entry for illegal categories.  All
 768  * category numbers unknown to perl get mapped to this entry.  This is likely
 769  * to be a parameter error from the calling program; but it could be that this
 770  * platform has a category we don't know about, in which case it needs to be
 771  * added, using the paradigm of one of the existing categories. */
 772
 773 /* The first array is the locale categories perl uses on this system, used to
 774  * map our index back to the system's category number. */
 775 STATIC const int categories[] = {
 776
 777 #  undef PERL_LOCALE_TABLE_ENTRY
 778 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)  LC_ ## name,
 779 #  include "locale_table.h"
 780
 781 #  ifdef LC_ALL
 782     LC_ALL,
 783 #  else
 784     FAKE_LC_ALL,
 785 #  endif
 786
 787    (FAKE_LC_ALL + 1)    /* Entry for unknown category; this number is unlikely
 788                            to clash with a real category */
 789 };
 790
 791 # define GET_NAME_AS_STRING(token)  # token
 792 # define GET_LC_NAME_AS_STRING(token) GET_NAME_AS_STRING(LC_ ## token)
 793
 794 /* The second array is the category names. */
 795 STATIC const char * const category_names[] = {
 796
 797 #  undef PERL_LOCALE_TABLE_ENTRY
 798 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)  GET_LC_NAME_AS_STRING(name),
 799 #  include "locale_table.h"
 800
 801 #  ifdef LC_ALL
 802 #    define LC_ALL_STRING  "LC_ALL"
 803 #  else
 804 #    define LC_ALL_STRING  "If you see this, it is a bug in perl;"      \
 805                            " please report it via perlbug"
 806 #  endif
 807
 808     LC_ALL_STRING,
 809
 810 #  define LC_UNKNOWN_STRING  "Locale category unknown to Perl; if you see"  \
 811                              " this, it is a bug in perl; please report it" \
 812                              " via perlbug"
 813     LC_UNKNOWN_STRING
 814 };
 815
 816 STATIC const Size_t category_name_lengths[] = {
 817
 818 #  undef PERL_LOCALE_TABLE_ENTRY
 819 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)                          \
 820                                     STRLENs(GET_LC_NAME_AS_STRING(name)),
 821 #  include "locale_table.h"
 822
 823     STRLENs(LC_ALL_STRING),
 824     STRLENs(LC_UNKNOWN_STRING)
 825 };
 826
 827 /* Each entry includes space for the '=' and ';' */
 828 #  undef PERL_LOCALE_TABLE_ENTRY
 829 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)                          \
 830                                 + STRLENs(GET_LC_NAME_AS_STRING(name)) + 2
 831
 832 STATIC const Size_t lc_all_boiler_plate_length = 1  /* space for trailing NUL */
 833 #  include "locale_table.h"
 834 ;
 835
 836 /* A few categories require additional setup when they are changed.  This table
 837  * points to the functions that do that setup */
 838 STATIC void (*update_functions[]) (pTHX_ const char *, bool force) = {
 839
 840 #  undef PERL_LOCALE_TABLE_ENTRY
 841 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)  call_back,
 842 #  include "locale_table.h"
 843
 844     S_new_LC_ALL,
 845     NULL,   /* No update for unknown category */
 846 };
 847
 848 #  if defined(HAS_IGNORED_LOCALE_CATEGORIES_)
 849
 850 /* Indicates if each category on this platform is available to use not in
 851  * the C locale */
 852 STATIC const bool category_available[] = {
 853
 854 #  undef PERL_LOCALE_TABLE_ENTRY
 855 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)  LC_ ## name ## _AVAIL_,
 856 #  include "locale_table.h"
 857
 858 #  ifdef LC_ALL
 859     true,
 860 #  else
 861     false,
 862 #  endif
 863
 864     false   /* LC_UNKNOWN_AVAIL_ */
 865 };
 866
 867 #  endif
 868 #  if defined(USE_POSIX_2008_LOCALE)
 869
 870 STATIC const int category_masks[] = {
 871
 872 #    undef PERL_LOCALE_TABLE_ENTRY
 873 #    define PERL_LOCALE_TABLE_ENTRY(name, call_back)  LC_ ## name ## _MASK,
 874 #    include "locale_table.h"
 875
 876     LC_ALL_MASK,    /* Will rightly refuse to compile unless this is defined */
 877     0               /* Empty mask for unknown category */
 878 };
 879
 880 #  endif
 881 #  if ! defined(PERL_LC_ALL_USES_NAME_VALUE_PAIRS)
 882
 883 /* On platforms that use positional notation for expressing LC_ALL, this maps
 884  * the position of each category to our corresponding internal index for it.
 885  * This is initialized at run time if needed.  LC_ALL_INDEX_ is not legal for
 886  * an individual locale, hence marks the elements here as not actually
 887  * initialized. */
 888 STATIC
 889 unsigned int
 890 map_LC_ALL_position_to_index[LC_ALL_INDEX_] = { LC_ALL_INDEX_ };
 891
 892 #  endif
 893 #endif
 894 #if defined(USE_LOCALE) || defined(DEBUGGING)
 895
 896 STATIC const char *
 897 S_get_displayable_string(pTHX_
 898                          const char * const s,
 899                          const char * const e,
 900                          const bool is_utf8)
 901 {
 902     PERL_ARGS_ASSERT_GET_DISPLAYABLE_STRING;
 903
 904     if (e <= s) {
 905         return "";
 906     }
 907
 908     const char * t = s;
 909     bool prev_was_printable = TRUE;
 910     bool first_time = TRUE;
 911     char * ret;
 912
 913     /* Worst case scenario: All are non-printable so have a blank between each.
 914      * If UTF-8, all are the largest possible code point; otherwise all are a
 915      * single byte.  '(2 + 1)'  is from each byte takes 2 characters to
 916      * display, and a blank (or NUL for the final one) after it */
 917     const Size_t size = (e - s) * (2 + 1) * ((is_utf8) ? UVSIZE : 1);
 918     Newxz(ret, size, char);
 919     SAVEFREEPV(ret);
 920
 921     while (t < e) {
 922         UV cp = (is_utf8)
 923                 ?  utf8_to_uvchr_buf((U8 *) t, e, NULL)
 924                 : * (U8 *) t;
 925         if (isPRINT(cp)) {
 926             if (! prev_was_printable) {
 927                 my_strlcat(ret, " ", size);
 928             }
 929
 930             /* Escape these to avoid any ambiguity */
 931             if (cp == ' ' || cp == '\\') {
 932                 my_strlcat(ret, "\\", size);
 933             }
 934             my_strlcat(ret, Perl_form(aTHX_ "%c", (U8) cp), size);
 935             prev_was_printable = TRUE;
 936         }
 937         else {
 938             if (! first_time) {
 939                 my_strlcat(ret, " ", size);
 940             }
 941             my_strlcat(ret, Perl_form(aTHX_ "%02" UVXf, cp), size);
 942             prev_was_printable = FALSE;
 943         }
 944         t += (is_utf8) ? UTF8SKIP(t) : 1;
 945         first_time = FALSE;
 946     }
 947
 948     return ret;
 949 }
 950
 951 #endif
 952 #ifdef USE_LOCALE
 953
 954 # define get_category_index(cat) get_category_index_helper(cat, NULL, __LINE__)
 955
 956 STATIC locale_category_index
 957 S_get_category_index_helper(pTHX_ const int category, bool * succeeded,
 958                                   const line_t caller_line)
 959 {
 960     PERL_ARGS_ASSERT_GET_CATEGORY_INDEX_HELPER;
 961
 962     /* Given a category, return the equivalent internal index we generally use
 963      * instead, warn or panic if not found. */
 964
 965     locale_category_index i;
 966
 967 #  undef PERL_LOCALE_TABLE_ENTRY
 968 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)                          \
 969                     case LC_ ## name: i =  LC_ ## name ## _INDEX_; break;
 970
 971     switch (category) {
 972
 973 #  include "locale_table.h"
 974 #  ifdef LC_ALL
 975       case LC_ALL: i =  LC_ALL_INDEX_; break;
 976 #  endif
 977
 978       default: goto unknown_locale;
 979     }
 980
 981     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
 982                            "index of category %d (%s) is %d;"
 983                            " called from %" LINE_Tf "\n",
 984                            category, category_names[i], i, caller_line));
 985
 986     if (succeeded) {
 987         *succeeded = true;
 988     }
 989
 990     return i;
 991
 992   unknown_locale:
 993
 994     if (succeeded) {
 995         *succeeded = false;
 996         return LC_ALL_INDEX_;   /* Arbitrary */
 997     }
 998
 999     locale_panic_via_(Perl_form(aTHX_ "Unknown locale category %d", category),
1000                       __FILE__, caller_line);
1001     NOT_REACHED; /* NOTREACHED */
1002 }
1003
1004 #endif /* ifdef USE_LOCALE */
1005
1006 void
1007 Perl_force_locale_unlock(pTHX)
1008 {
1009     /* Remove any locale mutex, in preperation for an inglorious termination,
1010      * typically a  panic */
1011
1012 #if defined(USE_LOCALE_THREADS)
1013
1014     /* If recursively locked, clear all at once */
1015     if (PL_locale_mutex_depth > 1) {
1016         PL_locale_mutex_depth = 1;
1017     }
1018
1019     if (PL_locale_mutex_depth > 0) {
1020         LOCALE_UNLOCK_;
1021     }
1022
1023 #endif
1024
1025 }
1026
1027 #ifdef USE_POSIX_2008_LOCALE
1028
1029 STATIC locale_t
1030 S_use_curlocale_scratch(pTHX)
1031 {
1032     /* This function is used to hide from the caller the case where the current
1033      * locale_t object in POSIX 2008 is the global one, which is illegal in
1034      * many of the P2008 API calls.  This checks for that and, if necessary
1035      * creates a proper P2008 object.  Any prior object is deleted, as is any
1036      * remaining object during global destruction. */
1037
1038     locale_t cur = uselocale((locale_t) 0);
1039
1040     if (cur != LC_GLOBAL_LOCALE) {
1041         return cur;
1042     }
1043
1044     if (PL_scratch_locale_obj) {
1045         freelocale(PL_scratch_locale_obj);
1046     }
1047
1048     PL_scratch_locale_obj = duplocale(LC_GLOBAL_LOCALE);
1049     return PL_scratch_locale_obj;
1050 }
1051
1052 #endif
1053
1054 void
1055 Perl_locale_panic(const char * msg,
1056                   const line_t immediate_caller_line,
1057                   const char * const higher_caller_file,
1058                   const line_t higher_caller_line)
1059 {
1060     PERL_ARGS_ASSERT_LOCALE_PANIC;
1061     dTHX;
1062     dSAVE_ERRNO;
1063
1064     force_locale_unlock();
1065
1066 #ifdef USE_C_BACKTRACE
1067     dump_c_backtrace(Perl_debug_log, 20, 1);
1068 #endif
1069
1070     const char * called_by = "";
1071     if (   strNE(__FILE__, higher_caller_file)
1072         || immediate_caller_line != higher_caller_line)
1073     {
1074         called_by = Perl_form(aTHX_ "\nCalled by %s: %" LINE_Tf "\n",
1075                                     higher_caller_file, higher_caller_line);
1076     }
1077
1078     RESTORE_ERRNO;
1079
1080     const char * errno_text;
1081
1082 #ifdef HAS_EXTENDED_OS_ERRNO
1083
1084     const int extended_errnum = get_extended_os_errno();
1085     if (errno != extended_errnum) {
1086         errno_text = Perl_form(aTHX_ "; errno=%d, $^E=%d",
1087                                      errno, extended_errnum);
1088     }
1089     else
1090
1091 #endif
1092
1093     {
1094         errno_text = Perl_form(aTHX_ "; errno=%d", errno);
1095     }
1096
1097     /* diag_listed_as: panic: %s */
1098     Perl_croak(aTHX_ "%s: %" LINE_Tf ": panic: %s%s%s\n",
1099                      __FILE__, immediate_caller_line,
1100                      msg, errno_text, called_by);
1101 }
1102
1103 /* Macros to report and croak on an unexpected failure to set the locale.  The
1104  * via version has more stack trace information */
1105 #define setlocale_failure_panic_i(i, cur, fail, line, higher_line)          \
1106     setlocale_failure_panic_via_i(i, cur, fail, __LINE__, line,             \
1107                                   __FILE__, higher_line)
1108
1109 #define setlocale_failure_panic_c(cat, cur, fail, line, higher_line)        \
1110    setlocale_failure_panic_i(cat##_INDEX_, cur, fail, line, higher_line)
1111
1112 #if defined(USE_LOCALE)
1113
1114 /* Expands to the code to
1115  *      result = savepvn(s, len)
1116  * if the category whose internal index is 'i' doesn't need to be kept in the C
1117  * locale on this system, or if 'action is 'no_override'.  Otherwise it expands
1118  * to
1119  *      result = savepv("C")
1120  * unless 'action' isn't 'check_that_overridden', in which case if the string
1121  * 's' isn't already "C" it panics */
1122 #  ifndef HAS_IGNORED_LOCALE_CATEGORIES_
1123 #    define OVERRIDE_AND_SAVEPV(s, len, result, i, action)                  \
1124                                                   result = savepvn(s, len)
1125 #  else
1126 #    define OVERRIDE_AND_SAVEPV(s, len, result, i, action)                  \
1127         STMT_START {                                                        \
1128             if (LIKELY(   ! need_to_override_category(i)                    \
1129                        || action == no_override)) {                         \
1130                 result = savepvn(s, len);                                   \
1131             }                                                               \
1132             else {                                                          \
1133                 const char * temp = savepvn(s, len);                        \
1134                 result = savepv(override_ignored_category(i, temp));        \
1135                 if (action == check_that_overridden && strNE(result, temp)) { \
1136                     locale_panic_(Perl_form(aTHX_                           \
1137                                 "%s expected to be '%s', instead is '%s'",  \
1138                                 category_names[i], result, temp));          \
1139                 }                                                           \
1140                 Safefree(temp);                                             \
1141             }                                                               \
1142         } STMT_END
1143 #  endif
1144
1145 STATIC parse_LC_ALL_string_return
1146 S_parse_LC_ALL_string(pTHX_ const char * string,
1147                             const char ** output,
1148                             const parse_LC_ALL_STRING_action  override,
1149                             bool always_use_full_array,
1150                             const bool panic_on_error,
1151                             const line_t caller_line)
1152 {
1153     /* This function parses the value of the input 'string' which is expected
1154      * to be the representation of an LC_ALL locale, and splits the result into
1155      * the values for the individual component categories, returning those in
1156      * the 'output' array.  Each array value will be a savepv() copy that is
1157      * the responsibility of the caller to make sure gets freed
1158      *
1159      * The locale for each category is independent of the other categories.
1160      * Often, they are all the same, but certainly not always.  Perl, in fact,
1161      * usually keeps LC_NUMERIC in the C locale, regardless of the underlying
1162      * locale.  LC_ALL has to be able to represent the case of when not all
1163      * categories have the same locale.  Platforms have differing ways of
1164      * representing this.  Internally, this file uses the 'name=value;'
1165      * representation found on some platforms, so this function always looks
1166      * for and parses that.  Other platforms use a positional notation.  On
1167      * those platforms, this function also parses that form.  It examines the
1168      * input to see which form is being parsed.
1169      *
1170      * Often, all categories will have the same locale.  This is special cased
1171      * if 'always_use_full_array' is false on input:
1172      *      1) If the input 'string' is a single value, this function doesn't
1173      *         store anything into 'output', and returns 'no_array'
1174      *      2) Some platforms will return multiple occurrences of the same
1175      *         value rather than coalescing them down to a single one.  HP-UX
1176      *         is such a one.  This function will do that collapsing for you,
1177      *         returning 'only_element_0' and saving the single value in
1178      *         output[0], which the caller will need to arrange to be freed.
1179      *         The rest of output[] is undefined, and does not need to be
1180      *         freed.
1181      *
1182      * Otherwise, the input 'string' may not be valid.  This function looks
1183      * mainly for syntactic errors, and if found, returns 'invalid'.  'output'
1184      * will not be filled in in that case, but the input state of it isn't
1185      * necessarily preserved.  Turning on -DL debugging will give details as to
1186      * the error.  If 'panic_on_error' is 'true', the function panics instead
1187      * of returning on error, with a message giving the details.
1188      *
1189      * Otherwise, output[] will be filled with the individual locale names for
1190      * all categories on the system, 'full_array' will be returned, and the
1191      * caller needs to arrange for each to be freed.  This means that either at
1192      * least one category differed from the others, or 'always_use_full_array' was
1193      * true on input.
1194      *
1195      * perl may be configured to ignore changes to a category's locale to
1196      * non-C.  The parameter 'override' tells this function what to do when
1197      * encountering such an illegal combination:
1198      *
1199      *      no_override             indicates to take no special action
1200      *      override_if_ignored,    indicates to return 'C' instead of what the
1201      *                              input string actually says.
1202      *      check_that_overridden   indicates to panic if the string says the
1203      *                              category is not 'C'.  This is used when
1204      *                              non-C is very unexpected behavior.
1205      * */
1206
1207     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1208                            "Entering parse_LC_ALL_string; called from %"    \
1209                            LINE_Tf "\nnew='%s'\n", caller_line, string));
1210
1211 #  ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
1212
1213     const char separator[] = ";";
1214     const Size_t separator_len = 1;
1215     const bool single_component = (strchr(string, ';') == NULL);
1216
1217 #  else
1218
1219     /* It's possible (but quite unlikely) that the separator string is an '='
1220      * or a ';'.  Requiring both to be present for using the 'name=value;' form
1221      * properly handles those possibilities */
1222     const bool name_value = strchr(string, '=') && strchr(string, ';');
1223     const char * separator;
1224     Size_t separator_len;
1225     bool single_component;
1226     if (name_value) {
1227         separator = ";";
1228         separator_len = 1;
1229         single_component = false;   /* Since has both [;=], must be multi */
1230     }
1231     else {
1232         separator = PERL_LC_ALL_SEPARATOR;
1233         separator_len = STRLENs(PERL_LC_ALL_SEPARATOR);
1234         single_component = instr(string, separator) == NULL;
1235     }
1236
1237     Size_t component_number = 0;    /* Position in the parsing loop below */
1238
1239 #  endif
1240 #  ifndef HAS_IGNORED_LOCALE_CATEGORIES_
1241     PERL_UNUSED_ARG(override);
1242 #  else
1243
1244     /* Any ignored categories are to be set to "C", so if this single-component
1245      * LC_ALL isn't to C, it has both "C" and non-C, so isn't really a single
1246      * component.  All the non-ignored categories are set to the input
1247      * component, but the ignored ones are overridden to be C.
1248      *
1249      * This incidentally handles the case where the string is "".  The return
1250      * will be C for each ignored category and "" for the others.  Then the
1251      * caller can individually set each category, and get the right answer. */
1252     if (single_component && ! isNAME_C_OR_POSIX(string)) {
1253         for_all_individual_category_indexes(i) {
1254            OVERRIDE_AND_SAVEPV(string, strlen(string), output[i], i, override);
1255         }
1256
1257         return full_array;
1258     }
1259
1260 #  endif
1261
1262     if (single_component) {
1263         if (! always_use_full_array) {
1264             return no_array;
1265         }
1266
1267         for_all_individual_category_indexes(i) {
1268             output[i] = savepv(string);
1269         }
1270
1271         return full_array;
1272     }
1273
1274     /* Here the input is multiple components.  Parse through them.  (It is
1275      * possible that these components are all the same, so we check, and if so,
1276      * return just the 0th component (unless 'always_use_full_array' is true)
1277      *
1278      * This enum notes the possible errors findable in parsing */
1279     enum {
1280             incomplete,
1281             no_equals,
1282             unknown_category,
1283             contains_LC_ALL_element
1284     } error;
1285
1286     /* Keep track of the categories we have encountered so far */
1287     bool seen[LC_ALL_INDEX_] = { false };
1288
1289     Size_t index;           /* Our internal index for the current category */
1290     const char * s = string;
1291     const char * e = s + strlen(string);
1292     const char * category_end = NULL;
1293     const char * saved_first = NULL;
1294
1295     /* Parse the input locale string */
1296     while (s < e) {
1297
1298         /* 'separator' has been set up to delimit the components */
1299         const char * next_sep = instr(s, separator);
1300         if (! next_sep) {   /* At the end of the input */
1301             next_sep = e;
1302         }
1303
1304 #  ifndef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
1305
1306         if (! name_value) {
1307             /* Get the index of the category in this position */
1308             index = map_LC_ALL_position_to_index[component_number++];
1309         }
1310         else
1311
1312 #  endif
1313
1314         {   /* Get the category part when each component is the
1315              * 'category=locale' form */
1316
1317             category_end = strchr(s, '=');
1318
1319             /* The '=' terminates the category name.  If no '=', is improper
1320              * form */
1321             if (! category_end) {
1322                 error = no_equals;
1323                 goto failure;
1324             }
1325
1326             /* Find our internal index of the category name; uses a linear
1327              * search.  (XXX This could be avoided by various means, but the
1328              * maximum likely search is 6 items, and khw doesn't think the
1329              * added complexity would save very much at all.) */
1330             const unsigned int name_len = (unsigned int) (category_end - s);
1331             for (index = 0; index < C_ARRAY_LENGTH(category_names); index++) {
1332                 if (   name_len == category_name_lengths[index]
1333                     && memEQ(s, category_names[index], name_len))
1334                 {
1335                     goto found_category;
1336                 }
1337             }
1338
1339             /* Here, the category is not in our list. */
1340             error = unknown_category;
1341             goto failure;
1342
1343           found_category:   /* The system knows about this category. */
1344
1345             if (index == LC_ALL_INDEX_) {
1346                 error = contains_LC_ALL_element;
1347                 goto failure;
1348             }
1349
1350             /* The locale name starts just beyond the '=' */
1351             s = category_end + 1;
1352
1353             /* Linux (and maybe others) doesn't treat a duplicate category in
1354              * the string as an error.  Instead it uses the final occurrence as
1355              * the intended value.  So if this is a duplicate, free the former
1356              * value before setting the new one */
1357             if (seen[index]) {
1358                 Safefree(output[index]);
1359             }
1360             else {
1361                 seen[index] = true;
1362             }
1363         }
1364
1365         /* Here, 'index' contains our internal index number for the current
1366          * category, and 's' points to the beginning of the locale name for
1367          * that category. */
1368         OVERRIDE_AND_SAVEPV(s, next_sep - s, output[index], index, override);
1369
1370         if (! always_use_full_array) {
1371             if (! saved_first) {
1372                 saved_first = output[index];
1373             }
1374             else {
1375                 if (strNE(saved_first, output[index])) {
1376                     always_use_full_array = true;
1377                 }
1378             }
1379         }
1380
1381         /* Next time start from the new position */
1382         s = next_sep + separator_len;
1383     }
1384
1385     /* Finished looping through all the categories
1386      *
1387      * Check if the input was incomplete. */
1388
1389 #  ifndef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
1390
1391     if (! name_value) {     /* Positional notation */
1392         if (component_number != LC_ALL_INDEX_) {
1393             error = incomplete;
1394             goto failure;
1395         }
1396     }
1397     else
1398
1399 #  endif
1400
1401     {   /* Here is the name=value notation */
1402         for_all_individual_category_indexes(i) {
1403             if (! seen[i]) {
1404                 error = incomplete;
1405                 goto failure;
1406             }
1407         }
1408     }
1409
1410     /* In the loop above, we changed 'always_use_full_array' to true iff not all
1411      * categories have the same locale.  Hence, if it is still 'false', all of
1412      * them are the same. */
1413     if (always_use_full_array) {
1414         return full_array;
1415     }
1416
1417     /* Free the dangling ones */
1418     for_all_but_0th_individual_category_indexes(i) {
1419         Safefree(output[i]);
1420         output[i] = NULL;
1421     }
1422
1423     return only_element_0;
1424
1425   failure:
1426
1427     /* Don't leave memory dangling that we allocated before the failure */
1428     for_all_individual_category_indexes(i) {
1429         if (seen[i]) {
1430             Safefree(output[i]);
1431             output[i] = NULL;
1432         }
1433     }
1434
1435     const char * msg;
1436     const char * display_start = s;
1437     const char * display_end = e;
1438
1439     switch (error) {
1440         case incomplete:
1441             msg = "doesn't list every locale category";
1442             display_start = string;
1443             break;
1444         case no_equals:
1445             msg = "needs an '=' to split name=value";
1446             break;
1447         case unknown_category:
1448             msg = "is an unknown category";
1449             display_end = (category_end && category_end > display_start)
1450                           ? category_end
1451                           : e;
1452             break;
1453         case contains_LC_ALL_element:
1454             msg = "has LC_ALL, which is illegal here";
1455             break;
1456     }
1457
1458     msg = Perl_form(aTHX_ "'%.*s' %s\n",
1459                           (int) (display_end - display_start),
1460                           display_start, msg);
1461
1462     DEBUG_L(PerlIO_printf(Perl_debug_log, "%s", msg));
1463
1464     if (panic_on_error) {
1465         locale_panic_via_(msg, __FILE__, caller_line);
1466     }
1467
1468     return invalid;
1469 }
1470
1471 #  undef OVERRIDE_AND_SAVEPV
1472 #endif
1473
1474 /*==========================================================================
1475  * Here starts the code that gives a uniform interface to its callers, hiding
1476  * the differences between platforms.
1477  *
1478  * base_posix_setlocale_() presents a consistent POSIX-compliant interface to
1479  * setlocale().   Windows requres a customized base-level setlocale().  This
1480  * layer should only be used by the next level up: the plain posix_setlocale
1481  * layer.  Any necessary mutex locking needs to be done at a higher level.  The
1482  * return may be overwritten by the next call to this function */
1483 #ifdef WIN32
1484 #  define base_posix_setlocale_(cat, locale) win32_setlocale(cat, locale)
1485 #else
1486 #  define base_posix_setlocale_(cat, locale)                                \
1487                                     ((const char *) setlocale(cat, locale))
1488 #endif
1489
1490 /*==========================================================================
1491  * Here is the main posix layer.  It is the same as the base one unless the
1492  * system is lacking LC_ALL, or there are categories that we ignore, but that
1493  * the system libc knows about */
1494
1495 #if ! defined(USE_LOCALE)                                                   \
1496  ||  (defined(LC_ALL) && ! defined(HAS_IGNORED_LOCALE_CATEGORIES_))
1497 #  define posix_setlocale(cat, locale) base_posix_setlocale_(cat, locale)
1498 #else
1499 #  define posix_setlocale(cat, locale)                                      \
1500         S_posix_setlocale_with_complications(aTHX_ cat, locale, __LINE__)
1501
1502 STATIC const char *
1503 S_posix_setlocale_with_complications(pTHX_ const int cat,
1504                                            const char * new_locale,
1505                                            const line_t caller_line)
1506 {
1507     /* This implements the posix layer above the base posix layer.
1508      * It is needed to reconcile our internal records that reflect only a
1509      * proper subset of the categories known by the system. */
1510
1511     /* Querying the current locale returns the real value */
1512     if (new_locale == NULL) {
1513         new_locale = base_posix_setlocale_(cat, NULL);
1514         assert(new_locale);
1515         return new_locale;
1516     }
1517
1518     const char * locale_on_entry = NULL;
1519
1520     /* If setting from the environment, actually do the set to get the system's
1521      * idea of what that means; we may have to override later. */
1522     if (strEQ(new_locale, "")) {
1523         locale_on_entry = base_posix_setlocale_(cat, NULL);
1524         assert(locale_on_entry);
1525         new_locale = base_posix_setlocale_(cat, "");
1526         if (! new_locale) {
1527             SET_EINVAL;
1528             return NULL;
1529         }
1530     }
1531
1532 #  ifdef LC_ALL
1533
1534     const char * new_locales[LC_ALL_INDEX_] = { NULL };
1535
1536     if (cat == LC_ALL) {
1537         switch (parse_LC_ALL_string(new_locale,
1538                                     (const char **) &new_locales,
1539                                     override_if_ignored,   /* Override any
1540                                                               ignored
1541                                                               categories */
1542                                     false,    /* Return only [0] if suffices */
1543                                     false,    /* Don't panic on error */
1544                                     caller_line))
1545         {
1546           case invalid:
1547             SET_EINVAL;
1548             return NULL;
1549
1550           case no_array:
1551             break;
1552
1553           case only_element_0:
1554             new_locale = new_locales[0];
1555             SAVEFREEPV(new_locale);
1556             break;
1557
1558           case full_array:
1559
1560             /* Turn the array into a string that the libc setlocale() should
1561              * understand.   (Another option would be to loop, setting the
1562              * individual locales, and then return base(cat, NULL) */
1563             new_locale = calculate_LC_ALL_string(new_locales,
1564                                                  EXTERNAL_FORMAT_FOR_SET,
1565                                                  WANT_TEMP_PV,
1566                                                  caller_line);
1567
1568             for_all_individual_category_indexes(i) {
1569                 Safefree(new_locales[i]);
1570             }
1571
1572             /* And call the libc setlocale.  We could avoid this call if
1573              * locale_on_entry is set and eq the new_locale.  But that would be
1574              * only for the relatively rare case of the desired locale being
1575              * "", and the time spent in doing the string compare might be more
1576              * than that of just setting it unconditionally */
1577             new_locale = base_posix_setlocale_(cat, new_locale);
1578             if (! new_locale) {
1579                  goto failure;
1580             }
1581
1582             return new_locale;
1583         }
1584     }
1585
1586 #  endif
1587
1588     /* Here, 'new_locale' is a single value, not an aggregation.  Just set it.
1589      * */
1590     new_locale =
1591         base_posix_setlocale_(cat,
1592                               override_ignored_category(
1593                                           get_category_index(cat), new_locale));
1594     if (! new_locale) {
1595         goto failure;
1596     }
1597
1598     return new_locale;
1599
1600  failure:
1601
1602     /* 'locale_on_entry' being set indicates there has likely been a change in
1603      * locale which needs to be restored */
1604     if (locale_on_entry) {
1605         if (! base_posix_setlocale_(cat, locale_on_entry)) {
1606             setlocale_failure_panic_i(get_category_index(cat),
1607                                       NULL, locale_on_entry,
1608                                       __LINE__, caller_line);
1609         }
1610     }
1611
1612     SET_EINVAL;
1613     return NULL;
1614 }
1615
1616 #endif
1617
1618 /* End of posix layer
1619  *==========================================================================
1620  *
1621  * The next layer up is to catch vagaries and bugs in the libc setlocale return
1622  * value.  The return is not guaranteed to be stable.
1623  *
1624  * Any necessary mutex locking needs to be done at a higher level.
1625  *
1626  * On most platforms this layer is empty, expanding to just the layer
1627  * below.   To enable it, call Configure with either or both:
1628  * -Accflags=-DHAS_LF_IN_SETLOCALE_RETURN
1629  *                  to indicate that extraneous \n characters can be returned
1630  *                  by setlocale()
1631  * -Accflags=-DHAS_BROKEN_SETLOCALE_QUERY_LC_ALL
1632  *                  to indicate that setlocale(LC_ALL, NULL) cannot be relied
1633  *                  on
1634  */
1635
1636 #define STDIZED_SETLOCALE_LOCK    POSIX_SETLOCALE_LOCK
1637 #define STDIZED_SETLOCALE_UNLOCK  POSIX_SETLOCALE_UNLOCK
1638 #if ! defined(USE_LOCALE)                                                   \
1639  || ! (   defined(HAS_LF_IN_SETLOCALE_RETURN)                               \
1640        || defined(HAS_BROKEN_SETLOCALE_QUERY_LC_ALL))
1641 #  define stdized_setlocale(cat, locale)  posix_setlocale(cat, locale)
1642 #  define stdize_locale(cat, locale)  (locale)
1643 #else
1644 #  define stdized_setlocale(cat, locale)                                    \
1645         S_stdize_locale(aTHX_ cat, posix_setlocale(cat, locale), __LINE__)
1646
1647 STATIC const char *
1648 S_stdize_locale(pTHX_ const int category,
1649                       const char *input_locale,
1650                       const line_t caller_line)
1651 {
1652     /* The return value of setlocale() is opaque, but is required to be usable
1653      * as input to a future setlocale() to create the same state.
1654      * Unfortunately not all systems are compliant.  This function brings those
1655      * outliers into conformance.  It is based on what problems have arisen in
1656      * the field.
1657      *
1658      * This has similar constraints as the posix layer.  You need to lock
1659      * around it until its return is safely copied or no longer needed. (The
1660      * return may point to a global static buffer or may be mortalized.)
1661      *
1662      * The current things this corrects are:
1663      * 1) A new-line.  This function chops any \n characters
1664      * 2) A broken 'setlocale(LC_ALL, foo)'  This constructs a proper returned
1665      *                 string from the constituent categories
1666      *
1667      * If no changes were made, the input is returned as-is */
1668
1669     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1670                           "Entering stdize_locale(%d, '%s');"
1671                           " called from %" LINE_Tf "\n",
1672                           category, input_locale, caller_line));
1673
1674     if (input_locale == NULL) {
1675         SET_EINVAL;
1676         return NULL;
1677     }
1678
1679     char * retval = (char *) input_locale;
1680
1681 #  if defined(LC_ALL) && defined(HAS_BROKEN_SETLOCALE_QUERY_LC_ALL)
1682
1683         /* If setlocale(LC_ALL, NULL) is broken, compute what the system
1684          * actually thinks it should be from its individual components */
1685     if (category == LC_ALL) {
1686         retval = (char *) calculate_LC_ALL_string(
1687                                           NULL,  /* query each individ locale */
1688                                           EXTERNAL_FORMAT_FOR_SET,
1689                                           WANT_TEMP_PV,
1690                                           caller_line);
1691     }
1692
1693 #  endif
1694 #  ifdef HAS_NL_IN_SETLOCALE_RETURN
1695
1696     char * first_bad = NULL;
1697
1698 #    ifndef LC_ALL
1699
1700     PERL_UNUSED_ARG(category);
1701     PERL_UNUSED_ARG(caller_line);
1702
1703 #      define INPUT_LOCALE  retval
1704 #      define MARK_CHANGED
1705 #    else
1706
1707     char * individ_locales[LC_ALL_INDEX_] = { NULL };
1708     bool made_changes = false;
1709     Size_t upper;
1710     if (category != LC_ALL) {
1711         individ_locales[0] = retval;
1712         upper = 0;
1713     }
1714     else {
1715
1716         /* And parse the locale string, splitting into its individual
1717          * components. */
1718         switch (parse_LC_ALL_string(retval,
1719                                     (const char **) &individ_locales,
1720                                     check_that_overridden, /* ignored
1721                                                               categories should
1722                                                               already have been
1723                                                               overridden */
1724                                     false,    /* Return only [0] if suffices */
1725                                     false,    /* Don't panic on error */
1726                                     caller_line))
1727         {
1728           case invalid:
1729             SET_EINVAL;
1730             return NULL;
1731
1732           case full_array: /* Loop below through all the component categories.
1733                             */
1734             upper = LC_ALL_INDEX_ - 1;
1735             break;
1736
1737           case no_array:
1738             /* All categories here are set to the same locale, and the parse
1739              * didn't fill in any of 'individ_locales'.  Set the 0th element to
1740              * that locale. */
1741             individ_locales[0] = retval;
1742             /* FALLTHROUGH */
1743
1744           case only_element_0: /* Element 0 is the only element we need to look
1745                                   at */
1746             upper = 0;
1747             break;
1748         }
1749     }
1750
1751     for (unsigned int i = 0; i <= upper; i++)
1752
1753 #      define INPUT_LOCALE  individ_locales[i]
1754 #      define MARK_CHANGED  made_changes = true;
1755 #    endif    /* Has LC_ALL */
1756
1757     {
1758         first_bad = (char *) strchr(INPUT_LOCALE, '\n');
1759
1760         /* Most likely, there isn't a problem with the input */
1761         if (UNLIKELY(first_bad)) {
1762
1763             /* This element will need to be adjusted.  Create a modifiable
1764              * copy. */
1765             MARK_CHANGED
1766             retval = savepv(INPUT_LOCALE);
1767             SAVEFREEPV(retval);
1768
1769             /* Translate the found position into terms of the copy */
1770             first_bad = retval + (first_bad - INPUT_LOCALE);
1771
1772             /* Get rid of the \n and what follows.  (Originally, only a
1773              * trailing \n was stripped.  Unsure what to do if not trailing) */
1774             *((char *) first_bad) = '\0';
1775         }   /* End of needs adjusting */
1776     }   /* End of looking for problems */
1777
1778 #    ifdef LC_ALL
1779
1780     /* If we had multiple elements, extra work is required */
1781     if (upper != 0) {
1782
1783         /* If no changes were made to the input, 'retval' already contains it
1784          * */
1785         if (made_changes) {
1786
1787             /* But if did make changes, need to calculate the new value */
1788             retval = (char *) calculate_LC_ALL_string(
1789                                             (const char **) &individ_locales,
1790                                             EXTERNAL_FORMAT_FOR_SET,
1791                                             WANT_TEMP_PV,
1792                                             caller_line);
1793         }
1794
1795         /* And free the no-longer needed memory */
1796         for (unsigned int i = 0; i <= upper; i++) {
1797             Safefree(individ_locales[i]);
1798         }
1799     }
1800
1801 #    endif
1802 #    undef INPUT_LOCALE
1803 #    undef MARK_CHANGED
1804 #  endif    /* HAS_NL_IN_SETLOCALE_RETURN */
1805
1806     return (const char *) retval;
1807 }
1808
1809 #endif  /* USE_LOCALE */
1810
1811 /* End of stdize_locale layer
1812  *
1813  * ==========================================================================
1814  *
1815  * The next many lines form several implementations of a layer above the
1816  * close-to-the-metal 'posix' and 'stdized' macros.  They are used to present a
1817  * uniform API to the rest of the code in this file in spite of the disparate
1818  * underlying implementations.  Which implementation gets compiled depends on
1819  * the platform capabilities (and some user choice) as determined by Configure.
1820  *
1821  * As more fully described in the introductory comments in this file, the
1822  * API of each implementation consists of three sets of macros.  Each set has
1823  * three variants with suffixes '_c', '_r', and '_i'.  In the list below '_X'
1824  * is to be replaced by any of these suffixes.
1825  *
1826  * 1) bool_setlocale_X  attempts to set the given category's locale to the
1827  *                      given value, returning if it worked or not.
1828  * 2) void_setlocale_X  is like the corresponding bool_setlocale, but used when
1829  *                      success is the only sane outcome, so failure causes it
1830  *                      to panic.
1831  * 3) querylocale_X     to see what the given category's locale is
1832  *
1833  * 4) setlocale_i()     is defined only in those implementations where the bool
1834  *                      and query forms are essentially the same, and can be
1835  *                      combined to save CPU time.
1836  *
1837  * Each implementation below is separated by ==== lines, and includes bool,
1838  * void, and query macros.  The query macros are first, followed by any
1839  * functions needed to implement them.  Then come the bool, again followed by
1840  * any implementing functions  Then are the void macros; next is setlocale_i if
1841  * present on this implementation.  Finally are any helper functions.  The sets
1842  * in each implementation are separated by ---- lines.
1843  *
1844  * The returned strings from all the querylocale...() forms in all
1845  * implementations are thread-safe, and the caller should not free them,
1846  * but each may be a mortalized copy.  If you need something stable across
1847  * calls, you need to savepv() the result yourself.
1848  *
1849  *===========================================================================*/
1850
1851 #if    (! defined(USE_LOCALE_THREADS) && ! defined(USE_POSIX_2008_LOCALE))    \
1852     || (  defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE))
1853
1854 /* For non-threaded perls, the implementation just expands to the base-level
1855  * functions (except if we are Configured to nonetheless use the POSIX 2008
1856  * interface) This implementation is also used on threaded perls where
1857  * threading is invisible to us.  Currently this is only on later Windows
1858  * versions. */
1859
1860 #  define querylocale_r(cat)  mortalized_pv_copy(stdized_setlocale(cat, NULL))
1861 #  define querylocale_c(cat)  querylocale_r(cat)
1862 #  define querylocale_i(i)    querylocale_c(categories[i])
1863
1864 /*---------------------------------------------------------------------------*/
1865
1866 #  define bool_setlocale_r(cat, locale) cBOOL(posix_setlocale(cat, locale))
1867 #  define bool_setlocale_i(i, locale)                                       \
1868                                    bool_setlocale_c(categories[i], locale)
1869 #  define bool_setlocale_c(cat, locale)      bool_setlocale_r(cat, locale)
1870
1871 /*---------------------------------------------------------------------------*/
1872
1873 #  define void_setlocale_r_with_caller(cat, locale, file, line)             \
1874      STMT_START {                                                           \
1875         if (! bool_setlocale_r(cat, locale))                                \
1876             setlocale_failure_panic_via_i(get_category_index(cat),          \
1877                                           NULL, locale, __LINE__, 0,        \
1878                                           file, line);                      \
1879      } STMT_END
1880
1881 #  define void_setlocale_c_with_caller(cat, locale, file, line)             \
1882                     void_setlocale_r_with_caller(cat, locale, file, line)
1883
1884 #  define void_setlocale_i_with_caller(i, locale, file, line)               \
1885           void_setlocale_r_with_caller(categories[i], locale, file, line)
1886
1887 #  define void_setlocale_r(cat, locale)                                     \
1888             void_setlocale_r_with_caller(cat, locale, __FILE__, __LINE__)
1889 #  define void_setlocale_c(cat, locale) void_setlocale_r(cat, locale)
1890 #  define void_setlocale_i(i, locale)   void_setlocale_r(categories[i], locale)
1891
1892 /*---------------------------------------------------------------------------*/
1893
1894 /* setlocale_i is only defined for Configurations where the libc setlocale()
1895  * doesn't need any tweaking.  It allows for some shortcuts */
1896 #  ifndef USE_LOCALE_THREADS
1897 #    define setlocale_i(i, locale)   stdized_setlocale(categories[i], locale)
1898
1899 #  elif defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE)
1900
1901 /* On Windows, we don't know at compile time if we are in thread-safe mode or
1902  * not.  If we are, we can just return the result of the layer below us.  If we
1903  * are in unsafe mode, we need to first copy that result to a safe place while
1904  * in a critical section */
1905
1906 #    define setlocale_i(i, locale)   S_setlocale_i(aTHX_ categories[i], locale)
1907
1908 STATIC const char *
1909 S_setlocale_i(pTHX_ const int category, const char * locale)
1910 {
1911     if (LIKELY(_configthreadlocale(0) == _ENABLE_PER_THREAD_LOCALE)) {
1912         return stdized_setlocale(category, locale);
1913     }
1914
1915     gwLOCALE_LOCK;
1916     const char * retval = save_to_buffer(stdized_setlocale(category, locale),
1917                                          &PL_setlocale_buf,
1918                                          &PL_setlocale_bufsize);
1919     gwLOCALE_UNLOCK;
1920
1921     return retval;
1922 }
1923
1924 #  endif
1925
1926 /*===========================================================================*/
1927 #elif   defined(USE_LOCALE_THREADS)                 \
1928    && ! defined(USE_THREAD_SAFE_LOCALE)
1929
1930    /* Here, there are threads, and there is no support for thread-safe
1931     * operation.  This is a dangerous situation, which perl is documented as
1932     * not supporting, but it arises in practice.  We can do a modicum of
1933     * automatic mitigation by making sure there is a per-thread return from
1934     * setlocale(), and that a mutex protects it from races */
1935
1936 #  define querylocale_r(cat)                                                \
1937                       mortalized_pv_copy(less_dicey_setlocale_r(cat, NULL))
1938 #  define querylocale_c(cat)  querylocale_r(cat)
1939 #  define querylocale_i(i)    querylocale_r(categories[i])
1940
1941 STATIC const char *
1942 S_less_dicey_setlocale_r(pTHX_ const int category, const char * locale)
1943 {
1944     const char * retval;
1945
1946     PERL_ARGS_ASSERT_LESS_DICEY_SETLOCALE_R;
1947
1948     STDIZED_SETLOCALE_LOCK;
1949
1950     retval = save_to_buffer(stdized_setlocale(category, locale),
1951                             &PL_less_dicey_locale_buf,
1952                             &PL_less_dicey_locale_bufsize);
1953
1954     STDIZED_SETLOCALE_UNLOCK;
1955
1956     return retval;
1957 }
1958
1959 /*---------------------------------------------------------------------------*/
1960
1961 #  define bool_setlocale_r(cat, locale)                                     \
1962                                less_dicey_bool_setlocale_r(cat, locale)
1963 #  define bool_setlocale_i(i, locale)                                       \
1964                                 bool_setlocale_r(categories[i], locale)
1965 #  define bool_setlocale_c(cat, locale) bool_setlocale_r(cat, locale)
1966
1967 STATIC bool
1968 S_less_dicey_bool_setlocale_r(pTHX_ const int cat, const char * locale)
1969 {
1970     bool retval;
1971
1972     PERL_ARGS_ASSERT_LESS_DICEY_BOOL_SETLOCALE_R;
1973
1974     /* Unlikely, but potentially possible that another thread could zap the
1975      * buffer from true to false or vice-versa, so need to lock here */
1976     POSIX_SETLOCALE_LOCK;
1977     retval = cBOOL(posix_setlocale(cat, locale));
1978     POSIX_SETLOCALE_UNLOCK;
1979
1980     return retval;
1981 }
1982
1983 /*---------------------------------------------------------------------------*/
1984
1985 #  define void_setlocale_r_with_caller(cat, locale, file, line)             \
1986      STMT_START {                                                           \
1987         if (! bool_setlocale_r(cat, locale))                                \
1988             setlocale_failure_panic_via_i(get_category_index(cat),          \
1989                                           NULL, locale, __LINE__, 0,        \
1990                                           file, line);                      \
1991      } STMT_END
1992
1993 #  define void_setlocale_c_with_caller(cat, locale, file, line)             \
1994                     void_setlocale_r_with_caller(cat, locale, file, line)
1995
1996 #  define void_setlocale_i_with_caller(i, locale, file, line)               \
1997           void_setlocale_r_with_caller(categories[i], locale, file, line)
1998
1999 #  define void_setlocale_r(cat, locale)                                     \
2000             void_setlocale_r_with_caller(cat, locale, __FILE__, __LINE__)
2001 #  define void_setlocale_c(cat, locale) void_setlocale_r(cat, locale)
2002 #  define void_setlocale_i(i, locale)   void_setlocale_r(categories[i], locale)
2003
2004 /*---------------------------------------------------------------------------*/
2005
2006 /* setlocale_i is only defined for Configurations where the libc setlocale()
2007  * suffices for both querying and setting the locale.  It allows for some
2008  * shortcuts */
2009 #  define setlocale_i(i, locale)  less_dicey_setlocale_r(categories[i], locale)
2010
2011 /*===========================================================================*/
2012
2013 #elif defined(USE_POSIX_2008_LOCALE)
2014 #  ifndef LC_ALL
2015 #    error This code assumes that LC_ALL is available on a system modern enough to have POSIX 2008
2016 #  endif
2017
2018 /* Here, there is a completely different API to get thread-safe locales.  We
2019  * emulate the setlocale() API with our own function(s).  setlocale categories,
2020  * like LC_NUMERIC, are not valid here for the POSIX 2008 API.  Instead, there
2021  * are equivalents, like LC_NUMERIC_MASK, which we use instead, which we find
2022  * by table lookup. */
2023
2024 #  if defined(__GLIBC__) && defined(USE_LOCALE_MESSAGES)
2025             /* https://sourceware.org/bugzilla/show_bug.cgi?id=24936 */
2026 #    define HAS_GLIBC_LC_MESSAGES_BUG
2027 #    include <libintl.h>
2028 #  endif
2029
2030 #  define querylocale_i(i)    querylocale_2008_i(i, __LINE__)
2031 #  define querylocale_c(cat)  querylocale_i(cat##_INDEX_)
2032 #  define querylocale_r(cat)  querylocale_i(get_category_index(cat))
2033
2034 STATIC const char *
2035 S_querylocale_2008_i(pTHX_ const locale_category_index index,
2036                            const line_t caller_line)
2037 {
2038     PERL_ARGS_ASSERT_QUERYLOCALE_2008_I;
2039     assert(index <= LC_ALL_INDEX_);
2040
2041     /* This function returns the name of the locale category given by the input
2042      * 'index' into our parallel tables of them.
2043      *
2044      * POSIX 2008, for some sick reason, chose not to provide a method to find
2045      * the category name of a locale, disregarding a basic linguistic tenet
2046      * that for any object, people will create a name for it.  (The next
2047      * version of the POSIX standard is proposed to fix this.)  Some vendors
2048      * have created a querylocale() function to do this in the meantime.  On
2049      * systems without querylocale(), we have to keep track of what the locale
2050      * has been set to, so that we can return its name so as to emulate
2051      * setlocale().  There are potential problems with this:
2052      *
2053      *  1)  We don't know what calling newlocale() with the locale argument ""
2054      *      actually does.  It gets its values from the program's environment.
2055      *      find_locale_from_environment() is used to work around this.  But it
2056      *      isn't fool-proof.  See the comments for that function for details.
2057      *  2)  It's possible for C code in some library to change the locale
2058      *      without us knowing it, and thus our records become wrong;
2059      *      querylocale() would catch this.  But as of September 2017, there
2060      *      are no occurrences in CPAN of uselocale().  Some libraries do use
2061      *      setlocale(), but that changes the global locale, and threads using
2062      *      per-thread locales will just ignore those changes.
2063      *  3)  Many systems have multiple names for the same locale.  Generally,
2064      *      there is an underlying base name, with aliases that evaluate to it.
2065      *      On some systems, if you set the locale to an alias, and then
2066      *      retrieve the name, you get the alias as expected; but on others you
2067      *      get the base name, not the alias you used.  And sometimes the
2068      *      charade is incomplete.  See
2069      *      https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=269375.
2070      *
2071      *      The code is structured so that the returned locale name when the
2072      *      locale is changed is whatever the result of querylocale() on the
2073      *      new locale is.  This effectively gives the result the system
2074      *      expects.  Without querylocale, the name returned is always the
2075      *      input name.  Theoretically this could cause problems, but khw knows
2076      *      of none so far, but mentions it here in case you are trying to
2077      *      debug something.  (This could be worked around by messing with the
2078      *      global locale temporarily, using setlocale() to get the base name;
2079      *      but that could cause a race.  The comments for
2080      *      find_locale_from_environment() give details on the potential race.)
2081      */
2082
2083     const locale_t cur_obj = uselocale((locale_t) 0);
2084     const char * retval;
2085
2086     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "querylocale_2008_i(%s) on %p;"
2087                                            " called from %" LINE_Tf "\n",
2088                                            category_names[index], cur_obj,
2089                                            caller_line));
2090
2091     if (UNLIKELY(cur_obj == LC_GLOBAL_LOCALE)) {
2092
2093         /* Even on platforms that have querylocale(), it is unclear if they
2094          * work in the global locale, and we have the means to get the correct
2095          * answer anyway.  khw is unsure this situation even comes up these
2096          * days, hence the branch prediction */
2097         POSIX_SETLOCALE_LOCK;
2098         retval = mortalized_pv_copy(posix_setlocale(categories[index], NULL));
2099         POSIX_SETLOCALE_UNLOCK;
2100     }
2101
2102     /* Here we have handled the case of the the current locale being the global
2103      * one.  Below is the 'else' case of that.  There are two different
2104      * implementations, depending on USE_PL_CURLOCALES */
2105
2106 #  ifdef USE_PL_CURLOCALES
2107
2108     else {
2109
2110         /* PL_curlocales[] is kept up-to-date for all categories except LC_ALL,
2111          * which may have been invalidated by setting it to NULL, and if so,
2112          * should now be calculated.  (The called function updates that
2113          * element.) */
2114         if (index == LC_ALL_INDEX_ && PL_curlocales[LC_ALL_INDEX_] == NULL) {
2115             calculate_LC_ALL_string((const char **) &PL_curlocales,
2116                                     INTERNAL_FORMAT,
2117                                     WANT_VOID,
2118                                     caller_line);
2119         }
2120
2121         if (cur_obj == PL_C_locale_obj) {
2122
2123             /* If the current locale object is the C object, then the answer is
2124              * "C" or POSIX, regardless of the category.  Handling this
2125              * reasonably likely case specially shortcuts extra effort, and
2126              * hides some bugs from us in OS's that alias other locales to C,
2127              * but do so incompletely.  If our records say it is POSIX, use
2128              * that; otherwise use C.  See
2129              * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=269375 */
2130             retval = mortalized_pv_copy((strEQ(PL_curlocales[index], "POSIX"))
2131                                         ? "POSIX"
2132                                         : "C");
2133         }
2134         else {
2135             retval = mortalized_pv_copy(PL_curlocales[index]);
2136         }
2137     }
2138
2139 #  else
2140
2141     /* Below is the implementation of the 'else' clause which handles the case
2142      * of the current locale not being the global one on platforms where
2143      * USE_PL_CURLOCALES is NOT in effect.  That means the system must have
2144      * some form of querylocale.  But these have varying characteristics, so
2145      * first create some #defines to make the actual 'else' clause uniform.
2146      *
2147      * First, glibc has a function that implements querylocale(), but is called
2148      * something else, and takes the category number; the others take the mask.
2149      * */
2150 #    if defined(USE_QUERYLOCALE) && (   defined(_NL_LOCALE_NAME)            \
2151                                      && defined(HAS_NL_LANGINFO_L))
2152 #      define my_querylocale(index, cur_obj)                                \
2153                 nl_langinfo_l(_NL_LOCALE_NAME(categories[index]), cur_obj)
2154
2155        /* Experience so far shows it is thread-safe, as well as glibc's
2156         * nl_langinfo_l(), so unless overridden, mark it so */
2157 #      ifdef NO_THREAD_SAFE_QUERYLOCALE
2158 #        undef HAS_THREAD_SAFE_QUERYLOCALE
2159 #      else
2160 #        define HAS_THREAD_SAFE_QUERYLOCALE
2161 #      endif
2162 #    else   /* below, ! glibc */
2163
2164        /* Otherwise, use the system's querylocale(). */
2165 #      define my_querylocale(index, cur_obj)                                \
2166                                querylocale(category_masks[index], cur_obj)
2167
2168        /* There is no standard for this function, and khw has never seen
2169         * anything beyond minimal vendor documentation, lacking important
2170         * details.  Experience has shown that some implementations have race
2171         * condiions, and their returns may not be thread safe.  It would be
2172         * unreliable to test for complete thread safety in Configure.  What we
2173         * do instead is to assume that it is thread-safe, unless overriden by,
2174         * say, a hints file specifying
2175         * -Accflags='-DNO_THREAD_SAFE_QUERYLOCALE */
2176 #      ifdef NO_THREAD_SAFE_QUERYLOCALE
2177 #        undef HAS_THREAD_SAFE_QUERYLOCALE
2178 #      else
2179 #        define HAS_THREAD_SAFE_QUERYLOCALE
2180 #      endif
2181 #    endif
2182
2183      /* Here, we have set up enough information to know if this querylocale()
2184       * is thread-safe, or needs to use a mutex */
2185 #    ifdef HAS_THREAD_SAFE_QUERYLOCALE
2186 #      define QUERYLOCALE_LOCK
2187 #      define QUERYLOCALE_UNLOCK
2188 #    else
2189 #      define QUERYLOCALE_LOCK    gwLOCALE_LOCK
2190 #      define QUERYLOCALE_UNLOCK  gwLOCALE_UNLOCK
2191 #    endif
2192
2193     /* Finally, everything is ready, so here is the 'else' clause to implement
2194      * the case of the current locale not being the global one on systems that
2195      * have some form of querylocale().  (POSIX will presumably eventually
2196      * publish their next version in their pipeline, which will define a
2197      * precisely specified querylocale equivalent, and there can be a new
2198      * #ifdef to use it without having to guess at its characteristics) */
2199
2200     else {
2201         /* We don't keep records when there is querylocale(), so as to avoid the
2202          * pitfalls mentioned at the beginning of this function.
2203          *
2204          * That means LC_ALL has to be calculated from all its constituent
2205          * categories each time, since the querylocale() forms on many (if not
2206          * all) platforms only work on individual categories */
2207         if (index == LC_ALL_INDEX_) {
2208             retval = calculate_LC_ALL_string(NULL, INTERNAL_FORMAT,
2209                                              WANT_TEMP_PV,
2210                                              caller_line);
2211         }
2212         else {
2213
2214             QUERYLOCALE_LOCK;
2215             retval = savepv(my_querylocale(index, cur_obj));
2216             QUERYLOCALE_UNLOCK;
2217
2218             /* querylocale() may conflate the C locale with something that
2219              * isn't exactly the same.  See for example
2220              * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=269375
2221              * We know that if the locale object is the C one, we
2222              * are in the C locale, which may go by the name POSIX, as both, by
2223              * definition, are equivalent.  But we consider any other name
2224              * spurious, so override with "C".  As in the PL_CURLOCALES case
2225              * above, this hides those glitches, for the most part, from the
2226              * rest of our code.  (The code is ordered this way so that if the
2227              * system distinugishes "C" from "POSIX", we do too.) */
2228             if (cur_obj == PL_C_locale_obj && ! isNAME_C_OR_POSIX(retval)) {
2229                 Safefree(retval);
2230                 retval = savepv("C");
2231             }
2232
2233             SAVEFREEPV(retval);
2234         }
2235     }
2236
2237 #    undef QUERYLOCALE_LOCK
2238 #    undef QUERYLOCALE_UNLOCK
2239 #  endif
2240
2241     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2242                            "querylocale_2008_i(%s) returning '%s'\n",
2243                            category_names[index], retval));
2244     assert(strNE(retval, ""));
2245     return retval;
2246 }
2247
2248 /*---------------------------------------------------------------------------*/
2249
2250 #  define bool_setlocale_i(i, locale)                                       \
2251                               bool_setlocale_2008_i(i, locale, __LINE__)
2252 #  define bool_setlocale_c(cat, locale)                                     \
2253                                   bool_setlocale_i(cat##_INDEX_, locale)
2254 #  define bool_setlocale_r(cat, locale)                                     \
2255                        bool_setlocale_i(get_category_index(cat), locale)
2256
2257 /* If this doesn't exist on this platform, make it a no-op (to save #ifdefs) */
2258 #  ifndef update_PL_curlocales_i
2259 #    define update_PL_curlocales_i(index, new_locale, caller_line)
2260 #  endif
2261
2262 STATIC bool
2263 S_bool_setlocale_2008_i(pTHX_
2264
2265         /* Our internal index of the 'category' setlocale is called with */
2266         const locale_category_index  index,
2267         const char * new_locale,    /* The locale to set the category to */
2268         const line_t caller_line    /* Called from this line number */
2269        )
2270 {
2271     PERL_ARGS_ASSERT_BOOL_SETLOCALE_2008_I;
2272     assert(index <= LC_ALL_INDEX_);
2273
2274     /* This function effectively performs a setlocale() on just the current
2275      * thread; thus it is thread-safe.  It does this by using the POSIX 2008
2276      * locale functions to emulate the behavior of setlocale().  Similar to
2277      * regular setlocale(), the return from this function points to memory that
2278      * can be overwritten by other system calls, so needs to be copied
2279      * immediately if you need to retain it.  The difference here is that
2280      * system calls besides another setlocale() can overwrite it.
2281      *
2282      * By doing this, most locale-sensitive functions become thread-safe.  The
2283      * exceptions are mostly those that return a pointer to static memory.
2284      */
2285
2286     int mask = category_masks[index];
2287     const locale_t entry_obj = uselocale((locale_t) 0);
2288     const char * locale_on_entry = querylocale_i(index);
2289
2290     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2291                            "bool_setlocale_2008_i: input=%d (%s), mask=0x%x,"
2292                            " new locale=\"%s\", current locale=\"%s\","
2293                            " index=%d, entry object=%p;"
2294                            " called from %" LINE_Tf "\n",
2295                            categories[index], category_names[index], mask,
2296                            ((new_locale == NULL) ? "(nil)" : new_locale),
2297                            locale_on_entry, index, entry_obj, caller_line));
2298
2299     /* Here, trying to change the locale, but it is a no-op if the new boss is
2300      * the same as the old boss.  Except this routine is called when converting
2301      * from the global locale, so in that case we will create a per-thread
2302      * locale below (with the current values).  It also seemed that newlocale()
2303      * could free up the basis locale memory if we called it with the new and
2304      * old being the same, but khw now thinks that this was due to some other
2305      * bug, since fixed, as there are other places where newlocale() gets
2306      * similarly called without problems. */
2307     if (   entry_obj != LC_GLOBAL_LOCALE
2308         && locale_on_entry
2309         && strEQ(new_locale, locale_on_entry))
2310     {
2311         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2312                                "bool_setlocale_2008_i: no-op to change to"
2313                                " what it already was\n"));
2314         return true;
2315     }
2316
2317 #  ifndef USE_QUERYLOCALE
2318
2319     /* Without a querylocale() mechanism, we have to figure out ourselves what
2320      * happens with setting a locale to "" */
2321
2322     if (strEQ(new_locale, "")) {
2323         new_locale = find_locale_from_environment(index);
2324         if (! new_locale) {
2325             SET_EINVAL;
2326             return false;
2327         }
2328     }
2329
2330 #  endif
2331 #  ifdef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
2332
2333     const bool need_loop = false;
2334
2335 #  else
2336
2337     bool need_loop = false;
2338     const char * new_locales[LC_ALL_INDEX_] = { NULL };
2339
2340     /* If we're going to have to parse the LC_ALL string, might as well do it
2341      * now before we have made changes that we would have to back out of if the
2342      * parse fails */
2343     if (index == LC_ALL_INDEX_) {
2344         switch (parse_LC_ALL_string(new_locale,
2345                                     (const char **) &new_locales,
2346                                     override_if_ignored,
2347                                     false,    /* Return only [0] if suffices */
2348                                     false,    /* Don't panic on error */
2349                                     caller_line))
2350         {
2351           case invalid:
2352             SET_EINVAL;
2353             return false;
2354
2355           case no_array:
2356             need_loop = false;
2357             break;
2358
2359           case only_element_0:
2360             SAVEFREEPV(new_locales[0]);
2361             new_locale = new_locales[0];
2362             need_loop = false;
2363             break;
2364
2365           case full_array:
2366             need_loop = true;
2367             break;
2368         }
2369     }
2370
2371 #  endif
2372 #  ifdef HAS_GLIBC_LC_MESSAGES_BUG
2373
2374     /* For this bug, if the LC_MESSAGES locale changes, we have to do an
2375      * expensive workaround.  Save the current value so we can later determine
2376      * if it changed. */
2377     const char * old_messages_locale = NULL;
2378     if (   (index == LC_MESSAGES_INDEX_ || index == LC_ALL_INDEX_)
2379         &&  LIKELY(PL_phase != PERL_PHASE_CONSTRUCT))
2380     {
2381         old_messages_locale = querylocale_c(LC_MESSAGES);
2382     }
2383
2384 #  endif
2385
2386     assert(PL_C_locale_obj);
2387
2388     /* Now ready to switch to the input 'new_locale' */
2389
2390     /* Switching locales generally entails freeing the current one's space (at
2391      * the C library's discretion), hence we can't be using that locale at the
2392      * time of the switch (this wasn't obvious to khw from the man pages).  So
2393      * switch to a known locale object that we don't otherwise mess with. */
2394     if (! uselocale(PL_C_locale_obj)) {
2395
2396         /* Not being able to change to the C locale is severe; don't keep
2397          * going.  */
2398         setlocale_failure_panic_i(index, locale_on_entry, "C",
2399                                   __LINE__, caller_line);
2400         NOT_REACHED; /* NOTREACHED */
2401     }
2402
2403     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2404                            "bool_setlocale_2008_i: now using C"
2405                            " object=%p\n", PL_C_locale_obj));
2406
2407     /* These two objects are special:
2408      *  LC_GLOBAL_LOCALE    because it is undefined behavior to call
2409      *                      newlocale() with it as a parameter.
2410      *  PL_C_locale_obj     because newlocale() generally destroys its locale
2411      *                      object parameter when it succeeds; and we don't
2412      *                      want that happening to this immutable object.
2413      * Copies will be made for them to use instead if we get so far as to call
2414      * newlocale(). */
2415     bool entry_obj_is_special = (   entry_obj == LC_GLOBAL_LOCALE
2416                                  || entry_obj == PL_C_locale_obj);
2417     locale_t new_obj;
2418
2419     /* PL_C_locale_obj is LC_ALL set to the C locale.  If this call is to
2420      * switch to LC_ALL => C, simply use that object.  But in fact, we already
2421      * have switched to it just above, in preparation for the general case.
2422      * Since we're already there, no need to do further switching. */
2423     if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(new_locale)) {
2424         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2425                                "bool_setlocale_2008_i: will stay in C"
2426                                " object\n"));
2427         new_obj = PL_C_locale_obj;
2428
2429         /* 'entry_obj' is now dangling, of no further use to anyone (unless it
2430          * is one of the special ones).  Free it to avoid a leak */
2431         if (! entry_obj_is_special) {
2432             freelocale(entry_obj);
2433         }
2434
2435         update_PL_curlocales_i(index, new_locale, caller_line);
2436     }
2437     else {  /* Here is the general case, not to LC_ALL => C */
2438
2439         /* The newlocale() call(s) below take a basis object to build upon to
2440          * create the changed locale, trashing it iff successful.
2441          *
2442          * For the objects that are not to be modified by this function, we
2443          * create a duplicate that gets trashed instead.
2444          *
2445          * Also if we will have to loop doing multiple newlocale()s, there is a
2446          * chance we will succeed for the first few, and then fail, having to
2447          * back out.  We need to duplicate 'entry_obj' in this case as well, so
2448          * it remains valid as something to back out to. */
2449         locale_t basis_obj = entry_obj;
2450
2451         if (entry_obj_is_special || need_loop) {
2452             basis_obj = duplocale(basis_obj);
2453             if (! basis_obj) {
2454                 locale_panic_via_("duplocale failed", __FILE__, caller_line);
2455                 NOT_REACHED; /* NOTREACHED */
2456             }
2457
2458             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2459                                    "bool_setlocale_2008_i created %p by"
2460                                    " duping the input\n", basis_obj));
2461         }
2462
2463 #  define DEBUG_NEW_OBJECT_CREATED(category, locale, new, old, caller_line) \
2464       DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                \
2465                              "bool_setlocale_2008_i(%s, %s): created %p"    \
2466                              " while freeing %p; called from %" LINE_Tf     \
2467                              " via %" LINE_Tf "\n",                         \
2468                              category, locale, new, old,                    \
2469                              caller_line, __LINE__))
2470 #  define DEBUG_NEW_OBJECT_FAILED(category, locale, basis_obj)              \
2471       DEBUG_L(PerlIO_printf(Perl_debug_log,                                 \
2472                             "bool_setlocale_2008_i: creating new object"    \
2473                             " for (%s '%s') from %p failed; called from %"  \
2474                             LINE_Tf " via %" LINE_Tf "\n",                  \
2475                             category, locale, basis_obj,                    \
2476                             caller_line, __LINE__));
2477
2478         /* Ready to create a new locale by modification of the existing one.
2479          *
2480          * NOTE: This code may incorrectly show up as a leak under the address
2481          * sanitizer. We do not free this object under normal teardown, however
2482          * you can set PERL_DESTRUCT_LEVEL=2 to cause it to be freed.
2483          */
2484
2485 #  ifdef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
2486
2487         /* Some platforms have a newlocale() that can handle disparate LC_ALL
2488          * input, so on these a single call to newlocale() always works */
2489 #  else
2490
2491         /* If a single call to newlocale() will do */
2492         if (! need_loop)
2493
2494 #  endif
2495
2496         {
2497             new_obj = newlocale(mask,
2498                                 override_ignored_category(index, new_locale),
2499                                 basis_obj);
2500             if (! new_obj) {
2501                 DEBUG_NEW_OBJECT_FAILED(category_names[index], new_locale,
2502                                         basis_obj);
2503
2504                 /* Since the call failed, it didn't trash 'basis_obj', which is
2505                  * a dup for these objects, and hence would leak if we don't
2506                  * free it.  XXX However, something is seriously wrong if we
2507                  * can't switch to C or the global locale, so maybe should
2508                  * panic instead */
2509                 if (entry_obj_is_special) {
2510                     freelocale(basis_obj);
2511                 }
2512
2513                 goto must_restore_state;
2514             }
2515
2516             DEBUG_NEW_OBJECT_CREATED(category_names[index], new_locale,
2517                                      new_obj, basis_obj, caller_line);
2518
2519             update_PL_curlocales_i(index, new_locale, caller_line);
2520         }
2521
2522 #  ifndef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
2523
2524         else {  /* Need multiple newlocale() calls */
2525
2526             /* Loop through the individual categories, setting the locale of
2527              * each to the corresponding name previously populated into
2528              * newlocales[].  Each iteration builds on the previous one, adding
2529              * its category to what's already been calculated, and taking as a
2530              * basis for what's been calculated 'basis_obj', which is updated
2531              * each iteration to be the result of the previous one.  Upon
2532              * success, newlocale() trashes the 'basis_obj' parameter to it.
2533              * If any iteration fails, we immediately give up, restore the
2534              * locale to what it was at the time this function was called
2535              * (saved in 'entry_obj'), and return failure. */
2536
2537             /* Loop, using the previous iteration's result as the basis for the
2538              * next one.  (The first time we effectively use the locale in
2539              * force upon entry to this function.) */
2540             for_all_individual_category_indexes(i) {
2541                 new_obj = newlocale(category_masks[i],
2542                                     new_locales[i],
2543                                     basis_obj);
2544                 if (new_obj) {
2545                     DEBUG_NEW_OBJECT_CREATED(category_names[i],
2546                                              new_locales[i],
2547                                              new_obj, basis_obj,
2548                                              caller_line);
2549                     basis_obj = new_obj;
2550                     continue;
2551                 }
2552
2553                 /* Failed.  Likely this is because the proposed new locale
2554                  * isn't valid on this system. */
2555
2556                 DEBUG_NEW_OBJECT_FAILED(category_names[i],
2557                                         new_locales[i],
2558                                         basis_obj);
2559
2560                 /* newlocale() didn't trash this, since the function call
2561                  * failed */
2562                 freelocale(basis_obj);
2563
2564                 for_all_individual_category_indexes(j) {
2565                     Safefree(new_locales[j]);
2566                 }
2567
2568                 goto must_restore_state;
2569             }
2570
2571             /* Success for all categories. */
2572             for_all_individual_category_indexes(i) {
2573                 update_PL_curlocales_i(i, new_locales[i], caller_line);
2574                 Safefree(new_locales[i]);
2575             }
2576
2577             /* We dup'd entry_obj in case we had to fall back to it.  The
2578              * newlocale() above destroyed the dup when it first succeeded, but
2579              * entry_obj itself is left dangling, so free it */
2580             if (! entry_obj_is_special) {
2581                 freelocale(entry_obj);
2582             }
2583         }
2584
2585 #  endif    /* End of newlocale can't handle disparate LC_ALL input */
2586
2587     }
2588
2589 #  undef DEBUG_NEW_OBJECT_CREATED
2590 #  undef DEBUG_NEW_OBJECT_FAILED
2591
2592     /* Here, successfully created an object representing the desired locale;
2593      * now switch into it */
2594     if (! uselocale(new_obj)) {
2595         freelocale(new_obj);
2596         locale_panic_(Perl_form(aTHX_ "(called from %" LINE_Tf "):"
2597                                       " bool_setlocale_2008_i: switching"
2598                                       " into new locale failed",
2599                                       caller_line));
2600     }
2601
2602     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2603                            "bool_setlocale_2008_i: now using %p\n", new_obj));
2604
2605 #  ifdef MULTIPLICITY   /* Unlikely, but POSIX 2008 functions could be
2606                            Configured to be used on unthreaded perls, in which
2607                            case this object doesn't exist */
2608
2609     if (DEBUG_Lv_TEST) {
2610         if (PL_cur_locale_obj != new_obj) {
2611             PerlIO_printf(Perl_debug_log,
2612                           "bool_setlocale_2008_i: PL_cur_locale_obj"
2613                           " was %p, now is %p\n",
2614                           PL_cur_locale_obj, new_obj);
2615         }
2616     }
2617
2618     /* Update the current object */
2619     PL_cur_locale_obj = new_obj;
2620
2621 #  endif
2622 #  ifdef HAS_GLIBC_LC_MESSAGES_BUG
2623
2624     /* Invalidate the glibc cache of loaded translations if the locale has
2625      * changed, see [perl #134264] and
2626      * https://sourceware.org/bugzilla/show_bug.cgi?id=24936 */
2627     if (old_messages_locale) {
2628         if (strNE(old_messages_locale, querylocale_c(LC_MESSAGES))) {
2629             textdomain(textdomain(NULL));
2630         }
2631     }
2632
2633 #  endif
2634
2635     return true;
2636
2637   must_restore_state:
2638
2639     /* We earlier switched to the LC_ALL => C locale in anticipation of it
2640      * succeeding,  Now have to switch back to the state upon entry.  */
2641     if (! uselocale(entry_obj)) {
2642         setlocale_failure_panic_i(index, "switching back to",
2643                                   locale_on_entry, __LINE__, caller_line);
2644     }
2645
2646     return false;
2647 }
2648
2649 /*---------------------------------------------------------------------------*/
2650
2651 #  define void_setlocale_i_with_caller(i, locale, file, line)               \
2652      STMT_START {                                                           \
2653         if (! bool_setlocale_i(i, locale))                                  \
2654             setlocale_failure_panic_via_i(i, NULL, locale, __LINE__, 0,     \
2655                                           file, line);                      \
2656      } STMT_END
2657
2658 #  define void_setlocale_r_with_caller(cat, locale, file, line)             \
2659         void_setlocale_i_with_caller(get_category_index(cat), locale,       \
2660                                      file, line)
2661
2662 #  define void_setlocale_c_with_caller(cat, locale, file, line)             \
2663             void_setlocale_i_with_caller(cat##_INDEX_, locale, file, line)
2664
2665 #  define void_setlocale_i(i, locale)                                       \
2666                 void_setlocale_i_with_caller(i, locale, __FILE__, __LINE__)
2667 #  define void_setlocale_c(cat, locale)                                     \
2668                                   void_setlocale_i(cat##_INDEX_, locale)
2669 #  define void_setlocale_r(cat, locale)                                     \
2670                   void_setlocale_i(get_category_index(cat), locale)
2671
2672 /*===========================================================================*/
2673
2674 #else
2675 #  error Unexpected Configuration
2676 #endif   /* End of the various implementations of the setlocale and
2677             querylocale macros used in the remainder of this program */
2678
2679 /* query_nominal_locale_i() is used when the caller needs the locale that an
2680  * external caller would be expecting, and not what we're secretly using
2681  * behind the scenes.  It deliberately doesn't handle LC_ALL; use
2682  * calculate_LC_ALL_string() for that. */
2683 #ifdef USE_LOCALE_NUMERIC
2684 #  define query_nominal_locale_i(i)                                         \
2685       (__ASSERT_(i != LC_ALL_INDEX_)                                        \
2686        ((i == LC_NUMERIC_INDEX_) ? PL_numeric_name : querylocale_i(i)))
2687 #elif defined(USE_LOCALE)
2688 #  define query_nominal_locale_i(i)                                         \
2689       (__ASSERT_(i != LC_ALL_INDEX_) querylocale_i(i))
2690 #else
2691 #  define query_nominal_locale_i(i)  "C"
2692 #endif
2693
2694 #ifdef USE_PL_CURLOCALES
2695
2696 STATIC void
2697 S_update_PL_curlocales_i(pTHX_
2698                          const locale_category_index index,
2699                          const char * new_locale,
2700                          const line_t caller_line)
2701 {
2702     /* Update PL_curlocales[], which is parallel to the other ones indexed by
2703      * our mapping of libc category number to our internal equivalents. */
2704
2705     PERL_ARGS_ASSERT_UPDATE_PL_CURLOCALES_I;
2706     assert(index <= LC_ALL_INDEX_);
2707
2708     if (index == LC_ALL_INDEX_) {
2709
2710         /* For LC_ALL, we change all individual categories to correspond,
2711          * including the LC_ALL element */
2712         for (unsigned int i = 0; i <= LC_ALL_INDEX_; i++) {
2713             Safefree(PL_curlocales[i]);
2714             PL_curlocales[i] = NULL;
2715         }
2716
2717         switch (parse_LC_ALL_string(new_locale,
2718                                     (const char **) &PL_curlocales,
2719                                     check_that_overridden,  /* things should
2720                                                                have already
2721                                                                been overridden
2722                                                                */
2723                                     true,   /* Always fill array */
2724                                     true,   /* Panic if fails, as to get here
2725                                                it earlier had to have succeeded
2726                                                */
2727                                    caller_line))
2728         {
2729           case invalid:
2730           case no_array:
2731           case only_element_0:
2732             locale_panic_via_("Unexpected return from parse_LC_ALL_string",
2733                               __FILE__, caller_line);
2734
2735           case full_array:
2736             /* parse_LC_ALL_string() has already filled PL_curlocales properly,
2737              * except for the LC_ALL element, which should be set to
2738              * 'new_locale'. */
2739             PL_curlocales[LC_ALL_INDEX_] = savepv(new_locale);
2740         }
2741     }
2742     else {  /* Not LC_ALL */
2743
2744         /* Update the single category's record */
2745         Safefree(PL_curlocales[index]);
2746         PL_curlocales[index] = savepv(new_locale);
2747
2748         /* Invalidate LC_ALL */
2749         Safefree(PL_curlocales[LC_ALL_INDEX_]);
2750         PL_curlocales[LC_ALL_INDEX_] = NULL;
2751     }
2752 }
2753
2754 #  endif  /* Need PL_curlocales[] */
2755
2756 /*===========================================================================*/
2757
2758 #if defined(USE_LOCALE)
2759
2760 /* This paradigm is needed in several places in the function below.  We have to
2761  * substitute the nominal locale for LC_NUMERIC when returning a value for
2762  * external consumption */
2763 #  ifndef USE_LOCALE_NUMERIC
2764 #    define ENTRY(i, array, format)  array[i]
2765 #  else
2766 #    define ENTRY(i, array, format)                         \
2767        (UNLIKELY(   format == EXTERNAL_FORMAT_FOR_QUERY     \
2768                  && i == LC_NUMERIC_INDEX_)                 \
2769         ? PL_numeric_name                                   \
2770         : array[i])
2771 #  endif
2772
2773 STATIC
2774 const char *
2775 S_calculate_LC_ALL_string(pTHX_ const char ** category_locales_list,
2776                                 const calc_LC_ALL_format format,
2777                                 const calc_LC_ALL_return returning,
2778                                 const line_t caller_line)
2779 {
2780     PERL_ARGS_ASSERT_CALCULATE_LC_ALL_STRING;
2781
2782     /* NOTE: On Configurations that have PL_curlocales[], this function has the
2783      * side effect of updating the LC_ALL_INDEX_ element with its result.
2784      *
2785      * This function calculates a string that defines the locale(s) LC_ALL is
2786      * set to, in either:
2787      *  1)  Our internal format if 'format' is set to INTERNAL_FORMAT.
2788      *  2)  The external format returned by Perl_setlocale() if 'format' is set
2789      *      to EXTERNAL_FORMAT_FOR_QUERY or EXTERNAL_FORMAT_FOR_SET.
2790      *
2791      *      These two are distinguished by:
2792      *       a) EXTERNAL_FORMAT_FOR_SET returns the actual locale currently in
2793      *          effect.
2794      *       b) EXTERNAL_FORMAT_FOR_QUERY returns the nominal locale.
2795      *          Currently this can differ only from the actual locale in the
2796      *          LC_NUMERIC category when it is set to a locale whose radix is
2797      *          not a dot.  (The actual locale is kept as a dot to accommodate
2798      *          the large corpus of XS code that expects it to be that;
2799      *          switched to a non-dot temporarily during certain operations
2800      *          that require the actual radix.)
2801      *
2802      * In both 1) and 2), LC_ALL's values are passed to this function by
2803      * 'category_locales_list' which is either:
2804      *  1) a pointer to an array of strings with up-to-date values of all the
2805      *     individual categories; or
2806      *  2) NULL, to indicate to use querylocale_i() to get each individual
2807      *     value.
2808      *
2809      * The caller sets 'returning' to
2810      *      WANT_TEMP_PV        the function returns the calculated string
2811      *                              as a mortalized temporary, so the caller
2812      *                              doesn't have to worry about it being
2813      *                              per-thread, nor needs to arrange for its
2814      *                              clean-up.
2815      *      WANT_PL_setlocale_buf  the function stores the calculated string
2816      *                              into the per-thread buffer PL_setlocale_buf
2817      *                              and returns a pointer to that.  The buffer
2818      *                              is cleaned up automatically in process
2819      *                              destruction.  This return method avoids
2820      *                              extra copies in some circumstances.
2821      *      WANT_VOID           NULL is returned.  This is used when the
2822      *                              function is being called only for its side
2823      *                              effect of updating
2824      *                              PL_curlocales[LC_ALL_INDEX_]
2825      *
2826      * querylocale(), on systems that have it, doesn't tend to work for LC_ALL.
2827      * So we have to construct the answer ourselves based on the passed in
2828      * data.
2829      *
2830      * If all individual categories are the same locale, we can just set LC_ALL
2831      * to that locale.  But if not, we have to create an aggregation of all the
2832      * categories on the system.  Platforms differ as to the syntax they use
2833      * for these non-uniform locales for LC_ALL.  Some, like glibc and Windows,
2834      * use an unordered series of name=value pairs, like
2835      *      LC_NUMERIC=C;LC_TIME=en_US.UTF-8;...
2836      * to specify LC_ALL; others, like *BSD, use a positional notation with a
2837      * delimitter, typically a single '/' character:
2838      *      C/en_UK.UTF-8/...
2839      *
2840      * When the external format is desired, this function returns whatever the
2841      * system expects.  The internal format is always name=value pairs.
2842      *
2843      * For systems that have categories we don't know about, the algorithm
2844      * below won't know about those missing categories, leading to potential
2845      * bugs for code that looks at them.  If there is an environment variable
2846      * that sets that category, we won't know to look for it, and so our use of
2847      * LANG or "C" improperly overrides it.  On the other hand, if we don't do
2848      * what is done here, and there is no environment variable, the category's
2849      * locale should be set to LANG or "C".  So there is no good solution.  khw
2850      * thinks the best is to make sure we have a complete list of possible
2851      * categories, adding new ones as they show up on obscure platforms.
2852      */
2853
2854     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2855                            "Entering calculate_LC_ALL_string(%s);"
2856                            " called from %" LINE_Tf "\n",
2857                            ((format == EXTERNAL_FORMAT_FOR_QUERY)
2858                             ? "EXTERNAL_FORMAT_FOR_QUERY"
2859                             : ((format == EXTERNAL_FORMAT_FOR_SET)
2860                                ? "EXTERNAL_FORMAT_FOR_SET"
2861                                : "INTERNAL_FORMAT")),
2862                            caller_line));
2863
2864     bool input_list_was_NULL = (category_locales_list == NULL);
2865
2866     /* If there was no input category list, construct a temporary one
2867      * ourselves. */
2868     const char * my_category_locales_list[LC_ALL_INDEX_];
2869     const char ** locales_list = category_locales_list;
2870     if (locales_list == NULL) {
2871         locales_list = my_category_locales_list;
2872
2873         if (format == EXTERNAL_FORMAT_FOR_QUERY) {
2874             for_all_individual_category_indexes(i) {
2875                 locales_list[i] = query_nominal_locale_i(i);
2876             }
2877         }
2878         else {
2879             for_all_individual_category_indexes(i) {
2880                 locales_list[i] = querylocale_i(i);
2881             }
2882         }
2883     }
2884
2885     /* While we are calculating LC_ALL, we see if every category's locale is
2886      * the same as every other's or not. */
2887 #  ifndef HAS_IGNORED_LOCALE_CATEGORIES_
2888
2889     /* When we pay attention to all categories, we assume they are all the same
2890      * until proven different */
2891     bool disparate = false;
2892
2893 #  else
2894
2895     /* But if there are ignored categories, those will be set to "C", so try an
2896      * arbitrary category, and if it isn't C, we know immediately that the
2897      * locales are disparate.  (The #if conditionals are to handle the case
2898      * where LC_NUMERIC_INDEX_ is 0.  We don't want to use LC_NUMERIC to
2899      * compare, as that may be different between external and internal forms.)
2900      * */
2901 #    if ! defined(USE_LOCALE_NUMERIC)
2902
2903     bool disparate = ! isNAME_C_OR_POSIX(locales_list[0]);
2904
2905 #    elif LC_NUMERIC_INDEX_ != 0
2906
2907     bool disparate = ! isNAME_C_OR_POSIX(locales_list[0]);
2908
2909 #    else
2910
2911     /* Would need revision to handle the very unlikely case where only a single
2912      * category, LC_NUMERIC, is defined */
2913     assert(LOCALE_CATEGORIES_COUNT_ > 0);
2914
2915     bool disparate = ! isNAME_C_OR_POSIX(locales_list[1]);
2916
2917 #    endif
2918 #  endif
2919
2920     /* Calculate the needed size for the string listing the individual locales.
2921      * Initialize with values known at compile time. */
2922     Size_t total_len;
2923     const char *separator;
2924
2925 #  ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS  /* Positional formatted LC_ALL */
2926     PERL_UNUSED_ARG(format);
2927 #  else
2928
2929     if (format != INTERNAL_FORMAT) {
2930
2931         /* Here, we will be using positional notation.  it includes n-1
2932          * separators */
2933         total_len = (  LOCALE_CATEGORIES_COUNT_ - 1)
2934                      * STRLENs(PERL_LC_ALL_SEPARATOR)
2935                   + 1;   /* And a trailing NUL */
2936         separator = PERL_LC_ALL_SEPARATOR;
2937     }
2938     else
2939
2940 #  endif
2941
2942     {
2943         /* name=value output is always used in internal format, and when
2944          * positional isn't available on the platform. */
2945         total_len = lc_all_boiler_plate_length;
2946         separator = ";";
2947     }
2948
2949     /* The total length then is just the sum of the above boiler-plate plus the
2950      * total strlen()s of the locale name of each individual category. */
2951     for_all_individual_category_indexes(i) {
2952         const char * entry = ENTRY(i, locales_list, format);
2953
2954         total_len += strlen(entry);
2955         if (! disparate && strNE(entry, locales_list[0])) {
2956             disparate = true;
2957         }
2958     }
2959
2960     bool free_if_void_return = false;
2961     const char * retval;
2962
2963     /* If all categories have the same locale, we already know the answer */
2964     if (! disparate) {
2965         if (returning == WANT_PL_setlocale_buf) {
2966             save_to_buffer(locales_list[0],
2967                            &PL_setlocale_buf,
2968                            &PL_setlocale_bufsize);
2969             retval = PL_setlocale_buf;
2970         }
2971         else {
2972
2973             retval = locales_list[0];
2974
2975             /* If a temporary is wanted for the return, and we had to create
2976              * the input list ourselves, we created it into such a temporary,
2977              * so no further work is needed; but otherwise, make a mortal copy
2978              * of this passed-in list element */
2979             if (returning == WANT_TEMP_PV && ! input_list_was_NULL) {
2980                 retval = savepv(retval);
2981                 SAVEFREEPV(retval);
2982             }
2983
2984             /* In all cases here, there's nothing we create that needs to be
2985              * freed, so leave 'free_if_void_return' set to the default
2986              * 'false'. */
2987         }
2988     }
2989     else {  /* Here, not all categories have the same locale */
2990
2991         char * constructed;
2992
2993         /* If returning to PL_setlocale_buf, set up to write directly to it,
2994          * being sure it is resized to be large enough */
2995         if (returning == WANT_PL_setlocale_buf) {
2996             set_save_buffer_min_size(total_len,
2997                                      &PL_setlocale_buf,
2998                                      &PL_setlocale_bufsize);
2999             constructed = PL_setlocale_buf;
3000         }
3001         else {  /* Otherwise we need new memory to hold the calculated value. */
3002
3003             Newx(constructed, total_len, char);
3004
3005             /* If returning the new memory, it must be set up to be freed
3006              * later; otherwise at the end of this function */
3007             if (returning == WANT_TEMP_PV) {
3008                 SAVEFREEPV(constructed);
3009             }
3010             else {
3011                 free_if_void_return = true;
3012             }
3013         }
3014
3015         constructed[0] = '\0';
3016
3017         /* Loop through all the categories */
3018         for_all_individual_category_indexes(j) {
3019
3020             /* Add a separator, except before the first one */
3021             if (j != 0) {
3022                 my_strlcat(constructed, separator, total_len);
3023             }
3024
3025             const char * entry;
3026             Size_t needed_len;
3027             unsigned int i = j;
3028
3029 #  ifndef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
3030
3031             if (UNLIKELY(format != INTERNAL_FORMAT)) {
3032
3033                 /* In positional notation 'j' means the position, and we have
3034                  * to convert to the index 'i' */
3035                 i = map_LC_ALL_position_to_index[j];
3036
3037                 entry = ENTRY(i, locales_list, format);
3038                 needed_len = my_strlcat(constructed, entry, total_len);
3039             }
3040             else
3041
3042 #  endif
3043             {
3044                 /* Below, we are to use name=value notation, either because
3045                  * that's what the platform uses, or because this is the
3046                  * internal format, which uses that notation regardless of the
3047                  * external form */
3048
3049                 entry = ENTRY(i, locales_list, format);
3050
3051                 /* "name=locale;" */
3052                 my_strlcat(constructed, category_names[i], total_len);
3053                 my_strlcat(constructed, "=", total_len);
3054                 needed_len = my_strlcat(constructed, entry, total_len);
3055             }
3056
3057             if (LIKELY(needed_len <= total_len)) {
3058                 continue;
3059             }
3060
3061             /* If would have overflowed, panic */
3062             locale_panic_via_(Perl_form(aTHX_
3063                                         "Internal length calculation wrong.\n"
3064                                         "\"%s\" was not entirely added to"
3065                                         " \"%.*s\"; needed=%zu, had=%zu",
3066                                         entry, (int) total_len,
3067                                         constructed,
3068                                         needed_len, total_len),
3069                                 __FILE__,
3070                                 caller_line);
3071         } /* End of loop through the categories */
3072
3073         retval = constructed;
3074     } /* End of the categories' locales are displarate */
3075
3076 #  if defined(USE_PL_CURLOCALES) && defined(LC_ALL)
3077
3078     if (format == INTERNAL_FORMAT) {
3079
3080         /* PL_curlocales[LC_ALL_INDEX_] is updated as a side-effect of this
3081          * function for internal format. */
3082         Safefree(PL_curlocales[LC_ALL_INDEX_]);
3083         PL_curlocales[LC_ALL_INDEX_] = savepv(retval);
3084     }
3085
3086 #  endif
3087
3088     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3089                            "calculate_LC_ALL_string calculated '%s'\n",
3090                            retval));
3091
3092     if (returning == WANT_VOID) {
3093         if (free_if_void_return) {
3094             Safefree(retval);
3095         }
3096
3097         return NULL;
3098     }
3099
3100     return retval;
3101 }
3102
3103 #  if defined(WIN32) || (     defined(USE_POSIX_2008_LOCALE)        \
3104                          && ! defined(USE_QUERYLOCALE))
3105
3106 STATIC const char *
3107 S_find_locale_from_environment(pTHX_ const locale_category_index index)
3108 {
3109     /* NB: This function may actually change the locale on Windows.  It
3110      * currently is designed to be called only from setting the locale on
3111      * Windows, and POSIX 2008
3112      *
3113      * This function returns the locale specified by the program's environment
3114      * for the category specified by our internal index number 'index'.  It
3115      * therefore simulates:
3116      *      setlocale(cat, "")
3117      * but, except for some cases in Windows, doesn't actually change the
3118      * locale; merely returns it.
3119      *
3120      * The return need not be freed by the caller.  This
3121      * promise relies on PerlEnv_getenv() returning a mortalized copy to us.
3122      *
3123      * The simulation is needed only on certain platforms; otherwise, libc is
3124      * called with "" to get the actual value(s).  The simulation is needed
3125      * for:
3126      *
3127      *  1)  On Windows systems, the concept of the POSIX ordering of
3128      *      environment variables is missing.  To increase portability of
3129      *      programs across platforms, the POSIX ordering is emulated on
3130      *      Windows.
3131      *
3132      *  2)  On POSIX 2008 systems without querylocale(), it is problematic
3133      *      getting the results of the POSIX 2008 equivalent of
3134      *
3135      *          setlocale(category, "")
3136      *
3137      *      To ensure that we know exactly what those values are, we do the
3138      *      setting ourselves, using the documented algorithm specified by the
3139      *      POSIX standard (assuming the platform follows the Standard) rather
3140      *      than use "" as the locale.  This will lead to results that differ
3141      *      from native behavior if the native behavior differs from the
3142      *      Standard's documented value, but khw believes it is better to know
3143      *      what's going on, even if different from native, than to just guess.
3144      *
3145      *      glibc systems differ from this standard in having a LANGUAGE
3146      *      environment variable used for just LC_MESSAGES.  This function does
3147      *      NOT handle that.
3148      *
3149      *      Another option for the POSIX 2008 case would be, in a critical
3150      *      section, to save the global locale's current value, and do a
3151      *      straight setlocale(LC_ALL, "").  That would return our desired
3152      *      values, destroying the global locale's, which we would then
3153      *      restore.  But that could cause races with any other thread that is
3154      *      using the global locale and isn't using the mutex.  And, the only
3155      *      reason someone would have done that is because they are calling a
3156      *      library function, like in gtk, that calls setlocale(), and which
3157      *      can't be changed to use the mutex.  That wouldn't be a problem if
3158      *      this were to be done before any threads had switched, say during
3159      *      perl construction time.  But this code would still be needed for
3160      *      the general case.
3161      *
3162      * The Windows and POSIX 2008 differ in that the ultimate fallback is "C"
3163      * in POSIX, and is the system default locale in Windows.  To get that
3164      * system default value, we actually have to call setlocale() on Windows.
3165      */
3166
3167     const char * const lc_all = PerlEnv_getenv("LC_ALL");
3168     const char * locale_names[LC_ALL_INDEX_] = { NULL };
3169
3170     /* Use any "LC_ALL" environment variable, as it overrides everything else.
3171      * */
3172     if (lc_all && strNE(lc_all, "")) {
3173         return lc_all;
3174     }
3175
3176     /* Here, no usable LC_ALL environment variable.  We have to handle each
3177      * category separately.  If all categories are desired, we loop through
3178      * them all.  If only an individual category is desired, to avoid
3179      * duplicating logic, we use the same loop, but set up the limits so it is
3180      * only executed once, for that particular category. */
3181     locale_category_index lower, upper, offset;
3182     if (index == LC_ALL_INDEX_) {
3183         lower = (locale_category_index) 0;
3184         upper = (locale_category_index) ((int) LC_ALL_INDEX_ - 1);
3185         offset = (locale_category_index) 0;
3186     }
3187     else {
3188         lower = index;
3189         upper = index;
3190
3191         /* 'offset' is used so that the result of the single loop iteration is
3192          * stored into output[0] */
3193         offset = lower;
3194     }
3195
3196     /* When no LC_ALL environment variable, LANG is used as a default, but
3197      * overridden for individual categories that have corresponding environment
3198      * variables.  If no LANG exists, the default is "C" on POSIX 2008, or the
3199      * system default for the category on Windows. */
3200     const char * env_lang = NULL;
3201
3202     /* For each desired category, use any corresponding environment variable;
3203      * or the default if none such exists. */
3204     bool is_disparate = false;  /* Assume is uniform until proven otherwise */
3205     for (unsigned i = lower; i <= upper; i++) {
3206         const char * const env_override = PerlEnv_getenv(category_names[i]);
3207         unsigned int j = i - offset;
3208
3209         if (env_override && strNE(env_override, "")) {
3210             locale_names[j] = env_override;
3211         }
3212         else { /* Here, no corresponding environment variable, see if LANG
3213                   exists and is usable.  Done this way to avoid fetching LANG
3214                   unless it is actually needed */
3215             if (env_lang == NULL) {
3216                 env_lang = PerlEnv_getenv("LANG");
3217
3218                 /* If not usable, set it to a non-NULL illegal value so won't
3219                  * try to use it below */
3220                 if (env_lang == NULL || strEQ(env_lang, "")) {
3221                     env_lang = (const char *) 1;
3222                 }
3223             }
3224
3225             /* If a usable LANG exists, use it. */
3226             if (env_lang != NULL && env_lang != (const char *) 1) {
3227                 locale_names[j] = env_lang;
3228             }
3229             else {
3230
3231 #    ifdef WIN32
3232                 /* If no LANG, use the system default on Windows. */
3233                 locale_names[j] = wrap_wsetlocale(categories[i], ".ACP");
3234                 if (locale_names[j]) {
3235                     SAVEFREEPV(locale_names[j]);
3236                 }
3237                 else
3238 #    endif
3239                 {   /* If nothing was found or worked, use C */
3240                     locale_names[j] = "C";
3241                 }
3242             }
3243         }
3244
3245         if (j > 0 && ! is_disparate && strNE(locale_names[0], locale_names[j]))
3246         {
3247             is_disparate = true;
3248         }
3249
3250         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3251                  "find_locale_from_environment i=%u, j=%u, name=%s,"
3252                  " locale=%s, locale of 0th category=%s, disparate=%d\n",
3253                  i, j, category_names[i],
3254                  locale_names[j], locale_names[0], is_disparate));
3255     }
3256
3257     if (! is_disparate) {
3258         return locale_names[0];
3259     }
3260
3261     return calculate_LC_ALL_string(locale_names, INTERNAL_FORMAT,
3262                                    WANT_TEMP_PV,
3263                                    __LINE__);
3264 }
3265
3266 #  endif
3267 #  if defined(DEBUGGING) || defined(USE_PERL_SWITCH_LOCALE_CONTEXT)
3268
3269 STATIC const char *
3270 S_get_LC_ALL_display(pTHX)
3271 {
3272     return calculate_LC_ALL_string(NULL, INTERNAL_FORMAT,
3273                                    WANT_TEMP_PV,
3274                                    __LINE__);
3275 }
3276
3277 #  endif
3278
3279 STATIC void
3280 S_setlocale_failure_panic_via_i(pTHX_
3281                                 const locale_category_index cat_index,
3282                                 const char * current,
3283                                 const char * failed,
3284                                 const line_t proxy_caller_line,
3285                                 const line_t immediate_caller_line,
3286                                 const char * const higher_caller_file,
3287                                 const line_t higher_caller_line)
3288 {
3289     PERL_ARGS_ASSERT_SETLOCALE_FAILURE_PANIC_VIA_I;
3290
3291     /* Called to panic when a setlocale form unexpectedly failed for the
3292      * category determined by 'cat_index', and the locale that was in effect
3293      * (and likely still is) is 'current'.  'current' may be NULL, which causes
3294      * this function to query what it is.
3295      *
3296      * The extra caller information is used for when a function acts as a
3297      * stand-in for another function, which a typical reader would more likely
3298      * think would be the caller
3299      *
3300      * If a line number is 0, its stack (sort-of) frame is omitted; same if
3301      * it's the same line number as the next higher caller. */
3302
3303     const int cat = categories[cat_index];
3304     const char * name = category_names[cat_index];
3305
3306     dSAVE_ERRNO;
3307
3308     if (current == NULL) {
3309         current = querylocale_i(cat_index);
3310     }
3311
3312     const char * proxy_text = "";
3313     if (proxy_caller_line != 0 && proxy_caller_line != immediate_caller_line)
3314     {
3315         proxy_text = Perl_form(aTHX_ "\nCalled via %s: %" LINE_Tf,
3316                                       __FILE__, proxy_caller_line);
3317     }
3318     if (   strNE(__FILE__, higher_caller_file)
3319         || (   immediate_caller_line != 0
3320             && immediate_caller_line != higher_caller_line))
3321     {
3322         proxy_text = Perl_form(aTHX_ "%s\nCalled via %s: %" LINE_Tf,
3323                                       proxy_text, __FILE__,
3324                                       immediate_caller_line);
3325     }
3326
3327     /* 'false' in the get_displayable_string() calls makes it not think the
3328      * locale is UTF-8, so just dumps bytes.  Actually figuring it out can be
3329      * too complicated for a panic situation. */
3330     const char * msg = Perl_form(aTHX_
3331                             "Can't change locale for %s (%d) from '%s' to '%s'"
3332                             " %s",
3333                             name, cat,
3334                             get_displayable_string(current,
3335                                                    current + strlen(current),
3336                                                    false),
3337                             get_displayable_string(failed,
3338                                                    failed + strlen(failed),
3339                                                    false),
3340                             proxy_text);
3341     RESTORE_ERRNO;
3342
3343     Perl_locale_panic(msg, __LINE__, higher_caller_file, higher_caller_line);
3344     NOT_REACHED; /* NOTREACHED */
3345 }
3346
3347 #  ifdef USE_LOCALE_NUMERIC
3348
3349 STATIC void
3350 S_new_numeric(pTHX_ const char *newnum, bool force)
3351 {
3352     PERL_ARGS_ASSERT_NEW_NUMERIC;
3353
3354     /* Called after each libc setlocale() or uselocale() call affecting
3355      * LC_NUMERIC, to tell core Perl this and that 'newnum' is the name of the
3356      * new locale, and we are switched into it.  It installs this locale as the
3357      * current underlying default, and then switches to the C locale, if
3358      * necessary, so that the code that has traditionally expected the radix
3359      * character to be a dot may continue to do so.
3360      *
3361      * The default locale and the C locale can be toggled between by use of the
3362      * set_numeric_underlying() and set_numeric_standard() functions, which
3363      * should probably not be called directly, but only via macros like
3364      * SET_NUMERIC_STANDARD() in perl.h.
3365      *
3366      * The toggling is necessary mainly so that a non-dot radix decimal point
3367      * character can be input and output, while allowing internal calculations
3368      * to use a dot.
3369      *
3370      * This sets several interpreter-level variables:
3371      * PL_numeric_name  The underlying locale's name: a copy of 'newnum'
3372      * PL_numeric_underlying   A boolean indicating if the toggled state is
3373      *                  such that the current locale is the program's
3374      *                  underlying locale
3375      * PL_numeric_standard   An int indicating if the toggled state is such
3376      *                  that the current locale is the C locale or
3377      *                  indistinguishable from the C locale.  If non-zero, it
3378      *                  is in C; if > 1, it means it may not be toggled away
3379      *                  from C.
3380      * PL_numeric_underlying_is_standard   A bool kept by this function
3381      *                  indicating that the underlying locale and the standard
3382      *                  C locale are indistinguishable for the purposes of
3383      *                  LC_NUMERIC.  This happens when both of the above two
3384      *                  variables are true at the same time.  (Toggling is a
3385      *                  no-op under these circumstances.)  This variable is
3386      *                  used to avoid having to recalculate.
3387      * PL_numeric_radix_sv   Contains the string that code should use for the
3388      *                  decimal point.  It is set to either a dot or the
3389      *                  program's underlying locale's radix character string,
3390      *                  depending on the situation.
3391      * PL_underlying_radix_sv   Contains the program's underlying locale's
3392      *                  radix character string.  This is copied into
3393      *                  PL_numeric_radix_sv when the situation warrants.  It
3394      *                  exists to avoid having to recalculate it when toggling.
3395      */
3396
3397     DEBUG_L( PerlIO_printf(Perl_debug_log,
3398                            "Called new_numeric with %s, PL_numeric_name=%s\n",
3399                            newnum, PL_numeric_name));
3400
3401     /* We keep records comparing the characteristics of the LC_NUMERIC catetory
3402      * of the current locale vs the standard C locale.  If the new locale that
3403      * has just been changed to is the same as the one our records are for,
3404      * they are still valid, and we don't have to recalculate them.  'force' is
3405      * true if the caller suspects that the records are out-of-date, so do go
3406      * ahead and recalculate them.  (This can happen when an external library
3407      * has had control and now perl is reestablishing control; we have to
3408      * assume that that library changed the locale in unknown ways.)
3409      *
3410      * Even if our records are valid, the new locale will likely have been
3411      * switched to before this function gets called, and we must toggle into
3412      * one indistinguishable from the C locale with regards to LC_NUMERIC
3413      * handling, so that all the libc functions that are affected by LC_NUMERIC
3414      * will work as expected.  This can be skipped if we already know that the
3415      * locale is indistinguishable from the C locale. */
3416     if (! force && strEQ(PL_numeric_name, newnum)) {
3417         if (! PL_numeric_underlying_is_standard) {
3418             set_numeric_standard(__FILE__, __LINE__);
3419         }
3420
3421         return;
3422     }
3423
3424     Safefree(PL_numeric_name);
3425     PL_numeric_name = savepv(newnum);
3426
3427     /* Handle the trivial case.  Since this is called at process
3428      * initialization, be aware that this bit can't rely on much being
3429      * available. */
3430     if (isNAME_C_OR_POSIX(PL_numeric_name)) {
3431         PL_numeric_standard = TRUE;
3432         PL_numeric_underlying_is_standard = TRUE;
3433         PL_numeric_underlying = TRUE;
3434         sv_setpv(PL_numeric_radix_sv, C_decimal_point);
3435         SvUTF8_off(PL_numeric_radix_sv);
3436         sv_setpv(PL_underlying_radix_sv, C_decimal_point);
3437         SvUTF8_off(PL_underlying_radix_sv);
3438         return;
3439     }
3440
3441     /* We are in the underlying locale until changed at the end of this
3442      * function */
3443     PL_numeric_underlying = TRUE;
3444
3445     char * radix = NULL;
3446     utf8ness_t utf8ness = UTF8NESS_IMMATERIAL;
3447
3448     /* Find and save this locale's radix character. */
3449     my_langinfo_c(RADIXCHAR, LC_NUMERIC, PL_numeric_name,
3450                   &radix, NULL, &utf8ness);
3451     sv_setpv(PL_underlying_radix_sv, radix);
3452
3453     if (utf8ness == UTF8NESS_YES) {
3454         SvUTF8_on(PL_underlying_radix_sv);
3455     }
3456     else {
3457         SvUTF8_off(PL_underlying_radix_sv);
3458     }
3459
3460     DEBUG_L(PerlIO_printf(Perl_debug_log,
3461                           "Locale radix is '%s', ?UTF-8=%d\n",
3462                           SvPVX(PL_underlying_radix_sv),
3463                           cBOOL(SvUTF8(PL_underlying_radix_sv))));
3464
3465     /* This locale is indistinguishable from C (for numeric purposes) if both
3466      * the radix character and the thousands separator are the same as C's.
3467      * Start with the radix. */
3468     PL_numeric_underlying_is_standard = strEQ(C_decimal_point, radix);
3469     Safefree(radix);
3470
3471 #    ifndef TS_W32_BROKEN_LOCALECONV
3472
3473     /* If the radix isn't the same as C's, we know it is distinguishable from
3474      * C; otherwise check the thousands separator too.  Only if both are the
3475      * same as C's is the locale indistinguishable from C.
3476      *
3477      * But on earlier Windows versions, there is a potential race.  This code
3478      * knows that localeconv() (elsewhere in this file) will be used to extract
3479      * the needed value, and localeconv() was buggy for quite a while, and that
3480      * code in this file hence uses a workaround.  And that workaround may have
3481      * an (unlikely) race.  Gathering the radix uses a different workaround on
3482      * Windows that doesn't involve a race.  It might be possible to do the
3483      * same for this (patches welcome).
3484      *
3485      * Until then khw doesn't think it's worth even the small risk of a race to
3486      * get this value, which doesn't appear to be used in any of the Microsoft
3487      * library routines anyway. */
3488
3489     if (PL_numeric_underlying_is_standard) {
3490         char * scratch_buffer = NULL;
3491         PL_numeric_underlying_is_standard = strEQ(C_thousands_sep,
3492                                              my_langinfo_c(THOUSEP, LC_NUMERIC,
3493                                                            PL_numeric_name,
3494                                                            &scratch_buffer,
3495                                                            NULL, NULL));
3496         Safefree(scratch_buffer);
3497     }
3498
3499 #    endif
3500
3501     PL_numeric_standard = PL_numeric_underlying_is_standard;
3502
3503     /* Keep LC_NUMERIC so that it has the C locale radix and thousands
3504      * separator.  This is for XS modules, so they don't have to worry about
3505      * the radix being a non-dot.  (Core operations that need the underlying
3506      * locale change to it temporarily). */
3507     if (! PL_numeric_standard) {
3508         set_numeric_standard(__FILE__, __LINE__);
3509     }
3510 }
3511
3512 #  endif
3513
3514 void
3515 Perl_set_numeric_standard(pTHX_ const char * const file, const line_t line)
3516 {
3517     PERL_ARGS_ASSERT_SET_NUMERIC_STANDARD;
3518     PERL_UNUSED_ARG(file);      /* Some Configurations ignore these */
3519     PERL_UNUSED_ARG(line);
3520
3521 #  ifdef USE_LOCALE_NUMERIC
3522
3523     /* Unconditionally toggle the LC_NUMERIC locale to the C locale
3524      *
3525      * Most code should use the macro SET_NUMERIC_STANDARD() in perl.h
3526      * instead of calling this directly.  The macro avoids calling this routine
3527      * if toggling isn't necessary according to our records (which could be
3528      * wrong if some XS code has changed the locale behind our back) */
3529
3530     DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to"
3531                                           " standard C; called from %s: %"
3532                                           LINE_Tf "\n", file, line));
3533
3534     void_setlocale_c_with_caller(LC_NUMERIC, "C", file, line);
3535     PL_numeric_standard = TRUE;
3536     sv_setpv(PL_numeric_radix_sv, C_decimal_point);
3537     SvUTF8_off(PL_numeric_radix_sv);
3538
3539     PL_numeric_underlying = PL_numeric_underlying_is_standard;
3540
3541 #  endif /* USE_LOCALE_NUMERIC */
3542
3543 }
3544
3545 void
3546 Perl_set_numeric_underlying(pTHX_ const char * const file, const line_t line)
3547 {
3548     PERL_ARGS_ASSERT_SET_NUMERIC_UNDERLYING;
3549     PERL_UNUSED_ARG(file);      /* Some Configurations ignore these */
3550     PERL_UNUSED_ARG(line);
3551
3552 #  ifdef USE_LOCALE_NUMERIC
3553
3554     /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
3555      * default.
3556      *
3557      * Most code should use the macro SET_NUMERIC_UNDERLYING() in perl.h
3558      * instead of calling this directly.  The macro avoids calling this routine
3559      * if toggling isn't necessary according to our records (which could be
3560      * wrong if some XS code has changed the locale behind our back) */
3561
3562     DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to %s;"
3563                                           " called from %s: %" LINE_Tf "\n",
3564                                           PL_numeric_name, file, line));
3565     /* Maybe not in init? assert(PL_locale_mutex_depth > 0);*/
3566
3567     void_setlocale_c_with_caller(LC_NUMERIC, PL_numeric_name, file, line);
3568     PL_numeric_underlying = TRUE;
3569     sv_setsv_nomg(PL_numeric_radix_sv, PL_underlying_radix_sv);
3570
3571     PL_numeric_standard = PL_numeric_underlying_is_standard;
3572
3573 #  endif /* USE_LOCALE_NUMERIC */
3574
3575 }
3576
3577 #  ifdef USE_LOCALE_CTYPE
3578
3579 STATIC void
3580 S_new_ctype(pTHX_ const char *newctype, bool force)
3581 {
3582     PERL_ARGS_ASSERT_NEW_CTYPE;
3583     PERL_UNUSED_ARG(force);
3584
3585     /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
3586      * core Perl this and that 'newctype' is the name of the new locale.
3587      *
3588      * This function sets up the folding arrays for all 256 bytes, assuming
3589      * that tofold() is tolc() since fold case is not a concept in POSIX,
3590      */
3591
3592     DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering new_ctype(%s)\n",
3593                                           newctype));
3594
3595     /* No change means no-op */
3596     if (strEQ(PL_ctype_name, newctype)) {
3597         return;
3598     }
3599
3600     /* We will replace any bad locale warning with
3601      *  1)  nothing if the new one is ok; or
3602      *  2)  a new warning for the bad new locale */
3603     if (PL_warn_locale) {
3604         SvREFCNT_dec_NN(PL_warn_locale);
3605         PL_warn_locale = NULL;
3606     }
3607
3608     /* Clear cache */
3609     Safefree(PL_ctype_name);
3610     PL_ctype_name = "";
3611
3612     PL_in_utf8_turkic_locale = FALSE;
3613
3614     /* For the C locale, just use the standard folds, and we know there are no
3615      * glitches possible, so return early.  Since this is called at process
3616      * initialization, be aware that this bit can't rely on much being
3617      * available. */
3618     if (isNAME_C_OR_POSIX(newctype)) {
3619         Copy(PL_fold, PL_fold_locale, 256, U8);
3620         PL_ctype_name = savepv(newctype);
3621         PL_in_utf8_CTYPE_locale = FALSE;
3622         return;
3623     }
3624
3625     /* The cache being cleared signals the called function to compute a new
3626      * value */
3627     PL_in_utf8_CTYPE_locale = is_locale_utf8(newctype);
3628
3629     PL_ctype_name = savepv(newctype);
3630     bool maybe_utf8_turkic = FALSE;
3631
3632     /* Don't check for problems if we are suppressing the warnings */
3633     bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
3634
3635     if (PL_in_utf8_CTYPE_locale) {
3636
3637         /* A UTF-8 locale gets standard rules.  But note that code still has to
3638          * handle this specially because of the three problematic code points
3639          * */
3640         Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
3641
3642         /* UTF-8 locales can have special handling for 'I' and 'i' if they are
3643          * Turkic.  Make sure these two are the only anomalies.  (We don't
3644          * require towupper and towlower because they aren't in C89.) */
3645
3646 #    if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER)
3647
3648         if (towupper('i') == 0x130 && towlower('I') == 0x131)
3649
3650 #    else
3651
3652         if (toU8_UPPER_LC('i') == 'i' && toU8_LOWER_LC('I') == 'I')
3653
3654 #    endif
3655
3656         {
3657             /* This is how we determine it really is Turkic */
3658             check_for_problems = TRUE;
3659             maybe_utf8_turkic = TRUE;
3660         }
3661     }
3662     else {  /* Not a canned locale we know the values for.  Compute them */
3663
3664 #    ifdef DEBUGGING
3665
3666         bool has_non_ascii_fold = FALSE;
3667         bool found_unexpected = FALSE;
3668
3669         /* Under -DLv, see if there are any folds outside the ASCII range.
3670          * This factoid is used below */
3671         if (DEBUG_Lv_TEST) {
3672             for (unsigned i = 128; i < 256; i++) {
3673                 int j = LATIN1_TO_NATIVE(i);
3674                 if (toU8_LOWER_LC(j) != j || toU8_UPPER_LC(j) != j) {
3675                     has_non_ascii_fold = TRUE;
3676                     break;
3677                 }
3678             }
3679         }
3680
3681 #    endif
3682
3683         for (unsigned i = 0; i < 256; i++) {
3684             if (isU8_UPPER_LC(i))
3685                 PL_fold_locale[i] = (U8) toU8_LOWER_LC(i);
3686             else if (isU8_LOWER_LC(i))
3687                 PL_fold_locale[i] = (U8) toU8_UPPER_LC(i);
3688             else
3689                 PL_fold_locale[i] = (U8) i;
3690
3691 #    ifdef DEBUGGING
3692
3693             /* Most locales these days are supersets of ASCII.  When debugging,
3694              * it is helpful to know what the exceptions to that are in this
3695              * locale */
3696             if (DEBUG_L_TEST) {
3697                 bool unexpected = FALSE;
3698
3699                 if (isUPPER_L1(i)) {
3700                     if (isUPPER_A(i)) {
3701                         if (PL_fold_locale[i] != toLOWER_A(i)) {
3702                             unexpected = TRUE;
3703                         }
3704                     }
3705                     else if (has_non_ascii_fold) {
3706                         if (PL_fold_locale[i] != toLOWER_L1(i)) {
3707                             unexpected = TRUE;
3708                         }
3709                     }
3710                     else if (PL_fold_locale[i] != i) {
3711                         unexpected = TRUE;
3712                     }
3713                 }
3714                 else if (   isLOWER_L1(i)
3715                          && i != LATIN_SMALL_LETTER_SHARP_S
3716                          && i != MICRO_SIGN)
3717                 {
3718                     if (isLOWER_A(i)) {
3719                         if (PL_fold_locale[i] != toUPPER_A(i)) {
3720                             unexpected = TRUE;
3721                         }
3722                     }
3723                     else if (has_non_ascii_fold) {
3724                         if (PL_fold_locale[i] != toUPPER_LATIN1_MOD(i)) {
3725                             unexpected = TRUE;
3726                         }
3727                     }
3728                     else if (PL_fold_locale[i] != i) {
3729                         unexpected = TRUE;
3730                     }
3731                 }
3732                 else if (PL_fold_locale[i] != i) {
3733                     unexpected = TRUE;
3734                 }
3735
3736                 if (unexpected) {
3737                     found_unexpected = TRUE;
3738                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3739                                            "For %s, fold of %02x is %02x\n",
3740                                            newctype, i, PL_fold_locale[i]));
3741                 }
3742             }
3743         }
3744
3745         if (found_unexpected) {
3746             DEBUG_L(PerlIO_printf(Perl_debug_log,
3747                                "All bytes not mentioned above either fold to"
3748                                " themselves or are the expected ASCII or"
3749                                " Latin1 ones\n"));
3750         }
3751         else {
3752             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3753                                    "No nonstandard folds were found\n"));
3754 #    endif
3755
3756         }
3757     }
3758
3759 #    ifdef MB_CUR_MAX
3760
3761     /* We only handle single-byte locales (outside of UTF-8 ones); so if this
3762      * locale requires more than one byte, there are going to be BIG problems.
3763      * */
3764
3765     const int mb_cur_max = MB_CUR_MAX;
3766
3767     if (mb_cur_max > 1 && ! PL_in_utf8_CTYPE_locale
3768
3769             /* Some platforms return MB_CUR_MAX > 1 for even the "C" locale.
3770              * Just assume that the implementation for them (plus for POSIX) is
3771              * correct and the > 1 value is spurious.  (Since these are
3772              * specially handled to never be considered UTF-8 locales, as long
3773              * as this is the only problem, everything should work fine */
3774         && ! isNAME_C_OR_POSIX(newctype))
3775     {
3776         DEBUG_L(PerlIO_printf(Perl_debug_log,
3777                               "Unsupported, MB_CUR_MAX=%d\n", mb_cur_max));
3778
3779         Perl_ck_warner_d(aTHX_ packWARN(WARN_LOCALE),
3780                          "Locale '%s' is unsupported, and may crash the"
3781                          " interpreter",
3782                          newctype);
3783     }
3784
3785 #    endif
3786
3787     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "check_for_problems=%d\n",
3788                                            check_for_problems));
3789
3790     /* We don't populate the other lists if a UTF-8 locale, but do check that
3791      * everything works as expected, unless checking turned off */
3792     if (check_for_problems) {
3793         /* Assume enough space for every character being bad.  4 spaces each
3794          * for the 94 printable characters that are output like "'x' "; and 5
3795          * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
3796          * NUL */
3797         char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
3798         unsigned int bad_count = 0;         /* Count of bad characters */
3799
3800         for (unsigned i = 0; i < 256; i++) {
3801
3802             /* If checking for locale problems, see if the native ASCII-range
3803              * printables plus \n and \t are in their expected categories in
3804              * the new locale.  If not, this could mean big trouble, upending
3805              * Perl's and most programs' assumptions, like having a
3806              * metacharacter with special meaning become a \w.  Fortunately,
3807              * it's very rare to find locales that aren't supersets of ASCII
3808              * nowadays.  It isn't a problem for most controls to be changed
3809              * into something else; we check only \n and \t, though perhaps \r
3810              * could be an issue as well. */
3811             if (isGRAPH_A(i) || isBLANK_A(i) || i == '\n') {
3812                 bool is_bad = FALSE;
3813                 char name[4] = { '\0' };
3814
3815                 /* Convert the name into a string */
3816                 if (isGRAPH_A(i)) {
3817                     name[0] = i;
3818                     name[1] = '\0';
3819                 }
3820                 else if (i == '\n') {
3821                     my_strlcpy(name, "\\n", sizeof(name));
3822                 }
3823                 else if (i == '\t') {
3824                     my_strlcpy(name, "\\t", sizeof(name));
3825                 }
3826                 else {
3827                     assert(i == ' ');
3828                     my_strlcpy(name, "' '", sizeof(name));
3829                 }
3830
3831                 /* Check each possibe class */
3832                 if (UNLIKELY(cBOOL(isU8_ALPHANUMERIC_LC(i)) !=
3833                                                     cBOOL(isALPHANUMERIC_A(i))))
3834                 {
3835                     is_bad = TRUE;
3836                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3837                                         "isalnum('%s') unexpectedly is %x\n",
3838                                         name, cBOOL(isU8_ALPHANUMERIC_LC(i))));
3839                 }
3840                 if (UNLIKELY(cBOOL(isU8_ALPHA_LC(i)) != cBOOL(isALPHA_A(i))))  {
3841                     is_bad = TRUE;
3842                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3843                                           "isalpha('%s') unexpectedly is %x\n",
3844                                           name, cBOOL(isU8_ALPHA_LC(i))));
3845                 }
3846                 if (UNLIKELY(cBOOL(isU8_DIGIT_LC(i)) != cBOOL(isDIGIT_A(i))))  {
3847                     is_bad = TRUE;
3848                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3849                                           "isdigit('%s') unexpectedly is %x\n",
3850                                           name, cBOOL(isU8_DIGIT_LC(i))));
3851                 }
3852                 if (UNLIKELY(cBOOL(isU8_GRAPH_LC(i)) != cBOOL(isGRAPH_A(i))))  {
3853                     is_bad = TRUE;
3854                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3855                                           "isgraph('%s') unexpectedly is %x\n",
3856                                           name, cBOOL(isU8_GRAPH_LC(i))));
3857                 }
3858                 if (UNLIKELY(cBOOL(isU8_LOWER_LC(i)) != cBOOL(isLOWER_A(i))))  {
3859                     is_bad = TRUE;
3860                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3861                                           "islower('%s') unexpectedly is %x\n",
3862                                           name, cBOOL(isU8_LOWER_LC(i))));
3863                 }
3864                 if (UNLIKELY(cBOOL(isU8_PRINT_LC(i)) != cBOOL(isPRINT_A(i))))  {
3865                     is_bad = TRUE;
3866                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3867                                           "isprint('%s') unexpectedly is %x\n",
3868                                           name, cBOOL(isU8_PRINT_LC(i))));
3869                 }
3870                 if (UNLIKELY(cBOOL(isU8_PUNCT_LC(i)) != cBOOL(isPUNCT_A(i))))  {
3871                     is_bad = TRUE;
3872                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3873                                           "ispunct('%s') unexpectedly is %x\n",
3874                                           name, cBOOL(isU8_PUNCT_LC(i))));
3875                 }
3876                 if (UNLIKELY(cBOOL(isU8_SPACE_LC(i)) != cBOOL(isSPACE_A(i))))  {
3877                     is_bad = TRUE;
3878                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3879                                           "isspace('%s') unexpectedly is %x\n",
3880                                           name, cBOOL(isU8_SPACE_LC(i))));
3881                 }
3882                 if (UNLIKELY(cBOOL(isU8_UPPER_LC(i)) != cBOOL(isUPPER_A(i))))  {
3883                     is_bad = TRUE;
3884                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3885                                           "isupper('%s') unexpectedly is %x\n",
3886                                           name, cBOOL(isU8_UPPER_LC(i))));
3887                 }
3888                 if (UNLIKELY(cBOOL(isU8_XDIGIT_LC(i))!= cBOOL(isXDIGIT_A(i)))) {
3889                     is_bad = TRUE;
3890                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3891                                           "isxdigit('%s') unexpectedly is %x\n",
3892                                           name, cBOOL(isU8_XDIGIT_LC(i))));
3893                 }
3894                 if (UNLIKELY(toU8_LOWER_LC(i) != (int) toLOWER_A(i))) {
3895                     is_bad = TRUE;
3896                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3897                             "tolower('%s')=0x%x instead of the expected 0x%x\n",
3898                             name, toU8_LOWER_LC(i), (int) toLOWER_A(i)));
3899                 }
3900                 if (UNLIKELY(toU8_UPPER_LC(i) != (int) toUPPER_A(i))) {
3901                     is_bad = TRUE;
3902                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3903                             "toupper('%s')=0x%x instead of the expected 0x%x\n",
3904                             name, toU8_UPPER_LC(i), (int) toUPPER_A(i)));
3905                 }
3906                 if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i))))  {
3907                     is_bad = TRUE;
3908                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3909                                 "'\\n' (=%02X) is not a control\n", (int) i));
3910                 }
3911
3912                 /* Add to the list;  Separate multiple entries with a blank */
3913                 if (is_bad) {
3914                     if (bad_count) {
3915                         my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list));
3916                     }
3917                     my_strlcat(bad_chars_list, name, sizeof(bad_chars_list));
3918                     bad_count++;
3919                 }
3920             }
3921         }
3922
3923         if (bad_count == 2 && maybe_utf8_turkic) {
3924             bad_count = 0;
3925             *bad_chars_list = '\0';
3926
3927             /* The casts are because otherwise some compilers warn:
3928                 gcc.gnu.org/bugzilla/show_bug.cgi?id=99950
3929                 gcc.gnu.org/bugzilla/show_bug.cgi?id=94182
3930              */
3931             PL_fold_locale[ (U8) 'I' ] = 'I';
3932             PL_fold_locale[ (U8) 'i' ] = 'i';
3933             PL_in_utf8_turkic_locale = TRUE;
3934             DEBUG_L(PerlIO_printf(Perl_debug_log, "%s is turkic\n", newctype));
3935         }
3936
3937         /* If we found problems and we want them output, do so */
3938         if (   (UNLIKELY(bad_count))
3939             && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
3940         {
3941             /* WARNING.  If you change the wording of these; be sure to update
3942              * t/loc_tools.pl correspondingly */
3943
3944             if (PL_in_utf8_CTYPE_locale) {
3945                 PL_warn_locale = Perl_newSVpvf(aTHX_
3946                      "Locale '%s' contains (at least) the following characters"
3947                      " which have\nunexpected meanings: %s\nThe Perl program"
3948                      " will use the expected meanings",
3949                       newctype, bad_chars_list);
3950             }
3951             else {
3952                 PL_warn_locale =
3953                     Perl_newSVpvf(aTHX_
3954                                   "\nThe following characters (and maybe"
3955                                   " others) may not have the same meaning as"
3956                                   " the Perl program expects: %s\n",
3957                                   bad_chars_list
3958                             );
3959             }
3960
3961 #    if defined(HAS_SOME_LANGINFO) || defined(WIN32)
3962
3963             char * scratch_buffer = NULL;
3964             Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s",
3965                                  my_langinfo_c(CODESET, LC_CTYPE,
3966                                                newctype,
3967                                                &scratch_buffer, NULL,
3968                                                NULL));
3969             Safefree(scratch_buffer);
3970
3971 #    endif
3972
3973             Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n");
3974
3975             /* If we are actually in the scope of the locale or are debugging,
3976              * output the message now.  If not in that scope, we save the
3977              * message to be output at the first operation using this locale,
3978              * if that actually happens.  Most programs don't use locales, so
3979              * they are immune to bad ones.  */
3980             if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
3981
3982                 /* The '0' below suppresses a bogus gcc compiler warning */
3983                 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale),
3984                                                                             0);
3985                 if (IN_LC(LC_CTYPE)) {
3986                     SvREFCNT_dec_NN(PL_warn_locale);
3987                     PL_warn_locale = NULL;
3988                 }
3989             }
3990         }
3991     }
3992 }
3993
3994 void
3995 Perl_warn_problematic_locale()
3996 {
3997     dTHX;
3998
3999     /* Core-only function that outputs the message in PL_warn_locale,
4000      * and then NULLS it.  Should be called only through the macro
4001      * CHECK_AND_WARN_PROBLEMATIC_LOCALE_ */
4002
4003     if (PL_warn_locale) {
4004         Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
4005                              SvPVX(PL_warn_locale),
4006                              0 /* dummy to avoid compiler warning */ );
4007         SvREFCNT_dec_NN(PL_warn_locale);
4008         PL_warn_locale = NULL;
4009     }
4010 }
4011
4012 #  endif /* USE_LOCALE_CTYPE */
4013
4014 STATIC void
4015 S_new_LC_ALL(pTHX_ const char *lc_all, bool force)
4016 {
4017     PERL_ARGS_ASSERT_NEW_LC_ALL;
4018
4019     /* new_LC_ALL() updates all the things we care about.  Note that this is
4020      * called just after a change, so uses the actual underlying locale just
4021      * set, and not the nominal one (should they differ, as they may in
4022      * LC_NUMERIC). */
4023
4024     const char * individ_locales[LC_ALL_INDEX_] = { NULL };
4025
4026     switch (parse_LC_ALL_string(lc_all,
4027                                 individ_locales,
4028                                 override_if_ignored,   /* Override any ignored
4029                                                           categories */
4030                                 true,   /* Always fill array */
4031                                 true,   /* Panic if fails, as to get here it
4032                                            earlier had to have succeeded */
4033                                 __LINE__))
4034     {
4035       case invalid:
4036       case no_array:
4037       case only_element_0:
4038         locale_panic_("Unexpected return from parse_LC_ALL_string");
4039
4040       case full_array:
4041         break;
4042     }
4043
4044     for_all_individual_category_indexes(i) {
4045         if (update_functions[i]) {
4046             const char * this_locale = individ_locales[i];
4047             update_functions[i](aTHX_ this_locale, force);
4048         }
4049
4050         Safefree(individ_locales[i]);
4051     }
4052 }
4053
4054 #  ifdef USE_LOCALE_COLLATE
4055
4056 STATIC void
4057 S_new_collate(pTHX_ const char *newcoll, bool force)
4058 {
4059     PERL_ARGS_ASSERT_NEW_COLLATE;
4060     PERL_UNUSED_ARG(force);
4061
4062     /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
4063      * core Perl this and that 'newcoll' is the name of the new locale.
4064      *
4065      * The design of locale collation is that every locale change is given an
4066      * index 'PL_collation_ix'.  The first time a string participates in an
4067      * operation that requires collation while locale collation is active, it
4068      * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()).  That
4069      * magic includes the collation index, and the transformation of the string
4070      * by strxfrm(), q.v.  That transformation is used when doing comparisons,
4071      * instead of the string itself.  If a string changes, the magic is
4072      * cleared.  The next time the locale changes, the index is incremented,
4073      * and so we know during a comparison that the transformation is not
4074      * necessarily still valid, and so is recomputed.  Note that if the locale
4075      * changes enough times, the index could wrap, and it is possible that a
4076      * transformation would improperly be considered valid, leading to an
4077      * unlikely bug.  The value is declared to the widest possible type on this
4078      * platform. */
4079
4080     /* Return if the locale isn't changing */
4081     if (strEQ(PL_collation_name, newcoll)) {
4082         return;
4083     }
4084
4085     Safefree(PL_collation_name);
4086     PL_collation_name = savepv(newcoll);
4087     ++PL_collation_ix;
4088
4089     /* Set the new one up if trivial.  Since this is called at process
4090      * initialization, be aware that this bit can't rely on much being
4091      * available. */
4092     PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
4093     if (PL_collation_standard) {
4094         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4095                                "Setting PL_collation name='%s'\n",
4096                                PL_collation_name));
4097         PL_collxfrm_base = 0;
4098         PL_collxfrm_mult = 2;
4099         PL_in_utf8_COLLATE_locale = FALSE;
4100         PL_strxfrm_NUL_replacement = '\0';
4101         PL_strxfrm_max_cp = 0;
4102         return;
4103     }
4104
4105     /* Flag that the remainder of the set up is being deferred until first
4106      * need. */
4107     PL_collxfrm_mult = 0;
4108     PL_collxfrm_base = 0;
4109
4110 }
4111
4112 #  endif /* USE_LOCALE_COLLATE */
4113
4114 #  ifdef WIN32
4115
4116 STATIC wchar_t *
4117 S_Win_byte_string_to_wstring(const UINT code_page, const char * byte_string)
4118 {
4119     /* Caller must arrange to free the returned string */
4120
4121     int req_size = MultiByteToWideChar(code_page, 0, byte_string, -1, NULL, 0);
4122     if (! req_size) {
4123         SET_EINVAL;
4124         return NULL;
4125     }
4126
4127     wchar_t *wstring;
4128     Newx(wstring, req_size, wchar_t);
4129
4130     if (! MultiByteToWideChar(code_page, 0, byte_string, -1, wstring, req_size))
4131     {
4132         Safefree(wstring);
4133         SET_EINVAL;
4134         return NULL;
4135     }
4136
4137     return wstring;
4138 }
4139
4140 #    define Win_utf8_string_to_wstring(s)                                   \
4141                                     Win_byte_string_to_wstring(CP_UTF8, (s))
4142
4143 STATIC char *
4144 S_Win_wstring_to_byte_string(const UINT code_page, const wchar_t * wstring)
4145 {
4146     /* Caller must arrange to free the returned string */
4147
4148     int req_size =
4149             WideCharToMultiByte(code_page, 0, wstring, -1, NULL, 0, NULL, NULL);
4150
4151     char *byte_string;
4152     Newx(byte_string, req_size, char);
4153
4154     if (! WideCharToMultiByte(code_page, 0, wstring, -1, byte_string,
4155                                                          req_size, NULL, NULL))
4156     {
4157         Safefree(byte_string);
4158         SET_EINVAL;
4159         return NULL;
4160     }
4161
4162     return byte_string;
4163 }
4164
4165 #    define Win_wstring_to_utf8_string(ws)                                  \
4166                                    Win_wstring_to_byte_string(CP_UTF8, (ws))
4167
4168 STATIC const char *
4169 S_wrap_wsetlocale(pTHX_ const int category, const char *locale)
4170 {
4171     PERL_ARGS_ASSERT_WRAP_WSETLOCALE;
4172
4173     /* Calls _wsetlocale(), converting the parameters/return to/from
4174      * Perl-expected forms as if plain setlocale() were being called instead.
4175      *
4176      * Caller must arrange for the returned PV to be freed.
4177      */
4178
4179     const wchar_t * wlocale = NULL;
4180
4181     if (locale) {
4182         wlocale = Win_utf8_string_to_wstring(locale);
4183         if (! wlocale) {
4184             return NULL;
4185         }
4186     }
4187
4188     WSETLOCALE_LOCK;
4189     const wchar_t * wresult = _wsetlocale(category, wlocale);
4190
4191     if (! wresult) {
4192         WSETLOCALE_UNLOCK;
4193         Safefree(wlocale);
4194         return NULL;
4195     }
4196
4197     const char * result = Win_wstring_to_utf8_string(wresult);
4198     WSETLOCALE_UNLOCK;
4199
4200     Safefree(wlocale);
4201     return result;
4202 }
4203
4204 STATIC const char *
4205 S_win32_setlocale(pTHX_ int category, const char* locale)
4206 {
4207     /* This, for Windows, emulates POSIX setlocale() behavior.  There is no
4208      * difference between the two unless the input locale is "", which normally
4209      * means on Windows to get the machine default, which is set via the
4210      * computer's "Regional and Language Options" (or its current equivalent).
4211      * In POSIX, it instead means to find the locale from the user's
4212      * environment.  This routine changes the Windows behavior to try the POSIX
4213      * behavior first.  Further details are in the called function
4214      * find_locale_from_environment().
4215      */
4216
4217     if (locale != NULL && strEQ(locale, "")) {
4218         /* Note this function may change the locale, but that's ok because we
4219          * are about to change it anyway */
4220         locale = find_locale_from_environment(get_category_index(category));
4221         if (locale == NULL) {
4222             SET_EINVAL;
4223             return NULL;
4224         }
4225     }
4226
4227     const char * result = wrap_wsetlocale(category, locale);
4228     DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
4229                           setlocale_debug_string_r(category, locale, result)));
4230
4231     if (! result) {
4232         SET_EINVAL;
4233         return NULL;
4234     }
4235
4236     save_to_buffer(result, &PL_setlocale_buf, &PL_setlocale_bufsize);
4237
4238 #    ifndef USE_PL_CUR_LC_ALL
4239
4240     Safefree(result);
4241
4242 #  else
4243
4244     /* Here, we need to keep track of LC_ALL, so store the new value.  but if
4245      * the input locale is NULL, we were just querying, so the original value
4246      * hasn't changed */
4247     if (locale == NULL) {
4248         Safefree(result);
4249     }
4250     else {
4251
4252         /* If we set LC_ALL directly above, we already know its new value; but
4253          * if we changed just an individual category, find the new LC_ALL */
4254         if (category != LC_ALL) {
4255             Safefree(result);
4256             result = wrap_wsetlocale(LC_ALL, NULL);
4257         }
4258
4259         Safefree(PL_cur_LC_ALL);
4260         PL_cur_LC_ALL = result;
4261     }
4262
4263     DEBUG_L(PerlIO_printf(Perl_debug_log, "new PL_cur_LC_ALL=%s\n",
4264                                           PL_cur_LC_ALL));
4265 #    endif
4266
4267     return PL_setlocale_buf;
4268 }
4269
4270 #  endif
4271
4272 STATIC const char *
4273 S_native_querylocale_i(pTHX_ const locale_category_index cat_index)
4274 {
4275     /* Determine the current locale and return it in the form the platform's
4276      * native locale handling understands.  This is different only from our
4277      * internal form for the LC_ALL category, as platforms differ in how they
4278      * represent that.
4279      *
4280      * This is only called from Perl_setlocale().  As such it returns in
4281      * PL_setlocale_buf */
4282
4283 #  ifdef USE_LOCALE_NUMERIC
4284
4285     /* We have the LC_NUMERIC name saved, because we are normally switched into
4286      * the C locale (or equivalent) for it. */
4287     if (cat_index == LC_NUMERIC_INDEX_) {
4288
4289         /* We don't have to copy this return value, as it is a per-thread
4290          * variable, and won't change until a future setlocale */
4291         return PL_numeric_name;
4292     }
4293
4294 #  endif
4295 #  ifdef LC_ALL
4296
4297     if (cat_index != LC_ALL_INDEX_)
4298
4299 #  endif
4300
4301     {
4302         /* Here, not LC_ALL, and not LC_NUMERIC: the actual and native values
4303          * match */
4304
4305 #  ifdef setlocale_i    /* Can shortcut if this is defined */
4306
4307         return setlocale_i(cat_index, NULL);
4308
4309 #  else
4310
4311         return save_to_buffer(querylocale_i(cat_index),
4312                               &PL_setlocale_buf, &PL_setlocale_bufsize);
4313 #  endif
4314
4315     }
4316
4317     /* Below, querying LC_ALL */
4318
4319 #  ifdef LC_ALL
4320 #    ifdef USE_PL_CURLOCALES
4321 #      define LC_ALL_ARG  PL_curlocales
4322 #    else
4323 #      define LC_ALL_ARG  NULL  /* Causes calculate_LC_ALL_string() to find the
4324                                    locale using a querylocale function */
4325 #    endif
4326
4327     return calculate_LC_ALL_string(LC_ALL_ARG, EXTERNAL_FORMAT_FOR_QUERY,
4328                                    WANT_PL_setlocale_buf,
4329                                    __LINE__);
4330 #    undef LC_ALL_ARG
4331 #  endif    /* has LC_ALL */
4332
4333 }
4334
4335 #endif      /* USE_LOCALE */
4336
4337 /*
4338 =for apidoc Perl_setlocale
4339
4340 This is an (almost) drop-in replacement for the system L<C<setlocale(3)>>,
4341 taking the same parameters, and returning the same information, except that it
4342 returns the correct underlying C<LC_NUMERIC> locale.  Regular C<setlocale> will
4343 instead return C<C> if the underlying locale has a non-dot decimal point
4344 character, or a non-empty thousands separator for displaying floating point
4345 numbers.  This is because perl keeps that locale category such that it has a
4346 dot and empty separator, changing the locale briefly during the operations
4347 where the underlying one is required. C<Perl_setlocale> knows about this, and
4348 compensates; regular C<setlocale> doesn't.
4349
4350 Another reason it isn't completely a drop-in replacement is that it is
4351 declared to return S<C<const char *>>, whereas the system setlocale omits the
4352 C<const> (presumably because its API was specified long ago, and can't be
4353 updated; it is illegal to change the information C<setlocale> returns; doing
4354 so leads to segfaults.)
4355
4356 Finally, C<Perl_setlocale> works under all circumstances, whereas plain
4357 C<setlocale> can be completely ineffective on some platforms under some
4358 configurations.
4359
4360 Changing the locale is not a good idea when more than one thread is running,
4361 except on systems where the predefined variable C<${^SAFE_LOCALES}> is
4362 non-zero.  This is because on such systems the locale is global to the whole
4363 process and not local to just the thread calling the function.  So changing it
4364 in one thread instantaneously changes it in all.  On some such systems, the
4365 system C<setlocale()> is ineffective, returning the wrong information, and
4366 failing to actually change the locale.  z/OS refuses to try to change the
4367 locale once a second thread is created.  C<Perl_setlocale>, should give you
4368 accurate results of what actually happened on these problematic platforms,
4369 returning NULL if the system forbade the locale change.
4370
4371 The return points to a per-thread static buffer, which is overwritten the next
4372 time C<Perl_setlocale> is called from the same thread.
4373
4374 =cut
4375
4376 */
4377
4378 const char *
4379 Perl_setlocale(const int category, const char * locale)
4380 {
4381     /* This wraps POSIX::setlocale() */
4382
4383 #ifndef USE_LOCALE
4384
4385     PERL_UNUSED_ARG(category);
4386     PERL_UNUSED_ARG(locale);
4387
4388     return "C";
4389
4390 #else
4391
4392     dTHX;
4393
4394     DEBUG_L(PerlIO_printf(Perl_debug_log,
4395                           "Entering Perl_setlocale(%d, \"%s\")\n",
4396                           category, locale));
4397
4398     bool valid_category;
4399     locale_category_index cat_index = get_category_index_helper(category,
4400                                                                 &valid_category,
4401                                                                 __LINE__);
4402     if (! valid_category) {
4403         if (ckWARN(WARN_LOCALE)) {
4404             const char * conditional_warn_text;
4405             if (locale == NULL) {
4406                 conditional_warn_text = "";
4407                 locale = "";
4408             }
4409             else {
4410                 conditional_warn_text = "; can't set it to ";
4411             }
4412
4413             /* diag_listed_as: Unknown locale category %d; can't set it to %s */
4414             Perl_warner(aTHX_
4415                            packWARN(WARN_LOCALE),
4416                            "Unknown locale category %d%s%s",
4417                            category, conditional_warn_text, locale);
4418         }
4419
4420         SET_EINVAL;
4421         return NULL;
4422     }
4423
4424 #  ifdef setlocale_i
4425
4426     /* setlocale_i() gets defined only on Configurations that use setlocale()
4427      * in a simple manner that adequately handles all cases.  If this category
4428      * doesn't have any perl complications, just do that. */
4429     if (! update_functions[cat_index]) {
4430         return setlocale_i(cat_index, locale);
4431     }
4432
4433 #  endif
4434
4435     /* Get current locale */
4436     const char * current_locale = native_querylocale_i(cat_index);
4437
4438     /* A NULL locale means only query what the current one is. */
4439     if (locale == NULL) {
4440         return current_locale;
4441     }
4442
4443     if (strEQ(current_locale, locale)) {
4444         DEBUG_L(PerlIO_printf(Perl_debug_log,
4445                              "Already in requested locale: no action taken\n"));
4446         return current_locale;
4447     }
4448
4449     /* Here, an actual change is being requested.  Do it */
4450     if (! bool_setlocale_i(cat_index, locale)) {
4451         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
4452                           setlocale_debug_string_i(cat_index, locale, "NULL")));
4453         return NULL;
4454     }
4455
4456     /* At this point, the locale has been changed based on the requested value,
4457      * and the querylocale_i() will return the actual new value that the system
4458      * has for the category.  That may not be the same as the input, as libc
4459      * may have returned a synonymous locale name instead of the input one; or,
4460      * if there are locale categories that we are compiled to ignore, any
4461      * attempt to change them away from "C" is overruled */
4462     current_locale = querylocale_i(cat_index);
4463
4464     /* But certain categories need further work.  For example we may need to
4465      * calculate new folding or collation rules.  And for LC_NUMERIC, we have
4466      * to switch into a locale that has a dot radix. */
4467     if (update_functions[cat_index]) {
4468         update_functions[cat_index](aTHX_ current_locale,
4469                                           /* No need to force recalculation, as
4470                                            * aren't coming from a situation
4471                                            * where Perl hasn't been controlling
4472                                            * the locale, so has accurate
4473                                            * records. */
4474                                           false);
4475     }
4476
4477     /* Make sure the result is in a stable buffer for the caller's use, and is
4478      * in the expected format */
4479     current_locale = native_querylocale_i(cat_index);
4480
4481     DEBUG_L(PerlIO_printf(Perl_debug_log, "returning '%s'\n", current_locale));
4482
4483     return current_locale;
4484
4485 #endif
4486
4487 }
4488
4489 #ifdef USE_LOCALE
4490
4491 STATIC const char *
4492 S_toggle_locale_i(pTHX_ const locale_category_index cat_index,
4493                         const char * new_locale,
4494                         const line_t caller_line)
4495 {
4496     PERL_ARGS_ASSERT_TOGGLE_LOCALE_I;
4497     assert(cat_index <= LC_ALL_INDEX_);
4498
4499     /* Changes the locale for the category specified by 'index' to 'new_locale,
4500      * if they aren't already the same.
4501      *
4502      * Returns a copy of the name of the original locale for 'cat_index'
4503      * so can be switched back to with the companion function
4504      * restore_toggled_locale_i(),  (NULL if no restoral is necessary.) */
4505
4506     /* Find the original locale of the category we may need to change, so that
4507      * it can be restored to later */
4508     const char * locale_to_restore_to = querylocale_i(cat_index);
4509
4510     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4511                            "Entering toggle_locale_i: index=%d(%s),"        \
4512                            " wanted=%s, actual=%s; called from %" LINE_Tf   \
4513                            "\n", cat_index, category_names[cat_index],
4514                            new_locale, locale_to_restore_to, caller_line));
4515
4516     if (! locale_to_restore_to) {
4517         locale_panic_via_(Perl_form(aTHX_
4518                                     "Could not find current %s locale",
4519                                     category_names[cat_index]),
4520                          __FILE__, caller_line);
4521     }
4522
4523     /* If the locales are the same, there's nothing to do */
4524     if (strEQ(locale_to_restore_to, new_locale)) {
4525         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n",
4526                                                category_names[cat_index],
4527                                                new_locale));
4528
4529         return NULL;
4530     }
4531
4532     /* Finally, change the locale to the new one */
4533     void_setlocale_i_with_caller(cat_index, new_locale, __FILE__, caller_line);
4534
4535     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4536                            "%s locale switched to %s\n",
4537                            category_names[cat_index], new_locale));
4538
4539     return locale_to_restore_to;
4540
4541 #  ifndef DEBUGGING
4542     PERL_UNUSED_ARG(caller_line);
4543 #  endif
4544
4545 }
4546
4547 STATIC void
4548 S_restore_toggled_locale_i(pTHX_ const locale_category_index cat_index,
4549                                  const char * restore_locale,
4550                                  const line_t caller_line)
4551 {
4552     /* Restores the locale for LC_category corresponding to cat_index to
4553      * 'restore_locale' (which is a copy that will be freed by this function),
4554      * or do nothing if the latter parameter is NULL */
4555
4556     PERL_ARGS_ASSERT_RESTORE_TOGGLED_LOCALE_I;
4557     assert(cat_index <= LC_ALL_INDEX_);
4558
4559     if (restore_locale == NULL) {
4560         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4561                                "restore_toggled_locale_i: No need to"       \
4562                                " restore %s; called from %" LINE_Tf "\n",   \
4563                                category_names[cat_index], caller_line));
4564         return;
4565     }
4566
4567     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4568                            "restore_toggled_locale_i: restoring locale for" \
4569                            " %s to  %s; called from %" LINE_Tf "\n",        \
4570                            category_names[cat_index], restore_locale,
4571                            caller_line));
4572
4573     void_setlocale_i_with_caller(cat_index, restore_locale,
4574                                   __FILE__, caller_line);
4575
4576 #  ifndef DEBUGGING
4577     PERL_UNUSED_ARG(caller_line);
4578 #  endif
4579
4580 }
4581
4582 #endif
4583 #if defined(USE_LOCALE) || defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV)
4584
4585 STATIC utf8ness_t
4586 S_get_locale_string_utf8ness_i(pTHX_ const char * string,
4587                                      const locale_utf8ness_t known_utf8,
4588                                      const char * locale,
4589                                      const locale_category_index cat_index)
4590 {
4591     PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I;
4592
4593 #  ifndef USE_LOCALE
4594
4595     return UTF8NESS_NO;
4596     PERL_UNUSED_ARG(string);
4597     PERL_UNUSED_ARG(known_utf8);
4598     PERL_UNUSED_ARG(locale);
4599     PERL_UNUSED_ARG(cat_index);
4600
4601 #  else
4602
4603     assert(cat_index <= LC_ALL_INDEX_);
4604
4605     /* Return to indicate if 'string' in the locale given by the input
4606      * arguments should be considered UTF-8 or not.
4607      *
4608      * If the input 'locale' is not NULL, use that for the locale; otherwise
4609      * use the current locale for the category specified by 'cat_index'.
4610      */
4611
4612     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4613                            "Entering get_locale_string_utf8ness_i; locale=%s,"
4614                            " index=%u(%s), string=%s, known_utf8=%d\n",
4615                            locale, cat_index, category_names[cat_index],
4616                            ((string)
4617                             ?  _byte_dump_string((U8 *) string,
4618                                                  strlen(string),
4619                                                  0)
4620                             : "nil"),
4621                            known_utf8));
4622     if (string == NULL) {
4623         return UTF8NESS_IMMATERIAL;
4624     }
4625
4626     if (IN_BYTES) { /* respect 'use bytes' */
4627         return UTF8NESS_NO;
4628     }
4629
4630     Size_t len = strlen(string);
4631
4632     /* UTF8ness is immaterial if the representation doesn't vary */
4633     const U8 * first_variant = NULL;
4634     if (is_utf8_invariant_string_loc((U8 *) string, len, &first_variant)) {
4635         return UTF8NESS_IMMATERIAL;
4636     }
4637
4638     /* Can't be UTF-8 if invalid */
4639     if (! is_utf8_string((U8 *) first_variant,
4640                          len - ((char *) first_variant - string)))
4641     {
4642         return UTF8NESS_NO;
4643     }
4644
4645     /* Here and below, we know the string is legal UTF-8, containing at least
4646      * one character requiring a sequence of two or more bytes.  It is quite
4647      * likely to be UTF-8.  But it pays to be paranoid and do further checking.
4648      *
4649      * If we already know the UTF-8ness of the locale, then we immediately know
4650      * what the string is */
4651     if (UNLIKELY(known_utf8 != LOCALE_UTF8NESS_UNKNOWN)) {
4652         return (known_utf8 == LOCALE_IS_UTF8) ? UTF8NESS_YES : UTF8NESS_NO;
4653     }
4654
4655     if (locale == NULL) {
4656         locale = querylocale_i(cat_index);
4657     }
4658
4659     /* If the locale is UTF-8, the string is UTF-8;  otherwise it was
4660      * coincidental that the string is legal UTF-8
4661      *
4662      * However, if the perl is compiled to not pay attention to the category
4663      * being passed in, you might think that that locale is essentially always
4664      * the C locale, so it would make sense to say it isn't UTF-8.  But to get
4665      * here, the string has to contain characters unknown in the C locale.  And
4666      * in fact, Windows boxes are compiled without LC_MESSAGES, as their
4667      * message catalog isn't really a part of the locale system.  But those
4668      * messages really could be UTF-8, and given that the odds are rather small
4669      * of something not being UTF-8 but being syntactically valid UTF-8, khw
4670      * has decided to call such strings as UTF-8. */
4671     return (is_locale_utf8(locale)) ? UTF8NESS_YES : UTF8NESS_NO;
4672
4673 #  endif
4674
4675 }
4676
4677 STATIC bool
4678 S_is_locale_utf8(pTHX_ const char * locale)
4679 {
4680     PERL_ARGS_ASSERT_IS_LOCALE_UTF8;
4681
4682     /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise. */
4683
4684 #  if ! defined(USE_LOCALE)                                                   \
4685    || ! defined(USE_LOCALE_CTYPE)                                             \
4686    ||   defined(EBCDIC) /* There aren't any real UTF-8 locales at this time */
4687
4688     PERL_UNUSED_ARG(locale);
4689
4690     return FALSE;
4691
4692      /* Definitively, can't be UTF-8 */
4693 #    define HAS_DEFINITIVE_UTF8NESS_DETERMINATION
4694 #  else
4695
4696     /* If the input happens to be the same locale as we are currently setup
4697      * for, the answer has already been cached. */
4698     if (strEQ(locale, PL_ctype_name)) {
4699         return PL_in_utf8_CTYPE_locale;
4700     }
4701
4702     if (isNAME_C_OR_POSIX(locale)) {
4703         return false;
4704     }
4705
4706 #    if ! defined(HAS_SOME_LANGINFO) && ! defined(WIN32)
4707
4708     /* On non-Windows without nl_langinfo(), we have to do some digging to get
4709      * the answer.  First, toggle to the desired locale so can query its state
4710      * */
4711     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
4712
4713 #      define TEARDOWN_FOR_IS_LOCALE_UTF8                                   \
4714                       restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale)
4715
4716 #      ifdef MB_CUR_MAX
4717
4718     /* If there are fewer bytes available in this locale than are required
4719      * to represent the largest legal UTF-8 code point, this isn't a UTF-8
4720      * locale. */
4721     const int mb_cur_max = MB_CUR_MAX;
4722     if (mb_cur_max < (int) UNISKIP(PERL_UNICODE_MAX)) {
4723         TEARDOWN_FOR_IS_LOCALE_UTF8;
4724         return false;
4725     }
4726
4727 #      endif
4728 #      if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4729
4730          /* With these functions, we can definitively determine a locale's
4731           * UTF-8ness */
4732 #        define HAS_DEFINITIVE_UTF8NESS_DETERMINATION
4733
4734     /* If libc mbtowc() evaluates the bytes that form the REPLACEMENT CHARACTER
4735      * as that Unicode code point, this has to be a UTF-8 locale; otherwise it
4736      * can't be  */
4737     wchar_t wc = 0;
4738     (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0);/* Reset shift state */
4739     int mbtowc_ret = Perl_mbtowc_(aTHX_ &wc,
4740                                   STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
4741     TEARDOWN_FOR_IS_LOCALE_UTF8;
4742     return (   mbtowc_ret == STRLENs(REPLACEMENT_CHARACTER_UTF8)
4743             && wc == UNICODE_REPLACEMENT);
4744
4745 #      else
4746
4747         /* If the above two C99 functions aren't working, you could try some
4748          * different methods.  It seems likely that the obvious choices,
4749          * wctomb() and wcrtomb(), wouldn't be working either.  But you could
4750          * choose one of the dozen-ish Unicode titlecase triples and verify
4751          * that towupper/towlower work as expected.
4752          *
4753          * But, our emulation of nl_langinfo() works quite well, so avoid the
4754          * extra code until forced to by some weird non-conforming platform. */
4755 #        define USE_LANGINFO_FOR_UTF8NESS
4756 #        undef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
4757 #      endif
4758 #    else
4759
4760      /* On Windows or on platforms with nl_langinfo(), there is a direct way to
4761       * get the locale's codeset, which will be some form of 'UTF-8' for a
4762       * UTF-8 locale.  my_langinfo_i() handles this, and we will call that
4763       * below */
4764 #      define HAS_DEFINITIVE_UTF8NESS_DETERMINATION
4765 #      define USE_LANGINFO_FOR_UTF8NESS
4766 #      define TEARDOWN_FOR_IS_LOCALE_UTF8
4767 #    endif  /* USE_LANGINFO_FOR_UTF8NESS */
4768
4769      /* If the above compiled into code, it found the locale's UTF-8ness,
4770       * nothing more to do; if it didn't get compiled,
4771       * USE_LANGINFO_FOR_UTF8NESS is defined.  There are two possible reasons:
4772       *   1)  it is the preferred method because it knows directly for sure
4773       *       what the codeset is because the platform has libc functions that
4774       *       return this; or
4775       *   2)  the functions the above code section would compile to use don't
4776       *       exist or are unreliable on this platform; we are less sure of the
4777       *       my_langinfo() result, though it is very unlikely to be wrong
4778       *       about if it is UTF-8 or not */
4779 #    ifdef USE_LANGINFO_FOR_UTF8NESS
4780
4781     char * scratch_buffer = NULL;
4782     const char * codeset = my_langinfo_c(CODESET, LC_CTYPE, locale,
4783                                          &scratch_buffer, NULL, NULL);
4784     bool retval = is_codeset_name_UTF8(codeset);
4785
4786     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4787                            "found codeset=%s, is_utf8=%d\n", codeset, retval));
4788
4789     Safefree(scratch_buffer);
4790
4791     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "is_locale_utf8(%s) returning %d\n",
4792                                                             locale, retval));
4793     TEARDOWN_FOR_IS_LOCALE_UTF8;
4794     return retval;
4795
4796 #    endif
4797 #  endif      /* End of the #else clause, for the non-trivial case */
4798
4799 }
4800
4801 #endif
4802
4803 STATIC void
4804 S_set_save_buffer_min_size(pTHX_ Size_t min_len,
4805                                  char **buf,
4806                                  Size_t * buf_cursize)
4807 {
4808     /* Make sure the buffer pointed to by *buf is at least as large 'min_len';
4809      * *buf_cursize is the size of 'buf' upon entry; it will be updated to the
4810      * new size on exit.  'buf_cursize' being NULL is to be used when this is a
4811      * single use buffer, which will shortly be freed by the caller. */
4812
4813     if (buf_cursize == NULL) {
4814         Newx(*buf, min_len, char);
4815     }
4816     else if (*buf_cursize == 0) {
4817         Newx(*buf, min_len, char);
4818         *buf_cursize = min_len;
4819     }
4820     else if (min_len > *buf_cursize) {
4821         Renew(*buf, min_len, char);
4822         *buf_cursize = min_len;
4823     }
4824 }
4825
4826 STATIC const char *
4827 S_save_to_buffer(pTHX_ const char * string, char **buf, Size_t *buf_size)
4828 {
4829     PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
4830
4831     /* Copy the NUL-terminated 'string' to a buffer whose address before this
4832      * call began at *buf, and whose available length before this call was
4833      * *buf_size.
4834      *
4835      * If the length of 'string' is greater than the space available, the
4836      * buffer is grown accordingly, which may mean that it gets relocated.
4837      * *buf and *buf_size will be updated to reflect this.
4838      *
4839      * Regardless, the function returns a pointer to where 'string' is now
4840      * stored.
4841      *
4842      * 'string' may be NULL, which means no action gets taken, and NULL is
4843      * returned.
4844      *
4845      * 'buf_size' being NULL is to be used when this is a single use buffer,
4846      * which will shortly be freed by the caller.
4847      *
4848      * If *buf or 'buf_size' are NULL or *buf_size is 0, the buffer is assumed
4849      * empty, and memory is malloc'd.
4850      */
4851
4852     if (! string) {
4853         return NULL;
4854     }
4855
4856     /* No-op to copy over oneself */
4857     if (string == *buf) {
4858         return string;
4859     }
4860
4861     Size_t string_size = strlen(string) + 1;
4862     set_save_buffer_min_size(string_size, buf, buf_size);
4863
4864 #  ifdef DEBUGGING
4865
4866     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4867                          "Copying '%s' to %p\n",
4868                          ((is_utf8_string((U8 *) string, 0))
4869                           ? string
4870                           :_byte_dump_string((U8 *) string, strlen(string), 0)),
4871                           *buf));
4872
4873 #    ifdef USE_LOCALE_CTYPE
4874
4875     /* Catch glitches.  Usually this is because LC_CTYPE needs to be the same
4876      * locale as whatever is being worked on */
4877     if (UNLIKELY(instr(string, REPLACEMENT_CHARACTER_UTF8))) {
4878         locale_panic_(Perl_form(aTHX_
4879                                 "Unexpected REPLACEMENT_CHARACTER in '%s'\n%s",
4880                                 string, get_LC_ALL_display()));
4881     }
4882
4883 #    endif
4884 #  endif
4885
4886     Copy(string, *buf, string_size, char);
4887     return *buf;
4888 }
4889
4890 #ifdef USE_LOCALE
4891 #  ifdef WIN32
4892
4893 bool
4894 Perl_get_win32_message_utf8ness(pTHX_ const char * string)
4895 {
4896     /* This is because Windows doesn't have LC_MESSAGES. */
4897
4898 #    ifdef USE_LOCALE_CTYPE
4899
4900     /* We don't know the locale utf8ness here, and not even the locale itself.
4901      * Since Windows uses a different mechanism to specify message language
4902      * output than the locale system, it is going to be problematic deciding
4903      * if we are to store it as UTF-8 or not.  By specifying LOCALE_IS_UTF8, we
4904      * are telling the called function to return true iff the string has
4905      * non-ASCII characters in it that are all syntactically UTF-8.  We are
4906      * thus relying on the fact that a string that is syntactically valid UTF-8
4907      * is likely to be UTF-8.  Should this ever cause problems, this function
4908      * could be replaced by something more Windows-specific */
4909     return get_locale_string_utf8ness_i(string, LOCALE_IS_UTF8,
4910                                         NULL, LC_CTYPE_INDEX_);
4911 #    else
4912
4913     PERL_UNUSED_ARG(string);
4914     return false;
4915
4916 #    endif
4917
4918 }
4919
4920 #  endif
4921 #endif  /* USE_LOCALE */
4922
4923 int
4924 Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len)
4925 {
4926
4927 #if ! defined(HAS_MBRTOWC) && ! defined(HAS_MBTOWC)
4928
4929     PERL_UNUSED_ARG(pwc);
4930     PERL_UNUSED_ARG(s);
4931     PERL_UNUSED_ARG(len);
4932     return -1;
4933
4934 #else   /* Below we have some form of mbtowc() */
4935 #  if defined(HAS_MBRTOWC)                                      \
4936    && (defined(USE_LOCALE_THREADS) || ! defined(HAS_MBTOWC))
4937 #    define USE_MBRTOWC
4938 #  else
4939 #    undef USE_MBRTOWC
4940 #  endif
4941
4942     int retval = -1;
4943
4944     if (s == NULL) { /* Initialize the shift state to all zeros in
4945                         PL_mbrtowc_ps. */
4946
4947 #  if defined(USE_MBRTOWC)
4948
4949         memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
4950         return 0;
4951
4952 #  else
4953
4954         SETERRNO(0, 0);
4955         MBTOWC_LOCK_;
4956         retval = mbtowc(NULL, NULL, 0);
4957         MBTOWC_UNLOCK_;
4958         return retval;
4959
4960 #  endif
4961
4962     }
4963
4964 #  if defined(USE_MBRTOWC)
4965
4966     SETERRNO(0, 0);
4967     MBRTOWC_LOCK_;
4968     retval = (SSize_t) mbrtowc((wchar_t *) pwc, s, len, &PL_mbrtowc_ps);
4969     MBRTOWC_UNLOCK_;
4970
4971 #  else
4972
4973     /* Locking prevents races, but locales can be switched out without locking,
4974      * so this isn't a cure all */
4975     SETERRNO(0, 0);
4976     MBTOWC_LOCK_;
4977     retval = mbtowc((wchar_t *) pwc, s, len);
4978     MBTOWC_UNLOCK_;
4979
4980 #  endif
4981
4982     return retval;
4983
4984 #endif
4985
4986 }
4987
4988 /*
4989 =for apidoc Perl_localeconv
4990
4991 This is a thread-safe version of the libc L<localeconv(3)>.  It is the same as
4992 L<POSIX::localeconv|POSIX/localeconv> (returning a hash of the C<localeconv()>
4993 fields), but directly callable from XS code.
4994
4995 =cut
4996 */
4997
4998 HV *
4999 Perl_localeconv(pTHX)
5000 {
5001
5002 #if  ! defined(HAS_LOCALECONV)
5003
5004     return newHV();
5005
5006 #else
5007
5008     return my_localeconv(0);
5009
5010 #endif
5011
5012 }
5013
5014 #if defined(HAS_LOCALECONV)
5015
5016 HV *
5017 S_my_localeconv(pTHX_ const int item)
5018 {
5019     PERL_ARGS_ASSERT_MY_LOCALECONV;
5020
5021     /* This returns a mortalized hash containing all or certain elements
5022      * returned by localeconv(). */
5023     HV * hv = newHV();      /* The returned hash, initially empty */
5024     sv_2mortal((SV*)hv);
5025
5026     /* The function is used by Perl_localeconv() and POSIX::localeconv(), or
5027      * internally from this file, and is thread-safe.
5028      *
5029      * localeconv() returns items from two different locale categories,
5030      * LC_MONETARY and LC_NUMERIC.  Various data structures in this function
5031      * are arrays with two elements, one for each category, and these indexes
5032      * indicate which array element applies to which category */
5033 #define NUMERIC_OFFSET   0
5034 #define MONETARY_OFFSET  1
5035
5036     /* Some operations apply to one or the other category, or both.  A mask
5037      * is used to specify all the possibilities.  This macro converts from the
5038      * category offset to its bit position in the mask. */
5039 #define OFFSET_TO_BIT(i)  (1 << (i))
5040
5041     /* There are two use cases for this function:
5042      * 1) Called as Perl_localeconv(), or from POSIX::locale_conv().  This
5043      *    returns the lconv structure copied to a hash, based on the current
5044      *    underlying locales for LC_NUMERIC and LC_MONETARY. An input item==0
5045      *    signifies this case, or on many platforms it is the only use case
5046      *    compiled.
5047      * 2) Certain items that nl_langinfo() provides are also derivable from
5048      *    the return of localeconv().  Windows notably doesn't have
5049      *    nl_langinfo(), so on that, and actually any platform lacking it,
5050      *    my_localeconv() is used also to emulate it for those particular
5051      *    items.  The code to do this is compiled only on such platforms.
5052      *    Rather than going to the expense of creating a full hash when only
5053      *    one item is needed, the returned hash has just the desired item in
5054      *    it.
5055      *
5056      * To access all the localeconv() struct lconv fields, there is a data
5057      * structure that contains every commonly documented field in it.  (Maybe
5058      * some minority platforms have extra fields.  Those could be added here
5059      * without harm; they would just be ignored on platforms lacking them.)
5060      *
5061      * Our structure is compiled to make looping through the fields easier by
5062      * pointing each name to its value's offset within lconv, e.g.,
5063         { "thousands_sep", STRUCT_OFFSET(struct lconv, thousands_sep) }
5064      */
5065 #  define LCONV_ENTRY(name)                                                 \
5066                        {STRINGIFY(name), STRUCT_OFFSET(struct lconv, name)}
5067
5068     /* These synonyms are just for clarity, and to make it easier in case
5069      * something needs to change in the future */
5070 #  define LCONV_NUMERIC_ENTRY(name)  LCONV_ENTRY(name)
5071 #  define LCONV_MONETARY_ENTRY(name) LCONV_ENTRY(name)
5072
5073     /* There are just a few fields for NUMERIC strings */
5074     const lconv_offset_t lconv_numeric_strings[] = {
5075 #  ifndef NO_LOCALECONV_GROUPING
5076         LCONV_NUMERIC_ENTRY(grouping),
5077 #   endif
5078         LCONV_NUMERIC_ENTRY(thousands_sep),
5079 #   define THOUSANDS_SEP_LITERAL  "thousands_sep"
5080         LCONV_NUMERIC_ENTRY(decimal_point),
5081 #   define DECIMAL_POINT_LITERAL "decimal_point"
5082         {NULL, 0}
5083     };
5084
5085     /* When used to implement nl_langinfo(), we save time by only populating
5086      * the hash with the field(s) needed.  Thus we would need a data structure
5087      * of just:
5088      *  LCONV_NUMERIC_ENTRY(decimal_point),
5089      *  {NULL, 0}
5090      *
5091      * By placing the decimal_point field last in the full structure, we can
5092      * use just the tail for this bit of it, saving space.  This macro yields
5093      * the address of the sub structure. */
5094 #  define DECIMAL_POINT_ADDRESS                                             \
5095         &lconv_numeric_strings[(C_ARRAY_LENGTH(lconv_numeric_strings) - 2)]
5096
5097     /* And the MONETARY string fields */
5098     const lconv_offset_t lconv_monetary_strings[] = {
5099         LCONV_MONETARY_ENTRY(int_curr_symbol),
5100         LCONV_MONETARY_ENTRY(mon_decimal_point),
5101 #  ifndef NO_LOCALECONV_MON_THOUSANDS_SEP
5102         LCONV_MONETARY_ENTRY(mon_thousands_sep),
5103 #  endif
5104 #  ifndef NO_LOCALECONV_MON_GROUPING
5105         LCONV_MONETARY_ENTRY(mon_grouping),
5106 #  endif
5107         LCONV_MONETARY_ENTRY(positive_sign),
5108         LCONV_MONETARY_ENTRY(negative_sign),
5109         LCONV_MONETARY_ENTRY(currency_symbol),
5110 #  define CURRENCY_SYMBOL_LITERAL  "currency_symbol"
5111         {NULL, 0}
5112     };
5113
5114     /* Like above, this field being last can be used as a sub structure */
5115 #  define CURRENCY_SYMBOL_ADDRESS                                            \
5116       &lconv_monetary_strings[(C_ARRAY_LENGTH(lconv_monetary_strings) - 2)]
5117
5118     /* Finally there are integer fields, all are for monetary purposes */
5119     const lconv_offset_t lconv_integers[] = {
5120         LCONV_ENTRY(int_frac_digits),
5121         LCONV_ENTRY(frac_digits),
5122         LCONV_ENTRY(p_sep_by_space),
5123         LCONV_ENTRY(n_cs_precedes),
5124         LCONV_ENTRY(n_sep_by_space),
5125         LCONV_ENTRY(p_sign_posn),
5126         LCONV_ENTRY(n_sign_posn),
5127 #  ifdef HAS_LC_MONETARY_2008
5128         LCONV_ENTRY(int_p_cs_precedes),
5129         LCONV_ENTRY(int_p_sep_by_space),
5130         LCONV_ENTRY(int_n_cs_precedes),
5131         LCONV_ENTRY(int_n_sep_by_space),
5132         LCONV_ENTRY(int_p_sign_posn),
5133         LCONV_ENTRY(int_n_sign_posn),
5134 #  endif
5135 #      define P_CS_PRECEDES_LITERAL    "p_cs_precedes"
5136         LCONV_ENTRY(p_cs_precedes),
5137         {NULL, 0}
5138     };
5139
5140     /* Like above, this field being last can be used as a sub structure */
5141 #  define P_CS_PRECEDES_ADDRESS                                       \
5142       &lconv_integers[(C_ARRAY_LENGTH(lconv_integers) - 2)]
5143
5144     /* The actual populating of the hash is done by two sub functions that get
5145      * passed an array of length two containing the data structure they are
5146      * supposed to use to get the key names to fill the hash with.  One element
5147      * is always for the NUMERIC strings (or NULL if none to use), and the
5148      * other element similarly for the MONETARY ones. */
5149     const lconv_offset_t * strings[2] = { lconv_numeric_strings,
5150                                           lconv_monetary_strings
5151                                         };
5152
5153     /* The LC_MONETARY category also has some integer-valued fields, whose
5154      * information is kept in a separate parallel array to 'strings' */
5155     const lconv_offset_t * integers[2] = {
5156                                            NULL,
5157                                            lconv_integers
5158                                          };
5159
5160 #  if ! defined(USE_LOCALE_NUMERIC) && ! defined(USE_LOCALE_MONETARY)
5161
5162     /* If both NUMERIC and MONETARY must be the "C" locale, simply populate the
5163      * hash using the function that works on just that locale. */
5164     populate_hash_from_C_localeconv(hv,
5165                                     "C",
5166                                     (  OFFSET_TO_BIT(NUMERIC_OFFSET)
5167                                      | OFFSET_TO_BIT(MONETARY_OFFSET)),
5168                                      strings, integers);
5169
5170     /* We shouldn't get to here for the case of an individual item, as
5171      * preprocessor directives elsewhere in this file should have filled in the
5172      * correct values at a higher level */
5173     assert(item == 0);
5174     PERL_UNUSED_ARG(item);
5175
5176     return hv;
5177
5178 #  else
5179
5180     /* From here to the end of this function, at least one of NUMERIC or
5181      * MONETARY can be non-C */
5182
5183     /* This is a mask, with one bit to tell the populate functions to populate
5184      * the NUMERIC items; another bit for the MONETARY ones.  This way they can
5185      * choose which (or both) to populate from */
5186     U32 index_bits = 0;
5187
5188     /* Some platforms, for correct non-mojibake results, require LC_CTYPE's
5189      * locale to match LC_NUMERIC's for the numeric fields, and LC_MONETARY's
5190      * for the monetary ones.  What happens if LC_NUMERIC and LC_MONETARY
5191      * aren't compatible?  Wrong results.  To avoid that, we call localeconv()
5192      * twice, once for each locale, setting LC_CTYPE to match the category.
5193      * But if the locales of both categories are the same, there is no need for
5194      * a second call.  Assume this is the case unless overridden below */
5195     bool requires_2nd_localeconv = false;
5196
5197     /* The actual hash populating is done by one of the two populate functions.
5198      * Which one is appropriate for either the MONETARY_OFFSET or the
5199      * NUMERIC_OFFSET is calculated and then stored in this table */
5200     void (*populate[2]) (pTHX_
5201                          HV * ,
5202                          const char *,
5203                          const U32,
5204                          const lconv_offset_t **,
5205                          const lconv_offset_t **);
5206
5207     /* This gives the locale to use for the corresponding OFFSET, like the
5208      * 'populate' array above */
5209     const char * locales[2];
5210
5211 #  ifdef HAS_SOME_LANGINFO
5212
5213     /* If the only use-case for this is the full localeconv(), the 'item'
5214      * parameter is ignored. */
5215     PERL_UNUSED_ARG(item);
5216
5217 #  else     /* This only gets compiled for the use-case of using localeconv()
5218                to emulate nl_langinfo() when missing from the platform. */
5219
5220 #      ifdef USE_LOCALE_NUMERIC
5221
5222     /* We need this substructure to only return this field for the THOUSEP
5223      * item.  The other items also need substructures, but they were handled
5224      * above by placing the substructure's item at the end of the full one, so
5225      * the data structure could do double duty.  However, both this and
5226      * RADIXCHAR would need to be in the final position of the same full
5227      * structure; an impossibility.  So make this into a separate structure */
5228     const lconv_offset_t  thousands_sep_string[] = {
5229         LCONV_NUMERIC_ENTRY(thousands_sep),
5230         {NULL, 0}
5231     };
5232
5233 #      endif
5234
5235     /* End of all the initialization of data structures.  Now for actual code.
5236      *
5237      * Without nl_langinfo(), the call to my_localeconv() could be for all of
5238      * the localeconv() items or for just one of the following 3 items to
5239      * emulate nl_langinfo().
5240      *
5241      * This is compiled only when using perl_langinfo.h, which we control, and
5242      * it has been constructed so that no item is numbered 0.
5243      *
5244      * For each individual item, either return the known value if the current
5245      * locale is "C", or set up the appropriate parameters for the call below
5246      * to the populate function */
5247     if (item != 0) {
5248         const char *locale;
5249
5250         switch (item) {
5251           default:
5252             locale_panic_(Perl_form(aTHX_
5253                           "Unexpected item passed to my_localeconv: %d", item));
5254             break;
5255
5256 #      ifdef USE_LOCALE_NUMERIC
5257
5258           case RADIXCHAR:
5259             if (isNAME_C_OR_POSIX(PL_numeric_name)) {
5260                 (void) hv_stores(hv, DECIMAL_POINT_LITERAL, newSVpvs("."));
5261                 return hv;
5262             }
5263
5264             strings[NUMERIC_OFFSET] = DECIMAL_POINT_ADDRESS;
5265             goto numeric_common;
5266
5267           case THOUSEP:
5268             if (isNAME_C_OR_POSIX(PL_numeric_name)) {
5269                 (void) hv_stores(hv, THOUSANDS_SEP_LITERAL, newSVpvs(""));
5270                 return hv;
5271             }
5272
5273             strings[NUMERIC_OFFSET] = thousands_sep_string;
5274
5275           numeric_common:
5276             index_bits = OFFSET_TO_BIT(NUMERIC_OFFSET);
5277             locale = PL_numeric_name;
5278             break;
5279
5280 #      endif
5281 #      ifdef USE_LOCALE_MONETARY
5282
5283           case CRNCYSTR:    /* This item needs the values for both the currency
5284                                symbol, and another one used to construct the
5285                                nl_langino()-compatible return. */
5286
5287             locale = querylocale_c(LC_MONETARY);
5288             if (isNAME_C_OR_POSIX(locale)) {
5289                 (void) hv_stores(hv, CURRENCY_SYMBOL_LITERAL, newSVpvs(""));
5290                 (void) hv_stores(hv, P_CS_PRECEDES_LITERAL, newSViv(-1));
5291                 return hv;
5292             }
5293
5294             strings[MONETARY_OFFSET] = CURRENCY_SYMBOL_ADDRESS;
5295             integers[MONETARY_OFFSET] = P_CS_PRECEDES_ADDRESS;
5296
5297             index_bits = OFFSET_TO_BIT(MONETARY_OFFSET);
5298             break;
5299
5300 #      endif
5301
5302         } /* End of switch() */
5303
5304         /* There's only one item, so only one of each of these will get used,
5305          * but cheap to initialize both */
5306         populate[MONETARY_OFFSET] =
5307         populate[NUMERIC_OFFSET]  = S_populate_hash_from_localeconv;
5308         locales[MONETARY_OFFSET] = locales[NUMERIC_OFFSET]  = locale;
5309     }
5310     else   /* End of for just one item to emulate nl_langinfo() */
5311
5312 #    endif
5313
5314     {
5315         /* Here, the call is for all of localeconv().  It has a bunch of
5316          * items.  The first function call always gets the MONETARY values */
5317         index_bits = OFFSET_TO_BIT(MONETARY_OFFSET);
5318
5319 #    ifdef USE_LOCALE_MONETARY
5320
5321         locales[MONETARY_OFFSET] = querylocale_c(LC_MONETARY);
5322         populate[MONETARY_OFFSET] =
5323                                 (isNAME_C_OR_POSIX(locales[MONETARY_OFFSET]))
5324                                 ?  S_populate_hash_from_C_localeconv
5325                                 :  S_populate_hash_from_localeconv;
5326
5327 #    else
5328
5329         locales[MONETARY_OFFSET] = "C";
5330         populate[MONETARY_OFFSET] = S_populate_hash_from_C_localeconv;
5331
5332 #    endif
5333 #    ifdef USE_LOCALE_NUMERIC
5334
5335         /* And if the locales for the two categories are the same, we can also
5336          * do the NUMERIC values in the same call */
5337         if (strEQ(PL_numeric_name, locales[MONETARY_OFFSET])) {
5338             index_bits |= OFFSET_TO_BIT(NUMERIC_OFFSET);
5339             locales[NUMERIC_OFFSET] = locales[MONETARY_OFFSET];
5340             populate[NUMERIC_OFFSET] = populate[MONETARY_OFFSET];
5341         }
5342         else {
5343             requires_2nd_localeconv = true;
5344             locales[NUMERIC_OFFSET] = PL_numeric_name;
5345             populate[NUMERIC_OFFSET] = (isNAME_C_OR_POSIX(PL_numeric_name))
5346                                        ?  S_populate_hash_from_C_localeconv
5347                                        :  S_populate_hash_from_localeconv;
5348         }
5349
5350 #    else
5351
5352         /* When LC_NUMERIC is confined to "C", the two locales are the same
5353            iff LC_MONETARY in this case is also "C".  We set up the function
5354            for that case above, so fastest to test just its address */
5355         locales[NUMERIC_OFFSET] = "C";
5356         if (populate[MONETARY_OFFSET] == S_populate_hash_from_C_localeconv) {
5357             index_bits |= OFFSET_TO_BIT(NUMERIC_OFFSET);
5358             populate[NUMERIC_OFFSET] = populate[MONETARY_OFFSET];
5359         }
5360         else {
5361             requires_2nd_localeconv = true;
5362             populate[NUMERIC_OFFSET] = S_populate_hash_from_C_localeconv;
5363         }
5364
5365 #    endif
5366
5367     }   /* End of call is for localeconv() */
5368
5369     /* Call the proper populate function (which may call localeconv()) and copy
5370      * its results into the hash.  All the parameters have been initialized
5371      * above */
5372     (*populate[MONETARY_OFFSET])(aTHX_
5373                                  hv, locales[MONETARY_OFFSET],
5374                                  index_bits, strings, integers);
5375
5376 #    ifndef HAS_SOME_LANGINFO  /* Could be using this function to emulate
5377                                 nl_langinfo() */
5378
5379     /* We are done when called with an individual item.  There are no integer
5380      * items to adjust, and it's best for the caller to determine if this
5381      * string item is UTF-8 or not.  This is because the locale's UTF-8ness is
5382      * calculated below, and in some Configurations, that can lead to a
5383      * recursive call to here, which could recurse infinitely. */
5384     if (item != 0) {
5385         return hv;
5386     }
5387
5388 #    endif
5389
5390     /* The above call may have done all the hash fields, but not always, as
5391      * already explained.  If we need a second call it is always for the
5392      * NUMERIC fields */
5393     if (requires_2nd_localeconv) {
5394         (*populate[NUMERIC_OFFSET])(aTHX_
5395                                     hv,
5396                                     locales[NUMERIC_OFFSET],
5397                                     OFFSET_TO_BIT(NUMERIC_OFFSET),
5398                                     strings, integers);
5399     }
5400
5401     /* Here, the hash has been completely populated.
5402      *
5403      * Now go through all the items and:
5404      *  a) For string items, see if they should be marked as UTF-8 or not.
5405      *     This would have been more convenient and faster to do while
5406      *     populating the hash in the first place, but that operation has to be
5407      *     done within a critical section, keeping other threads from
5408      *     executing, so only the minimal amount of work necessary is done at
5409      *     that time.
5410      *  b) For integer items, convert the C CHAR_MAX value into -1.  Again,
5411      *     this could have been done in the critical section, but was deferred
5412      *     to here to keep to the bare minimum amount the time spent owning the
5413      *     processor. CHAR_MAX is a C concept for an 8-bit character type.
5414      *     Perl has no such type; the closest fit is a -1.
5415      *
5416      * XXX On unthreaded perls, this code could be #ifdef'd out, and the
5417      * corrections determined at hash population time, at an extra maintenance
5418      * cost which khw doesn't think is worth it
5419      */
5420
5421     for (unsigned int i = 0; i < 2; i++) {  /* Try both types of strings */
5422
5423         /* The return from this function is already adjusted */
5424         if (populate[i] == S_populate_hash_from_C_localeconv) {
5425             continue;
5426         }
5427
5428         /* Examine each string */
5429         for (const lconv_offset_t *strp = strings[i]; strp->name; strp++) {
5430             const char * name = strp->name;
5431
5432             /* 'value' will contain the string that may need to be marked as
5433              * UTF-8 */
5434             SV ** value = hv_fetch(hv, name, strlen(name), true);
5435             if (value == NULL) {
5436                 continue;
5437             }
5438
5439             /* Determine if the string should be marked as UTF-8. */
5440             if (UTF8NESS_YES == (get_locale_string_utf8ness_i(SvPVX(*value),
5441                                                   LOCALE_UTF8NESS_UNKNOWN,
5442                                                   locales[i],
5443                                                   LC_ALL_INDEX_ /* OOB */)))
5444             {
5445                 SvUTF8_on(*value);
5446             }
5447         }
5448
5449         if (integers[i] == NULL) {
5450             continue;
5451         }
5452
5453         /* And each integer */
5454         for (const lconv_offset_t *intp = integers[i]; intp->name; intp++) {
5455             const char * name = intp->name;
5456
5457         if (! name) {   /* Reached the end */
5458             break;
5459         }
5460
5461         SV ** value = hv_fetch(hv, name, strlen(name), true);
5462         if (! value) {
5463             continue;
5464         }
5465
5466         /* Change CHAR_MAX to -1 */
5467         if (SvIV(*value) == CHAR_MAX) {
5468             sv_setiv(*value, -1);
5469         }
5470         }
5471     }
5472
5473     return hv;
5474
5475 #  endif    /* End of must have one or both USE_MONETARY, USE_NUMERIC */
5476
5477 }
5478
5479 STATIC void
5480 S_populate_hash_from_C_localeconv(pTHX_ HV * hv,
5481                                         const char * locale,  /* Unused */
5482
5483                                         /* bit mask of which categories to
5484                                          * populate */
5485                                         const U32 which_mask,
5486
5487                                         /* The string type values to return;
5488                                          * one element for numeric; the other
5489                                          * for monetary */
5490                                         const lconv_offset_t * strings[2],
5491
5492                                         /* And the integer fields */
5493                                         const lconv_offset_t * integers[2])
5494 {
5495     PERL_ARGS_ASSERT_POPULATE_HASH_FROM_C_LOCALECONV;
5496     PERL_UNUSED_ARG(locale);
5497     assert(isNAME_C_OR_POSIX(locale));
5498
5499     /* Fill hv with the values that localeconv() is supposed to return for
5500      * the C locale */
5501
5502     U32 working_mask = which_mask;
5503     while (working_mask) {
5504
5505         /* Get the bit position of the next lowest set bit.  That is the
5506          * index into the 'strings' array of the category we use in this loop
5507          * iteration.  Turn the bit off so we don't work on this category
5508          * again in this function call. */
5509         const PERL_UINT_FAST8_T i = lsbit_pos(working_mask);
5510         working_mask &= ~ (1 << i);
5511
5512         /* This category's string fields */
5513         const lconv_offset_t * category_strings = strings[i];
5514
5515 #  ifndef HAS_SOME_LANGINFO /* This doesn't work properly if called on a single
5516                                item, which could only happen when there isn't
5517                                nl_langinfo on the platform */
5518         assert(category_strings[1].name != NULL);
5519 #  endif
5520
5521         /* All string fields are empty except for one NUMERIC one.  That one
5522          * has been initialized to be the final one in the NUMERIC strings, so
5523          * stop the loop early in that case.  Otherwise, we would store an
5524          * empty string to the hash, and immediately overwrite it with the
5525          * correct value */
5526         const unsigned int stop_early = (i == NUMERIC_OFFSET) ? 1 : 0;
5527
5528         /* A NULL element terminates the list */
5529         while ((category_strings + stop_early)->name) {
5530             (void) hv_store(hv,
5531                             category_strings->name,
5532                             strlen(category_strings->name),
5533                             newSVpvs(""),
5534                             0);
5535
5536             category_strings++;
5537         }
5538
5539         /* And fill in the NUMERIC exception */
5540         if (i == NUMERIC_OFFSET) {
5541             (void) hv_stores(hv, "decimal_point", newSVpvs("."));
5542             category_strings++;
5543         }
5544
5545         /* Add any int fields.  In the C locale, all are -1 */
5546         if (integers[i]) {
5547             const lconv_offset_t * current = integers[i];
5548             while (current->name) {
5549                 (void) hv_store(hv,
5550                                 current->name, strlen(current->name),
5551                                 newSViv(-1),
5552                                 0);
5553                 current++;
5554             }
5555         }
5556     }
5557 }
5558
5559 #  if defined(USE_LOCALE_NUMERIC) || defined(USE_LOCALE_MONETARY)
5560
5561 STATIC void
5562 S_populate_hash_from_localeconv(pTHX_ HV * hv,
5563
5564                                       /* Switch to this locale to run
5565                                        * localeconv() from */
5566                                       const char * locale,
5567
5568                                       /* bit mask of which categories to
5569                                        * populate */
5570                                       const U32 which_mask,
5571
5572                                       /* The string type values to return; one
5573                                        * element for numeric; the other for
5574                                        * monetary */
5575                                       const lconv_offset_t * strings[2],
5576
5577                                       /* And similarly the integer fields */
5578                                       const lconv_offset_t * integers[2])
5579 {
5580     PERL_ARGS_ASSERT_POPULATE_HASH_FROM_LOCALECONV;
5581
5582     /* Run localeconv() and copy some or all of its results to the input 'hv'
5583      * hash.  Most localeconv() implementations return the values in a global
5584      * static buffer, so the operation must be performed in a critical section,
5585      * ending only after the copy is completed.  There are so many locks
5586      * because localeconv() deals with two categories, and returns in a single
5587      * global static buffer.  Some locks might be no-ops on this platform, but
5588      * not others.  We need to lock if any one isn't a no-op. */
5589
5590 #    ifdef WE_MUST_DEAL_WITH_MISMATCHED_CTYPE
5591
5592     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
5593
5594 #    endif
5595 #    ifdef USE_LOCALE_NUMERIC
5596
5597     /* We need to toggle to the underlying NUMERIC locale if we are getting
5598      * NUMERIC strings */
5599     const char * orig_NUMERIC_locale = NULL;
5600     if (which_mask & OFFSET_TO_BIT(NUMERIC_OFFSET)) {
5601         LC_NUMERIC_LOCK(0);
5602
5603 #      if defined(WIN32)
5604
5605         /* There is a bug in Windows in which setting LC_CTYPE after the others
5606          * doesn't actually take effect for localeconv().  See commit
5607          * 418efacd1950763f74ed3cc22f8cf9206661b892 for details.  Thus we have
5608          * to make sure that the locale we want is set after LC_CTYPE.  We
5609          * unconditionally toggle away from and back to the current locale
5610          * prior to calling localeconv(). */
5611         orig_NUMERIC_locale = toggle_locale_c(LC_NUMERIC, "C");
5612         toggle_locale_c(LC_NUMERIC, locale);
5613
5614 #      else
5615
5616         /* No need for the extra toggle when not on Windows */
5617         orig_NUMERIC_locale = toggle_locale_c(LC_NUMERIC, locale);
5618
5619 #      endif
5620
5621     }
5622
5623 #    endif
5624 #    if defined(USE_LOCALE_MONETARY) && defined(WIN32)
5625
5626     /* Same Windows bug as described just above for NUMERIC.  Otherwise, no
5627      * need to toggle LC_MONETARY, as it is kept in the underlying locale */
5628     const char * orig_MONETARY_locale = NULL;
5629     if (which_mask & OFFSET_TO_BIT(MONETARY_OFFSET)) {
5630         orig_MONETARY_locale = toggle_locale_c(LC_MONETARY, "C");
5631         toggle_locale_c(LC_MONETARY, locale);
5632     }
5633
5634 #    endif
5635
5636     /* Finally ready to do the actual localeconv().  Lock to prevent other
5637      * accesses until we have made a copy of its returned static buffer */
5638     gwLOCALE_LOCK;
5639
5640 #    if defined(TS_W32_BROKEN_LOCALECONV) && defined(USE_THREAD_SAFE_LOCALE)
5641
5642     /* This is a workaround for another bug in Windows.  localeconv() was
5643      * broken with thread-safe locales prior to VS 15.  It looks at the global
5644      * locale instead of the thread one.  As a work-around, we toggle to the
5645      * global locale; populate the return; then toggle back.  We have to use
5646      * LC_ALL instead of the individual categories because of yet another bug
5647      * in Windows.  And this all has to be done in a critical section.
5648      *
5649      * This introduces a potential race with any other thread that has also
5650      * converted to use the global locale, and doesn't protect its locale calls
5651      * with mutexes.  khw can't think of any reason for a thread to do so on
5652      * Windows, as the locale API is the same regardless of thread-safety,
5653      * except if the code is ported from working on another platform where
5654      * there might be some reason to do this.  But this is typically due to
5655      * some alien-to-Perl library that thinks it owns locale setting.  Such a
5656      * library isn't likely to exist on Windows, so such an application is
5657      * unlikely to be run on Windows
5658      */
5659     bool restore_per_thread = FALSE;
5660
5661     /* Save the per-thread locale state */
5662     const char * save_thread = querylocale_c(LC_ALL);
5663
5664     /* Change to the global locale, and note if we already were there */
5665     int config_return = _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
5666     if (config_return != _DISABLE_PER_THREAD_LOCALE) {
5667         if (config_return == -1) {
5668             locale_panic_("_configthreadlocale returned an error");
5669         }
5670
5671         restore_per_thread = TRUE;
5672     }
5673
5674     /* Save the state of the global locale; then convert to our desired
5675      * state.  */
5676     const char * save_global = querylocale_c(LC_ALL);
5677     void_setlocale_c(LC_ALL, save_thread);
5678
5679 #    endif  /* TS_W32_BROKEN_LOCALECONV */
5680
5681     /* Finally, do the actual localeconv */
5682     const char *lcbuf_as_string = (const char *) localeconv();
5683
5684     /* Copy its results for each desired category as determined by
5685      * 'which_mask' */
5686     U32 working_mask = which_mask;
5687     while (working_mask) {
5688
5689         /* Get the bit position of the next lowest set bit.  That is the
5690          * index into the 'strings' array of the category we use in this loop
5691          * iteration.  Turn the bit off so we don't work on this category
5692          * again in this function call. */
5693         const PERL_UINT_FAST8_T i = lsbit_pos32(working_mask);
5694         working_mask &= ~ (1 << i);
5695
5696         /* Point to the string field list for the given category ... */
5697         const lconv_offset_t * category_strings = strings[i];
5698         while (category_strings->name) {
5699
5700             /* We have set things up so that we know where in the returned
5701              * structure, when viewed as a string, the corresponding value is.
5702              * */
5703             const char *value = *((const char **)(  lcbuf_as_string
5704                                                   + category_strings->offset));
5705             if (value) {    /* Copy to the hash */
5706                 (void) hv_store(hv,
5707                                 category_strings->name,
5708                                 strlen(category_strings->name),
5709                                 newSVpv(value, strlen(value)),
5710                                 0);
5711             }
5712
5713             category_strings++;
5714         }
5715
5716         /* Add any int fields to the HV*. */
5717         if (integers[i]) {
5718             const lconv_offset_t * current = integers[i];
5719             while (current->name) {
5720                 const char value = *((const char *)(  lcbuf_as_string
5721                                                     + current->offset));
5722                 (void) hv_store(hv,
5723                                 current->name, strlen(current->name),
5724                                 newSViv(value),
5725                                 0);
5726                 current++;
5727             }
5728         }
5729     }   /* End of loop through the fields */
5730
5731     /* Done with copying to the hash.  Can unwind the critical section locks */
5732
5733 #    if defined(TS_W32_BROKEN_LOCALECONV) && defined(USE_THREAD_SAFE_LOCALE)
5734
5735     /* Restore the global locale's prior state */
5736     void_setlocale_c(LC_ALL, save_global);
5737
5738     /* And back to per-thread locales */
5739     if (restore_per_thread) {
5740         if (_configthreadlocale(_ENABLE_PER_THREAD_LOCALE) == -1) {
5741             locale_panic_("_configthreadlocale returned an error");
5742         }
5743     }
5744
5745     /* Restore the per-thread locale state */
5746     void_setlocale_c(LC_ALL, save_thread);
5747
5748 #    endif  /* TS_W32_BROKEN_LOCALECONV */
5749
5750     gwLOCALE_UNLOCK;    /* Finished with the critical section of a
5751                            globally-accessible buffer */
5752
5753 #    if defined(USE_LOCALE_MONETARY) && defined(WIN32)
5754
5755     restore_toggled_locale_c(LC_MONETARY, orig_MONETARY_locale);
5756
5757 #    endif
5758 #    ifdef USE_LOCALE_NUMERIC
5759
5760     restore_toggled_locale_c(LC_NUMERIC, orig_NUMERIC_locale);
5761     if (which_mask & OFFSET_TO_BIT(NUMERIC_OFFSET)) {
5762         LC_NUMERIC_UNLOCK;
5763     }
5764
5765 #    endif
5766 #    ifdef WE_MUST_DEAL_WITH_MISMATCHED_CTYPE
5767
5768     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
5769
5770 #    endif
5771
5772 }
5773
5774 #  endif    /* defined(USE_LOCALE_NUMERIC) || defined(USE_LOCALE_MONETARY) */
5775 #endif /* defined(HAS_LOCALECONV) */
5776
5777 /*
5778
5779 =for apidoc      Perl_langinfo
5780 =for apidoc_item Perl_langinfo8
5781
5782 C<Perl_langinfo> is an (almost) drop-in replacement for the system
5783 C<L<nl_langinfo(3)>>, taking the same C<item> parameter values, and returning
5784 the same information.  But it is more thread-safe than regular
5785 C<nl_langinfo()>, and hides the quirks of Perl's locale handling from your
5786 code, and can be used on systems that lack a native C<nl_langinfo>.
5787
5788 However, you should instead use the improved version of this:
5789 L</Perl_langinfo8>, which behaves identically except for an additional
5790 parameter, a pointer to a variable declared as L</C<utf8ness_t>>, into which it
5791 returns to you how you should treat the returned string with regards to it
5792 being encoded in UTF-8 or not.
5793
5794 Concerning the differences between these and plain C<nl_langinfo()>:
5795
5796 =over
5797
5798 =item a.
5799
5800 C<Perl_langinfo8> has an extra parameter, described above.  Besides this, the
5801 other reason they aren't quite a drop-in replacement is actually an advantage.
5802 The C<const>ness of the return allows the compiler to catch attempts to write
5803 into the returned buffer, which is illegal and could cause run-time crashes.
5804
5805 =item b.
5806
5807 They deliver the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
5808 without you having to write extra code.  The reason for the extra code would be
5809 because these are from the C<LC_NUMERIC> locale category, which is normally
5810 kept set by Perl so that the radix is a dot, and the separator is the empty
5811 string, no matter what the underlying locale is supposed to be, and so to get
5812 the expected results, you have to temporarily toggle into the underlying
5813 locale, and later toggle back.  (You could use plain C<nl_langinfo> and
5814 C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this but then you wouldn't get
5815 the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
5816 (or equivalent) locale would break a lot of CPAN, which is expecting the radix
5817 (decimal point) character to be a dot.)
5818
5819 =item c.
5820
5821 The system function they replace can have its static return buffer trashed,
5822 not only by a subsequent call to that function, but by a C<freelocale>,
5823 C<setlocale>, or other locale change.  The returned buffer of these functions
5824 is not changed until the next call to one or the other, so the buffer is never
5825 in a trashed state.
5826
5827 =item d.
5828
5829 The return buffer is per-thread, so it also is never overwritten by a call to
5830 these functions from another thread;  unlike the function it replaces.
5831
5832 =item e.
5833
5834 But most importantly, they work on systems that don't have C<nl_langinfo>, such
5835 as Windows, hence making your code more portable.  Of the fifty-some possible
5836 items specified by the POSIX 2008 standard,
5837 L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
5838 only one is completely unimplemented, though on non-Windows platforms, another
5839 significant one is not fully implemented).  They use various techniques to
5840 recover the other items, including calling C<L<localeconv(3)>>, and
5841 C<L<strftime(3)>>, both of which are specified in C89, so should be always be
5842 available.  Later C<strftime()> versions have additional capabilities.
5843 If an item is not available on your system, this returns either the value
5844 associated with the C locale, or simply C<"">, whichever is more appropriate.
5845
5846 It is important to note that, when called with an item that is recovered by
5847 using C<localeconv>, the buffer from any previous explicit call to
5848 C<L<localeconv(3)>> will be overwritten.  But you shouldn't be using
5849 C<localeconv> anyway because it is is very much not thread-safe, and suffers
5850 from the same problems outlined in item 'b.' above for the fields it returns
5851 that are controlled by the LC_NUMERIC locale category.  Instead, avoid all of
5852 those problems by calling L</Perl_localeconv>, which is thread-safe; or by
5853 using the methods given in L<perlcall>  to call
5854 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is also thread-safe.
5855
5856 =back
5857
5858 The details for those items which may deviate from what this emulation returns
5859 and what a native C<nl_langinfo()> would return are specified in
5860 L<I18N::Langinfo>.
5861
5862 When using C<Perl_langinfo8> (or plain C<Perl_langinfo>) on systems that don't
5863 have a native C<nl_langinfo()>, you must
5864
5865  #include "perl_langinfo.h"
5866
5867 before the C<perl.h> C<#include>.  You can replace your F<langinfo.h>
5868 C<#include> with this one.  (Doing it this way keeps out the symbols that plain
5869 F<langinfo.h> would try to import into the namespace for code that doesn't need
5870 it.)
5871
5872 =cut
5873
5874 */
5875
5876 const char *
5877 Perl_langinfo(const nl_item item)
5878 {
5879     return Perl_langinfo8(item, NULL);
5880 }
5881
5882 const char *
5883 Perl_langinfo8(const nl_item item, utf8ness_t * utf8ness)
5884 {
5885     dTHX;
5886     locale_category_index cat_index = LC_ALL_INDEX_;
5887
5888     PERL_ARGS_ASSERT_PERL_LANGINFO8;
5889
5890     if (utf8ness) {     /* Assume for now */
5891         *utf8ness = UTF8NESS_IMMATERIAL;
5892     }
5893
5894     /* Find the locale category that controls the input 'item'.  If we are not
5895      * paying attention to that category, instead return a default value.  Also
5896      * return the default value if there is no way for us to figure out the
5897      * correct value.  If we have some form of nl_langinfo(), we can always
5898      * figure it out, but lacking that, there may be alternative methods that
5899      * can be used to recover most of the possible items.  Some of those
5900      * methods need libc functions, which may or may not be available.  If
5901      * unavailable, we can't compute the correct value, so must here return the
5902      * default. */
5903     switch (item) {
5904
5905       case CODESET:
5906
5907 #  ifdef USE_LOCALE_CTYPE
5908         cat_index = LC_CTYPE_INDEX_;
5909 #  endif
5910         break;
5911
5912
5913       case YESEXPR: case YESSTR: case NOEXPR: case NOSTR:
5914
5915 #  ifdef USE_LOCALE_MESSAGES
5916         cat_index = LC_MESSAGES_INDEX_;
5917 #  endif
5918         break;
5919
5920
5921       case CRNCYSTR:
5922
5923 #  ifdef USE_LOCALE_MONETARY
5924         cat_index = LC_MONETARY_INDEX_;
5925 #  endif
5926         break;
5927
5928
5929       case RADIXCHAR: case THOUSEP:
5930
5931 #  ifdef USE_LOCALE_NUMERIC
5932         cat_index = LC_NUMERIC_INDEX_;
5933 #  endif
5934         break;
5935
5936
5937       default:  /* The other possible items are all in LC_TIME. */
5938 #  ifdef USE_LOCALE_TIME
5939         cat_index = LC_TIME_INDEX_;
5940 #  endif
5941         break;
5942
5943
5944 #if ! defined(USE_LOCALE_TIME) || ! defined(HAS_SOME_LANGINFO)
5945
5946     /* If not using LC_TIME, hard code the rest.  Or, if there is no
5947      * nl_langinfo(), we use strftime() as an alternative, and it is missing
5948      * functionality to get every single one, so hard-code those */
5949
5950       /* These formats are defined by C89, so we assume that strftime supports
5951        * them, and so are returned unconditionally; they may not be what the
5952        * locale actually says, but should give good enough results for someone
5953        * using them as formats (as opposed to trying to parse them to figure
5954        * out what the locale says).  The other format items are actually tested
5955        * to verify they work on the platform */
5956       case D_FMT:         return "%x";
5957       case T_FMT:         return "%X";
5958       case D_T_FMT:       return "%c";
5959
5960 #  if defined(WIN32) || ! defined(USE_LOCALE_TIME)
5961
5962       /* strftime() on Windows doesn't have the POSIX (beyond C89) extensions
5963        * that would allow it to recover these */
5964       case ERA_D_FMT:     return "%x";
5965       case ERA_T_FMT:     return "%X";
5966       case ERA_D_T_FMT:   return "%c";
5967       case ALT_DIGITS:    return "0";
5968
5969 #  endif
5970 #  ifndef USE_LOCALE_TIME
5971
5972       case T_FMT_AMPM:    return "%r";
5973 #  endif
5974 #endif
5975
5976     } /* End of switch on item */
5977
5978 #ifndef USE_LOCALE
5979
5980     return emulate_langinfo(item, "C",
5981                             &PL_langinfo_buf, &PL_langinfo_bufsize,
5982                             utf8ness);
5983
5984 #else
5985 #  if defined(HAS_IGNORED_LOCALE_CATEGORIES_) || ! defined(LC_MESSAGES)
5986
5987     /* If the above didn't find the category's index, it has to be because the
5988      * item is unknown to us (and the callee will handle that), or the category
5989      * is confined to the "C" locale on this platform, which the callee also
5990      * handles.  (LC_MESSAGES is not required by the C Standard (the others
5991      * above are), so we have to emulate it on platforms lacking it (such as
5992      * Windows).) */
5993     if (cat_index == LC_ALL_INDEX_) {
5994         return emulate_langinfo(item, "C",
5995                                 &PL_langinfo_buf, &PL_langinfo_bufsize,
5996                                 utf8ness);
5997     }
5998
5999 #  endif
6000
6001     return my_langinfo_i(item,
6002                          cat_index,
6003                          query_nominal_locale_i(cat_index),
6004                          &PL_langinfo_buf, &PL_langinfo_bufsize,
6005                          utf8ness);
6006 #endif
6007
6008 }
6009
6010 #if defined(USE_LOCALE) && defined(HAS_NL_LANGINFO)
6011
6012 STATIC const char *
6013 S_my_langinfo_i(pTHX_
6014                 const nl_item item,           /* The item to look up */
6015
6016                 /* The locale category that controls it */
6017                 locale_category_index cat_index,
6018
6019                 /* The locale to look up 'item' in. */
6020                 const char * locale,
6021
6022                 /* Where to store the result, and where the size of that buffer
6023                  * is stored, updated on exit. retbuf_sizep may be NULL for an
6024                  * empty-on-entry, single use buffer whose size we don't need
6025                  * to keep track of */
6026                 char ** retbufp,
6027                 Size_t * retbuf_sizep,
6028
6029                 /* If not NULL, the location to store the UTF8-ness of 'item's
6030                  * value, as documented */
6031                 utf8ness_t * utf8ness)
6032 {
6033     PERL_ARGS_ASSERT_MY_LANGINFO_I;
6034     assert(cat_index < LC_ALL_INDEX_);
6035
6036     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6037                            "Entering my_langinfo item=%ld, using locale %s\n",
6038                            (long) item, locale));
6039
6040     /* One might be tempted to avoid any toggling by instead using
6041      * nl_langinfo_l() on platforms that have it.  This would entail creating a
6042      * locale object with newlocale() and freeing it afterwards.  But doing so
6043      * runs significantly slower than just doing the toggle ourselves.
6044      * lib/locale_threads.t was slowed down by 25% on Ubuntu 22.04 */
6045
6046 #  ifdef WE_MUST_DEAL_WITH_MISMATCHED_CTYPE
6047
6048     /* This function sorts out if things actually have to be switched or not,
6049      * for both save and restore. */
6050     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
6051
6052 #  endif
6053
6054     const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
6055
6056     gwLOCALE_LOCK;
6057     const char * retval = save_to_buffer(nl_langinfo(item),
6058                                          retbufp, retbuf_sizep);
6059     gwLOCALE_UNLOCK;
6060
6061     restore_toggled_locale_i(cat_index, orig_switched_locale);
6062
6063 #  ifdef WE_MUST_DEAL_WITH_MISMATCHED_CTYPE
6064
6065     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
6066
6067 #  endif
6068
6069     if (utf8ness) {
6070         *utf8ness = get_locale_string_utf8ness_i(retval,
6071                                                  LOCALE_UTF8NESS_UNKNOWN,
6072                                                  locale, cat_index);
6073     }
6074
6075     return retval;
6076 }
6077
6078 #endif
6079 #ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
6080
6081 /* Forward declaration of function that we don't put into embed.fnc so as to
6082  * make its removal easier, as there may not be any extant platforms that need
6083  * it; and the function is located after emulate_langinfo() because it's easier
6084  * to understand when placed in the context of that code */
6085 STATIC const char * S_override_codeset_if_utf8_found(pTHX_
6086                                                      const char *codeset,
6087                                                      const char *locale);
6088 #endif
6089 #if ! defined(HAS_NL_LANGINFO)                      \
6090  ||   defined(HAS_IGNORED_LOCALE_CATEGORIES_)       \
6091  || ! defined(LC_MESSAGES)
6092
6093 STATIC const char *
6094 S_emulate_langinfo(pTHX_ const nl_item item,
6095                          const char * locale,
6096                          char ** retbufp,
6097                          Size_t * retbuf_sizep,
6098                          utf8ness_t * utf8ness)
6099 {
6100     PERL_ARGS_ASSERT_EMULATE_LANGINFO;
6101 #  ifndef USE_LOCALE
6102     PERL_UNUSED_ARG(locale);
6103 #  endif
6104
6105     /* This emulates nl_langinfo() on platforms:
6106      *   1) where it doesn't exist; or
6107      *   2) where it does exist, but there are categories that it shouldn't be
6108      *      called on because they don't exist on the platform or we are
6109      *      supposed to always stay in the C locale for them.  This function
6110      *      has hard-coded in the results for those for the C locale.
6111      *
6112      * The major platform lacking nl_langinfo() is Windows.  It does have
6113      * GetLocaleInfoEx() that could be used to get most of the items, but it
6114      * (and other similar Windows API functions) use what MS calls "locale
6115      * names", whereas the C functions use what MS calls "locale strings".  The
6116      * locale string "English_United_States.1252" is equivalent to the locale
6117      * name "en_US".  There are tables inside Windows that translate between
6118      * the two forms, but they are not exposed.  Also calling setlocale(), then
6119      * calling GetThreadLocale() doesn't work, as the former doesn't change the
6120      * latter's return.  Therefore we are stuck using the mechanisms below. */
6121
6122     /* Almost all the items will have ASCII return values.  Set that here, and
6123      * override if necessary */
6124     utf8ness_t is_utf8 = UTF8NESS_IMMATERIAL;
6125     const char * retval = NULL;
6126
6127     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6128                         "Entering emulate_langinfo item=%ld, using locale %s\n",
6129                         (long) item, locale));
6130
6131 #  if defined(HAS_LOCALECONV) && (   defined(USE_LOCALE_NUMERIC)      \
6132                                   || defined(USE_LOCALE_MONETARY))
6133
6134     locale_category_index  cat_index;
6135
6136 #  endif
6137
6138     GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
6139
6140     switch (item) {
6141
6142       /* The following items have no way khw could figure out how to get except
6143        * via nl_langinfo() */
6144       case YESEXPR:   retval = "^[+1yY]"; break;
6145       case YESSTR:    retval = "yes";     break;
6146       case NOEXPR:    retval = "^[-0nN]"; break;
6147       case NOSTR:     retval = "no";      break;
6148
6149 #  if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
6150
6151       case CRNCYSTR:
6152         cat_index = LC_MONETARY_INDEX_;
6153         goto use_localeconv;
6154
6155 #  else
6156
6157       case CRNCYSTR:
6158         retval = "-";
6159         break;
6160
6161 #  endif
6162 #  if defined(USE_LOCALE_NUMERIC) && defined(HAS_LOCALECONV)
6163
6164       case THOUSEP:
6165         cat_index = LC_NUMERIC_INDEX_;
6166         goto use_localeconv;
6167
6168 #  else
6169
6170       case THOUSEP:
6171         retval = C_thousands_sep;
6172         break;
6173
6174 #  endif
6175
6176       case RADIXCHAR:
6177
6178 #  if    defined(USE_LOCALE_NUMERIC) && defined(HAS_SNPRINTF)           \
6179    && (! defined(HAS_LOCALECONV) || defined(TS_W32_BROKEN_LOCALECONV))
6180
6181         {   /* snprintf() can be used to find the radix character by outputting
6182              * a known simple floating point number to a buffer, and parsing
6183              * it, inferring the radix as the bytes separating the integer and
6184              * fractional parts.  But localeconv() is more direct, not
6185              * requiring inference, so use it instead of the code just below,
6186              * if (likely) it is available and works ok */
6187
6188             char * floatbuf = NULL;
6189             const Size_t initial_size = 10;
6190
6191             Newx(floatbuf, initial_size, char);
6192
6193 #    if defined(WE_MUST_DEAL_WITH_MISMATCHED_CTYPE)
6194             const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
6195 #    endif
6196
6197             const char * orig_NUMERIC_locale = toggle_locale_c(LC_NUMERIC,
6198                                                                locale);
6199             /* 1.5 is exactly representable on binary computers */
6200             Size_t needed_size = snprintf(floatbuf, initial_size, "%.1f", 1.5);
6201
6202             /* If our guess wasn't big enough, increase and try again, based on
6203              * the real number that snprintf() is supposed to return */
6204             if (UNLIKELY(needed_size >= initial_size)) {
6205                 needed_size++;  /* insurance */
6206                 Renew(floatbuf, needed_size, char);
6207                 Size_t new_needed = snprintf(floatbuf, needed_size, "%.1f",
6208                                              1.5);
6209                 assert(new_needed <= needed_size);
6210                 needed_size = new_needed;
6211             }
6212
6213             restore_toggled_locale_c(LC_NUMERIC, orig_NUMERIC_locale);
6214
6215 #    if defined(WE_MUST_DEAL_WITH_MISMATCHED_CTYPE)
6216             restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
6217 #    endif
6218
6219             char * s = floatbuf;
6220             char * e = floatbuf + needed_size;
6221
6222             /* Find the '1' */
6223             while (s < e && *s != '1') {
6224                 s++;
6225             }
6226
6227             if (LIKELY(s < e)) {
6228                 s++;
6229             }
6230
6231             /* Find the '5' */
6232             char * item_start = s;
6233             while (s < e && *s != '5') {
6234                 s++;
6235             }
6236
6237             /* Everything in between is the radix string */
6238             if (LIKELY(s < e)) {
6239                 *s = '\0';
6240                 retval = save_to_buffer(item_start, retbufp, retbuf_sizep);
6241                 Safefree(floatbuf);
6242
6243                 if (utf8ness) {
6244                     is_utf8 = get_locale_string_utf8ness_i(retval,
6245                                                         LOCALE_UTF8NESS_UNKNOWN,
6246                                                         locale,
6247                                                         LC_NUMERIC_INDEX_);
6248                 }
6249
6250                 break;
6251             }
6252
6253             Safefree(floatbuf);
6254         }
6255
6256 #  endif  /* Trying snprintf() */
6257
6258         /* Here snprintf() was not compiled, or failed */
6259
6260 #  if ! defined(USE_LOCALE_NUMERIC) || ! defined(HAS_LOCALECONV)
6261
6262         retval = C_decimal_point;
6263         break;
6264
6265 #  else         /* snprintf() failed; drop down to use localeconv() */
6266
6267         cat_index = LC_NUMERIC_INDEX_;
6268
6269 #  endif
6270 #  if defined(HAS_LOCALECONV) && (   defined(USE_LOCALE_NUMERIC)      \
6271                                   || defined(USE_LOCALE_MONETARY))
6272
6273     /* These items are available from localeconv(). */
6274
6275    /* case RADIXCHAR:   // May drop down to here in some configurations
6276       case THOUSEP:     // Jumps to here
6277       case CRNCYSTR:    // Jumps to here */
6278       use_localeconv:
6279        {
6280
6281         /* The hash gets populated with just the field(s) related to 'item'. */
6282         HV * result_hv = my_localeconv(item);
6283
6284         SV* string;
6285         if (item != CRNCYSTR) {
6286
6287             /* These items have been populated with just one key => value */
6288             (void) hv_iterinit(result_hv);
6289             HE * entry = hv_iternext(result_hv);
6290             string = hv_iterval(result_hv, entry);
6291         }
6292         else {
6293
6294             /* But CRNCYSTR localeconv() returns a slightly different value
6295              * than the nl_langinfo() API calls for, so have to modify this one
6296              * to conform.  We need another value from localeconv() to know
6297              * what to change it to.  my_localeconv() has populated the hash
6298              * with exactly both fields.  Delete this one, leaving just the
6299              * CRNCYSTR one in the hash */
6300             SV* precedes = hv_delete(result_hv,
6301                                      P_CS_PRECEDES_LITERAL,
6302                                      STRLENs(P_CS_PRECEDES_LITERAL),
6303                                      0);
6304             if (! precedes) {
6305                 locale_panic_("my_localeconv() unexpectedly didn't return"
6306                               " a value for " P_CS_PRECEDES_LITERAL);
6307             }
6308
6309             /* The modification is to prefix the localeconv() return with a
6310              * single byte, calculated as follows: */
6311             const char * prefix = (LIKELY(SvIV(precedes) != -1))
6312                                    ? ((precedes != 0) ?  "-" : "+")
6313                                    : ".";
6314             /* (khw couldn't find any documentation that the dot is signalled
6315              * by CHAR_MAX (which we modify to -1), but cygwin uses it thusly,
6316              * and it makes sense given that CHAR_MAX indicates the value isn't
6317              * used, so it neither precedes nor succeeds) */
6318
6319             /* Now get CRNCYSTR */
6320             (void) hv_iterinit(result_hv);
6321             HE * entry = hv_iternext(result_hv);
6322             string = hv_iterval(result_hv, entry);
6323
6324             /* And perform the modification */
6325             sv_insert(string, 0, 0, prefix, 1);
6326         }
6327
6328         /* Here, 'string' contains the value we want to return */
6329         retval = save_to_buffer(SvPV_nolen(string), retbufp, retbuf_sizep);
6330
6331         if (utf8ness) {
6332             is_utf8 = get_locale_string_utf8ness_i(retval,
6333                                                    LOCALE_UTF8NESS_UNKNOWN,
6334                                                    locale,
6335                                                    cat_index);
6336         }
6337
6338         break;
6339
6340        }
6341
6342 #  endif  /* Using localeconv() for something or other */
6343 #  ifndef USE_LOCALE_CTYPE
6344
6345       case CODESET:
6346         retval = C_codeset;
6347         break;
6348
6349 #  else
6350
6351       case CODESET:
6352
6353         /* The trivial case */
6354         if (isNAME_C_OR_POSIX(locale)) {
6355             retval = C_codeset;
6356             break;
6357         }
6358
6359         /* If this happens to match our cached value */
6360         if (PL_in_utf8_CTYPE_locale && strEQ(locale, PL_ctype_name)) {
6361             retval = "UTF-8";
6362             break;
6363         }
6364
6365 #    ifdef WIN32
6366 #      ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES
6367 #        define GET_CODE_PAGE_AS_STRING  nl_langinfo(CODESET)
6368 #      else
6369             /* The Windows function retrieves the code page.  It is subject to
6370              * change, but is documented and has been stable for many releases
6371              * */
6372 #        define GET_CODE_PAGE_AS_STRING                                     \
6373                                 Perl_form(aTHX_ "%d", ___lc_codepage_func())
6374 #      endif
6375
6376         const char * orig_CTYPE_locale;
6377         orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
6378         retval = save_to_buffer(GET_CODE_PAGE_AS_STRING, retbufp, retbuf_sizep);
6379         restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
6380
6381         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "locale='%s' cp=%s\n",
6382                                                locale, retval));
6383         break;
6384
6385 #    else   /* Below is ! Win32 */
6386
6387         /* The codeset is important, but khw did not figure out a way for it to
6388          * be retrieved on non-Windows boxes without nl_langinfo().  But even
6389          * if we can't get it directly, we can usually determine if it is a
6390          * UTF-8 locale or not.  If it is UTF-8, we (correctly) use that for
6391          * the code set. */
6392
6393 #      ifdef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
6394
6395         if (is_locale_utf8(locale)) {
6396             retval = "UTF-8";
6397             break;
6398         }
6399
6400 #      endif
6401
6402         /* Here, the code set has not been found.  The only other option khw
6403          * could think of is to see if the codeset is part of the locale name.
6404          * This is very less than ideal; often there is no code set in the
6405          * name; and at other times they even lie.
6406          *
6407          * But there is an XPG standard syntax, which many locales follow:
6408          *
6409          *    language[_territory[.codeset]][@modifier]
6410          *
6411          * So we take the part between the dot and any '@' */
6412         retval = strchr(locale, '.');
6413         if (! retval) {
6414             retval = "";  /* Alas, no dot */
6415         }
6416         else {
6417
6418             /* Don't include the dot */
6419             retval++;
6420
6421             /* And stop before any '@' */
6422             const char * modifier = strchr(retval, '@');
6423             if (modifier) {
6424                 char * code_set_name;
6425                 const Size_t name_len = modifier - retval;
6426                 Newx(code_set_name, name_len + 1, char);    /* +1 for NUL */
6427                 my_strlcpy(code_set_name, retval, name_len + 1);
6428                 SAVEFREEPV(code_set_name);
6429                 retval = code_set_name;
6430             }
6431
6432             /* The code set name is considered to be everything between the dot
6433              * and the '@' */
6434             retval = save_to_buffer(retval, retbufp, retbuf_sizep);
6435         }
6436
6437 #      ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
6438
6439         /* Here, 'retval' contains any codeset name derived from the locale
6440          * name.  That derived name may be empty or not necessarily indicative
6441          * of the real codeset.  But we can often determine if it should be
6442          * UTF-8, regardless of what the name is.  On most platforms, that
6443          * determination is definitive, and was already done.  But for this
6444          * code to be compiled, this platform is not one of them.  However,
6445          * there are typically tools available to make a very good guess, and
6446          * knowing the derived codeset name improves the quality of that guess.
6447          * The following function overrides the derived codeset name when it
6448          * guesses that it actually should be UTF-8.  It could be inlined here,
6449          * but was moved out of this switch() so as to make the switch()
6450          * control flow easier to follow */
6451         retval = S_override_codeset_if_utf8_found(aTHX_ retval, locale);
6452
6453 #      endif
6454
6455         break;
6456
6457 #    endif    /* ! WIN32 */
6458 #  endif      /* USE_LOCALE_CTYPE */
6459
6460       default:  /* Anything else that is legal is LC_TIME-related */
6461        {
6462
6463         const char * format = NULL;
6464         retval = NULL;
6465
6466 #  ifdef HAS_STRFTIME
6467
6468         bool return_format = FALSE;
6469
6470         /* Without strftime(), default compiled-in values are returned.
6471          * Otherwise, we generally compute a date as explained below.
6472          * Initialize default values for that computation */
6473         int mon = 0;
6474         int mday = 1;
6475         int hour = 6;
6476
6477 #  endif
6478
6479         /* Nested switch for LC_TIME items, plus the default: case is for
6480          * unknown items */
6481         switch (item) {
6482           default:  /* Anything not covered here is something we don't know
6483                        about. */
6484             assert(item < 0);   /* Make sure using perl_langinfo.h */
6485            Perl_croak_nocontext("panic: Unexpected nl_langinfo() item %d", item);
6486             NOT_REACHED; /* NOTREACHED */
6487             break;
6488
6489             /* The case: statments in this switch are all for LC_TIME related
6490              * values.  There are four types of values returned.  One type is
6491              * "Give me the name in this locale of the 3rd month of the year"
6492              * (March in an English locale).  The second main type is "Give me
6493              * the best format string understood by strftime(), like '%c', for
6494              * formatting the date and time in this locale."  The other two
6495              * types are for ERA and ALT_DIGITS, and are explained at the case
6496              * statements for them.
6497              *
6498              * For the first type, suppose we want to find the name of the 3rd
6499              * month of the year.  We pass a date/time to strftime() that is
6500              * known to evaluate to sometime in March, along with a format that
6501              * tells strftime() to return the month's name.  We then return
6502              * that to our caller.  Similarly for the names of the days of the
6503              * week, like "Tuesday".  There are also abbreviated versions for
6504              * each of these.
6505              *
6506              * To implement the second type (returning to the caller a string
6507              * containing a format suitable for passing to strftime() ) we
6508              * guess a format, pass that to strftime, and examine its return to
6509              * see if that format is known on this platform.  If so, we return
6510              * that guess.  Otherwise we return the empty string "".  There are
6511              * no second guesses, as there don't seem to be alternatives
6512              * lurking out there.  For some formats that are supposed to be
6513              * known to all strftime()s since C89, we just assume that they are
6514              * valid, not bothering to check.  The guesses may not be the best
6515              * available for this locale on this platform, but should be good
6516              * enough, so that a native speaker would find them understandable.
6517              * */
6518
6519             /* Unimplemented by perl; for use with strftime() %E modifier */
6520           case ERA: retval = ""; break;
6521
6522 #  if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
6523
6524           case AM_STR: retval = "AM"; break;
6525           case PM_STR: retval = "PM"; break;
6526 #  else
6527           case PM_STR: hour = 18;
6528           case AM_STR:
6529             format = "%p";
6530             break;
6531 #  endif
6532 #  if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
6533
6534           case ABDAY_1: retval = "Sun"; break;
6535           case ABDAY_2: retval = "Mon"; break;
6536           case ABDAY_3: retval = "Tue"; break;
6537           case ABDAY_4: retval = "Wed"; break;
6538           case ABDAY_5: retval = "Thu"; break;
6539           case ABDAY_6: retval = "Fri"; break;
6540           case ABDAY_7: retval = "Sat"; break;
6541 #  else
6542           case ABDAY_7: mday++;
6543           case ABDAY_6: mday++;
6544           case ABDAY_5: mday++;
6545           case ABDAY_4: mday++;
6546           case ABDAY_3: mday++;
6547           case ABDAY_2: mday++;
6548           case ABDAY_1:
6549             format = "%a";
6550             break;
6551 #  endif
6552 #  if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
6553
6554           case DAY_1: retval = "Sunday";    break;
6555           case DAY_2: retval = "Monday";    break;
6556           case DAY_3: retval = "Tuesday";   break;
6557           case DAY_4: retval = "Wednesday"; break;
6558           case DAY_5: retval = "Thursday";  break;
6559           case DAY_6: retval = "Friday";    break;
6560           case DAY_7: retval = "Saturday";  break;
6561 #  else
6562           case DAY_7: mday++;
6563           case DAY_6: mday++;
6564           case DAY_5: mday++;
6565           case DAY_4: mday++;
6566           case DAY_3: mday++;
6567           case DAY_2: mday++;
6568           case DAY_1:
6569             format = "%A";
6570             break;
6571 #  endif
6572 #  if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
6573           case ABMON_1:  retval = "Jan"; break;
6574           case ABMON_2:  retval = "Feb"; break;
6575           case ABMON_3:  retval = "Mar"; break;
6576           case ABMON_4:  retval = "Apr"; break;
6577           case ABMON_5:  retval = "May"; break;
6578           case ABMON_6:  retval = "Jun"; break;
6579           case ABMON_7:  retval = "Jul"; break;
6580           case ABMON_8:  retval = "Aug"; break;
6581           case ABMON_9:  retval = "Sep"; break;
6582           case ABMON_10: retval = "Oct"; break;
6583           case ABMON_11: retval = "Nov"; break;
6584           case ABMON_12: retval = "Dec"; break;
6585 #  else
6586           case ABMON_12: mon++;
6587           case ABMON_11: mon++;
6588           case ABMON_10: mon++;
6589           case ABMON_9:  mon++;
6590           case ABMON_8:  mon++;
6591           case ABMON_7:  mon++;
6592           case ABMON_6:  mon++;
6593           case ABMON_5:  mon++;
6594           case ABMON_4:  mon++;
6595           case ABMON_3:  mon++;
6596           case ABMON_2:  mon++;
6597           case ABMON_1:
6598             format = "%b";
6599             break;
6600 #  endif
6601 #  if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
6602
6603           case MON_1:  retval = "January";  break;
6604           case MON_2:  retval = "February"; break;
6605           case MON_3:  retval = "March";    break;
6606           case MON_4:  retval = "April";    break;
6607           case MON_5:  retval = "May";      break;
6608           case MON_6:  retval = "June";     break;
6609           case MON_7:  retval = "July";     break;
6610           case MON_8:  retval = "August";   break;
6611           case MON_9:  retval = "September";break;
6612           case MON_10: retval = "October";  break;
6613           case MON_11: retval = "November"; break;
6614           case MON_12: retval = "December"; break;
6615 #  else
6616           case MON_12: mon++;
6617           case MON_11: mon++;
6618           case MON_10: mon++;
6619           case MON_9:  mon++;
6620           case MON_8:  mon++;
6621           case MON_7:  mon++;
6622           case MON_6:  mon++;
6623           case MON_5:  mon++;
6624           case MON_4:  mon++;
6625           case MON_3:  mon++;
6626           case MON_2:  mon++;
6627           case MON_1:
6628             format = "%B";
6629             break;
6630 #  endif
6631 #  ifdef HAS_STRFTIME
6632           case T_FMT_AMPM:
6633             format = "%r";
6634             return_format = TRUE;
6635             break;
6636           case ERA_D_FMT:
6637             format = "%Ex";
6638             return_format = TRUE;
6639             break;
6640           case ERA_T_FMT:
6641             format = "%EX";
6642             return_format = TRUE;
6643             break;
6644           case ERA_D_T_FMT:
6645             format = "%Ec";
6646             return_format = TRUE;
6647             break;
6648           case ALT_DIGITS:
6649             format = "%Ow"; /* Find the alternate digit for 0 */
6650             break;
6651 #  endif
6652
6653         } /* End of inner switch() */
6654
6655         /* The inner switch() above has set 'retval' iff that is the final
6656          * answer */
6657         if (retval) {
6658             break;
6659         }
6660
6661         /* And it hasn't set 'format' iff it can't figure out a good value on
6662          * this platform. */
6663         if (! format) {
6664             retval = "";
6665             break;
6666         }
6667
6668 #  ifdef HAS_STRFTIME
6669
6670         /* Here we have figured out what to call strftime() with */
6671
6672         struct tm  mytm;
6673         const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
6674
6675         /* The year was deliberately chosen so that January 1 is on the
6676         * first day of the week.  Since we're only getting one thing at a
6677         * time, it all works */
6678         ints_to_tm(&mytm, 30, 30, hour, mday, mon, 2011, 0, 0, 0);
6679         char * temp;
6680         if (utf8ness) {
6681             temp = strftime8(format,
6682                              &mytm,
6683                              UTF8NESS_IMMATERIAL, /* All possible formats
6684                                                      specified above are
6685                                                      entirely ASCII */
6686                              &is_utf8,
6687                              false      /* not calling from sv_strftime */
6688                             );
6689         }
6690         else {
6691             temp = strftime_tm(format, &mytm);
6692         }
6693
6694         restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
6695
6696         retval = save_to_buffer(temp, retbufp, retbuf_sizep);
6697         Safefree(temp);
6698
6699         /* If the item is 'ALT_DIGITS', '*retbuf' contains the alternate
6700         * format for wday 0.  If the value is the same as the normal 0,
6701         * there isn't an alternate, so clear the buffer.
6702         *
6703         * (wday was chosen because its range is all a single digit.
6704         * Things like tm_sec have two digits as the minimum: '00'.) */
6705         if (item == ALT_DIGITS && strEQ(*retbufp, "0")) {
6706             retval = "";
6707             break;
6708         }
6709
6710         /* ALT_DIGITS is problematic.  Experiments on it showed that
6711         * strftime() did not always work properly when going from alt-9 to
6712         * alt-10.  Only a few locales have this item defined, and in all
6713         * of them on Linux that khw was able to find, nl_langinfo() merely
6714         * returned the alt-0 character, possibly doubled.  Most Unicode
6715         * digits are in blocks of 10 consecutive code points, so that is
6716         * sufficient information for such scripts, as we can infer alt-1,
6717         * alt-2, ....  But for a Japanese locale, a CJK ideographic 0 is
6718         * returned, and the CJK digits are not in code point order, so you
6719         * can't really infer anything.  The localedef for this locale did
6720         * specify the succeeding digits, so that strftime() works properly
6721         * on them, without needing to infer anything.  But the
6722         * nl_langinfo() return did not give sufficient information for the
6723         * caller to understand what's going on.  So until there is
6724         * evidence that it should work differently, this returns the alt-0
6725         * string for ALT_DIGITS. */
6726
6727         if (return_format) {
6728             /* Here are to return the format, not the value.  This is used when
6729              * we are testing if the format we expect to return is legal on
6730              * this platform.  We have passed the format, say "%r, to
6731              * strftime(), and now have in 'retval' what strftime processed it
6732              * to be.  But the caller doesnt't want that; it wants the actual
6733              * %r, if it is understood on this platform, and "" if it isn't.
6734              * Some strftime()s return "" for an unknown format.  (None of the
6735              * formats exposed by langinfo can have "" be a legal result.)
6736              * Other strftime()s return the format unchanged if not understood.
6737              * So if we pass "%r" to strftime(), and it's illegal, we will get
6738              * back either "" or "%r", and we return "" to our caller.  If the
6739              * strftime() return is anything else, we conclude that "%r" is
6740              * understood by the platform, and return "%r". */
6741             if (strEQ(*retbufp, format)) {
6742                 retval = "";
6743             }
6744             else {
6745                 retval = format;
6746             }
6747
6748             /* A format is always in ASCII */
6749             is_utf8 = UTF8NESS_IMMATERIAL;
6750         }
6751
6752 #  endif
6753
6754         break;
6755        }    /* End of braced group for outer switch 'default:' case */
6756     } /* Giant switch() of nl_langinfo() items */
6757
6758     GCC_DIAG_RESTORE_STMT;
6759
6760     if (utf8ness) {
6761         *utf8ness = is_utf8;
6762     }
6763
6764     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6765                          "Leaving emulate_langinfo item=%ld, using locale %s\n",
6766                          (long) item, locale));
6767     return retval;
6768 }
6769
6770 #endif      /* Needs emulate_langinfo() */
6771 #ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
6772
6773 STATIC const char *
6774 S_override_codeset_if_utf8_found(pTHX_ const char * codeset,
6775                                        const char * locale)
6776 {
6777 #  define NAME_INDICATES_UTF8       0x1
6778 #  define MB_CUR_MAX_SUGGESTS_UTF8  0x2
6779
6780     /* Override 'codeset' with UTF-8 if this routine guesses that it should be.
6781      * Conversely (but rarely), "UTF-8" in the locale name might be wrong.  We
6782      * return "" as the code set name if we find that to be the case.  */
6783
6784     unsigned int lean_towards_being_utf8 = 0;
6785     if (is_codeset_name_UTF8(codeset)) {
6786         lean_towards_being_utf8 |= NAME_INDICATES_UTF8;
6787     }
6788
6789     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
6790
6791     /* For this portion of the file to compile, some C99 functions aren't
6792      * available to us, even though we now require C99.  So, something must be
6793      * wrong with them.  The code here should be good enough to work around
6794      * this issue, but should the need arise, comments in S_is_locale_utf8()
6795      * list some alternative C99 functions that could be tried.
6796      *
6797      * But MB_CUR_MAX is a C89 construct that helps a lot, is simple for a
6798      * vendor to implement, and our experience with it is that it works well on
6799      * a variety of platforms.  We have found that it returns a too-large
6800      * number on some platforms for the C locale, but for no others.  That
6801      * locale was already ruled out in the code that called this function.  (If
6802      * MB_CUR_MAX returned too small a number, that would break a lot of
6803      * things, and likely would be quickly corrected by the vendor.)  khw has
6804      * some confidence that it doesn't return >1 when 1 is meant, as that would
6805      * trigger a Perl warning, and we've had no reports of invalid occurrences
6806      * of such. */
6807 #  ifdef MB_CUR_MAX
6808
6809     /* If there are fewer bytes available in this locale than are required to
6810      * represent the largest legal UTF-8 code point, this definitely isn't a
6811      * UTF-8 locale, even if the locale name says it is. */
6812     const int mb_cur_max = MB_CUR_MAX;
6813     if (mb_cur_max < (int) UNISKIP(PERL_UNICODE_MAX)) {
6814         if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
6815             return "";    /* The name is wrong; override */
6816         }
6817
6818         restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
6819         return codeset;
6820     }
6821
6822     /* But if the locale could be UTF-8, and also the name corroborates this,
6823      * assume it is so */
6824     if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
6825         restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
6826         return codeset;
6827     }
6828
6829     /* Here, the name doesn't indicate UTF-8, but MB_CUR_MAX indicates it could
6830      * be.  khw knows of only two other locales in the world, EUC-TW and GB
6831      * 18030, that legitimately require this many bytes (4).  So, if the name
6832      * is one of those, MB_CUR_MAX has corroborated that. */
6833     bool name_implies_non_utf8 = false;
6834     if (foldEQ(codeset, "GB", 2)) {
6835         const char * s = codeset + 2;
6836         if (*s == '-' || *s == '_') {
6837             s++;
6838         }
6839
6840         if strEQ(s, "18030") {
6841             name_implies_non_utf8 = true;
6842         }
6843     }
6844     else if (foldEQ(codeset, "EUC", 3)) {
6845         const char * s = codeset + 3;
6846         if (*s == '-' || *s == '_') {
6847             s++;
6848         }
6849
6850         if (foldEQ(s, "TW", 2)) {
6851             name_implies_non_utf8 = true;
6852         }
6853     }
6854
6855     /* Otherwise, the locale is likely UTF-8 */
6856     if (! name_implies_non_utf8) {
6857         lean_towards_being_utf8 |= MB_CUR_MAX_SUGGESTS_UTF8;
6858     }
6859
6860     /* (In both those two other multibyte locales, the single byte characters
6861      * are the same as ASCII.  No multi-byte character in EUC-TW is legal UTF-8
6862      * (since the first byte of each is a continuation).  GB 18030 has no three
6863      * byte sequences, and none of the four byte ones is legal UTF-8 (as the
6864      * second byte for these is a non-continuation).  But every legal UTF-8 two
6865      * byte sequence is also legal in GB 18030, though none have the same
6866      * meaning, and no Han code point expressed in UTF-8 is two byte.  So the
6867      * further tests below which look for native expressions of currency and
6868      * time will not return two byte sequences, hence they will reliably rule
6869      * out such a locale as being UTF-8, even if the code set name checked
6870      * above isn't correct.) */
6871
6872 #  endif    /* has MB_CUR_MAX */
6873
6874     /* Here, MB_CUR_MAX is not available, or was inconclusive.  What we do is
6875      * to look at various strings associated with the locale:
6876      *  1)  If any are illegal UTF-8, the locale can't be UTF-8.
6877      *  2)  If all are legal UTF-8, and some non-ASCII characters are present,
6878      *      it is likely to be UTF-8, because of the strictness of UTF-8
6879      *      syntax. So assume it is UTF-8
6880      *  3)  If all are ASCII and the locale name and/or MB_CUR_MAX indicate
6881      *      UTF-8, assume the locale is UTF-8.
6882      *  4)  Otherwise, assume the locale isn't UTF-8
6883      *
6884      * To save cycles, if the locale name indicates it is a UTF-8 locale, we
6885      * stop looking at the first instance with legal non-ASCII UTF-8.  It is
6886      * very unlikely this combination is coincidental. */
6887
6888     utf8ness_t strings_utf8ness = UTF8NESS_UNKNOWN;
6889     char * scratch_buf = NULL;
6890     Size_t scratch_buf_size = 0;
6891
6892     /* List of strings to look at */
6893     const int trials[] = {
6894
6895 #  if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
6896
6897         /* The first string tried is the locale currency name.  Often that will
6898          * be in the native script.
6899          *
6900          * But this is usable only if localeconv() is available, as that's the
6901          * way we find out the currency symbol. */
6902
6903         CRNCYSTR,
6904
6905 #  endif
6906 #  ifdef USE_LOCALE_TIME
6907
6908     /* We can also try various strings associated with LC_TIME, like the names
6909      * of months or days of the week */
6910
6911         DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7,
6912         MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8,
6913                                     MON_9, MON_10, MON_11, MON_12,
6914         ALT_DIGITS, AM_STR, PM_STR,
6915         ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6, ABDAY_7,
6916         ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6,
6917         ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12
6918
6919 #  endif
6920
6921     };
6922
6923 #  ifdef USE_LOCALE_TIME
6924
6925     /* The code in the recursive call below can handle switching the locales,
6926      * but by doing it now here, that code will check and discover that there
6927      * is no need to switch then restore, avoiding those each loop iteration */
6928     const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
6929
6930 #  endif
6931
6932     /* The trials array may consist of strings from two different locale
6933      * categories.  The call to my_langinfo_i() below needs to pass the proper
6934      * category for each string.  There is a max of 1 trial for LC_MONETARY;
6935      * the rest are LC_TIME.  So the array is arranged so the LC_MONETARY item
6936      * (if any) is first, and all subsequent iterations will use LC_TIME.
6937      * These #ifdefs set up the values for all possible combinations. */
6938 #  if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
6939
6940     locale_category_index  cat_index = LC_MONETARY_INDEX_;
6941
6942 #    ifdef USE_LOCALE_TIME
6943
6944     const locale_category_index  follow_on_cat_index = LC_TIME_INDEX_;
6945     assert(trials[1] == DAY_1); /* Make sure only a single non-time entry */
6946
6947 #    else
6948
6949     /* Effectively out-of-bounds, as there is only the monetary entry */
6950     const locale_category_index  follow_on_cat_index = LC_ALL_INDEX_;
6951
6952 #    endif
6953 #  elif defined(USE_LOCALE_TIME)
6954
6955     locale_category_index  cat_index = LC_TIME_INDEX_;
6956     const locale_category_index  follow_on_cat_index = LC_TIME_INDEX_;
6957
6958 #  else
6959
6960     /* Effectively out-of-bounds, as here there are no trial entries at all.
6961      * This allows this code to compile, but there are no strings to test, and
6962      * so the answer will always be non-UTF-8. */
6963     locale_category_index  cat_index = LC_ALL_INDEX_;
6964     const locale_category_index  follow_on_cat_index = LC_ALL_INDEX_;
6965
6966 #  endif
6967
6968     /* Everything set up; look through all the strings */
6969     for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(trials); i++) {
6970         (void) my_langinfo_i(trials[i], cat_index, locale,
6971                              &scratch_buf, &scratch_buf_size, NULL);
6972         cat_index = follow_on_cat_index;
6973
6974         /* To prevent infinite recursive calls, we don't ask for the UTF-8ness
6975          * of the string (in 'trials[i]') above.  Instead we examine the
6976          * returned string here */
6977         const Size_t len = strlen(scratch_buf);
6978         const U8 * first_variant;
6979
6980         /* If the string is identical whether or not it is encoded as UTF-8, it
6981          * isn't helpful in determining UTF8ness. */
6982         if (is_utf8_invariant_string_loc((U8 *) scratch_buf, len,
6983                                          &first_variant))
6984         {
6985             continue;
6986         }
6987
6988         /* Here, has non-ASCII.  If not legal UTF-8, isn't a UTF-8 locale */
6989         if (! is_utf8_string(first_variant,
6990                              len - (first_variant - (U8 *) scratch_buf)))
6991         {
6992             strings_utf8ness = UTF8NESS_NO;
6993             break;
6994         }
6995
6996         /* Here, is a legal non-ASCII UTF-8 string; tentatively set the return
6997          * to YES; possibly overridden by later iterations */
6998         strings_utf8ness = UTF8NESS_YES;
6999
7000         /* But if this corroborates our expectation, quit now */
7001         if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
7002             break;
7003         }
7004     }
7005
7006 #  ifdef USE_LOCALE_TIME
7007
7008     restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
7009
7010 #  endif
7011
7012     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
7013
7014     Safefree(scratch_buf);
7015     scratch_buf = NULL;
7016
7017     if (strings_utf8ness == UTF8NESS_NO) {
7018         return codeset;     /* No override */
7019     }
7020
7021     /* Here all tested strings are legal UTF-8.
7022      *
7023      * Above we set UTF8NESS_YES if any string wasn't ASCII.  But even if they
7024      * are all ascii, and the locale name indicates it is a UTF-8 locale,
7025      * assume the locale is UTF-8. */
7026     if (lean_towards_being_utf8) {
7027         strings_utf8ness = UTF8NESS_YES;
7028     }
7029
7030     if (strings_utf8ness == UTF8NESS_YES) {
7031         return "UTF-8";
7032     }
7033
7034     /* Here, nothing examined indicates that the codeset is or isn't UTF-8.
7035      * But what is it?  The other locale categories are not likely to be of
7036      * further help:
7037      *
7038      * LC_NUMERIC   Only a few locales in the world have a non-ASCII radix or
7039      *              group separator.
7040      * LC_CTYPE     This code wouldn't be compiled if mbtowc() existed and was
7041      *              reliable.  This is unlikely in C99.  There are other
7042      *              functions that could be used instead, but are they going to
7043      *              exist, and be able to distinguish between UTF-8 and 8859-1?
7044      *              Deal with this only if it becomes necessary.
7045      * LC_MESSAGES  The strings returned from strerror() would seem likely
7046      *              candidates, but experience has shown that many systems
7047      *              don't actually have translations installed for them.  They
7048      *              are instead always in English, so everything in them is
7049      *              ASCII, which is of no help to us.  A Configure probe could
7050      *              possibly be written to see if this platform has non-ASCII
7051      *              error messages.  But again, wait until it turns out to be
7052      *              an actual problem.
7053      *
7054      *              Things like YESSTR, NOSTR, might not be in ASCII, but need
7055      *              nl_langinfo() to access, which we don't have.
7056      */
7057
7058     /* Otherwise, assume the locale isn't UTF-8.  This can be wrong if we don't
7059      * have MB_CUR_MAX, and the locale is English without UTF-8 in its name,
7060      * and with a dollar currency symbol. */
7061     return codeset;     /* No override */
7062 }
7063
7064 #  endif /* ! HAS_DEFINITIVE_UTF8NESS_DETERMINATION */
7065
7066 /*
7067 =for apidoc_section $time
7068 =for apidoc      sv_strftime_tm
7069 =for apidoc_item sv_strftime_ints
7070 =for apidoc_item my_strftime
7071
7072 These implement the libc strftime(), but with a different API so that the return
7073 value is a pointer to the formatted result (which MUST be arranged to be FREED
7074 BY THE CALLER).  This allows these functions to increase the buffer size as
7075 needed, so that the caller doesn't have to worry about that.
7076
7077 On failure, they return NULL, and set C<errno> to C<EINVAL>.
7078
7079 C<sv_strftime_tm> and C<sv_strftime_ints> are preferred, as they transparently
7080 handle the UTF-8ness of the current locale, the input C<fmt>, and the returned
7081 result.  Only if the current C<LC_TIME> locale is a UTF-8 one (and S<C<use
7082 bytes>> is not in effect) will the result be marked as UTF-8.  These differ
7083 only in the form of their inputs.  C<sv_strftime_tm> takes a filled-in
7084 S<C<struct tm>> parameter.  C<sv_strftime_ints> takes a bunch of integer
7085 parameters that together completely define a given time.
7086
7087 C<my_strftime> is kept for backwards compatibility.  Knowing if its result
7088 should be considered UTF-8 or not requires significant extra logic.
7089
7090 Note that C<yday> and C<wday> effectively are ignored by C<sv_strftime_ints>
7091 and C<my_strftime>, as mini_mktime() overwrites them
7092
7093 Also note that all three functions are always executed in the underlying
7094 C<LC_TIME> locale of the program, giving results based on that locale.
7095
7096 =cut
7097  */
7098
7099 char *
7100 Perl_my_strftime(pTHX_ const char *fmt, int sec, int min, int hour,
7101                        int mday, int mon, int year, int wday, int yday,
7102                        int isdst)
7103 {   /* Documented above */
7104     PERL_ARGS_ASSERT_MY_STRFTIME;
7105
7106     struct tm  mytm;
7107     ints_to_tm(&mytm, sec, min, hour, mday, mon, year, wday, yday, isdst);
7108     char * ret = strftime_tm(fmt, &mytm);
7109     return ret;
7110 }
7111
7112 SV *
7113 Perl_sv_strftime_tm(pTHX_ SV * fmt, const struct tm * mytm)
7114 {   /* Documented above */
7115     PERL_ARGS_ASSERT_SV_STRFTIME_TM;
7116
7117     utf8ness_t fmt_utf8ness = (SvUTF8(fmt) && LIKELY(! IN_BYTES))
7118                               ? UTF8NESS_YES
7119                               : UTF8NESS_UNKNOWN;
7120
7121     utf8ness_t result_utf8ness;
7122     char * retval = strftime8(SvPV_nolen(fmt),
7123                               mytm,
7124                               fmt_utf8ness,
7125                               &result_utf8ness,
7126                               true  /* calling from sv_strftime */
7127                              );
7128     SV * sv = NULL;
7129     if (retval) {
7130         sv = newSV_type(SVt_PV);
7131         sv_usepvn_flags(sv, retval, strlen(retval), SV_HAS_TRAILING_NUL);
7132
7133         if (result_utf8ness == UTF8NESS_YES) {
7134             SvUTF8_on(sv);
7135         }
7136     }
7137
7138     return sv;
7139 }
7140
7141 SV *
7142 Perl_sv_strftime_ints(pTHX_ SV * fmt, int sec, int min, int hour,
7143                             int mday, int mon, int year, int wday,
7144                             int yday, int isdst)
7145 {   /* Documented above */
7146     PERL_ARGS_ASSERT_SV_STRFTIME_INTS;
7147
7148     struct tm  mytm;
7149     ints_to_tm(&mytm, sec, min, hour, mday, mon, year, wday, yday, isdst);
7150     SV * ret = sv_strftime_tm(fmt, &mytm);
7151     return ret;
7152 }
7153
7154 STATIC void
7155 S_ints_to_tm(pTHX_ struct tm * mytm,
7156                    int sec, int min, int hour, int mday, int mon, int year,
7157                    int wday, int yday, int isdst)
7158 {
7159     /* Create a struct tm structure from the input time-related integer
7160      * variables */
7161
7162     /* Override with the passed-in values */
7163     Zero(mytm, 1, struct tm);
7164     mytm->tm_sec = sec;
7165     mytm->tm_min = min;
7166     mytm->tm_hour = hour;
7167     mytm->tm_mday = mday;
7168     mytm->tm_mon = mon;
7169     mytm->tm_year = year;
7170     mytm->tm_wday = wday;
7171     mytm->tm_yday = yday;
7172     mytm->tm_isdst = isdst;
7173     mini_mktime(mytm);
7174
7175     /* use libc to get the values for tm_gmtoff and tm_zone on platforms that
7176      * have them [perl #18238] */
7177 #if  defined(HAS_MKTIME)                                      \
7178  && (defined(HAS_TM_TM_GMTOFF) || defined(HAS_TM_TM_ZONE))
7179     struct tm mytm2 = *mytm;
7180     MKTIME_LOCK;
7181     mktime(&mytm2);
7182     MKTIME_UNLOCK;
7183 #  ifdef HAS_TM_TM_GMTOFF
7184     mytm->tm_gmtoff = mytm2.tm_gmtoff;
7185 #  endif
7186 #  ifdef HAS_TM_TM_ZONE
7187     mytm->tm_zone = mytm2.tm_zone;
7188 #  endif
7189 #endif
7190
7191     return;
7192 }
7193
7194 STATIC char *
7195 S_strftime_tm(pTHX_ const char *fmt, const struct tm *mytm)
7196 {
7197     PERL_ARGS_ASSERT_STRFTIME_TM;
7198
7199     /* Execute strftime() based on the input struct tm */
7200
7201     /* An empty format yields an empty result */
7202     const int fmtlen = strlen(fmt);
7203     if (fmtlen == 0) {
7204         char *ret;
7205         Newxz (ret, 1, char);
7206         return ret;
7207     }
7208
7209 #ifndef HAS_STRFTIME
7210     Perl_croak(aTHX_ "panic: no strftime");
7211 #else
7212 #  if defined(WE_MUST_DEAL_WITH_MISMATCHED_CTYPE) && defined(USE_LOCALE_TIME)
7213
7214     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE,
7215                                                      querylocale_c(LC_TIME));
7216 #  endif
7217
7218     /* Guess an initial size for the returned string based on an expansion
7219      * factor of the input format, but with a minimum that should handle most
7220      * common cases.  If this guess is too small, we will try again with a
7221      * larger one */
7222     int bufsize = MAX(fmtlen * 2, 64);
7223
7224     char *buf = NULL;   /* Makes Renew() act as Newx() on the first iteration */
7225     do {
7226         Renew(buf, bufsize, char);
7227
7228         /* allowing user-supplied (rather than literal) formats is normally
7229          * frowned upon as a potential security risk; but this is part of the
7230          * API so we have to allow it (and the available formats have a much
7231          * lower chance of doing something bad than the ones for printf etc. */
7232         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
7233
7234 #ifdef WIN32    /* Windows will tell you if the input is invalid */
7235
7236         /* Needed because the LOCK might (or might not) save/restore errno */
7237         bool strftime_failed = false;
7238
7239         STRFTIME_LOCK;
7240         dSAVE_ERRNO;
7241         errno = 0;
7242
7243         int len = strftime(buf, bufsize, fmt, mytm);
7244         if (errno == EINVAL) {
7245             strftime_failed = true;
7246         }
7247
7248         RESTORE_ERRNO;
7249         STRFTIME_UNLOCK;
7250
7251         if (strftime_failed) {
7252             goto strftime_failed;
7253         }
7254
7255 #else
7256         STRFTIME_LOCK;
7257         int len = strftime(buf, bufsize, fmt, mytm);
7258         STRFTIME_UNLOCK;
7259 #endif
7260
7261         GCC_DIAG_RESTORE_STMT;
7262
7263         /* A non-zero return indicates success.  But to make sure we're not
7264          * dealing with some rogue strftime that returns how much space it
7265          * needs instead of 0 when there isn't enough, check that the return
7266          * indicates we have at least one byte of spare space (which will be
7267          * used for the terminating NUL). */
7268         if (inRANGE(len, 1, bufsize - 1)) {
7269             goto strftime_return;
7270         }
7271
7272         /* There are several possible reasons for a 0 return code for a
7273          * non-empty format, and they are not trivial to tease apart.  This
7274          * issue is a known bug in the strftime() API.  What we do to cope is
7275          * to assume that the reason is not enough space in the buffer, so
7276          * increase it and try again. */
7277         bufsize *= 2;
7278
7279         /* But don't just keep increasing the size indefinitely.  Stop when it
7280          * becomes obvious that the reason for failure is something besides not
7281          * enough space.  The most likely largest expanding format is %c.  On
7282          * khw's Linux box, the maximum result of this is 67 characters, in the
7283          * km_KH locale.  If a new script comes along that uses 4 UTF-8 bytes
7284          * per character, and with a similar expansion factor, that would be a
7285          * 268:2 byte ratio, or a bit more than 128:1 = 2**7:1.  Some strftime
7286          * implementations allow you to say %1000c to pad to 1000 bytes.  This
7287          * shows that it is impossible to implement this without a heuristic
7288          * (which can fail).  But it indicates we need to be generous in the
7289          * upper limit before failing.  The previous heuristic used was too
7290          * stingy.  Since the size doubles per iteration, it doesn't take many
7291          * to reach the limit */
7292     } while (bufsize < ((1 << 11) + 1) * fmtlen);
7293
7294     /* Here, strftime() returned 0, and it likely wasn't for lack of space.
7295      * There are two possible reasons:
7296      *
7297      * First is that the result is legitimately 0 length.  This can happen
7298      * when the format is precisely "%p".  That is the only documented format
7299      * that can have an empty result. */
7300     if (strEQ(fmt, "%p")) {
7301         Renew(buf, 1, char);
7302         *buf = '\0';
7303         goto strftime_return;
7304     }
7305
7306     /* The other reason is that the format string is malformed.  Probably it is
7307      * that the string is syntactically invalid for the locale.  On some
7308      * platforms an invalid conversion specifier '%?' (for all illegal '?') is
7309      * treated as a literal, but others may fail when '?' is illegal */
7310
7311 #  ifdef WIN32
7312   strftime_failed:
7313 #  endif
7314
7315     SET_EINVAL;
7316
7317     Safefree(buf);
7318     buf = NULL;
7319
7320   strftime_return:
7321
7322 #  if defined(WE_MUST_DEAL_WITH_MISMATCHED_CTYPE) && defined(USE_LOCALE_TIME)
7323
7324     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
7325
7326 #  endif
7327
7328     return buf;
7329
7330 #endif
7331
7332 }
7333
7334 STATIC char *
7335 S_strftime8(pTHX_ const char * fmt,
7336                   const struct tm * mytm,
7337                   const utf8ness_t fmt_utf8ness,
7338                   utf8ness_t * result_utf8ness,
7339                   const bool came_from_sv)
7340 {
7341     PERL_ARGS_ASSERT_STRFTIME8;
7342
7343     /* Wrap strftime_tm, taking into account the input and output UTF-8ness */
7344
7345 #ifdef USE_LOCALE_TIME
7346 #  define INDEX_TO_USE  LC_TIME_INDEX_
7347
7348     const char * locale = querylocale_c(LC_TIME);
7349     locale_utf8ness_t locale_utf8ness = LOCALE_UTF8NESS_UNKNOWN;
7350
7351 #else
7352 #  define INDEX_TO_USE  LC_ALL_INDEX_   /* Effectively out of bounds */
7353
7354     const char * locale = "C";
7355     locale_utf8ness_t locale_utf8ness = LOCALE_NOT_UTF8;
7356
7357 #endif
7358
7359     switch (fmt_utf8ness) {
7360       case UTF8NESS_IMMATERIAL:
7361         break;
7362
7363       case UTF8NESS_NO: /* Known not to be UTF-8; must not be UTF-8 locale */
7364         if (is_locale_utf8(locale)) {
7365             SET_EINVAL;
7366             return NULL;
7367         }
7368
7369         locale_utf8ness = LOCALE_NOT_UTF8;
7370         break;
7371
7372       case UTF8NESS_YES:    /* Known to be UTF-8; must be UTF-8 locale if can't
7373                                downgrade. */
7374         if (! is_locale_utf8(locale)) {
7375             locale_utf8ness = LOCALE_NOT_UTF8;
7376
7377             bool is_utf8 = true;
7378             Size_t fmt_len = strlen(fmt);
7379             fmt = (char *) bytes_from_utf8((U8 *) fmt, &fmt_len, &is_utf8);
7380             if (is_utf8) {
7381                 SET_EINVAL;
7382                 return NULL;
7383             }
7384
7385             SAVEFREEPV(fmt);
7386         }
7387         else {
7388             locale_utf8ness = LOCALE_IS_UTF8;
7389         }
7390
7391         break;
7392
7393       case UTF8NESS_UNKNOWN:
7394         if (! is_locale_utf8(locale)) {
7395             locale_utf8ness = LOCALE_NOT_UTF8;
7396         }
7397         else {
7398             locale_utf8ness = LOCALE_IS_UTF8;
7399             if (came_from_sv) {
7400
7401                 /* Upgrade 'fmt' to UTF-8 for a UTF-8 locale.  Otherwise the
7402                  * locale would find any UTF-8 variant characters to be
7403                  * malformed */
7404                 Size_t fmt_len = strlen(fmt);
7405                 fmt = (char *) bytes_to_utf8((U8 *) fmt, &fmt_len);
7406                 SAVEFREEPV(fmt);
7407             }
7408         }
7409
7410         break;
7411     }
7412
7413     char * retval = strftime_tm(fmt, mytm);
7414     *result_utf8ness = get_locale_string_utf8ness_i(retval,
7415                                                     locale_utf8ness,
7416                                                     locale,
7417                                                     INDEX_TO_USE);
7418     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
7419                           "fmt=%s, retval=%s; utf8ness=%d",
7420                           fmt,
7421                           ((is_utf8_string((U8 *) retval, 0))
7422                            ? retval
7423                            :_byte_dump_string((U8 *) retval, strlen(retval),0)),
7424                           *result_utf8ness));
7425     return retval;
7426
7427 #undef INDEX_TO_USE
7428
7429 }
7430
7431 #ifdef USE_LOCALE
7432
7433 STATIC void
7434 S_give_perl_locale_control(pTHX_
7435 #  ifdef LC_ALL
7436                            const char * lc_all_string,
7437 #  else
7438                            const char ** locales,
7439 #  endif
7440                            const line_t caller_line)
7441 {
7442     PERL_UNUSED_ARG(caller_line);
7443
7444     /* This is called when the program is in the global locale and are
7445      * switching to per-thread (if available).  And it is called at
7446      * initialization time to do the same.
7447      */
7448
7449 #  if defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE)
7450
7451     /* On Windows, convert to per-thread behavior.  This isn't necessary in
7452      * POSIX 2008, as the conversion gets done automatically in the
7453      * void_setlocale_i() calls below. */
7454     if (_configthreadlocale(_ENABLE_PER_THREAD_LOCALE) == -1) {
7455         locale_panic_("_configthreadlocale returned an error");
7456     }
7457
7458 #  endif
7459 #  if ! defined(USE_THREAD_SAFE_LOCALE)                               \
7460    && ! defined(USE_POSIX_2008_LOCALE)
7461 #    if defined(LC_ALL)
7462     PERL_UNUSED_ARG(lc_all_string);
7463 #    else
7464     PERL_UNUSED_ARG(locales);
7465 #    endif
7466 #  else
7467
7468     /* This platform has per-thread locale handling.  Do the conversion. */
7469
7470 #    if defined(LC_ALL)
7471
7472     void_setlocale_c_with_caller(LC_ALL, lc_all_string, __FILE__, caller_line);
7473
7474 #    else
7475
7476     for_all_individual_category_indexes(i) {
7477         void_setlocale_i_with_caller(i, locales[i], __FILE__, caller_line);
7478     }
7479
7480 #    endif
7481 #  endif
7482
7483     /* Finally, update our remaining records.  'true' => force recalculation.
7484      * This is needed because we don't know what's happened while Perl hasn't
7485      * had control, so we need to figure out the current state */
7486
7487 #  if defined(LC_ALL)
7488
7489     new_LC_ALL(lc_all_string, true);
7490
7491 #    else
7492
7493     new_LC_ALL(calculate_LC_ALL_string(locales,
7494                                        INTERNAL_FORMAT,
7495                                        WANT_TEMP_PV,
7496                                        caller_line),
7497                true);
7498 #    endif
7499
7500 }
7501
7502 STATIC void
7503 S_output_check_environment_warning(pTHX_ const char * const language,
7504                                          const char * const lc_all,
7505                                          const char * const lang)
7506 {
7507     PerlIO_printf(Perl_error_log,
7508                   "perl: warning: Please check that your locale settings:\n");
7509
7510 #  ifdef __GLIBC__
7511
7512     PerlIO_printf(Perl_error_log, "\tLANGUAGE = %c%s%c,\n",
7513                                   language ? '"' : '(',
7514                                   language ? language : "unset",
7515                                   language ? '"' : ')');
7516 #  else
7517     PERL_UNUSED_ARG(language);
7518 #  endif
7519
7520     PerlIO_printf(Perl_error_log, "\tLC_ALL = %c%s%c,\n",
7521                                   lc_all ? '"' : '(',
7522                                   lc_all ? lc_all : "unset",
7523                                   lc_all ? '"' : ')');
7524
7525     for_all_individual_category_indexes(i) {
7526         const char * value = PerlEnv_getenv(category_names[i]);
7527         PerlIO_printf(Perl_error_log,
7528                       "\t%s = %c%s%c,\n",
7529                       category_names[i],
7530                       value ? '"' : '(',
7531                       value ? value : "unset",
7532                       value ? '"' : ')');
7533     }
7534
7535     PerlIO_printf(Perl_error_log, "\tLANG = %c%s%c\n",
7536                                   lang ? '"' : '(',
7537                                   lang ? lang : "unset",
7538                                   lang ? '"' : ')');
7539     PerlIO_printf(Perl_error_log,
7540                   "    are supported and installed on your system.\n");
7541 }
7542
7543 #endif
7544
7545 /* A helper macro for the next function.  Needed because would be called in two
7546  * places.  Knows about the internal workings of the function */
7547 #define GET_DESCRIPTION(trial, name)                                    \
7548     ((isNAME_C_OR_POSIX(name))                                          \
7549      ? "the standard locale"                                            \
7550      : ((trial == (system_default_trial)                                \
7551                   ? "the system default locale"                         \
7552                   : "a fallback locale")))
7553
7554 /*
7555  * Initialize locale awareness.
7556  */
7557 int
7558 Perl_init_i18nl10n(pTHX_ int printwarn)
7559 {
7560     /* printwarn is:
7561      *    0 if not to output warning when setup locale is bad
7562      *    1 if to output warning based on value of PERL_BADLANG
7563      *    >1 if to output regardless of PERL_BADLANG
7564      *
7565      * returns
7566      *    1 = set ok or not applicable,
7567      *    0 = fallback to a locale of lower priority
7568      *   -1 = fallback to all locales failed, not even to the C locale
7569      *
7570      * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
7571      * set, debugging information is output.
7572      *
7573      * This routine effectively does the following in most cases:
7574      *
7575      *      basic initialization;
7576      *      asserts that the compiled tables are consistent;
7577      *      initialize data structures;
7578      *      make sure we are in the global locale;
7579      *      setlocale(LC_ALL, "");
7580      *      switch to per-thread locale if applicable;
7581      *
7582      * The "" causes the locale to be set to what the environment variables at
7583      * the time say it should be.
7584      *
7585      * To handle possible failures, the setlocale is expanded to be like:
7586      *
7587      *      trial_locale = pre-first-trial;
7588      *      while (has_another_trial()) {
7589      *          trial_locale = next_trial();
7590      *          if setlocale(LC_ALL, trial_locale) {
7591      *              ok = true;
7592      *              break;
7593      *          }
7594      *
7595      *          had_failure = true;
7596      *          warn();
7597      *      }
7598      *
7599      *      if (had_failure) {
7600      *          warn_even_more();
7601      *          if (! ok) warn_still_more();
7602      *      }
7603      *
7604      * The first trial is either:
7605      *      ""      to examine the environment variables for the locale
7606      *      NULL    to use the values already set for the locale by the program
7607      *              embedding this perl instantiation.
7608      *
7609      * Something is wrong if this trial fails, but there is a sequence of
7610      * fallbacks to try should that happen.  They are given in the enum below.
7611
7612      * If there is no LC_ALL defined on the system, the setlocale() above is
7613      * replaced by a loop setting each individual category separately.
7614      *
7615      * In a non-embeded environment, this code is executed exactly once.  It
7616      * sets up the global locale environment.  At the end, if some sort of
7617      * thread-safety is in effect, it will turn thread 0 into using that, with
7618      * the same locale as the global initially.  thread 0 can then change its
7619      * locale at will without affecting the global one.
7620      *
7621      * At destruction time, thread 0 will revert to the global locale as the
7622      * other threads die.
7623      *
7624      * Care must be taken in an embedded environment.  This code will be
7625      * executed for each instantiation.  Since it changes the global locale, it
7626      * could clash with another running instantiation that isn't using
7627      * per-thread locales.  perlembed suggests having the controlling program
7628      * set each instantiation's locale and set PERL_SKIP_LOCALE_INIT so this
7629      * code uses that without actually changing anything.  Then the onus is on
7630      * the controlling program to prevent any races.  The code below does
7631      * enough locking so as to prevent system calls from overwriting data
7632      * before it is safely copied here, but that isn't a general solution.
7633      */
7634
7635 #ifndef USE_LOCALE
7636
7637     PERL_UNUSED_ARG(printwarn);
7638     const int ok = 1;
7639
7640 #else  /* USE_LOCALE to near the end of the routine */
7641
7642     int ok = 0;
7643
7644 #  ifdef __GLIBC__
7645
7646     const char * const language = PerlEnv_getenv("LANGUAGE");
7647
7648 #  else
7649     const char * const language = NULL;     /* Unused placeholder */
7650 #  endif
7651
7652     /* A later getenv() could zap this, so only use here */
7653     const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
7654
7655     const bool locwarn = (printwarn > 1
7656                           || (          printwarn
7657                               && (    ! bad_lang_use_once
7658                                   || (
7659                                          /* disallow with "" or "0" */
7660                                          *bad_lang_use_once
7661                                        && strNE("0", bad_lang_use_once)))));
7662
7663 #  ifndef DEBUGGING
7664 #    define DEBUG_LOCALE_INIT(a,b,c)
7665 #  else
7666
7667     DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
7668
7669 #    define DEBUG_LOCALE_INIT(cat_index, locale, result)                    \
7670         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",                       \
7671                     setlocale_debug_string_i(cat_index, locale, result)));
7672
7673 #    ifdef LC_ALL
7674     assert(categories[LC_ALL_INDEX_] == LC_ALL);
7675     assert(strEQ(category_names[LC_ALL_INDEX_], "LC_ALL"));
7676 #      ifdef USE_POSIX_2008_LOCALE
7677     assert(category_masks[LC_ALL_INDEX_] == LC_ALL_MASK);
7678 #      endif
7679 #    endif
7680
7681     for_all_individual_category_indexes(i) {
7682         assert(category_name_lengths[i] == strlen(category_names[i]));
7683     }
7684
7685 #  endif    /* DEBUGGING */
7686
7687     /* Initialize the per-thread mbrFOO() state variables.  See POSIX.xs for
7688      * why these particular incantations are used. */
7689 #  ifdef HAS_MBRLEN
7690     memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
7691 #  endif
7692 #  ifdef HAS_MBRTOWC
7693     memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
7694 #  endif
7695 #  ifdef HAS_WCTOMBR
7696     wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
7697 #  endif
7698 #  ifdef USE_PL_CURLOCALES
7699
7700     for (unsigned int i = 0; i <= LC_ALL_INDEX_; i++) {
7701         PL_curlocales[i] = savepv("C");
7702     }
7703
7704 #  endif
7705 #  ifdef USE_PL_CUR_LC_ALL
7706
7707     PL_cur_LC_ALL = savepv("C");
7708
7709 #  endif
7710 #  if ! defined(PERL_LC_ALL_USES_NAME_VALUE_PAIRS) && defined(LC_ALL)
7711
7712     LOCALE_LOCK;
7713
7714     /* If we haven't done so already, translate the LC_ALL positions of
7715      * categories into our internal indices. */
7716     if (map_LC_ALL_position_to_index[0] == LC_ALL_INDEX_) {
7717
7718         /* Use this array, initialized by a config.h constant */
7719         int lc_all_category_positions[] = PERL_LC_ALL_CATEGORY_POSITIONS_INIT;
7720         STATIC_ASSERT_STMT(   C_ARRAY_LENGTH(lc_all_category_positions)
7721                            == LC_ALL_INDEX_);
7722
7723         for (unsigned int i = 0;
7724              i < C_ARRAY_LENGTH(lc_all_category_positions);
7725              i++)
7726         {
7727             map_LC_ALL_position_to_index[i] =
7728                               get_category_index(lc_all_category_positions[i]);
7729         }
7730     }
7731
7732     LOCALE_UNLOCK;
7733
7734 #  endif
7735 #  ifdef USE_POSIX_2008_LOCALE
7736
7737     /* This is a global, so be sure to keep another instance from zapping it */
7738     LOCALE_LOCK;
7739     if (PL_C_locale_obj) {
7740         LOCALE_UNLOCK;
7741     }
7742     else {
7743         PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0);
7744         if (! PL_C_locale_obj) {
7745             LOCALE_UNLOCK;
7746             locale_panic_(Perl_form(aTHX_
7747                                 "Cannot create POSIX 2008 C locale object"));
7748         }
7749         LOCALE_UNLOCK;
7750
7751         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "created C object %p\n",
7752                                                PL_C_locale_obj));
7753     }
7754
7755     /* Switch to using the POSIX 2008 interface now.  This would happen below
7756      * anyway, but deferring it can lead to leaks of memory that would also get
7757      * malloc'd in the interim.  We arbitrarily switch to the C locale,
7758      * overridden below  */
7759     if (! uselocale(PL_C_locale_obj)) {
7760         locale_panic_(Perl_form(aTHX_
7761                                 "Can't uselocale(%p), LC_ALL supposed to"
7762                                 " be 'C'",
7763                                 PL_C_locale_obj));
7764     }
7765
7766 #    ifdef MULTIPLICITY
7767
7768     PL_cur_locale_obj = PL_C_locale_obj;
7769
7770 #    endif
7771 #  endif
7772
7773     /* Now initialize some data structures.  This is entirely so that
7774      * later-executed code doesn't have to concern itself with things not being
7775      * initialized.  Arbitrarily use the C locale (which we know has to exist
7776      * on the system). */
7777
7778 #  ifdef USE_LOCALE_NUMERIC
7779
7780     PL_numeric_radix_sv    = newSV(1);
7781     PL_underlying_radix_sv = newSV(1);
7782     Newxz(PL_numeric_name, 1, char);    /* Single NUL character */
7783
7784 #  endif
7785 #  ifdef USE_LOCALE_COLLATE
7786
7787     Newxz(PL_collation_name, 1, char);
7788
7789 #  endif
7790 #  ifdef USE_LOCALE_CTYPE
7791
7792     Newxz(PL_ctype_name, 1, char);
7793
7794 #  endif
7795
7796     new_LC_ALL("C", true /* Don't shortcut */);
7797
7798 /*===========================================================================*/
7799
7800     /* Now ready to override the initialization with the values that the user
7801      * wants.  This is done in the global locale as explained in the
7802      * introductory comments to this function */
7803     switch_to_global_locale();
7804
7805     const char * const lc_all     = PerlEnv_getenv("LC_ALL");
7806     const char * const lang       = PerlEnv_getenv("LANG");
7807
7808     /* We try each locale in the enum, in order, until we get one that works,
7809      * or exhaust the list.  Normally the loop is executed just once.
7810      *
7811      * Each enum value is +1 from the previous */
7812     typedef enum {
7813             dummy_trial       = -1,
7814             environment_trial =  0,     /* "" or NULL; code below assumes value
7815                                            0 is the first real trial */
7816             LC_ALL_trial,               /* ENV{LC_ALL} */
7817             LANG_trial,                 /* ENV{LANG} */
7818             system_default_trial,       /* Windows .ACP */
7819             C_trial,                    /* C locale */
7820             beyond_final_trial,
7821     } trials;
7822
7823     trials trial;
7824     unsigned int already_checked = 0;
7825     const char * checked[C_trial];
7826
7827 #  ifdef LC_ALL
7828     const char * lc_all_string;
7829 #  else
7830     const char * curlocales[LC_ALL_INDEX_];
7831 #  endif
7832
7833     /* Loop through the initial setting and all the possible fallbacks,
7834      * breaking out of the loop on success */
7835     trial = dummy_trial;
7836     while (trial != beyond_final_trial) {
7837
7838         /* Each time through compute the next trial to use based on the one in
7839          * the previous iteration and switch to the new one.  This enforces the
7840          * order in which the fallbacks are applied */
7841       next_trial:
7842         trial = (trials) ((int) trial + 1);     /* Casts are needed for g++ */
7843
7844         const char * locale = NULL;
7845
7846         /* Set up the parameters for this trial */
7847         switch (trial) {
7848           case dummy_trial:
7849             locale_panic_("Unexpectedly got 'dummy_trial");
7850             break;
7851
7852           case environment_trial:
7853             /* This is either "" to get the values from the environment, or
7854              * NULL if the calling program has initialized the values already.
7855              * */
7856             locale = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
7857                      ? NULL
7858                      : "";
7859             break;
7860
7861           case LC_ALL_trial:
7862             if (! lc_all || strEQ(lc_all, "")) {
7863                 continue;   /* No-op */
7864             }
7865
7866             locale = lc_all;
7867             break;
7868
7869           case LANG_trial:
7870             if (! lang || strEQ(lang, "")) {
7871                 continue;   /* No-op */
7872             }
7873
7874             locale = lang;
7875             break;
7876
7877           case system_default_trial:
7878
7879 #  if ! defined(WIN32) || ! defined(LC_ALL)
7880
7881             continue;   /* No-op */
7882
7883 #  else
7884             /* For Windows, we also try the system default locale before "C".
7885              * (If there exists a Windows without LC_ALL we skip this because
7886              * it gets too complicated.  For those, "C" is the next fallback
7887              * possibility). */
7888             locale = ".ACP";
7889 #  endif
7890             break;
7891
7892           case C_trial:
7893             locale = "C";
7894             break;
7895
7896           case beyond_final_trial:
7897             continue;     /* No-op, causes loop to exit */
7898         }
7899
7900         /* If the locale is a substantive name, don't try the same locale
7901          * twice. */
7902         if (locale && strNE(locale, "")) {
7903             for (unsigned int i = 0; i < already_checked; i++) {
7904                 if (strEQ(checked[i], locale)) {
7905                     goto next_trial;
7906                 }
7907             }
7908
7909             /* And, for future iterations, indicate we've tried this locale */
7910             assert(already_checked < C_ARRAY_LENGTH(checked));
7911             checked[already_checked] = savepv(locale);
7912             SAVEFREEPV(checked[already_checked]);
7913             already_checked++;
7914         }
7915
7916 #  ifdef LC_ALL
7917
7918         STDIZED_SETLOCALE_LOCK;
7919         lc_all_string = savepv(stdized_setlocale(LC_ALL, locale));
7920         STDIZED_SETLOCALE_UNLOCK;
7921
7922         DEBUG_LOCALE_INIT(LC_ALL_INDEX_, locale, lc_all_string);
7923
7924         if (LIKELY(lc_all_string)) {     /* Succeeded */
7925             ok = 1;
7926             break;
7927         }
7928
7929         if (trial == 0 && locwarn) {
7930             PerlIO_printf(Perl_error_log,
7931                                   "perl: warning: Setting locale failed.\n");
7932             output_check_environment_warning(language, lc_all, lang);
7933         }
7934
7935 #  else /* Below is ! LC_ALL */
7936
7937         bool setlocale_failure = FALSE;  /* This trial hasn't failed so far */
7938         bool dowarn = trial == 0 && locwarn;
7939
7940         for_all_individual_category_indexes(j) {
7941             STDIZED_SETLOCALE_LOCK;
7942             curlocales[j] = savepv(stdized_setlocale(categories[j], locale));
7943             STDIZED_SETLOCALE_UNLOCK;
7944
7945             DEBUG_LOCALE_INIT(j, locale, curlocales[j]);
7946
7947             if (UNLIKELY(! curlocales[j])) {
7948                 setlocale_failure = TRUE;
7949
7950                 /* If are going to warn below, continue to loop so all failures
7951                  * are included in the message */
7952                 if (! dowarn) {
7953                     break;
7954                 }
7955             }
7956         }
7957
7958         if (LIKELY(! setlocale_failure)) {  /* All succeeded */
7959             ok = 1;
7960             break;  /* Exit trial_locales loop */
7961         }
7962
7963         /* Here, this trial failed */
7964
7965         if (dowarn) {
7966             PerlIO_printf(Perl_error_log,
7967                 "perl: warning: Setting locale failed for the categories:\n");
7968
7969             for_all_individual_category_indexes(j) {
7970                 if (! curlocales[j]) {
7971                     PerlIO_printf(Perl_error_log, "\t%s\n", category_names[j]);
7972                 }
7973             }
7974
7975             output_check_environment_warning(language, lc_all, lang);
7976         }   /* end of warning on first failure */
7977
7978 #  endif /* LC_ALL */
7979
7980     }   /* end of looping through the trial locales */
7981
7982     /* If we had to do more than the first trial, it means that one failed, and
7983      * we may need to output a warning, and, if none worked, do more */
7984     if (UNLIKELY(trial != 0)) {
7985         if (locwarn) {
7986             const char * description = "a fallback locale";
7987             const char * name = NULL;;
7988
7989             /* If we didn't find a good fallback, list all we tried */
7990             if (! ok && already_checked > 0) {
7991                 PerlIO_printf(Perl_error_log, "perl: warning: Failed to fall"
7992                                               " back to ");
7993                 if (already_checked > 1) {  /* more than one was tried */
7994                     PerlIO_printf(Perl_error_log, "any of:\n");
7995                 }
7996
7997                 while (already_checked > 0) {
7998                     name = checked[--already_checked];
7999                     description = GET_DESCRIPTION(trial, name);
8000                     PerlIO_printf(Perl_error_log, "%s (\"%s\")\n",
8001                                                   description, name);
8002                 }
8003             }
8004
8005             if (ok) {
8006
8007                 /* Here, a fallback worked.  So we have saved its name, and the
8008                  * trial that succeeded is still valid */
8009 #  ifdef LC_ALL
8010                 const char * individ_locales[LC_ALL_INDEX_] = { NULL };
8011
8012                 /* Even though we know the valid string for LC_ALL that worked,
8013                  * translate it into our internal format, which is the
8014                  * name=value pairs notation.  This is easier for a human to
8015                  * decipher than the positional notation.  Some platforms
8016                  * can return "C C C C C C" for LC_ALL.  This code also
8017                  * standardizes that result into plain "C". */
8018                 switch (parse_LC_ALL_string(lc_all_string,
8019                                             (const char **) &individ_locales,
8020                                             no_override,
8021                                             false,   /* Return only [0] if
8022                                                         suffices */
8023                                             false,   /* Don't panic on error */
8024                                             __LINE__))
8025                 {
8026                   case invalid:
8027
8028                     /* Here, the parse failed, which shouldn't happen, but if
8029                      * it does, we have an easy fallback that allows us to keep
8030                      * going. */
8031                     name = lc_all_string;
8032                     break;
8033
8034                   case no_array:    /* The original is a single locale */
8035                     name = lc_all_string;
8036                     break;
8037
8038                   case only_element_0:  /* element[0] is a single locale valid
8039                                            for all categories */
8040                     SAVEFREEPV(individ_locales[0]);
8041                     name = individ_locales[0];
8042                     break;
8043
8044                   case full_array:
8045                     name = calculate_LC_ALL_string(individ_locales,
8046                                                    INTERNAL_FORMAT,
8047                                                    WANT_TEMP_PV,
8048                                                    __LINE__);
8049                     for_all_individual_category_indexes(j) {
8050                         Safefree(individ_locales[j]);
8051                     }
8052                 }
8053 #  else
8054                 name = calculate_LC_ALL_string(curlocales,
8055                                                INTERNAL_FORMAT,
8056                                                WANT_TEMP_PV,
8057                                                __LINE__);
8058 #  endif
8059                 description = GET_DESCRIPTION(trial, name);
8060             }
8061             else {
8062
8063                 /* Nothing seems to be working, yet we want to continue
8064                  * executing.  It may well be that locales are mostly
8065                  * irrelevant to this particular program, and there must be
8066                  * some locale underlying the program.  Figure it out as best
8067                  * we can, by querying the system's current locale */
8068
8069 #  ifdef LC_ALL
8070
8071                 STDIZED_SETLOCALE_LOCK;
8072                 name = stdized_setlocale(LC_ALL, NULL);
8073                 STDIZED_SETLOCALE_UNLOCK;
8074
8075                 if (UNLIKELY(! name)) {
8076                     name = "locale name not determinable";
8077                 }
8078
8079 #  else /* Below is ! LC_ALL */
8080
8081                 const char * system_locales[LC_ALL_INDEX_] = { NULL };
8082
8083                 for_all_individual_category_indexes(j) {
8084                     STDIZED_SETLOCALE_LOCK;
8085                     system_locales[j] = savepv(stdized_setlocale(categories[j],
8086                                                                  NULL));
8087                     STDIZED_SETLOCALE_UNLOCK;
8088
8089                     if (UNLIKELY(! system_locales[j])) {
8090                         system_locales[j] = "not determinable";
8091                     }
8092                 }
8093
8094                 /* We use the name=value form for the string, as that is more
8095                  * human readable than the positional notation */
8096                 name = calculate_LC_ALL_string(system_locales,
8097                                                INTERNAL_FORMAT,
8098                                                WANT_TEMP_PV,
8099                                                __LINE__);
8100                 description = "what the system says";
8101
8102                 for_all_individual_category_indexes(j) {
8103                     Safefree(system_locales[j]);
8104                 }
8105 #  endif
8106             }
8107
8108             PerlIO_printf(Perl_error_log,
8109                           "perl: warning: Falling back to %s (\"%s\").\n",
8110                           description, name);
8111
8112             /* Here, ok being true indicates that the first attempt failed, but
8113              * a fallback succeeded; false => nothing working.  Translate to
8114              * API return values. */
8115             ok = (ok) ? 0 : -1;
8116         }
8117     }
8118
8119 #  ifdef LC_ALL
8120
8121     give_perl_locale_control(lc_all_string, __LINE__);
8122     Safefree(lc_all_string);
8123
8124 #  else
8125
8126     give_perl_locale_control((const char **) &curlocales, __LINE__);
8127
8128     for_all_individual_category_indexes(j) {
8129         Safefree(curlocales[j]);
8130     }
8131
8132 #  endif
8133 #  if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
8134
8135     /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
8136      * locale is UTF-8.  give_perl_locale_control() just above has already
8137      * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
8138      * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
8139      * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
8140      * STDIN, STDOUT, STDERR, _and_ the default open discipline.  */
8141     PL_utf8locale = PL_in_utf8_CTYPE_locale;
8142
8143     /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
8144        This is an alternative to using the -C command line switch
8145        (the -C if present will override this). */
8146     {
8147          const char *p = PerlEnv_getenv("PERL_UNICODE");
8148          PL_unicode = p ? parse_unicode_opts(&p) : 0;
8149          if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
8150              PL_utf8cache = -1;
8151     }
8152
8153 #  endif
8154 #  if defined(USE_POSIX_2008_LOCALE) && defined(MULTIPLICITY)
8155     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8156                            "finished Perl_init_i18nl10n; actual obj=%p,"
8157                            " expected obj=%p, initial=%s\n",
8158                            uselocale(0), PL_cur_locale_obj,
8159                            get_LC_ALL_display()));
8160 #  endif
8161
8162     /* So won't continue to output stuff */
8163     DEBUG_INITIALIZATION_set(FALSE);
8164
8165 #endif /* USE_LOCALE */
8166
8167     return ok;
8168 }
8169
8170 #undef GET_DESCRIPTION
8171 #ifdef USE_LOCALE_COLLATE
8172
8173 STATIC void
8174 S_compute_collxfrm_coefficients(pTHX)
8175 {
8176
8177     /* A locale collation definition includes primary, secondary, tertiary,
8178      * etc. weights for each character.  To sort, the primary weights are used,
8179      * and only if they compare equal, then the secondary weights are used, and
8180      * only if they compare equal, then the tertiary, etc.
8181      *
8182      * strxfrm() works by taking the input string, say ABC, and creating an
8183      * output transformed string consisting of first the primary weights,
8184      * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the tertiary,
8185      * etc, yielding A¹B¹C¹ A²B²C² A³B³C³ ....  Some characters may not have
8186      * weights at every level.  In our example, let's say B doesn't have a
8187      * tertiary weight, and A doesn't have a secondary weight.  The constructed
8188      * string is then going to be
8189      *  A¹B¹C¹ B²C² A³C³ ....
8190      * This has the desired effect that strcmp() will look at the secondary or
8191      * tertiary weights only if the strings compare equal at all higher
8192      * priority weights.  The spaces shown here, like in
8193      *  "A¹B¹C¹ A²B²C² "
8194      * are not just for readability.  In the general case, these must actually
8195      * be bytes, which we will call here 'separator weights'; and they must be
8196      * smaller than any other weight value, but since these are C strings, only
8197      * the terminating one can be a NUL (some implementations may include a
8198      * non-NUL separator weight just before the NUL).  Implementations tend to
8199      * reserve 01 for the separator weights.  They are needed so that a shorter
8200      * string's secondary weights won't be misconstrued as primary weights of a
8201      * longer string, etc.  By making them smaller than any other weight, the
8202      * shorter string will sort first.  (Actually, if all secondary weights are
8203      * smaller than all primary ones, there is no need for a separator weight
8204      * between those two levels, etc.)
8205      *
8206      * The length of the transformed string is roughly a linear function of the
8207      * input string.  It's not exactly linear because some characters don't
8208      * have weights at all levels.  When we call strxfrm() we have to allocate
8209      * some memory to hold the transformed string.  The calculations below try
8210      * to find coefficients 'm' and 'b' for this locale so that m*x + b equals
8211      * how much space we need, given the size of the input string in 'x'.  If
8212      * we calculate too small, we increase the size as needed, and call
8213      * strxfrm() again, but it is better to get it right the first time to
8214      * avoid wasted expensive string transformations.
8215      *
8216      * We use the string below to find how long the transformation of it is.
8217      * Almost all locales are supersets of ASCII, or at least the ASCII
8218      * letters.  We use all of them, half upper half lower, because if we used
8219      * fewer, we might hit just the ones that are outliers in a particular
8220      * locale.  Most of the strings being collated will contain a preponderance
8221      * of letters, and even if they are above-ASCII, they are likely to have
8222      * the same number of weight levels as the ASCII ones.  It turns out that
8223      * digits tend to have fewer levels, and some punctuation has more, but
8224      * those are relatively sparse in text, and khw believes this gives a
8225      * reasonable result, but it could be changed if experience so dictates. */
8226     const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
8227     char * x_longer;        /* Transformed 'longer' */
8228     Size_t x_len_longer;    /* Length of 'x_longer' */
8229
8230     char * x_shorter;   /* We also transform a substring of 'longer' */
8231     Size_t x_len_shorter;
8232
8233     PL_in_utf8_COLLATE_locale = (PL_collation_standard)
8234                                 ? 0
8235                                 : is_locale_utf8(PL_collation_name);
8236     PL_strxfrm_NUL_replacement = '\0';
8237     PL_strxfrm_max_cp = 0;
8238
8239     /* mem_collxfrm_() is used get the transformation (though here we are
8240      * interested only in its length).  It is used because it has the
8241      * intelligence to handle all cases, but to work, it needs some values of
8242      * 'm' and 'b' to get it started.  For the purposes of this calculation we
8243      * use a very conservative estimate of 'm' and 'b'.  This assumes a weight
8244      * can be multiple bytes, enough to hold any UV on the platform, and there
8245      * are 5 levels, 4 weight bytes, and a trailing NUL.  */
8246     PL_collxfrm_base = 5;
8247     PL_collxfrm_mult = 5 * sizeof(UV);
8248
8249     /* Find out how long the transformation really is */
8250     x_longer = mem_collxfrm_(longer,
8251                              sizeof(longer) - 1,
8252                              &x_len_longer,
8253
8254                              /* We avoid converting to UTF-8 in the called
8255                               * function by telling it the string is in UTF-8
8256                               * if the locale is a UTF-8 one.  Since the string
8257                               * passed here is invariant under UTF-8, we can
8258                               * claim it's UTF-8 even if it isn't.  */
8259                               PL_in_utf8_COLLATE_locale);
8260     Safefree(x_longer);
8261
8262     /* Find out how long the transformation of a substring of 'longer' is.
8263      * Together the lengths of these transformations are sufficient to
8264      * calculate 'm' and 'b'.  The substring is all of 'longer' except the
8265      * first character.  This minimizes the chances of being swayed by outliers
8266      * */
8267     x_shorter = mem_collxfrm_(longer + 1,
8268                               sizeof(longer) - 2,
8269                               &x_len_shorter,
8270                               PL_in_utf8_COLLATE_locale);
8271     Safefree(x_shorter);
8272
8273     /* If the results are nonsensical for this simple test, the whole locale
8274      * definition is suspect.  Mark it so that locale collation is not active
8275      * at all for it.  XXX Should we warn? */
8276     if (   x_len_shorter == 0
8277         || x_len_longer == 0
8278         || x_len_shorter >= x_len_longer)
8279     {
8280         PL_collxfrm_mult = 0;
8281         PL_collxfrm_base = 1;
8282         DEBUG_L(PerlIO_printf(Perl_debug_log,
8283                 "Disabling locale collation for LC_COLLATE='%s';"
8284                 " length for shorter sample=%zu; longer=%zu\n",
8285                 PL_collation_name, x_len_shorter, x_len_longer));
8286     }
8287     else {
8288         SSize_t base;       /* Temporary */
8289
8290         /* We have both: m * strlen(longer)  + b = x_len_longer
8291          *               m * strlen(shorter) + b = x_len_shorter;
8292          * subtracting yields:
8293          *          m * (strlen(longer) - strlen(shorter))
8294          *                             = x_len_longer - x_len_shorter
8295          * But we have set things up so that 'shorter' is 1 byte smaller than
8296          * 'longer'.  Hence:
8297          *          m = x_len_longer - x_len_shorter
8298          *
8299          * But if something went wrong, make sure the multiplier is at least 1.
8300          */
8301         if (x_len_longer > x_len_shorter) {
8302             PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
8303         }
8304         else {
8305             PL_collxfrm_mult = 1;
8306         }
8307
8308         /*     mx + b = len
8309          * so:      b = len - mx
8310          * but in case something has gone wrong, make sure it is non-negative
8311          * */
8312         base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
8313         if (base < 0) {
8314             base = 0;
8315         }
8316
8317         /* Add 1 for the trailing NUL */
8318         PL_collxfrm_base = base + 1;
8319     }
8320
8321     DEBUG_L(PerlIO_printf(Perl_debug_log,
8322                           "?UTF-8 locale=%d; x_len_shorter=%zu, "
8323                           "x_len_longer=%zu,"
8324                           " collate multipler=%zu, collate base=%zu\n",
8325                           PL_in_utf8_COLLATE_locale,
8326                           x_len_shorter, x_len_longer,
8327                           PL_collxfrm_mult, PL_collxfrm_base));
8328 }
8329
8330 char *
8331 Perl_mem_collxfrm_(pTHX_ const char *input_string,
8332                          STRLEN len,    /* Length of 'input_string' */
8333                          STRLEN *xlen,  /* Set to length of returned string
8334                                            (not including the collation index
8335                                            prefix) */
8336                          bool utf8      /* Is the input in UTF-8? */
8337                    )
8338 {
8339     /* mem_collxfrm_() is like strxfrm() but with two important differences.
8340      * First, it handles embedded NULs. Second, it allocates a bit more memory
8341      * than needed for the transformed data itself.  The real transformed data
8342      * begins at offset COLLXFRM_HDR_LEN.  *xlen is set to the length of that,
8343      * and doesn't include the collation index size.
8344      *
8345      * It is the caller's responsibility to eventually free the memory returned
8346      * by this function.
8347      *
8348      * Please see sv_collxfrm() to see how this is used. */
8349
8350 #  define COLLXFRM_HDR_LEN    sizeof(PL_collation_ix)
8351
8352     char * s = (char *) input_string;
8353     STRLEN s_strlen = strlen(input_string);
8354     char *xbuf = NULL;
8355     STRLEN xAlloc;          /* xalloc is a reserved word in VC */
8356     STRLEN length_in_chars;
8357     bool first_time = TRUE; /* Cleared after first loop iteration */
8358
8359 #  ifdef USE_LOCALE_CTYPE
8360         const char * orig_CTYPE_locale = NULL;
8361 #  endif
8362
8363 #  if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
8364     locale_t constructed_locale = (locale_t) 0;
8365 #  endif
8366
8367     PERL_ARGS_ASSERT_MEM_COLLXFRM_;
8368
8369     /* Must be NUL-terminated */
8370     assert(*(input_string + len) == '\0');
8371
8372     if (PL_collxfrm_mult == 0) {     /* unknown or bad */
8373         if (PL_collxfrm_base != 0) { /* bad collation => skip */
8374             DEBUG_L(PerlIO_printf(Perl_debug_log,
8375                           "mem_collxfrm_: locale's collation is defective\n"));
8376             goto bad;
8377         }
8378
8379         /* (mult, base) == (0,0) means we need to calculate mult and base
8380          * before proceeding */
8381         S_compute_collxfrm_coefficients(aTHX);
8382     }
8383
8384     /* Replace any embedded NULs with the control that sorts before any others.
8385      * This will give as good as possible results on strings that don't
8386      * otherwise contain that character, but otherwise there may be
8387      * less-than-perfect results with that character and NUL.  This is
8388      * unavoidable unless we replace strxfrm with our own implementation. */
8389     if (UNLIKELY(s_strlen < len)) {   /* Only execute if there is an embedded
8390                                          NUL */
8391         char * e = s + len;
8392         char * sans_nuls;
8393         STRLEN sans_nuls_len;
8394         int try_non_controls;
8395         char this_replacement_char[] = "?\0";   /* Room for a two-byte string,
8396                                                    making sure 2nd byte is NUL.
8397                                                  */
8398         STRLEN this_replacement_len;
8399
8400         /* If we don't know what non-NUL control character sorts lowest for
8401          * this locale, find it */
8402         if (PL_strxfrm_NUL_replacement == '\0') {
8403             int j;
8404             char * cur_min_x = NULL;    /* The min_char's xfrm, (except it also
8405                                            includes the collation index
8406                                            prefixed. */
8407
8408             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
8409
8410             /* Unlikely, but it may be that no control will work to replace
8411              * NUL, in which case we instead look for any character.  Controls
8412              * are preferred because collation order is, in general, context
8413              * sensitive, with adjoining characters affecting the order, and
8414              * controls are less likely to have such interactions, allowing the
8415              * NUL-replacement to stand on its own.  (Another way to look at it
8416              * is to imagine what would happen if the NUL were replaced by a
8417              * combining character; it wouldn't work out all that well.) */
8418             for (try_non_controls = 0;
8419                  try_non_controls < 2;
8420                  try_non_controls++)
8421             {
8422
8423 #  ifdef USE_LOCALE_CTYPE
8424
8425                 /* In this case we use isCNTRL_LC() below, which relies on
8426                  * LC_CTYPE, so that must be switched to correspond with the
8427                  * LC_COLLATE locale */
8428                 if (! try_non_controls && ! PL_in_utf8_COLLATE_locale) {
8429                     orig_CTYPE_locale = toggle_locale_c(LC_CTYPE,
8430                                                         PL_collation_name);
8431                 }
8432 #  endif
8433                 /* Look through all legal code points (NUL isn't) */
8434                 for (j = 1; j < 256; j++) {
8435                     char * x;       /* j's xfrm plus collation index */
8436                     STRLEN x_len;   /* length of 'x' */
8437                     STRLEN trial_len = 1;
8438                     char cur_source[] = { '\0', '\0' };
8439
8440                     /* Skip non-controls the first time through the loop.  The
8441                      * controls in a UTF-8 locale are the L1 ones */
8442                     if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
8443                                                ? ! isCNTRL_L1(j)
8444                                                : ! isCNTRL_LC(j))
8445                     {
8446                         continue;
8447                     }
8448
8449                     /* Create a 1-char string of the current code point */
8450                     cur_source[0] = (char) j;
8451
8452                     /* Then transform it */
8453                     x = mem_collxfrm_(cur_source, trial_len, &x_len,
8454                                       0 /* The string is not in UTF-8 */);
8455
8456                     /* Ignore any character that didn't successfully transform.
8457                      * */
8458                     if (! x) {
8459                         continue;
8460                     }
8461
8462                     /* If this character's transformation is lower than
8463                      * the current lowest, this one becomes the lowest */
8464                     if (   cur_min_x == NULL
8465                         || strLT(x         + COLLXFRM_HDR_LEN,
8466                                  cur_min_x + COLLXFRM_HDR_LEN))
8467                     {
8468                         PL_strxfrm_NUL_replacement = j;
8469                         Safefree(cur_min_x);
8470                         cur_min_x = x;
8471                     }
8472                     else {
8473                         Safefree(x);
8474                     }
8475                 } /* end of loop through all 255 characters */
8476
8477 #  ifdef USE_LOCALE_CTYPE
8478                 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
8479 #  endif
8480
8481                 /* Stop looking if found */
8482                 if (cur_min_x) {
8483                     break;
8484                 }
8485
8486                 /* Unlikely, but possible, if there aren't any controls that
8487                  * work in the locale, repeat the loop, looking for any
8488                  * character that works */
8489                 DEBUG_L(PerlIO_printf(Perl_debug_log,
8490                 "mem_collxfrm_: No control worked.  Trying non-controls\n"));
8491             } /* End of loop to try first the controls, then any char */
8492
8493             if (! cur_min_x) {
8494                 DEBUG_L(PerlIO_printf(Perl_debug_log,
8495                     "mem_collxfrm_: Couldn't find any character to replace"
8496                     " embedded NULs in locale %s with", PL_collation_name));
8497                 goto bad;
8498             }
8499
8500             DEBUG_L(PerlIO_printf(Perl_debug_log,
8501                     "mem_collxfrm_: Replacing embedded NULs in locale %s with "
8502                     "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
8503
8504             Safefree(cur_min_x);
8505         } /* End of determining the character that is to replace NULs */
8506
8507         /* If the replacement is variant under UTF-8, it must match the
8508          * UTF8-ness of the original */
8509         if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
8510             this_replacement_char[0] =
8511                                 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
8512             this_replacement_char[1] =
8513                                 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
8514             this_replacement_len = 2;
8515         }
8516         else {
8517             this_replacement_char[0] = PL_strxfrm_NUL_replacement;
8518             /* this_replacement_char[1] = '\0' was done at initialization */
8519             this_replacement_len = 1;
8520         }
8521
8522         /* The worst case length for the replaced string would be if every
8523          * character in it is NUL.  Multiply that by the length of each
8524          * replacement, and allow for a trailing NUL */
8525         sans_nuls_len = (len * this_replacement_len) + 1;
8526         Newx(sans_nuls, sans_nuls_len, char);
8527         *sans_nuls = '\0';
8528
8529         /* Replace each NUL with the lowest collating control.  Loop until have
8530          * exhausted all the NULs */
8531         while (s + s_strlen < e) {
8532             my_strlcat(sans_nuls, s, sans_nuls_len);
8533
8534             /* Do the actual replacement */
8535             my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
8536
8537             /* Move past the input NUL */
8538             s += s_strlen + 1;
8539             s_strlen = strlen(s);
8540         }
8541
8542         /* And add anything that trails the final NUL */
8543         my_strlcat(sans_nuls, s, sans_nuls_len);
8544
8545         /* Switch so below we transform this modified string */
8546         s = sans_nuls;
8547         len = strlen(s);
8548     } /* End of replacing NULs */
8549
8550     /* Make sure the UTF8ness of the string and locale match */
8551     if (utf8 != PL_in_utf8_COLLATE_locale) {
8552         /* XXX convert above Unicode to 10FFFF? */
8553         const char * const t = s;   /* Temporary so we can later find where the
8554                                        input was */
8555
8556         /* Here they don't match.  Change the string's to be what the locale is
8557          * expecting */
8558
8559         if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
8560             s = (char *) bytes_to_utf8((const U8 *) s, &len);
8561             utf8 = TRUE;
8562         }
8563         else {   /* locale is not UTF-8; but input is; downgrade the input */
8564
8565             s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
8566
8567             /* If the downgrade was successful we are done, but if the input
8568              * contains things that require UTF-8 to represent, have to do
8569              * damage control ... */
8570             if (UNLIKELY(utf8)) {
8571
8572                 /* What we do is construct a non-UTF-8 string with
8573                  *  1) the characters representable by a single byte converted
8574                  *     to be so (if necessary);
8575                  *  2) and the rest converted to collate the same as the
8576                  *     highest collating representable character.  That makes
8577                  *     them collate at the end.  This is similar to how we
8578                  *     handle embedded NULs, but we use the highest collating
8579                  *     code point instead of the smallest.  Like the NUL case,
8580                  *     this isn't perfect, but is the best we can reasonably
8581                  *     do.  Every above-255 code point will sort the same as
8582                  *     the highest-sorting 0-255 code point.  If that code
8583                  *     point can combine in a sequence with some other code
8584                  *     points for weight calculations, us changing something to
8585                  *     be it can adversely affect the results.  But in most
8586                  *     cases, it should work reasonably.  And note that this is
8587                  *     really an illegal situation: using code points above 255
8588                  *     on a locale where only 0-255 are valid.  If two strings
8589                  *     sort entirely equal, then the sort order for the
8590                  *     above-255 code points will be in code point order. */
8591
8592                 utf8 = FALSE;
8593
8594                 /* If we haven't calculated the code point with the maximum
8595                  * collating order for this locale, do so now */
8596                 if (! PL_strxfrm_max_cp) {
8597                     int j;
8598
8599                     /* The current transformed string that collates the
8600                      * highest (except it also includes the prefixed collation
8601                      * index. */
8602                     char * cur_max_x = NULL;
8603
8604                     /* Look through all legal code points (NUL isn't) */
8605                     for (j = 1; j < 256; j++) {
8606                         char * x;
8607                         STRLEN x_len;
8608                         char cur_source[] = { '\0', '\0' };
8609
8610                         /* Create a 1-char string of the current code point */
8611                         cur_source[0] = (char) j;
8612
8613                         /* Then transform it */
8614                         x = mem_collxfrm_(cur_source, 1, &x_len, FALSE);
8615
8616                         /* If something went wrong (which it shouldn't), just
8617                          * ignore this code point */
8618                         if (! x) {
8619                             continue;
8620                         }
8621
8622                         /* If this character's transformation is higher than
8623                          * the current highest, this one becomes the highest */
8624                         if (   cur_max_x == NULL
8625                             || strGT(x         + COLLXFRM_HDR_LEN,
8626                                      cur_max_x + COLLXFRM_HDR_LEN))
8627                         {
8628                             PL_strxfrm_max_cp = j;
8629                             Safefree(cur_max_x);
8630                             cur_max_x = x;
8631                         }
8632                         else {
8633                             Safefree(x);
8634                         }
8635                     }
8636
8637                     if (! cur_max_x) {
8638                         DEBUG_L(PerlIO_printf(Perl_debug_log,
8639                             "mem_collxfrm_: Couldn't find any character to"
8640                             " replace above-Latin1 chars in locale %s with",
8641                             PL_collation_name));
8642                         goto bad;
8643                     }
8644
8645                     DEBUG_L(PerlIO_printf(Perl_debug_log,
8646                             "mem_collxfrm_: highest 1-byte collating character"
8647                             " in locale %s is 0x%02X\n",
8648                             PL_collation_name,
8649                             PL_strxfrm_max_cp));
8650
8651                     Safefree(cur_max_x);
8652                 }
8653
8654                 /* Here we know which legal code point collates the highest.
8655                  * We are ready to construct the non-UTF-8 string.  The length
8656                  * will be at least 1 byte smaller than the input string
8657                  * (because we changed at least one 2-byte character into a
8658                  * single byte), but that is eaten up by the trailing NUL */
8659                 Newx(s, len, char);
8660
8661                 {
8662                     STRLEN i;
8663                     STRLEN d= 0;
8664                     char * e = (char *) t + len;
8665
8666                     for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
8667                         U8 cur_char = t[i];
8668                         if (UTF8_IS_INVARIANT(cur_char)) {
8669                             s[d++] = cur_char;
8670                         }
8671                         else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
8672                             s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
8673                         }
8674                         else {  /* Replace illegal cp with highest collating
8675                                    one */
8676                             s[d++] = PL_strxfrm_max_cp;
8677                         }
8678                     }
8679                     s[d++] = '\0';
8680                     Renew(s, d, char);   /* Free up unused space */
8681                 }
8682             }
8683         }
8684
8685         /* Here, we have constructed a modified version of the input.  It could
8686          * be that we already had a modified copy before we did this version.
8687          * If so, that copy is no longer needed */
8688         if (t != input_string) {
8689             Safefree(t);
8690         }
8691     }
8692
8693     length_in_chars = (utf8)
8694                       ? utf8_length((U8 *) s, (U8 *) s + len)
8695                       : len;
8696
8697     /* The first element in the output is the collation id, used by
8698      * sv_collxfrm(); then comes the space for the transformed string.  The
8699      * equation should give us a good estimate as to how much is needed */
8700     xAlloc = COLLXFRM_HDR_LEN
8701            + PL_collxfrm_base
8702            + (PL_collxfrm_mult * length_in_chars);
8703     Newx(xbuf, xAlloc, char);
8704     if (UNLIKELY(! xbuf)) {
8705         DEBUG_L(PerlIO_printf(Perl_debug_log,
8706                       "mem_collxfrm_: Couldn't malloc %zu bytes\n", xAlloc));
8707         goto bad;
8708     }
8709
8710     /* Store the collation id */
8711     *(PERL_UINTMAX_T *)xbuf = PL_collation_ix;
8712
8713 #  if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
8714 #    ifdef USE_LOCALE_CTYPE
8715
8716     constructed_locale = newlocale(LC_CTYPE_MASK, PL_collation_name,
8717                                    duplocale(use_curlocale_scratch()));
8718 #    else
8719
8720     constructed_locale = duplocale(use_curlocale_scratch());
8721
8722 #    endif
8723 #    define my_strxfrm(dest, src, n)  strxfrm_l(dest, src, n,           \
8724                                                 constructed_locale)
8725 #    define CLEANUP_STRXFRM                                             \
8726         STMT_START {                                                    \
8727             if (constructed_locale != (locale_t) 0)                     \
8728                 freelocale(constructed_locale);                         \
8729         } STMT_END
8730 #  else
8731 #    define my_strxfrm(dest, src, n)  strxfrm(dest, src, n)
8732 #    ifdef USE_LOCALE_CTYPE
8733
8734     orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
8735
8736 #      define CLEANUP_STRXFRM                                           \
8737                 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale)
8738 #    else
8739 #      define CLEANUP_STRXFRM  NOOP
8740 #    endif
8741 #  endif
8742
8743     /* Then the transformation of the input.  We loop until successful, or we
8744      * give up */
8745     for (;;) {
8746
8747         errno = 0;
8748         *xlen = my_strxfrm(xbuf + COLLXFRM_HDR_LEN,
8749                            s,
8750                            xAlloc - COLLXFRM_HDR_LEN);
8751
8752
8753         /* If the transformed string occupies less space than we told strxfrm()
8754          * was available, it means it transformed the whole string. */
8755         if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
8756
8757             /* But there still could have been a problem */
8758             if (errno != 0) {
8759                 DEBUG_L(PerlIO_printf(Perl_debug_log,
8760                        "strxfrm failed for LC_COLLATE=%s; errno=%d, input=%s\n",
8761                        PL_collation_name, errno,
8762                        _byte_dump_string((U8 *) s, len, 0)));
8763                 goto bad;
8764             }
8765
8766             /* Here, the transformation was successful.  Some systems include a
8767              * trailing NUL in the returned length.  Ignore it, using a loop in
8768              * case multiple trailing NULs are returned. */
8769             while (   (*xlen) > 0
8770                    && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
8771             {
8772                 (*xlen)--;
8773             }
8774
8775             /* If the first try didn't get it, it means our prediction was low.
8776              * Modify the coefficients so that we predict a larger value in any
8777              * future transformations */
8778             if (! first_time) {
8779                 STRLEN needed = *xlen + 1;   /* +1 For trailing NUL */
8780                 STRLEN computed_guess = PL_collxfrm_base
8781                                       + (PL_collxfrm_mult * length_in_chars);
8782
8783                 /* On zero-length input, just keep current slope instead of
8784                  * dividing by 0 */
8785                 const STRLEN new_m = (length_in_chars != 0)
8786                                      ? needed / length_in_chars
8787                                      : PL_collxfrm_mult;
8788
8789                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8790                     "initial size of %zu bytes for a length "
8791                     "%zu string was insufficient, %zu needed\n",
8792                     computed_guess, length_in_chars, needed));
8793
8794                 /* If slope increased, use it, but discard this result for
8795                  * length 1 strings, as we can't be sure that it's a real slope
8796                  * change */
8797                 if (length_in_chars > 1 && new_m  > PL_collxfrm_mult) {
8798
8799 #  ifdef DEBUGGING
8800
8801                     STRLEN old_m = PL_collxfrm_mult;
8802                     STRLEN old_b = PL_collxfrm_base;
8803
8804 #  endif
8805
8806                     PL_collxfrm_mult = new_m;
8807                     PL_collxfrm_base = 1;   /* +1 For trailing NUL */
8808                     computed_guess = PL_collxfrm_base
8809                                     + (PL_collxfrm_mult * length_in_chars);
8810                     if (computed_guess < needed) {
8811                         PL_collxfrm_base += needed - computed_guess;
8812                     }
8813
8814                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8815                                     "slope is now %zu; was %zu, base "
8816                         "is now %zu; was %zu\n",
8817                         PL_collxfrm_mult, old_m,
8818                         PL_collxfrm_base, old_b));
8819                 }
8820                 else {  /* Slope didn't change, but 'b' did */
8821                     const STRLEN new_b = needed
8822                                         - computed_guess
8823                                         + PL_collxfrm_base;
8824                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8825                         "base is now %zu; was %zu\n", new_b, PL_collxfrm_base));
8826                     PL_collxfrm_base = new_b;
8827                 }
8828             }
8829
8830             break;
8831         }
8832
8833         if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
8834             DEBUG_L(PerlIO_printf(Perl_debug_log,
8835                   "mem_collxfrm_: Needed %zu bytes, max permissible is %u\n",
8836                   *xlen, PERL_INT_MAX));
8837             goto bad;
8838         }
8839
8840         /* A well-behaved strxfrm() returns exactly how much space it needs
8841          * (usually not including the trailing NUL) when it fails due to not
8842          * enough space being provided.  Assume that this is the case unless
8843          * it's been proven otherwise */
8844         if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
8845             xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
8846         }
8847         else { /* Here, either:
8848                 *  1)  The strxfrm() has previously shown bad behavior; or
8849                 *  2)  It isn't the first time through the loop, which means
8850                 *      that the strxfrm() is now showing bad behavior, because
8851                 *      we gave it what it said was needed in the previous
8852                 *      iteration, and it came back saying it needed still more.
8853                 *      (Many versions of cygwin fit this.  When the buffer size
8854                 *      isn't sufficient, they return the input size instead of
8855                 *      how much is needed.)
8856                 * Increase the buffer size by a fixed percentage and try again.
8857                 * */
8858             xAlloc += (xAlloc / 4) + 1;
8859             PL_strxfrm_is_behaved = FALSE;
8860
8861             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8862                      "mem_collxfrm_ required more space than previously"
8863                      " calculated for locale %s, trying again with new"
8864                      " guess=%zu+%zu\n",
8865                 PL_collation_name,  COLLXFRM_HDR_LEN,
8866                      xAlloc - COLLXFRM_HDR_LEN));
8867         }
8868
8869         Renew(xbuf, xAlloc, char);
8870         if (UNLIKELY(! xbuf)) {
8871             DEBUG_L(PerlIO_printf(Perl_debug_log,
8872                       "mem_collxfrm_: Couldn't realloc %zu bytes\n", xAlloc));
8873             goto bad;
8874         }
8875
8876         first_time = FALSE;
8877     }
8878
8879     CLEANUP_STRXFRM;
8880
8881     DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8));
8882
8883     /* Free up unneeded space; retain enough for trailing NUL */
8884     Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
8885
8886     if (s != input_string) {
8887         Safefree(s);
8888     }
8889
8890     return xbuf;
8891
8892   bad:
8893
8894     CLEANUP_STRXFRM;
8895     DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8));
8896
8897     Safefree(xbuf);
8898     if (s != input_string) {
8899         Safefree(s);
8900     }
8901     *xlen = 0;
8902
8903     return NULL;
8904 }
8905
8906 #  ifdef DEBUGGING
8907
8908 STATIC void
8909 S_print_collxfrm_input_and_return(pTHX_
8910                                   const char * s,
8911                                   const char * e,
8912                                   const char * xbuf,
8913                                   const STRLEN xlen,
8914                                   const bool is_utf8)
8915 {
8916
8917     PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
8918
8919     PerlIO_printf(Perl_debug_log,
8920                   "mem_collxfrm_[ix %" UVuf "] for locale '%s':\n"
8921                   "     input=%s\n    return=%s\n    return len=%zu\n",
8922                   (UV) PL_collation_ix, PL_collation_name,
8923                   get_displayable_string(s, e, is_utf8),
8924                   ((xbuf == NULL)
8925                    ? "(null)"
8926                    : ((xlen == 0)
8927                       ? "(empty)"
8928                       : _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
8929                                           xlen, 0))),
8930                   xlen);
8931 }
8932
8933 #  endif    /* DEBUGGING */
8934
8935 SV *
8936 Perl_strxfrm(pTHX_ SV * src)
8937 {
8938     PERL_ARGS_ASSERT_STRXFRM;
8939
8940     /* For use by POSIX::strxfrm().  If they differ, toggle LC_CTYPE to
8941      * LC_COLLATE to avoid potential mojibake.
8942      *
8943      * If we can't calculate a collation, 'src' is instead returned, so that
8944      * future comparisons will be by code point order */
8945
8946 #  ifdef USE_LOCALE_CTYPE
8947
8948     const char * orig_ctype = toggle_locale_c(LC_CTYPE,
8949                                               querylocale_c(LC_COLLATE));
8950 #  endif
8951
8952     SV * dst = src;
8953     STRLEN dstlen;
8954     STRLEN srclen;
8955     const char *p = SvPV_const(src, srclen);
8956     const U32 utf8_flag = SvUTF8(src);
8957     char *d = mem_collxfrm_(p, srclen, &dstlen, cBOOL(utf8_flag));
8958
8959     assert(utf8_flag == 0 || utf8_flag == SVf_UTF8);
8960
8961     if (d != NULL) {
8962         assert(dstlen > 0);
8963         dst =newSVpvn_flags(d + COLLXFRM_HDR_LEN,
8964                             dstlen, SVs_TEMP|utf8_flag);
8965         Safefree(d);
8966     }
8967
8968 #  ifdef USE_LOCALE_CTYPE
8969
8970     restore_toggled_locale_c(LC_CTYPE, orig_ctype);
8971
8972 #  endif
8973
8974     return dst;
8975 }
8976
8977 #endif /* USE_LOCALE_COLLATE */
8978
8979 #ifdef USE_LOCALE
8980 #  ifdef USE_LOCALE_CTYPE
8981
8982 STATIC bool
8983 S_is_codeset_name_UTF8(const char * name)
8984 {
8985     /* Return a boolean as to if the passed-in name indicates it is a UTF-8
8986      * code set.  Several variants are possible */
8987     const Size_t len = strlen(name);
8988
8989     PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8;
8990
8991 #    ifdef WIN32
8992
8993     /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
8994     if (memENDs(name, len, "65001")) {
8995         return TRUE;
8996     }
8997
8998 #    endif
8999                /* 'UTF8' or 'UTF-8' */
9000     return (    inRANGE(len, 4, 5)
9001             &&  name[len-1] == '8'
9002             && (   memBEGINs(name, len, "UTF")
9003                 || memBEGINs(name, len, "utf"))
9004             && (len == 4 || name[3] == '-'));
9005 }
9006
9007 #  endif
9008 #endif  /* USE_LOCALE */
9009
9010 bool
9011 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
9012 {
9013     /* Internal function which returns if we are in the scope of a pragma that
9014      * enables the locale category 'category'.  'compiling' should indicate if
9015      * this is during the compilation phase (TRUE) or not (FALSE). */
9016
9017     const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
9018
9019     SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0);
9020     if (! these_categories || these_categories == &PL_sv_placeholder) {
9021         return FALSE;
9022     }
9023
9024     /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
9025      * a valid unsigned */
9026     assert(category >= -1);
9027     return cBOOL(SvUV(these_categories) & (1U << (category + 1)));
9028 }
9029
9030 /* my_strerror() returns a mortalized copy of the text of the error message
9031  * associated with 'errnum'.
9032  *
9033  * If not called from within the scope of 'use locale', it uses the text from
9034  * the C locale.  If Perl is compiled to not pay attention to LC_CTYPE nor
9035  * LC_MESSAGES, it uses whatever strerror() returns.  Otherwise the text is
9036  * derived from the locale, LC_MESSAGES if we have that; LC_CTYPE if not.
9037  *
9038  * It returns in *utf8ness the result's UTF-8ness
9039  *
9040  * The function just calls strerror(), but temporarily switches locales, if
9041  * needed.  Many platforms require LC_CTYPE and LC_MESSAGES to be in the same
9042  * CODESET in order for the return from strerror() to not contain '?' symbols,
9043  * or worse, mojibaked.  It's cheaper to just use the stricter criteria of
9044  * being in the same locale.  So the code below uses a common locale for both
9045  * categories.  Again, that is C if not within 'use locale' scope; or the
9046  * LC_MESSAGES locale if in scope and we have that category; and LC_CTYPE if we
9047  * don't have LC_MESSAGES; and whatever strerror returns if we don't have
9048  * either category.
9049  *
9050  * There are two sets of implementations.  The first below is if we have
9051  * strerror_l().  This is the simpler.  We just use the already-built C locale
9052  * object if not in locale scope, or build up a custom one otherwise.
9053  *
9054  * When strerror_l() is not available, we may have to swap locales temporarily
9055  * to bring the two categories into sync with each other, and possibly to the C
9056  * locale.
9057  *
9058  * Because the prepropessing directives to conditionally compile this function
9059  * would greatly obscure the logic of the various implementations, the whole
9060  * function is repeated for each configuration, with some common macros. */
9061
9062 /* Used to shorten the definitions of the following implementations of
9063  * my_strerror() */
9064 #define DEBUG_STRERROR_ENTER(errnum, in_locale)                             \
9065     DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                  \
9066                            "my_strerror called with errnum %d;"             \
9067                            " Within locale scope=%d\n",                     \
9068                            errnum, in_locale))
9069
9070 #define DEBUG_STRERROR_RETURN(errstr, utf8ness)                             \
9071     DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                  \
9072                            "Strerror returned; saving a copy: '%s';"        \
9073                            " utf8ness=%d\n",                                \
9074                            get_displayable_string(errstr,                   \
9075                                                   errstr + strlen(errstr),  \
9076                                                   *utf8ness),               \
9077                            (int) *utf8ness))
9078
9079 /* On platforms that have precisely one of these categories (Windows
9080  * qualifies), these yield the correct one */
9081 #if defined(USE_LOCALE_CTYPE)
9082 #  define WHICH_LC_INDEX LC_CTYPE_INDEX_
9083 #elif defined(USE_LOCALE_MESSAGES)
9084 #  define WHICH_LC_INDEX LC_MESSAGES_INDEX_
9085 #endif
9086
9087 /*===========================================================================*/
9088 /* First set of implementations, when have strerror_l() */
9089
9090 #if defined(USE_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
9091
9092 #  if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
9093
9094 /* Here, neither category is defined: use the C locale */
9095 const char *
9096 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
9097 {
9098     PERL_ARGS_ASSERT_MY_STRERROR;
9099
9100     DEBUG_STRERROR_ENTER(errnum, 0);
9101
9102     const char *errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
9103     *utf8ness = UTF8NESS_IMMATERIAL;
9104
9105     DEBUG_STRERROR_RETURN(errstr, utf8ness);
9106
9107     SAVEFREEPV(errstr);
9108     return errstr;
9109 }
9110
9111 #  elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
9112
9113 /*--------------------------------------------------------------------------*/
9114
9115 /* Here one or the other of CTYPE or MESSAGES is defined, but not both.  If we
9116  * are not within 'use locale' scope of the only one defined, we use the C
9117  * locale; otherwise use the current locale object */
9118
9119 const char *
9120 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
9121 {
9122     PERL_ARGS_ASSERT_MY_STRERROR;
9123
9124     DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
9125
9126     /* Use C if not within locale scope;  Otherwise, use current locale */
9127     const locale_t which_obj = (IN_LC(categories[WHICH_LC_INDEX]))
9128                                ? PL_C_locale_obj
9129                                : use_curlocale_scratch();
9130
9131     const char *errstr = savepv(strerror_l(errnum, which_obj));
9132     *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN,
9133                                              NULL, WHICH_LC_INDEX);
9134     DEBUG_STRERROR_RETURN(errstr, utf8ness);
9135
9136     SAVEFREEPV(errstr);
9137     return errstr;
9138 }
9139
9140 /*--------------------------------------------------------------------------*/
9141 #  else     /* Are using both categories.  Place them in the same CODESET,
9142              * either C or the LC_MESSAGES locale */
9143
9144 const char *
9145 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
9146 {
9147     PERL_ARGS_ASSERT_MY_STRERROR;
9148
9149     DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
9150
9151     const char *errstr;
9152     if (! IN_LC(LC_MESSAGES)) {    /* Use C if not within locale scope */
9153         errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
9154         *utf8ness = UTF8NESS_IMMATERIAL;
9155     }
9156     else {  /* Otherwise, use the LC_MESSAGES locale, making sure LC_CTYPE
9157                matches */
9158         locale_t cur = duplocale(use_curlocale_scratch());
9159
9160         cur = newlocale(LC_CTYPE_MASK, querylocale_c(LC_MESSAGES), cur);
9161         errstr = savepv(strerror_l(errnum, cur));
9162         *utf8ness = get_locale_string_utf8ness_i(errstr,
9163                                                  LOCALE_UTF8NESS_UNKNOWN,
9164                                                  NULL, LC_MESSAGES_INDEX_);
9165         freelocale(cur);
9166     }
9167
9168     DEBUG_STRERROR_RETURN(errstr, utf8ness);
9169
9170     SAVEFREEPV(errstr);
9171     return errstr;
9172 }
9173 #  endif    /* Above is using strerror_l */
9174 /*===========================================================================*/
9175 #else       /* Below is not using strerror_l */
9176 #  if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
9177
9178 /* If not using using either of the categories, return plain, unadorned
9179  * strerror */
9180
9181 const char *
9182 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
9183 {
9184     PERL_ARGS_ASSERT_MY_STRERROR;
9185
9186     DEBUG_STRERROR_ENTER(errnum, 0);
9187
9188     const char *errstr = savepv(Strerror(errnum));
9189     *utf8ness = UTF8NESS_IMMATERIAL;
9190
9191     DEBUG_STRERROR_RETURN(errstr, utf8ness);
9192
9193     SAVEFREEPV(errstr);
9194     return errstr;
9195 }
9196
9197 /*--------------------------------------------------------------------------*/
9198 #  elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
9199
9200 /* Here one or the other of CTYPE or MESSAGES is defined, but not both.  If we
9201  * are not within 'use locale' scope of the only one defined, we use the C
9202  * locale; otherwise use the current locale */
9203
9204 const char *
9205 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
9206 {
9207     PERL_ARGS_ASSERT_MY_STRERROR;
9208
9209     DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
9210
9211     const char *errstr;
9212     if (IN_LC(categories[WHICH_LC_INDEX])) {
9213         errstr = savepv(Strerror(errnum));
9214         *utf8ness = get_locale_string_utf8ness_i(errstr,
9215                                                  LOCALE_UTF8NESS_UNKNOWN,
9216                                                  NULL, WHICH_LC_INDEX);
9217     }
9218     else {
9219
9220         LOCALE_LOCK;
9221
9222         const char * orig_locale = toggle_locale_i(WHICH_LC_INDEX, "C");
9223
9224         errstr = savepv(Strerror(errnum));
9225
9226         restore_toggled_locale_i(WHICH_LC_INDEX, orig_locale);
9227
9228         LOCALE_UNLOCK;
9229
9230         *utf8ness = UTF8NESS_IMMATERIAL;
9231     }
9232
9233     DEBUG_STRERROR_RETURN(errstr, utf8ness);
9234
9235     SAVEFREEPV(errstr);
9236     return errstr;
9237 }
9238
9239 /*--------------------------------------------------------------------------*/
9240 #  else
9241
9242 /* Below, have both LC_CTYPE and LC_MESSAGES.  Place them in the same CODESET,
9243  * either C or the LC_MESSAGES locale */
9244
9245 const char *
9246 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
9247 {
9248     PERL_ARGS_ASSERT_MY_STRERROR;
9249
9250     DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
9251
9252     const char * desired_locale = (IN_LC(LC_MESSAGES))
9253                                   ? querylocale_c(LC_MESSAGES)
9254                                   : "C";
9255     /* XXX Can fail on z/OS */
9256
9257     LOCALE_LOCK;
9258
9259     const char* orig_CTYPE_locale    = toggle_locale_c(LC_CTYPE,
9260                                                        desired_locale);
9261     const char* orig_MESSAGES_locale = toggle_locale_c(LC_MESSAGES,
9262                                                        desired_locale);
9263     const char *errstr = savepv(Strerror(errnum));
9264
9265     restore_toggled_locale_c(LC_MESSAGES, orig_MESSAGES_locale);
9266     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
9267
9268     LOCALE_UNLOCK;
9269
9270     *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN,
9271                                              NULL, LC_MESSAGES_INDEX_);
9272     DEBUG_STRERROR_RETURN(errstr, utf8ness);
9273
9274     SAVEFREEPV(errstr);
9275     return errstr;
9276 }
9277
9278 /*--------------------------------------------------------------------------*/
9279 #  endif /* end of not using strerror_l() */
9280 #endif   /* end of all the my_strerror() implementations */
9281
9282 /*
9283
9284 =for apidoc switch_to_global_locale
9285
9286 This function copies the locale state of the calling thread into the program's
9287 global locale, and converts the thread to use that global locale.
9288
9289 It is intended so that Perl can safely be used with C libraries that access the
9290 global locale and which can't be converted to not access it.  Effectively, this
9291 means libraries that call C<L<setlocale(3)>> on non-Windows systems.  (For
9292 portability, it is a good idea to use it on Windows as well.)
9293
9294 A downside of using it is that it disables the services that Perl provides to
9295 hide locale gotchas from your code.  The service you most likely will miss
9296 regards the radix character (decimal point) in floating point numbers.  Code
9297 executed after this function is called can no longer just assume that this
9298 character is correct for the current circumstances.
9299
9300 To return to Perl control, and restart the gotcha prevention services, call
9301 C<L</sync_locale>>.  Behavior is undefined for any pure Perl code that executes
9302 while the switch is in effect.
9303
9304 The global locale and the per-thread locales are independent.  As long as just
9305 one thread converts to the global locale, everything works smoothly.  But if
9306 more than one does, they can easily interfere with each other, and races are
9307 likely.  On Windows systems prior to Visual Studio 15 (at which point Microsoft
9308 fixed a bug), races can occur (even if only one thread has been converted to
9309 the global locale), but only if you use the following operations:
9310
9311 =over
9312
9313 =item L<POSIX::localeconv|POSIX/localeconv>
9314
9315 =item L<I18N::Langinfo>, items C<CRNCYSTR> and C<THOUSEP>
9316
9317 =item L<perlapi/Perl_langinfo>, items C<CRNCYSTR> and C<THOUSEP>
9318
9319 =back
9320
9321 The first item is not fixable (except by upgrading to a later Visual Studio
9322 release), but it would be possible to work around the latter two items by
9323 having Perl change its algorithm for calculating these to use Windows API
9324 functions (likely C<GetNumberFormat> and C<GetCurrencyFormat>); patches
9325 welcome.
9326
9327 XS code should never call plain C<setlocale>, but should instead be converted
9328 to either call L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in
9329 for the system C<setlocale>) or use the methods given in L<perlcall> to call
9330 L<C<POSIX::setlocale>|POSIX/setlocale>.  Either one will transparently properly
9331 handle all cases of single- vs multi-thread, POSIX 2008-supported or not.
9332
9333 =cut
9334 */
9335
9336 #if defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE)
9337 #  define CHANGE_SYSTEM_LOCALE_TO_GLOBAL                                \
9338     STMT_START {                                                        \
9339         if (_configthreadlocale(_DISABLE_PER_THREAD_LOCALE) == -1) {    \
9340             locale_panic_("_configthreadlocale returned an error");     \
9341         }                                                               \
9342     } STMT_END
9343 #elif defined(USE_POSIX_2008_LOCALE)
9344 #  define CHANGE_SYSTEM_LOCALE_TO_GLOBAL                                \
9345     STMT_START {                                                        \
9346         locale_t old_locale = uselocale(LC_GLOBAL_LOCALE);              \
9347         if (! old_locale) {                                             \
9348             locale_panic_("Could not change to global locale");         \
9349         }                                                               \
9350                                                                         \
9351         /* Free the per-thread memory */                                \
9352         if (   old_locale != LC_GLOBAL_LOCALE                           \
9353             && old_locale != PL_C_locale_obj)                           \
9354         {                                                               \
9355             freelocale(old_locale);                                     \
9356         }                                                               \
9357     } STMT_END
9358 #else
9359 #  define CHANGE_SYSTEM_LOCALE_TO_GLOBAL
9360 #endif
9361
9362 void
9363 Perl_switch_to_global_locale(pTHX)
9364 {
9365
9366 #ifdef USE_LOCALE
9367
9368     DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering switch_to_global; %s\n",
9369                                           get_LC_ALL_display()));
9370
9371    /* In these cases, we use the system state to determine if we are in the
9372     * global locale or not. */
9373 #  ifdef USE_POSIX_2008_LOCALE
9374
9375     const bool perl_controls = (LC_GLOBAL_LOCALE != uselocale((locale_t) 0));
9376
9377 #  elif defined(USE_THREAD_SAFE_LOCALE) && defined(WIN32)
9378
9379     int config_return = _configthreadlocale(0);
9380     if (config_return == -1) {
9381         locale_panic_("_configthreadlocale returned an error");
9382     }
9383     const bool perl_controls = (config_return == _ENABLE_PER_THREAD_LOCALE);
9384
9385 #  else
9386
9387     const bool perl_controls = false;
9388
9389 #  endif
9390
9391     /* No-op if already in global */
9392     if (! perl_controls) {
9393         return;
9394     }
9395
9396 #  ifdef LC_ALL
9397
9398     const char * thread_locale = calculate_LC_ALL_string(NULL,
9399                                                          EXTERNAL_FORMAT_FOR_SET,
9400                                                          WANT_TEMP_PV,
9401                                                          __LINE__);
9402     CHANGE_SYSTEM_LOCALE_TO_GLOBAL;
9403     posix_setlocale(LC_ALL, thread_locale);
9404
9405 #  else   /* Must be USE_POSIX_2008_LOCALE) */
9406
9407     const char * cur_thread_locales[LC_ALL_INDEX_];
9408
9409     /* Save each category's current per-thread state */
9410     for_all_individual_category_indexes(i) {
9411         cur_thread_locales[i] = querylocale_i(i);
9412     }
9413
9414     CHANGE_SYSTEM_LOCALE_TO_GLOBAL;
9415
9416     /* Set the global to what was our per-thread state */
9417     POSIX_SETLOCALE_LOCK;
9418     for_all_individual_category_indexes(i) {
9419         posix_setlocale(categories[i], cur_thread_locales[i]);
9420     }
9421     POSIX_SETLOCALE_UNLOCK;
9422
9423 #  endif
9424 #  ifdef USE_LOCALE_NUMERIC
9425
9426     /* Switch to the underlying C numeric locale; the application is on its
9427      * own. */
9428     POSIX_SETLOCALE_LOCK;
9429     posix_setlocale(LC_NUMERIC, PL_numeric_name);
9430     POSIX_SETLOCALE_UNLOCK;
9431
9432 #  endif
9433 #endif
9434
9435 }
9436
9437 /*
9438
9439 =for apidoc sync_locale
9440
9441 This function copies the state of the program global locale into the calling
9442 thread, and converts that thread to using per-thread locales, if it wasn't
9443 already, and the platform supports them.  The LC_NUMERIC locale is toggled into
9444 the standard state (using the C locale's conventions), if not within the
9445 lexical scope of S<C<use locale>>.
9446
9447 Perl will now consider itself to have control of the locale.
9448
9449 Since unthreaded perls have only a global locale, this function is a no-op
9450 without threads.
9451
9452 This function is intended for use with C libraries that do locale manipulation.
9453 It allows Perl to accommodate the use of them.  Call this function before
9454 transferring back to Perl space so that it knows what state the C code has left
9455 things in.
9456
9457 XS code should not manipulate the locale on its own.  Instead,
9458 L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or
9459 change the locale (though changing the locale is antisocial and dangerous on
9460 multi-threaded systems that don't have multi-thread safe locale operations.
9461 (See L<perllocale/Multi-threaded operation>).
9462
9463 Using the libc L<C<setlocale(3)>> function should be avoided.  Nevertheless,
9464 certain non-Perl libraries called from XS, do call it, and their behavior may
9465 not be able to be changed.  This function, along with
9466 C<L</switch_to_global_locale>>, can be used to get seamless behavior in these
9467 circumstances, as long as only one thread is involved.
9468
9469 If the library has an option to turn off its locale manipulation, doing that is
9470 preferable to using this mechanism.  C<Gtk> is such a library.
9471
9472 The return value is a boolean: TRUE if the global locale at the time of call
9473 was in effect for the caller; and FALSE if a per-thread locale was in effect.
9474
9475 =cut
9476 */
9477
9478 bool
9479 Perl_sync_locale(pTHX)
9480 {
9481
9482 #ifndef USE_LOCALE
9483
9484     return TRUE;
9485
9486 #else
9487
9488     bool was_in_global = TRUE;
9489
9490 #  ifdef USE_THREAD_SAFE_LOCALE
9491 #    if defined(WIN32)
9492
9493     int config_return = _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
9494     if (config_return == -1) {
9495         locale_panic_("_configthreadlocale returned an error");
9496     }
9497     was_in_global = (config_return == _DISABLE_PER_THREAD_LOCALE);
9498
9499 #    elif defined(USE_POSIX_2008_LOCALE)
9500
9501     was_in_global = (LC_GLOBAL_LOCALE == uselocale(LC_GLOBAL_LOCALE));
9502
9503 #    else
9504 #      error Unexpected Configuration
9505 #    endif
9506 #  endif    /* USE_THREAD_SAFE_LOCALE */
9507
9508     /* Here, we are in the global locale.  Get and save the values for each
9509      * category, and convert the current thread to use them */
9510
9511 #  ifdef LC_ALL
9512
9513     STDIZED_SETLOCALE_LOCK;
9514     const char * lc_all_string = savepv(stdized_setlocale(LC_ALL, NULL));
9515     STDIZED_SETLOCALE_UNLOCK;
9516
9517     give_perl_locale_control(lc_all_string, __LINE__);
9518     Safefree(lc_all_string);
9519
9520 #  else
9521
9522     const char * current_globals[LC_ALL_INDEX_];
9523     for_all_individual_category_indexes(i) {
9524         STDIZED_SETLOCALE_LOCK;
9525         current_globals[i] = savepv(stdized_setlocale(categories[i], NULL));
9526         STDIZED_SETLOCALE_UNLOCK;
9527     }
9528
9529     give_perl_locale_control((const char **) &current_globals, __LINE__);
9530
9531     for_all_individual_category_indexes(i) {
9532         Safefree(current_globals[i]);
9533     }
9534
9535 #  endif
9536
9537     return was_in_global;
9538
9539 #endif
9540
9541 }
9542
9543 #if defined(DEBUGGING) && defined(USE_LOCALE)
9544
9545 STATIC char *
9546 S_my_setlocale_debug_string_i(pTHX_
9547                               const locale_category_index cat_index,
9548                               const char* locale, /* Optional locale name */
9549
9550                               /* return value from setlocale() when attempting
9551                                * to set 'category' to 'locale' */
9552                               const char* retval,
9553
9554                               const line_t line)
9555 {
9556     /* Returns a pointer to a NUL-terminated string in static storage with
9557      * added text about the info passed in.  This is not thread safe and will
9558      * be overwritten by the next call, so this should be used just to
9559      * formulate a string to immediately print or savepv() on. */
9560
9561     const char * locale_quote;
9562     const char * retval_quote;
9563
9564     assert(cat_index <= LC_ALL_INDEX_);
9565
9566     if (locale == NULL) {
9567         locale_quote = "";
9568         locale = "NULL";
9569     }
9570     else {
9571         locale_quote = "\"";
9572     }
9573
9574     if (retval == NULL) {
9575         retval_quote = "";
9576         retval = "NULL";
9577     }
9578     else {
9579         retval_quote = "\"";
9580     }
9581
9582 #  ifdef USE_LOCALE_THREADS
9583 #    define THREAD_FORMAT "%p:"
9584 #    define THREAD_ARGUMENT aTHX_
9585 #  else
9586 #    define THREAD_FORMAT
9587 #    define THREAD_ARGUMENT
9588 #  endif
9589
9590     return Perl_form(aTHX_
9591                      "%s:%" LINE_Tf ": " THREAD_FORMAT
9592                      " setlocale(%s[%d], %s%s%s) returned %s%s%s\n",
9593
9594                      __FILE__, line, THREAD_ARGUMENT
9595                      category_names[cat_index], categories[cat_index],
9596                      locale_quote, locale, locale_quote,
9597                      retval_quote, retval, retval_quote);
9598 }
9599
9600 #endif
9601 #ifdef USE_PERL_SWITCH_LOCALE_CONTEXT
9602
9603 void
9604 Perl_switch_locale_context(pTHX)
9605 {
9606     /* libc keeps per-thread locale status information in some configurations.
9607      * So, we can't just switch out aTHX to switch to a new thread.  libc has
9608      * to follow along.  This routine does that based on per-interpreter
9609      * variables we keep just for this purpose.
9610      *
9611      * There are two implementations where this is an issue.  For the other
9612      * implementations, it doesn't matter because libc is using global values
9613      * that all threads know about.
9614      *
9615      * The two implementations are where libc keeps thread-specific information
9616      * on its own.  These are
9617      *
9618      * POSIX 2008:  The current locale is kept by libc as an object.  We save
9619      *              a copy of that in the per-thread PL_cur_locale_obj, and so
9620      *              this routine uses that copy to tell the thread it should be
9621      *              operating with that object
9622      * Windows thread-safe locales:  A given thread in Windows can be being run
9623      *              with per-thread locales, or not.  When the thread context
9624      *              changes, libc doesn't automatically know if the thread is
9625      *              using per-thread locales, nor does it know what the new
9626      *              thread's locale is.  We keep that information in the
9627      *              per-thread variables:
9628      *                  PL_controls_locale  indicates if this thread is using
9629      *                                      per-thread locales or not
9630      *                  PL_cur_LC_ALL       indicates what the locale should be
9631      *                                      if it is a per-thread locale.
9632      */
9633
9634     if (UNLIKELY(   PL_veto_switch_non_tTHX_context
9635                  || PL_phase == PERL_PHASE_CONSTRUCT))
9636     {
9637         return;
9638     }
9639
9640 #  ifdef USE_POSIX_2008_LOCALE
9641
9642     if (! uselocale(PL_cur_locale_obj)) {
9643         locale_panic_(Perl_form(aTHX_
9644                                 "Can't uselocale(%p), LC_ALL supposed to"
9645                                 " be '%s'",
9646                                 PL_cur_locale_obj, get_LC_ALL_display()));
9647     }
9648
9649 #  elif defined(WIN32)
9650
9651     if (! bool_setlocale_c(LC_ALL, PL_cur_LC_ALL)) {
9652         locale_panic_(Perl_form(aTHX_ "Can't setlocale(%s)", PL_cur_LC_ALL));
9653     }
9654
9655 #  endif
9656
9657 }
9658
9659 #endif
9660
9661 void
9662 Perl_thread_locale_init(pTHX)
9663 {
9664
9665 #ifdef USE_THREAD_SAFE_LOCALE
9666 #  ifdef USE_POSIX_2008_LOCALE
9667
9668     /* Called from a thread on startup.
9669      *
9670      * The operations here have to be done from within the calling thread, as
9671      * they affect libc's knowledge of the thread; libc has no knowledge of
9672      * aTHX */
9673
9674      DEBUG_L(PerlIO_printf(Perl_debug_log,
9675                            "new thread, initial locale is %s;"
9676                            " calling setlocale(LC_ALL, \"C\")\n",
9677                            get_LC_ALL_display()));
9678
9679     if (! uselocale(PL_C_locale_obj)) {
9680
9681         /* Not being able to change to the C locale is severe; don't keep
9682          * going.  */
9683         locale_panic_(Perl_form(aTHX_
9684                                 "Can't uselocale(%p), 'C'", PL_C_locale_obj));
9685         NOT_REACHED; /* NOTREACHED */
9686     }
9687
9688 #    ifdef MULTIPLICITY
9689
9690     PL_cur_locale_obj = PL_C_locale_obj;
9691
9692 #    endif
9693 #  elif defined(WIN32)
9694
9695     /* On Windows, make sure new thread has per-thread locales enabled */
9696     if (_configthreadlocale(_ENABLE_PER_THREAD_LOCALE) == -1) {
9697         locale_panic_("_configthreadlocale returned an error");
9698     }
9699     void_setlocale_c(LC_ALL, "C");
9700
9701 #  endif
9702 #endif
9703
9704 }
9705
9706 void
9707 Perl_thread_locale_term(pTHX)
9708 {
9709     /* Called from a thread as it gets ready to terminate.
9710      *
9711      * The operations here have to be done from within the calling thread, as
9712      * they affect libc's knowledge of the thread; libc has no knowledge of
9713      * aTHX */
9714
9715 #if defined(USE_POSIX_2008_LOCALE) && defined(USE_THREADS)
9716
9717     /* Switch to the global locale, so can free up the per-thread object */
9718     locale_t actual_obj = uselocale(LC_GLOBAL_LOCALE);
9719     if (actual_obj != LC_GLOBAL_LOCALE && actual_obj != PL_C_locale_obj) {
9720         freelocale(actual_obj);
9721     }
9722
9723     /* Prevent leaks even if something has gone wrong */
9724     locale_t expected_obj = PL_cur_locale_obj;
9725     if (UNLIKELY(   expected_obj != actual_obj
9726                  && expected_obj != LC_GLOBAL_LOCALE
9727                  && expected_obj != PL_C_locale_obj))
9728     {
9729         freelocale(expected_obj);
9730     }
9731
9732     PL_cur_locale_obj = LC_GLOBAL_LOCALE;
9733
9734 #endif
9735 #ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES
9736
9737     /* When faking the mingw implementation, we coerce this function into doing
9738      * something completely different from its intent -- namely to free up our
9739      * static buffer to avoid a leak.  This function gets called for each
9740      * thread that is terminating, so will give us a chance to free the buffer
9741      * from the appropriate pool.  On unthreaded systems, it gets called by the
9742      * mutex termination code. */
9743
9744 #  ifdef MULTIPLICITY
9745
9746     if (aTHX != wsetlocale_buf_aTHX) {
9747         return;
9748     }
9749
9750 #  endif
9751
9752     if (wsetlocale_buf_size > 0) {
9753         Safefree(wsetlocale_buf);
9754         wsetlocale_buf_size = 0;
9755     }
9756
9757 #endif
9758
9759 }
9760
9761 /*
9762  * ex: set ts=8 sts=4 sw=4 et:
9763  */