locale.c

   1 /*    locale.c
   2  *
   3  *    Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
   4  *    2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  *      A Elbereth Gilthoniel,
  13  *      silivren penna míriel
  14  *      o menel aglar elenath!
  15  *      Na-chaered palan-díriel
  16  *      o galadhremmin ennorath,
  17  *      Fanuilos, le linnathon
  18  *      nef aear, si nef aearon!
  19  *
  20  *     [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
  21  */
  22
  23 /* utility functions for handling locale-specific stuff like what
  24  * character represents the decimal point.
  25  *
  26  * All C programs have an underlying locale.  Perl code generally doesn't pay
  27  * any attention to it except within the scope of a 'use locale'.  For most
  28  * categories, it accomplishes this by just using different operations if it is
  29  * in such scope than if not.  However, various libc functions called by Perl
  30  * are affected by the LC_NUMERIC category, so there are macros in perl.h that
  31  * are used to toggle between the current locale and the C locale depending on
  32  * the desired behavior of those functions at the moment.  And, LC_MESSAGES is
  33  * switched to the C locale for outputting the message unless within the scope
  34  * of 'use locale'.
  35  *
  36  * There is more than the typical amount of variation between platforms with
  37  * regard to locale handling.  At the end of these introductory comments, are
  38  * listed various relevent Configuration options, including some that can be
  39  * used to pretend to some extent that this is being developed on a different
  40  * platform than it actually is.  This allows you to make changes and catch
  41  * some errors without having access to those other platforms.
  42  *
  43  * This code now has multi-thread-safe locale handling on systems that support
  44  * that.  This is completely transparent to most XS code.  On earlier systems,
  45  * it would be possible to emulate thread-safe locales, but this likely would
  46  * involve a lot of locale switching, and would require XS code changes.
  47  * Macros could be written so that the code wouldn't have to know which type of
  48  * system is being used.
  49  *
  50  * Table-driven code is used for simplicity and clarity, as many operations
  51  * differ only in which category is being worked on.  However the system
  52  * categories need not be small contiguous integers, so do not lend themselves
  53  * to table lookup.  Instead we have created our own equivalent values which
  54  * are all small contiguous non-negative integers, and translation functions
  55  * between the two sets.  For category 'LC_foo', the name of our index is
  56  * LC_foo_INDEX_.  Various parallel tables, indexed by these, are used for the
  57  * translation.  The tables are generated at compile-time based on platform
  58  * characteristics and Configure options.  They hide from the code many of the
  59  * vagaries of the different locale implementations out there.
  60  *
  61  * On unthreaded perls, most operations expand out to just the basic
  62  * setlocale() calls.  That sort of is true on threaded perls on modern Windows
  63  * systems where the same API, after set up, is used for thread-safe locale
  64  * handling.  (But there are complications on Windows due to internal character
  65  * set issues.)  On other systems, there is a completely different API,
  66  * specified in POSIX 2008, to do thread-safe locales.  On these systems, our
  67  * bool_setlocale_2008_i() function is used to hide the different API from the
  68  * outside.  This makes it completely transparent to most XS code.
  69  *
  70  * A huge complicating factor is that the LC_NUMERIC category is normally held
  71  * in the C locale, except during those relatively rare times when it needs to
  72  * be in the underlying locale.  There is a bunch of code to accomplish this,
  73  * and to allow easy switches from one state to the other.
  74  *
  75  * In addition, the setlocale equivalents have versions for the return context,
  76  * 'void' and 'bool', besides the full return value.  This can present
  77  * opportunities for avoiding work.  We don't have to necessarily create a safe
  78  * copy to return if no return is desired.
  79  *
  80  * There are 3.5 major implementations here; which one chosen depends on what
  81  * the platform has available, and Configuration options.
  82  *
  83  * 1) Raw posix_setlocale().  This implementation is basically the libc
  84  *    setlocale(), with possibly minor tweaks.  This is used for startup, and
  85  *    always for unthreaded perls, and when the API for safe locale threading
  86  *    is identical to the unsafe API (Windows, currently).
  87  *
  88  *    This implementation is composed of two layers:
  89  *      a)  posix_setlocale() implements the libc setlocale().  In most cases,
  90  *          it is just an alias for the libc version.  But Windows doesn't
  91  *          fully conform to the POSIX standard, and this is a layer on top of
  92  *          libc to bring it more into conformance.  And in Configurations
  93  *          where perl is to ignore some locale categories that the libc
  94  *          setlocale() knows about, there is a layer to cope with that.
  95  *      b)  stdized_setlocale() is a layer above a) that fixes some vagaries in
  96  *          the return value of the libc setlocale().  On most platforms this
  97  *          layer is empty; it requires perl to be Configured with a parameter
  98  *          indicating the platform's defect, in order to be activated.  The
  99  *          current ones are listed at the definition of the macro.
 100  *
 101  * 2) An implementation that adds a minimal layer above implementation 1),
 102  *    making that implementation uninterruptible and returning a
 103  *    per-thread/per-category value.
 104  *
 105  * 3a and 3b) An implementation of POSIX 2008 thread-safe locale handling,
 106  *    hiding from the programmer the completely different API for this.
 107  *    This automatically makes almost all code thread-safe without need for
 108  *    changes.  This implementation is chosen on threaded perls when the
 109  *    platform properly supports the POSIX 2008 functions, and when there is no
 110  *    manual override to the contrary passed to Configure.
 111  *
 112  *    3a) is when the platform has a documented reliable querylocale() function
 113  *        or equivalent that is selected to be used.
 114  *    3b) is when we have to emulate that functionality.
 115  *
 116  *    Unfortunately, it seems that some platforms that claim to support these
 117  *    are buggy, in one way or another.  There are workarounds encoded here,
 118  *    where feasible, for platforms where the bugs are amenable to that
 119  *    (glibc, for example).  But other platforms instead don't use this
 120  *    implementation.
 121  *
 122  * z/OS (os390) is an outlier.  Locales really don't work under threads when
 123  * either the radix character isn't a dot, or attempts are made to change
 124  * locales after the first thread is created.  The reason is that IBM has made
 125  * it thread-safe by refusing to change locales (returning failure if
 126  * attempted) any time after an application has called pthread_create() to
 127  * create another thread.  The expectation is that an application will set up
 128  * its locale information before the first fork, and be stable thereafter.  But
 129  * perl toggles LC_NUMERIC if the locale's radix character isn't a dot, as do
 130  * the other toggles, which are less common.
 131  *
 132  * Associated with each implementation are three sets of macros that translate
 133  * a consistent API into what that implementation needs.  Each set consists of
 134  * three macros with the suffixes:
 135  *  _c  Means the argument is a locale category number known at compile time.
 136  *          An example would be LC_TIME.  This token is a compile-time constant
 137  *          and can be passed to a '_c' macro.
 138  *  _r  Means the argument is a locale category number whose value might not be
 139  *      known until runtime
 140  *  _i  Means the argument is our internal index of a locale category
 141  *
 142  * The three sets are:    ('_X'  means one of '_c', '_r', '_i')
 143  * 1) bool_setlocale_X()
 144  *      This calls the appropriate setlocale()-equivalent for the
 145  *      implementation, with the category and new locale.  The input locale is
 146  *      not necessarily valid, so the return is true or false depending on
 147  *      whether or not the setlocale() succeeded.  This is not used for
 148  *      querying the locale, so the input locale must not be NULL.
 149  *
 150  *      This macro is suitable for toggling the locale back and forth during an
 151  *      operation.  For example, the names of days and months under LC_TIME are
 152  *      strings that are also subject to LC_CTYPE.  If the locales of these two
 153  *      categories differ, mojibake can result on many platforms.  The code
 154  *      here will toggle LC_CTYPE into the locale of LC_TIME temporarily to
 155  *      avoid this.
 156  *
 157  *      Several categories require extra work when their locale is changed.
 158  *      LC_CTYPE, for example, requires the calculation of the table of which
 159  *      characters fold to which others under /i pattern matching or fc(), as
 160  *      folding is not a concept in POSIX.  This table isn't needed when the
 161  *      LC_CTYPE locale gets toggled during an operation, and will be toggled
 162  *      back before return to the caller.  To save work that would be
 163  *      discarded, the bool_setlocale_X() implementations don't do this extra
 164  *      work.  Instead, there is a separate function for just this purpose to
 165  *      be done before control is transferred back to the external caller.  All
 166  *      categories that have such requirements have such a function.  The
 167  *      update_functions[] array contains pointers to them (or NULL for
 168  *      categories which don't need a function).
 169  *
 170  *      Care must be taken to remember to call the separate function before
 171  *      returning to an external caller, and to not use things it updates
 172  *      before its call.  An alternative approach would be to have
 173  *      bool_setlocale_X() always call the update, which would return
 174  *      immediately if a flag wasn't set indicating it was time to actually
 175  *      perform it.
 176  *
 177  * 2) void_setlocale_X()
 178  *      This is like bool_setlocale_X(), but it is used only when it is
 179  *      expected that the call must succeed, or something is seriously wrong.
 180  *      A panic is issued if it fails.  The caller uses this form when it just
 181  *      wants to assume things worked.
 182  *
 183  * 3) querylocale_X()
 184  *      This returns a string that specifies the current locale for the given
 185  *      category given by the input argument.  The string is safe from other
 186  *      threads zapping it, and the caller need not worry about freeing it, but
 187  *      it may be mortalized, so must be copied if you need to preserve it
 188  *      across calls, or long term.  This returns the actual current locale,
 189  *      not the nominal.  These differ, for example, when LC_NUMERIC is
 190  *      supposed to be a locale whose decimal radix character is a comma.  As
 191  *      mentioned above, Perl actually keeps this category set to C in such
 192  *      circumstances so that XS code can just assume a dot radix character.
 193  *      querylocale_X() returns the locale that libc has stored at this moment,
 194  *      so most of the time will return a locale whose radix character is a
 195  *      dot.  The macro query_nominal_locale_i() can be used to get the nominal
 196  *      locale that an external caller would expect, for all categories except
 197  *      LC_ALL.  For that, you can use the function
 198  *      S_calculate_LC_ALL_string().  Or S_native_querylocale_i() will operate
 199  *      on any category.
 200  *
 201  * The underlying C API that this implements uses category numbers, hence the
 202  * code is structured to use '_r' at the API level to convert to indexes, which
 203  * are then used internally with the '_i' forms.
 204  *
 205  * The splitting apart into setting vs querying means that the return value of
 206  * the bool macros is not subject to potential clashes with other threads,
 207  * eliminating any need for the calling code to worry about that and get it
 208  * wrong.  Whereas, you do have to think about thread interactions when using a
 209  * query.
 210  *
 211  * Additionally, for the implementations where there aren't any complications,
 212  * a setlocale_i() is defined that is like plain setlocale(), returning the new
 213  * locale.  Thus it combines a bool_setlocale_X() with a querylocale_X().  It
 214  * is used only for performance on implementations that allow it, such as
 215  * non-threaded perls.
 216  *
 217  * There are also a few other macros herein that use this naming convention to
 218  * describe their category parameter.
 219  *
 220  * Relevant Configure options
 221  *
 222  *      -Accflags=-DNO_LOCALE
 223  *          This compiles perl to always use the C locale, ignoring any
 224  *          attempts to change it.  This could be useful on platforms with a
 225  *          crippled locale implementation.
 226  *
 227  *      -Accflags=-DNO_THREAD_SAFE_LOCALE
 228  *          Even if thread-safe operations are available on this platform and
 229  *          would otherwise be used (because this is a perl with multiplicity),
 230  *          perl is compiled to not use them.  This could be useful on
 231  *          platforms where the libc is buggy.
 232  *
 233  *      -Accflags=-DNO_POSIX_2008_LOCALE
 234  *          Even if the libc locale operations specified by the Posix 2008
 235  *          Standard are available on this platform and would otherwise be used
 236  *          (because this is a perl with multiplicity), perl is compiled to not
 237  *          use them.  This could be useful on platforms where the libc is
 238  *          buggy.  This is like NO_THREAD_SAFE_LOCALE, but has no effect on
 239  *          platforms that don't have these functions.
 240  *
 241  *      -Accflags=-DUSE_POSIX_2008_LOCALE
 242  *          Normally, setlocale() is used for locale operations on perls
 243  *          compiled without multiplicity.  This option causes the locale
 244  *          operations defined by the Posix 2008 Standard to always be used
 245  *          instead.  This could be useful on platforms where the libc
 246  *          setlocale() is buggy.
 247  *
 248  *      -Accflags=-DNO_THREAD_SAFE_QUERYLOCALE
 249  *          This applies only to platforms that have a querylocale() libc
 250  *          function.  perl assumes that that function is thread-safe, unless
 251  *          overridden by this, typically in a hints file.  When overridden,
 252  *          querylocale() is called only while the locale mutex is locked, and
 253  *          the result is copied to a per-thread place before unlocking.
 254  *
 255  *      -Accflags=-DUSE_NL_LOCALE_NAME
 256  *          glibc has an undocumented equivalent function to querylocale().  It
 257  *          currently isn't used by default because it is undocumented.  But
 258  *          testing hasn't found any problems with it.  Using this Configure
 259  *          option enables it on systems that have it (with no effect on
 260  *          systems lacking it).  Enabling this removes the need for perl
 261  *          to keep its own records, hence is more efficient and guaranteed to
 262  *          be accurate.
 263  *
 264  *      -Accflags=-DNO_LOCALE_CTYPE
 265  *      -Accflags=-DNO_LOCALE_NUMERIC
 266  *          etc.
 267  *
 268  *          If the named category(ies) does(do) not exist on this platform,
 269  *          these have no effect.  Otherwise they cause perl to be compiled to
 270  *          always keep the named category(ies) in the C locale.
 271  *
 272  *      -Accflags=-DHAS_BROKEN_SETLOCALE_QUERY_LC_ALL
 273  *          This would be set in a hints file to tell perl that doing a libc
 274  *              setlocale(LC_ALL, NULL)
 275  *          can give erroneous results, and perl will compensate to get the
 276  *          correct results.  This is known to be a problem in earlier AIX
 277  *          versions
 278  *
 279  *      -Accflags=-DHAS_LF_IN_SETLOCALE_RETURN
 280  *          This would be set in a hints file to tell perl that a libc
 281  *          setlocale() can return results containing \n characters that need
 282  *          to be stripped off.  khw believes there aren't any such platforms
 283  *          still in existence.
 284  *
 285  *      -Accflags=USE_FAKE_LC_ALL_POSITIONAL_NOTATION
 286  *          This is used when developing Perl on a platform that uses
 287  *          'name=value;' notation to represent LC_ALL when not all categories
 288  *          are the same.  When so compiled, much of the code gets compiled
 289  *          and exercised that applies to platforms that instead use positional
 290  *          notation.  This allows for finding many bugs in that portion of the
 291  *          implementation, without having to access such a platform.
 292  *
 293  *      -Accflags=-DWIN32_USE_FAKE_OLD_MINGW_LOCALES
 294  *          This is used when developing Perl on a non-Windows platform to
 295  *          compile and exercise much of the locale-related code that instead
 296  *          applies to MingW platforms that don't use the more modern UCRT
 297  *          library.  This allows for finding many bugs in that portion of the
 298  *          implementation, without having to access such a platform.
 299  */
 300
 301 /* If the environment says to, we can output debugging information during
 302  * initialization.  This is done before option parsing, and before any thread
 303  * creation, so can be a file-level static.  (Must come before #including
 304  * perl.h) */
 305 #include "config.h"
 306
 307 /* Returns the Unix errno portion; ignoring any others.  This is a macro here
 308  * instead of putting it into perl.h, because unclear to khw what should be
 309  * done generally. */
 310 #define GET_ERRNO   saved_errno
 311
 312 #ifdef DEBUGGING
 313 static int debug_initialization = 0;
 314 #  define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
 315 #  define DEBUG_LOCALE_INITIALIZATION_  debug_initialization
 316
 317 #  ifdef HAS_EXTENDED_OS_ERRNO
 318      /* Output the non-zero errno and/or the non-zero extended errno */
 319 #    define DEBUG_ERRNO                                                     \
 320         dSAVE_ERRNO; dTHX;                                                  \
 321         int extended = get_extended_os_errno();                             \
 322         const char * errno_string;                                          \
 323         if (GET_ERRNO == 0) { /* Skip output if both errno types are 0 */   \
 324             if (LIKELY(extended == 0)) errno_string = "";                   \
 325             else errno_string = Perl_form(aTHX_ "; $^E=%d", extended);      \
 326         }                                                                   \
 327         else if (LIKELY(extended == GET_ERRNO))                             \
 328             errno_string = Perl_form(aTHX_ "; $!=%d", GET_ERRNO);           \
 329         else errno_string = Perl_form(aTHX_ "; $!=%d, $^E=%d",              \
 330                                                     GET_ERRNO, extended);
 331 #  else
 332      /* Output the errno, if non-zero */
 333 #    define DEBUG_ERRNO                                                     \
 334         dSAVE_ERRNO;                                                        \
 335         const char * errno_string = "";                                     \
 336         if (GET_ERRNO != 0) {                                               \
 337             dTHX;                                                           \
 338             errno_string = Perl_form(aTHX_ "; $!=%d", GET_ERRNO);           \
 339         }
 340 #  endif
 341
 342 /* Automatically include the caller's file, and line number in debugging output;
 343  * and the errno (and/or extended errno) if non-zero.  On threaded perls add
 344  * the aTHX too. */
 345 #  if defined(USE_ITHREADS) && ! defined(NO_LOCALE_THREADS)
 346 #    define DEBUG_PRE_STMTS                                                 \
 347         DEBUG_ERRNO;                                                        \
 348         PerlIO_printf(Perl_debug_log, "\n%s: %" LINE_Tf ": 0x%p%s: ",       \
 349                                       __FILE__, (line_t)__LINE__, aTHX_     \
 350                                       errno_string);
 351 #  else
 352 #    define DEBUG_PRE_STMTS                                                 \
 353         DEBUG_ERRNO;                                                        \
 354         PerlIO_printf(Perl_debug_log, "\n%s: %" LINE_Tf "%s: ",             \
 355                                       __FILE__, (line_t)__LINE__,           \
 356                                       errno_string);
 357 #  endif
 358 #  define DEBUG_POST_STMTS  RESTORE_ERRNO;
 359 #else
 360 #  define debug_initialization 0
 361 #  define DEBUG_INITIALIZATION_set(v)
 362 #  define DEBUG_PRE_STMTS
 363 #  define DEBUG_POST_STMTS
 364 #endif
 365
 366 #include "EXTERN.h"
 367 #define PERL_IN_LOCALE_C
 368 #include "perl.h"
 369
 370 #if PERL_VERSION_GT(5,39,9)
 371 #  error Revert the commit that added this line
 372 #endif
 373
 374 #ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES
 375
 376    /* Use -Accflags=-DWIN32_USE_FAKE_OLD_MINGW_LOCALES on a POSIX or *nix box
 377     * to get a semblance of pretending the locale handling is that of a MingW
 378     * that doesn't use UCRT (hence 'OLD' in the name).  This exercizes code
 379     * paths that are not compiled on non-Windows boxes, and allows for ASAN
 380     * and PERL_MEMLOG.  This is thus a way to see if locale.c on Windows is
 381     * likely going to compile, without having to use a real Win32 box.  And
 382     * running the test suite will verify to a large extent our logic and memory
 383     * allocation handling for such boxes.  Of course the underlying calls are
 384     * to the POSIX libc, so any differences in implementation between those and
 385     * the Windows versions will not be caught by this. */
 386
 387 #  define WIN32
 388 #  undef P_CS_PRECEDES
 389 #  undef CURRENCY_SYMBOL
 390 #  define CP_UTF8 -1
 391 #  undef _configthreadlocale
 392 #  define _configthreadlocale(arg) NOOP
 393
 394 #  define MultiByteToWideChar(cp, flags, byte_string, m1, wstring, req_size) \
 395                     (mbsrtowcs(wstring, &(byte_string), req_size, NULL) + 1)
 396 #  define WideCharToMultiByte(cp, flags, wstring, m1, byte_string,          \
 397                               req_size, default_char, found_default_char)   \
 398                     (wcsrtombs(byte_string, &(wstring), req_size, NULL) + 1)
 399
 400 #  ifdef USE_LOCALE
 401
 402 static const wchar_t * wsetlocale_buf = NULL;
 403 static Size_t wsetlocale_buf_size = 0;
 404 static PerlInterpreter * wsetlocale_buf_aTHX = NULL;
 405
 406 STATIC
 407 const wchar_t *
 408 S_wsetlocale(const int category, const wchar_t * wlocale)
 409 {
 410     /* Windows uses a setlocale that takes a wchar_t* locale.  Other boxes
 411      * don't have this, so this Windows replacement converts the wchar_t input
 412      * to plain 'char*', calls plain setlocale(), and converts the result back
 413      * to 'wchar_t*' */
 414
 415     const char * byte_locale = NULL;
 416     if (wlocale) {
 417         byte_locale = Win_wstring_to_byte_string(CP_UTF8, wlocale);
 418     }
 419
 420     const char * byte_result = setlocale(category, byte_locale);
 421     Safefree(byte_locale);
 422     if (byte_result == NULL) {
 423         return NULL;
 424     }
 425
 426     const wchar_t * wresult = Win_byte_string_to_wstring(CP_UTF8, byte_result);
 427
 428     if (! wresult) {
 429         return NULL;
 430     }
 431
 432     /* Emulate a global static memory return from wsetlocale().  This currently
 433      * leaks at process end; would require changing LOCALE_TERM to fix that */
 434     Size_t string_size = wcslen(wresult) + 1;
 435
 436     if (wsetlocale_buf_size == 0) {
 437         Newx(wsetlocale_buf, string_size, wchar_t);
 438         wsetlocale_buf_size = string_size;
 439
 440 #  ifdef MULTIPLICITY
 441
 442         dTHX;
 443         wsetlocale_buf_aTHX = aTHX;
 444
 445 #  endif
 446
 447     }
 448     else if (string_size > wsetlocale_buf_size) {
 449         Renew(wsetlocale_buf, string_size, wchar_t);
 450         wsetlocale_buf_size = string_size;
 451     }
 452
 453     Copy(wresult, wsetlocale_buf, string_size, wchar_t);
 454     Safefree(wresult);
 455
 456     return wsetlocale_buf;
 457 }
 458
 459 #  define _wsetlocale(category, wlocale)  S_wsetlocale(category, wlocale)
 460 #  endif
 461 #endif  /* WIN32_USE_FAKE_OLD_MINGW_LOCALES */
 462
 463 /* 'for' loop headers to hide the necessary casts */
 464 #define for_all_individual_category_indexes(i)                              \
 465     for (locale_category_index i = (locale_category_index) 0;               \
 466          i < LC_ALL_INDEX_;                                                 \
 467          i = (locale_category_index) ((int) i + 1))
 468
 469 #define for_all_but_0th_individual_category_indexes(i)                      \
 470     for (locale_category_index i = (locale_category_index) 1;               \
 471          i < LC_ALL_INDEX_;                                                 \
 472          i = (locale_category_index) ((int) i + 1))
 473
 474 #define for_all_category_indexes(i)                                         \
 475     for (locale_category_index i = (locale_category_index) 0;               \
 476          i <= LC_ALL_INDEX_;                                                \
 477          i = (locale_category_index) ((int) i + 1))
 478
 479 #ifdef USE_LOCALE
 480 #  if defined(USE_FAKE_LC_ALL_POSITIONAL_NOTATION) && defined(LC_ALL)
 481
 482 /* This simulates an underlying positional notation for LC_ALL when compiled on
 483  * a system that uses name=value notation.  Use this to develop on Linux and
 484  * make a quick check that things have some chance of working on a positional
 485  * box.  Enable by adding to the Congfigure parameters:
 486  *      -Accflags=USE_FAKE_LC_ALL_POSITIONAL_NOTATION
 487  *
 488  * NOTE it redefines setlocale() and usequerylocale()
 489  * */
 490
 491 STATIC const char *
 492 S_positional_name_value_xlation(const char * locale, bool direction)
 493 {   /* direction == 1 is from name=value to positional
 494        direction == 0 is from positional to name=value */
 495     assert(locale);
 496
 497     dTHX;
 498     const char * individ_locales[LC_ALL_INDEX_] = { NULL };
 499
 500     /* This parses either notation */
 501     switch (parse_LC_ALL_string(locale,
 502                                 (const char **) &individ_locales,
 503                                 no_override,  /* Handled by other code */
 504                                 false,      /* Return only [0] if suffices */
 505                                 false,      /* Don't panic on error */
 506                                 __LINE__))
 507     {
 508       default:      /* Some compilers don't realize that below is the complete
 509                        list of the available enum values */
 510       case invalid:
 511         return NULL;
 512
 513       case no_array:
 514         return locale;
 515       case only_element_0:
 516         SAVEFREEPV(individ_locales[0]);
 517         return individ_locales[0];
 518       case full_array:
 519        {
 520         calc_LC_ALL_format  format = (direction)
 521                                      ? EXTERNAL_FORMAT_FOR_SET
 522                                      : INTERNAL_FORMAT;
 523         const char * retval = calculate_LC_ALL_string(individ_locales,
 524                                                       format,
 525                                                       WANT_TEMP_PV,
 526                                                       __LINE__);
 527
 528         for_all_individual_category_indexes(i) {
 529             Safefree(individ_locales[i]);
 530         }
 531
 532         return retval;
 533        }
 534     }
 535 }
 536
 537 STATIC const char *
 538 S_positional_setlocale(int cat, const char * locale)
 539 {
 540     if (cat != LC_ALL) return setlocale(cat, locale);
 541
 542     if (locale && strNE(locale, "")) {
 543         locale = S_positional_name_value_xlation(locale, 0);
 544         if (! locale) return NULL;
 545     }
 546
 547     locale = setlocale(cat, locale);
 548     if (locale == NULL) return NULL;
 549     return S_positional_name_value_xlation(locale, 1);
 550 }
 551
 552 #    undef setlocale
 553 #    define setlocale(a,b)  S_positional_setlocale(a,b)
 554 #    ifdef USE_POSIX_2008_LOCALE
 555
 556 STATIC locale_t
 557 S_positional_newlocale(int mask, const char * locale, locale_t base)
 558 {
 559     assert(locale);
 560
 561     if (mask != LC_ALL_MASK) return newlocale(mask, locale, base);
 562
 563     if (strNE(locale, "")) locale = S_positional_name_value_xlation(locale, 0);
 564     if (locale == NULL) return NULL;
 565     return newlocale(LC_ALL_MASK, locale, base);
 566 }
 567
 568 #    undef newlocale
 569 #    define newlocale(a,b,c)  S_positional_newlocale(a,b,c)
 570 #    endif
 571 #  endif
 572 #endif  /* End of fake positional notation */
 573
 574 #include "reentr.h"
 575
 576 #ifdef I_WCHAR
 577 #  include <wchar.h>
 578 #endif
 579 #ifdef I_WCTYPE
 580 #  include <wctype.h>
 581 #endif
 582
 583  /* The main errno that gets used is this one, on platforms that support it */
 584 #ifdef EINVAL
 585 #  define SET_EINVAL  SETERRNO(EINVAL, LIB_INVARG)
 586 #else
 587 #  define SET_EINVAL
 588 #endif
 589
 590 /* This is a starting guess as to when this is true.  It definititely isn't
 591  * true on *BSD where positional LC_ALL notation is used.  Likely this will end
 592  * up being defined in hints files. */
 593 #ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
 594 #  define NEWLOCALE_HANDLES_DISPARATE_LC_ALL
 595 #endif
 596
 597 /* But regardless, we have to look at individual categories if some are
 598  * ignored.  */
 599 #ifdef HAS_IGNORED_LOCALE_CATEGORIES_
 600 #  undef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
 601 #endif
 602 #ifdef USE_LOCALE
 603
 604 /* Not all categories need be set to the same locale.  This macro determines if
 605  * 'name' which represents LC_ALL is uniform or disparate.  There are two
 606  * situations: 1) the platform uses unordered name=value pairs; 2) the platform
 607  * uses ordered positional values, with a separator string between them */
 608 #  ifdef PERL_LC_ALL_SEPARATOR   /* positional */
 609 #    define is_disparate_LC_ALL(name)  cBOOL(instr(name, PERL_LC_ALL_SEPARATOR))
 610 #  else     /* name=value */
 611
 612     /* In the, hopefully never occurring, event that the platform doesn't use
 613      * either mechanism for disparate LC_ALL's, assume the name=value pairs
 614      * form, rather than taking the extreme step of refusing to compile.  Many
 615      * programs won't have disparate locales, so will generally work */
 616 #    define PERL_LC_ALL_SEPARATOR  ";"
 617 #    define is_disparate_LC_ALL(name)  cBOOL(   strchr(name, ';')   \
 618                                              && strchr(name, '='))
 619 #  endif
 620
 621 /* It is possible to compile perl to always keep any individual category in the
 622  * C locale.  This would be done where the implementation on a platform is
 623  * flawed or incomplete.  At the time of this writing, for example, OpenBSD has
 624  * not implemented LC_COLLATE beyond the C locale.  The 'category_available[]'
 625  * table is a bool that says whether a category is changeable, or must be kept
 626  * in C.  This macro substitutes C for the locale appropriately, expanding to
 627  * nothing on the more typical case where all possible categories present on
 628  * the platform are handled. */
 629 #  ifdef HAS_IGNORED_LOCALE_CATEGORIES_
 630 #    define need_to_override_category(i)  (! category_available[i])
 631 #    define override_ignored_category(i, new_locale)                        \
 632                     ((need_to_override_category(i)) ? "C" : (new_locale))
 633 #  else
 634 #    define need_to_override_category(i)  0
 635 #    define override_ignored_category(i, new_locale)  (new_locale)
 636 #  endif
 637
 638 PERL_STATIC_INLINE const char *
 639 S_mortalized_pv_copy(pTHX_ const char * const pv)
 640 {
 641     PERL_ARGS_ASSERT_MORTALIZED_PV_COPY;
 642
 643     /* Copies the input pv, and arranges for it to be freed at an unspecified
 644      * later time. */
 645
 646     if (pv == NULL) {
 647         return NULL;
 648     }
 649
 650     const char * copy = savepv(pv);
 651     SAVEFREEPV(copy);
 652     return copy;
 653 }
 654
 655 #endif
 656
 657 /* Default values come from the C locale */
 658 #define C_codeset "ANSI_X3.4-1968" /* Only in some Configurations, and usually
 659                                       a single instance, so is a #define */
 660 static const char C_decimal_point[] = ".";
 661
 662 #if (defined(USE_LOCALE_NUMERIC) && ! defined(TS_W32_BROKEN_LOCALECONV))    \
 663  || ! (   defined(USE_LOCALE_NUMERIC)                                       \
 664        && (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV)))
 665 static const char C_thousands_sep[] = "";
 666 #endif
 667
 668 /* Is the C string input 'name' "C" or "POSIX"?  If so, and 'name' is the
 669  * return of setlocale(), then this is extremely likely to be the C or POSIX
 670  * locale.  However, the output of setlocale() is documented to be opaque, but
 671  * the odds are extremely small that it would return these two strings for some
 672  * other locale.  Note that VMS includes many non-ASCII characters in these two
 673  * locales as controls and punctuation (below are hex bytes):
 674  *   cntrl:  84-97 9B-9F
 675  *   punct:  A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
 676  * Oddly, none there are listed as alphas, though some represent alphabetics
 677  * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
 678 #define isNAME_C_OR_POSIX(name)                                              \
 679                              (   (name) != NULL                              \
 680                               && (( *(name) == 'C' && (*(name + 1)) == '\0') \
 681                                    || strEQ((name), "POSIX")))
 682
 683 #if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO)
 684 #  define HAS_SOME_LANGINFO
 685 #endif
 686
 687 #define my_langinfo_c(item, category, locale, retbufp, retbuf_sizep, utf8ness) \
 688             my_langinfo_i(item, category##_INDEX_, locale, retbufp,            \
 689                                                       retbuf_sizep,  utf8ness)
 690 #ifdef USE_LOCALE
 691 #  ifdef DEBUGGING
 692 #    define setlocale_debug_string_i(index, locale, result)                 \
 693             my_setlocale_debug_string_i(index, locale, result, __LINE__)
 694 #    define setlocale_debug_string_c(category, locale, result)              \
 695                 setlocale_debug_string_i(category##_INDEX_, locale, result)
 696 #    define setlocale_debug_string_r(category, locale, result)              \
 697              setlocale_debug_string_i(get_category_index(category),         \
 698                                       locale, result)
 699 #  endif
 700
 701 #  define toggle_locale_i(index, locale)                                    \
 702                  S_toggle_locale_i(aTHX_ index, locale, __LINE__)
 703 #  define toggle_locale_c(cat, locale)  toggle_locale_i(cat##_INDEX_, locale)
 704 #  define restore_toggled_locale_i(index, locale)                           \
 705                  S_restore_toggled_locale_i(aTHX_ index, locale, __LINE__)
 706 #  define restore_toggled_locale_c(cat, locale)                             \
 707                              restore_toggled_locale_i(cat##_INDEX_, locale)
 708
 709 /* On systems without LC_ALL, pretending it exists anyway simplifies things.
 710  * Choose a value for it that is very unlikely to clash with any actual
 711  * category */
 712 #  define FAKE_LC_ALL  PERL_INT_MIN
 713
 714 /* Below are parallel arrays for locale information indexed by our mapping of
 715  * category numbers into small non-negative indexes.  locale_table.h contains
 716  * an entry like this for each individual category used on this system:
 717  *      PERL_LOCALE_TABLE_ENTRY(CTYPE, S_new_ctype)
 718  *
 719  * Each array redefines PERL_LOCALE_TABLE_ENTRY to generate the information
 720  * needed for that array, and #includes locale_table.h to get the valid
 721  * categories.
 722  *
 723  * An entry for the conglomerate category LC_ALL is added here, immediately
 724  * following the individual categories.  (The treatment for it varies, so can't
 725  * be in locale_table.h.)
 726  *
 727  * Following this, each array ends with an entry for illegal categories.  All
 728  * category numbers unknown to perl get mapped to this entry.  This is likely
 729  * to be a parameter error from the calling program; but it could be that this
 730  * platform has a category we don't know about, in which case it needs to be
 731  * added, using the paradigm of one of the existing categories. */
 732
 733 /* The first array is the locale categories perl uses on this system, used to
 734  * map our index back to the system's category number. */
 735 STATIC const int categories[] = {
 736
 737 #  undef PERL_LOCALE_TABLE_ENTRY
 738 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)  LC_ ## name,
 739 #  include "locale_table.h"
 740
 741 #  ifdef LC_ALL
 742     LC_ALL,
 743 #  else
 744     FAKE_LC_ALL,
 745 #  endif
 746
 747    (FAKE_LC_ALL + 1)    /* Entry for unknown category; this number is unlikely
 748                            to clash with a real category */
 749 };
 750
 751 # define GET_NAME_AS_STRING(token)  # token
 752 # define GET_LC_NAME_AS_STRING(token) GET_NAME_AS_STRING(LC_ ## token)
 753
 754 /* The second array is the category names. */
 755 STATIC const char * const category_names[] = {
 756
 757 #  undef PERL_LOCALE_TABLE_ENTRY
 758 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)  GET_LC_NAME_AS_STRING(name),
 759 #  include "locale_table.h"
 760
 761 #  ifdef LC_ALL
 762 #    define LC_ALL_STRING  "LC_ALL"
 763 #  else
 764 #    define LC_ALL_STRING  "If you see this, it is a bug in perl;"      \
 765                            " please report it via perlbug"
 766 #  endif
 767
 768     LC_ALL_STRING,
 769
 770 #  define LC_UNKNOWN_STRING  "Locale category unknown to Perl; if you see"  \
 771                              " this, it is a bug in perl; please report it" \
 772                              " via perlbug"
 773     LC_UNKNOWN_STRING
 774 };
 775
 776 STATIC const Size_t category_name_lengths[] = {
 777
 778 #  undef PERL_LOCALE_TABLE_ENTRY
 779 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)                          \
 780                                     STRLENs(GET_LC_NAME_AS_STRING(name)),
 781 #  include "locale_table.h"
 782
 783     STRLENs(LC_ALL_STRING),
 784     STRLENs(LC_UNKNOWN_STRING)
 785 };
 786
 787 /* Each entry includes space for the '=' and ';' */
 788 #  undef PERL_LOCALE_TABLE_ENTRY
 789 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)                          \
 790                                 + STRLENs(GET_LC_NAME_AS_STRING(name)) + 2
 791
 792 STATIC const Size_t lc_all_boiler_plate_length = 1  /* space for trailing NUL */
 793 #  include "locale_table.h"
 794 ;
 795
 796 /* A few categories require additional setup when they are changed.  This table
 797  * points to the functions that do that setup */
 798 STATIC void (*update_functions[]) (pTHX_ const char *, bool force) = {
 799
 800 #  undef PERL_LOCALE_TABLE_ENTRY
 801 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)  call_back,
 802 #  include "locale_table.h"
 803
 804     S_new_LC_ALL,
 805     NULL,   /* No update for unknown category */
 806 };
 807
 808 #  if defined(HAS_IGNORED_LOCALE_CATEGORIES_)
 809
 810 /* Indicates if each category on this platform is available to use not in
 811  * the C locale */
 812 STATIC const bool category_available[] = {
 813
 814 #  undef PERL_LOCALE_TABLE_ENTRY
 815 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)  LC_ ## name ## _AVAIL_,
 816 #  include "locale_table.h"
 817
 818 #  ifdef LC_ALL
 819     true,
 820 #  else
 821     false,
 822 #  endif
 823
 824     false   /* LC_UNKNOWN_AVAIL_ */
 825 };
 826
 827 #  endif
 828 #  if defined(USE_POSIX_2008_LOCALE)
 829
 830 STATIC const int category_masks[] = {
 831
 832 #    undef PERL_LOCALE_TABLE_ENTRY
 833 #    define PERL_LOCALE_TABLE_ENTRY(name, call_back)  LC_ ## name ## _MASK,
 834 #    include "locale_table.h"
 835
 836     LC_ALL_MASK,    /* Will rightly refuse to compile unless this is defined */
 837     0               /* Empty mask for unknown category */
 838 };
 839
 840 #  endif
 841 #  if ! defined(PERL_LC_ALL_USES_NAME_VALUE_PAIRS)
 842
 843 /* On platforms that use positional notation for expressing LC_ALL, this maps
 844  * the position of each category to our corresponding internal index for it.
 845  * This is initialized at run time if needed.  LC_ALL_INDEX_ is not legal for
 846  * an individual locale, hence marks the elements here as not actually
 847  * initialized. */
 848 STATIC
 849 unsigned int
 850 map_LC_ALL_position_to_index[LC_ALL_INDEX_] = { LC_ALL_INDEX_ };
 851
 852 #  endif
 853 #endif
 854 #if defined(USE_LOCALE) || defined(DEBUGGING)
 855
 856 STATIC const char *
 857 S_get_displayable_string(pTHX_
 858                          const char * const s,
 859                          const char * const e,
 860                          const bool is_utf8)
 861 {
 862     PERL_ARGS_ASSERT_GET_DISPLAYABLE_STRING;
 863
 864     if (e <= s) {
 865         return "";
 866     }
 867
 868     const char * t = s;
 869     bool prev_was_printable = TRUE;
 870     bool first_time = TRUE;
 871     char * ret;
 872
 873     /* Worst case scenario: All are non-printable so have a blank between each.
 874      * If UTF-8, all are the largest possible code point; otherwise all are a
 875      * single byte.  '(2 + 1)'  is from each byte takes 2 characters to
 876      * display, and a blank (or NUL for the final one) after it */
 877     const Size_t size = (e - s) * (2 + 1) * ((is_utf8) ? UVSIZE : 1);
 878     Newxz(ret, size, char);
 879     SAVEFREEPV(ret);
 880
 881     while (t < e) {
 882         UV cp = (is_utf8)
 883                 ?  utf8_to_uvchr_buf((U8 *) t, e, NULL)
 884                 : * (U8 *) t;
 885         if (isPRINT(cp)) {
 886             if (! prev_was_printable) {
 887                 my_strlcat(ret, " ", size);
 888             }
 889
 890             /* Escape these to avoid any ambiguity */
 891             if (cp == ' ' || cp == '\\') {
 892                 my_strlcat(ret, "\\", size);
 893             }
 894             my_strlcat(ret, Perl_form(aTHX_ "%c", (U8) cp), size);
 895             prev_was_printable = TRUE;
 896         }
 897         else {
 898             if (! first_time) {
 899                 my_strlcat(ret, " ", size);
 900             }
 901             my_strlcat(ret, Perl_form(aTHX_ "%02" UVXf, cp), size);
 902             prev_was_printable = FALSE;
 903         }
 904         t += (is_utf8) ? UTF8SKIP(t) : 1;
 905         first_time = FALSE;
 906     }
 907
 908     return ret;
 909 }
 910
 911 #endif
 912 #ifdef USE_LOCALE
 913
 914 # define get_category_index(cat) get_category_index_helper(cat, NULL, __LINE__)
 915
 916 STATIC locale_category_index
 917 S_get_category_index_helper(pTHX_ const int category, bool * succeeded,
 918                                   const line_t caller_line)
 919 {
 920     PERL_ARGS_ASSERT_GET_CATEGORY_INDEX_HELPER;
 921
 922     /* Given a category, return the equivalent internal index we generally use
 923      * instead, warn or panic if not found. */
 924
 925     locale_category_index i;
 926
 927 #  undef PERL_LOCALE_TABLE_ENTRY
 928 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)                          \
 929                     case LC_ ## name: i =  LC_ ## name ## _INDEX_; break;
 930
 931     switch (category) {
 932
 933 #  include "locale_table.h"
 934 #  ifdef LC_ALL
 935       case LC_ALL: i =  LC_ALL_INDEX_; break;
 936 #  endif
 937
 938       default: goto unknown_locale;
 939     }
 940
 941     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
 942                            "index of category %d (%s) is %d;"
 943                            " called from %" LINE_Tf "\n",
 944                            category, category_names[i], i, caller_line));
 945
 946     if (succeeded) {
 947         *succeeded = true;
 948     }
 949
 950     return i;
 951
 952   unknown_locale:
 953
 954     if (succeeded) {
 955         *succeeded = false;
 956         return LC_ALL_INDEX_;   /* Arbitrary */
 957     }
 958
 959     locale_panic_via_(Perl_form(aTHX_ "Unknown locale category %d", category),
 960                       __FILE__, caller_line);
 961     NOT_REACHED; /* NOTREACHED */
 962 }
 963
 964 #endif /* ifdef USE_LOCALE */
 965
 966 void
 967 Perl_force_locale_unlock(pTHX)
 968 {
 969     /* Remove any locale mutex, in preperation for an inglorious termination,
 970      * typically a  panic */
 971
 972 #if defined(USE_LOCALE_THREADS)
 973
 974     /* If recursively locked, clear all at once */
 975     if (PL_locale_mutex_depth > 1) {
 976         PL_locale_mutex_depth = 1;
 977     }
 978
 979     if (PL_locale_mutex_depth > 0) {
 980         LOCALE_UNLOCK_;
 981     }
 982
 983 #endif
 984
 985 }
 986
 987 #ifdef USE_POSIX_2008_LOCALE
 988
 989 STATIC locale_t
 990 S_use_curlocale_scratch(pTHX)
 991 {
 992     /* This function is used to hide from the caller the case where the current
 993      * locale_t object in POSIX 2008 is the global one, which is illegal in
 994      * many of the P2008 API calls.  This checks for that and, if necessary
 995      * creates a proper P2008 object.  Any prior object is deleted, as is any
 996      * remaining object during global destruction. */
 997
 998     locale_t cur = uselocale((locale_t) 0);
 999
1000     if (cur != LC_GLOBAL_LOCALE) {
1001         return cur;
1002     }
1003
1004     if (PL_scratch_locale_obj) {
1005         freelocale(PL_scratch_locale_obj);
1006     }
1007
1008     PL_scratch_locale_obj = duplocale(LC_GLOBAL_LOCALE);
1009     return PL_scratch_locale_obj;
1010 }
1011
1012 #endif
1013
1014 void
1015 Perl_locale_panic(const char * msg,
1016                   const line_t immediate_caller_line,
1017                   const char * const higher_caller_file,
1018                   const line_t higher_caller_line)
1019 {
1020     PERL_ARGS_ASSERT_LOCALE_PANIC;
1021     dTHX;
1022     dSAVE_ERRNO;
1023
1024     force_locale_unlock();
1025
1026 #ifdef USE_C_BACKTRACE
1027     dump_c_backtrace(Perl_debug_log, 20, 1);
1028 #endif
1029
1030     const char * called_by = "";
1031     if (   strNE(__FILE__, higher_caller_file)
1032         || immediate_caller_line != higher_caller_line)
1033     {
1034         called_by = Perl_form(aTHX_ "\nCalled by %s: %" LINE_Tf "\n",
1035                                     higher_caller_file, higher_caller_line);
1036     }
1037
1038     RESTORE_ERRNO;
1039
1040     const char * errno_text;
1041
1042 #ifdef HAS_EXTENDED_OS_ERRNO
1043
1044     const int extended_errnum = get_extended_os_errno();
1045     if (errno != extended_errnum) {
1046         errno_text = Perl_form(aTHX_ "; errno=%d, $^E=%d",
1047                                      errno, extended_errnum);
1048     }
1049     else
1050
1051 #endif
1052
1053     {
1054         errno_text = Perl_form(aTHX_ "; errno=%d", errno);
1055     }
1056
1057     /* diag_listed_as: panic: %s */
1058     Perl_croak(aTHX_ "%s: %" LINE_Tf ": panic: %s%s%s\n",
1059                      __FILE__, immediate_caller_line,
1060                      msg, errno_text, called_by);
1061 }
1062
1063 /* Macros to report and croak on an unexpected failure to set the locale.  The
1064  * via version has more stack trace information */
1065 #define setlocale_failure_panic_i(i, cur, fail, line, higher_line)          \
1066     setlocale_failure_panic_via_i(i, cur, fail, __LINE__, line,             \
1067                                   __FILE__, higher_line)
1068
1069 #define setlocale_failure_panic_c(cat, cur, fail, line, higher_line)        \
1070    setlocale_failure_panic_i(cat##_INDEX_, cur, fail, line, higher_line)
1071
1072 #if defined(USE_LOCALE)
1073
1074 /* Expands to the code to
1075  *      result = savepvn(s, len)
1076  * if the category whose internal index is 'i' doesn't need to be kept in the C
1077  * locale on this system, or if 'action is 'no_override'.  Otherwise it expands
1078  * to
1079  *      result = savepv("C")
1080  * unless 'action' isn't 'check_that_overridden', in which case if the string
1081  * 's' isn't already "C" it panics */
1082 #  ifndef HAS_IGNORED_LOCALE_CATEGORIES_
1083 #    define OVERRIDE_AND_SAVEPV(s, len, result, i, action)                  \
1084                                                   result = savepvn(s, len)
1085 #  else
1086 #    define OVERRIDE_AND_SAVEPV(s, len, result, i, action)                  \
1087         STMT_START {                                                        \
1088             if (LIKELY(   ! need_to_override_category(i)                    \
1089                        || action == no_override)) {                         \
1090                 result = savepvn(s, len);                                   \
1091             }                                                               \
1092             else {                                                          \
1093                 const char * temp = savepvn(s, len);                        \
1094                 result = savepv(override_ignored_category(i, temp));        \
1095                 if (action == check_that_overridden && strNE(result, temp)) { \
1096                     locale_panic_(Perl_form(aTHX_                           \
1097                                 "%s expected to be '%s', instead is '%s'",  \
1098                                 category_names[i], result, temp));          \
1099                 }                                                           \
1100                 Safefree(temp);                                             \
1101             }                                                               \
1102         } STMT_END
1103 #  endif
1104
1105 STATIC parse_LC_ALL_string_return
1106 S_parse_LC_ALL_string(pTHX_ const char * string,
1107                             const char ** output,
1108                             const parse_LC_ALL_STRING_action  override,
1109                             bool always_use_full_array,
1110                             const bool panic_on_error,
1111                             const line_t caller_line)
1112 {
1113     /* This function parses the value of the input 'string' which is expected
1114      * to be the representation of an LC_ALL locale, and splits the result into
1115      * the values for the individual component categories, returning those in
1116      * the 'output' array.  Each array value will be a savepv() copy that is
1117      * the responsibility of the caller to make sure gets freed
1118      *
1119      * The locale for each category is independent of the other categories.
1120      * Often, they are all the same, but certainly not always.  Perl, in fact,
1121      * usually keeps LC_NUMERIC in the C locale, regardless of the underlying
1122      * locale.  LC_ALL has to be able to represent the case of when not all
1123      * categories have the same locale.  Platforms have differing ways of
1124      * representing this.  Internally, this file uses the 'name=value;'
1125      * representation found on some platforms, so this function always looks
1126      * for and parses that.  Other platforms use a positional notation.  On
1127      * those platforms, this function also parses that form.  It examines the
1128      * input to see which form is being parsed.
1129      *
1130      * Often, all categories will have the same locale.  This is special cased
1131      * if 'always_use_full_array' is false on input:
1132      *      1) If the input 'string' is a single value, this function doesn't
1133      *         store anything into 'output', and returns 'no_array'
1134      *      2) Some platforms will return multiple occurrences of the same
1135      *         value rather than coalescing them down to a single one.  HP-UX
1136      *         is such a one.  This function will do that collapsing for you,
1137      *         returning 'only_element_0' and saving the single value in
1138      *         output[0], which the caller will need to arrange to be freed.
1139      *         The rest of output[] is undefined, and does not need to be
1140      *         freed.
1141      *
1142      * Otherwise, the input 'string' may not be valid.  This function looks
1143      * mainly for syntactic errors, and if found, returns 'invalid'.  'output'
1144      * will not be filled in in that case, but the input state of it isn't
1145      * necessarily preserved.  Turning on -DL debugging will give details as to
1146      * the error.  If 'panic_on_error' is 'true', the function panics instead
1147      * of returning on error, with a message giving the details.
1148      *
1149      * Otherwise, output[] will be filled with the individual locale names for
1150      * all categories on the system, 'full_array' will be returned, and the
1151      * caller needs to arrange for each to be freed.  This means that either at
1152      * least one category differed from the others, or 'always_use_full_array' was
1153      * true on input.
1154      *
1155      * perl may be configured to ignore changes to a category's locale to
1156      * non-C.  The parameter 'override' tells this function what to do when
1157      * encountering such an illegal combination:
1158      *
1159      *      no_override             indicates to take no special action
1160      *      override_if_ignored,    indicates to return 'C' instead of what the
1161      *                              input string actually says.
1162      *      check_that_overridden   indicates to panic if the string says the
1163      *                              category is not 'C'.  This is used when
1164      *                              non-C is very unexpected behavior.
1165      * */
1166
1167     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1168                            "Entering parse_LC_ALL_string; called from %"    \
1169                            LINE_Tf "\nnew='%s'\n", caller_line, string));
1170
1171 #  ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
1172
1173     const char separator[] = ";";
1174     const Size_t separator_len = 1;
1175     const bool single_component = (strchr(string, ';') == NULL);
1176
1177 #  else
1178
1179     /* It's possible (but quite unlikely) that the separator string is an '='
1180      * or a ';'.  Requiring both to be present for using the 'name=value;' form
1181      * properly handles those possibilities */
1182     const bool name_value = strchr(string, '=') && strchr(string, ';');
1183     const char * separator;
1184     Size_t separator_len;
1185     bool single_component;
1186     if (name_value) {
1187         separator = ";";
1188         separator_len = 1;
1189         single_component = false;   /* Since has both [;=], must be multi */
1190     }
1191     else {
1192         separator = PERL_LC_ALL_SEPARATOR;
1193         separator_len = STRLENs(PERL_LC_ALL_SEPARATOR);
1194         single_component = instr(string, separator) == NULL;
1195     }
1196
1197     Size_t component_number = 0;    /* Position in the parsing loop below */
1198
1199 #  endif
1200 #  ifndef HAS_IGNORED_LOCALE_CATEGORIES_
1201     PERL_UNUSED_ARG(override);
1202 #  else
1203
1204     /* Any ignored categories are to be set to "C", so if this single-component
1205      * LC_ALL isn't to C, it has both "C" and non-C, so isn't really a single
1206      * component.  All the non-ignored categories are set to the input
1207      * component, but the ignored ones are overridden to be C.
1208      *
1209      * This incidentally handles the case where the string is "".  The return
1210      * will be C for each ignored category and "" for the others.  Then the
1211      * caller can individually set each category, and get the right answer. */
1212     if (single_component && ! isNAME_C_OR_POSIX(string)) {
1213         for_all_individual_category_indexes(i) {
1214            OVERRIDE_AND_SAVEPV(string, strlen(string), output[i], i, override);
1215         }
1216
1217         return full_array;
1218     }
1219
1220 #  endif
1221
1222     if (single_component) {
1223         if (! always_use_full_array) {
1224             return no_array;
1225         }
1226
1227         for_all_individual_category_indexes(i) {
1228             output[i] = savepv(string);
1229         }
1230
1231         return full_array;
1232     }
1233
1234     /* Here the input is multiple components.  Parse through them.  (It is
1235      * possible that these components are all the same, so we check, and if so,
1236      * return just the 0th component (unless 'always_use_full_array' is true)
1237      *
1238      * This enum notes the possible errors findable in parsing */
1239     enum {
1240             incomplete,
1241             no_equals,
1242             unknown_category,
1243             contains_LC_ALL_element
1244     } error;
1245
1246     /* Keep track of the categories we have encountered so far */
1247     bool seen[LC_ALL_INDEX_] = { false };
1248
1249     Size_t index;           /* Our internal index for the current category */
1250     const char * s = string;
1251     const char * e = s + strlen(string);
1252     const char * category_end = NULL;
1253     const char * saved_first = NULL;
1254
1255     /* Parse the input locale string */
1256     while (s < e) {
1257
1258         /* 'separator' has been set up to delimit the components */
1259         const char * next_sep = instr(s, separator);
1260         if (! next_sep) {   /* At the end of the input */
1261             next_sep = e;
1262         }
1263
1264 #  ifndef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
1265
1266         if (! name_value) {
1267             /* Get the index of the category in this position */
1268             index = map_LC_ALL_position_to_index[component_number++];
1269         }
1270         else
1271
1272 #  endif
1273
1274         {   /* Get the category part when each component is the
1275              * 'category=locale' form */
1276
1277             category_end = strchr(s, '=');
1278
1279             /* The '=' terminates the category name.  If no '=', is improper
1280              * form */
1281             if (! category_end) {
1282                 error = no_equals;
1283                 goto failure;
1284             }
1285
1286             /* Find our internal index of the category name; uses a linear
1287              * search.  (XXX This could be avoided by various means, but the
1288              * maximum likely search is 6 items, and khw doesn't think the
1289              * added complexity would save very much at all.) */
1290             const unsigned int name_len = (unsigned int) (category_end - s);
1291             for (index = 0; index < C_ARRAY_LENGTH(category_names); index++) {
1292                 if (   name_len == category_name_lengths[index]
1293                     && memEQ(s, category_names[index], name_len))
1294                 {
1295                     goto found_category;
1296                 }
1297             }
1298
1299             /* Here, the category is not in our list. */
1300             error = unknown_category;
1301             goto failure;
1302
1303           found_category:   /* The system knows about this category. */
1304
1305             if (index == LC_ALL_INDEX_) {
1306                 error = contains_LC_ALL_element;
1307                 goto failure;
1308             }
1309
1310             /* The locale name starts just beyond the '=' */
1311             s = category_end + 1;
1312
1313             /* Linux (and maybe others) doesn't treat a duplicate category in
1314              * the string as an error.  Instead it uses the final occurrence as
1315              * the intended value.  So if this is a duplicate, free the former
1316              * value before setting the new one */
1317             if (seen[index]) {
1318                 Safefree(output[index]);
1319             }
1320             else {
1321                 seen[index] = true;
1322             }
1323         }
1324
1325         /* Here, 'index' contains our internal index number for the current
1326          * category, and 's' points to the beginning of the locale name for
1327          * that category. */
1328         OVERRIDE_AND_SAVEPV(s, next_sep - s, output[index], index, override);
1329
1330         if (! always_use_full_array) {
1331             if (! saved_first) {
1332                 saved_first = output[index];
1333             }
1334             else {
1335                 if (strNE(saved_first, output[index])) {
1336                     always_use_full_array = true;
1337                 }
1338             }
1339         }
1340
1341         /* Next time start from the new position */
1342         s = next_sep + separator_len;
1343     }
1344
1345     /* Finished looping through all the categories
1346      *
1347      * Check if the input was incomplete. */
1348
1349 #  ifndef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
1350
1351     if (! name_value) {     /* Positional notation */
1352         if (component_number != LC_ALL_INDEX_) {
1353             error = incomplete;
1354             goto failure;
1355         }
1356     }
1357     else
1358
1359 #  endif
1360
1361     {   /* Here is the name=value notation */
1362         for_all_individual_category_indexes(i) {
1363             if (! seen[i]) {
1364                 error = incomplete;
1365                 goto failure;
1366             }
1367         }
1368     }
1369
1370     /* In the loop above, we changed 'always_use_full_array' to true iff not all
1371      * categories have the same locale.  Hence, if it is still 'false', all of
1372      * them are the same. */
1373     if (always_use_full_array) {
1374         return full_array;
1375     }
1376
1377     /* Free the dangling ones */
1378     for_all_but_0th_individual_category_indexes(i) {
1379         Safefree(output[i]);
1380         output[i] = NULL;
1381     }
1382
1383     return only_element_0;
1384
1385   failure:
1386
1387     /* Don't leave memory dangling that we allocated before the failure */
1388     for_all_individual_category_indexes(i) {
1389         if (seen[i]) {
1390             Safefree(output[i]);
1391             output[i] = NULL;
1392         }
1393     }
1394
1395     const char * msg;
1396     const char * display_start = s;
1397     const char * display_end = e;
1398
1399     switch (error) {
1400         case incomplete:
1401             msg = "doesn't list every locale category";
1402             display_start = string;
1403             break;
1404         case no_equals:
1405             msg = "needs an '=' to split name=value";
1406             break;
1407         case unknown_category:
1408             msg = "is an unknown category";
1409             display_end = (category_end && category_end > display_start)
1410                           ? category_end
1411                           : e;
1412             break;
1413         case contains_LC_ALL_element:
1414             msg = "has LC_ALL, which is illegal here";
1415             break;
1416     }
1417
1418     msg = Perl_form(aTHX_ "'%.*s' %s\n",
1419                           (int) (display_end - display_start),
1420                           display_start, msg);
1421
1422     DEBUG_L(PerlIO_printf(Perl_debug_log, "%s", msg));
1423
1424     if (panic_on_error) {
1425         locale_panic_via_(msg, __FILE__, caller_line);
1426     }
1427
1428     return invalid;
1429 }
1430
1431 #  undef OVERRIDE_AND_SAVEPV
1432 #endif
1433
1434 /*==========================================================================
1435  * Here starts the code that gives a uniform interface to its callers, hiding
1436  * the differences between platforms.
1437  *
1438  * base_posix_setlocale_() presents a consistent POSIX-compliant interface to
1439  * setlocale().   Windows requres a customized base-level setlocale().  This
1440  * layer should only be used by the next level up: the plain posix_setlocale
1441  * layer.  Any necessary mutex locking needs to be done at a higher level.  The
1442  * return may be overwritten by the next call to this function */
1443 #ifdef WIN32
1444 #  define base_posix_setlocale_(cat, locale) win32_setlocale(cat, locale)
1445 #else
1446 #  define base_posix_setlocale_(cat, locale)                                \
1447                                     ((const char *) setlocale(cat, locale))
1448 #endif
1449
1450 /*==========================================================================
1451  * Here is the main posix layer.  It is the same as the base one unless the
1452  * system is lacking LC_ALL, or there are categories that we ignore, but that
1453  * the system libc knows about */
1454
1455 #if ! defined(USE_LOCALE)                                                   \
1456  ||  (defined(LC_ALL) && ! defined(HAS_IGNORED_LOCALE_CATEGORIES_))
1457 #  define posix_setlocale(cat, locale) base_posix_setlocale_(cat, locale)
1458 #else
1459 #  define posix_setlocale(cat, locale)                                      \
1460         S_posix_setlocale_with_complications(aTHX_ cat, locale, __LINE__)
1461
1462 STATIC const char *
1463 S_posix_setlocale_with_complications(pTHX_ const int cat,
1464                                            const char * new_locale,
1465                                            const line_t caller_line)
1466 {
1467     /* This implements the posix layer above the base posix layer.
1468      * It is needed to reconcile our internal records that reflect only a
1469      * proper subset of the categories known by the system. */
1470
1471     /* Querying the current locale returns the real value */
1472     if (new_locale == NULL) {
1473         new_locale = base_posix_setlocale_(cat, NULL);
1474         assert(new_locale);
1475         return new_locale;
1476     }
1477
1478     const char * locale_on_entry = NULL;
1479
1480     /* If setting from the environment, actually do the set to get the system's
1481      * idea of what that means; we may have to override later. */
1482     if (strEQ(new_locale, "")) {
1483         locale_on_entry = base_posix_setlocale_(cat, NULL);
1484         assert(locale_on_entry);
1485         new_locale = base_posix_setlocale_(cat, "");
1486         if (! new_locale) {
1487             SET_EINVAL;
1488             return NULL;
1489         }
1490     }
1491
1492 #  ifdef LC_ALL
1493
1494     const char * new_locales[LC_ALL_INDEX_] = { NULL };
1495
1496     if (cat == LC_ALL) {
1497         switch (parse_LC_ALL_string(new_locale,
1498                                     (const char **) &new_locales,
1499                                     override_if_ignored,   /* Override any
1500                                                               ignored
1501                                                               categories */
1502                                     false,    /* Return only [0] if suffices */
1503                                     false,    /* Don't panic on error */
1504                                     caller_line))
1505         {
1506           case invalid:
1507             SET_EINVAL;
1508             return NULL;
1509
1510           case no_array:
1511             break;
1512
1513           case only_element_0:
1514             new_locale = new_locales[0];
1515             SAVEFREEPV(new_locale);
1516             break;
1517
1518           case full_array:
1519
1520             /* Turn the array into a string that the libc setlocale() should
1521              * understand.   (Another option would be to loop, setting the
1522              * individual locales, and then return base(cat, NULL) */
1523             new_locale = calculate_LC_ALL_string(new_locales,
1524                                                  EXTERNAL_FORMAT_FOR_SET,
1525                                                  WANT_TEMP_PV,
1526                                                  caller_line);
1527
1528             for_all_individual_category_indexes(i) {
1529                 Safefree(new_locales[i]);
1530             }
1531
1532             /* And call the libc setlocale.  We could avoid this call if
1533              * locale_on_entry is set and eq the new_locale.  But that would be
1534              * only for the relatively rare case of the desired locale being
1535              * "", and the time spent in doing the string compare might be more
1536              * than that of just setting it unconditionally */
1537             new_locale = base_posix_setlocale_(cat, new_locale);
1538             if (! new_locale) {
1539                  goto failure;
1540             }
1541
1542             return new_locale;
1543         }
1544     }
1545
1546 #  endif
1547
1548     /* Here, 'new_locale' is a single value, not an aggregation.  Just set it.
1549      * */
1550     new_locale =
1551         base_posix_setlocale_(cat,
1552                               override_ignored_category(
1553                                           get_category_index(cat), new_locale));
1554     if (! new_locale) {
1555         goto failure;
1556     }
1557
1558     return new_locale;
1559
1560  failure:
1561
1562     /* 'locale_on_entry' being set indicates there has likely been a change in
1563      * locale which needs to be restored */
1564     if (locale_on_entry) {
1565         if (! base_posix_setlocale_(cat, locale_on_entry)) {
1566             setlocale_failure_panic_i(get_category_index(cat),
1567                                       NULL, locale_on_entry,
1568                                       __LINE__, caller_line);
1569         }
1570     }
1571
1572     SET_EINVAL;
1573     return NULL;
1574 }
1575
1576 #endif
1577
1578 /* End of posix layer
1579  *==========================================================================
1580  *
1581  * The next layer up is to catch vagaries and bugs in the libc setlocale return
1582  * value.  The return is not guaranteed to be stable.
1583  *
1584  * Any necessary mutex locking needs to be done at a higher level.
1585  *
1586  * On most platforms this layer is empty, expanding to just the layer
1587  * below.   To enable it, call Configure with either or both:
1588  * -Accflags=-DHAS_LF_IN_SETLOCALE_RETURN
1589  *                  to indicate that extraneous \n characters can be returned
1590  *                  by setlocale()
1591  * -Accflags=-DHAS_BROKEN_SETLOCALE_QUERY_LC_ALL
1592  *                  to indicate that setlocale(LC_ALL, NULL) cannot be relied
1593  *                  on
1594  */
1595
1596 #define STDIZED_SETLOCALE_LOCK    POSIX_SETLOCALE_LOCK
1597 #define STDIZED_SETLOCALE_UNLOCK  POSIX_SETLOCALE_UNLOCK
1598 #if ! defined(USE_LOCALE)                                                   \
1599  || ! (   defined(HAS_LF_IN_SETLOCALE_RETURN)                               \
1600        || defined(HAS_BROKEN_SETLOCALE_QUERY_LC_ALL))
1601 #  define stdized_setlocale(cat, locale)  posix_setlocale(cat, locale)
1602 #  define stdize_locale(cat, locale)  (locale)
1603 #else
1604 #  define stdized_setlocale(cat, locale)                                    \
1605         S_stdize_locale(aTHX_ cat, posix_setlocale(cat, locale), __LINE__)
1606
1607 STATIC const char *
1608 S_stdize_locale(pTHX_ const int category,
1609                       const char *input_locale,
1610                       const line_t caller_line)
1611 {
1612     /* The return value of setlocale() is opaque, but is required to be usable
1613      * as input to a future setlocale() to create the same state.
1614      * Unfortunately not all systems are compliant.  This function brings those
1615      * outliers into conformance.  It is based on what problems have arisen in
1616      * the field.
1617      *
1618      * This has similar constraints as the posix layer.  You need to lock
1619      * around it until its return is safely copied or no longer needed. (The
1620      * return may point to a global static buffer or may be mortalized.)
1621      *
1622      * The current things this corrects are:
1623      * 1) A new-line.  This function chops any \n characters
1624      * 2) A broken 'setlocale(LC_ALL, foo)'  This constructs a proper returned
1625      *                 string from the constituent categories
1626      *
1627      * If no changes were made, the input is returned as-is */
1628
1629     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1630                           "Entering stdize_locale(%d, '%s');"
1631                           " called from %" LINE_Tf "\n",
1632                           category, input_locale, caller_line));
1633
1634     if (input_locale == NULL) {
1635         SET_EINVAL;
1636         return NULL;
1637     }
1638
1639     char * retval = (char *) input_locale;
1640
1641 #  if defined(LC_ALL) && defined(HAS_BROKEN_SETLOCALE_QUERY_LC_ALL)
1642
1643         /* If setlocale(LC_ALL, NULL) is broken, compute what the system
1644          * actually thinks it should be from its individual components */
1645     if (category == LC_ALL) {
1646         retval = (char *) calculate_LC_ALL_string(
1647                                           NULL,  /* query each individ locale */
1648                                           EXTERNAL_FORMAT_FOR_SET,
1649                                           WANT_TEMP_PV,
1650                                           caller_line);
1651     }
1652
1653 #  endif
1654 #  ifdef HAS_NL_IN_SETLOCALE_RETURN
1655
1656     char * first_bad = NULL;
1657
1658 #    ifndef LC_ALL
1659
1660     PERL_UNUSED_ARG(category);
1661     PERL_UNUSED_ARG(caller_line);
1662
1663 #      define INPUT_LOCALE  retval
1664 #      define MARK_CHANGED
1665 #    else
1666
1667     char * individ_locales[LC_ALL_INDEX_] = { NULL };
1668     bool made_changes = false;
1669     Size_t upper;
1670     if (category != LC_ALL) {
1671         individ_locales[0] = retval;
1672         upper = 0;
1673     }
1674     else {
1675
1676         /* And parse the locale string, splitting into its individual
1677          * components. */
1678         switch (parse_LC_ALL_string(retval,
1679                                     (const char **) &individ_locales,
1680                                     check_that_overridden, /* ignored
1681                                                               categories should
1682                                                               already have been
1683                                                               overridden */
1684                                     false,    /* Return only [0] if suffices */
1685                                     false,    /* Don't panic on error */
1686                                     caller_line))
1687         {
1688           case invalid:
1689             SET_EINVAL;
1690             return NULL;
1691
1692           case full_array: /* Loop below through all the component categories.
1693                             */
1694             upper = LC_ALL_INDEX_ - 1;
1695             break;
1696
1697           case no_array:
1698             /* All categories here are set to the same locale, and the parse
1699              * didn't fill in any of 'individ_locales'.  Set the 0th element to
1700              * that locale. */
1701             individ_locales[0] = retval;
1702             /* FALLTHROUGH */
1703
1704           case only_element_0: /* Element 0 is the only element we need to look
1705                                   at */
1706             upper = 0;
1707             break;
1708         }
1709     }
1710
1711     for (unsigned int i = 0; i <= upper; i++)
1712
1713 #      define INPUT_LOCALE  individ_locales[i]
1714 #      define MARK_CHANGED  made_changes = true;
1715 #    endif    /* Has LC_ALL */
1716
1717     {
1718         first_bad = (char *) strchr(INPUT_LOCALE, '\n');
1719
1720         /* Most likely, there isn't a problem with the input */
1721         if (UNLIKELY(first_bad)) {
1722
1723             /* This element will need to be adjusted.  Create a modifiable
1724              * copy. */
1725             MARK_CHANGED
1726             retval = savepv(INPUT_LOCALE);
1727             SAVEFREEPV(retval);
1728
1729             /* Translate the found position into terms of the copy */
1730             first_bad = retval + (first_bad - INPUT_LOCALE);
1731
1732             /* Get rid of the \n and what follows.  (Originally, only a
1733              * trailing \n was stripped.  Unsure what to do if not trailing) */
1734             *((char *) first_bad) = '\0';
1735         }   /* End of needs adjusting */
1736     }   /* End of looking for problems */
1737
1738 #    ifdef LC_ALL
1739
1740     /* If we had multiple elements, extra work is required */
1741     if (upper != 0) {
1742
1743         /* If no changes were made to the input, 'retval' already contains it
1744          * */
1745         if (made_changes) {
1746
1747             /* But if did make changes, need to calculate the new value */
1748             retval = (char *) calculate_LC_ALL_string(
1749                                             (const char **) &individ_locales,
1750                                             EXTERNAL_FORMAT_FOR_SET,
1751                                             WANT_TEMP_PV,
1752                                             caller_line);
1753         }
1754
1755         /* And free the no-longer needed memory */
1756         for (unsigned int i = 0; i <= upper; i++) {
1757             Safefree(individ_locales[i]);
1758         }
1759     }
1760
1761 #    endif
1762 #    undef INPUT_LOCALE
1763 #    undef MARK_CHANGED
1764 #  endif    /* HAS_NL_IN_SETLOCALE_RETURN */
1765
1766     return (const char *) retval;
1767 }
1768
1769 #endif  /* USE_LOCALE */
1770
1771 /* End of stdize_locale layer
1772  *
1773  * ==========================================================================
1774  *
1775  * The next many lines form several implementations of a layer above the
1776  * close-to-the-metal 'posix' and 'stdized' macros.  They are used to present a
1777  * uniform API to the rest of the code in this file in spite of the disparate
1778  * underlying implementations.  Which implementation gets compiled depends on
1779  * the platform capabilities (and some user choice) as determined by Configure.
1780  *
1781  * As more fully described in the introductory comments in this file, the
1782  * API of each implementation consists of three sets of macros.  Each set has
1783  * three variants with suffixes '_c', '_r', and '_i'.  In the list below '_X'
1784  * is to be replaced by any of these suffixes.
1785  *
1786  * 1) bool_setlocale_X  attempts to set the given category's locale to the
1787  *                      given value, returning if it worked or not.
1788  * 2) void_setlocale_X  is like the corresponding bool_setlocale, but used when
1789  *                      success is the only sane outcome, so failure causes it
1790  *                      to panic.
1791  * 3) querylocale_X     to see what the given category's locale is
1792  *
1793  * 4) setlocale_i()     is defined only in those implementations where the bool
1794  *                      and query forms are essentially the same, and can be
1795  *                      combined to save CPU time.
1796  *
1797  * Each implementation below is separated by ==== lines, and includes bool,
1798  * void, and query macros.  The query macros are first, followed by any
1799  * functions needed to implement them.  Then come the bool, again followed by
1800  * any implementing functions  Then are the void macros; next is setlocale_i if
1801  * present on this implementation.  Finally are any helper functions.  The sets
1802  * in each implementation are separated by ---- lines.
1803  *
1804  * The returned strings from all the querylocale...() forms in all
1805  * implementations are thread-safe, and the caller should not free them,
1806  * but each may be a mortalized copy.  If you need something stable across
1807  * calls, you need to savepv() the result yourself.
1808  *
1809  *===========================================================================*/
1810
1811 #if    (! defined(USE_LOCALE_THREADS) && ! defined(USE_POSIX_2008_LOCALE))    \
1812     || (  defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE))
1813
1814 /* For non-threaded perls, the implementation just expands to the base-level
1815  * functions (except if we are Configured to nonetheless use the POSIX 2008
1816  * interface) This implementation is also used on threaded perls where
1817  * threading is invisible to us.  Currently this is only on later Windows
1818  * versions. */
1819
1820 #  define querylocale_r(cat)  mortalized_pv_copy(stdized_setlocale(cat, NULL))
1821 #  define querylocale_c(cat)  querylocale_r(cat)
1822 #  define querylocale_i(i)    querylocale_c(categories[i])
1823
1824 /*---------------------------------------------------------------------------*/
1825
1826 #  define bool_setlocale_r(cat, locale) cBOOL(posix_setlocale(cat, locale))
1827 #  define bool_setlocale_i(i, locale)                                       \
1828                                    bool_setlocale_c(categories[i], locale)
1829 #  define bool_setlocale_c(cat, locale)      bool_setlocale_r(cat, locale)
1830
1831 /*---------------------------------------------------------------------------*/
1832
1833 #  define void_setlocale_r_with_caller(cat, locale, file, line)             \
1834      STMT_START {                                                           \
1835         if (! bool_setlocale_r(cat, locale))                                \
1836             setlocale_failure_panic_via_i(get_category_index(cat),          \
1837                                           NULL, locale, __LINE__, 0,        \
1838                                           file, line);                      \
1839      } STMT_END
1840
1841 #  define void_setlocale_c_with_caller(cat, locale, file, line)             \
1842                     void_setlocale_r_with_caller(cat, locale, file, line)
1843
1844 #  define void_setlocale_i_with_caller(i, locale, file, line)               \
1845           void_setlocale_r_with_caller(categories[i], locale, file, line)
1846
1847 #  define void_setlocale_r(cat, locale)                                     \
1848             void_setlocale_r_with_caller(cat, locale, __FILE__, __LINE__)
1849 #  define void_setlocale_c(cat, locale) void_setlocale_r(cat, locale)
1850 #  define void_setlocale_i(i, locale)   void_setlocale_r(categories[i], locale)
1851
1852 /*---------------------------------------------------------------------------*/
1853
1854 /* setlocale_i is only defined for Configurations where the libc setlocale()
1855  * doesn't need any tweaking.  It allows for some shortcuts */
1856 #  ifndef USE_LOCALE_THREADS
1857 #    define setlocale_i(i, locale)   stdized_setlocale(categories[i], locale)
1858
1859 #  elif defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE)
1860
1861 /* On Windows, we don't know at compile time if we are in thread-safe mode or
1862  * not.  If we are, we can just return the result of the layer below us.  If we
1863  * are in unsafe mode, we need to first copy that result to a safe place while
1864  * in a critical section */
1865
1866 #    define setlocale_i(i, locale)   S_setlocale_i(aTHX_ categories[i], locale)
1867
1868 STATIC const char *
1869 S_setlocale_i(pTHX_ const int category, const char * locale)
1870 {
1871     if (LIKELY(_configthreadlocale(0) == _ENABLE_PER_THREAD_LOCALE)) {
1872         return stdized_setlocale(category, locale);
1873     }
1874
1875     gwLOCALE_LOCK;
1876     const char * retval = save_to_buffer(stdized_setlocale(category, locale),
1877                                          &PL_setlocale_buf,
1878                                          &PL_setlocale_bufsize);
1879     gwLOCALE_UNLOCK;
1880
1881     return retval;
1882 }
1883
1884 #  endif
1885
1886 /*===========================================================================*/
1887 #elif   defined(USE_LOCALE_THREADS)                 \
1888    && ! defined(USE_THREAD_SAFE_LOCALE)
1889
1890    /* Here, there are threads, and there is no support for thread-safe
1891     * operation.  This is a dangerous situation, which perl is documented as
1892     * not supporting, but it arises in practice.  We can do a modicum of
1893     * automatic mitigation by making sure there is a per-thread return from
1894     * setlocale(), and that a mutex protects it from races */
1895
1896 #  define querylocale_r(cat)                                                \
1897                       mortalized_pv_copy(less_dicey_setlocale_r(cat, NULL))
1898 #  define querylocale_c(cat)  querylocale_r(cat)
1899 #  define querylocale_i(i)    querylocale_r(categories[i])
1900
1901 STATIC const char *
1902 S_less_dicey_setlocale_r(pTHX_ const int category, const char * locale)
1903 {
1904     const char * retval;
1905
1906     PERL_ARGS_ASSERT_LESS_DICEY_SETLOCALE_R;
1907
1908     STDIZED_SETLOCALE_LOCK;
1909
1910     retval = save_to_buffer(stdized_setlocale(category, locale),
1911                             &PL_less_dicey_locale_buf,
1912                             &PL_less_dicey_locale_bufsize);
1913
1914     STDIZED_SETLOCALE_UNLOCK;
1915
1916     return retval;
1917 }
1918
1919 /*---------------------------------------------------------------------------*/
1920
1921 #  define bool_setlocale_r(cat, locale)                                     \
1922                                less_dicey_bool_setlocale_r(cat, locale)
1923 #  define bool_setlocale_i(i, locale)                                       \
1924                                 bool_setlocale_r(categories[i], locale)
1925 #  define bool_setlocale_c(cat, locale) bool_setlocale_r(cat, locale)
1926
1927 STATIC bool
1928 S_less_dicey_bool_setlocale_r(pTHX_ const int cat, const char * locale)
1929 {
1930     bool retval;
1931
1932     PERL_ARGS_ASSERT_LESS_DICEY_BOOL_SETLOCALE_R;
1933
1934     /* Unlikely, but potentially possible that another thread could zap the
1935      * buffer from true to false or vice-versa, so need to lock here */
1936     POSIX_SETLOCALE_LOCK;
1937     retval = cBOOL(posix_setlocale(cat, locale));
1938     POSIX_SETLOCALE_UNLOCK;
1939
1940     return retval;
1941 }
1942
1943 /*---------------------------------------------------------------------------*/
1944
1945 #  define void_setlocale_r_with_caller(cat, locale, file, line)             \
1946      STMT_START {                                                           \
1947         if (! bool_setlocale_r(cat, locale))                                \
1948             setlocale_failure_panic_via_i(get_category_index(cat),          \
1949                                           NULL, locale, __LINE__, 0,        \
1950                                           file, line);                      \
1951      } STMT_END
1952
1953 #  define void_setlocale_c_with_caller(cat, locale, file, line)             \
1954                     void_setlocale_r_with_caller(cat, locale, file, line)
1955
1956 #  define void_setlocale_i_with_caller(i, locale, file, line)               \
1957           void_setlocale_r_with_caller(categories[i], locale, file, line)
1958
1959 #  define void_setlocale_r(cat, locale)                                     \
1960             void_setlocale_r_with_caller(cat, locale, __FILE__, __LINE__)
1961 #  define void_setlocale_c(cat, locale) void_setlocale_r(cat, locale)
1962 #  define void_setlocale_i(i, locale)   void_setlocale_r(categories[i], locale)
1963
1964 /*===========================================================================*/
1965
1966 #elif defined(USE_POSIX_2008_LOCALE)
1967 #  ifndef LC_ALL
1968 #    error This code assumes that LC_ALL is available on a system modern enough to have POSIX 2008
1969 #  endif
1970
1971 /* Here, there is a completely different API to get thread-safe locales.  We
1972  * emulate the setlocale() API with our own function(s).  setlocale categories,
1973  * like LC_NUMERIC, are not valid here for the POSIX 2008 API.  Instead, there
1974  * are equivalents, like LC_NUMERIC_MASK, which we use instead, which we find
1975  * by table lookup. */
1976
1977 #  if defined(__GLIBC__) && defined(USE_LOCALE_MESSAGES)
1978             /* https://sourceware.org/bugzilla/show_bug.cgi?id=24936 */
1979 #    define HAS_GLIBC_LC_MESSAGES_BUG
1980 #    include <libintl.h>
1981 #  endif
1982
1983 #  define querylocale_i(i)    querylocale_2008_i(i, __LINE__)
1984 #  define querylocale_c(cat)  querylocale_i(cat##_INDEX_)
1985 #  define querylocale_r(cat)  querylocale_i(get_category_index(cat))
1986
1987 STATIC const char *
1988 S_querylocale_2008_i(pTHX_ const locale_category_index index,
1989                            const line_t caller_line)
1990 {
1991     PERL_ARGS_ASSERT_QUERYLOCALE_2008_I;
1992     assert(index <= LC_ALL_INDEX_);
1993
1994     /* This function returns the name of the locale category given by the input
1995      * 'index' into our parallel tables of them.
1996      *
1997      * POSIX 2008, for some sick reason, chose not to provide a method to find
1998      * the category name of a locale, disregarding a basic linguistic tenet
1999      * that for any object, people will create a name for it.  (The next
2000      * version of the POSIX standard is proposed to fix this.)  Some vendors
2001      * have created a querylocale() function to do this in the meantime.  On
2002      * systems without querylocale(), we have to keep track of what the locale
2003      * has been set to, so that we can return its name so as to emulate
2004      * setlocale().  There are potential problems with this:
2005      *
2006      *  1)  We don't know what calling newlocale() with the locale argument ""
2007      *      actually does.  It gets its values from the program's environment.
2008      *      find_locale_from_environment() is used to work around this.  But it
2009      *      isn't fool-proof.  See the comments for that function for details.
2010      *  2)  It's possible for C code in some library to change the locale
2011      *      without us knowing it, and thus our records become wrong;
2012      *      querylocale() would catch this.  But as of September 2017, there
2013      *      are no occurrences in CPAN of uselocale().  Some libraries do use
2014      *      setlocale(), but that changes the global locale, and threads using
2015      *      per-thread locales will just ignore those changes.
2016      *  3)  Many systems have multiple names for the same locale.  Generally,
2017      *      there is an underlying base name, with aliases that evaluate to it.
2018      *      On some systems, if you set the locale to an alias, and then
2019      *      retrieve the name, you get the alias as expected; but on others you
2020      *      get the base name, not the alias you used.  And sometimes the
2021      *      charade is incomplete.  See
2022      *      https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=269375.
2023      *
2024      *      The code is structured so that the returned locale name when the
2025      *      locale is changed is whatever the result of querylocale() on the
2026      *      new locale is.  This effectively gives the result the system
2027      *      expects.  Without querylocale, the name returned is always the
2028      *      input name.  Theoretically this could cause problems, but khw knows
2029      *      of none so far, but mentions it here in case you are trying to
2030      *      debug something.  (This could be worked around by messing with the
2031      *      global locale temporarily, using setlocale() to get the base name;
2032      *      but that could cause a race.  The comments for
2033      *      find_locale_from_environment() give details on the potential race.)
2034      */
2035
2036     const locale_t cur_obj = uselocale((locale_t) 0);
2037     const char * retval;
2038
2039     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "querylocale_2008_i(%s) on %p;"
2040                                            " called from %" LINE_Tf "\n",
2041                                            category_names[index], cur_obj,
2042                                            caller_line));
2043
2044     if (UNLIKELY(cur_obj == LC_GLOBAL_LOCALE)) {
2045
2046         /* Even on platforms that have querylocale(), it is unclear if they
2047          * work in the global locale, and we have the means to get the correct
2048          * answer anyway.  khw is unsure this situation even comes up these
2049          * days, hence the branch prediction */
2050         POSIX_SETLOCALE_LOCK;
2051         retval = mortalized_pv_copy(posix_setlocale(categories[index], NULL));
2052         POSIX_SETLOCALE_UNLOCK;
2053     }
2054
2055     /* Here we have handled the case of the the current locale being the global
2056      * one.  Below is the 'else' case of that.  There are two different
2057      * implementations, depending on USE_PL_CURLOCALES */
2058
2059 #  ifdef USE_PL_CURLOCALES
2060
2061     else {
2062
2063         /* PL_curlocales[] is kept up-to-date for all categories except LC_ALL,
2064          * which may have been invalidated by setting it to NULL, and if so,
2065          * should now be calculated.  (The called function updates that
2066          * element.) */
2067         if (index == LC_ALL_INDEX_ && PL_curlocales[LC_ALL_INDEX_] == NULL) {
2068             calculate_LC_ALL_string((const char **) &PL_curlocales,
2069                                     INTERNAL_FORMAT,
2070                                     WANT_VOID,
2071                                     caller_line);
2072         }
2073
2074         if (cur_obj == PL_C_locale_obj) {
2075
2076             /* If the current locale object is the C object, then the answer is
2077              * "C" or POSIX, regardless of the category.  Handling this
2078              * reasonably likely case specially shortcuts extra effort, and
2079              * hides some bugs from us in OS's that alias other locales to C,
2080              * but do so incompletely.  If our records say it is POSIX, use
2081              * that; otherwise use C.  See
2082              * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=269375 */
2083             retval = mortalized_pv_copy((strEQ(PL_curlocales[index], "POSIX"))
2084                                         ? "POSIX"
2085                                         : "C");
2086         }
2087         else {
2088             retval = mortalized_pv_copy(PL_curlocales[index]);
2089         }
2090     }
2091
2092 #  else
2093
2094     /* Below is the implementation of the 'else' clause which handles the case
2095      * of the current locale not being the global one on platforms where
2096      * USE_PL_CURLOCALES is NOT in effect.  That means the system must have
2097      * some form of querylocale.  But these have varying characteristics, so
2098      * first create some #defines to make the actual 'else' clause uniform.
2099      *
2100      * First, glibc has a function that implements querylocale(), but is called
2101      * something else, and takes the category number; the others take the mask.
2102      * */
2103 #    if defined(USE_QUERYLOCALE) && (   defined(_NL_LOCALE_NAME)            \
2104                                      && defined(HAS_NL_LANGINFO_L))
2105 #      define my_querylocale(index, cur_obj)                                \
2106                 nl_langinfo_l(_NL_LOCALE_NAME(categories[index]), cur_obj)
2107
2108        /* Experience so far shows it is thread-safe, as well as glibc's
2109         * nl_langinfo_l(), so unless overridden, mark it so */
2110 #      ifdef NO_THREAD_SAFE_QUERYLOCALE
2111 #        undef HAS_THREAD_SAFE_QUERYLOCALE
2112 #      else
2113 #        define HAS_THREAD_SAFE_QUERYLOCALE
2114 #      endif
2115 #    else   /* below, ! glibc */
2116
2117        /* Otherwise, use the system's querylocale(). */
2118 #      define my_querylocale(index, cur_obj)                                \
2119                                querylocale(category_masks[index], cur_obj)
2120
2121        /* There is no standard for this function, and khw has never seen
2122         * anything beyond minimal vendor documentation, lacking important
2123         * details.  Experience has shown that some implementations have race
2124         * condiions, and their returns may not be thread safe.  It would be
2125         * unreliable to test for complete thread safety in Configure.  What we
2126         * do instead is to assume that it is thread-safe, unless overriden by,
2127         * say, a hints file specifying
2128         * -Accflags='-DNO_THREAD_SAFE_QUERYLOCALE */
2129 #      ifdef NO_THREAD_SAFE_QUERYLOCALE
2130 #        undef HAS_THREAD_SAFE_QUERYLOCALE
2131 #      else
2132 #        define HAS_THREAD_SAFE_QUERYLOCALE
2133 #      endif
2134 #    endif
2135
2136      /* Here, we have set up enough information to know if this querylocale()
2137       * is thread-safe, or needs to use a mutex */
2138 #    ifdef HAS_THREAD_SAFE_QUERYLOCALE
2139 #      define QUERYLOCALE_LOCK
2140 #      define QUERYLOCALE_UNLOCK
2141 #    else
2142 #      define QUERYLOCALE_LOCK    gwLOCALE_LOCK
2143 #      define QUERYLOCALE_UNLOCK  gwLOCALE_UNLOCK
2144 #    endif
2145
2146     /* Finally, everything is ready, so here is the 'else' clause to implement
2147      * the case of the current locale not being the global one on systems that
2148      * have some form of querylocale().  (POSIX will presumably eventually
2149      * publish their next version in their pipeline, which will define a
2150      * precisely specified querylocale equivalent, and there can be a new
2151      * #ifdef to use it without having to guess at its characteristics) */
2152
2153     else {
2154         /* We don't keep records when there is querylocale(), so as to avoid the
2155          * pitfalls mentioned at the beginning of this function.
2156          *
2157          * That means LC_ALL has to be calculated from all its constituent
2158          * categories each time, since the querylocale() forms on many (if not
2159          * all) platforms only work on individual categories */
2160         if (index == LC_ALL_INDEX_) {
2161             retval = calculate_LC_ALL_string(NULL, INTERNAL_FORMAT,
2162                                              WANT_TEMP_PV,
2163                                              caller_line);
2164         }
2165         else {
2166
2167             QUERYLOCALE_LOCK;
2168             retval = savepv(my_querylocale(index, cur_obj));
2169             QUERYLOCALE_UNLOCK;
2170
2171             /* querylocale() may conflate the C locale with something that
2172              * isn't exactly the same.  See for example
2173              * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=269375
2174              * We know that if the locale object is the C one, we
2175              * are in the C locale, which may go by the name POSIX, as both, by
2176              * definition, are equivalent.  But we consider any other name
2177              * spurious, so override with "C".  As in the PL_CURLOCALES case
2178              * above, this hides those glitches, for the most part, from the
2179              * rest of our code.  (The code is ordered this way so that if the
2180              * system distinugishes "C" from "POSIX", we do too.) */
2181             if (cur_obj == PL_C_locale_obj && ! isNAME_C_OR_POSIX(retval)) {
2182                 Safefree(retval);
2183                 retval = savepv("C");
2184             }
2185
2186             SAVEFREEPV(retval);
2187         }
2188     }
2189
2190 #    undef QUERYLOCALE_LOCK
2191 #    undef QUERYLOCALE_UNLOCK
2192 #  endif
2193
2194     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2195                            "querylocale_2008_i(%s) returning '%s'\n",
2196                            category_names[index], retval));
2197     assert(strNE(retval, ""));
2198     return retval;
2199 }
2200
2201 /*---------------------------------------------------------------------------*/
2202
2203 #  define bool_setlocale_i(i, locale)                                       \
2204                               bool_setlocale_2008_i(i, locale, __LINE__)
2205 #  define bool_setlocale_c(cat, locale)                                     \
2206                                   bool_setlocale_i(cat##_INDEX_, locale)
2207 #  define bool_setlocale_r(cat, locale)                                     \
2208                        bool_setlocale_i(get_category_index(cat), locale)
2209
2210 /* If this doesn't exist on this platform, make it a no-op (to save #ifdefs) */
2211 #  ifndef update_PL_curlocales_i
2212 #    define update_PL_curlocales_i(index, new_locale, caller_line)
2213 #  endif
2214
2215 STATIC bool
2216 S_bool_setlocale_2008_i(pTHX_
2217
2218         /* Our internal index of the 'category' setlocale is called with */
2219         const locale_category_index  index,
2220         const char * new_locale,    /* The locale to set the category to */
2221         const line_t caller_line    /* Called from this line number */
2222        )
2223 {
2224     PERL_ARGS_ASSERT_BOOL_SETLOCALE_2008_I;
2225     assert(index <= LC_ALL_INDEX_);
2226
2227     /* This function effectively performs a setlocale() on just the current
2228      * thread; thus it is thread-safe.  It does this by using the POSIX 2008
2229      * locale functions to emulate the behavior of setlocale().  Similar to
2230      * regular setlocale(), the return from this function points to memory that
2231      * can be overwritten by other system calls, so needs to be copied
2232      * immediately if you need to retain it.  The difference here is that
2233      * system calls besides another setlocale() can overwrite it.
2234      *
2235      * By doing this, most locale-sensitive functions become thread-safe.  The
2236      * exceptions are mostly those that return a pointer to static memory.
2237      */
2238
2239     int mask = category_masks[index];
2240     const locale_t entry_obj = uselocale((locale_t) 0);
2241     const char * locale_on_entry = querylocale_i(index);
2242
2243     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2244                            "bool_setlocale_2008_i: input=%d (%s), mask=0x%x,"
2245                            " new locale=\"%s\", current locale=\"%s\","
2246                            " index=%d, entry object=%p;"
2247                            " called from %" LINE_Tf "\n",
2248                            categories[index], category_names[index], mask,
2249                            ((new_locale == NULL) ? "(nil)" : new_locale),
2250                            locale_on_entry, index, entry_obj, caller_line));
2251
2252     /* Here, trying to change the locale, but it is a no-op if the new boss is
2253      * the same as the old boss.  Except this routine is called when converting
2254      * from the global locale, so in that case we will create a per-thread
2255      * locale below (with the current values).  It also seemed that newlocale()
2256      * could free up the basis locale memory if we called it with the new and
2257      * old being the same, but khw now thinks that this was due to some other
2258      * bug, since fixed, as there are other places where newlocale() gets
2259      * similarly called without problems. */
2260     if (   entry_obj != LC_GLOBAL_LOCALE
2261         && locale_on_entry
2262         && strEQ(new_locale, locale_on_entry))
2263     {
2264         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2265                                "bool_setlocale_2008_i: no-op to change to"
2266                                " what it already was\n"));
2267         return true;
2268     }
2269
2270 #  ifndef USE_QUERYLOCALE
2271
2272     /* Without a querylocale() mechanism, we have to figure out ourselves what
2273      * happens with setting a locale to "" */
2274
2275     if (strEQ(new_locale, "")) {
2276         new_locale = find_locale_from_environment(index);
2277         if (! new_locale) {
2278             SET_EINVAL;
2279             return false;
2280         }
2281     }
2282
2283 #  endif
2284 #  ifdef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
2285
2286     const bool need_loop = false;
2287
2288 #  else
2289
2290     bool need_loop = false;
2291     const char * new_locales[LC_ALL_INDEX_] = { NULL };
2292
2293     /* If we're going to have to parse the LC_ALL string, might as well do it
2294      * now before we have made changes that we would have to back out of if the
2295      * parse fails */
2296     if (index == LC_ALL_INDEX_) {
2297         switch (parse_LC_ALL_string(new_locale,
2298                                     (const char **) &new_locales,
2299                                     override_if_ignored,
2300                                     false,    /* Return only [0] if suffices */
2301                                     false,    /* Don't panic on error */
2302                                     caller_line))
2303         {
2304           case invalid:
2305             SET_EINVAL;
2306             return false;
2307
2308           case no_array:
2309             need_loop = false;
2310             break;
2311
2312           case only_element_0:
2313             SAVEFREEPV(new_locales[0]);
2314             new_locale = new_locales[0];
2315             need_loop = false;
2316             break;
2317
2318           case full_array:
2319             need_loop = true;
2320             break;
2321         }
2322     }
2323
2324 #  endif
2325 #  ifdef HAS_GLIBC_LC_MESSAGES_BUG
2326
2327     /* For this bug, if the LC_MESSAGES locale changes, we have to do an
2328      * expensive workaround.  Save the current value so we can later determine
2329      * if it changed. */
2330     const char * old_messages_locale = NULL;
2331     if (   (index == LC_MESSAGES_INDEX_ || index == LC_ALL_INDEX_)
2332         &&  LIKELY(PL_phase != PERL_PHASE_CONSTRUCT))
2333     {
2334         old_messages_locale = querylocale_c(LC_MESSAGES);
2335     }
2336
2337 #  endif
2338
2339     assert(PL_C_locale_obj);
2340
2341     /* Now ready to switch to the input 'new_locale' */
2342
2343     /* Switching locales generally entails freeing the current one's space (at
2344      * the C library's discretion), hence we can't be using that locale at the
2345      * time of the switch (this wasn't obvious to khw from the man pages).  So
2346      * switch to a known locale object that we don't otherwise mess with. */
2347     if (! uselocale(PL_C_locale_obj)) {
2348
2349         /* Not being able to change to the C locale is severe; don't keep
2350          * going.  */
2351         setlocale_failure_panic_i(index, locale_on_entry, "C",
2352                                   __LINE__, caller_line);
2353         NOT_REACHED; /* NOTREACHED */
2354     }
2355
2356     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2357                            "bool_setlocale_2008_i: now using C"
2358                            " object=%p\n", PL_C_locale_obj));
2359
2360     /* These two objects are special:
2361      *  LC_GLOBAL_LOCALE    because it is undefined behavior to call
2362      *                      newlocale() with it as a parameter.
2363      *  PL_C_locale_obj     because newlocale() generally destroys its locale
2364      *                      object parameter when it succeeds; and we don't
2365      *                      want that happening to this immutable object.
2366      * Copies will be made for them to use instead if we get so far as to call
2367      * newlocale(). */
2368     bool entry_obj_is_special = (   entry_obj == LC_GLOBAL_LOCALE
2369                                  || entry_obj == PL_C_locale_obj);
2370     locale_t new_obj;
2371
2372     /* PL_C_locale_obj is LC_ALL set to the C locale.  If this call is to
2373      * switch to LC_ALL => C, simply use that object.  But in fact, we already
2374      * have switched to it just above, in preparation for the general case.
2375      * Since we're already there, no need to do further switching. */
2376     if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(new_locale)) {
2377         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2378                                "bool_setlocale_2008_i: will stay in C"
2379                                " object\n"));
2380         new_obj = PL_C_locale_obj;
2381
2382         /* 'entry_obj' is now dangling, of no further use to anyone (unless it
2383          * is one of the special ones).  Free it to avoid a leak */
2384         if (! entry_obj_is_special) {
2385             freelocale(entry_obj);
2386         }
2387
2388         update_PL_curlocales_i(index, new_locale, caller_line);
2389     }
2390     else {  /* Here is the general case, not to LC_ALL => C */
2391
2392         /* The newlocale() call(s) below take a basis object to build upon to
2393          * create the changed locale, trashing it iff successful.
2394          *
2395          * For the objects that are not to be modified by this function, we
2396          * create a duplicate that gets trashed instead.
2397          *
2398          * Also if we will have to loop doing multiple newlocale()s, there is a
2399          * chance we will succeed for the first few, and then fail, having to
2400          * back out.  We need to duplicate 'entry_obj' in this case as well, so
2401          * it remains valid as something to back out to. */
2402         locale_t basis_obj = entry_obj;
2403
2404         if (entry_obj_is_special || need_loop) {
2405             basis_obj = duplocale(basis_obj);
2406             if (! basis_obj) {
2407                 locale_panic_via_("duplocale failed", __FILE__, caller_line);
2408                 NOT_REACHED; /* NOTREACHED */
2409             }
2410
2411             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2412                                    "bool_setlocale_2008_i created %p by"
2413                                    " duping the input\n", basis_obj));
2414         }
2415
2416 #  define DEBUG_NEW_OBJECT_CREATED(category, locale, new, old, caller_line) \
2417       DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                \
2418                              "bool_setlocale_2008_i(%s, %s): created %p"    \
2419                              " while freeing %p; called from %" LINE_Tf     \
2420                              " via %" LINE_Tf "\n",                         \
2421                              category, locale, new, old,                    \
2422                              caller_line, __LINE__))
2423 #  define DEBUG_NEW_OBJECT_FAILED(category, locale, basis_obj)              \
2424       DEBUG_L(PerlIO_printf(Perl_debug_log,                                 \
2425                             "bool_setlocale_2008_i: creating new object"    \
2426                             " for (%s '%s') from %p failed; called from %"  \
2427                             LINE_Tf " via %" LINE_Tf "\n",                  \
2428                             category, locale, basis_obj,                    \
2429                             caller_line, __LINE__));
2430
2431         /* Ready to create a new locale by modification of the existing one.
2432          *
2433          * NOTE: This code may incorrectly show up as a leak under the address
2434          * sanitizer. We do not free this object under normal teardown, however
2435          * you can set PERL_DESTRUCT_LEVEL=2 to cause it to be freed.
2436          */
2437
2438 #  ifdef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
2439
2440         /* Some platforms have a newlocale() that can handle disparate LC_ALL
2441          * input, so on these a single call to newlocale() always works */
2442 #  else
2443
2444         /* If a single call to newlocale() will do */
2445         if (! need_loop)
2446
2447 #  endif
2448
2449         {
2450             new_obj = newlocale(mask,
2451                                 override_ignored_category(index, new_locale),
2452                                 basis_obj);
2453             if (! new_obj) {
2454                 DEBUG_NEW_OBJECT_FAILED(category_names[index], new_locale,
2455                                         basis_obj);
2456
2457                 /* Since the call failed, it didn't trash 'basis_obj', which is
2458                  * a dup for these objects, and hence would leak if we don't
2459                  * free it.  XXX However, something is seriously wrong if we
2460                  * can't switch to C or the global locale, so maybe should
2461                  * panic instead */
2462                 if (entry_obj_is_special) {
2463                     freelocale(basis_obj);
2464                 }
2465
2466                 goto must_restore_state;
2467             }
2468
2469             DEBUG_NEW_OBJECT_CREATED(category_names[index], new_locale,
2470                                      new_obj, basis_obj, caller_line);
2471
2472             update_PL_curlocales_i(index, new_locale, caller_line);
2473         }
2474
2475 #  ifndef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
2476
2477         else {  /* Need multiple newlocale() calls */
2478
2479             /* Loop through the individual categories, setting the locale of
2480              * each to the corresponding name previously populated into
2481              * newlocales[].  Each iteration builds on the previous one, adding
2482              * its category to what's already been calculated, and taking as a
2483              * basis for what's been calculated 'basis_obj', which is updated
2484              * each iteration to be the result of the previous one.  Upon
2485              * success, newlocale() trashes the 'basis_obj' parameter to it.
2486              * If any iteration fails, we immediately give up, restore the
2487              * locale to what it was at the time this function was called
2488              * (saved in 'entry_obj'), and return failure. */
2489
2490             /* Loop, using the previous iteration's result as the basis for the
2491              * next one.  (The first time we effectively use the locale in
2492              * force upon entry to this function.) */
2493             for_all_individual_category_indexes(i) {
2494                 new_obj = newlocale(category_masks[i],
2495                                     new_locales[i],
2496                                     basis_obj);
2497                 if (new_obj) {
2498                     DEBUG_NEW_OBJECT_CREATED(category_names[i],
2499                                              new_locales[i],
2500                                              new_obj, basis_obj,
2501                                              caller_line);
2502                     basis_obj = new_obj;
2503                     continue;
2504                 }
2505
2506                 /* Failed.  Likely this is because the proposed new locale
2507                  * isn't valid on this system. */
2508
2509                 DEBUG_NEW_OBJECT_FAILED(category_names[i],
2510                                         new_locales[i],
2511                                         basis_obj);
2512
2513                 /* newlocale() didn't trash this, since the function call
2514                  * failed */
2515                 freelocale(basis_obj);
2516
2517                 for_all_individual_category_indexes(j) {
2518                     Safefree(new_locales[j]);
2519                 }
2520
2521                 goto must_restore_state;
2522             }
2523
2524             /* Success for all categories. */
2525             for_all_individual_category_indexes(i) {
2526                 update_PL_curlocales_i(i, new_locales[i], caller_line);
2527                 Safefree(new_locales[i]);
2528             }
2529
2530             /* We dup'd entry_obj in case we had to fall back to it.  The
2531              * newlocale() above destroyed the dup when it first succeeded, but
2532              * entry_obj itself is left dangling, so free it */
2533             if (! entry_obj_is_special) {
2534                 freelocale(entry_obj);
2535             }
2536         }
2537
2538 #  endif    /* End of newlocale can't handle disparate LC_ALL input */
2539
2540     }
2541
2542 #  undef DEBUG_NEW_OBJECT_CREATED
2543 #  undef DEBUG_NEW_OBJECT_FAILED
2544
2545     /* Here, successfully created an object representing the desired locale;
2546      * now switch into it */
2547     if (! uselocale(new_obj)) {
2548         freelocale(new_obj);
2549         locale_panic_(Perl_form(aTHX_ "(called from %" LINE_Tf "):"
2550                                       " bool_setlocale_2008_i: switching"
2551                                       " into new locale failed",
2552                                       caller_line));
2553     }
2554
2555     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2556                            "bool_setlocale_2008_i: now using %p\n", new_obj));
2557
2558 #  ifdef MULTIPLICITY   /* Unlikely, but POSIX 2008 functions could be
2559                            Configured to be used on unthreaded perls, in which
2560                            case this object doesn't exist */
2561
2562     if (DEBUG_Lv_TEST) {
2563         if (PL_cur_locale_obj != new_obj) {
2564             PerlIO_printf(Perl_debug_log,
2565                           "bool_setlocale_2008_i: PL_cur_locale_obj"
2566                           " was %p, now is %p\n",
2567                           PL_cur_locale_obj, new_obj);
2568         }
2569     }
2570
2571     /* Update the current object */
2572     PL_cur_locale_obj = new_obj;
2573
2574 #  endif
2575 #  ifdef HAS_GLIBC_LC_MESSAGES_BUG
2576
2577     /* Invalidate the glibc cache of loaded translations if the locale has
2578      * changed, see [perl #134264] and
2579      * https://sourceware.org/bugzilla/show_bug.cgi?id=24936 */
2580     if (old_messages_locale) {
2581         if (strNE(old_messages_locale, querylocale_c(LC_MESSAGES))) {
2582             textdomain(textdomain(NULL));
2583         }
2584     }
2585
2586 #  endif
2587
2588     return true;
2589
2590   must_restore_state:
2591
2592     /* We earlier switched to the LC_ALL => C locale in anticipation of it
2593      * succeeding,  Now have to switch back to the state upon entry.  */
2594     if (! uselocale(entry_obj)) {
2595         setlocale_failure_panic_i(index, "switching back to",
2596                                   locale_on_entry, __LINE__, caller_line);
2597     }
2598
2599     return false;
2600 }
2601
2602 /*---------------------------------------------------------------------------*/
2603
2604 #  define void_setlocale_i_with_caller(i, locale, file, line)               \
2605      STMT_START {                                                           \
2606         if (! bool_setlocale_i(i, locale))                                  \
2607             setlocale_failure_panic_via_i(i, NULL, locale, __LINE__, 0,     \
2608                                           file, line);                      \
2609      } STMT_END
2610
2611 #  define void_setlocale_r_with_caller(cat, locale, file, line)             \
2612         void_setlocale_i_with_caller(get_category_index(cat), locale,       \
2613                                      file, line)
2614
2615 #  define void_setlocale_c_with_caller(cat, locale, file, line)             \
2616             void_setlocale_i_with_caller(cat##_INDEX_, locale, file, line)
2617
2618 #  define void_setlocale_i(i, locale)                                       \
2619                 void_setlocale_i_with_caller(i, locale, __FILE__, __LINE__)
2620 #  define void_setlocale_c(cat, locale)                                     \
2621                                   void_setlocale_i(cat##_INDEX_, locale)
2622 #  define void_setlocale_r(cat, locale)                                     \
2623                   void_setlocale_i(get_category_index(cat), locale)
2624
2625 /*===========================================================================*/
2626
2627 #else
2628 #  error Unexpected Configuration
2629 #endif   /* End of the various implementations of the setlocale and
2630             querylocale macros used in the remainder of this program */
2631
2632 /* query_nominal_locale_i() is used when the caller needs the locale that an
2633  * external caller would be expecting, and not what we're secretly using
2634  * behind the scenes.  It deliberately doesn't handle LC_ALL; use
2635  * calculate_LC_ALL_string() for that. */
2636 #ifdef USE_LOCALE_NUMERIC
2637 #  define query_nominal_locale_i(i)                                         \
2638       (__ASSERT_(i != LC_ALL_INDEX_)                                        \
2639        ((i == LC_NUMERIC_INDEX_) ? PL_numeric_name : querylocale_i(i)))
2640 #else
2641 #  define query_nominal_locale_i(i)                                         \
2642       (__ASSERT_(i != LC_ALL_INDEX_) querylocale_i(i))
2643 #endif
2644
2645 #ifdef USE_PL_CURLOCALES
2646
2647 STATIC void
2648 S_update_PL_curlocales_i(pTHX_
2649                          const locale_category_index index,
2650                          const char * new_locale,
2651                          const line_t caller_line)
2652 {
2653     /* Update PL_curlocales[], which is parallel to the other ones indexed by
2654      * our mapping of libc category number to our internal equivalents. */
2655
2656     PERL_ARGS_ASSERT_UPDATE_PL_CURLOCALES_I;
2657     assert(index <= LC_ALL_INDEX_);
2658
2659     if (index == LC_ALL_INDEX_) {
2660
2661         /* For LC_ALL, we change all individual categories to correspond,
2662          * including the LC_ALL element */
2663         for (unsigned int i = 0; i <= LC_ALL_INDEX_; i++) {
2664             Safefree(PL_curlocales[i]);
2665             PL_curlocales[i] = NULL;
2666         }
2667
2668         switch (parse_LC_ALL_string(new_locale,
2669                                     (const char **) &PL_curlocales,
2670                                     check_that_overridden,  /* things should
2671                                                                have already
2672                                                                been overridden
2673                                                                */
2674                                     true,   /* Always fill array */
2675                                     true,   /* Panic if fails, as to get here
2676                                                it earlier had to have succeeded
2677                                                */
2678                                    caller_line))
2679         {
2680           case invalid:
2681           case no_array:
2682           case only_element_0:
2683             locale_panic_via_("Unexpected return from parse_LC_ALL_string",
2684                               __FILE__, caller_line);
2685
2686           case full_array:
2687             /* parse_LC_ALL_string() has already filled PL_curlocales properly,
2688              * except for the LC_ALL element, which should be set to
2689              * 'new_locale'. */
2690             PL_curlocales[LC_ALL_INDEX_] = savepv(new_locale);
2691         }
2692     }
2693     else {  /* Not LC_ALL */
2694
2695         /* Update the single category's record */
2696         Safefree(PL_curlocales[index]);
2697         PL_curlocales[index] = savepv(new_locale);
2698
2699         /* Invalidate LC_ALL */
2700         Safefree(PL_curlocales[LC_ALL_INDEX_]);
2701         PL_curlocales[LC_ALL_INDEX_] = NULL;
2702     }
2703 }
2704
2705 #  endif  /* Need PL_curlocales[] */
2706
2707 /*===========================================================================*/
2708
2709 #if defined(USE_LOCALE)
2710
2711 /* This paradigm is needed in several places in the function below.  We have to
2712  * substitute the nominal locale for LC_NUMERIC when returning a value for
2713  * external consumption */
2714 #  ifndef USE_LOCALE_NUMERIC
2715 #    define ENTRY(i, array, format)  array[i]
2716 #  else
2717 #    define ENTRY(i, array, format)                         \
2718        (UNLIKELY(   format == EXTERNAL_FORMAT_FOR_QUERY     \
2719                  && i == LC_NUMERIC_INDEX_)                 \
2720         ? PL_numeric_name                                   \
2721         : array[i])
2722 #  endif
2723
2724 STATIC
2725 const char *
2726 S_calculate_LC_ALL_string(pTHX_ const char ** category_locales_list,
2727                                 const calc_LC_ALL_format format,
2728                                 const calc_LC_ALL_return returning,
2729                                 const line_t caller_line)
2730 {
2731     PERL_ARGS_ASSERT_CALCULATE_LC_ALL_STRING;
2732
2733     /* NOTE: On Configurations that have PL_curlocales[], this function has the
2734      * side effect of updating the LC_ALL_INDEX_ element with its result.
2735      *
2736      * This function calculates a string that defines the locale(s) LC_ALL is
2737      * set to, in either:
2738      *  1)  Our internal format if 'format' is set to INTERNAL_FORMAT.
2739      *  2)  The external format returned by Perl_setlocale() if 'format' is set
2740      *      to EXTERNAL_FORMAT_FOR_QUERY or EXTERNAL_FORMAT_FOR_SET.
2741      *
2742      *      These two are distinguished by:
2743      *       a) EXTERNAL_FORMAT_FOR_SET returns the actual locale currently in
2744      *          effect.
2745      *       b) EXTERNAL_FORMAT_FOR_QUERY returns the nominal locale.
2746      *          Currently this can differ only from the actual locale in the
2747      *          LC_NUMERIC category when it is set to a locale whose radix is
2748      *          not a dot.  (The actual locale is kept as a dot to accommodate
2749      *          the large corpus of XS code that expects it to be that;
2750      *          switched to a non-dot temporarily during certain operations
2751      *          that require the actual radix.)
2752      *
2753      * In both 1) and 2), LC_ALL's values are passed to this function by
2754      * 'category_locales_list' which is either:
2755      *  1) a pointer to an array of strings with up-to-date values of all the
2756      *     individual categories; or
2757      *  2) NULL, to indicate to use querylocale_i() to get each individual
2758      *     value.
2759      *
2760      * The caller sets 'returning' to
2761      *      WANT_TEMP_PV        the function returns the calculated string
2762      *                              as a mortalized temporary, so the caller
2763      *                              doesn't have to worry about it being
2764      *                              per-thread, nor needs to arrange for its
2765      *                              clean-up.
2766      *      WANT_PL_setlocale_buf  the function stores the calculated string
2767      *                              into the per-thread buffer PL_setlocale_buf
2768      *                              and returns a pointer to that.  The buffer
2769      *                              is cleaned up automatically in process
2770      *                              destruction.  This return method avoids
2771      *                              extra copies in some circumstances.
2772      *      WANT_VOID           NULL is returned.  This is used when the
2773      *                              function is being called only for its side
2774      *                              effect of updating
2775      *                              PL_curlocales[LC_ALL_INDEX_]
2776      *
2777      * querylocale(), on systems that have it, doesn't tend to work for LC_ALL.
2778      * So we have to construct the answer ourselves based on the passed in
2779      * data.
2780      *
2781      * If all individual categories are the same locale, we can just set LC_ALL
2782      * to that locale.  But if not, we have to create an aggregation of all the
2783      * categories on the system.  Platforms differ as to the syntax they use
2784      * for these non-uniform locales for LC_ALL.  Some, like glibc and Windows,
2785      * use an unordered series of name=value pairs, like
2786      *      LC_NUMERIC=C;LC_TIME=en_US.UTF-8;...
2787      * to specify LC_ALL; others, like *BSD, use a positional notation with a
2788      * delimitter, typically a single '/' character:
2789      *      C/en_UK.UTF-8/...
2790      *
2791      * When the external format is desired, this function returns whatever the
2792      * system expects.  The internal format is always name=value pairs.
2793      *
2794      * For systems that have categories we don't know about, the algorithm
2795      * below won't know about those missing categories, leading to potential
2796      * bugs for code that looks at them.  If there is an environment variable
2797      * that sets that category, we won't know to look for it, and so our use of
2798      * LANG or "C" improperly overrides it.  On the other hand, if we don't do
2799      * what is done here, and there is no environment variable, the category's
2800      * locale should be set to LANG or "C".  So there is no good solution.  khw
2801      * thinks the best is to make sure we have a complete list of possible
2802      * categories, adding new ones as they show up on obscure platforms.
2803      */
2804
2805     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2806                            "Entering calculate_LC_ALL_string(%s);"
2807                            " called from %" LINE_Tf "\n",
2808                            ((format == EXTERNAL_FORMAT_FOR_QUERY)
2809                             ? "EXTERNAL_FORMAT_FOR_QUERY"
2810                             : ((format == EXTERNAL_FORMAT_FOR_SET)
2811                                ? "EXTERNAL_FORMAT_FOR_SET"
2812                                : "INTERNAL_FORMAT")),
2813                            caller_line));
2814
2815     bool input_list_was_NULL = (category_locales_list == NULL);
2816
2817     /* If there was no input category list, construct a temporary one
2818      * ourselves. */
2819     const char * my_category_locales_list[LC_ALL_INDEX_];
2820     const char ** locales_list = category_locales_list;
2821     if (locales_list == NULL) {
2822         locales_list = my_category_locales_list;
2823
2824         if (format == EXTERNAL_FORMAT_FOR_QUERY) {
2825             for_all_individual_category_indexes(i) {
2826                 locales_list[i] = query_nominal_locale_i(i);
2827             }
2828         }
2829         else {
2830             for_all_individual_category_indexes(i) {
2831                 locales_list[i] = querylocale_i(i);
2832             }
2833         }
2834     }
2835
2836     /* While we are calculating LC_ALL, we see if every category's locale is
2837      * the same as every other's or not. */
2838 #  ifndef HAS_IGNORED_LOCALE_CATEGORIES_
2839
2840     /* When we pay attention to all categories, we assume they are all the same
2841      * until proven different */
2842     bool disparate = false;
2843
2844 #  else
2845
2846     /* But if there are ignored categories, those will be set to "C", so try an
2847      * arbitrary category, and if it isn't C, we know immediately that the
2848      * locales are disparate.  (The #if conditionals are to handle the case
2849      * where LC_NUMERIC_INDEX_ is 0.  We don't want to use LC_NUMERIC to
2850      * compare, as that may be different between external and internal forms.)
2851      * */
2852 #    if ! defined(USE_LOCALE_NUMERIC)
2853
2854     bool disparate = ! isNAME_C_OR_POSIX(locales_list[0]);
2855
2856 #    elif LC_NUMERIC_INDEX_ != 0
2857
2858     bool disparate = ! isNAME_C_OR_POSIX(locales_list[0]);
2859
2860 #    else
2861
2862     /* Would need revision to handle the very unlikely case where only a single
2863      * category, LC_NUMERIC, is defined */
2864     assert(LOCALE_CATEGORIES_COUNT_ > 0);
2865
2866     bool disparate = ! isNAME_C_OR_POSIX(locales_list[1]);
2867
2868 #    endif
2869 #  endif
2870
2871     /* Calculate the needed size for the string listing the individual locales.
2872      * Initialize with values known at compile time. */
2873     Size_t total_len;
2874     const char *separator;
2875
2876 #  ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS  /* Positional formatted LC_ALL */
2877     PERL_UNUSED_ARG(format);
2878 #  else
2879
2880     if (format != INTERNAL_FORMAT) {
2881
2882         /* Here, we will be using positional notation.  it includes n-1
2883          * separators */
2884         total_len = (  LOCALE_CATEGORIES_COUNT_ - 1)
2885                      * STRLENs(PERL_LC_ALL_SEPARATOR)
2886                   + 1;   /* And a trailing NUL */
2887         separator = PERL_LC_ALL_SEPARATOR;
2888     }
2889     else
2890
2891 #  endif
2892
2893     {
2894         /* name=value output is always used in internal format, and when
2895          * positional isn't available on the platform. */
2896         total_len = lc_all_boiler_plate_length;
2897         separator = ";";
2898     }
2899
2900     /* The total length then is just the sum of the above boiler-plate plus the
2901      * total strlen()s of the locale name of each individual category. */
2902     for_all_individual_category_indexes(i) {
2903         const char * entry = ENTRY(i, locales_list, format);
2904
2905         total_len += strlen(entry);
2906         if (! disparate && strNE(entry, locales_list[0])) {
2907             disparate = true;
2908         }
2909     }
2910
2911     bool free_if_void_return = false;
2912     const char * retval;
2913
2914     /* If all categories have the same locale, we already know the answer */
2915     if (! disparate) {
2916         if (returning == WANT_PL_setlocale_buf) {
2917             save_to_buffer(locales_list[0],
2918                            &PL_setlocale_buf,
2919                            &PL_setlocale_bufsize);
2920             retval = PL_setlocale_buf;
2921         }
2922         else {
2923
2924             retval = locales_list[0];
2925
2926             /* If a temporary is wanted for the return, and we had to create
2927              * the input list ourselves, we created it into such a temporary,
2928              * so no further work is needed; but otherwise, make a mortal copy
2929              * of this passed-in list element */
2930             if (returning == WANT_TEMP_PV && ! input_list_was_NULL) {
2931                 retval = savepv(retval);
2932                 SAVEFREEPV(retval);
2933             }
2934
2935             /* In all cases here, there's nothing we create that needs to be
2936              * freed, so leave 'free_if_void_return' set to the default
2937              * 'false'. */
2938         }
2939     }
2940     else {  /* Here, not all categories have the same locale */
2941
2942         char * constructed;
2943
2944         /* If returning to PL_setlocale_buf, set up to write directly to it,
2945          * being sure it is resized to be large enough */
2946         if (returning == WANT_PL_setlocale_buf) {
2947             set_save_buffer_min_size(total_len,
2948                                      &PL_setlocale_buf,
2949                                      &PL_setlocale_bufsize);
2950             constructed = PL_setlocale_buf;
2951         }
2952         else {  /* Otherwise we need new memory to hold the calculated value. */
2953
2954             Newx(constructed, total_len, char);
2955
2956             /* If returning the new memory, it must be set up to be freed
2957              * later; otherwise at the end of this function */
2958             if (returning == WANT_TEMP_PV) {
2959                 SAVEFREEPV(constructed);
2960             }
2961             else {
2962                 free_if_void_return = true;
2963             }
2964         }
2965
2966         constructed[0] = '\0';
2967
2968         /* Loop through all the categories */
2969         for_all_individual_category_indexes(j) {
2970
2971             /* Add a separator, except before the first one */
2972             if (j != 0) {
2973                 my_strlcat(constructed, separator, total_len);
2974             }
2975
2976             const char * entry;
2977             Size_t needed_len;
2978             unsigned int i = j;
2979
2980 #  ifndef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
2981
2982             if (UNLIKELY(format != INTERNAL_FORMAT)) {
2983
2984                 /* In positional notation 'j' means the position, and we have
2985                  * to convert to the index 'i' */
2986                 i = map_LC_ALL_position_to_index[j];
2987
2988                 entry = ENTRY(i, locales_list, format);
2989                 needed_len = my_strlcat(constructed, entry, total_len);
2990             }
2991             else
2992
2993 #  endif
2994             {
2995                 /* Below, we are to use name=value notation, either because
2996                  * that's what the platform uses, or because this is the
2997                  * internal format, which uses that notation regardless of the
2998                  * external form */
2999
3000                 entry = ENTRY(i, locales_list, format);
3001
3002                 /* "name=locale;" */
3003                 my_strlcat(constructed, category_names[i], total_len);
3004                 my_strlcat(constructed, "=", total_len);
3005                 needed_len = my_strlcat(constructed, entry, total_len);
3006             }
3007
3008             if (LIKELY(needed_len <= total_len)) {
3009                 continue;
3010             }
3011
3012             /* If would have overflowed, panic */
3013             locale_panic_via_(Perl_form(aTHX_
3014                                         "Internal length calculation wrong.\n"
3015                                         "\"%s\" was not entirely added to"
3016                                         " \"%.*s\"; needed=%zu, had=%zu",
3017                                         entry, (int) total_len,
3018                                         constructed,
3019                                         needed_len, total_len),
3020                                 __FILE__,
3021                                 caller_line);
3022         } /* End of loop through the categories */
3023
3024         retval = constructed;
3025     } /* End of the categories' locales are displarate */
3026
3027 #  if defined(USE_PL_CURLOCALES) && defined(LC_ALL)
3028
3029     if (format == INTERNAL_FORMAT) {
3030
3031         /* PL_curlocales[LC_ALL_INDEX_] is updated as a side-effect of this
3032          * function for internal format. */
3033         Safefree(PL_curlocales[LC_ALL_INDEX_]);
3034         PL_curlocales[LC_ALL_INDEX_] = savepv(retval);
3035     }
3036
3037 #  endif
3038
3039     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3040                            "calculate_LC_ALL_string calculated '%s'\n",
3041                            retval));
3042
3043     if (returning == WANT_VOID) {
3044         if (free_if_void_return) {
3045             Safefree(retval);
3046         }
3047
3048         return NULL;
3049     }
3050
3051     return retval;
3052 }
3053
3054 #  if defined(WIN32) || (     defined(USE_POSIX_2008_LOCALE)        \
3055                          && ! defined(USE_QUERYLOCALE))
3056
3057 STATIC const char *
3058 S_find_locale_from_environment(pTHX_ const locale_category_index index)
3059 {
3060     /* NB: This function may actually change the locale on Windows.  It
3061      * currently is designed to be called only from setting the locale on
3062      * Windows, and POSIX 2008
3063      *
3064      * This function returns the locale specified by the program's environment
3065      * for the category specified by our internal index number 'index'.  It
3066      * therefore simulates:
3067      *      setlocale(cat, "")
3068      * but, except for some cases in Windows, doesn't actually change the
3069      * locale; merely returns it.
3070      *
3071      * The return need not be freed by the caller.  This
3072      * promise relies on PerlEnv_getenv() returning a mortalized copy to us.
3073      *
3074      * The simulation is needed only on certain platforms; otherwise, libc is
3075      * called with "" to get the actual value(s).  The simulation is needed
3076      * for:
3077      *
3078      *  1)  On Windows systems, the concept of the POSIX ordering of
3079      *      environment variables is missing.  To increase portability of
3080      *      programs across platforms, the POSIX ordering is emulated on
3081      *      Windows.
3082      *
3083      *  2)  On POSIX 2008 systems without querylocale(), it is problematic
3084      *      getting the results of the POSIX 2008 equivalent of
3085      *
3086      *          setlocale(category, "")
3087      *
3088      *      To ensure that we know exactly what those values are, we do the
3089      *      setting ourselves, using the documented algorithm specified by the
3090      *      POSIX standard (assuming the platform follows the Standard) rather
3091      *      than use "" as the locale.  This will lead to results that differ
3092      *      from native behavior if the native behavior differs from the
3093      *      Standard's documented value, but khw believes it is better to know
3094      *      what's going on, even if different from native, than to just guess.
3095      *
3096      *      glibc systems differ from this standard in having a LANGUAGE
3097      *      environment variable used for just LC_MESSAGES.  This function does
3098      *      NOT handle that.
3099      *
3100      *      Another option for the POSIX 2008 case would be, in a critical
3101      *      section, to save the global locale's current value, and do a
3102      *      straight setlocale(LC_ALL, "").  That would return our desired
3103      *      values, destroying the global locale's, which we would then
3104      *      restore.  But that could cause races with any other thread that is
3105      *      using the global locale and isn't using the mutex.  And, the only
3106      *      reason someone would have done that is because they are calling a
3107      *      library function, like in gtk, that calls setlocale(), and which
3108      *      can't be changed to use the mutex.  That wouldn't be a problem if
3109      *      this were to be done before any threads had switched, say during
3110      *      perl construction time.  But this code would still be needed for
3111      *      the general case.
3112      *
3113      * The Windows and POSIX 2008 differ in that the ultimate fallback is "C"
3114      * in POSIX, and is the system default locale in Windows.  To get that
3115      * system default value, we actually have to call setlocale() on Windows.
3116      */
3117
3118     const char * const lc_all = PerlEnv_getenv("LC_ALL");
3119     const char * locale_names[LC_ALL_INDEX_] = { NULL };
3120
3121     /* Use any "LC_ALL" environment variable, as it overrides everything else.
3122      * */
3123     if (lc_all && strNE(lc_all, "")) {
3124         return lc_all;
3125     }
3126
3127     /* Here, no usable LC_ALL environment variable.  We have to handle each
3128      * category separately.  If all categories are desired, we loop through
3129      * them all.  If only an individual category is desired, to avoid
3130      * duplicating logic, we use the same loop, but set up the limits so it is
3131      * only executed once, for that particular category. */
3132     locale_category_index lower, upper, offset;
3133     if (index == LC_ALL_INDEX_) {
3134         lower = (locale_category_index) 0;
3135         upper = (locale_category_index) ((int) LC_ALL_INDEX_ - 1);
3136         offset = (locale_category_index) 0;
3137     }
3138     else {
3139         lower = index;
3140         upper = index;
3141
3142         /* 'offset' is used so that the result of the single loop iteration is
3143          * stored into output[0] */
3144         offset = lower;
3145     }
3146
3147     /* When no LC_ALL environment variable, LANG is used as a default, but
3148      * overridden for individual categories that have corresponding environment
3149      * variables.  If no LANG exists, the default is "C" on POSIX 2008, or the
3150      * system default for the category on Windows. */
3151     const char * env_lang = NULL;
3152
3153     /* For each desired category, use any corresponding environment variable;
3154      * or the default if none such exists. */
3155     bool is_disparate = false;  /* Assume is uniform until proven otherwise */
3156     for (unsigned i = lower; i <= upper; i++) {
3157         const char * const env_override = PerlEnv_getenv(category_names[i]);
3158         unsigned int j = i - offset;
3159
3160         if (env_override && strNE(env_override, "")) {
3161             locale_names[j] = env_override;
3162         }
3163         else { /* Here, no corresponding environment variable, see if LANG
3164                   exists and is usable.  Done this way to avoid fetching LANG
3165                   unless it is actually needed */
3166             if (env_lang == NULL) {
3167                 env_lang = PerlEnv_getenv("LANG");
3168
3169                 /* If not usable, set it to a non-NULL illegal value so won't
3170                  * try to use it below */
3171                 if (env_lang == NULL || strEQ(env_lang, "")) {
3172                     env_lang = (const char *) 1;
3173                 }
3174             }
3175
3176             /* If a usable LANG exists, use it. */
3177             if (env_lang != NULL && env_lang != (const char *) 1) {
3178                 locale_names[j] = env_lang;
3179             }
3180             else {
3181
3182 #    ifdef WIN32
3183                 /* If no LANG, use the system default on Windows. */
3184                 locale_names[j] = wrap_wsetlocale(categories[i], ".ACP");
3185                 if (locale_names[j]) {
3186                     SAVEFREEPV(locale_names[j]);
3187                 }
3188                 else
3189 #    endif
3190                 {   /* If nothing was found or worked, use C */
3191                     locale_names[j] = "C";
3192                 }
3193             }
3194         }
3195
3196         if (j > 0 && ! is_disparate && strNE(locale_names[0], locale_names[j]))
3197         {
3198             is_disparate = true;
3199         }
3200
3201         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3202                  "find_locale_from_environment i=%u, j=%u, name=%s,"
3203                  " locale=%s, locale of 0th category=%s, disparate=%d\n",
3204                  i, j, category_names[i],
3205                  locale_names[j], locale_names[0], is_disparate));
3206     }
3207
3208     if (! is_disparate) {
3209         return locale_names[0];
3210     }
3211
3212     return calculate_LC_ALL_string(locale_names, INTERNAL_FORMAT,
3213                                    WANT_TEMP_PV,
3214                                    __LINE__);
3215 }
3216
3217 #  endif
3218 #  if defined(DEBUGGING) || defined(USE_PERL_SWITCH_LOCALE_CONTEXT)
3219
3220 STATIC const char *
3221 S_get_LC_ALL_display(pTHX)
3222 {
3223     return calculate_LC_ALL_string(NULL, INTERNAL_FORMAT,
3224                                    WANT_TEMP_PV,
3225                                    __LINE__);
3226 }
3227
3228 #  endif
3229
3230 STATIC void
3231 S_setlocale_failure_panic_via_i(pTHX_
3232                                 const locale_category_index cat_index,
3233                                 const char * current,
3234                                 const char * failed,
3235                                 const line_t proxy_caller_line,
3236                                 const line_t immediate_caller_line,
3237                                 const char * const higher_caller_file,
3238                                 const line_t higher_caller_line)
3239 {
3240     PERL_ARGS_ASSERT_SETLOCALE_FAILURE_PANIC_VIA_I;
3241
3242     /* Called to panic when a setlocale form unexpectedly failed for the
3243      * category determined by 'cat_index', and the locale that was in effect
3244      * (and likely still is) is 'current'.  'current' may be NULL, which causes
3245      * this function to query what it is.
3246      *
3247      * The extra caller information is used for when a function acts as a
3248      * stand-in for another function, which a typical reader would more likely
3249      * think would be the caller
3250      *
3251      * If a line number is 0, its stack (sort-of) frame is omitted; same if
3252      * it's the same line number as the next higher caller. */
3253
3254     const int cat = categories[cat_index];
3255     const char * name = category_names[cat_index];
3256
3257     dSAVE_ERRNO;
3258
3259     if (current == NULL) {
3260         current = querylocale_i(cat_index);
3261     }
3262
3263     const char * proxy_text = "";
3264     if (proxy_caller_line != 0 && proxy_caller_line != immediate_caller_line)
3265     {
3266         proxy_text = Perl_form(aTHX_ "\nCalled via %s: %" LINE_Tf,
3267                                       __FILE__, proxy_caller_line);
3268     }
3269     if (   strNE(__FILE__, higher_caller_file)
3270         || (   immediate_caller_line != 0
3271             && immediate_caller_line != higher_caller_line))
3272     {
3273         proxy_text = Perl_form(aTHX_ "%s\nCalled via %s: %" LINE_Tf,
3274                                       proxy_text, __FILE__,
3275                                       immediate_caller_line);
3276     }
3277
3278     /* 'false' in the get_displayable_string() calls makes it not think the
3279      * locale is UTF-8, so just dumps bytes.  Actually figuring it out can be
3280      * too complicated for a panic situation. */
3281     const char * msg = Perl_form(aTHX_
3282                             "Can't change locale for %s (%d) from '%s' to '%s'"
3283                             " %s",
3284                             name, cat,
3285                             get_displayable_string(current,
3286                                                    current + strlen(current),
3287                                                    false),
3288                             get_displayable_string(failed,
3289                                                    failed + strlen(failed),
3290                                                    false),
3291                             proxy_text);
3292     RESTORE_ERRNO;
3293
3294     Perl_locale_panic(msg, __LINE__, higher_caller_file, higher_caller_line);
3295     NOT_REACHED; /* NOTREACHED */
3296 }
3297
3298 /* Any of these will allow us to find the RADIX */
3299 #  if defined(USE_LOCALE_NUMERIC) && (   defined(HAS_SOME_LANGINFO)         \
3300                                       || defined(HAS_LOCALECONV)            \
3301                                       || defined(HAS_SNPRINTF))
3302 #    define CAN_CALCULATE_RADIX
3303 #  endif
3304 #  ifdef USE_LOCALE_NUMERIC
3305
3306 STATIC void
3307 S_new_numeric(pTHX_ const char *newnum, bool force)
3308 {
3309     PERL_ARGS_ASSERT_NEW_NUMERIC;
3310
3311     /* Called after each libc setlocale() or uselocale() call affecting
3312      * LC_NUMERIC, to tell core Perl this and that 'newnum' is the name of the
3313      * new locale, and we are switched into it.  It installs this locale as the
3314      * current underlying default, and then switches to the C locale, if
3315      * necessary, so that the code that has traditionally expected the radix
3316      * character to be a dot may continue to do so.
3317      *
3318      * The default locale and the C locale can be toggled between by use of the
3319      * set_numeric_underlying() and set_numeric_standard() functions, which
3320      * should probably not be called directly, but only via macros like
3321      * SET_NUMERIC_STANDARD() in perl.h.
3322      *
3323      * The toggling is necessary mainly so that a non-dot radix decimal point
3324      * character can be input and output, while allowing internal calculations
3325      * to use a dot.
3326      *
3327      * This sets several interpreter-level variables:
3328      * PL_numeric_name  The underlying locale's name: a copy of 'newnum'
3329      * PL_numeric_underlying   A boolean indicating if the toggled state is
3330      *                  such that the current locale is the program's
3331      *                  underlying locale
3332      * PL_numeric_standard   An int indicating if the toggled state is such
3333      *                  that the current locale is the C locale or
3334      *                  indistinguishable from the C locale.  If non-zero, it
3335      *                  is in C; if > 1, it means it may not be toggled away
3336      *                  from C.
3337      * PL_numeric_underlying_is_standard   A bool kept by this function
3338      *                  indicating that the underlying locale and the standard
3339      *                  C locale are indistinguishable for the purposes of
3340      *                  LC_NUMERIC.  This happens when both of the above two
3341      *                  variables are true at the same time.  (Toggling is a
3342      *                  no-op under these circumstances.)  This variable is
3343      *                  used to avoid having to recalculate.
3344      * PL_numeric_radix_sv   Contains the string that code should use for the
3345      *                  decimal point.  It is set to either a dot or the
3346      *                  program's underlying locale's radix character string,
3347      *                  depending on the situation.
3348      * PL_underlying_radix_sv   Contains the program's underlying locale's
3349      *                  radix character string.  This is copied into
3350      *                  PL_numeric_radix_sv when the situation warrants.  It
3351      *                  exists to avoid having to recalculate it when toggling.
3352      */
3353
3354     DEBUG_L( PerlIO_printf(Perl_debug_log,
3355                            "Called new_numeric with %s, PL_numeric_name=%s\n",
3356                            newnum, PL_numeric_name));
3357
3358     /* If not forcing this procedure, and there isn't actually a change from
3359      * our records, do nothing.  (Our records can be wrong when sync'ing to the
3360      * locale set up by an external library, hence the 'force' parameter) */
3361     if (! force && strEQ(PL_numeric_name, newnum)) {
3362         return;
3363     }
3364
3365     Safefree(PL_numeric_name);
3366     PL_numeric_name = savepv(newnum);
3367
3368     /* Handle the trivial case.  Since this is called at process
3369      * initialization, be aware that this bit can't rely on much being
3370      * available. */
3371     if (isNAME_C_OR_POSIX(PL_numeric_name)) {
3372         PL_numeric_standard = TRUE;
3373         PL_numeric_underlying_is_standard = TRUE;
3374         PL_numeric_underlying = TRUE;
3375         sv_setpv(PL_numeric_radix_sv, C_decimal_point);
3376         SvUTF8_off(PL_numeric_radix_sv);
3377         sv_setpv(PL_underlying_radix_sv, C_decimal_point);
3378         SvUTF8_off(PL_underlying_radix_sv);
3379         return;
3380     }
3381
3382     /* We are in the underlying locale until changed at the end of this
3383      * function */
3384     PL_numeric_underlying = TRUE;
3385
3386     char * radix = NULL;
3387     utf8ness_t utf8ness = UTF8NESS_IMMATERIAL;
3388
3389     /* Find and save this locale's radix character. */
3390     my_langinfo_c(RADIXCHAR, LC_NUMERIC, PL_numeric_name,
3391                   &radix, NULL, &utf8ness);
3392     sv_setpv(PL_underlying_radix_sv, radix);
3393
3394     if (utf8ness == UTF8NESS_YES) {
3395         SvUTF8_on(PL_underlying_radix_sv);
3396     }
3397     else {
3398         SvUTF8_off(PL_underlying_radix_sv);
3399     }
3400
3401     DEBUG_L(PerlIO_printf(Perl_debug_log,
3402                           "Locale radix is '%s', ?UTF-8=%d\n",
3403                           SvPVX(PL_underlying_radix_sv),
3404                           cBOOL(SvUTF8(PL_underlying_radix_sv))));
3405
3406     /* This locale is indistinguishable from C (for numeric purposes) if both
3407      * the radix character and the thousands separator are the same as C's.
3408      * Start with the radix. */
3409     PL_numeric_underlying_is_standard = strEQ(C_decimal_point, radix);
3410     Safefree(radix);
3411
3412 #    ifndef TS_W32_BROKEN_LOCALECONV
3413
3414     /* If the radix isn't the same as C's, we know it is distinguishable from
3415      * C; otherwise check the thousands separator too.  Only if both are the
3416      * same as C's is the locale indistinguishable from C.
3417      *
3418      * But on earlier Windows versions, there is a potential race.  This code
3419      * knows that localeconv() (elsewhere in this file) will be used to extract
3420      * the needed value, and localeconv() was buggy for quite a while, and that
3421      * code in this file hence uses a workaround.  And that workaround may have
3422      * an (unlikely) race.  Gathering the radix uses a different workaround on
3423      * Windows that doesn't involve a race.  It might be possible to do the
3424      * same for this (patches welcome).
3425      *
3426      * Until then khw doesn't think it's worth even the small risk of a race to
3427      * get this value, which doesn't appear to be used in any of the Microsoft
3428      * library routines anyway. */
3429
3430     char * scratch_buffer = NULL;
3431     if (PL_numeric_underlying_is_standard) {
3432         PL_numeric_underlying_is_standard = strEQ(C_thousands_sep,
3433                                              my_langinfo_c(THOUSEP, LC_NUMERIC,
3434                                                            PL_numeric_name,
3435                                                            &scratch_buffer,
3436                                                            NULL, NULL));
3437     }
3438     Safefree(scratch_buffer);
3439
3440 #    endif
3441
3442     PL_numeric_standard = PL_numeric_underlying_is_standard;
3443
3444     /* Keep LC_NUMERIC so that it has the C locale radix and thousands
3445      * separator.  This is for XS modules, so they don't have to worry about
3446      * the radix being a non-dot.  (Core operations that need the underlying
3447      * locale change to it temporarily). */
3448     if (! PL_numeric_standard) {
3449         set_numeric_standard(__FILE__, __LINE__);
3450     }
3451 }
3452
3453 #  endif
3454
3455 void
3456 Perl_set_numeric_standard(pTHX_ const char * const file, const line_t line)
3457 {
3458     PERL_ARGS_ASSERT_SET_NUMERIC_STANDARD;
3459     PERL_UNUSED_ARG(file);      /* Some Configurations ignore these */
3460     PERL_UNUSED_ARG(line);
3461
3462 #  ifdef USE_LOCALE_NUMERIC
3463
3464     /* Unconditionally toggle the LC_NUMERIC locale to the C locale
3465      *
3466      * Most code should use the macro SET_NUMERIC_STANDARD() in perl.h
3467      * instead of calling this directly.  The macro avoids calling this routine
3468      * if toggling isn't necessary according to our records (which could be
3469      * wrong if some XS code has changed the locale behind our back) */
3470
3471     DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to"
3472                                           " standard C; called from %s: %"
3473                                           LINE_Tf "\n", file, line));
3474
3475     void_setlocale_c_with_caller(LC_NUMERIC, "C", file, line);
3476     PL_numeric_standard = TRUE;
3477     sv_setpv(PL_numeric_radix_sv, C_decimal_point);
3478     SvUTF8_off(PL_numeric_radix_sv);
3479
3480     PL_numeric_underlying = PL_numeric_underlying_is_standard;
3481
3482 #  endif /* USE_LOCALE_NUMERIC */
3483
3484 }
3485
3486 void
3487 Perl_set_numeric_underlying(pTHX_ const char * const file, const line_t line)
3488 {
3489     PERL_ARGS_ASSERT_SET_NUMERIC_UNDERLYING;
3490     PERL_UNUSED_ARG(file);      /* Some Configurations ignore these */
3491     PERL_UNUSED_ARG(line);
3492
3493 #  ifdef USE_LOCALE_NUMERIC
3494
3495     /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
3496      * default.
3497      *
3498      * Most code should use the macro SET_NUMERIC_UNDERLYING() in perl.h
3499      * instead of calling this directly.  The macro avoids calling this routine
3500      * if toggling isn't necessary according to our records (which could be
3501      * wrong if some XS code has changed the locale behind our back) */
3502
3503     DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to %s;"
3504                                           " called from %s: %" LINE_Tf "\n",
3505                                           PL_numeric_name, file, line));
3506     /* Maybe not in init? assert(PL_locale_mutex_depth > 0);*/
3507
3508     void_setlocale_c_with_caller(LC_NUMERIC, PL_numeric_name, file, line);
3509     PL_numeric_underlying = TRUE;
3510     sv_setsv_nomg(PL_numeric_radix_sv, PL_underlying_radix_sv);
3511
3512     PL_numeric_standard = PL_numeric_underlying_is_standard;
3513
3514 #  endif /* USE_LOCALE_NUMERIC */
3515
3516 }
3517
3518 #  ifdef USE_LOCALE_CTYPE
3519
3520 STATIC void
3521 S_new_ctype(pTHX_ const char *newctype, bool force)
3522 {
3523     PERL_ARGS_ASSERT_NEW_CTYPE;
3524     PERL_UNUSED_ARG(force);
3525
3526     /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
3527      * core Perl this and that 'newctype' is the name of the new locale.
3528      *
3529      * This function sets up the folding arrays for all 256 bytes, assuming
3530      * that tofold() is tolc() since fold case is not a concept in POSIX,
3531      */
3532
3533     DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering new_ctype(%s)\n",
3534                                           newctype));
3535
3536     /* No change means no-op */
3537     if (strEQ(PL_ctype_name, newctype)) {
3538         return;
3539     }
3540
3541     /* We will replace any bad locale warning with
3542      *  1)  nothing if the new one is ok; or
3543      *  2)  a new warning for the bad new locale */
3544     if (PL_warn_locale) {
3545         SvREFCNT_dec_NN(PL_warn_locale);
3546         PL_warn_locale = NULL;
3547     }
3548
3549     /* Clear cache */
3550     Safefree(PL_ctype_name);
3551     PL_ctype_name = "";
3552
3553     PL_in_utf8_turkic_locale = FALSE;
3554
3555     /* For the C locale, just use the standard folds, and we know there are no
3556      * glitches possible, so return early.  Since this is called at process
3557      * initialization, be aware that this bit can't rely on much being
3558      * available. */
3559     if (isNAME_C_OR_POSIX(newctype)) {
3560         Copy(PL_fold, PL_fold_locale, 256, U8);
3561         PL_ctype_name = savepv(newctype);
3562         PL_in_utf8_CTYPE_locale = FALSE;
3563         return;
3564     }
3565
3566     /* The cache being cleared signals the called function to compute a new
3567      * value */
3568     PL_in_utf8_CTYPE_locale = is_locale_utf8(newctype);
3569
3570     PL_ctype_name = savepv(newctype);
3571     bool maybe_utf8_turkic = FALSE;
3572
3573     /* Don't check for problems if we are suppressing the warnings */
3574     bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
3575
3576     if (PL_in_utf8_CTYPE_locale) {
3577
3578         /* A UTF-8 locale gets standard rules.  But note that code still has to
3579          * handle this specially because of the three problematic code points
3580          * */
3581         Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
3582
3583         /* UTF-8 locales can have special handling for 'I' and 'i' if they are
3584          * Turkic.  Make sure these two are the only anomalies.  (We don't
3585          * require towupper and towlower because they aren't in C89.) */
3586
3587 #    if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER)
3588
3589         if (towupper('i') == 0x130 && towlower('I') == 0x131)
3590
3591 #    else
3592
3593         if (toU8_UPPER_LC('i') == 'i' && toU8_LOWER_LC('I') == 'I')
3594
3595 #    endif
3596
3597         {
3598             /* This is how we determine it really is Turkic */
3599             check_for_problems = TRUE;
3600             maybe_utf8_turkic = TRUE;
3601         }
3602     }
3603     else {  /* Not a canned locale we know the values for.  Compute them */
3604
3605 #    ifdef DEBUGGING
3606
3607         bool has_non_ascii_fold = FALSE;
3608         bool found_unexpected = FALSE;
3609
3610         /* Under -DLv, see if there are any folds outside the ASCII range.
3611          * This factoid is used below */
3612         if (DEBUG_Lv_TEST) {
3613             for (unsigned i = 128; i < 256; i++) {
3614                 int j = LATIN1_TO_NATIVE(i);
3615                 if (toU8_LOWER_LC(j) != j || toU8_UPPER_LC(j) != j) {
3616                     has_non_ascii_fold = TRUE;
3617                     break;
3618                 }
3619             }
3620         }
3621
3622 #    endif
3623
3624         for (unsigned i = 0; i < 256; i++) {
3625             if (isU8_UPPER_LC(i))
3626                 PL_fold_locale[i] = (U8) toU8_LOWER_LC(i);
3627             else if (isU8_LOWER_LC(i))
3628                 PL_fold_locale[i] = (U8) toU8_UPPER_LC(i);
3629             else
3630                 PL_fold_locale[i] = (U8) i;
3631
3632 #    ifdef DEBUGGING
3633
3634             /* Most locales these days are supersets of ASCII.  When debugging
3635              * with -DLv, it is helpful to know what the exceptions to that are
3636              * in this locale */
3637             if (DEBUG_Lv_TEST) {
3638                 bool unexpected = FALSE;
3639
3640                 if (isUPPER_L1(i)) {
3641                     if (isUPPER_A(i)) {
3642                         if (PL_fold_locale[i] != toLOWER_A(i)) {
3643                             unexpected = TRUE;
3644                         }
3645                     }
3646                     else if (has_non_ascii_fold) {
3647                         if (PL_fold_locale[i] != toLOWER_L1(i)) {
3648                             unexpected = TRUE;
3649                         }
3650                     }
3651                     else if (PL_fold_locale[i] != i) {
3652                         unexpected = TRUE;
3653                     }
3654                 }
3655                 else if (   isLOWER_L1(i)
3656                          && i != LATIN_SMALL_LETTER_SHARP_S
3657                          && i != MICRO_SIGN)
3658                 {
3659                     if (isLOWER_A(i)) {
3660                         if (PL_fold_locale[i] != toUPPER_A(i)) {
3661                             unexpected = TRUE;
3662                         }
3663                     }
3664                     else if (has_non_ascii_fold) {
3665                         if (PL_fold_locale[i] != toUPPER_LATIN1_MOD(i)) {
3666                             unexpected = TRUE;
3667                         }
3668                     }
3669                     else if (PL_fold_locale[i] != i) {
3670                         unexpected = TRUE;
3671                     }
3672                 }
3673                 else if (PL_fold_locale[i] != i) {
3674                     unexpected = TRUE;
3675                 }
3676
3677                 if (unexpected) {
3678                     found_unexpected = TRUE;
3679                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3680                                            "For %s, fold of %02x is %02x\n",
3681                                            newctype, i, PL_fold_locale[i]));
3682                 }
3683             }
3684         }
3685
3686         if (found_unexpected) {
3687             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3688                                "All bytes not mentioned above either fold to"
3689                                " themselves or are the expected ASCII or"
3690                                " Latin1 ones\n"));
3691         }
3692         else {
3693             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3694                                    "No nonstandard folds were found\n"));
3695 #    endif
3696
3697         }
3698     }
3699
3700 #    ifdef MB_CUR_MAX
3701
3702     /* We only handle single-byte locales (outside of UTF-8 ones); so if this
3703      * locale requires more than one byte, there are going to be BIG problems.
3704      * */
3705
3706     const int mb_cur_max = MB_CUR_MAX;
3707
3708     if (mb_cur_max > 1 && ! PL_in_utf8_CTYPE_locale
3709
3710             /* Some platforms return MB_CUR_MAX > 1 for even the "C" locale.
3711              * Just assume that the implementation for them (plus for POSIX) is
3712              * correct and the > 1 value is spurious.  (Since these are
3713              * specially handled to never be considered UTF-8 locales, as long
3714              * as this is the only problem, everything should work fine */
3715         && ! isNAME_C_OR_POSIX(newctype))
3716     {
3717         DEBUG_L(PerlIO_printf(Perl_debug_log,
3718                               "Unsupported, MB_CUR_MAX=%d\n", mb_cur_max));
3719
3720         Perl_ck_warner_d(aTHX_ packWARN(WARN_LOCALE),
3721                          "Locale '%s' is unsupported, and may crash the"
3722                          " interpreter",
3723                          newctype);
3724     }
3725
3726 #    endif
3727
3728     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "check_for_problems=%d\n",
3729                                            check_for_problems));
3730
3731     /* We don't populate the other lists if a UTF-8 locale, but do check that
3732      * everything works as expected, unless checking turned off */
3733     if (check_for_problems) {
3734         /* Assume enough space for every character being bad.  4 spaces each
3735          * for the 94 printable characters that are output like "'x' "; and 5
3736          * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
3737          * NUL */
3738         char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
3739         unsigned int bad_count = 0;         /* Count of bad characters */
3740
3741         for (unsigned i = 0; i < 256; i++) {
3742
3743             /* If checking for locale problems, see if the native ASCII-range
3744              * printables plus \n and \t are in their expected categories in
3745              * the new locale.  If not, this could mean big trouble, upending
3746              * Perl's and most programs' assumptions, like having a
3747              * metacharacter with special meaning become a \w.  Fortunately,
3748              * it's very rare to find locales that aren't supersets of ASCII
3749              * nowadays.  It isn't a problem for most controls to be changed
3750              * into something else; we check only \n and \t, though perhaps \r
3751              * could be an issue as well. */
3752             if (isGRAPH_A(i) || isBLANK_A(i) || i == '\n') {
3753                 bool is_bad = FALSE;
3754                 char name[4] = { '\0' };
3755
3756                 /* Convert the name into a string */
3757                 if (isGRAPH_A(i)) {
3758                     name[0] = i;
3759                     name[1] = '\0';
3760                 }
3761                 else if (i == '\n') {
3762                     my_strlcpy(name, "\\n", sizeof(name));
3763                 }
3764                 else if (i == '\t') {
3765                     my_strlcpy(name, "\\t", sizeof(name));
3766                 }
3767                 else {
3768                     assert(i == ' ');
3769                     my_strlcpy(name, "' '", sizeof(name));
3770                 }
3771
3772                 /* Check each possibe class */
3773                 if (UNLIKELY(cBOOL(isU8_ALPHANUMERIC_LC(i)) !=
3774                                                     cBOOL(isALPHANUMERIC_A(i))))
3775                 {
3776                     is_bad = TRUE;
3777                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3778                                         "isalnum('%s') unexpectedly is %x\n",
3779                                         name, cBOOL(isU8_ALPHANUMERIC_LC(i))));
3780                 }
3781                 if (UNLIKELY(cBOOL(isU8_ALPHA_LC(i)) != cBOOL(isALPHA_A(i))))  {
3782                     is_bad = TRUE;
3783                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3784                                           "isalpha('%s') unexpectedly is %x\n",
3785                                           name, cBOOL(isU8_ALPHA_LC(i))));
3786                 }
3787                 if (UNLIKELY(cBOOL(isU8_DIGIT_LC(i)) != cBOOL(isDIGIT_A(i))))  {
3788                     is_bad = TRUE;
3789                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3790                                           "isdigit('%s') unexpectedly is %x\n",
3791                                           name, cBOOL(isU8_DIGIT_LC(i))));
3792                 }
3793                 if (UNLIKELY(cBOOL(isU8_GRAPH_LC(i)) != cBOOL(isGRAPH_A(i))))  {
3794                     is_bad = TRUE;
3795                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3796                                           "isgraph('%s') unexpectedly is %x\n",
3797                                           name, cBOOL(isU8_GRAPH_LC(i))));
3798                 }
3799                 if (UNLIKELY(cBOOL(isU8_LOWER_LC(i)) != cBOOL(isLOWER_A(i))))  {
3800                     is_bad = TRUE;
3801                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3802                                           "islower('%s') unexpectedly is %x\n",
3803                                           name, cBOOL(isU8_LOWER_LC(i))));
3804                 }
3805                 if (UNLIKELY(cBOOL(isU8_PRINT_LC(i)) != cBOOL(isPRINT_A(i))))  {
3806                     is_bad = TRUE;
3807                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3808                                           "isprint('%s') unexpectedly is %x\n",
3809                                           name, cBOOL(isU8_PRINT_LC(i))));
3810                 }
3811                 if (UNLIKELY(cBOOL(isU8_PUNCT_LC(i)) != cBOOL(isPUNCT_A(i))))  {
3812                     is_bad = TRUE;
3813                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3814                                           "ispunct('%s') unexpectedly is %x\n",
3815                                           name, cBOOL(isU8_PUNCT_LC(i))));
3816                 }
3817                 if (UNLIKELY(cBOOL(isU8_SPACE_LC(i)) != cBOOL(isSPACE_A(i))))  {
3818                     is_bad = TRUE;
3819                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3820                                           "isspace('%s') unexpectedly is %x\n",
3821                                           name, cBOOL(isU8_SPACE_LC(i))));
3822                 }
3823                 if (UNLIKELY(cBOOL(isU8_UPPER_LC(i)) != cBOOL(isUPPER_A(i))))  {
3824                     is_bad = TRUE;
3825                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3826                                           "isupper('%s') unexpectedly is %x\n",
3827                                           name, cBOOL(isU8_UPPER_LC(i))));
3828                 }
3829                 if (UNLIKELY(cBOOL(isU8_XDIGIT_LC(i))!= cBOOL(isXDIGIT_A(i)))) {
3830                     is_bad = TRUE;
3831                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3832                                           "isxdigit('%s') unexpectedly is %x\n",
3833                                           name, cBOOL(isU8_XDIGIT_LC(i))));
3834                 }
3835                 if (UNLIKELY(toU8_LOWER_LC(i) != (int) toLOWER_A(i))) {
3836                     is_bad = TRUE;
3837                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3838                             "tolower('%s')=0x%x instead of the expected 0x%x\n",
3839                             name, toU8_LOWER_LC(i), (int) toLOWER_A(i)));
3840                 }
3841                 if (UNLIKELY(toU8_UPPER_LC(i) != (int) toUPPER_A(i))) {
3842                     is_bad = TRUE;
3843                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3844                             "toupper('%s')=0x%x instead of the expected 0x%x\n",
3845                             name, toU8_UPPER_LC(i), (int) toUPPER_A(i)));
3846                 }
3847                 if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i))))  {
3848                     is_bad = TRUE;
3849                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3850                                 "'\\n' (=%02X) is not a control\n", (int) i));
3851                 }
3852
3853                 /* Add to the list;  Separate multiple entries with a blank */
3854                 if (is_bad) {
3855                     if (bad_count) {
3856                         my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list));
3857                     }
3858                     my_strlcat(bad_chars_list, name, sizeof(bad_chars_list));
3859                     bad_count++;
3860                 }
3861             }
3862         }
3863
3864         if (bad_count == 2 && maybe_utf8_turkic) {
3865             bad_count = 0;
3866             *bad_chars_list = '\0';
3867
3868             /* The casts are because otherwise some compilers warn:
3869                 gcc.gnu.org/bugzilla/show_bug.cgi?id=99950
3870                 gcc.gnu.org/bugzilla/show_bug.cgi?id=94182
3871              */
3872             PL_fold_locale[ (U8) 'I' ] = 'I';
3873             PL_fold_locale[ (U8) 'i' ] = 'i';
3874             PL_in_utf8_turkic_locale = TRUE;
3875             DEBUG_L(PerlIO_printf(Perl_debug_log, "%s is turkic\n", newctype));
3876         }
3877
3878         /* If we found problems and we want them output, do so */
3879         if (   (UNLIKELY(bad_count))
3880             && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
3881         {
3882             /* WARNING.  If you change the wording of these; be sure to update
3883              * t/loc_tools.pl correspondingly */
3884
3885             if (PL_in_utf8_CTYPE_locale) {
3886                 PL_warn_locale = Perl_newSVpvf(aTHX_
3887                      "Locale '%s' contains (at least) the following characters"
3888                      " which have\nunexpected meanings: %s\nThe Perl program"
3889                      " will use the expected meanings",
3890                       newctype, bad_chars_list);
3891             }
3892             else {
3893                 PL_warn_locale =
3894                     Perl_newSVpvf(aTHX_
3895                                   "\nThe following characters (and maybe"
3896                                   " others) may not have the same meaning as"
3897                                   " the Perl program expects: %s\n",
3898                                   bad_chars_list
3899                             );
3900             }
3901
3902 #    if defined(HAS_SOME_LANGINFO) || defined(WIN32)
3903
3904             char * scratch_buffer = NULL;
3905             Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s",
3906                                  my_langinfo_c(CODESET, LC_CTYPE,
3907                                                newctype,
3908                                                &scratch_buffer, NULL,
3909                                                NULL));
3910             Safefree(scratch_buffer);
3911
3912 #    endif
3913
3914             Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n");
3915
3916             /* If we are actually in the scope of the locale or are debugging,
3917              * output the message now.  If not in that scope, we save the
3918              * message to be output at the first operation using this locale,
3919              * if that actually happens.  Most programs don't use locales, so
3920              * they are immune to bad ones.  */
3921             if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
3922
3923                 /* The '0' below suppresses a bogus gcc compiler warning */
3924                 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale),
3925                                                                             0);
3926                 if (IN_LC(LC_CTYPE)) {
3927                     SvREFCNT_dec_NN(PL_warn_locale);
3928                     PL_warn_locale = NULL;
3929                 }
3930             }
3931         }
3932     }
3933 }
3934
3935 void
3936 Perl_warn_problematic_locale()
3937 {
3938     dTHX;
3939
3940     /* Core-only function that outputs the message in PL_warn_locale,
3941      * and then NULLS it.  Should be called only through the macro
3942      * CHECK_AND_WARN_PROBLEMATIC_LOCALE_ */
3943
3944     if (PL_warn_locale) {
3945         Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3946                              SvPVX(PL_warn_locale),
3947                              0 /* dummy to avoid compiler warning */ );
3948         SvREFCNT_dec_NN(PL_warn_locale);
3949         PL_warn_locale = NULL;
3950     }
3951 }
3952
3953 #  endif /* USE_LOCALE_CTYPE */
3954
3955 STATIC void
3956 S_new_LC_ALL(pTHX_ const char *lc_all, bool force)
3957 {
3958     PERL_ARGS_ASSERT_NEW_LC_ALL;
3959
3960     /* new_LC_ALL() updates all the things we care about.  Note that this is
3961      * called just after a change, so uses the actual underlying locale just
3962      * set, and not the nominal one (should they differ, as they may in
3963      * LC_NUMERIC). */
3964
3965     const char * individ_locales[LC_ALL_INDEX_] = { NULL };
3966
3967     switch (parse_LC_ALL_string(lc_all,
3968                                 individ_locales,
3969                                 override_if_ignored,   /* Override any ignored
3970                                                           categories */
3971                                 true,   /* Always fill array */
3972                                 true,   /* Panic if fails, as to get here it
3973                                            earlier had to have succeeded */
3974                                 __LINE__))
3975     {
3976       case invalid:
3977       case no_array:
3978       case only_element_0:
3979         locale_panic_("Unexpected return from parse_LC_ALL_string");
3980
3981       case full_array:
3982         break;
3983     }
3984
3985     for_all_individual_category_indexes(i) {
3986         if (update_functions[i]) {
3987             const char * this_locale = individ_locales[i];
3988             update_functions[i](aTHX_ this_locale, force);
3989         }
3990
3991         Safefree(individ_locales[i]);
3992     }
3993 }
3994
3995 #  ifdef USE_LOCALE_COLLATE
3996
3997 STATIC void
3998 S_new_collate(pTHX_ const char *newcoll, bool force)
3999 {
4000     PERL_ARGS_ASSERT_NEW_COLLATE;
4001     PERL_UNUSED_ARG(force);
4002
4003     /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
4004      * core Perl this and that 'newcoll' is the name of the new locale.
4005      *
4006      * The design of locale collation is that every locale change is given an
4007      * index 'PL_collation_ix'.  The first time a string participates in an
4008      * operation that requires collation while locale collation is active, it
4009      * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()).  That
4010      * magic includes the collation index, and the transformation of the string
4011      * by strxfrm(), q.v.  That transformation is used when doing comparisons,
4012      * instead of the string itself.  If a string changes, the magic is
4013      * cleared.  The next time the locale changes, the index is incremented,
4014      * and so we know during a comparison that the transformation is not
4015      * necessarily still valid, and so is recomputed.  Note that if the locale
4016      * changes enough times, the index could wrap, and it is possible that a
4017      * transformation would improperly be considered valid, leading to an
4018      * unlikely bug.  The value is declared to the widest possible type on this
4019      * platform. */
4020
4021     /* Return if the locale isn't changing */
4022     if (strEQ(PL_collation_name, newcoll)) {
4023         return;
4024     }
4025
4026     Safefree(PL_collation_name);
4027     PL_collation_name = savepv(newcoll);
4028     ++PL_collation_ix;
4029
4030     /* Set the new one up if trivial.  Since this is called at process
4031      * initialization, be aware that this bit can't rely on much being
4032      * available. */
4033     PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
4034     if (PL_collation_standard) {
4035         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4036                                "Setting PL_collation name='%s'\n",
4037                                PL_collation_name));
4038         PL_collxfrm_base = 0;
4039         PL_collxfrm_mult = 2;
4040         PL_in_utf8_COLLATE_locale = FALSE;
4041         PL_strxfrm_NUL_replacement = '\0';
4042         PL_strxfrm_max_cp = 0;
4043         return;
4044     }
4045
4046     /* Flag that the remainder of the set up is being deferred until first
4047      * need. */
4048     PL_collxfrm_mult = 0;
4049     PL_collxfrm_base = 0;
4050
4051 }
4052
4053 #  endif /* USE_LOCALE_COLLATE */
4054
4055 #  ifdef WIN32
4056
4057 STATIC wchar_t *
4058 S_Win_byte_string_to_wstring(const UINT code_page, const char * byte_string)
4059 {
4060     /* Caller must arrange to free the returned string */
4061
4062     int req_size = MultiByteToWideChar(code_page, 0, byte_string, -1, NULL, 0);
4063     if (! req_size) {
4064         SET_EINVAL;
4065         return NULL;
4066     }
4067
4068     wchar_t *wstring;
4069     Newx(wstring, req_size, wchar_t);
4070
4071     if (! MultiByteToWideChar(code_page, 0, byte_string, -1, wstring, req_size))
4072     {
4073         Safefree(wstring);
4074         SET_EINVAL;
4075         return NULL;
4076     }
4077
4078     return wstring;
4079 }
4080
4081 #    define Win_utf8_string_to_wstring(s)                                   \
4082                                     Win_byte_string_to_wstring(CP_UTF8, (s))
4083
4084 STATIC char *
4085 S_Win_wstring_to_byte_string(const UINT code_page, const wchar_t * wstring)
4086 {
4087     /* Caller must arrange to free the returned string */
4088
4089     int req_size =
4090             WideCharToMultiByte(code_page, 0, wstring, -1, NULL, 0, NULL, NULL);
4091
4092     char *byte_string;
4093     Newx(byte_string, req_size, char);
4094
4095     if (! WideCharToMultiByte(code_page, 0, wstring, -1, byte_string,
4096                                                          req_size, NULL, NULL))
4097     {
4098         Safefree(byte_string);
4099         SET_EINVAL;
4100         return NULL;
4101     }
4102
4103     return byte_string;
4104 }
4105
4106 #    define Win_wstring_to_utf8_string(ws)                                  \
4107                                    Win_wstring_to_byte_string(CP_UTF8, (ws))
4108
4109 STATIC const char *
4110 S_wrap_wsetlocale(pTHX_ const int category, const char *locale)
4111 {
4112     PERL_ARGS_ASSERT_WRAP_WSETLOCALE;
4113
4114     /* Calls _wsetlocale(), converting the parameters/return to/from
4115      * Perl-expected forms as if plain setlocale() were being called instead.
4116      *
4117      * Caller must arrange for the returned PV to be freed.
4118      */
4119
4120     const wchar_t * wlocale = NULL;
4121
4122     if (locale) {
4123         wlocale = Win_utf8_string_to_wstring(locale);
4124         if (! wlocale) {
4125             return NULL;
4126         }
4127     }
4128
4129     WSETLOCALE_LOCK;
4130     const wchar_t * wresult = _wsetlocale(category, wlocale);
4131
4132     if (! wresult) {
4133         WSETLOCALE_UNLOCK;
4134         Safefree(wlocale);
4135         return NULL;
4136     }
4137
4138     const char * result = Win_wstring_to_utf8_string(wresult);
4139     WSETLOCALE_UNLOCK;
4140
4141     Safefree(wlocale);
4142     return result;
4143 }
4144
4145 STATIC const char *
4146 S_win32_setlocale(pTHX_ int category, const char* locale)
4147 {
4148     /* This, for Windows, emulates POSIX setlocale() behavior.  There is no
4149      * difference between the two unless the input locale is "", which normally
4150      * means on Windows to get the machine default, which is set via the
4151      * computer's "Regional and Language Options" (or its current equivalent).
4152      * In POSIX, it instead means to find the locale from the user's
4153      * environment.  This routine changes the Windows behavior to try the POSIX
4154      * behavior first.  Further details are in the called function
4155      * find_locale_from_environment().
4156      */
4157
4158     if (locale != NULL && strEQ(locale, "")) {
4159         /* Note this function may change the locale, but that's ok because we
4160          * are about to change it anyway */
4161         locale = find_locale_from_environment(get_category_index(category));
4162         if (locale == NULL) {
4163             SET_EINVAL;
4164             return NULL;
4165         }
4166     }
4167
4168     const char * result = wrap_wsetlocale(category, locale);
4169     DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
4170                           setlocale_debug_string_r(category, locale, result)));
4171
4172     if (! result) {
4173         SET_EINVAL;
4174         return NULL;
4175     }
4176
4177     save_to_buffer(result, &PL_setlocale_buf, &PL_setlocale_bufsize);
4178
4179 #    ifndef USE_PL_CUR_LC_ALL
4180
4181     Safefree(result);
4182
4183 #  else
4184
4185     /* Here, we need to keep track of LC_ALL, so store the new value.  but if
4186      * the input locale is NULL, we were just querying, so the original value
4187      * hasn't changed */
4188     if (locale == NULL) {
4189         Safefree(result);
4190     }
4191     else {
4192
4193         /* If we set LC_ALL directly above, we already know its new value; but
4194          * if we changed just an individual category, find the new LC_ALL */
4195         if (category != LC_ALL) {
4196             Safefree(result);
4197             result = wrap_wsetlocale(LC_ALL, NULL);
4198         }
4199
4200         Safefree(PL_cur_LC_ALL);
4201         PL_cur_LC_ALL = result;
4202     }
4203
4204     DEBUG_L(PerlIO_printf(Perl_debug_log, "new PL_cur_LC_ALL=%s\n",
4205                                           PL_cur_LC_ALL));
4206 #    endif
4207
4208     return PL_setlocale_buf;
4209 }
4210
4211 #  endif
4212
4213 STATIC const char *
4214 S_native_querylocale_i(pTHX_ const locale_category_index cat_index)
4215 {
4216     /* Determine the current locale and return it in the form the platform's
4217      * native locale handling understands.  This is different only from our
4218      * internal form for the LC_ALL category, as platforms differ in how they
4219      * represent that.
4220      *
4221      * This is only called from Perl_setlocale().  As such it returns in
4222      * PL_setlocale_buf */
4223
4224 #  ifdef USE_LOCALE_NUMERIC
4225
4226     /* We have the LC_NUMERIC name saved, because we are normally switched into
4227      * the C locale (or equivalent) for it. */
4228     if (cat_index == LC_NUMERIC_INDEX_) {
4229
4230         /* We don't have to copy this return value, as it is a per-thread
4231          * variable, and won't change until a future setlocale */
4232         return PL_numeric_name;
4233     }
4234
4235 #  endif
4236 #  ifdef LC_ALL
4237
4238     if (cat_index != LC_ALL_INDEX_)
4239
4240 #  endif
4241
4242     {
4243         /* Here, not LC_ALL, and not LC_NUMERIC: the actual and native values
4244          * match */
4245
4246 #  ifdef setlocale_i    /* Can shortcut if this is defined */
4247
4248         return setlocale_i(cat_index, NULL);
4249
4250 #  else
4251
4252         return save_to_buffer(querylocale_i(cat_index),
4253                               &PL_setlocale_buf, &PL_setlocale_bufsize);
4254 #  endif
4255
4256     }
4257
4258     /* Below, querying LC_ALL */
4259
4260 #  ifdef LC_ALL
4261 #    ifdef USE_PL_CURLOCALES
4262 #      define LC_ALL_ARG  PL_curlocales
4263 #    else
4264 #      define LC_ALL_ARG  NULL  /* Causes calculate_LC_ALL_string() to find the
4265                                    locale using a querylocale function */
4266 #    endif
4267
4268     return calculate_LC_ALL_string(LC_ALL_ARG, EXTERNAL_FORMAT_FOR_QUERY,
4269                                    WANT_PL_setlocale_buf,
4270                                    __LINE__);
4271 #    undef LC_ALL_ARG
4272 #  endif    /* has LC_ALL */
4273
4274 }
4275
4276 #endif      /* USE_LOCALE */
4277
4278 /*
4279 =for apidoc Perl_setlocale
4280
4281 This is an (almost) drop-in replacement for the system L<C<setlocale(3)>>,
4282 taking the same parameters, and returning the same information, except that it
4283 returns the correct underlying C<LC_NUMERIC> locale.  Regular C<setlocale> will
4284 instead return C<C> if the underlying locale has a non-dot decimal point
4285 character, or a non-empty thousands separator for displaying floating point
4286 numbers.  This is because perl keeps that locale category such that it has a
4287 dot and empty separator, changing the locale briefly during the operations
4288 where the underlying one is required. C<Perl_setlocale> knows about this, and
4289 compensates; regular C<setlocale> doesn't.
4290
4291 Another reason it isn't completely a drop-in replacement is that it is
4292 declared to return S<C<const char *>>, whereas the system setlocale omits the
4293 C<const> (presumably because its API was specified long ago, and can't be
4294 updated; it is illegal to change the information C<setlocale> returns; doing
4295 so leads to segfaults.)
4296
4297 Finally, C<Perl_setlocale> works under all circumstances, whereas plain
4298 C<setlocale> can be completely ineffective on some platforms under some
4299 configurations.
4300
4301 Changing the locale is not a good idea when more than one thread is running,
4302 except on systems where the predefined variable C<${^SAFE_LOCALES}> is
4303 non-zero.  This is because on such systems the locale is global to the whole
4304 process and not local to just the thread calling the function.  So changing it
4305 in one thread instantaneously changes it in all.  On some such systems, the
4306 system C<setlocale()> is ineffective, returning the wrong information, and
4307 failing to actually change the locale.  z/OS refuses to try to change the
4308 locale once a second thread is created.  C<Perl_setlocale>, should give you
4309 accurate results of what actually happened on these problematic platforms,
4310 returning NULL if the system forbade the locale change.
4311
4312 The return points to a per-thread static buffer, which is overwritten the next
4313 time C<Perl_setlocale> is called from the same thread.
4314
4315 =cut
4316
4317 */
4318
4319 const char *
4320 Perl_setlocale(const int category, const char * locale)
4321 {
4322     /* This wraps POSIX::setlocale() */
4323
4324 #ifndef USE_LOCALE
4325
4326     PERL_UNUSED_ARG(category);
4327     PERL_UNUSED_ARG(locale);
4328
4329     return "C";
4330
4331 #else
4332
4333     dTHX;
4334
4335     DEBUG_L(PerlIO_printf(Perl_debug_log,
4336                           "Entering Perl_setlocale(%d, \"%s\")\n",
4337                           category, locale));
4338
4339     bool valid_category;
4340     locale_category_index cat_index = get_category_index_helper(category,
4341                                                                 &valid_category,
4342                                                                 __LINE__);
4343     if (! valid_category) {
4344         if (ckWARN(WARN_LOCALE)) {
4345             const char * conditional_warn_text;
4346             if (locale == NULL) {
4347                 conditional_warn_text = "";
4348                 locale = "";
4349             }
4350             else {
4351                 conditional_warn_text = "; can't set it to ";
4352             }
4353
4354             /* diag_listed_as: Unknown locale category %d; can't set it to %s */
4355             Perl_warner(aTHX_
4356                            packWARN(WARN_LOCALE),
4357                            "Unknown locale category %d%s%s",
4358                            category, conditional_warn_text, locale);
4359         }
4360
4361         SET_EINVAL;
4362         return NULL;
4363     }
4364
4365 #  ifdef setlocale_i
4366
4367     /* setlocale_i() gets defined only on Configurations that use setlocale()
4368      * in a simple manner that adequately handles all cases.  If this category
4369      * doesn't have any perl complications, just do that. */
4370     if (! update_functions[cat_index]) {
4371         return setlocale_i(cat_index, locale);
4372     }
4373
4374 #  endif
4375
4376     /* Get current locale */
4377     const char * current_locale = native_querylocale_i(cat_index);
4378
4379     /* A NULL locale means only query what the current one is. */
4380     if (locale == NULL) {
4381         return current_locale;
4382     }
4383
4384     if (strEQ(current_locale, locale)) {
4385         DEBUG_L(PerlIO_printf(Perl_debug_log,
4386                              "Already in requested locale: no action taken\n"));
4387         return current_locale;
4388     }
4389
4390     /* Here, an actual change is being requested.  Do it */
4391     if (! bool_setlocale_i(cat_index, locale)) {
4392         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
4393                           setlocale_debug_string_i(cat_index, locale, "NULL")));
4394         return NULL;
4395     }
4396
4397     /* At this point, the locale has been changed based on the requested value,
4398      * and the querylocale_i() will return the actual new value that the system
4399      * has for the category.  That may not be the same as the input, as libc
4400      * may have returned a synonymous locale name instead of the input one; or,
4401      * if there are locale categories that we are compiled to ignore, any
4402      * attempt to change them away from "C" is overruled */
4403     current_locale = querylocale_i(cat_index);
4404
4405     /* But certain categories need further work.  For example we may need to
4406      * calculate new folding or collation rules.  And for LC_NUMERIC, we have
4407      * to switch into a locale that has a dot radix. */
4408     if (update_functions[cat_index]) {
4409         update_functions[cat_index](aTHX_ current_locale,
4410                                           /* No need to force recalculation, as
4411                                            * aren't coming from a situation
4412                                            * where Perl hasn't been controlling
4413                                            * the locale, so has accurate
4414                                            * records. */
4415                                           false);
4416     }
4417
4418     /* Make sure the result is in a stable buffer for the caller's use, and is
4419      * in the expected format */
4420     current_locale = native_querylocale_i(cat_index);
4421
4422     DEBUG_L(PerlIO_printf(Perl_debug_log, "returning '%s'\n", current_locale));
4423
4424     return current_locale;
4425
4426 #endif
4427
4428 }
4429
4430 #if defined(USE_LOCALE) || defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV)
4431
4432 STATIC utf8ness_t
4433 S_get_locale_string_utf8ness_i(pTHX_ const char * string,
4434                                      const locale_utf8ness_t known_utf8,
4435                                      const char * locale,
4436                                      const locale_category_index cat_index)
4437 {
4438     PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I;
4439
4440 #  ifndef USE_LOCALE
4441
4442     return UTF8NESS_NO;
4443     PERL_UNUSED_ARG(string);
4444     PERL_UNUSED_ARG(known_utf8);
4445     PERL_UNUSED_ARG(locale);
4446     PERL_UNUSED_ARG(cat_index);
4447
4448 #  else
4449
4450     assert(cat_index <= LC_ALL_INDEX_);
4451
4452     /* Return to indicate if 'string' in the locale given by the input
4453      * arguments should be considered UTF-8 or not.
4454      *
4455      * If the input 'locale' is not NULL, use that for the locale; otherwise
4456      * use the current locale for the category specified by 'cat_index'.
4457      */
4458
4459     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4460                            "Entering get_locale_string_utf8ness_i; locale=%s,"
4461                            " index=%u(%s), string=%s, known_utf8=%d\n",
4462                            locale, cat_index, category_names[cat_index],
4463                            ((string)
4464                             ?  _byte_dump_string((U8 *) string,
4465                                                  strlen(string),
4466                                                  0)
4467                             : "nil"),
4468                            known_utf8));
4469     if (string == NULL) {
4470         return UTF8NESS_IMMATERIAL;
4471     }
4472
4473     if (IN_BYTES) { /* respect 'use bytes' */
4474         return UTF8NESS_NO;
4475     }
4476
4477     Size_t len = strlen(string);
4478
4479     /* UTF8ness is immaterial if the representation doesn't vary */
4480     const U8 * first_variant = NULL;
4481     if (is_utf8_invariant_string_loc((U8 *) string, len, &first_variant)) {
4482         return UTF8NESS_IMMATERIAL;
4483     }
4484
4485     /* Can't be UTF-8 if invalid */
4486     if (! is_utf8_string((U8 *) first_variant,
4487                          len - ((char *) first_variant - string)))
4488     {
4489         return UTF8NESS_NO;
4490     }
4491
4492     /* Here and below, we know the string is legal UTF-8, containing at least
4493      * one character requiring a sequence of two or more bytes.  It is quite
4494      * likely to be UTF-8.  But it pays to be paranoid and do further checking.
4495      *
4496      * If we already know the UTF-8ness of the locale, then we immediately know
4497      * what the string is */
4498     if (UNLIKELY(known_utf8 != LOCALE_UTF8NESS_UNKNOWN)) {
4499         return (known_utf8 == LOCALE_IS_UTF8) ? UTF8NESS_YES : UTF8NESS_NO;
4500     }
4501
4502     if (locale == NULL) {
4503         locale = querylocale_i(cat_index);
4504     }
4505
4506     /* If the locale is UTF-8, the string is UTF-8;  otherwise it was
4507      * coincidental that the string is legal UTF-8
4508      *
4509      * However, if the perl is compiled to not pay attention to the category
4510      * being passed in, you might think that that locale is essentially always
4511      * the C locale, so it would make sense to say it isn't UTF-8.  But to get
4512      * here, the string has to contain characters unknown in the C locale.  And
4513      * in fact, Windows boxes are compiled without LC_MESSAGES, as their
4514      * message catalog isn't really a part of the locale system.  But those
4515      * messages really could be UTF-8, and given that the odds are rather small
4516      * of something not being UTF-8 but being syntactically valid UTF-8, khw
4517      * has decided to call such strings as UTF-8. */
4518     return (is_locale_utf8(locale)) ? UTF8NESS_YES : UTF8NESS_NO;
4519
4520 #  endif
4521
4522 }
4523
4524 STATIC bool
4525 S_is_locale_utf8(pTHX_ const char * locale)
4526 {
4527     /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise.  It uses
4528      * my_langinfo(), which employs various methods to get this information
4529      * if nl_langinfo() isn't available, using heuristics as a last resort, in
4530      * which case, the result will very likely be correct for locales for
4531      * languages that have commonly used non-ASCII characters, but for notably
4532      * English, it comes down to if the locale's name ends in something like
4533      * "UTF-8".  It errs on the side of not being a UTF-8 locale.
4534      *
4535      * Systems conforming to C99 should have the needed libc calls to give us a
4536      * completely reliable result. */
4537
4538 #  if ! defined(USE_LOCALE)                                                   \
4539    || ! defined(USE_LOCALE_CTYPE)                                             \
4540    ||   defined(EBCDIC) /* There aren't any real UTF-8 locales at this time */
4541
4542     PERL_UNUSED_ARG(locale);
4543
4544     return FALSE;
4545
4546 #  else
4547
4548     char * scratch_buffer = NULL;
4549     const char * codeset;
4550     bool retval;
4551
4552     PERL_ARGS_ASSERT_IS_LOCALE_UTF8;
4553
4554     if (strEQ(locale, PL_ctype_name)) {
4555         return PL_in_utf8_CTYPE_locale;
4556     }
4557
4558     codeset = my_langinfo_c(CODESET, LC_CTYPE, locale,
4559                             &scratch_buffer, NULL, NULL);
4560     retval = is_codeset_name_UTF8(codeset);
4561
4562     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4563                            "found codeset=%s, is_utf8=%d\n", codeset, retval));
4564
4565     Safefree(scratch_buffer);
4566
4567     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "is_locale_utf8(%s) returning %d\n",
4568                                                             locale, retval));
4569     return retval;
4570
4571 #  endif
4572
4573 }
4574
4575 #endif
4576 #ifdef USE_LOCALE
4577
4578 STATIC void
4579 S_set_save_buffer_min_size(pTHX_ Size_t min_len,
4580                                  char **buf,
4581                                  Size_t * buf_cursize)
4582 {
4583     /* Make sure the buffer pointed to by *buf is at least as large 'min_len';
4584      * *buf_cursize is the size of 'buf' upon entry; it will be updated to the
4585      * new size on exit.  'buf_cursize' being NULL is to be used when this is a
4586      * single use buffer, which will shortly be freed by the caller. */
4587
4588     if (buf_cursize == NULL) {
4589         Newx(*buf, min_len, char);
4590     }
4591     else if (*buf_cursize == 0) {
4592         Newx(*buf, min_len, char);
4593         *buf_cursize = min_len;
4594     }
4595     else if (min_len > *buf_cursize) {
4596         Renew(*buf, min_len, char);
4597         *buf_cursize = min_len;
4598     }
4599 }
4600
4601 STATIC const char *
4602 S_save_to_buffer(pTHX_ const char * string, char **buf, Size_t *buf_size)
4603 {
4604     PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
4605
4606     /* Copy the NUL-terminated 'string' to a buffer whose address before this
4607      * call began at *buf, and whose available length before this call was
4608      * *buf_size.
4609      *
4610      * If the length of 'string' is greater than the space available, the
4611      * buffer is grown accordingly, which may mean that it gets relocated.
4612      * *buf and *buf_size will be updated to reflect this.
4613      *
4614      * Regardless, the function returns a pointer to where 'string' is now
4615      * stored.
4616      *
4617      * 'string' may be NULL, which means no action gets taken, and NULL is
4618      * returned.
4619      *
4620      * 'buf_size' being NULL is to be used when this is a single use buffer,
4621      * which will shortly be freed by the caller.
4622      *
4623      * If *buf or 'buf_size' are NULL or *buf_size is 0, the buffer is assumed
4624      * empty, and memory is malloc'd.
4625      */
4626
4627     if (! string) {
4628         return NULL;
4629     }
4630
4631     /* No-op to copy over oneself */
4632     if (string == *buf) {
4633         return string;
4634     }
4635
4636     Size_t string_size = strlen(string) + 1;
4637     set_save_buffer_min_size(string_size, buf, buf_size);
4638
4639 #    ifdef DEBUGGING
4640
4641     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4642                          "Copying '%s' to %p\n",
4643                          ((is_utf8_string((U8 *) string, 0))
4644                           ? string
4645                           :_byte_dump_string((U8 *) string, strlen(string), 0)),
4646                           *buf));
4647
4648     /* Catch glitches.  Usually this is because LC_CTYPE needs to be the same
4649      * locale as whatever is being worked on */
4650     if (UNLIKELY(instr(string, REPLACEMENT_CHARACTER_UTF8))) {
4651         locale_panic_(Perl_form(aTHX_
4652                                 "Unexpected REPLACEMENT_CHARACTER in '%s'\n%s",
4653                                 string, get_LC_ALL_display()));
4654     }
4655
4656 #  endif
4657
4658     Copy(string, *buf, string_size, char);
4659     return *buf;
4660 }
4661
4662 #  ifdef WIN32
4663
4664 bool
4665 Perl_get_win32_message_utf8ness(pTHX_ const char * string)
4666 {
4667     /* This is because Windows doesn't have LC_MESSAGES. */
4668
4669 #    ifdef USE_LOCALE_CTYPE
4670
4671     return get_locale_string_utf8ness_i(string, LOCALE_IS_UTF8,
4672                                         NULL, LC_CTYPE_INDEX_);
4673 #    else
4674
4675     return false;
4676
4677 #    endif
4678
4679 }
4680
4681 #  endif
4682 #endif  /* USE_LOCALE */
4683
4684 int
4685 Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len)
4686 {
4687
4688 #if ! defined(HAS_MBRTOWC) && ! defined(HAS_MBTOWC)
4689
4690     PERL_UNUSED_ARG(pwc);
4691     PERL_UNUSED_ARG(s);
4692     PERL_UNUSED_ARG(len);
4693     return -1;
4694
4695 #else   /* Below we have some form of mbtowc() */
4696 #  if defined(HAS_MBRTOWC)                                      \
4697    && (defined(USE_LOCALE_THREADS) || ! defined(HAS_MBTOWC))
4698 #    define USE_MBRTOWC
4699 #  else
4700 #    undef USE_MBRTOWC
4701 #  endif
4702
4703     int retval = -1;
4704
4705     if (s == NULL) { /* Initialize the shift state to all zeros in
4706                         PL_mbrtowc_ps. */
4707
4708 #  if defined(USE_MBRTOWC)
4709
4710         memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
4711         return 0;
4712
4713 #  else
4714
4715         SETERRNO(0, 0);
4716         MBTOWC_LOCK_;
4717         retval = mbtowc(NULL, NULL, 0);
4718         MBTOWC_UNLOCK_;
4719         return retval;
4720
4721 #  endif
4722
4723     }
4724
4725 #  if defined(USE_MBRTOWC)
4726
4727     SETERRNO(0, 0);
4728     MBRTOWC_LOCK_;
4729     retval = (SSize_t) mbrtowc((wchar_t *) pwc, s, len, &PL_mbrtowc_ps);
4730     MBRTOWC_UNLOCK_;
4731
4732 #  else
4733
4734     /* Locking prevents races, but locales can be switched out without locking,
4735      * so this isn't a cure all */
4736     SETERRNO(0, 0);
4737     MBTOWC_LOCK_;
4738     retval = mbtowc((wchar_t *) pwc, s, len);
4739     MBTOWC_UNLOCK_;
4740
4741 #  endif
4742
4743     return retval;
4744
4745 #endif
4746
4747 }
4748
4749 /*
4750 =for apidoc Perl_localeconv
4751
4752 This is a thread-safe version of the libc L<localeconv(3)>.  It is the same as
4753 L<POSIX::localeconv|POSIX/localeconv> (returning a hash of the C<localeconv()>
4754 fields), but directly callable from XS code.
4755
4756 =cut
4757 */
4758
4759 HV *
4760 Perl_localeconv(pTHX)
4761 {
4762
4763 #if  ! defined(HAS_LOCALECONV)
4764
4765     return newHV();
4766
4767 #else
4768
4769     return my_localeconv(0);
4770
4771 #endif
4772
4773 }
4774
4775 #if  defined(HAS_LOCALECONV)
4776
4777 HV *
4778 S_my_localeconv(pTHX_ const int item)
4779 {
4780     PERL_ARGS_ASSERT_MY_LOCALECONV;
4781
4782     /* This returns a mortalized hash containing all or certain elements
4783      * returned by localeconv().  It is used by Perl_localeconv() and
4784      * POSIX::localeconv() and is thread-safe.
4785      *
4786      * There are two use cases:
4787      * 1) Called from POSIX::locale_conv().  This returns the lconv structure
4788      *    copied to a hash, based on the current underlying locales for
4789      *    LC_NUMERIC and LC_MONETARY. An input item==0 signifies this case, or
4790      *    on many platforms it is the only use case compiled.
4791      * 2) Certain items that nl_langinfo() provides are also derivable from
4792      *    the return of localeconv().  Windows notably doesn't have
4793      *    nl_langinfo(), so on that, and actually any platform lacking it,
4794      *    my_localeconv() is used also to emulate it for those particular
4795      *    items.  The code to do this is compiled only on such platforms.
4796      *    Rather than going to the expense of creating a full hash when only
4797      *    one item is needed, the returned hash has just the desired item in
4798      *    it.
4799      *
4800      * To access all the localeconv() struct lconv fields, there is a data
4801      * structure that contains every commonly documented field in it.  (Maybe
4802      * some minority platforms have extra fields.  Those could be added here
4803      * without harm; they would just be ignored on platforms lacking them.)
4804      *
4805      * Our structure is compiled to make looping through the fields easier by
4806      * pointing each name to its value's offset within lconv, e.g.,
4807         { "thousands_sep", STRUCT_OFFSET(struct lconv, thousands_sep) }
4808      */
4809 #  define LCONV_ENTRY(name)                                                 \
4810                        {STRINGIFY(name), STRUCT_OFFSET(struct lconv, name)}
4811
4812     /* These synonyms are just for clarity, and to make it easier in case
4813      * something needs to change in the future */
4814 #  define LCONV_NUMERIC_ENTRY(name)  LCONV_ENTRY(name)
4815 #  define LCONV_MONETARY_ENTRY(name) LCONV_ENTRY(name)
4816
4817     /* There are just a few fields for NUMERIC strings */
4818     const lconv_offset_t lconv_numeric_strings[] = {
4819 #  ifndef NO_LOCALECONV_GROUPING
4820         LCONV_NUMERIC_ENTRY(grouping),
4821 #   endif
4822         LCONV_NUMERIC_ENTRY(thousands_sep),
4823         LCONV_NUMERIC_ENTRY(decimal_point),
4824         {NULL, 0}
4825     };
4826
4827     /* When used to implement nl_langinfo(), we save time by only populating
4828      * the hash with the field(s) needed.  Thus we would need a data structure
4829      * of just:
4830      *  LCONV_NUMERIC_ENTRY(decimal_point),
4831      *  {NULL, 0}
4832      *
4833      * By placing the decimal_point field last in the full structure, we can
4834      * use just the tail for this bit of it, saving space.  This macro yields
4835      * the address of the sub structure. */
4836 #  define DECIMAL_POINT_ADDRESS                                             \
4837         &lconv_numeric_strings[(C_ARRAY_LENGTH(lconv_numeric_strings) - 2)]
4838
4839     /* And the MONETARY string fields */
4840     const lconv_offset_t lconv_monetary_strings[] = {
4841         LCONV_MONETARY_ENTRY(int_curr_symbol),
4842         LCONV_MONETARY_ENTRY(mon_decimal_point),
4843 #  ifndef NO_LOCALECONV_MON_THOUSANDS_SEP
4844         LCONV_MONETARY_ENTRY(mon_thousands_sep),
4845 #  endif
4846 #  ifndef NO_LOCALECONV_MON_GROUPING
4847         LCONV_MONETARY_ENTRY(mon_grouping),
4848 #  endif
4849         LCONV_MONETARY_ENTRY(positive_sign),
4850         LCONV_MONETARY_ENTRY(negative_sign),
4851         LCONV_MONETARY_ENTRY(currency_symbol),
4852         {NULL, 0}
4853     };
4854
4855     /* Like above, this field being last can be used as a sub structure */
4856 #  define CURRENCY_SYMBOL_ADDRESS                                            \
4857       &lconv_monetary_strings[(C_ARRAY_LENGTH(lconv_monetary_strings) - 2)]
4858
4859     /* Finally there are integer fields, all are for monetary purposes */
4860     const lconv_offset_t lconv_integers[] = {
4861         LCONV_ENTRY(int_frac_digits),
4862         LCONV_ENTRY(frac_digits),
4863         LCONV_ENTRY(p_sep_by_space),
4864         LCONV_ENTRY(n_cs_precedes),
4865         LCONV_ENTRY(n_sep_by_space),
4866         LCONV_ENTRY(p_sign_posn),
4867         LCONV_ENTRY(n_sign_posn),
4868 #  ifdef HAS_LC_MONETARY_2008
4869         LCONV_ENTRY(int_p_cs_precedes),
4870         LCONV_ENTRY(int_p_sep_by_space),
4871         LCONV_ENTRY(int_n_cs_precedes),
4872         LCONV_ENTRY(int_n_sep_by_space),
4873         LCONV_ENTRY(int_p_sign_posn),
4874         LCONV_ENTRY(int_n_sign_posn),
4875 #  endif
4876         LCONV_ENTRY(p_cs_precedes),
4877         {NULL, 0}
4878     };
4879
4880     /* Like above, this field being last can be used as a sub structure */
4881 #  define P_CS_PRECEDES_ADDRESS                                       \
4882       &lconv_integers[(C_ARRAY_LENGTH(lconv_integers) - 2)]
4883
4884     /* If we aren't paying attention to a given category, use LC_CTYPE instead;
4885      * If not paying attention to that either, the code below should end up not
4886      * using this.  Make sure that things blow up if that avoidance gets lost,
4887      * by setting the category to an out-of-bounds value */
4888     locale_category_index numeric_index;
4889     locale_category_index monetary_index;
4890
4891 #  ifdef USE_LOCALE_NUMERIC
4892     numeric_index = LC_NUMERIC_INDEX_;
4893 #  elif defined(USE_LOCALE_CTYPE)
4894     numeric_index = LC_CTYPE_INDEX_;
4895 #  else
4896     numeric_index = LC_ALL_INDEX_;      /* Out-of-bounds */
4897 #  endif
4898 #  ifdef USE_LOCALE_MONETARY
4899     monetary_index = LC_MONETARY_INDEX_;
4900 #  elif defined(USE_LOCALE_CTYPE)
4901     monetary_index = LC_CTYPE_INDEX_;
4902 #  else
4903     monetary_index = LC_ALL_INDEX_;     /* Out-of-bounds */
4904 #  endif
4905
4906     /* Some platforms, for correct non-mojibake results, require LC_CTYPE's
4907      * locale to match LC_NUMERIC's for the numeric fields, and LC_MONETARY's
4908      * for the monetary ones.  What happens if LC_NUMERIC and LC_MONETARY
4909      * aren't compatible?  Wrong results.  To avoid that, we call localeconv()
4910      * twice, once for each locale, setting LC_CTYPE to match the category.
4911      * But if the locales of both categories are the same, there is no need for
4912      * a second call.  Assume this is the case unless overridden below */
4913     bool requires_2nd_localeconv = false;
4914
4915     /* The actual hash populating is done by S_populate_hash_from_localeconv().
4916      * It gets passed an array of length two containing the data structure it
4917      * is supposed to use to get the key names to fill the hash with.  One
4918      * element is always for the NUMERIC strings (or NULL if none to use), and
4919      * the other element similarly for the MONETARY ones. */
4920 #  define NUMERIC_STRING_OFFSET   0
4921 #  define MONETARY_STRING_OFFSET  1
4922     const lconv_offset_t * strings[2] = { NULL, NULL };
4923
4924     /* This is a mask, with one bit to tell S_populate_hash_from_localeconv to
4925      * populate the NUMERIC items; another bit for the MONETARY ones.  This way
4926      * it can choose which (or both) to populate from */
4927     U32 index_bits = 0;
4928
4929     /* This converts from a locale index to its bit position in the above mask.
4930      * */
4931 #  define INDEX_TO_BIT(i)  (1 << (i))
4932
4933     /* The two categories can have disparate locales.  Initialize them to C and
4934      * override later whichever one(s) we pay attention to */
4935     const char * numeric_locale = "C";
4936     const char * monetary_locale = "C";
4937
4938     /* This will be either 'numeric_locale' or 'monetary_locale' depending on
4939      * what we are working on at the moment */
4940     const char * locale;
4941
4942     /* The LC_MONETARY category also has some integer-valued fields, whose
4943      * information is kept in a separate list */
4944     const lconv_offset_t * integers;
4945
4946 #  ifdef HAS_SOME_LANGINFO
4947
4948     /* If the only use-case for this is the full localeconv(), the 'item'
4949      * parameter is ignored. */
4950     PERL_UNUSED_ARG(item);
4951
4952 #  else
4953
4954     /* This only gets compiled for the use-case of using localeconv() to
4955      * emulate an nl_langinfo() missing from the platform. */
4956
4957 #    ifdef USE_LOCALE_NUMERIC
4958
4959     /* We need this substructure to only return this field for the THOUSEP
4960      * item.  The other items also need substructures, but they were handled
4961      * above by placing the substructure's item at the end of the full one, so
4962      * the data structure could do double duty.  However, both this and
4963      * RADIXCHAR would need to be in the final position of the same full
4964      * structure; an impossibility.  So make this into a separate structure */
4965     const lconv_offset_t  thousands_sep_string[] = {
4966         LCONV_NUMERIC_ENTRY(thousands_sep),
4967         {NULL, 0}
4968     };
4969
4970 #    endif
4971
4972     /* End of all the initialization of data structures.  Now for actual code.
4973      *
4974      * Without nl_langinfo(), the call to my_localeconv() could be for just one
4975      * of the following 3 items to emulate nl_langinfo().  This is compiled
4976      * only when using perl_langinfo.h, which we control, and it has been
4977      * constructed so that no item is numbered 0.
4978      *
4979      * For each, set up the appropriate parameters for the call below to
4980      * S_populate_hash_from_localeconv() */
4981     if (item != 0) switch (item) {
4982       default:
4983         locale_panic_(Perl_form(aTHX_
4984                     "Unexpected item passed to my_localeconv: %d", item));
4985         break;
4986
4987 #    ifdef USE_LOCALE_NUMERIC
4988
4989       case RADIXCHAR:
4990         locale = numeric_locale = PL_numeric_name;
4991         index_bits = INDEX_TO_BIT(LC_NUMERIC_INDEX_);
4992         strings[NUMERIC_STRING_OFFSET] = DECIMAL_POINT_ADDRESS;
4993         integers = NULL;
4994         break;
4995
4996       case THOUSEP:
4997         index_bits = INDEX_TO_BIT(LC_NUMERIC_INDEX_);
4998         locale = numeric_locale = PL_numeric_name;
4999         strings[NUMERIC_STRING_OFFSET] = thousands_sep_string;
5000         integers = NULL;
5001         break;
5002
5003 #    endif
5004 #    ifdef USE_LOCALE_MONETARY
5005
5006       case CRNCYSTR:
5007         index_bits = INDEX_TO_BIT(LC_MONETARY_INDEX_);
5008         locale = monetary_locale = querylocale_i(LC_MONETARY_INDEX_);
5009
5010         /* This item needs the values for both the currency symbol, and another
5011          * one used to construct the nl_langino()-compatible return */
5012         strings[MONETARY_STRING_OFFSET] = CURRENCY_SYMBOL_ADDRESS;
5013         integers = P_CS_PRECEDES_ADDRESS;
5014         break;
5015
5016 #    endif
5017
5018     } /* End of switch() */
5019
5020     else    /* End of for just one item to emulate nl_langinfo() */
5021
5022 #  endif
5023
5024     {   /* Here, the call is for all of localeconv().  It has a bunch of
5025          * items.  As in the individual item case, set up the parameters for
5026          * S_populate_hash_from_localeconv(); */
5027
5028 #  ifdef USE_LOCALE_NUMERIC
5029         numeric_locale = PL_numeric_name;
5030 #  elif defined(USE_LOCALE_CTYPE)
5031         numeric_locale = querylocale_i(numeric_index);
5032 #  endif
5033 #  if defined(USE_LOCALE_MONETARY) || defined(USE_LOCALE_CTYPE)
5034         monetary_locale = querylocale_i(monetary_index);
5035 #  endif
5036
5037         /* The first call to S_populate_hash_from_localeconv() will be for the
5038          * MONETARY values */
5039         index_bits = INDEX_TO_BIT(monetary_index);
5040         locale = monetary_locale;
5041
5042         /* And if the locales for the two categories are the same, we can also
5043          * do the NUMERIC values in the same call */
5044         if (strEQ(numeric_locale, monetary_locale)) {
5045             index_bits |= INDEX_TO_BIT(numeric_index);
5046         }
5047         else {
5048             requires_2nd_localeconv = true;
5049         }
5050
5051         /* We always pass both sets of strings. 'index_bits' tells
5052          * S_populate_hash_from_localeconv which to actually look at */
5053         strings[NUMERIC_STRING_OFFSET] = lconv_numeric_strings;
5054         strings[MONETARY_STRING_OFFSET] = lconv_monetary_strings;
5055
5056         /* And pass the integer values to populate; again 'index_bits' will
5057          * say to use them or not */
5058         integers = lconv_integers;
5059
5060     }   /* End of call is for localeconv() */
5061
5062     /* The code above has determined the parameters to
5063        S_populate_hash_from_localeconv() for both cases of an individual item
5064        and for the entire structure.  Below is code common to both */
5065
5066     HV * hv = newHV();      /* The returned hash, initially empty */
5067     sv_2mortal((SV*)hv);
5068
5069     /* Call localeconv() and copy its results into the hash.  All the
5070      * parameters have been initialized above */
5071     populate_hash_from_localeconv(hv,
5072                                   locale,
5073                                   index_bits,
5074                                   strings,
5075                                   integers
5076                                  );
5077
5078     /* The above call may have done all the hash fields, but not always, as
5079      * already explained.  If we need a second call it is always for the
5080      * NUMERIC fields */
5081     if (requires_2nd_localeconv) {
5082         populate_hash_from_localeconv(hv,
5083                                       numeric_locale,
5084                                       INDEX_TO_BIT(numeric_index),
5085                                       strings,
5086                                       NULL      /* There are no NUMERIC integer
5087                                                    fields */
5088                                      );
5089     }
5090
5091     /* Here, the hash has been completely populated.
5092      *
5093      * Now go through all the items and:
5094      *  a) For string items, see if they should be marked as UTF-8 or not.
5095      *     This would have been more convenient and faster to do while
5096      *     populating the hash in the first place, but that operation has to be
5097      *     done within a critical section, keeping other threads from
5098      *     executing, so only the minimal amount of work necessary is done at
5099      *     that time.
5100      *  b) For integer items, convert the C CHAR_MAX value into -1.  Again,
5101      *     this could have been done in the critical section, but was deferred
5102      *     to here to keep to the bare minimum amount the time spent owning the
5103      *     processor. CHAR_MAX is a C concept for an 8-bit character type.
5104      *     Perl has no such type; the closest fit is a -1.
5105      *
5106      * XXX On unthreaded perls, this code could be #ifdef'd out, and the
5107      * corrections determined at hash population time, at an extra maintenance
5108      * cost which khw doesn't think is worth it
5109      */
5110
5111 #  ifndef HAS_SOME_LANGINFO
5112
5113     /* We are done when called with an individual item.  There are no integer
5114      * items to adjust, and it's best for the caller to determine if this
5115      * string item is UTF-8 or not.  This is because the locale's UTF-8ness is
5116      * calculated below, and in some Configurations, that can lead to a
5117      * recursive call to here, which could recurse infinitely. */
5118
5119     if (item != 0) {
5120         return hv;
5121     }
5122
5123 #  endif
5124
5125     for (unsigned int i = 0; i < 2; i++) {  /* Try both types of strings */
5126         if (! strings[i]) {     /* Skip if no strings of this type */
5127             continue;
5128         }
5129
5130         locale = (i == NUMERIC_STRING_OFFSET)
5131                  ? numeric_locale
5132                  : monetary_locale;
5133
5134         if (! is_locale_utf8(locale)) {
5135             continue;   /* No string can be UTF-8 if the locale isn't */
5136         }
5137
5138         /* Examine each string */
5139         for (const lconv_offset_t *strp = strings[i]; strp->name; strp++) {
5140             const char * name = strp->name;
5141
5142             /* 'value' will contain the string that may need to be marked as
5143              * UTF-8 */
5144             SV ** value = hv_fetch(hv, name, strlen(name), true);
5145             if (! value || ! SvPOK(*value)) {
5146                 continue;
5147             }
5148
5149             /* Determine if the string should be marked as UTF-8. */
5150             if (UTF8NESS_YES == (get_locale_string_utf8ness_i(SvPVX(*value),
5151                                                   LOCALE_IS_UTF8,
5152                                                   NULL,
5153                                                   (locale_category_index) 0)))
5154             {
5155                 SvUTF8_on(*value);
5156             }
5157         }
5158     }   /* End of fixing up UTF8ness */
5159
5160
5161     /* Examine each integer */
5162     for (; integers; integers++) {
5163         const char * name = integers->name;
5164
5165         if (! name) {   /* Reached the end */
5166             break;
5167         }
5168
5169         SV ** value = hv_fetch(hv, name, strlen(name), true);
5170         if (! value) {
5171             continue;
5172         }
5173
5174         /* Change CHAR_MAX to -1 */
5175         if (SvIV(*value) == CHAR_MAX) {
5176             sv_setiv(*value, -1);
5177         }
5178     }
5179
5180     return hv;
5181 }
5182
5183 STATIC void
5184 S_populate_hash_from_localeconv(pTHX_ HV * hv,
5185
5186                                       /* Switch to this locale to run
5187                                        * localeconv() from */
5188                                       const char * locale,
5189
5190                                       /* bit mask of which categories to
5191                                        * populate */
5192                                       const U32 which_mask,
5193
5194                                       /* strings[0] points to the numeric
5195                                        * string fields; [1] to the monetary */
5196                                       const lconv_offset_t * strings[2],
5197
5198                                       /* And to the monetary integer fields */
5199                                       const lconv_offset_t * integers)
5200 {
5201     PERL_ARGS_ASSERT_POPULATE_HASH_FROM_LOCALECONV;
5202     PERL_UNUSED_ARG(which_mask);    /* Some configurations don't use this;
5203                                        complicated to figure out which */
5204 #  ifndef USE_LOCALE
5205     PERL_UNUSED_ARG(locale);
5206 #  endif
5207
5208     /* Run localeconv() and copy some or all of its results to the input 'hv'
5209      * hash.  Most localeconv() implementations return the values in a global
5210      * static buffer, so the operation must be performed in a critical section,
5211      * ending only after the copy is completed.  There are so many locks
5212      * because localeconv() deals with two categories, and returns in a single
5213      * global static buffer.  Some locks might be no-ops on this platform, but
5214      * not others.  We need to lock if any one isn't a no-op. */
5215
5216 #  ifdef USE_LOCALE_CTYPE
5217
5218     /* Some platforms require LC_CTYPE to be congruent with the category we are
5219      * looking for */
5220     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
5221
5222 #  endif
5223 #  ifdef USE_LOCALE_NUMERIC
5224
5225     /* We need to toggle to the underlying NUMERIC locale if we are getting
5226      * NUMERIC strings */
5227     const char * orig_NUMERIC_locale = NULL;
5228     if (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) {
5229         LC_NUMERIC_LOCK(0);
5230
5231 #    if defined(WIN32)
5232
5233         /* There is a bug in Windows in which setting LC_CTYPE after the others
5234          * doesn't actually take effect for localeconv().  See commit
5235          * 418efacd1950763f74ed3cc22f8cf9206661b892 for details.  Thus we have
5236          * to make sure that the locale we want is set after LC_CTYPE.  We
5237          * unconditionally toggle away from and back to the current locale
5238          * prior to calling localeconv().
5239          *
5240          * This code will have no effect if we already are in C, but khw
5241          * hasn't seen any cases where this causes problems when we are in the
5242          * C locale. */
5243         orig_NUMERIC_locale = toggle_locale_i(LC_NUMERIC_INDEX_, "C");
5244         toggle_locale_i(LC_NUMERIC_INDEX_, locale);
5245
5246 #    else
5247
5248         /* No need for the extra toggle when not on Windows */
5249         orig_NUMERIC_locale = toggle_locale_i(LC_NUMERIC_INDEX_, locale);
5250
5251 #    endif
5252
5253     }
5254
5255 #  endif
5256 #  if defined(USE_LOCALE_MONETARY) && defined(WIN32)
5257
5258     /* Same Windows bug as described just above for NUMERIC.  Otherwise, no
5259      * need to toggle LC_MONETARY, as it is kept in the underlying locale */
5260     const char * orig_MONETARY_locale = NULL;
5261     if (which_mask & INDEX_TO_BIT(LC_MONETARY_INDEX_)) {
5262         orig_MONETARY_locale = toggle_locale_i(LC_MONETARY_INDEX_, "C");
5263         toggle_locale_i(LC_MONETARY_INDEX_, locale);
5264     }
5265
5266 #  endif
5267
5268     /* Finally ready to do the actual localeconv().  Lock to prevent other
5269      * accesses until we have made a copy of its returned static buffer */
5270     gwLOCALE_LOCK;
5271
5272 #  if defined(TS_W32_BROKEN_LOCALECONV) && defined(USE_THREAD_SAFE_LOCALE)
5273
5274     /* This is a workaround for another bug in Windows.  localeconv() was
5275      * broken with thread-safe locales prior to VS 15.  It looks at the global
5276      * locale instead of the thread one.  As a work-around, we toggle to the
5277      * global locale; populate the return; then toggle back.  We have to use
5278      * LC_ALL instead of the individual categories because of yet another bug
5279      * in Windows.  And this all has to be done in a critical section.
5280      *
5281      * This introduces a potential race with any other thread that has also
5282      * converted to use the global locale, and doesn't protect its locale calls
5283      * with mutexes.  khw can't think of any reason for a thread to do so on
5284      * Windows, as the locale API is the same regardless of thread-safety,
5285      * except if the code is ported from working on another platform where
5286      * there might be some reason to do this.  But this is typically due to
5287      * some alien-to-Perl library that thinks it owns locale setting.  Such a
5288      * library isn't likely to exist on Windows, so such an application is
5289      * unlikely to be run on Windows
5290      */
5291     bool restore_per_thread = FALSE;
5292
5293     /* Save the per-thread locale state */
5294     const char * save_thread = querylocale_c(LC_ALL);
5295
5296     /* Change to the global locale, and note if we already were there */
5297     int config_return = _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
5298     if (config_return != _DISABLE_PER_THREAD_LOCALE) {
5299         if (config_return == -1) {
5300             locale_panic_("_configthreadlocale returned an error");
5301         }
5302
5303         restore_per_thread = TRUE;
5304     }
5305
5306     /* Save the state of the global locale; then convert to our desired
5307      * state.  */
5308     const char * save_global = querylocale_c(LC_ALL);
5309     void_setlocale_c(LC_ALL, save_thread);
5310
5311 #  endif  /* TS_W32_BROKEN_LOCALECONV */
5312
5313     /* Finally, do the actual localeconv */
5314     const char *lcbuf_as_string = (const char *) localeconv();
5315
5316     /* Fill in the string fields of the HV* */
5317     for (unsigned int i = 0; i < 2; i++) {
5318
5319         /* One iteration is only for the numeric string fields.  Skip these
5320          * unless we are compiled to care about those fields and the input
5321          * parameters indicate we want their values */
5322         if (   i == NUMERIC_STRING_OFFSET
5323
5324 #  ifdef USE_LOCALE_NUMERIC
5325
5326             && (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) == 0
5327
5328 #  endif
5329
5330         ) {
5331             continue;
5332         }
5333
5334         /* The other iteration is only for the monetary string fields.  Again
5335          * skip it unless we want those values */
5336         if (   i == MONETARY_STRING_OFFSET
5337
5338 #  ifdef USE_LOCALE_MONETARY
5339
5340             && (which_mask & INDEX_TO_BIT(LC_MONETARY_INDEX_)) == 0
5341
5342 #  endif
5343         ) {
5344
5345             continue;
5346         }
5347
5348         /* For each field for the given category ... */
5349         const lconv_offset_t * category_strings = strings[i];
5350         while (1) {
5351             const char * name = category_strings->name;
5352             if (! name) {   /* Quit at the end */
5353                 break;
5354             }
5355
5356             /* we have set things up so that we know where in the returned
5357              * structure, when viewed as a string, the corresponding value is.
5358              * */
5359             const char *value = *((const char **)(  lcbuf_as_string
5360                                                   + category_strings->offset));
5361
5362             /* Set to get next string on next iteration */
5363             category_strings++;
5364
5365             /* Skip if this platform doesn't have this field. */
5366             if (! value) {
5367                 continue;
5368             }
5369
5370             /* Copy to the hash */
5371             (void) hv_store(hv,
5372                             name, strlen(name),
5373                             newSVpv(value, strlen(value)),
5374                             0);
5375         }
5376
5377         /* Add any int fields to the HV* */
5378         if (i == MONETARY_STRING_OFFSET && integers) {
5379             while (integers->name) {
5380                 const char value = *((const char *)(  lcbuf_as_string
5381                                                     + integers->offset));
5382                 (void) hv_store(hv, integers->name,
5383                                 strlen(integers->name), newSViv(value), 0);
5384                 integers++;
5385             }
5386         }
5387     }   /* End of loop through the fields */
5388
5389     /* Done with copying to the hash.  Can unwind the critical section locks */
5390
5391 #  if defined(TS_W32_BROKEN_LOCALECONV) && defined(USE_THREAD_SAFE_LOCALE)
5392
5393     /* Restore the global locale's prior state */
5394     void_setlocale_c(LC_ALL, save_global);
5395
5396     /* And back to per-thread locales */
5397     if (restore_per_thread) {
5398         if (_configthreadlocale(_ENABLE_PER_THREAD_LOCALE) == -1) {
5399             locale_panic_("_configthreadlocale returned an error");
5400         }
5401     }
5402
5403     /* Restore the per-thread locale state */
5404     void_setlocale_c(LC_ALL, save_thread);
5405
5406 #  endif  /* TS_W32_BROKEN_LOCALECONV */
5407
5408     gwLOCALE_UNLOCK;    /* Finished with the critical section of a
5409                            globally-accessible buffer */
5410
5411 #  if defined(USE_LOCALE_MONETARY) && defined(WIN32)
5412
5413     restore_toggled_locale_i(LC_MONETARY_INDEX_, orig_MONETARY_locale);
5414
5415 #  endif
5416 #  ifdef USE_LOCALE_NUMERIC
5417
5418     restore_toggled_locale_i(LC_NUMERIC_INDEX_, orig_NUMERIC_locale);
5419     if (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) {
5420         LC_NUMERIC_UNLOCK;
5421     }
5422
5423 #  endif
5424 #  ifdef USE_LOCALE_CTYPE
5425
5426     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
5427
5428 #  endif
5429
5430 }
5431
5432 #endif /* defined(HAS_LOCALECONV) */
5433 #ifndef HAS_SOME_LANGINFO
5434
5435 typedef int nl_item;    /* Substitute 'int' for emulated nl_langinfo() */
5436
5437 #endif
5438
5439 /*
5440
5441 =for apidoc      Perl_langinfo
5442 =for apidoc_item Perl_langinfo8
5443
5444 C<Perl_langinfo> is an (almost) drop-in replacement for the system
5445 C<L<nl_langinfo(3)>>, taking the same C<item> parameter values, and returning
5446 the same information.  But it is more thread-safe than regular
5447 C<nl_langinfo()>, and hides the quirks of Perl's locale handling from your
5448 code, and can be used on systems that lack a native C<nl_langinfo>.
5449
5450 However, you should instead use the improved version of this:
5451 L</Perl_langinfo8>, which behaves identically except for an additional
5452 parameter, a pointer to a variable declared as L</C<utf8ness_t>>, into which it
5453 returns to you how you should treat the returned string with regards to it
5454 being encoded in UTF-8 or not.
5455
5456 Concerning the differences between these and plain C<nl_langinfo()>:
5457
5458 =over
5459
5460 =item a.
5461
5462 C<Perl_langinfo8> has an extra parameter, described above.  Besides this, the
5463 other reason they aren't quite a drop-in replacement is actually an advantage.
5464 The C<const>ness of the return allows the compiler to catch attempts to write
5465 into the returned buffer, which is illegal and could cause run-time crashes.
5466
5467 =item b.
5468
5469 They deliver the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
5470 without you having to write extra code.  The reason for the extra code would be
5471 because these are from the C<LC_NUMERIC> locale category, which is normally
5472 kept set by Perl so that the radix is a dot, and the separator is the empty
5473 string, no matter what the underlying locale is supposed to be, and so to get
5474 the expected results, you have to temporarily toggle into the underlying
5475 locale, and later toggle back.  (You could use plain C<nl_langinfo> and
5476 C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this but then you wouldn't get
5477 the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
5478 (or equivalent) locale would break a lot of CPAN, which is expecting the radix
5479 (decimal point) character to be a dot.)
5480
5481 =item c.
5482
5483 The system function they replace can have its static return buffer trashed,
5484 not only by a subsequent call to that function, but by a C<freelocale>,
5485 C<setlocale>, or other locale change.  The returned buffer of these functions
5486 is not changed until the next call to one or the other, so the buffer is never
5487 in a trashed state.
5488
5489 =item d.
5490
5491 The return buffer is per-thread, so it also is never overwritten by a call to
5492 these functions from another thread;  unlike the function it replaces.
5493
5494 =item e.
5495
5496 But most importantly, they work on systems that don't have C<nl_langinfo>, such
5497 as Windows, hence making your code more portable.  Of the fifty-some possible
5498 items specified by the POSIX 2008 standard,
5499 L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
5500 only one is completely unimplemented, though on non-Windows platforms, another
5501 significant one is not fully implemented).  They use various techniques to
5502 recover the other items, including calling C<L<localeconv(3)>>, and
5503 C<L<strftime(3)>>, both of which are specified in C89, so should be always be
5504 available.  Later C<strftime()> versions have additional capabilities.
5505 If an item is not available on your system, this returns either the value
5506 associated with the C locale, or simply C<"">, whichever is more appropriate.
5507
5508 It is important to note that, when called with an item that is recovered by
5509 using C<localeconv>, the buffer from any previous explicit call to
5510 C<L<localeconv(3)>> will be overwritten.  But you shouldn't be using
5511 C<localeconv> anyway because it is is very much not thread-safe, and suffers
5512 from the same problems outlined in item 'b.' above for the fields it returns
5513 that are controlled by the LC_NUMERIC locale category.  Instead, avoid all of
5514 those problems by calling L</Perl_localeconv>, which is thread-safe; or by
5515 using the methods given in L<perlcall>  to call
5516 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is also thread-safe.
5517
5518 =back
5519
5520 The details for those items which may deviate from what this emulation returns
5521 and what a native C<nl_langinfo()> would return are specified in
5522 L<I18N::Langinfo>.
5523
5524 When using C<Perl_langinfo8> (or plain C<Perl_langinfo>) on systems that don't
5525 have a native C<nl_langinfo()>, you must
5526
5527  #include "perl_langinfo.h"
5528
5529 before the C<perl.h> C<#include>.  You can replace your F<langinfo.h>
5530 C<#include> with this one.  (Doing it this way keeps out the symbols that plain
5531 F<langinfo.h> would try to import into the namespace for code that doesn't need
5532 it.)
5533
5534 =cut
5535
5536 */
5537
5538 const char *
5539 Perl_langinfo(const nl_item item)
5540 {
5541     return Perl_langinfo8(item, NULL);
5542 }
5543
5544 const char *
5545 Perl_langinfo8(const nl_item item, utf8ness_t * utf8ness)
5546 {
5547     dTHX;
5548     locale_category_index cat_index;
5549
5550     PERL_ARGS_ASSERT_PERL_LANGINFO8;
5551
5552     if (utf8ness) {     /* Assume for now */
5553         *utf8ness = UTF8NESS_IMMATERIAL;
5554     }
5555
5556     /* Find the locale category that controls the input 'item'.  If we are not
5557      * paying attention to that category, instead return a default value.  Also
5558      * return the default value if there is no way for us to figure out the
5559      * correct value.  If we have some form of nl_langinfo(), we can always
5560      * figure it out, but lacking that, there may be alternative methods that
5561      * can be used to recover most of the possible items.  Some of those
5562      * methods need libc functions, which may or may not be available.  If
5563      * unavailable, we can't compute the correct value, so must here return the
5564      * default. */
5565     switch (item) {
5566
5567       case CODESET:
5568
5569 #ifdef USE_LOCALE_CTYPE
5570
5571         cat_index = LC_CTYPE_INDEX_;
5572         break;
5573
5574 #else
5575         return C_codeset;
5576 #endif
5577 #if defined(USE_LOCALE_MESSAGES) && defined(HAS_SOME_LANGINFO)
5578
5579       case YESEXPR: case YESSTR: case NOEXPR: case NOSTR:
5580         cat_index = LC_MESSAGES_INDEX_;
5581         break;
5582 #else
5583       case YESEXPR:   return "^[+1yY]";
5584       case YESSTR:    return "yes";
5585       case NOEXPR:    return "^[-0nN]";
5586       case NOSTR:     return "no";
5587 #endif
5588
5589       case CRNCYSTR:
5590
5591 #if  defined(USE_LOCALE_MONETARY)                                   \
5592  && (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV))
5593
5594         cat_index = LC_MONETARY_INDEX_;
5595         break;
5596 #else
5597         return "-";
5598 #endif
5599
5600       case RADIXCHAR:
5601
5602 #ifdef CAN_CALCULATE_RADIX
5603
5604         cat_index = LC_NUMERIC_INDEX_;
5605         break;
5606 #else
5607         return C_decimal_point;
5608 #endif
5609
5610       case THOUSEP:
5611
5612 #if  defined(USE_LOCALE_NUMERIC)                                    \
5613  && (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV))
5614
5615         cat_index = LC_NUMERIC_INDEX_;
5616         break;
5617 #else
5618         return C_thousands_sep;
5619 #endif
5620
5621 /* The other possible items are all in LC_TIME. */
5622 #ifdef USE_LOCALE_TIME
5623
5624       default:
5625         cat_index = LC_TIME_INDEX_;
5626         break;
5627
5628 #endif
5629 #if ! defined(USE_LOCALE_TIME) || ! defined(HAS_SOME_LANGINFO)
5630
5631     /* If not using LC_TIME, hard code the rest.  Or, if there is no
5632      * nl_langinfo(), we use strftime() as an alternative, and it is missing
5633      * functionality to get every single one, so hard-code those */
5634
5635       case ERA: return "";  /* Unimplemented; for use with strftime() %E
5636                                modifier */
5637
5638       /* These formats are defined by C89, so we assume that strftime supports
5639        * them, and so are returned unconditionally; they may not be what the
5640        * locale actually says, but should give good enough results for someone
5641        * using them as formats (as opposed to trying to parse them to figure
5642        * out what the locale says).  The other format items are actually tested
5643        * to verify they work on the platform */
5644       case D_FMT:         return "%x";
5645       case T_FMT:         return "%X";
5646       case D_T_FMT:       return "%c";
5647
5648 #  if defined(WIN32) || ! defined(USE_LOCALE_TIME)
5649
5650       /* strftime() on Windows doesn't have the POSIX (beyond C89) extensions
5651        * that would allow it to recover these */
5652       case ERA_D_FMT:     return "%x";
5653       case ERA_T_FMT:     return "%X";
5654       case ERA_D_T_FMT:   return "%c";
5655       case ALT_DIGITS:    return "0";
5656
5657 #  endif
5658 #  ifndef USE_LOCALE_TIME
5659
5660       case T_FMT_AMPM:    return "%r";
5661       case ABDAY_1:       return "Sun";
5662       case ABDAY_2:       return "Mon";
5663       case ABDAY_3:       return "Tue";
5664       case ABDAY_4:       return "Wed";
5665       case ABDAY_5:       return "Thu";
5666       case ABDAY_6:       return "Fri";
5667       case ABDAY_7:       return "Sat";
5668       case AM_STR:        return "AM";
5669       case PM_STR:        return "PM";
5670       case ABMON_1:       return "Jan";
5671       case ABMON_2:       return "Feb";
5672       case ABMON_3:       return "Mar";
5673       case ABMON_4:       return "Apr";
5674       case ABMON_5:       return "May";
5675       case ABMON_6:       return "Jun";
5676       case ABMON_7:       return "Jul";
5677       case ABMON_8:       return "Aug";
5678       case ABMON_9:       return "Sep";
5679       case ABMON_10:      return "Oct";
5680       case ABMON_11:      return "Nov";
5681       case ABMON_12:      return "Dec";
5682       case DAY_1:         return "Sunday";
5683       case DAY_2:         return "Monday";
5684       case DAY_3:         return "Tuesday";
5685       case DAY_4:         return "Wednesday";
5686       case DAY_5:         return "Thursday";
5687       case DAY_6:         return "Friday";
5688       case DAY_7:         return "Saturday";
5689       case MON_1:         return "January";
5690       case MON_2:         return "February";
5691       case MON_3:         return "March";
5692       case MON_4:         return "April";
5693       case MON_5:         return "May";
5694       case MON_6:         return "June";
5695       case MON_7:         return "July";
5696       case MON_8:         return "August";
5697       case MON_9:         return "September";
5698       case MON_10:        return "October";
5699       case MON_11:        return "November";
5700       case MON_12:        return "December";
5701
5702 #  endif
5703 #endif
5704
5705     } /* End of switch on item */
5706
5707 #ifndef USE_LOCALE
5708
5709     Perl_croak_nocontext("panic: Unexpected nl_langinfo() item %d", item);
5710     NOT_REACHED; /* NOTREACHED */
5711     PERL_UNUSED_VAR(cat_index);
5712
5713 #else
5714
5715     return my_langinfo_i(item, cat_index, query_nominal_locale_i(cat_index),
5716                          &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
5717 #endif
5718
5719 }
5720
5721 char *
5722 Perl_my_strftime(pTHX_ const char *fmt, int sec, int min, int hour,
5723                        int mday, int mon, int year, int wday, int yday,
5724                        int isdst)
5725 {   /* Documented above */
5726     PERL_ARGS_ASSERT_MY_STRFTIME;
5727
5728     struct tm  mytm;
5729     ints_to_tm(&mytm, sec, min, hour, mday, mon, year, wday, yday, isdst);
5730     char * ret = strftime_tm(fmt, &mytm);
5731     return ret;
5732 }
5733
5734 SV *
5735 Perl_sv_strftime_tm(pTHX_ SV * fmt, const struct tm * mytm)
5736 {   /* Documented above */
5737     PERL_ARGS_ASSERT_SV_STRFTIME_TM;
5738
5739     utf8ness_t fmt_utf8ness = (SvUTF8(fmt) && LIKELY(! IN_BYTES))
5740                               ? UTF8NESS_YES
5741                               : UTF8NESS_UNKNOWN;
5742
5743     utf8ness_t result_utf8ness;
5744     char * retval = strftime8(SvPV_nolen(fmt),
5745                               mytm,
5746                               fmt_utf8ness,
5747                               &result_utf8ness,
5748                               true  /* calling from sv_strftime */
5749                              );
5750     SV * sv = NULL;
5751     if (retval) {
5752         STRLEN len = strlen(retval);
5753         sv = newSV(len);
5754         sv_usepvn_flags(sv, retval, len, SV_HAS_TRAILING_NUL);
5755
5756         if (result_utf8ness == UTF8NESS_YES) {
5757             SvUTF8_on(sv);
5758         }
5759     }
5760
5761     return sv;
5762 }
5763
5764 SV *
5765 Perl_sv_strftime_ints(pTHX_ SV * fmt, int sec, int min, int hour,
5766                             int mday, int mon, int year, int wday,
5767                             int yday, int isdst)
5768 {   /* Documented above */
5769     PERL_ARGS_ASSERT_SV_STRFTIME_INTS;
5770
5771     struct tm  mytm;
5772     ints_to_tm(&mytm, sec, min, hour, mday, mon, year, wday, yday, isdst);
5773     SV * ret = sv_strftime_tm(fmt, &mytm);
5774     return ret;
5775 }
5776
5777 #ifdef USE_LOCALE
5778
5779 /* There are several implementations of my_langinfo, depending on the
5780  * Configuration.  They all share the same beginning of the function */
5781 STATIC const char *
5782 S_my_langinfo_i(pTHX_
5783                 const nl_item item,           /* The item to look up */
5784                 const locale_category_index cat_index, /* The locale category
5785                                                           that controls it */
5786                 /* The locale to look up 'item' in. */
5787                 const char * locale,
5788
5789                 /* Where to store the result, and where the size of that buffer
5790                  * is stored, updated on exit. retbuf_sizep may be NULL for an
5791                  * empty-on-entry, single use buffer whose size we don't need
5792                  * to keep track of */
5793                 char ** retbufp,
5794                 Size_t * retbuf_sizep,
5795
5796                 /* If not NULL, the location to store the UTF8-ness of 'item's
5797                  * value, as documented */
5798                 utf8ness_t * utf8ness)
5799 {
5800     const char * retval = NULL;
5801
5802     PERL_ARGS_ASSERT_MY_LANGINFO_I;
5803     assert(cat_index < LC_ALL_INDEX_);
5804
5805     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5806                            "Entering my_langinfo item=%ld, using locale %s\n",
5807                            (long) item, locale));
5808 /*--------------------------------------------------------------------------*/
5809 /* Above is the common beginning to all the implementations of my_langinfo().
5810  * Below are the various completions.
5811  *
5812  * Some platforms don't deal well with non-ASCII strings in locale X when
5813  * LC_CTYPE is not in X.  (Actually it is probably when X is UTF-8 and LC_CTYPE
5814  * isn't, or vice versa).  There is explicit code to bring the categories into
5815  * sync.  This doesn't seem to be a problem with nl_langinfo(), so that
5816  * implementation doesn't currently worry about it.  But it is a problem on
5817  * Windows boxes, which don't have nl_langinfo(). */
5818
5819 /*--------------------------------------------------------------------------*/
5820 #  if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available.  */
5821 #    ifdef USE_LOCALE_CTYPE
5822
5823     /* This function sorts out if things actually have to be switched or not,
5824      * for both save and restore. */
5825     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
5826
5827 #    endif
5828
5829     const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
5830
5831     gwLOCALE_LOCK;
5832     retval = save_to_buffer(nl_langinfo(item), retbufp, retbuf_sizep);
5833     gwLOCALE_UNLOCK;
5834
5835     if (utf8ness) {
5836         *utf8ness = get_locale_string_utf8ness_i(retval,
5837                                                  LOCALE_UTF8NESS_UNKNOWN,
5838                                                  locale, cat_index);
5839     }
5840
5841     restore_toggled_locale_i(cat_index, orig_switched_locale);
5842
5843 #    ifdef USE_LOCALE_CTYPE
5844
5845     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
5846
5847 #    endif
5848
5849     return retval;
5850 /*--------------------------------------------------------------------------*/
5851 #  else   /* Below, emulate nl_langinfo as best we can */
5852
5853     /* The other completion is where we have to emulate nl_langinfo().  There
5854      * are various possibilities depending on the Configuration */
5855
5856 #    ifdef USE_LOCALE_CTYPE
5857
5858     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
5859
5860 #    endif
5861
5862     const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
5863
5864     /* Here, we are in the locale we want information about */
5865
5866     /* Almost all the items will have ASCII return values.  Set that here, and
5867      * override if necessary */
5868     utf8ness_t is_utf8 = UTF8NESS_IMMATERIAL;
5869
5870     switch (item) {
5871       default:
5872         assert(item < 0);   /* Make sure using perl_langinfo.h */
5873         retval = "";
5874         break;
5875
5876       case RADIXCHAR:
5877
5878 #    if      defined(HAS_SNPRINTF)                                          \
5879        && (! defined(HAS_LOCALECONV) || defined(TS_W32_BROKEN_LOCALECONV))
5880
5881         {
5882             /* snprintf() can be used to find the radix character by outputting
5883              * a known simple floating point number to a buffer, and parsing
5884              * it, inferring the radix as the bytes separating the integer and
5885              * fractional parts.  But localeconv() is more direct, not
5886              * requiring inference, so use it instead of the code just below,
5887              * if (likely) it is available and works ok */
5888
5889             char * floatbuf = NULL;
5890             const Size_t initial_size = 10;
5891
5892             Newx(floatbuf, initial_size, char);
5893
5894             /* 1.5 is exactly representable on binary computers */
5895             Size_t needed_size = snprintf(floatbuf, initial_size, "%.1f", 1.5);
5896
5897             /* If our guess wasn't big enough, increase and try again, based on
5898              * the real number that snprintf() is supposed to return */
5899             if (UNLIKELY(needed_size >= initial_size)) {
5900                 needed_size++;  /* insurance */
5901                 Renew(floatbuf, needed_size, char);
5902                 Size_t new_needed = snprintf(floatbuf, needed_size, "%.1f", 1.5);
5903                 assert(new_needed <= needed_size);
5904                 needed_size = new_needed;
5905             }
5906
5907             char * s = floatbuf;
5908             char * e = floatbuf + needed_size;
5909
5910             /* Find the '1' */
5911             while (s < e && *s != '1') {
5912                 s++;
5913             }
5914
5915             if (LIKELY(s < e)) {
5916                 s++;
5917             }
5918
5919             /* Find the '5' */
5920             char * item_start = s;
5921             while (s < e && *s != '5') {
5922                 s++;
5923             }
5924
5925             /* Everything in between is the radix string */
5926             if (LIKELY(s < e)) {
5927                 *s = '\0';
5928                 retval = save_to_buffer(item_start, retbufp, retbuf_sizep);
5929                 Safefree(floatbuf);
5930
5931                 if (utf8ness) {
5932                     is_utf8 = get_locale_string_utf8ness_i(retval,
5933                                                         LOCALE_UTF8NESS_UNKNOWN,
5934                                                         locale, cat_index);
5935                 }
5936
5937                 break;
5938             }
5939
5940             Safefree(floatbuf);
5941         }
5942
5943 #      ifdef HAS_LOCALECONV /* snprintf() failed; drop down to use
5944                                localeconv() */
5945
5946         /* FALLTHROUGH */
5947
5948 #      else                      /* snprintf() failed and no localeconv() */
5949
5950         retval = C_decimal_point;
5951         break;
5952
5953 #      endif
5954 #    endif
5955 #    ifdef HAS_LOCALECONV
5956
5957     /* These items are available from localeconv().  (To avoid using
5958      * TS_W32_BROKEN_LOCALECONV, one could use GetNumberFormat and
5959      * GetCurrencyFormat; patches welcome) */
5960
5961 #      define P_CS_PRECEDES    "p_cs_precedes"
5962 #      define CURRENCY_SYMBOL  "currency_symbol"
5963
5964    /* case RADIXCHAR:   // May drop down to here in some configurations */
5965       case THOUSEP:
5966       case CRNCYSTR:
5967        {
5968
5969         /* The hash gets populated with just the field(s) related to 'item'. */
5970         HV * result_hv = my_localeconv(item);
5971
5972         SV* string;
5973         if (item != CRNCYSTR) {
5974
5975             /* These items have been populated with just one key => value */
5976             (void) hv_iterinit(result_hv);
5977             HE * entry = hv_iternext(result_hv);
5978             string = hv_iterval(result_hv, entry);
5979         }
5980         else {
5981
5982             /* But CRNCYSTR localeconv() returns a slightly different value
5983              * than the nl_langinfo() API calls for, so have to modify this one
5984              * to conform.  We need another value from localeconv() to know
5985              * what to change it to.  my_localeconv() has populated the hash
5986              * with exactly both fields.  Delete this one, leaving just the
5987              * CRNCYSTR one in the hash */
5988             SV* precedes = hv_delete(result_hv,
5989                                      P_CS_PRECEDES, STRLENs(P_CS_PRECEDES),
5990                                      0);
5991             if (! precedes) {
5992                 locale_panic_("my_localeconv() unexpectedly didn't return"
5993                               " a value for " P_CS_PRECEDES);
5994             }
5995
5996             /* The modification is to prefix the localeconv() return with a
5997              * single byte, calculated as follows: */
5998             char prefix = (LIKELY(SvIV(precedes) != -1))
5999                           ? ((precedes != 0) ?  '-' : '+')
6000
6001                             /* khw couldn't find any documentation that
6002                              * CHAR_MAX (which we modify to -1) is the signal,
6003                              * but cygwin uses it thusly, and it makes sense
6004                              * given that CHAR_MAX indicates the value isn't
6005                              * used, so it neither precedes nor succeeds */
6006                           : '.';
6007
6008             /* Now get CRNCYSTR */
6009             (void) hv_iterinit(result_hv);
6010             HE * entry = hv_iternext(result_hv);
6011             string = hv_iterval(result_hv, entry);
6012
6013             /* And perform the modification */
6014             Perl_sv_setpvf(aTHX_ string, "%c%s", prefix, SvPV_nolen(string));
6015         }
6016
6017         /* Here, 'string' contains the value we want to return */
6018         retval = save_to_buffer(SvPV_nolen(string), retbufp, retbuf_sizep);
6019
6020         if (utf8ness) {
6021             is_utf8 = get_locale_string_utf8ness_i(retval,
6022                                                    LOCALE_UTF8NESS_UNKNOWN,
6023                                                    locale,
6024                                                    cat_index);
6025         }
6026
6027         break;
6028
6029        }
6030
6031 #    endif  /* Some form of localeconv */
6032 #    ifdef HAS_STRFTIME
6033
6034       /* These formats are only available in later strftime's */
6035       case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM:
6036
6037       /* The rest can be gotten from most versions of strftime(). */
6038       case ABDAY_1: case ABDAY_2: case ABDAY_3:
6039       case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7:
6040       case ALT_DIGITS:
6041       case AM_STR: case PM_STR:
6042       case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4:
6043       case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8:
6044       case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12:
6045       case DAY_1: case DAY_2: case DAY_3: case DAY_4:
6046       case DAY_5: case DAY_6: case DAY_7:
6047       case MON_1: case MON_2: case MON_3: case MON_4:
6048       case MON_5: case MON_6: case MON_7: case MON_8:
6049       case MON_9: case MON_10: case MON_11: case MON_12:
6050         {
6051             const char * format;
6052             bool return_format = FALSE;
6053             int mon = 0;
6054             int mday = 1;
6055             int hour = 6;
6056
6057             GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
6058
6059             switch (item) {
6060               default:
6061                 locale_panic_(Perl_form(aTHX_ "switch case: %d problem", item));
6062                 NOT_REACHED; /* NOTREACHED */
6063
6064               case PM_STR: hour = 18;
6065               case AM_STR:
6066                 format = "%p";
6067                 break;
6068               case ABDAY_7: mday++;
6069               case ABDAY_6: mday++;
6070               case ABDAY_5: mday++;
6071               case ABDAY_4: mday++;
6072               case ABDAY_3: mday++;
6073               case ABDAY_2: mday++;
6074               case ABDAY_1:
6075                 format = "%a";
6076                 break;
6077               case DAY_7: mday++;
6078               case DAY_6: mday++;
6079               case DAY_5: mday++;
6080               case DAY_4: mday++;
6081               case DAY_3: mday++;
6082               case DAY_2: mday++;
6083               case DAY_1:
6084                 format = "%A";
6085                 break;
6086               case ABMON_12: mon++;
6087               case ABMON_11: mon++;
6088               case ABMON_10: mon++;
6089               case ABMON_9:  mon++;
6090               case ABMON_8:  mon++;
6091               case ABMON_7:  mon++;
6092               case ABMON_6:  mon++;
6093               case ABMON_5:  mon++;
6094               case ABMON_4:  mon++;
6095               case ABMON_3:  mon++;
6096               case ABMON_2:  mon++;
6097               case ABMON_1:
6098                 format = "%b";
6099                 break;
6100               case MON_12: mon++;
6101               case MON_11: mon++;
6102               case MON_10: mon++;
6103               case MON_9:  mon++;
6104               case MON_8:  mon++;
6105               case MON_7:  mon++;
6106               case MON_6:  mon++;
6107               case MON_5:  mon++;
6108               case MON_4:  mon++;
6109               case MON_3:  mon++;
6110               case MON_2:  mon++;
6111               case MON_1:
6112                 format = "%B";
6113                 break;
6114               case T_FMT_AMPM:
6115                 format = "%r";
6116                 return_format = TRUE;
6117                 break;
6118               case ERA_D_FMT:
6119                 format = "%Ex";
6120                 return_format = TRUE;
6121                 break;
6122               case ERA_T_FMT:
6123                 format = "%EX";
6124                 return_format = TRUE;
6125                 break;
6126               case ERA_D_T_FMT:
6127                 format = "%Ec";
6128                 return_format = TRUE;
6129                 break;
6130               case ALT_DIGITS:
6131                 format = "%Ow"; /* Find the alternate digit for 0 */
6132                 break;
6133             }
6134
6135             GCC_DIAG_RESTORE_STMT;
6136
6137             /* The year was deliberately chosen so that January 1 is on the
6138              * first day of the week.  Since we're only getting one thing at a
6139              * time, it all works */
6140             struct tm  mytm;
6141             ints_to_tm(&mytm, 30, 30, hour, mday, mon, 2011, 0, 0, 0);
6142             char * temp;
6143             if (utf8ness) {
6144                 temp = strftime8(format,
6145                                  &mytm,
6146                                  UTF8NESS_IMMATERIAL, /* All possible formats
6147                                                          specified above are
6148                                                          entirely ASCII */
6149                                  &is_utf8,
6150                                  false      /* not calling from sv_strftime */
6151                                 );
6152             }
6153             else {
6154                 temp = strftime_tm(format, &mytm);
6155             }
6156
6157             retval = save_to_buffer(temp, retbufp, retbuf_sizep);
6158             Safefree(temp);
6159
6160             /* If the item is 'ALT_DIGITS', '*retbuf' contains the alternate
6161              * format for wday 0.  If the value is the same as the normal 0,
6162              * there isn't an alternate, so clear the buffer.
6163              *
6164              * (wday was chosen because its range is all a single digit.
6165              * Things like tm_sec have two digits as the minimum: '00'.) */
6166             if (item == ALT_DIGITS && strEQ(*retbufp, "0")) {
6167                 retval = "";
6168                 break;
6169             }
6170
6171             /* ALT_DIGITS is problematic.  Experiments on it showed that
6172              * strftime() did not always work properly when going from alt-9 to
6173              * alt-10.  Only a few locales have this item defined, and in all
6174              * of them on Linux that khw was able to find, nl_langinfo() merely
6175              * returned the alt-0 character, possibly doubled.  Most Unicode
6176              * digits are in blocks of 10 consecutive code points, so that is
6177              * sufficient information for such scripts, as we can infer alt-1,
6178              * alt-2, ....  But for a Japanese locale, a CJK ideographic 0 is
6179              * returned, and the CJK digits are not in code point order, so you
6180              * can't really infer anything.  The localedef for this locale did
6181              * specify the succeeding digits, so that strftime() works properly
6182              * on them, without needing to infer anything.  But the
6183              * nl_langinfo() return did not give sufficient information for the
6184              * caller to understand what's going on.  So until there is
6185              * evidence that it should work differently, this returns the alt-0
6186              * string for ALT_DIGITS. */
6187
6188             if (return_format) {
6189
6190                 /* If to return the format, not the value, overwrite the buffer
6191                  * with it.  But some strftime()s will keep the original format
6192                  * if illegal, so change those to "" */
6193                 if (strEQ(*retbufp, format)) {
6194                     retval = "";
6195                 }
6196                 else {
6197                     retval = format;
6198                 }
6199
6200                 /* A format is always in ASCII */
6201                 is_utf8 = UTF8NESS_IMMATERIAL;
6202             }
6203
6204             break;
6205         }
6206
6207 #    endif
6208 #    ifdef USE_LOCALE_CTYPE
6209
6210       case CODESET:
6211
6212         /* The trivial case */
6213         if (isNAME_C_OR_POSIX(locale)) {
6214             retval = C_codeset;
6215             break;
6216         }
6217
6218 #      ifdef WIN32
6219
6220         /* This function retrieves the code page.  It is subject to change, but
6221          * is documented and has been stable for many releases */
6222         UINT ___lc_codepage_func(void);
6223
6224 #        ifndef WIN32_USE_FAKE_OLD_MINGW_LOCALES
6225
6226         retval = save_to_buffer(Perl_form(aTHX_ "%d", ___lc_codepage_func()),
6227                                 retbufp, retbuf_sizep);
6228 #        else
6229
6230         retval = save_to_buffer(nl_langinfo(CODESET),
6231                                 retbufp, retbuf_sizep);
6232 #        endif
6233
6234         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "locale='%s' cp=%s\n",
6235                                                locale, retval));
6236         break;
6237
6238 #      else
6239
6240         /* The codeset is important, but khw did not figure out a way for it to
6241          * be retrieved on non-Windows boxes without nl_langinfo().  But even
6242          * if we can't get it directly, we can usually determine if it is a
6243          * UTF-8 locale or not.  If it is UTF-8, we (correctly) use that for
6244          * the code set. */
6245
6246 #        if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
6247
6248         /* If libc mbtowc() evaluates the bytes that form the REPLACEMENT
6249          * CHARACTER as that Unicode code point, this has to be a UTF-8 locale.
6250          * */
6251         wchar_t wc = 0;
6252         (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0);/* Reset shift state */
6253         int mbtowc_ret = Perl_mbtowc_(aTHX_ &wc,
6254                                       STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
6255         if (mbtowc_ret >= 0 && wc == UNICODE_REPLACEMENT) {
6256             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6257                                    "mbtowc returned REPLACEMENT\n"));
6258             retval = "UTF-8";
6259             break;
6260         }
6261
6262         /* Here, it isn't a UTF-8 locale.  After the #else clause is code to
6263          * find the codeset (if any) from the locale name */
6264
6265 #        else
6266
6267         /* Here, neither mbtowc() nor mbrtowc() is available.  The chances of
6268          * this are very small, as they are C99 required functions, and we are
6269          * now requiring C99; perhaps this is a defective implementation and
6270          * therefore Configure has been set to indicate neither exists.
6271          *
6272          * Just below we try to calculate the code set from the locale name.
6273          * In all cases but this one, it has already been determined that it is
6274          * not a UTF-8 locale.  But for this case, we defer that, calculate the
6275          * code set name, if any, and later use that result as a hint.  First
6276          * #define a symbol to later tell us that we need to handle this case.
6277          * */
6278 #          define NEED_FURTHER_UTF8NESS_CHECKING
6279 #        endif
6280
6281         /* Here, the code set has not been found.  The only other option khw
6282          * could think of is to see if the codeset is part of the locale name.
6283          * This is very less than ideal; often there is no code set in the
6284          * name; and at other times they even lie.
6285          *
6286          * But there is an XPG standard syntax, which many locales follow:
6287          *
6288          *    language[_territory[.codeset]][@modifier]
6289          *
6290          * So we take the part between the dot and any '@' */
6291         retval = strchr(locale, '.');
6292         if (! retval) {
6293             retval = "";  /* Alas, no dot */
6294         }
6295         else {
6296
6297             /* Don't include the dot */
6298             retval++;
6299
6300             /* And stop before any '@' */
6301             const char * modifier = strchr(retval, '@');
6302             if (modifier) {
6303                 char * code_set_name;
6304                 const Size_t name_len = modifier - retval;
6305                 Newx(code_set_name, name_len + 1, char);    /* +1 for NUL */
6306                 my_strlcpy(code_set_name, retval, name_len + 1);
6307                 SAVEFREEPV(code_set_name);
6308                 retval = code_set_name;
6309             }
6310
6311             /* The code set name is considered to be everything between the dot
6312              * and the '@' */
6313             retval = save_to_buffer(retval, retbufp, retbuf_sizep);
6314         }
6315
6316 #        ifndef NEED_FURTHER_UTF8NESS_CHECKING
6317
6318         break;  /* All done */
6319
6320 #        else
6321 #          define NAME_INDICATES_UTF8       0x1
6322 #          define MB_CUR_MAX_SUGGESTS_UTF8  0x2
6323
6324         /* Here, 'retval' contains whatever code set name is in the locale
6325          * name.  In this #else, it being a UTF-8 code set hasn't been
6326          * determined, because this platform is lacking the libc functions
6327          * which would readily return that information.  So, we try to infer
6328          * the UTF-8ness by other means, using the code set name just found as
6329          * a hint to help resolve ambiguities.  So if that name indicates it is
6330          * UTF-8, we expect it to be so */
6331         unsigned int lean_towards_being_utf8 = 0;
6332         if (is_codeset_name_UTF8(retval)) {
6333             lean_towards_being_utf8 |= NAME_INDICATES_UTF8;
6334         }
6335
6336         /* The code set is often UTF-8, even when the locale name doesn't so
6337          * indicate.  If we discover this is so, we will override whatever the
6338          * locale name said.  Conversely (but rarely), "UTF-8" in the locale
6339          * name might be wrong.  We return "" as the code set name if we find
6340          * that to be the case.
6341          *
6342          * For this portion of the file to compile, neither mbtowc() nor
6343          * mbrtowc() are available to us, even though they are required by C99.
6344          * So, something must be wrong with them.  The code here should be good
6345          * enough to work around this issue, but should the need arise, you
6346          * could look for other C99 functions that are implemented correctly to
6347          * use instead.
6348          *
6349          * But MB_CUR_MAX is a C99 construct that helps a lot, is simple for a
6350          * vendor to implement, and our experience with it is that it works
6351          * well on a variety of platforms.  We have found that it returns a
6352          * too-large number on some platforms for the C locale, but for no
6353          * others.  That locale was already ruled out above.  (If MB_CUR_MAX
6354          * returned too small a number, that would break a lot of things, and
6355          * likely would be quickly corrected by the vendor.)  khw has some
6356          * confidence that it doesn't return >1 when 1 is meant, as that would
6357          * trigger a Perl warning, and we've had no reports of invalid
6358          * occurrences of such. */
6359 #          ifdef MB_CUR_MAX
6360
6361         /* If there are fewer bytes available in this locale than are required
6362          * to represent the largest legal UTF-8 code point, this definitely
6363          * isn't a UTF-8 locale, even if the locale name says it is. */
6364         LC_CTYPE_LOCK;
6365         const int mb_cur_max = MB_CUR_MAX;
6366         LC_CTYPE_UNLOCK;
6367         if (mb_cur_max < (int) UNISKIP(PERL_UNICODE_MAX)) {
6368             if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
6369                 retval = "";    /* The name is wrong; override */
6370             }
6371
6372             break;
6373         }
6374
6375         /* But if the locale could be UTF-8, and also the name corroborates
6376          * this, assume it is so */
6377         if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
6378             break;
6379         }
6380
6381         /* Here, the name doesn't indicate UTF-8, but MB_CUR_MAX indicates it
6382          * could be.  khw knows of only two other locales in the world, EUC-TW
6383          * and GB 18030, that legitimately require this many bytes (4).  In
6384          * both, the single byte characters are the same as ASCII.  No
6385          * multi-byte character in EUC-TW is legal UTF-8 (since the first byte
6386          * of each is a continuation).  GB 18030 has no three byte sequences,
6387          * and none of the four byte ones is legal UTF-8 (as the second byte
6388          * for these is a non-continuation).  But every legal UTF-8 two byte
6389          * sequence is also legal in GB 18030, though none have the same
6390          * meaning, and no Han code point expressed in UTF-8 is two byte.  So
6391          * the further tests below which look for native expressions of
6392          * currency and time will not return two byte sequences, hence they
6393          * will reliably rule out this locale as being UTF-8.  So, if we get
6394          * this far, the result is almost certainly UTF-8.  But to be really
6395          * sure, also check that there is no illegal UTF-8. */
6396         lean_towards_being_utf8 |= MB_CUR_MAX_SUGGESTS_UTF8;
6397
6398 #          endif    /* has MB_CUR_MAX */
6399
6400         /* Here, MB_CUR_MAX is not available, or was inconclusive.  What we do
6401          * is to look at various strings associated with the locale:
6402          *  1)  If any are illegal UTF-8, the locale can't be UTF-8.
6403          *  2)  If all are legal UTF-8, and some non-ASCII characters are
6404          *      present, it is likely to be UTF-8, because of the strictness of
6405          *      UTF-8 syntax. So assume it is UTF-8
6406          *  3)  If all are ASCII and the locale name and/or MB_CUR_MAX indicate
6407          *      UTF-8, assume the locale is UTF-8.
6408          *  4)  Otherwise, assume the locale isn't UTF-8
6409          *
6410          * To save cycles, if the locale name indicates it is a UTF-8 locale,
6411          * we stop looking at the first instance with legal non-ASCII UTF-8.
6412          * It is very unlikely this combination is coincidental. */
6413
6414         utf8ness_t strings_utf8ness = UTF8NESS_UNKNOWN;
6415         char * scratch_buf = NULL;
6416         Size_t scratch_buf_size = 0;
6417
6418         /* List of strings to look at */
6419         const int trials[] = {
6420
6421 #          if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
6422
6423             /* The first string tried is the locale currency name.  Often that
6424              * will be in the native script.
6425              *
6426              * But this is usable only if localeconv() is available, as that's
6427              * the way we find out the currency symbol. */
6428
6429             CRNCYSTR,
6430
6431 #          endif
6432 #          ifdef USE_LOCALE_TIME
6433
6434         /* We can also try various strings associated with LC_TIME, like the
6435          * names of months or days of the week */
6436
6437             DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7,
6438             MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8,
6439                                         MON_9, MON_10, MON_11, MON_12,
6440             ALT_DIGITS, AM_STR, PM_STR,
6441             ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6, ABDAY_7,
6442             ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6,
6443             ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12
6444
6445 #          endif
6446         };
6447
6448 #          ifdef USE_LOCALE_TIME
6449
6450         /* The code in the recursive call below can handle switching the
6451          * locales, but by doing it now here, that code will check and discover
6452          * that there is no need to switch then restore, avoiding those each
6453          * loop iteration */
6454         const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
6455
6456 #          endif
6457
6458         /* The trials array may consist of strings from two different locale
6459          * categories.  The call to my_langinfo_i() below needs to pass the
6460          * proper category for each string.  There is a max of 1 trial for
6461          * LC_MONETARY; the rest are LC_TIME.  So the array is arranged so the
6462          * LC_MONETARY item (if any) is first, and all subsequent iterations
6463          * will use LC_TIME.  These #ifdefs set up the values for all possible
6464          * combinations. */
6465 #          if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
6466
6467         locale_category_index  cat_index = LC_MONETARY_INDEX_;
6468
6469 #            ifdef USE_LOCALE_TIME
6470
6471         const locale_category_index  follow_on_cat_index = LC_TIME_INDEX_;
6472         assert(trials[1] == DAY_1); /* Make sure only a single non-time entry */
6473
6474 #            else
6475
6476         /* Effectively out-of-bounds, as there is only the monetary entry */
6477         const locale_category_index  follow_on_cat_index = LC_ALL_INDEX_;
6478
6479 #            endif
6480 #          elif defined(USE_LOCALE_TIME)
6481
6482         locale_category_index  cat_index = LC_TIME_INDEX_;
6483         const locale_category_index  follow_on_cat_index = LC_TIME_INDEX_;
6484
6485 #          else
6486
6487         /* Effectively out-of-bounds, as here there are no trial entries at
6488          * all.  This allows this code to compile, but there are no strings to
6489          * test, and so the answer will always be non-UTF-8. */
6490         locale_category_index  cat_index = LC_ALL_INDEX_;
6491         const locale_category_index  follow_on_cat_index = LC_ALL_INDEX_;
6492 #          endif
6493
6494         /* Everything set up; look through all the strings */
6495         for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(trials); i++) {
6496             (void) my_langinfo_i(trials[i], cat_index, locale,
6497                                  &scratch_buf, &scratch_buf_size, NULL);
6498             cat_index = follow_on_cat_index;
6499
6500             /* To prevent infinite recursive calls, we don't ask for the
6501              * UTF-8ness of the string (in 'trials[i]') above.  Instead we
6502              * examine the returned string here */
6503             const Size_t len = strlen(scratch_buf);
6504             const U8 * first_variant;
6505
6506             /* If the string is identical whether or not it is encoded as
6507              * UTF-8, it isn't helpful in determining UTF8ness. */
6508             if (is_utf8_invariant_string_loc((U8 *) scratch_buf, len,
6509                                              &first_variant))
6510             {
6511                 continue;
6512             }
6513
6514             /* Here, has non-ASCII.  If not legal UTF-8, isn't a UTF-8
6515              * locale */
6516             if (! is_utf8_string(first_variant,
6517                                  len - (first_variant - (U8 *) scratch_buf)))
6518             {
6519                 strings_utf8ness = UTF8NESS_NO;
6520                 break;
6521             }
6522
6523             /* Here, is a legal non-ASCII UTF-8 string; tentatively set the
6524              * return to YES; possibly overridden by later iterations */
6525             strings_utf8ness = UTF8NESS_YES;
6526
6527             /* But if this corroborates our expectation, quit now */
6528             if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
6529                 break;
6530             }
6531         }
6532
6533 #          ifdef USE_LOCALE_TIME
6534
6535         restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
6536
6537 #          endif
6538
6539         Safefree(scratch_buf);
6540         scratch_buf = NULL;
6541
6542         if (strings_utf8ness == UTF8NESS_NO) {
6543             /* 'retval' is already loaded with whatever code set we found. */
6544             break;
6545         }
6546
6547         /* Here all tested strings are legal UTF-8.
6548          *
6549          * Above we set UTF8NESS_YES if any string wasn't ASCII.  But even if
6550          * they are all ascii, and the locale name indicates it is a UTF-8
6551          * locale, assume the locale is UTF-8. */
6552         if (lean_towards_being_utf8) {
6553             strings_utf8ness = UTF8NESS_YES;
6554         }
6555
6556         if (strings_utf8ness == UTF8NESS_YES) {
6557             retval = "UTF-8";
6558             break;
6559         }
6560
6561         /* Here, nothing examined indicates that the codeset is or isn't UTF-8.
6562          * But what is it?  The other locale categories are not likely to be of
6563          * further help:
6564          *
6565          * LC_NUMERIC   Only a few locales in the world have a non-ASCII radix
6566          *              or group separator.
6567          * LC_CTYPE     This code wouldn't be compiled if mbtowc() existed and
6568          *              was reliable.  This is unlikely in C99.  There are
6569          *              other functions that could be used instead, but are
6570          *              they going to exist, and be able to distinguish between
6571          *              UTF-8 and 8859-1?  Deal with this only if it becomes
6572          *              necessary.
6573          * LC_MESSAGES  The strings returned from strerror() would seem likely
6574          *              candidates, but experience has shown that many systems
6575          *              don't actually have translations installed for them.
6576          *              They are instead always in English, so everything in
6577          *              them is ASCII, which is of no help to us.  A Configure
6578          *              probe could possibly be written to see if this platform
6579          *              has non-ASCII error messages.  But again, wait until it
6580          *              turns out to be an actual problem.
6581          *
6582          *              Things like YESSTR, NOSTR, might not be in ASCII, but
6583          *              need nl_langinfo() to access, which we don't have.
6584          */
6585
6586         /* Otherwise, assume the locale isn't UTF-8.  This can be wrong if we
6587          * don't have MB_CUR_MAX, and the locale is English without UTF-8 in
6588          * its name, and with a dollar currency symbol. */
6589         break; /* 'retval' is already loaded with whatever code set we found. */
6590
6591 #        endif  /* NEED_FURTHER_UTF8NESS_CHECKING */
6592 #      endif    /* ! WIN32 */
6593 #    endif      /* USE_LOCALE_CTYPE */
6594
6595     } /* Giant switch() of nl_langinfo() items */
6596
6597     restore_toggled_locale_i(cat_index, orig_switched_locale);
6598
6599 #    ifdef USE_LOCALE_CTYPE
6600     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
6601 #    endif
6602
6603     if (utf8ness) {
6604         *utf8ness = is_utf8;
6605     }
6606
6607     return retval;
6608
6609 #  endif    /* All the implementations of my_langinfo() */
6610
6611 /*--------------------------------------------------------------------------*/
6612
6613 }   /* my_langinfo() */
6614
6615 #endif      /* USE_LOCALE */
6616
6617 /*
6618 =for apidoc_section $time
6619 =for apidoc      sv_strftime_tm
6620 =for apidoc_item sv_strftime_ints
6621 =for apidoc_item my_strftime
6622
6623 These implement the libc strftime(), but with a different API so that the return
6624 value is a pointer to the formatted result (which MUST be arranged to be FREED
6625 BY THE CALLER).  This allows these functions to increase the buffer size as
6626 needed, so that the caller doesn't have to worry about that.
6627
6628 On failure they return NULL, and set errno to C<EINVAL>.
6629
6630 C<sv_strftime_tm> and C<sv_strftime_ints> are preferred, as they transparently
6631 handle the UTF-8ness of the current locale, the input C<fmt>, and the returned
6632 result.  Only if the current C<LC_TIME> locale is a UTF-8 one (and S<C<use
6633 bytes>> is not in effect) will the result be marked as UTF-8.  These differ
6634 only in the form of their inputs.  C<sv_strftime_tm> takes a filled-in
6635 S<C<struct tm>> parameter.  C<sv_strftime_ints> takes a bunch of integer
6636 parameters that together completely define a given time.
6637
6638 C<my_strftime> is kept for backwards compatibility.  Knowing if the result
6639 should be considered UTF-8 or not requires significant extra logic.
6640
6641 Note that C<yday> and C<wday> effectively are ignored by C<sv_strftime_ints>
6642 and C<my_strftime>, as mini_mktime() overwrites them
6643
6644 Also note that all three functions are always executed in the underlying
6645 C<LC_TIME> locale of the program, giving results based on that locale.
6646
6647 =cut
6648  */
6649
6650 STATIC void
6651 S_ints_to_tm(pTHX_ struct tm * mytm,
6652                    int sec, int min, int hour, int mday, int mon, int year,
6653                    int wday, int yday, int isdst)
6654 {
6655     /* Create a struct tm structure from the input time-related integer
6656      * variables */
6657
6658     /* Override with the passed-in values */
6659     Zero(mytm, 1, struct tm);
6660     mytm->tm_sec = sec;
6661     mytm->tm_min = min;
6662     mytm->tm_hour = hour;
6663     mytm->tm_mday = mday;
6664     mytm->tm_mon = mon;
6665     mytm->tm_year = year;
6666     mytm->tm_wday = wday;
6667     mytm->tm_yday = yday;
6668     mytm->tm_isdst = isdst;
6669     mini_mktime(mytm);
6670
6671     /* use libc to get the values for tm_gmtoff and tm_zone on platforms that
6672      * have them [perl #18238] */
6673 #if  defined(HAS_MKTIME)                                      \
6674  && (defined(HAS_TM_TM_GMTOFF) || defined(HAS_TM_TM_ZONE))
6675     struct tm mytm2 = *mytm;
6676     MKTIME_LOCK;
6677     mktime(&mytm2);
6678     MKTIME_UNLOCK;
6679 #  ifdef HAS_TM_TM_GMTOFF
6680     mytm->tm_gmtoff = mytm2.tm_gmtoff;
6681 #  endif
6682 #  ifdef HAS_TM_TM_ZONE
6683     mytm->tm_zone = mytm2.tm_zone;
6684 #  endif
6685 #endif
6686
6687     return;
6688 }
6689
6690 STATIC char *
6691 S_strftime_tm(pTHX_ const char *fmt, const struct tm *mytm)
6692 {
6693     PERL_ARGS_ASSERT_STRFTIME_TM;
6694
6695     /* Execute strftime() based on the input struct tm */
6696
6697     /* An empty format yields an empty result */
6698     const int fmtlen = strlen(fmt);
6699     if (fmtlen == 0) {
6700         char *ret;
6701         Newxz (ret, 1, char);
6702         return ret;
6703     }
6704
6705 #ifndef HAS_STRFTIME
6706     Perl_croak(aTHX_ "panic: no strftime");
6707 #else
6708 #  if defined(USE_LOCALE_CTYPE) && defined(USE_LOCALE_TIME)
6709
6710     const char * orig_CTYPE_LOCALE = toggle_locale_c(LC_CTYPE,
6711                                                      querylocale_c(LC_TIME));
6712 #  endif
6713
6714     /* Guess an initial size for the returned string based on an expansion
6715      * factor of the input format, but with a minimum that should handle most
6716      * common cases.  If this guess is too small, we will try again with a
6717      * larger one */
6718     int bufsize = MAX(fmtlen * 2, 64);
6719
6720     char *buf = NULL;   /* Makes Renew() act as Newx() on the first iteration */
6721     do {
6722         Renew(buf, bufsize, char);
6723
6724         /* allowing user-supplied (rather than literal) formats is normally
6725          * frowned upon as a potential security risk; but this is part of the
6726          * API so we have to allow it (and the available formats have a much
6727          * lower chance of doing something bad than the ones for printf etc. */
6728         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
6729
6730         STRFTIME_LOCK;
6731         int len = strftime(buf, bufsize, fmt, mytm);
6732         STRFTIME_UNLOCK;
6733
6734         GCC_DIAG_RESTORE_STMT;
6735
6736         /* A non-zero return indicates success.  But to make sure we're not
6737          * dealing with some rogue strftime that returns how much space it
6738          * needs instead of 0 when there isn't enough, check that the return
6739          * indicates we have at least one byte of spare space (which will be
6740          * used for the terminating NUL). */
6741         if (inRANGE(len, 1, bufsize - 1)) {
6742             goto strftime_return;
6743         }
6744
6745         /* There are several possible reasons for a 0 return code for a
6746          * non-empty format, and they are not trivial to tease apart.  This
6747          * issue is a known bug in the strftime() API.  What we do to cope is
6748          * to assume that the reason is not enough space in the buffer, so
6749          * increase it and try again. */
6750         bufsize *= 2;
6751
6752         /* But don't just keep increasing the size indefinitely.  Stop when it
6753          * becomes obvious that the reason for failure is something besides not
6754          * enough space.  The most likely largest expanding format is %c.  On
6755          * khw's Linux box, the maximum result of this is 67 characters, in the
6756          * km_KH locale.  If a new script comes along that uses 4 UTF-8 bytes
6757          * per character, and with a similar expansion factor, that would be a
6758          * 268:2 byte ratio, or a bit more than 128:1 = 2**7:1.  Some strftime
6759          * implementations allow you to say %1000c to pad to 1000 bytes.  This
6760          * shows that it is impossible to implement this without a heuristic
6761          * (which can fail).  But it indicates we need to be generous in the
6762          * upper limit before failing.  The previous heuristic used was too
6763          * stingy.  Since the size doubles per iteration, it doesn't take many
6764          * to reach the limit */
6765     } while (bufsize < ((1 << 11) + 1) * fmtlen);
6766
6767     /* Here, strftime() returned 0, and it likely wasn't for lack of space.
6768      * There are two possible reasons:
6769      *
6770      * First is that the result is legitimately 0 length.  This can happen
6771      * when the format is precisely "%p".  That is the only documented format
6772      * that can have an empty result. */
6773     if (strEQ(fmt, "%p")) {
6774         Renew(buf, 1, char);
6775         *buf = '\0';
6776         goto strftime_return;
6777     }
6778
6779     /* The other reason is that the format string is malformed.  Probably it is
6780      * that the string is syntactically invalid for the locale.  On some
6781      * platforms an invalid conversion specifier '%?' (for all illegal '?') is
6782      * treated as a literal, but others may fail when '?' is illegal */
6783     Safefree(buf);
6784     SET_EINVAL;
6785     buf = NULL;
6786
6787   strftime_return:
6788
6789 #  if defined(USE_LOCALE_CTYPE) && defined(USE_LOCALE_TIME)
6790
6791     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_LOCALE);
6792
6793 #  endif
6794
6795     return buf;
6796
6797 #endif
6798
6799 }
6800
6801 STATIC char *
6802 S_strftime8(pTHX_ const char * fmt,
6803                   const struct tm * mytm,
6804                   const utf8ness_t fmt_utf8ness,
6805                   utf8ness_t * result_utf8ness,
6806                   const bool came_from_sv)
6807 {
6808     PERL_ARGS_ASSERT_STRFTIME8;
6809
6810     /* Wrap strftime_tm, taking into account the input and output UTF-8ness */
6811
6812 #ifdef USE_LOCALE_TIME
6813 #  define INDEX_TO_USE  LC_TIME_INDEX_
6814
6815     const char * locale = querylocale_c(LC_TIME);
6816     locale_utf8ness_t locale_utf8ness = LOCALE_UTF8NESS_UNKNOWN;
6817
6818 #else
6819 #  define INDEX_TO_USE  LC_ALL_INDEX_   /* Effectively out of bounds */
6820
6821     const char * locale = "C";
6822     locale_utf8ness_t locale_utf8ness = LOCALE_NOT_UTF8;
6823
6824 #endif
6825
6826     switch (fmt_utf8ness) {
6827       case UTF8NESS_IMMATERIAL:
6828         break;
6829
6830       case UTF8NESS_NO: /* Known not to be UTF-8; must not be UTF-8 locale */
6831         if (is_locale_utf8(locale)) {
6832             SET_EINVAL;
6833             return NULL;
6834         }
6835
6836         locale_utf8ness = LOCALE_NOT_UTF8;
6837         break;
6838
6839       case UTF8NESS_YES:    /* Known to be UTF-8; must be UTF-8 locale if can't
6840                                downgrade. */
6841         if (! is_locale_utf8(locale)) {
6842             locale_utf8ness = LOCALE_NOT_UTF8;
6843
6844             bool is_utf8 = true;
6845             Size_t fmt_len = strlen(fmt);
6846             fmt = (char *) bytes_from_utf8((U8 *) fmt, &fmt_len, &is_utf8);
6847             if (is_utf8) {
6848                 SET_EINVAL;
6849                 return NULL;
6850             }
6851
6852             SAVEFREEPV(fmt);
6853         }
6854         else {
6855             locale_utf8ness = LOCALE_IS_UTF8;
6856         }
6857
6858         break;
6859
6860       case UTF8NESS_UNKNOWN:
6861         if (! is_locale_utf8(locale)) {
6862             locale_utf8ness = LOCALE_NOT_UTF8;
6863         }
6864         else {
6865             locale_utf8ness = LOCALE_IS_UTF8;
6866             if (came_from_sv) {
6867
6868                 /* Upgrade 'fmt' to UTF-8 for a UTF-8 locale.  Otherwise the
6869                  * locale would find any UTF-8 variant characters to be
6870                  * malformed */
6871                 Size_t fmt_len = strlen(fmt);
6872                 fmt = (char *) bytes_to_utf8((U8 *) fmt, &fmt_len);
6873                 SAVEFREEPV(fmt);
6874             }
6875         }
6876
6877         break;
6878     }
6879
6880     char * retval = strftime_tm(fmt, mytm);
6881     *result_utf8ness = get_locale_string_utf8ness_i(retval,
6882                                                     locale_utf8ness,
6883                                                     locale,
6884                                                     INDEX_TO_USE);
6885     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6886                           "fmt=%s, retval=%s; utf8ness=%d",
6887                           fmt,
6888                           ((is_utf8_string((U8 *) retval, 0))
6889                            ? retval
6890                            :_byte_dump_string((U8 *) retval, strlen(retval),0)),
6891                           *result_utf8ness));
6892     return retval;
6893
6894 #undef INDEX_TO_USE
6895
6896 }
6897
6898 #ifdef USE_LOCALE
6899
6900 STATIC void
6901 S_give_perl_locale_control(pTHX_
6902 #  ifdef LC_ALL
6903                            const char * lc_all_string,
6904 #  else
6905                            const char ** locales,
6906 #  endif
6907                            const line_t caller_line)
6908 {
6909     PERL_UNUSED_ARG(caller_line);
6910
6911     /* This is called when the program is in the global locale and are
6912      * switching to per-thread (if available).  And it is called at
6913      * initialization time to do the same.
6914      */
6915
6916 #  if defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE)
6917
6918     /* On Windows, convert to per-thread behavior.  This isn't necessary in
6919      * POSIX 2008, as the conversion gets done automatically in the
6920      * void_setlocale_i() calls below. */
6921     if (_configthreadlocale(_ENABLE_PER_THREAD_LOCALE) == -1) {
6922         locale_panic_("_configthreadlocale returned an error");
6923     }
6924
6925 #  endif
6926 #  if ! defined(USE_THREAD_SAFE_LOCALE)                               \
6927    && ! defined(USE_POSIX_2008_LOCALE)
6928 #    if defined(LC_ALL)
6929     PERL_UNUSED_ARG(lc_all_string);
6930 #    else
6931     PERL_UNUSED_ARG(locales);
6932 #    endif
6933 #  else
6934
6935     /* This platform has per-thread locale handling.  Do the conversion. */
6936
6937 #    if defined(LC_ALL)
6938
6939     void_setlocale_c_with_caller(LC_ALL, lc_all_string, __FILE__, caller_line);
6940
6941 #    else
6942
6943     for_all_individual_category_indexes(i) {
6944         void_setlocale_i_with_caller(i, locales[i], __FILE__, caller_line);
6945     }
6946
6947 #    endif
6948 #  endif
6949
6950     /* Finally, update our remaining records.  'true' => force recalculation.
6951      * This is needed because we don't know what's happened while Perl hasn't
6952      * had control, so we need to figure out the current state */
6953
6954 #  if defined(LC_ALL)
6955
6956     new_LC_ALL(lc_all_string, true);
6957
6958 #    else
6959
6960     new_LC_ALL(calculate_LC_ALL_string(locales,
6961                                        INTERNAL_FORMAT,
6962                                        WANT_TEMP_PV,
6963                                        caller_line),
6964                true);
6965 #    endif
6966
6967 }
6968
6969 STATIC void
6970 S_output_check_environment_warning(pTHX_ const char * const language,
6971                                          const char * const lc_all,
6972                                          const char * const lang)
6973 {
6974     PerlIO_printf(Perl_error_log,
6975                   "perl: warning: Please check that your locale settings:\n");
6976
6977 #  ifdef __GLIBC__
6978
6979     PerlIO_printf(Perl_error_log, "\tLANGUAGE = %c%s%c,\n",
6980                                   language ? '"' : '(',
6981                                   language ? language : "unset",
6982                                   language ? '"' : ')');
6983 #  else
6984     PERL_UNUSED_ARG(language);
6985 #  endif
6986
6987     PerlIO_printf(Perl_error_log, "\tLC_ALL = %c%s%c,\n",
6988                                   lc_all ? '"' : '(',
6989                                   lc_all ? lc_all : "unset",
6990                                   lc_all ? '"' : ')');
6991
6992     for_all_individual_category_indexes(i) {
6993         const char * value = PerlEnv_getenv(category_names[i]);
6994         PerlIO_printf(Perl_error_log,
6995                       "\t%s = %c%s%c,\n",
6996                       category_names[i],
6997                       value ? '"' : '(',
6998                       value ? value : "unset",
6999                       value ? '"' : ')');
7000     }
7001
7002     PerlIO_printf(Perl_error_log, "\tLANG = %c%s%c\n",
7003                                   lang ? '"' : '(',
7004                                   lang ? lang : "unset",
7005                                   lang ? '"' : ')');
7006     PerlIO_printf(Perl_error_log,
7007                   "    are supported and installed on your system.\n");
7008 }
7009
7010 #endif
7011
7012 /* A helper macro for the next function.  Needed because would be called in two
7013  * places.  Knows about the internal workings of the function */
7014 #define GET_DESCRIPTION(trial, name)                                    \
7015     ((isNAME_C_OR_POSIX(name))                                          \
7016      ? "the standard locale"                                            \
7017      : ((trial == (system_default_trial)                                \
7018                   ? "the system default locale"                         \
7019                   : "a fallback locale")))
7020
7021 /*
7022  * Initialize locale awareness.
7023  */
7024 int
7025 Perl_init_i18nl10n(pTHX_ int printwarn)
7026 {
7027     /* printwarn is:
7028      *    0 if not to output warning when setup locale is bad
7029      *    1 if to output warning based on value of PERL_BADLANG
7030      *    >1 if to output regardless of PERL_BADLANG
7031      *
7032      * returns
7033      *    1 = set ok or not applicable,
7034      *    0 = fallback to a locale of lower priority
7035      *   -1 = fallback to all locales failed, not even to the C locale
7036      *
7037      * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
7038      * set, debugging information is output.
7039      *
7040      * This routine effectively does the following in most cases:
7041      *
7042      *      basic initialization;
7043      *      asserts that the compiled tables are consistent;
7044      *      initialize data structures;
7045      *      make sure we are in the global locale;
7046      *      setlocale(LC_ALL, "");
7047      *      switch to per-thread locale if applicable;
7048      *
7049      * The "" causes the locale to be set to what the environment variables at
7050      * the time say it should be.
7051      *
7052      * To handle possible failures, the setlocale is expanded to be like:
7053      *
7054      *      trial_locale = pre-first-trial;
7055      *      while (has_another_trial()) {
7056      *          trial_locale = next_trial();
7057      *          if setlocale(LC_ALL, trial_locale) {
7058      *              ok = true;
7059      *              break;
7060      *          }
7061      *
7062      *          had_failure = true;
7063      *          warn();
7064      *      }
7065      *
7066      *      if (had_failure) {
7067      *          warn_even_more();
7068      *          if (! ok) warn_still_more();
7069      *      }
7070      *
7071      * The first trial is either:
7072      *      ""      to examine the environment variables for the locale
7073      *      NULL    to use the values already set for the locale by the program
7074      *              embedding this perl instantiation.
7075      *
7076      * Something is wrong if this trial fails, but there is a sequence of
7077      * fallbacks to try should that happen.  They are given in the enum below.
7078
7079      * If there is no LC_ALL defined on the system, the setlocale() above is
7080      * replaced by a loop setting each individual category separately.
7081      *
7082      * In a non-embeded environment, this code is executed exactly once.  It
7083      * sets up the global locale environment.  At the end, if some sort of
7084      * thread-safety is in effect, it will turn thread 0 into using that, with
7085      * the same locale as the global initially.  thread 0 can then change its
7086      * locale at will without affecting the global one.
7087      *
7088      * At destruction time, thread 0 will revert to the global locale as the
7089      * other threads die.
7090      *
7091      * Care must be taken in an embedded environment.  This code will be
7092      * executed for each instantiation.  Since it changes the global locale, it
7093      * could clash with another running instantiation that isn't using
7094      * per-thread locales.  perlembed suggests having the controlling program
7095      * set each instantiation's locale and set PERL_SKIP_LOCALE_INIT so this
7096      * code uses that without actually changing anything.  Then the onus is on
7097      * the controlling program to prevent any races.  The code below does
7098      * enough locking so as to prevent system calls from overwriting data
7099      * before it is safely copied here, but that isn't a general solution.
7100      */
7101
7102 #ifndef USE_LOCALE
7103
7104     PERL_UNUSED_ARG(printwarn);
7105     const int ok = 1;
7106
7107 #else  /* USE_LOCALE to near the end of the routine */
7108
7109     int ok = 0;
7110
7111 #  ifdef __GLIBC__
7112
7113     const char * const language = PerlEnv_getenv("LANGUAGE");
7114
7115 #  else
7116     const char * const language = NULL;     /* Unused placeholder */
7117 #  endif
7118
7119     /* A later getenv() could zap this, so only use here */
7120     const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
7121
7122     const bool locwarn = (printwarn > 1
7123                           || (          printwarn
7124                               && (    ! bad_lang_use_once
7125                                   || (
7126                                          /* disallow with "" or "0" */
7127                                          *bad_lang_use_once
7128                                        && strNE("0", bad_lang_use_once)))));
7129
7130 #  ifndef DEBUGGING
7131 #    define DEBUG_LOCALE_INIT(a,b,c)
7132 #  else
7133
7134     DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
7135
7136 #    define DEBUG_LOCALE_INIT(cat_index, locale, result)                    \
7137         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",                       \
7138                     setlocale_debug_string_i(cat_index, locale, result)));
7139
7140 #    ifdef LC_ALL
7141     assert(categories[LC_ALL_INDEX_] == LC_ALL);
7142     assert(strEQ(category_names[LC_ALL_INDEX_], "LC_ALL"));
7143 #      ifdef USE_POSIX_2008_LOCALE
7144     assert(category_masks[LC_ALL_INDEX_] == LC_ALL_MASK);
7145 #      endif
7146 #    endif
7147
7148     for_all_individual_category_indexes(i) {
7149         assert(category_name_lengths[i] == strlen(category_names[i]));
7150     }
7151
7152 #  endif    /* DEBUGGING */
7153
7154     /* Initialize the per-thread mbrFOO() state variables.  See POSIX.xs for
7155      * why these particular incantations are used. */
7156 #  ifdef HAS_MBRLEN
7157     memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
7158 #  endif
7159 #  ifdef HAS_MBRTOWC
7160     memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
7161 #  endif
7162 #  ifdef HAS_WCTOMBR
7163     wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
7164 #  endif
7165 #  ifdef USE_PL_CURLOCALES
7166
7167     for (unsigned int i = 0; i <= LC_ALL_INDEX_; i++) {
7168         PL_curlocales[i] = savepv("C");
7169     }
7170
7171 #  endif
7172 #  ifdef USE_PL_CUR_LC_ALL
7173
7174     PL_cur_LC_ALL = savepv("C");
7175
7176 #  endif
7177 #  if ! defined(PERL_LC_ALL_USES_NAME_VALUE_PAIRS) && defined(LC_ALL)
7178
7179     LOCALE_LOCK;
7180
7181     /* If we haven't done so already, translate the LC_ALL positions of
7182      * categories into our internal indices. */
7183     if (map_LC_ALL_position_to_index[0] == LC_ALL_INDEX_) {
7184
7185         /* Use this array, initialized by a config.h constant */
7186         int lc_all_category_positions[] = PERL_LC_ALL_CATEGORY_POSITIONS_INIT;
7187         STATIC_ASSERT_STMT(   C_ARRAY_LENGTH(lc_all_category_positions)
7188                            == LC_ALL_INDEX_);
7189
7190         for (unsigned int i = 0;
7191              i < C_ARRAY_LENGTH(lc_all_category_positions);
7192              i++)
7193         {
7194             map_LC_ALL_position_to_index[i] =
7195                               get_category_index(lc_all_category_positions[i]);
7196         }
7197     }
7198
7199     LOCALE_UNLOCK;
7200
7201 #  endif
7202 #  ifdef USE_POSIX_2008_LOCALE
7203
7204     /* This is a global, so be sure to keep another instance from zapping it */
7205     LOCALE_LOCK;
7206     if (PL_C_locale_obj) {
7207         LOCALE_UNLOCK;
7208     }
7209     else {
7210         PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0);
7211         if (! PL_C_locale_obj) {
7212             LOCALE_UNLOCK;
7213             locale_panic_(Perl_form(aTHX_
7214                                 "Cannot create POSIX 2008 C locale object"));
7215         }
7216         LOCALE_UNLOCK;
7217
7218         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "created C object %p\n",
7219                                                PL_C_locale_obj));
7220     }
7221
7222     /* Switch to using the POSIX 2008 interface now.  This would happen below
7223      * anyway, but deferring it can lead to leaks of memory that would also get
7224      * malloc'd in the interim.  We arbitrarily switch to the C locale,
7225      * overridden below  */
7226     if (! uselocale(PL_C_locale_obj)) {
7227         locale_panic_(Perl_form(aTHX_
7228                                 "Can't uselocale(%p), LC_ALL supposed to"
7229                                 " be 'C'",
7230                                 PL_C_locale_obj));
7231     }
7232
7233 #    ifdef MULTIPLICITY
7234
7235     PL_cur_locale_obj = PL_C_locale_obj;
7236
7237 #    endif
7238 #  endif
7239
7240     /* Now initialize some data structures.  This is entirely so that
7241      * later-executed code doesn't have to concern itself with things not being
7242      * initialized.  Arbitrarily use the C locale (which we know has to exist
7243      * on the system). */
7244
7245 #  ifdef USE_LOCALE_NUMERIC
7246
7247     PL_numeric_radix_sv    = newSV(1);
7248     PL_underlying_radix_sv = newSV(1);
7249     Newxz(PL_numeric_name, 1, char);    /* Single NUL character */
7250
7251 #  endif
7252 #  ifdef USE_LOCALE_COLLATE
7253
7254     Newxz(PL_collation_name, 1, char);
7255
7256 #  endif
7257 #  ifdef USE_LOCALE_CTYPE
7258
7259     Newxz(PL_ctype_name, 1, char);
7260
7261 #  endif
7262
7263     new_LC_ALL("C", true /* Don't shortcut */);
7264
7265 /*===========================================================================*/
7266
7267     /* Now ready to override the initialization with the values that the user
7268      * wants.  This is done in the global locale as explained in the
7269      * introductory comments to this function */
7270     switch_to_global_locale();
7271
7272     const char * const lc_all     = PerlEnv_getenv("LC_ALL");
7273     const char * const lang       = PerlEnv_getenv("LANG");
7274
7275     /* We try each locale in the enum, in order, until we get one that works,
7276      * or exhaust the list.  Normally the loop is executed just once.
7277      *
7278      * Each enum value is +1 from the previous */
7279     typedef enum {
7280             dummy_trial       = -1,
7281             environment_trial =  0,     /* "" or NULL; code below assumes value
7282                                            0 is the first real trial */
7283             LC_ALL_trial,               /* ENV{LC_ALL} */
7284             LANG_trial,                 /* ENV{LANG} */
7285             system_default_trial,       /* Windows .ACP */
7286             C_trial,                    /* C locale */
7287             beyond_final_trial,
7288     } trials;
7289
7290     trials trial;
7291     unsigned int already_checked = 0;
7292     const char * checked[C_trial];
7293
7294 #  ifdef LC_ALL
7295     const char * lc_all_string;
7296 #  else
7297     const char * curlocales[LC_ALL_INDEX_];
7298 #  endif
7299
7300     /* Loop through the initial setting and all the possible fallbacks,
7301      * breaking out of the loop on success */
7302     trial = dummy_trial;
7303     while (trial != beyond_final_trial) {
7304
7305         /* Each time through compute the next trial to use based on the one in
7306          * the previous iteration and switch to the new one.  This enforces the
7307          * order in which the fallbacks are applied */
7308       next_trial:
7309         trial = (trials) ((int) trial + 1);     /* Casts are needed for g++ */
7310
7311         const char * locale = NULL;
7312
7313         /* Set up the parameters for this trial */
7314         switch (trial) {
7315           case dummy_trial:
7316             locale_panic_("Unexpectedly got 'dummy_trial");
7317             break;
7318
7319           case environment_trial:
7320             /* This is either "" to get the values from the environment, or
7321              * NULL if the calling program has initialized the values already.
7322              * */
7323             locale = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
7324                      ? NULL
7325                      : "";
7326             break;
7327
7328           case LC_ALL_trial:
7329             if (! lc_all || strEQ(lc_all, "")) {
7330                 continue;   /* No-op */
7331             }
7332
7333             locale = lc_all;
7334             break;
7335
7336           case LANG_trial:
7337             if (! lang || strEQ(lang, "")) {
7338                 continue;   /* No-op */
7339             }
7340
7341             locale = lang;
7342             break;
7343
7344           case system_default_trial:
7345
7346 #  if ! defined(WIN32) || ! defined(LC_ALL)
7347
7348             continue;   /* No-op */
7349
7350 #  else
7351             /* For Windows, we also try the system default locale before "C".
7352              * (If there exists a Windows without LC_ALL we skip this because
7353              * it gets too complicated.  For those, "C" is the next fallback
7354              * possibility). */
7355             locale = ".ACP";
7356 #  endif
7357             break;
7358
7359           case C_trial:
7360             locale = "C";
7361             break;
7362
7363           case beyond_final_trial:
7364             continue;     /* No-op, causes loop to exit */
7365         }
7366
7367         /* If the locale is a substantive name, don't try the same locale
7368          * twice. */
7369         if (locale && strNE(locale, "")) {
7370             for (unsigned int i = 0; i < already_checked; i++) {
7371                 if (strEQ(checked[i], locale)) {
7372                     goto next_trial;
7373                 }
7374             }
7375
7376             /* And, for future iterations, indicate we've tried this locale */
7377             assert(already_checked < C_ARRAY_LENGTH(checked));
7378             checked[already_checked] = savepv(locale);
7379             SAVEFREEPV(checked[already_checked]);
7380             already_checked++;
7381         }
7382
7383 #  ifdef LC_ALL
7384
7385         STDIZED_SETLOCALE_LOCK;
7386         lc_all_string = savepv(stdized_setlocale(LC_ALL, locale));
7387         STDIZED_SETLOCALE_UNLOCK;
7388
7389         DEBUG_LOCALE_INIT(LC_ALL_INDEX_, locale, lc_all_string);
7390
7391         if (LIKELY(lc_all_string)) {     /* Succeeded */
7392             ok = 1;
7393             break;
7394         }
7395
7396         if (trial == 0 && locwarn) {
7397             PerlIO_printf(Perl_error_log,
7398                                   "perl: warning: Setting locale failed.\n");
7399             output_check_environment_warning(language, lc_all, lang);
7400         }
7401
7402 #  else /* Below is ! LC_ALL */
7403
7404         bool setlocale_failure = FALSE;  /* This trial hasn't failed so far */
7405         bool dowarn = trial == 0 && locwarn;
7406
7407         for_all_individual_category_indexes(j) {
7408             STDIZED_SETLOCALE_LOCK;
7409             curlocales[j] = savepv(stdized_setlocale(categories[j], locale));
7410             STDIZED_SETLOCALE_UNLOCK;
7411
7412             DEBUG_LOCALE_INIT(j, locale, curlocales[j]);
7413
7414             if (UNLIKELY(! curlocales[j])) {
7415                 setlocale_failure = TRUE;
7416
7417                 /* If are going to warn below, continue to loop so all failures
7418                  * are included in the message */
7419                 if (! dowarn) {
7420                     break;
7421                 }
7422             }
7423         }
7424
7425         if (LIKELY(! setlocale_failure)) {  /* All succeeded */
7426             ok = 1;
7427             break;  /* Exit trial_locales loop */
7428         }
7429
7430         /* Here, this trial failed */
7431
7432         if (dowarn) {
7433             PerlIO_printf(Perl_error_log,
7434                 "perl: warning: Setting locale failed for the categories:\n");
7435
7436             for_all_individual_category_indexes(j) {
7437                 if (! curlocales[j]) {
7438                     PerlIO_printf(Perl_error_log, "\t%s\n", category_names[j]);
7439                 }
7440             }
7441
7442             output_check_environment_warning(language, lc_all, lang);
7443         }   /* end of warning on first failure */
7444
7445 #  endif /* LC_ALL */
7446
7447     }   /* end of looping through the trial locales */
7448
7449     /* If we had to do more than the first trial, it means that one failed, and
7450      * we may need to output a warning, and, if none worked, do more */
7451     if (UNLIKELY(trial != 0)) {
7452         if (locwarn) {
7453             const char * description = "a fallback locale";
7454             const char * name = NULL;;
7455
7456             /* If we didn't find a good fallback, list all we tried */
7457             if (! ok && already_checked > 0) {
7458                 PerlIO_printf(Perl_error_log, "perl: warning: Failed to fall"
7459                                               " back to ");
7460                 if (already_checked > 1) {  /* more than one was tried */
7461                     PerlIO_printf(Perl_error_log, "any of:\n");
7462                 }
7463
7464                 while (already_checked > 0) {
7465                     name = checked[--already_checked];
7466                     description = GET_DESCRIPTION(trial, name);
7467                     PerlIO_printf(Perl_error_log, "%s (\"%s\")\n",
7468                                                   description, name);
7469                 }
7470             }
7471
7472             if (ok) {
7473
7474                 /* Here, a fallback worked.  So we have saved its name, and the
7475                  * trial that succeeded is still valid */
7476 #  ifdef LC_ALL
7477                 const char * individ_locales[LC_ALL_INDEX_] = { NULL };
7478
7479                 /* Even though we know the valid string for LC_ALL that worked,
7480                  * translate it into our internal format, which is the
7481                  * name=value pairs notation.  This is easier for a human to
7482                  * decipher than the positional notation.  Some platforms
7483                  * can return "C C C C C C" for LC_ALL.  This code also
7484                  * standardizes that result into plain "C". */
7485                 switch (parse_LC_ALL_string(lc_all_string,
7486                                             (const char **) &individ_locales,
7487                                             no_override,
7488                                             false,   /* Return only [0] if
7489                                                         suffices */
7490                                             false,   /* Don't panic on error */
7491                                             __LINE__))
7492                 {
7493                   case invalid:
7494
7495                     /* Here, the parse failed, which shouldn't happen, but if
7496                      * it does, we have an easy fallback that allows us to keep
7497                      * going. */
7498                     name = lc_all_string;
7499                     break;
7500
7501                   case no_array:    /* The original is a single locale */
7502                     name = lc_all_string;
7503                     break;
7504
7505                   case only_element_0:  /* element[0] is a single locale valid
7506                                            for all categories */
7507                     SAVEFREEPV(individ_locales[0]);
7508                     name = individ_locales[0];
7509                     break;
7510
7511                   case full_array:
7512                     name = calculate_LC_ALL_string(individ_locales,
7513                                                    INTERNAL_FORMAT,
7514                                                    WANT_TEMP_PV,
7515                                                    __LINE__);
7516                     for_all_individual_category_indexes(j) {
7517                         Safefree(individ_locales[j]);
7518                     }
7519                 }
7520 #  else
7521                 name = calculate_LC_ALL_string(curlocales,
7522                                                INTERNAL_FORMAT,
7523                                                WANT_TEMP_PV,
7524                                                __LINE__);
7525 #  endif
7526                 description = GET_DESCRIPTION(trial, name);
7527             }
7528             else {
7529
7530                 /* Nothing seems to be working, yet we want to continue
7531                  * executing.  It may well be that locales are mostly
7532                  * irrelevant to this particular program, and there must be
7533                  * some locale underlying the program.  Figure it out as best
7534                  * we can, by querying the system's current locale */
7535
7536 #  ifdef LC_ALL
7537
7538                 STDIZED_SETLOCALE_LOCK;
7539                 name = stdized_setlocale(LC_ALL, NULL);
7540                 STDIZED_SETLOCALE_UNLOCK;
7541
7542                 if (UNLIKELY(! name)) {
7543                     name = "locale name not determinable";
7544                 }
7545
7546 #  else /* Below is ! LC_ALL */
7547
7548                 const char * system_locales[LC_ALL_INDEX_] = { NULL };
7549
7550                 for_all_individual_category_indexes(j) {
7551                     STDIZED_SETLOCALE_LOCK;
7552                     system_locales[j] = savepv(stdized_setlocale(categories[j],
7553                                                                  NULL));
7554                     STDIZED_SETLOCALE_UNLOCK;
7555
7556                     if (UNLIKELY(! system_locales[j])) {
7557                         system_locales[j] = "not determinable";
7558                     }
7559                 }
7560
7561                 /* We use the name=value form for the string, as that is more
7562                  * human readable than the positional notation */
7563                 name = calculate_LC_ALL_string(system_locales,
7564                                                INTERNAL_FORMAT,
7565                                                WANT_TEMP_PV,
7566                                                __LINE__);
7567                 description = "what the system says";
7568
7569                 for_all_individual_category_indexes(j) {
7570                     Safefree(system_locales[j]);
7571                 }
7572 #  endif
7573             }
7574
7575             PerlIO_printf(Perl_error_log,
7576                           "perl: warning: Falling back to %s (\"%s\").\n",
7577                           description, name);
7578
7579             /* Here, ok being true indicates that the first attempt failed, but
7580              * a fallback succeeded; false => nothing working.  Translate to
7581              * API return values. */
7582             ok = (ok) ? 0 : -1;
7583         }
7584     }
7585
7586 #  ifdef LC_ALL
7587
7588     give_perl_locale_control(lc_all_string, __LINE__);
7589     Safefree(lc_all_string);
7590
7591 #  else
7592
7593     give_perl_locale_control((const char **) &curlocales, __LINE__);
7594
7595     for_all_individual_category_indexes(j) {
7596         Safefree(curlocales[j]);
7597     }
7598
7599 #  endif
7600 #  if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
7601
7602     /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
7603      * locale is UTF-8.  give_perl_locale_control() just above has already
7604      * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
7605      * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
7606      * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
7607      * STDIN, STDOUT, STDERR, _and_ the default open discipline.  */
7608     PL_utf8locale = PL_in_utf8_CTYPE_locale;
7609
7610     /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
7611        This is an alternative to using the -C command line switch
7612        (the -C if present will override this). */
7613     {
7614          const char *p = PerlEnv_getenv("PERL_UNICODE");
7615          PL_unicode = p ? parse_unicode_opts(&p) : 0;
7616          if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
7617              PL_utf8cache = -1;
7618     }
7619
7620 #  endif
7621 #  if defined(USE_POSIX_2008_LOCALE) && defined(MULTIPLICITY)
7622     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
7623                            "finished Perl_init_i18nl10n; actual obj=%p,"
7624                            " expected obj=%p, initial=%s\n",
7625                            uselocale(0), PL_cur_locale_obj,
7626                            get_LC_ALL_display()));
7627 #  endif
7628
7629     /* So won't continue to output stuff */
7630     DEBUG_INITIALIZATION_set(FALSE);
7631
7632 #endif /* USE_LOCALE */
7633
7634     return ok;
7635 }
7636
7637 #undef GET_DESCRIPTION
7638 #ifdef USE_LOCALE_COLLATE
7639
7640 STATIC void
7641 S_compute_collxfrm_coefficients(pTHX)
7642 {
7643
7644     /* A locale collation definition includes primary, secondary, tertiary,
7645      * etc. weights for each character.  To sort, the primary weights are used,
7646      * and only if they compare equal, then the secondary weights are used, and
7647      * only if they compare equal, then the tertiary, etc.
7648      *
7649      * strxfrm() works by taking the input string, say ABC, and creating an
7650      * output transformed string consisting of first the primary weights,
7651      * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the tertiary,
7652      * etc, yielding A¹B¹C¹ A²B²C² A³B³C³ ....  Some characters may not have
7653      * weights at every level.  In our example, let's say B doesn't have a
7654      * tertiary weight, and A doesn't have a secondary weight.  The constructed
7655      * string is then going to be
7656      *  A¹B¹C¹ B²C² A³C³ ....
7657      * This has the desired effect that strcmp() will look at the secondary or
7658      * tertiary weights only if the strings compare equal at all higher
7659      * priority weights.  The spaces shown here, like in
7660      *  "A¹B¹C¹ A²B²C² "
7661      * are not just for readability.  In the general case, these must actually
7662      * be bytes, which we will call here 'separator weights'; and they must be
7663      * smaller than any other weight value, but since these are C strings, only
7664      * the terminating one can be a NUL (some implementations may include a
7665      * non-NUL separator weight just before the NUL).  Implementations tend to
7666      * reserve 01 for the separator weights.  They are needed so that a shorter
7667      * string's secondary weights won't be misconstrued as primary weights of a
7668      * longer string, etc.  By making them smaller than any other weight, the
7669      * shorter string will sort first.  (Actually, if all secondary weights are
7670      * smaller than all primary ones, there is no need for a separator weight
7671      * between those two levels, etc.)
7672      *
7673      * The length of the transformed string is roughly a linear function of the
7674      * input string.  It's not exactly linear because some characters don't
7675      * have weights at all levels.  When we call strxfrm() we have to allocate
7676      * some memory to hold the transformed string.  The calculations below try
7677      * to find coefficients 'm' and 'b' for this locale so that m*x + b equals
7678      * how much space we need, given the size of the input string in 'x'.  If
7679      * we calculate too small, we increase the size as needed, and call
7680      * strxfrm() again, but it is better to get it right the first time to
7681      * avoid wasted expensive string transformations.
7682      *
7683      * We use the string below to find how long the transformation of it is.
7684      * Almost all locales are supersets of ASCII, or at least the ASCII
7685      * letters.  We use all of them, half upper half lower, because if we used
7686      * fewer, we might hit just the ones that are outliers in a particular
7687      * locale.  Most of the strings being collated will contain a preponderance
7688      * of letters, and even if they are above-ASCII, they are likely to have
7689      * the same number of weight levels as the ASCII ones.  It turns out that
7690      * digits tend to have fewer levels, and some punctuation has more, but
7691      * those are relatively sparse in text, and khw believes this gives a
7692      * reasonable result, but it could be changed if experience so dictates. */
7693     const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
7694     char * x_longer;        /* Transformed 'longer' */
7695     Size_t x_len_longer;    /* Length of 'x_longer' */
7696
7697     char * x_shorter;   /* We also transform a substring of 'longer' */
7698     Size_t x_len_shorter;
7699
7700     PL_in_utf8_COLLATE_locale = (PL_collation_standard)
7701                                 ? 0
7702                                 : is_locale_utf8(PL_collation_name);
7703     PL_strxfrm_NUL_replacement = '\0';
7704     PL_strxfrm_max_cp = 0;
7705
7706     /* mem_collxfrm_() is used get the transformation (though here we are
7707      * interested only in its length).  It is used because it has the
7708      * intelligence to handle all cases, but to work, it needs some values of
7709      * 'm' and 'b' to get it started.  For the purposes of this calculation we
7710      * use a very conservative estimate of 'm' and 'b'.  This assumes a weight
7711      * can be multiple bytes, enough to hold any UV on the platform, and there
7712      * are 5 levels, 4 weight bytes, and a trailing NUL.  */
7713     PL_collxfrm_base = 5;
7714     PL_collxfrm_mult = 5 * sizeof(UV);
7715
7716     /* Find out how long the transformation really is */
7717     x_longer = mem_collxfrm_(longer,
7718                              sizeof(longer) - 1,
7719                              &x_len_longer,
7720
7721                              /* We avoid converting to UTF-8 in the called
7722                               * function by telling it the string is in UTF-8
7723                               * if the locale is a UTF-8 one.  Since the string
7724                               * passed here is invariant under UTF-8, we can
7725                               * claim it's UTF-8 even if it isn't.  */
7726                               PL_in_utf8_COLLATE_locale);
7727     Safefree(x_longer);
7728
7729     /* Find out how long the transformation of a substring of 'longer' is.
7730      * Together the lengths of these transformations are sufficient to
7731      * calculate 'm' and 'b'.  The substring is all of 'longer' except the
7732      * first character.  This minimizes the chances of being swayed by outliers
7733      * */
7734     x_shorter = mem_collxfrm_(longer + 1,
7735                               sizeof(longer) - 2,
7736                               &x_len_shorter,
7737                               PL_in_utf8_COLLATE_locale);
7738     Safefree(x_shorter);
7739
7740     /* If the results are nonsensical for this simple test, the whole locale
7741      * definition is suspect.  Mark it so that locale collation is not active
7742      * at all for it.  XXX Should we warn? */
7743     if (   x_len_shorter == 0
7744         || x_len_longer == 0
7745         || x_len_shorter >= x_len_longer)
7746     {
7747         PL_collxfrm_mult = 0;
7748         PL_collxfrm_base = 1;
7749         DEBUG_L(PerlIO_printf(Perl_debug_log,
7750                 "Disabling locale collation for LC_COLLATE='%s';"
7751                 " length for shorter sample=%zu; longer=%zu\n",
7752                 PL_collation_name, x_len_shorter, x_len_longer));
7753     }
7754     else {
7755         SSize_t base;       /* Temporary */
7756
7757         /* We have both: m * strlen(longer)  + b = x_len_longer
7758          *               m * strlen(shorter) + b = x_len_shorter;
7759          * subtracting yields:
7760          *          m * (strlen(longer) - strlen(shorter))
7761          *                             = x_len_longer - x_len_shorter
7762          * But we have set things up so that 'shorter' is 1 byte smaller than
7763          * 'longer'.  Hence:
7764          *          m = x_len_longer - x_len_shorter
7765          *
7766          * But if something went wrong, make sure the multiplier is at least 1.
7767          */
7768         if (x_len_longer > x_len_shorter) {
7769             PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
7770         }
7771         else {
7772             PL_collxfrm_mult = 1;
7773         }
7774
7775         /*     mx + b = len
7776          * so:      b = len - mx
7777          * but in case something has gone wrong, make sure it is non-negative
7778          * */
7779         base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
7780         if (base < 0) {
7781             base = 0;
7782         }
7783
7784         /* Add 1 for the trailing NUL */
7785         PL_collxfrm_base = base + 1;
7786     }
7787
7788     DEBUG_L(PerlIO_printf(Perl_debug_log,
7789                           "?UTF-8 locale=%d; x_len_shorter=%zu, "
7790                           "x_len_longer=%zu,"
7791                           " collate multipler=%zu, collate base=%zu\n",
7792                           PL_in_utf8_COLLATE_locale,
7793                           x_len_shorter, x_len_longer,
7794                           PL_collxfrm_mult, PL_collxfrm_base));
7795 }
7796
7797 char *
7798 Perl_mem_collxfrm_(pTHX_ const char *input_string,
7799                          STRLEN len,    /* Length of 'input_string' */
7800                          STRLEN *xlen,  /* Set to length of returned string
7801                                            (not including the collation index
7802                                            prefix) */
7803                          bool utf8      /* Is the input in UTF-8? */
7804                    )
7805 {
7806     /* mem_collxfrm_() is like strxfrm() but with two important differences.
7807      * First, it handles embedded NULs. Second, it allocates a bit more memory
7808      * than needed for the transformed data itself.  The real transformed data
7809      * begins at offset COLLXFRM_HDR_LEN.  *xlen is set to the length of that,
7810      * and doesn't include the collation index size.
7811      *
7812      * It is the caller's responsibility to eventually free the memory returned
7813      * by this function.
7814      *
7815      * Please see sv_collxfrm() to see how this is used. */
7816
7817 #  define COLLXFRM_HDR_LEN    sizeof(PL_collation_ix)
7818
7819     char * s = (char *) input_string;
7820     STRLEN s_strlen = strlen(input_string);
7821     char *xbuf = NULL;
7822     STRLEN xAlloc;          /* xalloc is a reserved word in VC */
7823     STRLEN length_in_chars;
7824     bool first_time = TRUE; /* Cleared after first loop iteration */
7825
7826 #  ifdef USE_LOCALE_CTYPE
7827         const char * orig_CTYPE_locale = NULL;
7828 #  endif
7829
7830 #  if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
7831     locale_t constructed_locale = (locale_t) 0;
7832 #  endif
7833
7834     PERL_ARGS_ASSERT_MEM_COLLXFRM_;
7835
7836     /* Must be NUL-terminated */
7837     assert(*(input_string + len) == '\0');
7838
7839     if (PL_collxfrm_mult == 0) {     /* unknown or bad */
7840         if (PL_collxfrm_base != 0) { /* bad collation => skip */
7841             DEBUG_L(PerlIO_printf(Perl_debug_log,
7842                           "mem_collxfrm_: locale's collation is defective\n"));
7843             goto bad;
7844         }
7845
7846         /* (mult, base) == (0,0) means we need to calculate mult and base
7847          * before proceeding */
7848         S_compute_collxfrm_coefficients(aTHX);
7849     }
7850
7851     /* Replace any embedded NULs with the control that sorts before any others.
7852      * This will give as good as possible results on strings that don't
7853      * otherwise contain that character, but otherwise there may be
7854      * less-than-perfect results with that character and NUL.  This is
7855      * unavoidable unless we replace strxfrm with our own implementation. */
7856     if (UNLIKELY(s_strlen < len)) {   /* Only execute if there is an embedded
7857                                          NUL */
7858         char * e = s + len;
7859         char * sans_nuls;
7860         STRLEN sans_nuls_len;
7861         int try_non_controls;
7862         char this_replacement_char[] = "?\0";   /* Room for a two-byte string,
7863                                                    making sure 2nd byte is NUL.
7864                                                  */
7865         STRLEN this_replacement_len;
7866
7867         /* If we don't know what non-NUL control character sorts lowest for
7868          * this locale, find it */
7869         if (PL_strxfrm_NUL_replacement == '\0') {
7870             int j;
7871             char * cur_min_x = NULL;    /* The min_char's xfrm, (except it also
7872                                            includes the collation index
7873                                            prefixed. */
7874
7875             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
7876
7877             /* Unlikely, but it may be that no control will work to replace
7878              * NUL, in which case we instead look for any character.  Controls
7879              * are preferred because collation order is, in general, context
7880              * sensitive, with adjoining characters affecting the order, and
7881              * controls are less likely to have such interactions, allowing the
7882              * NUL-replacement to stand on its own.  (Another way to look at it
7883              * is to imagine what would happen if the NUL were replaced by a
7884              * combining character; it wouldn't work out all that well.) */
7885             for (try_non_controls = 0;
7886                  try_non_controls < 2;
7887                  try_non_controls++)
7888             {
7889
7890 #  ifdef USE_LOCALE_CTYPE
7891
7892                 /* In this case we use isCNTRL_LC() below, which relies on
7893                  * LC_CTYPE, so that must be switched to correspond with the
7894                  * LC_COLLATE locale */
7895                 if (! try_non_controls && ! PL_in_utf8_COLLATE_locale) {
7896                     orig_CTYPE_locale = toggle_locale_c(LC_CTYPE,
7897                                                         PL_collation_name);
7898                 }
7899 #  endif
7900                 /* Look through all legal code points (NUL isn't) */
7901                 for (j = 1; j < 256; j++) {
7902                     char * x;       /* j's xfrm plus collation index */
7903                     STRLEN x_len;   /* length of 'x' */
7904                     STRLEN trial_len = 1;
7905                     char cur_source[] = { '\0', '\0' };
7906
7907                     /* Skip non-controls the first time through the loop.  The
7908                      * controls in a UTF-8 locale are the L1 ones */
7909                     if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
7910                                                ? ! isCNTRL_L1(j)
7911                                                : ! isCNTRL_LC(j))
7912                     {
7913                         continue;
7914                     }
7915
7916                     /* Create a 1-char string of the current code point */
7917                     cur_source[0] = (char) j;
7918
7919                     /* Then transform it */
7920                     x = mem_collxfrm_(cur_source, trial_len, &x_len,
7921                                       0 /* The string is not in UTF-8 */);
7922
7923                     /* Ignore any character that didn't successfully transform.
7924                      * */
7925                     if (! x) {
7926                         continue;
7927                     }
7928
7929                     /* If this character's transformation is lower than
7930                      * the current lowest, this one becomes the lowest */
7931                     if (   cur_min_x == NULL
7932                         || strLT(x         + COLLXFRM_HDR_LEN,
7933                                  cur_min_x + COLLXFRM_HDR_LEN))
7934                     {
7935                         PL_strxfrm_NUL_replacement = j;
7936                         Safefree(cur_min_x);
7937                         cur_min_x = x;
7938                     }
7939                     else {
7940                         Safefree(x);
7941                     }
7942                 } /* end of loop through all 255 characters */
7943
7944 #  ifdef USE_LOCALE_CTYPE
7945                 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
7946 #  endif
7947
7948                 /* Stop looking if found */
7949                 if (cur_min_x) {
7950                     break;
7951                 }
7952
7953                 /* Unlikely, but possible, if there aren't any controls that
7954                  * work in the locale, repeat the loop, looking for any
7955                  * character that works */
7956                 DEBUG_L(PerlIO_printf(Perl_debug_log,
7957                 "mem_collxfrm_: No control worked.  Trying non-controls\n"));
7958             } /* End of loop to try first the controls, then any char */
7959
7960             if (! cur_min_x) {
7961                 DEBUG_L(PerlIO_printf(Perl_debug_log,
7962                     "mem_collxfrm_: Couldn't find any character to replace"
7963                     " embedded NULs in locale %s with", PL_collation_name));
7964                 goto bad;
7965             }
7966
7967             DEBUG_L(PerlIO_printf(Perl_debug_log,
7968                     "mem_collxfrm_: Replacing embedded NULs in locale %s with "
7969                     "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
7970
7971             Safefree(cur_min_x);
7972         } /* End of determining the character that is to replace NULs */
7973
7974         /* If the replacement is variant under UTF-8, it must match the
7975          * UTF8-ness of the original */
7976         if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
7977             this_replacement_char[0] =
7978                                 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
7979             this_replacement_char[1] =
7980                                 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
7981             this_replacement_len = 2;
7982         }
7983         else {
7984             this_replacement_char[0] = PL_strxfrm_NUL_replacement;
7985             /* this_replacement_char[1] = '\0' was done at initialization */
7986             this_replacement_len = 1;
7987         }
7988
7989         /* The worst case length for the replaced string would be if every
7990          * character in it is NUL.  Multiply that by the length of each
7991          * replacement, and allow for a trailing NUL */
7992         sans_nuls_len = (len * this_replacement_len) + 1;
7993         Newx(sans_nuls, sans_nuls_len, char);
7994         *sans_nuls = '\0';
7995
7996         /* Replace each NUL with the lowest collating control.  Loop until have
7997          * exhausted all the NULs */
7998         while (s + s_strlen < e) {
7999             my_strlcat(sans_nuls, s, sans_nuls_len);
8000
8001             /* Do the actual replacement */
8002             my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
8003
8004             /* Move past the input NUL */
8005             s += s_strlen + 1;
8006             s_strlen = strlen(s);
8007         }
8008
8009         /* And add anything that trails the final NUL */
8010         my_strlcat(sans_nuls, s, sans_nuls_len);
8011
8012         /* Switch so below we transform this modified string */
8013         s = sans_nuls;
8014         len = strlen(s);
8015     } /* End of replacing NULs */
8016
8017     /* Make sure the UTF8ness of the string and locale match */
8018     if (utf8 != PL_in_utf8_COLLATE_locale) {
8019         /* XXX convert above Unicode to 10FFFF? */
8020         const char * const t = s;   /* Temporary so we can later find where the
8021                                        input was */
8022
8023         /* Here they don't match.  Change the string's to be what the locale is
8024          * expecting */
8025
8026         if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
8027             s = (char *) bytes_to_utf8((const U8 *) s, &len);
8028             utf8 = TRUE;
8029         }
8030         else {   /* locale is not UTF-8; but input is; downgrade the input */
8031
8032             s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
8033
8034             /* If the downgrade was successful we are done, but if the input
8035              * contains things that require UTF-8 to represent, have to do
8036              * damage control ... */
8037             if (UNLIKELY(utf8)) {
8038
8039                 /* What we do is construct a non-UTF-8 string with
8040                  *  1) the characters representable by a single byte converted
8041                  *     to be so (if necessary);
8042                  *  2) and the rest converted to collate the same as the
8043                  *     highest collating representable character.  That makes
8044                  *     them collate at the end.  This is similar to how we
8045                  *     handle embedded NULs, but we use the highest collating
8046                  *     code point instead of the smallest.  Like the NUL case,
8047                  *     this isn't perfect, but is the best we can reasonably
8048                  *     do.  Every above-255 code point will sort the same as
8049                  *     the highest-sorting 0-255 code point.  If that code
8050                  *     point can combine in a sequence with some other code
8051                  *     points for weight calculations, us changing something to
8052                  *     be it can adversely affect the results.  But in most
8053                  *     cases, it should work reasonably.  And note that this is
8054                  *     really an illegal situation: using code points above 255
8055                  *     on a locale where only 0-255 are valid.  If two strings
8056                  *     sort entirely equal, then the sort order for the
8057                  *     above-255 code points will be in code point order. */
8058
8059                 utf8 = FALSE;
8060
8061                 /* If we haven't calculated the code point with the maximum
8062                  * collating order for this locale, do so now */
8063                 if (! PL_strxfrm_max_cp) {
8064                     int j;
8065
8066                     /* The current transformed string that collates the
8067                      * highest (except it also includes the prefixed collation
8068                      * index. */
8069                     char * cur_max_x = NULL;
8070
8071                     /* Look through all legal code points (NUL isn't) */
8072                     for (j = 1; j < 256; j++) {
8073                         char * x;
8074                         STRLEN x_len;
8075                         char cur_source[] = { '\0', '\0' };
8076
8077                         /* Create a 1-char string of the current code point */
8078                         cur_source[0] = (char) j;
8079
8080                         /* Then transform it */
8081                         x = mem_collxfrm_(cur_source, 1, &x_len, FALSE);
8082
8083                         /* If something went wrong (which it shouldn't), just
8084                          * ignore this code point */
8085                         if (! x) {
8086                             continue;
8087                         }
8088
8089                         /* If this character's transformation is higher than
8090                          * the current highest, this one becomes the highest */
8091                         if (   cur_max_x == NULL
8092                             || strGT(x         + COLLXFRM_HDR_LEN,
8093                                      cur_max_x + COLLXFRM_HDR_LEN))
8094                         {
8095                             PL_strxfrm_max_cp = j;
8096                             Safefree(cur_max_x);
8097                             cur_max_x = x;
8098                         }
8099                         else {
8100                             Safefree(x);
8101                         }
8102                     }
8103
8104                     if (! cur_max_x) {
8105                         DEBUG_L(PerlIO_printf(Perl_debug_log,
8106                             "mem_collxfrm_: Couldn't find any character to"
8107                             " replace above-Latin1 chars in locale %s with",
8108                             PL_collation_name));
8109                         goto bad;
8110                     }
8111
8112                     DEBUG_L(PerlIO_printf(Perl_debug_log,
8113                             "mem_collxfrm_: highest 1-byte collating character"
8114                             " in locale %s is 0x%02X\n",
8115                             PL_collation_name,
8116                             PL_strxfrm_max_cp));
8117
8118                     Safefree(cur_max_x);
8119                 }
8120
8121                 /* Here we know which legal code point collates the highest.
8122                  * We are ready to construct the non-UTF-8 string.  The length
8123                  * will be at least 1 byte smaller than the input string
8124                  * (because we changed at least one 2-byte character into a
8125                  * single byte), but that is eaten up by the trailing NUL */
8126                 Newx(s, len, char);
8127
8128                 {
8129                     STRLEN i;
8130                     STRLEN d= 0;
8131                     char * e = (char *) t + len;
8132
8133                     for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
8134                         U8 cur_char = t[i];
8135                         if (UTF8_IS_INVARIANT(cur_char)) {
8136                             s[d++] = cur_char;
8137                         }
8138                         else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
8139                             s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
8140                         }
8141                         else {  /* Replace illegal cp with highest collating
8142                                    one */
8143                             s[d++] = PL_strxfrm_max_cp;
8144                         }
8145                     }
8146                     s[d++] = '\0';
8147                     Renew(s, d, char);   /* Free up unused space */
8148                 }
8149             }
8150         }
8151
8152         /* Here, we have constructed a modified version of the input.  It could
8153          * be that we already had a modified copy before we did this version.
8154          * If so, that copy is no longer needed */
8155         if (t != input_string) {
8156             Safefree(t);
8157         }
8158     }
8159
8160     length_in_chars = (utf8)
8161                       ? utf8_length((U8 *) s, (U8 *) s + len)
8162                       : len;
8163
8164     /* The first element in the output is the collation id, used by
8165      * sv_collxfrm(); then comes the space for the transformed string.  The
8166      * equation should give us a good estimate as to how much is needed */
8167     xAlloc = COLLXFRM_HDR_LEN
8168            + PL_collxfrm_base
8169            + (PL_collxfrm_mult * length_in_chars);
8170     Newx(xbuf, xAlloc, char);
8171     if (UNLIKELY(! xbuf)) {
8172         DEBUG_L(PerlIO_printf(Perl_debug_log,
8173                       "mem_collxfrm_: Couldn't malloc %zu bytes\n", xAlloc));
8174         goto bad;
8175     }
8176
8177     /* Store the collation id */
8178     *(PERL_UINTMAX_T *)xbuf = PL_collation_ix;
8179
8180 #  if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
8181 #    ifdef USE_LOCALE_CTYPE
8182
8183     constructed_locale = newlocale(LC_CTYPE_MASK, PL_collation_name,
8184                                    duplocale(use_curlocale_scratch()));
8185 #    else
8186
8187     constructed_locale = duplocale(use_curlocale_scratch());
8188
8189 #    endif
8190 #    define my_strxfrm(dest, src, n)  strxfrm_l(dest, src, n,           \
8191                                                 constructed_locale)
8192 #    define CLEANUP_STRXFRM                                             \
8193         STMT_START {                                                    \
8194             if (constructed_locale != (locale_t) 0)                     \
8195                 freelocale(constructed_locale);                         \
8196         } STMT_END
8197 #  else
8198 #    define my_strxfrm(dest, src, n)  strxfrm(dest, src, n)
8199 #    ifdef USE_LOCALE_CTYPE
8200
8201     orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
8202
8203 #      define CLEANUP_STRXFRM                                           \
8204                 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale)
8205 #    else
8206 #      define CLEANUP_STRXFRM  NOOP
8207 #    endif
8208 #  endif
8209
8210     /* Then the transformation of the input.  We loop until successful, or we
8211      * give up */
8212     for (;;) {
8213
8214         errno = 0;
8215         *xlen = my_strxfrm(xbuf + COLLXFRM_HDR_LEN,
8216                            s,
8217                            xAlloc - COLLXFRM_HDR_LEN);
8218
8219
8220         /* If the transformed string occupies less space than we told strxfrm()
8221          * was available, it means it transformed the whole string. */
8222         if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
8223
8224             /* But there still could have been a problem */
8225             if (errno != 0) {
8226                 DEBUG_L(PerlIO_printf(Perl_debug_log,
8227                        "strxfrm failed for LC_COLLATE=%s; errno=%d, input=%s\n",
8228                        PL_collation_name, errno,
8229                        _byte_dump_string((U8 *) s, len, 0)));
8230                 goto bad;
8231             }
8232
8233             /* Here, the transformation was successful.  Some systems include a
8234              * trailing NUL in the returned length.  Ignore it, using a loop in
8235              * case multiple trailing NULs are returned. */
8236             while (   (*xlen) > 0
8237                    && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
8238             {
8239                 (*xlen)--;
8240             }
8241
8242             /* If the first try didn't get it, it means our prediction was low.
8243              * Modify the coefficients so that we predict a larger value in any
8244              * future transformations */
8245             if (! first_time) {
8246                 STRLEN needed = *xlen + 1;   /* +1 For trailing NUL */
8247                 STRLEN computed_guess = PL_collxfrm_base
8248                                       + (PL_collxfrm_mult * length_in_chars);
8249
8250                 /* On zero-length input, just keep current slope instead of
8251                  * dividing by 0 */
8252                 const STRLEN new_m = (length_in_chars != 0)
8253                                      ? needed / length_in_chars
8254                                      : PL_collxfrm_mult;
8255
8256                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8257                     "initial size of %zu bytes for a length "
8258                     "%zu string was insufficient, %zu needed\n",
8259                     computed_guess, length_in_chars, needed));
8260
8261                 /* If slope increased, use it, but discard this result for
8262                  * length 1 strings, as we can't be sure that it's a real slope
8263                  * change */
8264                 if (length_in_chars > 1 && new_m  > PL_collxfrm_mult) {
8265
8266 #  ifdef DEBUGGING
8267
8268                     STRLEN old_m = PL_collxfrm_mult;
8269                     STRLEN old_b = PL_collxfrm_base;
8270
8271 #  endif
8272
8273                     PL_collxfrm_mult = new_m;
8274                     PL_collxfrm_base = 1;   /* +1 For trailing NUL */
8275                     computed_guess = PL_collxfrm_base
8276                                     + (PL_collxfrm_mult * length_in_chars);
8277                     if (computed_guess < needed) {
8278                         PL_collxfrm_base += needed - computed_guess;
8279                     }
8280
8281                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8282                                     "slope is now %zu; was %zu, base "
8283                         "is now %zu; was %zu\n",
8284                         PL_collxfrm_mult, old_m,
8285                         PL_collxfrm_base, old_b));
8286                 }
8287                 else {  /* Slope didn't change, but 'b' did */
8288                     const STRLEN new_b = needed
8289                                         - computed_guess
8290                                         + PL_collxfrm_base;
8291                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8292                         "base is now %zu; was %zu\n", new_b, PL_collxfrm_base));
8293                     PL_collxfrm_base = new_b;
8294                 }
8295             }
8296
8297             break;
8298         }
8299
8300         if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
8301             DEBUG_L(PerlIO_printf(Perl_debug_log,
8302                   "mem_collxfrm_: Needed %zu bytes, max permissible is %u\n",
8303                   *xlen, PERL_INT_MAX));
8304             goto bad;
8305         }
8306
8307         /* A well-behaved strxfrm() returns exactly how much space it needs
8308          * (usually not including the trailing NUL) when it fails due to not
8309          * enough space being provided.  Assume that this is the case unless
8310          * it's been proven otherwise */
8311         if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
8312             xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
8313         }
8314         else { /* Here, either:
8315                 *  1)  The strxfrm() has previously shown bad behavior; or
8316                 *  2)  It isn't the first time through the loop, which means
8317                 *      that the strxfrm() is now showing bad behavior, because
8318                 *      we gave it what it said was needed in the previous
8319                 *      iteration, and it came back saying it needed still more.
8320                 *      (Many versions of cygwin fit this.  When the buffer size
8321                 *      isn't sufficient, they return the input size instead of
8322                 *      how much is needed.)
8323                 * Increase the buffer size by a fixed percentage and try again.
8324                 * */
8325             xAlloc += (xAlloc / 4) + 1;
8326             PL_strxfrm_is_behaved = FALSE;
8327
8328             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8329                      "mem_collxfrm_ required more space than previously"
8330                      " calculated for locale %s, trying again with new"
8331                      " guess=%zu+%zu\n",
8332                 PL_collation_name,  COLLXFRM_HDR_LEN,
8333                      xAlloc - COLLXFRM_HDR_LEN));
8334         }
8335
8336         Renew(xbuf, xAlloc, char);
8337         if (UNLIKELY(! xbuf)) {
8338             DEBUG_L(PerlIO_printf(Perl_debug_log,
8339                       "mem_collxfrm_: Couldn't realloc %zu bytes\n", xAlloc));
8340             goto bad;
8341         }
8342
8343         first_time = FALSE;
8344     }
8345
8346     CLEANUP_STRXFRM;
8347
8348     DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8));
8349
8350     /* Free up unneeded space; retain enough for trailing NUL */
8351     Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
8352
8353     if (s != input_string) {
8354         Safefree(s);
8355     }
8356
8357     return xbuf;
8358
8359   bad:
8360
8361     CLEANUP_STRXFRM;
8362     DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8));
8363
8364     Safefree(xbuf);
8365     if (s != input_string) {
8366         Safefree(s);
8367     }
8368     *xlen = 0;
8369
8370     return NULL;
8371 }
8372
8373 #  ifdef DEBUGGING
8374
8375 STATIC void
8376 S_print_collxfrm_input_and_return(pTHX_
8377                                   const char * s,
8378                                   const char * e,
8379                                   const char * xbuf,
8380                                   const STRLEN xlen,
8381                                   const bool is_utf8)
8382 {
8383
8384     PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
8385
8386     PerlIO_printf(Perl_debug_log,
8387                   "mem_collxfrm_[ix %" UVuf "] for locale '%s':\n"
8388                   "     input=%s\n    return=%s\n    return len=%zu\n",
8389                   (UV) PL_collation_ix, PL_collation_name,
8390                   get_displayable_string(s, e, is_utf8),
8391                   ((xbuf == NULL)
8392                    ? "(null)"
8393                    : ((xlen == 0)
8394                       ? "(empty)"
8395                       : _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
8396                                           xlen, 0))),
8397                   xlen);
8398 }
8399
8400 #  endif    /* DEBUGGING */
8401
8402 SV *
8403 Perl_strxfrm(pTHX_ SV * src)
8404 {
8405     PERL_ARGS_ASSERT_STRXFRM;
8406
8407     /* For use by POSIX::strxfrm().  If they differ, toggle LC_CTYPE to
8408      * LC_COLLATE to avoid potential mojibake.
8409      *
8410      * If we can't calculate a collation, 'src' is instead returned, so that
8411      * future comparisons will be by code point order */
8412
8413 #  ifdef USE_LOCALE_CTYPE
8414
8415     const char * orig_ctype = toggle_locale_c(LC_CTYPE,
8416                                               querylocale_c(LC_COLLATE));
8417 #  endif
8418
8419     SV * dst = src;
8420     STRLEN dstlen;
8421     STRLEN srclen;
8422     const char *p = SvPV_const(src, srclen);
8423     const U32 utf8_flag = SvUTF8(src);
8424     char *d = mem_collxfrm_(p, srclen, &dstlen, cBOOL(utf8_flag));
8425
8426     assert(utf8_flag == 0 || utf8_flag == SVf_UTF8);
8427
8428     if (d != NULL) {
8429         assert(dstlen > 0);
8430         dst =newSVpvn_flags(d + COLLXFRM_HDR_LEN,
8431                             dstlen, SVs_TEMP|utf8_flag);
8432         Safefree(d);
8433     }
8434
8435 #  ifdef USE_LOCALE_CTYPE
8436
8437     restore_toggled_locale_c(LC_CTYPE, orig_ctype);
8438
8439 #  endif
8440
8441     return dst;
8442 }
8443
8444 #endif /* USE_LOCALE_COLLATE */
8445 #ifdef USE_LOCALE
8446
8447 STATIC const char *
8448 S_toggle_locale_i(pTHX_ const locale_category_index cat_index,
8449                         const char * new_locale,
8450                         const line_t caller_line)
8451 {
8452     PERL_ARGS_ASSERT_TOGGLE_LOCALE_I;
8453     assert(cat_index <= LC_ALL_INDEX_);
8454
8455     /* Changes the locale for the category specified by 'index' to 'new_locale,
8456      * if they aren't already the same.
8457      *
8458      * Returns a copy of the name of the original locale for 'cat_index'
8459      * so can be switched back to with the companion function
8460      * restore_toggled_locale_i(),  (NULL if no restoral is necessary.) */
8461
8462     /* Find the original locale of the category we may need to change, so that
8463      * it can be restored to later */
8464     const char * locale_to_restore_to = querylocale_i(cat_index);
8465
8466     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8467              "(%" LINE_Tf "): toggle_locale_i: index=%d(%s), wanted=%s,"
8468              " actual=%s\n",
8469              caller_line, cat_index, category_names[cat_index],
8470              new_locale, locale_to_restore_to));
8471
8472     if (! locale_to_restore_to) {
8473         locale_panic_via_(Perl_form(aTHX_
8474                                     "Could not find current %s locale",
8475                                     category_names[cat_index]),
8476                          __FILE__, caller_line);
8477     }
8478
8479     /* If the locales are the same, there's nothing to do */
8480     if (strEQ(locale_to_restore_to, new_locale)) {
8481         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8482                                "(%" LINE_Tf "): %s locale unchanged as %s\n",
8483                                caller_line, category_names[cat_index],
8484                                new_locale));
8485
8486         return NULL;
8487     }
8488
8489     /* Finally, change the locale to the new one */
8490     void_setlocale_i_with_caller(cat_index, new_locale, __FILE__, caller_line);
8491
8492     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8493                            "(%" LINE_Tf "): %s locale switched to %s\n",
8494                            caller_line, category_names[cat_index], new_locale));
8495
8496     return locale_to_restore_to;
8497
8498 #  ifndef DEBUGGING
8499     PERL_UNUSED_ARG(caller_line);
8500 #  endif
8501
8502 }
8503
8504 STATIC void
8505 S_restore_toggled_locale_i(pTHX_ const locale_category_index cat_index,
8506                                  const char * restore_locale,
8507                                  const line_t caller_line)
8508 {
8509     /* Restores the locale for LC_category corresponding to cat_index to
8510      * 'restore_locale' (which is a copy that will be freed by this function),
8511      * or do nothing if the latter parameter is NULL */
8512
8513     PERL_ARGS_ASSERT_RESTORE_TOGGLED_LOCALE_I;
8514     assert(cat_index <= LC_ALL_INDEX_);
8515
8516     if (restore_locale == NULL) {
8517         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8518                                "(%" LINE_Tf "): No need to restore %s\n",
8519                                caller_line, category_names[cat_index]));
8520         return;
8521     }
8522
8523     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8524                            "(%" LINE_Tf "): %s restoring locale to %s\n",
8525                            caller_line, category_names[cat_index],
8526                            restore_locale));
8527
8528     void_setlocale_i_with_caller(cat_index, restore_locale,
8529                                   __FILE__, caller_line);
8530
8531 #  ifndef DEBUGGING
8532     PERL_UNUSED_ARG(caller_line);
8533 #  endif
8534
8535 }
8536
8537 #  ifdef USE_LOCALE_CTYPE
8538
8539 STATIC bool
8540 S_is_codeset_name_UTF8(const char * name)
8541 {
8542     /* Return a boolean as to if the passed-in name indicates it is a UTF-8
8543      * code set.  Several variants are possible */
8544     const Size_t len = strlen(name);
8545
8546     PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8;
8547
8548 #    ifdef WIN32
8549
8550     /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
8551     if (memENDs(name, len, "65001")) {
8552         return TRUE;
8553     }
8554
8555 #    endif
8556                /* 'UTF8' or 'UTF-8' */
8557     return (    inRANGE(len, 4, 5)
8558             &&  name[len-1] == '8'
8559             && (   memBEGINs(name, len, "UTF")
8560                 || memBEGINs(name, len, "utf"))
8561             && (len == 4 || name[3] == '-'));
8562 }
8563
8564 #  endif
8565 #endif  /* USE_LOCALE */
8566
8567 bool
8568 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
8569 {
8570     /* Internal function which returns if we are in the scope of a pragma that
8571      * enables the locale category 'category'.  'compiling' should indicate if
8572      * this is during the compilation phase (TRUE) or not (FALSE). */
8573
8574     const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
8575
8576     SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0);
8577     if (! these_categories || these_categories == &PL_sv_placeholder) {
8578         return FALSE;
8579     }
8580
8581     /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
8582      * a valid unsigned */
8583     assert(category >= -1);
8584     return cBOOL(SvUV(these_categories) & (1U << (category + 1)));
8585 }
8586
8587 /* my_strerror() returns a mortalized copy of the text of the error message
8588  * associated with 'errnum'.
8589  *
8590  * If not called from within the scope of 'use locale', it uses the text from
8591  * the C locale.  If Perl is compiled to not pay attention to LC_CTYPE nor
8592  * LC_MESSAGES, it uses whatever strerror() returns.  Otherwise the text is
8593  * derived from the locale, LC_MESSAGES if we have that; LC_CTYPE if not.
8594  *
8595  * It returns in *utf8ness the result's UTF-8ness
8596  *
8597  * The function just calls strerror(), but temporarily switches locales, if
8598  * needed.  Many platforms require LC_CTYPE and LC_MESSAGES to be in the same
8599  * CODESET in order for the return from strerror() to not contain '?' symbols,
8600  * or worse, mojibaked.  It's cheaper to just use the stricter criteria of
8601  * being in the same locale.  So the code below uses a common locale for both
8602  * categories.  Again, that is C if not within 'use locale' scope; or the
8603  * LC_MESSAGES locale if in scope and we have that category; and LC_CTYPE if we
8604  * don't have LC_MESSAGES; and whatever strerror returns if we don't have
8605  * either category.
8606  *
8607  * There are two sets of implementations.  The first below is if we have
8608  * strerror_l().  This is the simpler.  We just use the already-built C locale
8609  * object if not in locale scope, or build up a custom one otherwise.
8610  *
8611  * When strerror_l() is not available, we may have to swap locales temporarily
8612  * to bring the two categories into sync with each other, and possibly to the C
8613  * locale.
8614  *
8615  * Because the prepropessing directives to conditionally compile this function
8616  * would greatly obscure the logic of the various implementations, the whole
8617  * function is repeated for each configuration, with some common macros. */
8618
8619 /* Used to shorten the definitions of the following implementations of
8620  * my_strerror() */
8621 #define DEBUG_STRERROR_ENTER(errnum, in_locale)                             \
8622     DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                  \
8623                            "my_strerror called with errnum %d;"             \
8624                            " Within locale scope=%d\n",                     \
8625                            errnum, in_locale))
8626
8627 #define DEBUG_STRERROR_RETURN(errstr, utf8ness)                             \
8628     DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                  \
8629                            "Strerror returned; saving a copy: '%s';"        \
8630                            " utf8ness=%d\n",                                \
8631                            get_displayable_string(errstr,                   \
8632                                                   errstr + strlen(errstr),  \
8633                                                   *utf8ness),               \
8634                            (int) *utf8ness))
8635
8636 /* On platforms that have precisely one of these categories (Windows
8637  * qualifies), these yield the correct one */
8638 #if defined(USE_LOCALE_CTYPE)
8639 #  define WHICH_LC_INDEX LC_CTYPE_INDEX_
8640 #elif defined(USE_LOCALE_MESSAGES)
8641 #  define WHICH_LC_INDEX LC_MESSAGES_INDEX_
8642 #endif
8643
8644 /*===========================================================================*/
8645 /* First set of implementations, when have strerror_l() */
8646
8647 #if defined(USE_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
8648
8649 #  if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
8650
8651 /* Here, neither category is defined: use the C locale */
8652 const char *
8653 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
8654 {
8655     PERL_ARGS_ASSERT_MY_STRERROR;
8656
8657     DEBUG_STRERROR_ENTER(errnum, 0);
8658
8659     const char *errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
8660     *utf8ness = UTF8NESS_IMMATERIAL;
8661
8662     DEBUG_STRERROR_RETURN(errstr, utf8ness);
8663
8664     SAVEFREEPV(errstr);
8665     return errstr;
8666 }
8667
8668 #  elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
8669
8670 /*--------------------------------------------------------------------------*/
8671
8672 /* Here one or the other of CTYPE or MESSAGES is defined, but not both.  If we
8673  * are not within 'use locale' scope of the only one defined, we use the C
8674  * locale; otherwise use the current locale object */
8675
8676 const char *
8677 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
8678 {
8679     PERL_ARGS_ASSERT_MY_STRERROR;
8680
8681     DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
8682
8683     /* Use C if not within locale scope;  Otherwise, use current locale */
8684     const locale_t which_obj = (IN_LC(categories[WHICH_LC_INDEX]))
8685                                ? PL_C_locale_obj
8686                                : use_curlocale_scratch();
8687
8688     const char *errstr = savepv(strerror_l(errnum, which_obj));
8689     *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN,
8690                                              NULL, WHICH_LC_INDEX);
8691     DEBUG_STRERROR_RETURN(errstr, utf8ness);
8692
8693     SAVEFREEPV(errstr);
8694     return errstr;
8695 }
8696
8697 /*--------------------------------------------------------------------------*/
8698 #  else     /* Are using both categories.  Place them in the same CODESET,
8699              * either C or the LC_MESSAGES locale */
8700
8701 const char *
8702 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
8703 {
8704     PERL_ARGS_ASSERT_MY_STRERROR;
8705
8706     DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
8707
8708     const char *errstr;
8709     if (! IN_LC(LC_MESSAGES)) {    /* Use C if not within locale scope */
8710         errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
8711         *utf8ness = UTF8NESS_IMMATERIAL;
8712     }
8713     else {  /* Otherwise, use the LC_MESSAGES locale, making sure LC_CTYPE
8714                matches */
8715         locale_t cur = duplocale(use_curlocale_scratch());
8716
8717         cur = newlocale(LC_CTYPE_MASK, querylocale_c(LC_MESSAGES), cur);
8718         errstr = savepv(strerror_l(errnum, cur));
8719         *utf8ness = get_locale_string_utf8ness_i(errstr,
8720                                                  LOCALE_UTF8NESS_UNKNOWN,
8721                                                  NULL, LC_MESSAGES_INDEX_);
8722         freelocale(cur);
8723     }
8724
8725     DEBUG_STRERROR_RETURN(errstr, utf8ness);
8726
8727     SAVEFREEPV(errstr);
8728     return errstr;
8729 }
8730 #  endif    /* Above is using strerror_l */
8731 /*===========================================================================*/
8732 #else       /* Below is not using strerror_l */
8733 #  if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
8734
8735 /* If not using using either of the categories, return plain, unadorned
8736  * strerror */
8737
8738 const char *
8739 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
8740 {
8741     PERL_ARGS_ASSERT_MY_STRERROR;
8742
8743     DEBUG_STRERROR_ENTER(errnum, 0);
8744
8745     const char *errstr = savepv(Strerror(errnum));
8746     *utf8ness = UTF8NESS_IMMATERIAL;
8747
8748     DEBUG_STRERROR_RETURN(errstr, utf8ness);
8749
8750     SAVEFREEPV(errstr);
8751     return errstr;
8752 }
8753
8754 /*--------------------------------------------------------------------------*/
8755 #  elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
8756
8757 /* Here one or the other of CTYPE or MESSAGES is defined, but not both.  If we
8758  * are not within 'use locale' scope of the only one defined, we use the C
8759  * locale; otherwise use the current locale */
8760
8761 const char *
8762 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
8763 {
8764     PERL_ARGS_ASSERT_MY_STRERROR;
8765
8766     DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
8767
8768     const char *errstr;
8769     if (IN_LC(categories[WHICH_LC_INDEX])) {
8770         errstr = savepv(Strerror(errnum));
8771         *utf8ness = get_locale_string_utf8ness_i(errstr,
8772                                                  LOCALE_UTF8NESS_UNKNOWN,
8773                                                  NULL, WHICH_LC_INDEX);
8774     }
8775     else {
8776
8777         LOCALE_LOCK;
8778
8779         const char * orig_locale = toggle_locale_i(WHICH_LC_INDEX, "C");
8780
8781         errstr = savepv(Strerror(errnum));
8782
8783         restore_toggled_locale_i(WHICH_LC_INDEX, orig_locale);
8784
8785         LOCALE_UNLOCK;
8786
8787         *utf8ness = UTF8NESS_IMMATERIAL;
8788     }
8789
8790     DEBUG_STRERROR_RETURN(errstr, utf8ness);
8791
8792     SAVEFREEPV(errstr);
8793     return errstr;
8794 }
8795
8796 /*--------------------------------------------------------------------------*/
8797 #  else
8798
8799 /* Below, have both LC_CTYPE and LC_MESSAGES.  Place them in the same CODESET,
8800  * either C or the LC_MESSAGES locale */
8801
8802 const char *
8803 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
8804 {
8805     PERL_ARGS_ASSERT_MY_STRERROR;
8806
8807     DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
8808
8809     const char * desired_locale = (IN_LC(LC_MESSAGES))
8810                                   ? querylocale_c(LC_MESSAGES)
8811                                   : "C";
8812     /* XXX Can fail on z/OS */
8813
8814     LOCALE_LOCK;
8815
8816     const char* orig_CTYPE_locale    = toggle_locale_c(LC_CTYPE,
8817                                                        desired_locale);
8818     const char* orig_MESSAGES_locale = toggle_locale_c(LC_MESSAGES,
8819                                                        desired_locale);
8820     const char *errstr = savepv(Strerror(errnum));
8821
8822     restore_toggled_locale_c(LC_MESSAGES, orig_MESSAGES_locale);
8823     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
8824
8825     LOCALE_UNLOCK;
8826
8827     *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN,
8828                                              NULL, LC_MESSAGES_INDEX_);
8829     DEBUG_STRERROR_RETURN(errstr, utf8ness);
8830
8831     SAVEFREEPV(errstr);
8832     return errstr;
8833 }
8834
8835 /*--------------------------------------------------------------------------*/
8836 #  endif /* end of not using strerror_l() */
8837 #endif   /* end of all the my_strerror() implementations */
8838
8839 /*
8840
8841 =for apidoc switch_to_global_locale
8842
8843 This function copies the locale state of the calling thread into the program's
8844 global locale, and converts the thread to use that global locale.
8845
8846 It is intended so that Perl can safely be used with C libraries that access the
8847 global locale and which can't be converted to not access it.  Effectively, this
8848 means libraries that call C<L<setlocale(3)>> on non-Windows systems.  (For
8849 portability, it is a good idea to use it on Windows as well.)
8850
8851 A downside of using it is that it disables the services that Perl provides to
8852 hide locale gotchas from your code.  The service you most likely will miss
8853 regards the radix character (decimal point) in floating point numbers.  Code
8854 executed after this function is called can no longer just assume that this
8855 character is correct for the current circumstances.
8856
8857 To return to Perl control, and restart the gotcha prevention services, call
8858 C<L</sync_locale>>.  Behavior is undefined for any pure Perl code that executes
8859 while the switch is in effect.
8860
8861 The global locale and the per-thread locales are independent.  As long as just
8862 one thread converts to the global locale, everything works smoothly.  But if
8863 more than one does, they can easily interfere with each other, and races are
8864 likely.  On Windows systems prior to Visual Studio 15 (at which point Microsoft
8865 fixed a bug), races can occur (even if only one thread has been converted to
8866 the global locale), but only if you use the following operations:
8867
8868 =over
8869
8870 =item L<POSIX::localeconv|POSIX/localeconv>
8871
8872 =item L<I18N::Langinfo>, items C<CRNCYSTR> and C<THOUSEP>
8873
8874 =item L<perlapi/Perl_langinfo>, items C<CRNCYSTR> and C<THOUSEP>
8875
8876 =back
8877
8878 The first item is not fixable (except by upgrading to a later Visual Studio
8879 release), but it would be possible to work around the latter two items by
8880 having Perl change its algorithm for calculating these to use Windows API
8881 functions (likely C<GetNumberFormat> and C<GetCurrencyFormat>); patches
8882 welcome.
8883
8884 XS code should never call plain C<setlocale>, but should instead be converted
8885 to either call L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in
8886 for the system C<setlocale>) or use the methods given in L<perlcall> to call
8887 L<C<POSIX::setlocale>|POSIX/setlocale>.  Either one will transparently properly
8888 handle all cases of single- vs multi-thread, POSIX 2008-supported or not.
8889
8890 =cut
8891 */
8892
8893 #if defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE)
8894 #  define CHANGE_SYSTEM_LOCALE_TO_GLOBAL                                \
8895     STMT_START {                                                        \
8896         if (_configthreadlocale(_DISABLE_PER_THREAD_LOCALE) == -1) {    \
8897             locale_panic_("_configthreadlocale returned an error");     \
8898         }                                                               \
8899     } STMT_END
8900 #elif defined(USE_POSIX_2008_LOCALE)
8901 #  define CHANGE_SYSTEM_LOCALE_TO_GLOBAL                                \
8902     STMT_START {                                                        \
8903         locale_t old_locale = uselocale(LC_GLOBAL_LOCALE);              \
8904         if (! old_locale) {                                             \
8905             locale_panic_("Could not change to global locale");         \
8906         }                                                               \
8907                                                                         \
8908         /* Free the per-thread memory */                                \
8909         if (   old_locale != LC_GLOBAL_LOCALE                           \
8910             && old_locale != PL_C_locale_obj)                           \
8911         {                                                               \
8912             freelocale(old_locale);                                     \
8913         }                                                               \
8914     } STMT_END
8915 #else
8916 #  define CHANGE_SYSTEM_LOCALE_TO_GLOBAL
8917 #endif
8918
8919 void
8920 Perl_switch_to_global_locale(pTHX)
8921 {
8922
8923 #ifdef USE_LOCALE
8924
8925     DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering switch_to_global; %s\n",
8926                                           get_LC_ALL_display()));
8927
8928    /* In these cases, we use the system state to determine if we are in the
8929     * global locale or not. */
8930 #  ifdef USE_POSIX_2008_LOCALE
8931
8932     const bool perl_controls = (LC_GLOBAL_LOCALE != uselocale((locale_t) 0));
8933
8934 #  elif defined(USE_THREAD_SAFE_LOCALE) && defined(WIN32)
8935
8936     int config_return = _configthreadlocale(0);
8937     if (config_return == -1) {
8938         locale_panic_("_configthreadlocale returned an error");
8939     }
8940     const bool perl_controls = (config_return == _ENABLE_PER_THREAD_LOCALE);
8941
8942 #  else
8943
8944     const bool perl_controls = false;
8945
8946 #  endif
8947
8948     /* No-op if already in global */
8949     if (! perl_controls) {
8950         return;
8951     }
8952
8953 #  ifdef LC_ALL
8954
8955     const char * thread_locale = calculate_LC_ALL_string(NULL,
8956                                                          EXTERNAL_FORMAT_FOR_SET,
8957                                                          WANT_TEMP_PV,
8958                                                          __LINE__);
8959     CHANGE_SYSTEM_LOCALE_TO_GLOBAL;
8960     posix_setlocale(LC_ALL, thread_locale);
8961
8962 #  else   /* Must be USE_POSIX_2008_LOCALE) */
8963
8964     const char * cur_thread_locales[LC_ALL_INDEX_];
8965
8966     /* Save each category's current per-thread state */
8967     for_all_individual_category_indexes(i) {
8968         cur_thread_locales[i] = querylocale_i(i);
8969     }
8970
8971     CHANGE_SYSTEM_LOCALE_TO_GLOBAL;
8972
8973     /* Set the global to what was our per-thread state */
8974     POSIX_SETLOCALE_LOCK;
8975     for_all_individual_category_indexes(i) {
8976         posix_setlocale(categories[i], cur_thread_locales[i]);
8977     }
8978     POSIX_SETLOCALE_UNLOCK;
8979
8980 #  endif
8981 #  ifdef USE_LOCALE_NUMERIC
8982
8983     /* Switch to the underlying C numeric locale; the application is on its
8984      * own. */
8985     POSIX_SETLOCALE_LOCK;
8986     posix_setlocale(LC_NUMERIC, PL_numeric_name);
8987     POSIX_SETLOCALE_UNLOCK;
8988
8989 #  endif
8990 #endif
8991
8992 }
8993
8994 /*
8995
8996 =for apidoc sync_locale
8997
8998 This function copies the state of the program global locale into the calling
8999 thread, and converts that thread to using per-thread locales, if it wasn't
9000 already, and the platform supports them.  The LC_NUMERIC locale is toggled into
9001 the standard state (using the C locale's conventions), if not within the
9002 lexical scope of S<C<use locale>>.
9003
9004 Perl will now consider itself to have control of the locale.
9005
9006 Since unthreaded perls have only a global locale, this function is a no-op
9007 without threads.
9008
9009 This function is intended for use with C libraries that do locale manipulation.
9010 It allows Perl to accommodate the use of them.  Call this function before
9011 transferring back to Perl space so that it knows what state the C code has left
9012 things in.
9013
9014 XS code should not manipulate the locale on its own.  Instead,
9015 L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or
9016 change the locale (though changing the locale is antisocial and dangerous on
9017 multi-threaded systems that don't have multi-thread safe locale operations.
9018 (See L<perllocale/Multi-threaded operation>).
9019
9020 Using the libc L<C<setlocale(3)>> function should be avoided.  Nevertheless,
9021 certain non-Perl libraries called from XS, do call it, and their behavior may
9022 not be able to be changed.  This function, along with
9023 C<L</switch_to_global_locale>>, can be used to get seamless behavior in these
9024 circumstances, as long as only one thread is involved.
9025
9026 If the library has an option to turn off its locale manipulation, doing that is
9027 preferable to using this mechanism.  C<Gtk> is such a library.
9028
9029 The return value is a boolean: TRUE if the global locale at the time of call
9030 was in effect for the caller; and FALSE if a per-thread locale was in effect.
9031
9032 =cut
9033 */
9034
9035 bool
9036 Perl_sync_locale(pTHX)
9037 {
9038
9039 #ifndef USE_LOCALE
9040
9041     return TRUE;
9042
9043 #else
9044
9045     bool was_in_global = TRUE;
9046
9047 #  ifdef USE_THREAD_SAFE_LOCALE
9048 #    if defined(WIN32)
9049
9050     int config_return = _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
9051     if (config_return == -1) {
9052         locale_panic_("_configthreadlocale returned an error");
9053     }
9054     was_in_global = (config_return == _DISABLE_PER_THREAD_LOCALE);
9055
9056 #    elif defined(USE_POSIX_2008_LOCALE)
9057
9058     was_in_global = (LC_GLOBAL_LOCALE == uselocale(LC_GLOBAL_LOCALE));
9059
9060 #    else
9061 #      error Unexpected Configuration
9062 #    endif
9063 #  endif    /* USE_THREAD_SAFE_LOCALE */
9064
9065     /* Here, we are in the global locale.  Get and save the values for each
9066      * category, and convert the current thread to use them */
9067
9068 #  ifdef LC_ALL
9069
9070     STDIZED_SETLOCALE_LOCK;
9071     const char * lc_all_string = savepv(stdized_setlocale(LC_ALL, NULL));
9072     STDIZED_SETLOCALE_UNLOCK;
9073
9074     give_perl_locale_control(lc_all_string, __LINE__);
9075     Safefree(lc_all_string);
9076
9077 #  else
9078
9079     const char * current_globals[LC_ALL_INDEX_];
9080     for_all_individual_category_indexes(i) {
9081         STDIZED_SETLOCALE_LOCK;
9082         current_globals[i] = savepv(stdized_setlocale(categories[i], NULL));
9083         STDIZED_SETLOCALE_UNLOCK;
9084     }
9085
9086     give_perl_locale_control((const char **) &current_globals, __LINE__);
9087
9088     for_all_individual_category_indexes(i) {
9089         Safefree(current_globals[i]);
9090     }
9091
9092 #  endif
9093
9094     return was_in_global;
9095
9096 #endif
9097
9098 }
9099
9100 #if defined(DEBUGGING) && defined(USE_LOCALE)
9101
9102 STATIC char *
9103 S_my_setlocale_debug_string_i(pTHX_
9104                               const locale_category_index cat_index,
9105                               const char* locale, /* Optional locale name */
9106
9107                               /* return value from setlocale() when attempting
9108                                * to set 'category' to 'locale' */
9109                               const char* retval,
9110
9111                               const line_t line)
9112 {
9113     /* Returns a pointer to a NUL-terminated string in static storage with
9114      * added text about the info passed in.  This is not thread safe and will
9115      * be overwritten by the next call, so this should be used just to
9116      * formulate a string to immediately print or savepv() on. */
9117
9118     const char * locale_quote;
9119     const char * retval_quote;
9120
9121     assert(cat_index <= LC_ALL_INDEX_);
9122
9123     if (locale == NULL) {
9124         locale_quote = "";
9125         locale = "NULL";
9126     }
9127     else {
9128         locale_quote = "\"";
9129     }
9130
9131     if (retval == NULL) {
9132         retval_quote = "";
9133         retval = "NULL";
9134     }
9135     else {
9136         retval_quote = "\"";
9137     }
9138
9139 #  ifdef USE_LOCALE_THREADS
9140 #    define THREAD_FORMAT "%p:"
9141 #    define THREAD_ARGUMENT aTHX_
9142 #  else
9143 #    define THREAD_FORMAT
9144 #    define THREAD_ARGUMENT
9145 #  endif
9146
9147     return Perl_form(aTHX_
9148                      "%s:%" LINE_Tf ": " THREAD_FORMAT
9149                      " setlocale(%s[%d], %s%s%s) returned %s%s%s\n",
9150
9151                      __FILE__, line, THREAD_ARGUMENT
9152                      category_names[cat_index], categories[cat_index],
9153                      locale_quote, locale, locale_quote,
9154                      retval_quote, retval, retval_quote);
9155 }
9156
9157 #endif
9158 #ifdef USE_PERL_SWITCH_LOCALE_CONTEXT
9159
9160 void
9161 Perl_switch_locale_context(pTHX)
9162 {
9163     /* libc keeps per-thread locale status information in some configurations.
9164      * So, we can't just switch out aTHX to switch to a new thread.  libc has
9165      * to follow along.  This routine does that based on per-interpreter
9166      * variables we keep just for this purpose.
9167      *
9168      * There are two implementations where this is an issue.  For the other
9169      * implementations, it doesn't matter because libc is using global values
9170      * that all threads know about.
9171      *
9172      * The two implementations are where libc keeps thread-specific information
9173      * on its own.  These are
9174      *
9175      * POSIX 2008:  The current locale is kept by libc as an object.  We save
9176      *              a copy of that in the per-thread PL_cur_locale_obj, and so
9177      *              this routine uses that copy to tell the thread it should be
9178      *              operating with that object
9179      * Windows thread-safe locales:  A given thread in Windows can be being run
9180      *              with per-thread locales, or not.  When the thread context
9181      *              changes, libc doesn't automatically know if the thread is
9182      *              using per-thread locales, nor does it know what the new
9183      *              thread's locale is.  We keep that information in the
9184      *              per-thread variables:
9185      *                  PL_controls_locale  indicates if this thread is using
9186      *                                      per-thread locales or not
9187      *                  PL_cur_LC_ALL       indicates what the locale should be
9188      *                                      if it is a per-thread locale.
9189      */
9190
9191     if (UNLIKELY(   PL_veto_switch_non_tTHX_context
9192                  || PL_phase == PERL_PHASE_CONSTRUCT))
9193     {
9194         return;
9195     }
9196
9197 #  ifdef USE_POSIX_2008_LOCALE
9198
9199     if (! uselocale(PL_cur_locale_obj)) {
9200         locale_panic_(Perl_form(aTHX_
9201                                 "Can't uselocale(%p), LC_ALL supposed to"
9202                                 " be '%s'",
9203                                 PL_cur_locale_obj, get_LC_ALL_display()));
9204     }
9205
9206 #  elif defined(WIN32)
9207
9208     if (! bool_setlocale_c(LC_ALL, PL_cur_LC_ALL)) {
9209         locale_panic_(Perl_form(aTHX_ "Can't setlocale(%s)", PL_cur_LC_ALL));
9210     }
9211
9212 #  endif
9213
9214 }
9215
9216 #endif
9217
9218 void
9219 Perl_thread_locale_init(pTHX)
9220 {
9221
9222 #ifdef USE_THREAD_SAFE_LOCALE
9223 #  ifdef USE_POSIX_2008_LOCALE
9224
9225     /* Called from a thread on startup.
9226      *
9227      * The operations here have to be done from within the calling thread, as
9228      * they affect libc's knowledge of the thread; libc has no knowledge of
9229      * aTHX */
9230
9231      DEBUG_L(PerlIO_printf(Perl_debug_log,
9232                            "new thread, initial locale is %s;"
9233                            " calling setlocale(LC_ALL, \"C\")\n",
9234                            get_LC_ALL_display()));
9235
9236     if (! uselocale(PL_C_locale_obj)) {
9237
9238         /* Not being able to change to the C locale is severe; don't keep
9239          * going.  */
9240         locale_panic_(Perl_form(aTHX_
9241                                 "Can't uselocale(%p), 'C'", PL_C_locale_obj));
9242         NOT_REACHED; /* NOTREACHED */
9243     }
9244
9245 #    ifdef MULTIPLICITY
9246
9247     PL_cur_locale_obj = PL_C_locale_obj;
9248
9249 #    endif
9250 #  elif defined(WIN32)
9251
9252     /* On Windows, make sure new thread has per-thread locales enabled */
9253     if (_configthreadlocale(_ENABLE_PER_THREAD_LOCALE) == -1) {
9254         locale_panic_("_configthreadlocale returned an error");
9255     }
9256     void_setlocale_c(LC_ALL, "C");
9257
9258 #  endif
9259 #endif
9260
9261 }
9262
9263 void
9264 Perl_thread_locale_term(pTHX)
9265 {
9266     /* Called from a thread as it gets ready to terminate.
9267      *
9268      * The operations here have to be done from within the calling thread, as
9269      * they affect libc's knowledge of the thread; libc has no knowledge of
9270      * aTHX */
9271
9272 #if defined(USE_POSIX_2008_LOCALE) && defined(USE_THREADS)
9273
9274     /* Switch to the global locale, so can free up the per-thread object */
9275     locale_t actual_obj = uselocale(LC_GLOBAL_LOCALE);
9276     if (actual_obj != LC_GLOBAL_LOCALE && actual_obj != PL_C_locale_obj) {
9277         freelocale(actual_obj);
9278     }
9279
9280     /* Prevent leaks even if something has gone wrong */
9281     locale_t expected_obj = PL_cur_locale_obj;
9282     if (UNLIKELY(   expected_obj != actual_obj
9283                  && expected_obj != LC_GLOBAL_LOCALE
9284                  && expected_obj != PL_C_locale_obj))
9285     {
9286         freelocale(expected_obj);
9287     }
9288
9289     PL_cur_locale_obj = LC_GLOBAL_LOCALE;
9290
9291 #endif
9292 #ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES
9293
9294     /* When faking the mingw implementation, we coerce this function into doing
9295      * something completely different from its intent -- namely to free up our
9296      * static buffer to avoid a leak.  This function gets called for each
9297      * thread that is terminating, so will give us a chance to free the buffer
9298      * from the appropriate pool.  On unthreaded systems, it gets called by the
9299      * mutex termination code. */
9300
9301 #  ifdef MULTIPLICITY
9302
9303     if (aTHX != wsetlocale_buf_aTHX) {
9304         return;
9305     }
9306
9307 #  endif
9308
9309     if (wsetlocale_buf_size > 0) {
9310         Safefree(wsetlocale_buf);
9311         wsetlocale_buf_size = 0;
9312     }
9313
9314 #endif
9315
9316 }
9317
9318 /*
9319  * ex: set ts=8 sts=4 sw=4 et:
9320  */