locale.c

   1 /*    locale.c
   2  *
   3  *    Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
   4  *    2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  *      A Elbereth Gilthoniel,
  13  *      silivren penna míriel
  14  *      o menel aglar elenath!
  15  *      Na-chaered palan-díriel
  16  *      o galadhremmin ennorath,
  17  *      Fanuilos, le linnathon
  18  *      nef aear, si nef aearon!
  19  *
  20  *     [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
  21  */
  22
  23 /* utility functions for handling locale-specific stuff like what
  24  * character represents the decimal point.
  25  *
  26  * All C programs have an underlying locale.  Perl code generally doesn't pay
  27  * any attention to it except within the scope of a 'use locale'.  For most
  28  * categories, it accomplishes this by just using different operations if it is
  29  * in such scope than if not.  However, various libc functions called by Perl
  30  * are affected by the LC_NUMERIC category, so there are macros in perl.h that
  31  * are used to toggle between the current locale and the C locale depending on
  32  * the desired behavior of those functions at the moment.  And, LC_MESSAGES is
  33  * switched to the C locale for outputting the message unless within the scope
  34  * of 'use locale'.
  35  *
  36  * This code now has multi-thread-safe locale handling on systems that support
  37  * that.  This is completely transparent to most XS code.  On earlier systems,
  38  * it would be possible to emulate thread-safe locales, but this likely would
  39  * involve a lot of locale switching, and would require XS code changes.
  40  * Macros could be written so that the code wouldn't have to know which type of
  41  * system is being used.
  42  *
  43  * Table-driven code is used for simplicity and clarity, as many operations
  44  * differ only in which category is being worked on.  However the system
  45  * categories need not be small contiguous integers, so do not lend themselves
  46  * to table lookup.  Instead we have created our own equivalent values which
  47  * are all small contiguous non-negative integers, and translation functions
  48  * between the two sets.  For category 'LC_foo', the name of our index is
  49  * LC_foo_INDEX_.  Various parallel tables, indexed by these, are used.
  50  *
  51  * Many of the macros and functions in this file have one of the suffixes '_c',
  52  * '_r', or '_i'.  khw found these useful in remembering what type of locale
  53  * category to use as their parameter.  '_r' takes an int category number as
  54  * passed to setlocale(), like LC_ALL, LC_CTYPE, etc.  The 'r' indicates that
  55  * the value isn't known until runtime.  '_c' also indicates such a category
  56  * number, but its value is known at compile time.  These are both converted
  57  * into unsigned indexes into various tables of category information, where the
  58  * real work is generally done.  The tables are generated at compile-time based
  59  * on platform characteristics and Configure options.  They hide from the code
  60  * many of the vagaries of the different locale implementations out there.  You
  61  * may have already guessed that '_i' indicates the parameter is such an
  62  * unsigned index.  Converting from '_r' to '_i' requires run-time lookup.
  63  * '_c' is used to get cpp to do this at compile time.  To avoid the runtime
  64  * expense, the code is structured to use '_r' at the API level, and once
  65  * converted, everything possible is done using the table indexes.
  66  *
  67  * On unthreaded perls, most operations expand out to just the basic
  68  * setlocale() calls.  The same is true on threaded perls on modern Windows
  69  * systems where the same API, after set up, is used for thread-safe locale
  70  * handling.  On other systems, there is a completely different API, specified
  71  * in POSIX 2008, to do thread-safe locales.  On these systems, our
  72  * emulate_setlocale_i() function is used to hide the different API from the
  73  * outside.  This makes it completely transparent to most XS code.
  74  *
  75  * A huge complicating factor is that the LC_NUMERIC category is normally held
  76  * in the C locale, except during those relatively rare times when it needs to
  77  * be in the underlying locale.  There is a bunch of code to accomplish this,
  78  * and to allow easy switches from one state to the other.
  79  *
  80  * In addition, the setlocale equivalents have versions for the return context,
  81  * 'void' and 'bool', besides the full return value.  This can present
  82  * opportunities for avoiding work.  We don't have to necessarily create a safe
  83  * copy to return if no return is desired.
  84  *
  85  * There are 3.5 major implementations here; which one chosen depends on what
  86  * the platform has available, and Configuration options.
  87  *
  88  * 1) Raw my_setlocale().  Here the layer adds nothing.  This is used for
  89  *    unthreaded perls, and when the API for safe locale threading is identical
  90  *    to the unsafe API (Windows, currently).
  91  *
  92  * 2) A minimal layer that makes my_setlocale() uninterruptible and returns a
  93  *    per-thread/per-category value.
  94  *
  95  * 3a and 3b) A layer that implements POSIX 2008 thread-safe locale handling,
  96  *    mapping the setlocale() API to them.  This automatically makes almost all
  97  *    code thread-safe without need for changes.  This layer is chosen on
  98  *    threaded perls when the platform supports the POSIX 2008 functions, and
  99  *    when there is no manual override in Configure.
 100  *
 101  *    3a) is when the platform has a reliable querylocale() function or
 102  *        equivalent that is selected to be used.
 103  *    3b) is when we have to emulate that functionality.
 104  *
 105  * z/OS (os390) is an outlier.  Locales really don't work under threads when
 106  * either the radix character isn't a dot, or attempts are made to change
 107  * locales after the first thread is created.  The reason is that IBM has made
 108  * it thread-safe by refusing to change locales (returning failure if
 109  * attempted) any time after an application has called pthread_create() to
 110  * create another thread.  The expectation is that an application will set up
 111  * its locale information before the first fork, and be stable thereafter.  But
 112  * perl toggles LC_NUMERIC if the locale's radix character isn't a dot, as do
 113  * the other toggles, which are less common.
 114  */
 115
 116 /* If the environment says to, we can output debugging information during
 117  * initialization.  This is done before option parsing, and before any thread
 118  * creation, so can be a file-level static.  (Must come before #including
 119  * perl.h) */
 120 #ifdef DEBUGGING
 121 static int debug_initialization = 0;
 122 #  define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
 123 #  define DEBUG_LOCALE_INITIALIZATION_  debug_initialization
 124 #else
 125 #  define debug_initialization 0
 126 #  define DEBUG_INITIALIZATION_set(v)
 127 #endif
 128
 129 #define DEBUG_PRE_STMTS   dSAVE_ERRNO;                                        \
 130     PerlIO_printf(Perl_debug_log, "%s: %" LINE_Tf ": ", __FILE__, __LINE__);
 131 #define DEBUG_POST_STMTS  RESTORE_ERRNO;
 132
 133 #include "EXTERN.h"
 134 #define PERL_IN_LOCALE_C
 135 #include "perl.h"
 136
 137 #include "reentr.h"
 138
 139 #ifdef I_WCHAR
 140 #  include <wchar.h>
 141 #endif
 142 #ifdef I_WCTYPE
 143 #  include <wctype.h>
 144 #endif
 145
 146 #ifdef USE_LOCALE
 147
 148 PERL_STATIC_INLINE const char *
 149 S_mortalized_pv_copy(pTHX_ const char * const pv)
 150 {
 151     PERL_ARGS_ASSERT_MORTALIZED_PV_COPY;
 152
 153     /* Copies the input pv, and arranges for it to be freed at an unspecified
 154      * later time. */
 155
 156     if (pv == NULL) {
 157         return NULL;
 158     }
 159
 160     const char * copy = savepv(pv);
 161     SAVEFREEPV(copy);
 162     return copy;
 163 }
 164
 165 #endif
 166
 167 /* Returns the Unix errno portion; ignoring any others.  This is a macro here
 168  * instead of putting it into perl.h, because unclear to khw what should be
 169  * done generally. */
 170 #define GET_ERRNO   saved_errno
 171
 172 /* Default values come from the C locale */
 173 #define C_codeset "ANSI_X3.4-1968" /* Only in some Configurations, and usually
 174                                       a single instance, so is a #define */
 175 static const char C_decimal_point[] = ".";
 176 static const char C_thousands_sep[] = "";
 177
 178 /* Is the C string input 'name' "C" or "POSIX"?  If so, and 'name' is the
 179  * return of setlocale(), then this is extremely likely to be the C or POSIX
 180  * locale.  However, the output of setlocale() is documented to be opaque, but
 181  * the odds are extremely small that it would return these two strings for some
 182  * other locale.  Note that VMS in these two locales includes many non-ASCII
 183  * characters as controls and punctuation (below are hex bytes):
 184  *   cntrl:  84-97 9B-9F
 185  *   punct:  A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
 186  * Oddly, none there are listed as alphas, though some represent alphabetics
 187  * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
 188 #define isNAME_C_OR_POSIX(name)                                              \
 189                              (   (name) != NULL                              \
 190                               && (( *(name) == 'C' && (*(name + 1)) == '\0') \
 191                                    || strEQ((name), "POSIX")))
 192
 193 #if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO)
 194 #  define HAS_SOME_LANGINFO
 195 #endif
 196 #if defined(HAS_LOCALECONV) || defined(HAS_LOCALECONV_L)
 197 #  define HAS_SOME_LOCALECONV
 198 #endif
 199
 200 #define my_langinfo_c(item, category, locale, retbufp, retbuf_sizep, utf8ness) \
 201             my_langinfo_i(item, category##_INDEX_, locale, retbufp,            \
 202                                                       retbuf_sizep,  utf8ness)
 203
 204 #ifdef USE_LOCALE
 205
 206 #  ifdef DEBUGGING
 207 #    define setlocale_debug_string_i(index, locale, result)                 \
 208             my_setlocale_debug_string_i(index, locale, result, __LINE__)
 209 #    define setlocale_debug_string_c(category, locale, result)              \
 210                 setlocale_debug_string_i(category##_INDEX_, locale, result)
 211 #    define setlocale_debug_string_r(category, locale, result)              \
 212              setlocale_debug_string_i(get_category_index(category, locale), \
 213                                       locale, result)
 214 #  endif
 215
 216 #  define toggle_locale_i(index, locale)                                    \
 217                  S_toggle_locale_i(aTHX_ index, locale, __LINE__)
 218 #  define toggle_locale_c(cat, locale)  toggle_locale_i(cat##_INDEX_, locale)
 219 #  define restore_toggled_locale_i(index, locale)                           \
 220                  S_restore_toggled_locale_i(aTHX_ index, locale, __LINE__)
 221 #  define restore_toggled_locale_c(cat, locale)                             \
 222                              restore_toggled_locale_i(cat##_INDEX_, locale)
 223
 224 /* Two parallel arrays indexed by our mapping of category numbers into small
 225  * non-negative indexes; first the locale categories Perl uses on this system,
 226  * used to do the inverse mapping.  The second array is their names.  These
 227  * arrays are in mostly arbitrary order. */
 228
 229 STATIC const int categories[] = {
 230
 231 #    ifdef USE_LOCALE_CTYPE
 232                              LC_CTYPE,
 233 #    endif
 234 #  ifdef USE_LOCALE_NUMERIC
 235                              LC_NUMERIC,
 236 #  endif
 237 #    ifdef USE_LOCALE_COLLATE
 238                              LC_COLLATE,
 239 #    endif
 240 #    ifdef USE_LOCALE_TIME
 241                              LC_TIME,
 242 #    endif
 243 #    ifdef USE_LOCALE_MESSAGES
 244                              LC_MESSAGES,
 245 #    endif
 246 #    ifdef USE_LOCALE_MONETARY
 247                              LC_MONETARY,
 248 #    endif
 249 #    ifdef USE_LOCALE_ADDRESS
 250                              LC_ADDRESS,
 251 #    endif
 252 #    ifdef USE_LOCALE_IDENTIFICATION
 253                              LC_IDENTIFICATION,
 254 #    endif
 255 #    ifdef USE_LOCALE_MEASUREMENT
 256                              LC_MEASUREMENT,
 257 #    endif
 258 #    ifdef USE_LOCALE_PAPER
 259                              LC_PAPER,
 260 #    endif
 261 #    ifdef USE_LOCALE_TELEPHONE
 262                              LC_TELEPHONE,
 263 #    endif
 264 #    ifdef USE_LOCALE_SYNTAX
 265                              LC_SYNTAX,
 266 #    endif
 267 #    ifdef USE_LOCALE_TOD
 268                              LC_TOD,
 269 #    endif
 270 #    ifdef LC_ALL
 271                              LC_ALL,
 272 #    endif
 273
 274    /* Placeholder as a precaution if code fails to check the return of
 275     * get_category_index(), which returns this element to indicate an error */
 276                             -1
 277 };
 278
 279 /* The top-most real element is LC_ALL */
 280
 281 STATIC const char * const category_names[] = {
 282
 283 #    ifdef USE_LOCALE_CTYPE
 284                                  "LC_CTYPE",
 285 #    endif
 286 #  ifdef USE_LOCALE_NUMERIC
 287                                  "LC_NUMERIC",
 288 #  endif
 289 #    ifdef USE_LOCALE_COLLATE
 290                                  "LC_COLLATE",
 291 #    endif
 292 #    ifdef USE_LOCALE_TIME
 293                                  "LC_TIME",
 294 #    endif
 295 #    ifdef USE_LOCALE_MESSAGES
 296                                  "LC_MESSAGES",
 297 #    endif
 298 #    ifdef USE_LOCALE_MONETARY
 299                                  "LC_MONETARY",
 300 #    endif
 301 #    ifdef USE_LOCALE_ADDRESS
 302                                  "LC_ADDRESS",
 303 #    endif
 304 #    ifdef USE_LOCALE_IDENTIFICATION
 305                                  "LC_IDENTIFICATION",
 306 #    endif
 307 #    ifdef USE_LOCALE_MEASUREMENT
 308                                  "LC_MEASUREMENT",
 309 #    endif
 310 #    ifdef USE_LOCALE_PAPER
 311                                  "LC_PAPER",
 312 #    endif
 313 #    ifdef USE_LOCALE_TELEPHONE
 314                                  "LC_TELEPHONE",
 315 #    endif
 316 #    ifdef USE_LOCALE_SYNTAX
 317                                  "LC_SYNTAX",
 318 #    endif
 319 #    ifdef USE_LOCALE_TOD
 320                                  "LC_TOD",
 321 #    endif
 322 #    ifdef LC_ALL
 323                                  "LC_ALL",
 324 #    endif
 325
 326    /* Placeholder as a precaution if code fails to check the return of
 327     * get_category_index(), which returns this element to indicate an error */
 328                                  NULL
 329 };
 330
 331 /* A few categories require additional setup when they are changed.  This table
 332  * points to the functions that do that setup */
 333 STATIC void (*update_functions[]) (pTHX_ const char *) = {
 334 #  ifdef USE_LOCALE_CTYPE
 335                                 S_new_ctype,
 336 #  endif
 337 #  ifdef USE_LOCALE_NUMERIC
 338                                 S_new_numeric,
 339 #  endif
 340 #  ifdef USE_LOCALE_COLLATE
 341                                 S_new_collate,
 342 #  endif
 343 #  ifdef USE_LOCALE_TIME
 344                                 NULL,
 345 #  endif
 346 #  ifdef USE_LOCALE_MESSAGES
 347                                 NULL,
 348 #  endif
 349 #  ifdef USE_LOCALE_MONETARY
 350                                 NULL,
 351 #  endif
 352 #  ifdef USE_LOCALE_ADDRESS
 353                                 NULL,
 354 #  endif
 355 #  ifdef USE_LOCALE_IDENTIFICATION
 356                                 NULL,
 357 #  endif
 358 #  ifdef USE_LOCALE_MEASUREMENT
 359                                 NULL,
 360 #  endif
 361 #  ifdef USE_LOCALE_PAPER
 362                                 NULL,
 363 #  endif
 364 #  ifdef USE_LOCALE_TELEPHONE
 365                                 NULL,
 366 #  endif
 367 #  ifdef USE_LOCALE_SYNTAX
 368                                 NULL,
 369 #  endif
 370 #  ifdef USE_LOCALE_TOD
 371                                 NULL,
 372 #  endif
 373     /* No harm done to have this even without an LC_ALL */
 374                                 S_new_LC_ALL,
 375
 376    /* Placeholder as a precaution if code fails to check the return of
 377     * get_category_index(), which returns this element to indicate an error */
 378                                 NULL
 379 };
 380
 381 #  ifdef LC_ALL
 382
 383     /* On systems with LC_ALL, it is kept in the highest index position.  (-2
 384      * to account for the final unused placeholder element.) */
 385 #    define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
 386 #  else
 387
 388     /* On systems without LC_ALL, we pretend it is there, one beyond the real
 389      * top element, hence in the unused placeholder element. */
 390 #    define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
 391 #  endif
 392
 393 /* Pretending there is an LC_ALL element just above allows us to avoid most
 394  * special cases.  Most loops through these arrays in the code below are
 395  * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'.  They will work
 396  * on either type of system.  But the code must be written to not access the
 397  * element at 'LC_ALL_INDEX_' except on platforms that have it.  This can be
 398  * checked for at compile time by using the #define LC_ALL_INDEX_ which is only
 399  * defined if we do have LC_ALL. */
 400
 401 STATIC unsigned int
 402 S_get_category_index(const int category, const char * locale)
 403 {
 404     /* Given a category, return the equivalent internal index we generally use
 405      * instead.
 406      *
 407      * 'locale' is for use in any generated diagnostics, and may be NULL
 408      *
 409      * Some sort of hash could be used instead of this loop, but the number of
 410      * elements is so far at most 12 */
 411
 412     unsigned int i;
 413     const char * conditional_warn_text = "; can't set it to ";
 414
 415     PERL_ARGS_ASSERT_GET_CATEGORY_INDEX;
 416
 417 #  ifdef LC_ALL
 418     for (i = 0; i <=         LC_ALL_INDEX_; i++)
 419 #  else
 420     for (i = 0; i <  NOMINAL_LC_ALL_INDEX;  i++)
 421 #  endif
 422     {
 423         if (category == categories[i]) {
 424             dTHX_DEBUGGING;
 425             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
 426                                    "index of category %d (%s) is %d\n",
 427                                    category, category_names[i], i));
 428             return i;
 429         }
 430     }
 431
 432     /* Here, we don't know about this category, so can't handle it. */
 433
 434     if (! locale) {
 435         locale = "";
 436         conditional_warn_text = "";
 437     }
 438
 439     /* diag_listed_as: Unknown locale category %d; can't set it to %s */
 440     Perl_warner_nocontext(packWARN(WARN_LOCALE),
 441                           "Unknown locale category %d%s%s",
 442                           category, conditional_warn_text, locale);
 443
 444 #  ifdef EINVAL
 445
 446     SETERRNO(EINVAL, LIB_INVARG);
 447
 448 #  endif
 449
 450     /* Return an out-of-bounds value */
 451     return NOMINAL_LC_ALL_INDEX + 1;
 452 }
 453
 454 #endif /* ifdef USE_LOCALE */
 455
 456 void
 457 Perl_force_locale_unlock()
 458 {
 459
 460 #if defined(USE_LOCALE_THREADS)
 461
 462     dTHX;
 463
 464     /* If recursively locked, clear all at once */
 465     if (PL_locale_mutex_depth > 1) {
 466         PL_locale_mutex_depth = 1;
 467     }
 468
 469     if (PL_locale_mutex_depth > 0) {
 470         LOCALE_UNLOCK_;
 471     }
 472
 473 #endif
 474
 475 }
 476
 477 #ifdef USE_POSIX_2008_LOCALE
 478
 479 STATIC locale_t
 480 S_use_curlocale_scratch(pTHX)
 481 {
 482     /* This function is used to hide from the caller the case where the current
 483      * locale_t object in POSIX 2008 is the global one, which is illegal in
 484      * many of the P2008 API calls.  This checks for that and, if necessary
 485      * creates a proper P2008 object.  Any prior object is deleted, as is any
 486      * remaining object during global destruction. */
 487
 488     locale_t cur = uselocale((locale_t) 0);
 489
 490     if (cur != LC_GLOBAL_LOCALE) {
 491         return cur;
 492     }
 493
 494     if (PL_scratch_locale_obj) {
 495         freelocale(PL_scratch_locale_obj);
 496     }
 497
 498     PL_scratch_locale_obj = duplocale(LC_GLOBAL_LOCALE);
 499     return PL_scratch_locale_obj;
 500 }
 501
 502 #endif
 503
 504 void
 505 Perl_locale_panic(const char * msg,
 506                   const char * file_name,
 507                   const line_t line,
 508                   const int errnum)
 509 {
 510     dTHX;
 511
 512     PERL_ARGS_ASSERT_LOCALE_PANIC;
 513
 514     force_locale_unlock();
 515
 516 #ifdef USE_C_BACKTRACE
 517     dump_c_backtrace(Perl_debug_log, 20, 1);
 518 #endif
 519
 520     /* diag_listed_as: panic: %s */
 521     Perl_croak(aTHX_ "%s: %d: panic: %s; errno=%d\n",
 522                      file_name, line, msg, errnum);
 523 }
 524
 525 #define setlocale_failure_panic_c(                                          \
 526                         cat, current, failed, caller_0_line, caller_1_line) \
 527         setlocale_failure_panic_i(cat##_INDEX_, current, failed,            \
 528                         caller_0_line, caller_1_line)
 529
 530 /* posix_setlocale() presents a consistent POSIX-compliant interface to
 531  * setlocale().   Windows requres a customized base-level setlocale() */
 532 #ifdef WIN32
 533 #  define posix_setlocale(cat, locale) win32_setlocale(cat, locale)
 534 #else
 535 #  define posix_setlocale(cat, locale) ((const char *) setlocale(cat, locale))
 536 #endif
 537
 538 /* The next layer up is to catch vagaries and bugs in the libc setlocale return
 539  * value */
 540 #ifdef stdize_locale
 541 #  define stdized_setlocale(cat, locale)                                       \
 542      stdize_locale(cat, posix_setlocale(cat, locale),                          \
 543                    &PL_stdize_locale_buf, &PL_stdize_locale_bufsize, __LINE__)
 544 #else
 545 #  define stdized_setlocale(cat, locale)  posix_setlocale(cat, locale)
 546 #endif
 547
 548 /* The next many lines form a layer above the close-to-the-metal 'posix'
 549  * and 'stdized' macros.  They are used to present a uniform API to the rest of
 550  * the code in this file in spite of the disparate underlying implementations.
 551  * */
 552
 553 #if    (! defined(USE_LOCALE_THREADS) && ! defined(USE_POSIX_2008_LOCALE))    \
 554     || (  defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE))
 555
 556 /* For non-threaded perls (which we are not to use the POSIX 2008 API on), or a
 557  * thread-safe Windows one in which threading is invisible to us, the added
 558  * layer just expands to the base-level functions.  See the introductory
 559  * comments in this file for the meaning of the suffixes '_c', '_r', '_i'. */
 560
 561 #  define setlocale_r(cat, locale)        stdized_setlocale(cat, locale)
 562 #  define setlocale_i(i, locale)      setlocale_r(categories[i], locale)
 563 #  define setlocale_c(cat, locale)              setlocale_r(cat, locale)
 564
 565 #  define void_setlocale_i(i, locale)                                       \
 566     STMT_START {                                                            \
 567         if (! posix_setlocale(categories[i], locale)) {                     \
 568             setlocale_failure_panic_i(i, NULL, locale, __LINE__, 0);        \
 569             NOT_REACHED; /* NOTREACHED */                                   \
 570         }                                                                   \
 571     } STMT_END
 572 #  define void_setlocale_c(cat, locale)                                     \
 573                                   void_setlocale_i(cat##_INDEX_, locale)
 574 #  define void_setlocale_r(cat, locale)                                     \
 575                void_setlocale_i(get_category_index(cat, locale), locale)
 576
 577 #  define bool_setlocale_r(cat, locale) cBOOL(posix_setlocale(cat, locale))
 578 #  define bool_setlocale_i(i, locale)                                       \
 579                                  bool_setlocale_c(categories[i], locale)
 580 #  define bool_setlocale_c(cat, locale)    bool_setlocale_r(cat, locale)
 581
 582 /* All the querylocale...() forms return a mortalized copy.  If you need
 583  * something stable across calls, you need to savepv() the result yourself */
 584
 585 #  define querylocale_r(cat)        mortalized_pv_copy(setlocale_r(cat, NULL))
 586 #  define querylocale_c(cat)        querylocale_r(cat)
 587 #  define querylocale_i(i)          querylocale_c(categories[i])
 588
 589 #elif   defined(USE_LOCALE_THREADS)                 \
 590    && ! defined(USE_THREAD_SAFE_LOCALE)
 591
 592    /* Here, there are threads, and there is no support for thread-safe
 593     * operation.  This is a dangerous situation, which perl is documented as
 594     * not supporting, but it arises in practice.  We can do a modicum of
 595     * automatic mitigation by making sure there is a per-thread return from
 596     * setlocale(), and that a mutex protects it from races */
 597 STATIC const char *
 598 S_less_dicey_setlocale_r(pTHX_ const int category, const char * locale)
 599 {
 600     const char * retval;
 601
 602     PERL_ARGS_ASSERT_LESS_DICEY_SETLOCALE_R;
 603
 604     POSIX_SETLOCALE_LOCK;
 605
 606     retval = stdized_setlocale(category, locale);
 607
 608     /* We reuse PL_stdize_locale_buf as it doesn't conflict, but the call may
 609      * already have used it, in which case we don't have to do anything further
 610      * */
 611     if (retval != PL_stdize_locale_buf) {
 612         retval = save_to_buffer(retval,
 613                                 &PL_stdize_locale_buf, &PL_stdize_locale_bufsize);
 614     }
 615
 616     POSIX_SETLOCALE_UNLOCK;
 617
 618     return retval;
 619 }
 620
 621 #  define setlocale_r(cat, locale)  less_dicey_setlocale_r(cat, locale)
 622 #  define setlocale_c(cat, locale)             setlocale_r(cat, locale)
 623 #  define setlocale_i(i, locale)     setlocale_r(categories[i], locale)
 624
 625 #  define querylocale_r(cat)  mortalized_pv_copy(setlocale_r(cat, NULL))
 626 #  define querylocale_c(cat)                   querylocale_r(cat)
 627 #  define querylocale_i(i)                     querylocale_r(categories[i])
 628
 629 STATIC void
 630 S_less_dicey_void_setlocale_i(pTHX_ const unsigned cat_index,
 631                                     const char * locale,
 632                                     const line_t line)
 633 {
 634     PERL_ARGS_ASSERT_LESS_DICEY_VOID_SETLOCALE_I;
 635
 636     POSIX_SETLOCALE_LOCK;
 637     if (! posix_setlocale(categories[cat_index], locale)) {
 638         POSIX_SETLOCALE_UNLOCK;
 639         setlocale_failure_panic_i(cat_index, NULL, locale, __LINE__, line);
 640     }
 641     POSIX_SETLOCALE_UNLOCK;
 642 }
 643
 644 #  define void_setlocale_i(i, locale)                                       \
 645                           less_dicey_void_setlocale_i(i, locale, __LINE__)
 646 #  define void_setlocale_c(cat, locale)                                     \
 647                           void_setlocale_i(cat##_INDEX_, locale)
 648 #  define void_setlocale_r(cat, locale)                                     \
 649        void_setlocale_i(get_category_index(cat, locale), locale)
 650
 651 #  if 0     /* Not currently used */
 652
 653 STATIC bool
 654 S_less_dicey_bool_setlocale_r(pTHX_ const int cat, const char * locale)
 655 {
 656     bool retval;
 657
 658     PERL_ARGS_ASSERT_LESS_DICEY_BOOL_SETLOCALE_R;
 659
 660     POSIX_SETLOCALE_LOCK;
 661     retval = cBOOL(posix_setlocale(cat, locale));
 662     POSIX_SETLOCALE_UNLOCK;
 663
 664     return retval;
 665 }
 666
 667 #  endif
 668 #  define bool_setlocale_r(cat, locale)                                 \
 669                                less_dicey_bool_setlocale_r(cat, locale)
 670 #  define bool_setlocale_i(i, locale)                                   \
 671                                 bool_setlocale_r(categories[i], locale)
 672 #  define bool_setlocale_c(cat, locale) bool_setlocale_r(cat, locale)
 673 #else
 674
 675 /* Here, there is a completely different API to get thread-safe locales.  We
 676  * emulate the setlocale() API with our own function(s).  setlocale categories,
 677  * like LC_NUMERIC, are not valid here for the POSIX 2008 API.  Instead, there
 678  * are equivalents, like LC_NUMERIC_MASK, which we use instead, converting to
 679  * by using get_category_index() followed by table lookup. */
 680
 681 #  define emulate_setlocale_c(cat, locale, recalc_LC_ALL, line)             \
 682            emulate_setlocale_i(cat##_INDEX_, locale, recalc_LC_ALL, line)
 683
 684      /* A wrapper for the macros below. */
 685 #  define common_emulate_setlocale(i, locale)                               \
 686                  emulate_setlocale_i(i, locale, YES_RECALC_LC_ALL, __LINE__)
 687
 688 #  define setlocale_i(i, locale)     common_emulate_setlocale(i, locale)
 689 #  define setlocale_c(cat, locale)     setlocale_i(cat##_INDEX_, locale)
 690 #  define setlocale_r(cat, locale)                                          \
 691                     setlocale_i(get_category_index(cat, locale), locale)
 692
 693 #  define void_setlocale_i(i, locale)     ((void) setlocale_i(i, locale))
 694 #  define void_setlocale_c(cat, locale)                                     \
 695                                   void_setlocale_i(cat##_INDEX_, locale)
 696 #  define void_setlocale_r(cat, locale) ((void) setlocale_r(cat, locale))
 697
 698 #  define bool_setlocale_i(i, locale)       cBOOL(setlocale_i(i, locale))
 699 #  define bool_setlocale_c(cat, locale)                                     \
 700                                   bool_setlocale_i(cat##_INDEX_, locale)
 701 #  define bool_setlocale_r(cat, locale)   cBOOL(setlocale_r(cat, locale))
 702
 703 #  define querylocale_i(i)      mortalized_pv_copy(my_querylocale_i(i))
 704 #  define querylocale_c(cat)    querylocale_i(cat##_INDEX_)
 705 #  define querylocale_r(cat)    querylocale_i(get_category_index(cat,NULL))
 706
 707 #  ifdef USE_QUERYLOCALE
 708 #    define isSINGLE_BIT_SET(mask) isPOWER_OF_2(mask)
 709
 710      /* This code used to think querylocale() was valid on LC_ALL.  Make sure
 711       * all instances of that have been removed */
 712 #    define QUERYLOCALE_ASSERT(index)                                       \
 713                         __ASSERT_(isSINGLE_BIT_SET(category_masks[index]))
 714 #    if ! defined(HAS_QUERYLOCALE) && defined(_NL_LOCALE_NAME)
 715 #      define querylocale_l(index, locale_obj)                              \
 716             (QUERYLOCALE_ASSERT(index)                                      \
 717              mortalized_pv_copy(nl_langinfo_l(                              \
 718                          _NL_LOCALE_NAME(categories[index]), locale_obj)))
 719 #    else
 720 #      define querylocale_l(index, locale_obj)                              \
 721         (QUERYLOCALE_ASSERT(index)                                          \
 722          mortalized_pv_copy(querylocale(category_masks[index], locale_obj)))
 723 #    endif
 724 #  endif
 725 #  if defined(__GLIBC__) && defined(USE_LOCALE_MESSAGES)
 726 #    define HAS_GLIBC_LC_MESSAGES_BUG
 727 #    include <libintl.h>
 728 #  endif
 729
 730 /* A fourth array, parallel to the ones above to map from category to its
 731  * equivalent mask */
 732 STATIC const int category_masks[] = {
 733 #  ifdef USE_LOCALE_CTYPE
 734                                 LC_CTYPE_MASK,
 735 #  endif
 736 #  ifdef USE_LOCALE_NUMERIC
 737                                 LC_NUMERIC_MASK,
 738 #  endif
 739 #  ifdef USE_LOCALE_COLLATE
 740                                 LC_COLLATE_MASK,
 741 #  endif
 742 #  ifdef USE_LOCALE_TIME
 743                                 LC_TIME_MASK,
 744 #  endif
 745 #  ifdef USE_LOCALE_MESSAGES
 746                                 LC_MESSAGES_MASK,
 747 #  endif
 748 #  ifdef USE_LOCALE_MONETARY
 749                                 LC_MONETARY_MASK,
 750 #  endif
 751 #  ifdef USE_LOCALE_ADDRESS
 752                                 LC_ADDRESS_MASK,
 753 #  endif
 754 #  ifdef USE_LOCALE_IDENTIFICATION
 755                                 LC_IDENTIFICATION_MASK,
 756 #  endif
 757 #  ifdef USE_LOCALE_MEASUREMENT
 758                                 LC_MEASUREMENT_MASK,
 759 #  endif
 760 #  ifdef USE_LOCALE_PAPER
 761                                 LC_PAPER_MASK,
 762 #  endif
 763 #  ifdef USE_LOCALE_TELEPHONE
 764                                 LC_TELEPHONE_MASK,
 765 #  endif
 766 #  ifdef USE_LOCALE_SYNTAX
 767                                 LC_SYNTAX_MASK,
 768 #  endif
 769 #  ifdef USE_LOCALE_TOD
 770                                 LC_TOD_MASK,
 771 #  endif
 772                                 /* LC_ALL can't be turned off by a Configure
 773                                  * option, and in Posix 2008, should always be
 774                                  * here, so compile it in unconditionally.
 775                                  * This could catch some glitches at compile
 776                                  * time */
 777                                 LC_ALL_MASK,
 778
 779    /* Placeholder as a precaution if code fails to check the return of
 780     * get_category_index(), which returns this element to indicate an error */
 781                                 0
 782 };
 783
 784 #  define my_querylocale_c(cat) my_querylocale_i(cat##_INDEX_)
 785
 786 STATIC const char *
 787 S_my_querylocale_i(pTHX_ const unsigned int index)
 788 {
 789     /* This function returns the name of the locale category given by the input
 790      * index into our parallel tables of them.
 791      *
 792      * POSIX 2008, for some sick reason, chose not to provide a method to find
 793      * the category name of a locale, discarding a basic linguistic tenet that
 794      * for any object, people will create a name for it.  Some vendors have
 795      * created a querylocale() function to do just that.  This function is a
 796      * lot simpler to implement on systems that have this.  Otherwise, we have
 797      * to keep track of what the locale has been set to, so that we can return
 798      * its name so as to emulate setlocale().  It's also possible for C code in
 799      * some library to change the locale without us knowing it, though as of
 800      * September 2017, there are no occurrences in CPAN of uselocale().  Some
 801      * libraries do use setlocale(), but that changes the global locale, and
 802      * threads using per-thread locales will just ignore those changes. */
 803
 804     int category;
 805     const locale_t cur_obj = uselocale((locale_t) 0);
 806     const char * retval;
 807
 808     PERL_ARGS_ASSERT_MY_QUERYLOCALE_I;
 809     assert(index <= NOMINAL_LC_ALL_INDEX);
 810
 811     category = categories[index];
 812
 813     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "my_querylocale_i(%s) on %p\n",
 814                                            category_names[index], cur_obj));
 815     if (cur_obj == LC_GLOBAL_LOCALE) {
 816         POSIX_SETLOCALE_LOCK;
 817         retval = posix_setlocale(category, NULL);
 818         POSIX_SETLOCALE_UNLOCK;
 819     }
 820     else {
 821
 822 #  ifdef USE_QUERYLOCALE
 823
 824         /* We don't currently keep records when there is querylocale(), so have
 825          * to get it anew each time */
 826         retval = (index == LC_ALL_INDEX_)
 827                  ? calculate_LC_ALL(cur_obj)
 828                  : querylocale_l(index, cur_obj);
 829
 830 #  else
 831
 832         /* But we do have up-to-date values when we keep our own records
 833          * (except some times in initialization, where we get the value from
 834          * the system. */
 835         if (PL_curlocales[index] == NULL) {
 836             retval = stdized_setlocale(category, NULL);
 837             PL_curlocales[index] = savepv(retval);
 838         }
 839         else {
 840             retval = PL_curlocales[index];
 841         }
 842
 843 #  endif
 844
 845             }
 846
 847                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
 848                            "my_querylocale_i(%s) returning '%s'\n",
 849                            category_names[index], retval));
 850     assert(strNE(retval, ""));
 851     return retval;
 852 }
 853
 854 #  ifdef USE_PL_CURLOCALES
 855
 856 STATIC const char *
 857 S_update_PL_curlocales_i(pTHX_
 858                          const unsigned int index,
 859                          const char * new_locale,
 860                          recalc_lc_all_t recalc_LC_ALL)
 861 {
 862     /* This is a helper function for emulate_setlocale_i(), mostly used to
 863      * make that function easier to read. */
 864
 865     PERL_ARGS_ASSERT_UPDATE_PL_CURLOCALES_I;
 866     assert(index <= NOMINAL_LC_ALL_INDEX);
 867
 868     if (index == LC_ALL_INDEX_) {
 869         unsigned int i;
 870
 871         /* For LC_ALL, we change all individual categories to correspond */
 872                          /* PL_curlocales is a parallel array, so has same
 873                           * length as 'categories' */
 874         for (i = 0; i < LC_ALL_INDEX_; i++) {
 875             Safefree(PL_curlocales[i]);
 876             PL_curlocales[i] = savepv(new_locale);
 877         }
 878
 879         recalc_LC_ALL = YES_RECALC_LC_ALL;
 880     }
 881     else {
 882
 883         /* Update the single category's record */
 884         Safefree(PL_curlocales[index]);
 885         PL_curlocales[index] = savepv(new_locale);
 886
 887         if (recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION) {
 888             recalc_LC_ALL = (index == NOMINAL_LC_ALL_INDEX - 1)
 889                             ? YES_RECALC_LC_ALL
 890                             : DONT_RECALC_LC_ALL;
 891         }
 892     }
 893
 894     if (recalc_LC_ALL == YES_RECALC_LC_ALL) {
 895         Safefree(PL_curlocales[LC_ALL_INDEX_]);
 896         PL_curlocales[LC_ALL_INDEX_] =
 897                                     savepv(calculate_LC_ALL(PL_curlocales));
 898     }
 899
 900     return PL_curlocales[index];
 901 }
 902
 903 #  endif  /* Need PL_curlocales[] */
 904
 905 STATIC const char *
 906 S_setlocale_from_aggregate_LC_ALL(pTHX_ const char * locale, const line_t line)
 907 {
 908     /* This function parses the value of the LC_ALL locale, assuming glibc
 909      * syntax, and sets each individual category on the system to the proper
 910      * value.
 911      *
 912      * This is likely to only ever be called from one place, so exists to make
 913      * the calling function easier to read by moving this ancillary code out of
 914      * the main line.
 915      *
 916      * The locale for each category is independent of the other categories.
 917      * Often, they are all the same, but certainly not always.  Perl, in fact,
 918      * usually keeps LC_NUMERIC in the C locale, regardless of the underlying
 919      * locale.  LC_ALL has to be able to represent the case of when there are
 920      * varying locales.  Platforms have differing ways of representing this.
 921      * Because of this, the code in this file goes to lengths to avoid the
 922      * issue, generally looping over the component categories instead of
 923      * referring to them in the aggregate, wherever possible.  However, there
 924      * are cases where we have to parse our own constructed aggregates, which use
 925      * the glibc syntax. */
 926
 927     const char * locale_on_entry = querylocale_c(LC_ALL);
 928
 929     PERL_ARGS_ASSERT_SETLOCALE_FROM_AGGREGATE_LC_ALL;
 930
 931     /* If the string that gives what to set doesn't include all categories,
 932      * the omitted ones get set to "C".  To get this behavior, first set
 933      * all the individual categories to "C", and override the furnished
 934      * ones below.  FALSE => No need to recalculate LC_ALL, as this is a
 935      * temporary state */
 936     if (! emulate_setlocale_c(LC_ALL, "C", DONT_RECALC_LC_ALL, line)) {
 937         setlocale_failure_panic_c(LC_ALL, locale_on_entry,
 938                                   "C", __LINE__, line);
 939         NOT_REACHED; /* NOTREACHED */
 940     }
 941
 942     const char * s = locale;
 943     const char * e = locale + strlen(locale);
 944     while (s < e) {
 945         const char * p = s;
 946
 947         /* Parse through the category */
 948         while (isWORDCHAR(*p)) {
 949             p++;
 950         }
 951
 952         const char * category_end = p;
 953
 954         if (*p++ != '=') {
 955             locale_panic_(Perl_form(aTHX_
 956                   "Unexpected character in locale category name '%02X", *(p-1)));
 957         }
 958
 959         /* Parse through the locale name */
 960         const char * name_start = p;
 961         while (p < e && *p != ';') {
 962             if (! isGRAPH(*p)) {
 963                 locale_panic_(Perl_form(aTHX_
 964                               "Unexpected character in locale name '%02X", *p));
 965             }
 966             p++;
 967         }
 968
 969         const char * name_end = p;
 970
 971         /* Space past the semi-colon */
 972         if (p < e) {
 973             p++;
 974         }
 975
 976         /* Find the index of the category name in our lists */
 977         for (PERL_UINT_FAST8_T i = 0; i < LC_ALL_INDEX_; i++) {
 978
 979             /* Keep going if this index doesn't point to the category being
 980              * parsed.  The strnNE() avoids a Perl_form(), but would fail if
 981              * ever a category name could be a substring of another one, e.g.,
 982              * if there were a "LC_TIME_DATE" */
 983             if strnNE(s, category_names[i], category_end - s) {
 984                 continue;
 985             }
 986
 987             /* Here i points to the category being parsed.  Now isolate the
 988              * locale it is being changed to */
 989             const char * individ_locale = Perl_form(aTHX_ "%.*s",
 990                                 (int) (name_end - name_start), name_start);
 991
 992             /* And do the change.  FALSE => Don't recalculate LC_ALL; we'll do
 993              * it ourselves after the loop */
 994             if (! emulate_setlocale_i(i, individ_locale,
 995                                       DONT_RECALC_LC_ALL, line))
 996             {
 997
 998                 /* But if we have to back out, do fix up LC_ALL */
 999                 if (! emulate_setlocale_c(LC_ALL, locale_on_entry,
1000                                           YES_RECALC_LC_ALL, line))
1001                 {
1002                     setlocale_failure_panic_i(i, individ_locale,
1003                                               locale, __LINE__, line);
1004                     NOT_REACHED; /* NOTREACHED */
1005                 }
1006
1007                 /* Reverting to the entry value succeeded, but the operation
1008                  * failed to go to the requested locale. */
1009                 return NULL;
1010             }
1011
1012             /* Found and handled the desired category.  Quit the inner loop to
1013              * try the next category */
1014             break;
1015         }
1016
1017         /* Finished with this category; iterate to the next one in the input */
1018         s = p;
1019     }
1020
1021 #    ifdef USE_PL_CURLOCALES
1022
1023     /* Here we have set all the individual categories.  Update the LC_ALL entry
1024      * as well.  We can't just use the input 'locale' as the value may omit
1025      * categories whose locale is 'C'.  khw thinks it's better to store a
1026      * complete LC_ALL.  So calculate it. */
1027     const char * retval = savepv(calculate_LC_ALL(PL_curlocales));
1028     Safefree(PL_curlocales[LC_ALL_INDEX_]);
1029     PL_curlocales[LC_ALL_INDEX_] = retval;
1030
1031 #    else
1032
1033     const char * retval = querylocale_c(LC_ALL);
1034
1035 #    endif
1036
1037     return retval;
1038 }
1039
1040 #  ifndef USE_QUERYLOCALE
1041
1042 STATIC const char *
1043 S_find_locale_from_environment(pTHX_ const unsigned int index)
1044 {
1045     /* On systems without querylocale(), it is problematic getting the results
1046      * of the POSIX 2008 equivalent of setlocale(category, "") (which gets the
1047      * locale from the environment).
1048      *
1049      * To ensure that we know exactly what those values are, we do the setting
1050      * ourselves, using the documented algorithm (assuming the documentation is
1051      * correct) rather than use "" as the locale.  This will lead to results
1052      * that differ from native behavior if the native behavior differs from the
1053      * standard documented value, but khw believes it is better to know what's
1054      * going on, even if different from native, than to just guess.
1055      *
1056      * Another option would be, in a critical section, to save the global
1057      * locale's current value, and do a straight setlocale(LC_ALL, "").  That
1058      * would return our desired values, destroying the global locale's, which
1059      * we would then restore.  But that could cause races with any other thread
1060      * that is using the global locale and isn't using the mutex.  And, the
1061      * only reason someone would have done that is because they are calling a
1062      * library function, like in gtk, that calls setlocale(), and which can't
1063      * be changed to use the mutex.  That wouldn't be a problem if this were to
1064      * be done before any threads had switched, say during perl construction
1065      * time.  But this code would still be needed for the general case. */
1066
1067     const char * default_name;
1068     unsigned int i;
1069     const char * locale_names[LC_ALL_INDEX_];
1070
1071     /* We rely on PerlEnv_getenv() returning a mortalized copy */
1072     const char * const lc_all = PerlEnv_getenv("LC_ALL");
1073
1074     /* Use any "LC_ALL" environment variable, as it overrides everything
1075      * else. */
1076     if (lc_all && strNE(lc_all, "")) {
1077         return lc_all;
1078     }
1079
1080     /* Otherwise, we need to dig deeper.  Unless overridden, the default is
1081      * the LANG environment variable; "C" if it doesn't exist. */
1082     default_name = PerlEnv_getenv("LANG");
1083     if (! default_name || strEQ(default_name, "")) {
1084         default_name = "C";
1085     }
1086
1087     /* If setting an individual category, use its corresponding value found in
1088      * the environment, if any; otherwise use the default we already
1089      * calculated. */
1090     if (index != LC_ALL_INDEX_) {
1091         const char * const new_value = PerlEnv_getenv(category_names[index]);
1092
1093         return (new_value && strNE(new_value, ""))
1094                 ? new_value
1095                 : default_name;
1096     }
1097
1098     /* Here, we are getting LC_ALL.  Any categories that don't have a
1099      * corresponding environment variable set should be set to 'default_name'
1100      *
1101      * Simply find the values for all categories, and call the function to
1102      * compute LC_ALL. */
1103     for (i = 0; i < LC_ALL_INDEX_; i++) {
1104         const char * const env_override = PerlEnv_getenv(category_names[i]);
1105
1106         locale_names[i] = (env_override && strNE(env_override, ""))
1107                           ? env_override
1108                           : default_name;
1109
1110         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1111                  "find_locale_from_environment i=%d, name=%s, locale=%s\n",
1112                  i, category_names[i], locale_names[i]));
1113     }
1114
1115     return calculate_LC_ALL(locale_names);
1116 }
1117
1118 #  endif
1119
1120 STATIC const char *
1121 S_emulate_setlocale_i(pTHX_
1122
1123         /* Our internal index of the 'category' setlocale is
1124            called with */
1125         const unsigned int index,
1126
1127         const char * new_locale, /* The locale to set the category to */
1128         const recalc_lc_all_t recalc_LC_ALL,  /* Explained below */
1129         const line_t line     /* Called from this line number */
1130        )
1131 {
1132     PERL_ARGS_ASSERT_EMULATE_SETLOCALE_I;
1133     assert(index <= NOMINAL_LC_ALL_INDEX);
1134
1135     /* This function effectively performs a setlocale() on just the current
1136      * thread; thus it is thread-safe.  It does this by using the POSIX 2008
1137      * locale functions to emulate the behavior of setlocale().  Similar to
1138      * regular setlocale(), the return from this function points to memory that
1139      * can be overwritten by other system calls, so needs to be copied
1140      * immediately if you need to retain it.  The difference here is that
1141      * system calls besides another setlocale() can overwrite it.
1142      *
1143      * By doing this, most locale-sensitive functions become thread-safe.  The
1144      * exceptions are mostly those that return a pointer to static memory.
1145      *
1146      * This function may be called in a tight loop that iterates over all
1147      * categories.  Because LC_ALL is not a "real" category, but merely the sum
1148      * of all the other ones, such loops don't include LC_ALL.  On systems that
1149      * have querylocale() or similar, the current LC_ALL value is immediately
1150      * retrievable; on systems lacking that feature, we have to keep track of
1151      * LC_ALL ourselves.  We could do that on each iteration, only to throw it
1152      * away on the next, but the calculation is more than a trivial amount of
1153      * work.  Instead, the 'recalc_LC_ALL' parameter is set to
1154      * RECALCULATE_LC_ALL_ON_FINAL_INTERATION to only do the calculation once.
1155      * This function calls itself recursively in such a loop.
1156      *
1157      * When not in such a loop, the parameter is set to the other enum values
1158      * DONT_RECALC_LC_ALL or YES_RECALC_LC_ALL. */
1159
1160     int mask = category_masks[index];
1161     const locale_t entry_obj = uselocale((locale_t) 0);
1162     const char * locale_on_entry = querylocale_i(index);
1163
1164     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1165              "emulate_setlocale_i input=%d (%s), mask=0x%x,"
1166              " new locale=\"%s\", current locale=\"%s\","
1167              "index=%d, object=%p\n",
1168              categories[index], category_names[index], mask,
1169              ((new_locale == NULL) ? "(nil)" : new_locale),
1170              locale_on_entry, index, entry_obj));
1171
1172     /* Return the already-calculated info if just querying what the existing
1173      * locale is */
1174     if (new_locale == NULL) {
1175         return locale_on_entry;
1176     }
1177
1178     /* Here, trying to change the locale, but it is a no-op if the new boss is
1179      * the same as the old boss.  Except this routine is called when converting
1180      * from the global locale, so in that case we will create a per-thread
1181      * locale below (with the current values).  Bitter experience also
1182      * indicates that newlocale() can free up the basis locale memory if we
1183      * call it with the new and old being the same. */
1184     if (   entry_obj != LC_GLOBAL_LOCALE
1185         && locale_on_entry
1186         && strEQ(new_locale, locale_on_entry))
1187     {
1188         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1189                  "(%" LINE_Tf "): emulate_setlocale_i"
1190                  " no-op to change to what it already was\n",
1191                  line));
1192
1193 #  ifdef USE_PL_CURLOCALES
1194
1195        /* On the final iteration of a loop that needs to recalculate LC_ALL, do
1196         * so.  If no iteration changed anything, LC_ALL also doesn't change,
1197         * but khw believes the complexity needed to keep track of that isn't
1198         * worth it. */
1199         if (UNLIKELY(   recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION
1200                      && index == NOMINAL_LC_ALL_INDEX - 1))
1201         {
1202             Safefree(PL_curlocales[LC_ALL_INDEX_]);
1203             PL_curlocales[LC_ALL_INDEX_] =
1204                                         savepv(calculate_LC_ALL(PL_curlocales));
1205         }
1206
1207 #  endif
1208
1209         return locale_on_entry;
1210     }
1211
1212 #  ifndef USE_QUERYLOCALE
1213
1214     /* Without a querylocale() mechanism, we have to figure out ourselves what
1215      * happens with setting a locale to "" */
1216     if (strEQ(new_locale, "")) {
1217         new_locale = find_locale_from_environment(index);
1218     }
1219
1220 #  endif
1221
1222     /* So far, it has worked that a semi-colon in the locale name means that
1223      * the category is LC_ALL and it subsumes categories which don't all have
1224      * the same locale.  This is the glibc syntax. */
1225     if (strchr(new_locale, ';')) {
1226         assert(index == LC_ALL_INDEX_);
1227         return setlocale_from_aggregate_LC_ALL(new_locale, line);
1228     }
1229
1230 #  ifdef HAS_GLIBC_LC_MESSAGES_BUG
1231
1232     /* For this bug, if the LC_MESSAGES locale changes, we have to do an
1233      * expensive workaround.  Save the current value so we can later determine
1234      * if it changed. */
1235     const char * old_messages_locale = NULL;
1236     if (   (index == LC_MESSAGES_INDEX_ || index == LC_ALL_INDEX_)
1237         &&  LIKELY(PL_phase != PERL_PHASE_CONSTRUCT))
1238     {
1239         old_messages_locale = querylocale_c(LC_MESSAGES);
1240     }
1241
1242 #  endif
1243
1244     assert(PL_C_locale_obj);
1245
1246     /* Now ready to switch to the input 'new_locale' */
1247
1248     /* Switching locales generally entails freeing the current one's space (at
1249      * the C library's discretion), hence we can't be using that locale at the
1250      * time of the switch (this wasn't obvious to khw from the man pages).  So
1251      * switch to a known locale object that we don't otherwise mess with. */
1252     if (! uselocale(PL_C_locale_obj)) {
1253
1254         /* Not being able to change to the C locale is severe; don't keep
1255          * going.  */
1256         setlocale_failure_panic_i(index, locale_on_entry, "C", __LINE__, line);
1257         NOT_REACHED; /* NOTREACHED */
1258     }
1259
1260     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1261              "(%" LINE_Tf "): emulate_setlocale_i now using C"
1262              " object=%p\n", line, PL_C_locale_obj));
1263
1264     locale_t new_obj;
1265
1266     /* We created a (never changing) object at start-up for LC_ALL being in the
1267      * C locale.  If this call is to switch to LC_ALL=>C, simply use that
1268      * object.  But in fact, we already have switched to it just above, in
1269      * preparation for the general case.  Since we're already there, no need to
1270      * do further switching. */
1271     if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(new_locale)) {
1272         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "(%" LINE_Tf "):"
1273                                                " emulate_setlocale_i will stay"
1274                                                " in C object\n", line));
1275         new_obj = PL_C_locale_obj;
1276
1277         /* And free the old object if it isn't a special one */
1278         if (entry_obj != LC_GLOBAL_LOCALE && entry_obj != PL_C_locale_obj) {
1279             freelocale(entry_obj);
1280         }
1281     }
1282     else {  /* Here is the general case, not to LC_ALL=>C */
1283         locale_t basis_obj = entry_obj;
1284
1285         /* Specially handle two objects */
1286         if (basis_obj == LC_GLOBAL_LOCALE || basis_obj == PL_C_locale_obj) {
1287
1288             /* For these two objects, we make duplicates to hand to newlocale()
1289              * below.  For LC_GLOBAL_LOCALE, this is because newlocale()
1290              * doesn't necessarily accept it as input (the results are
1291              * undefined).  For PL_C_locale_obj, it is so that it never gets
1292              * modified, as otherwise newlocale() is free to do so */
1293             basis_obj = duplocale(entry_obj);
1294             if (! basis_obj) {
1295                 locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): duplocale failed",
1296                                               line));
1297                 NOT_REACHED; /* NOTREACHED */
1298             }
1299
1300             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1301                                    "(%" LINE_Tf "): emulate_setlocale_i"
1302                                    " created %p by duping the input\n",
1303                                    line, basis_obj));
1304         }
1305
1306         /* Ready to create a new locale by modification of the exising one */
1307         new_obj = newlocale(mask, new_locale, basis_obj);
1308
1309         if (! new_obj) {
1310             DEBUG_L(PerlIO_printf(Perl_debug_log,
1311                                   " (%" LINE_Tf "): emulate_setlocale_i"
1312                                   " creating new object from %p failed:"
1313                                   " errno=%d\n",
1314                                   line, basis_obj, GET_ERRNO));
1315
1316             /* Failed.  Likely this is because the proposed new locale isn't
1317              * valid on this system.  But we earlier switched to the LC_ALL=>C
1318              * locale in anticipation of it succeeding,  Now have to switch
1319              * back to the state upon entry */
1320             if (! uselocale(entry_obj)) {
1321                 setlocale_failure_panic_i(index, "switching back to",
1322                                           locale_on_entry, __LINE__, line);
1323                 NOT_REACHED; /* NOTREACHED */
1324             }
1325
1326 #    ifdef USE_PL_CURLOCALES
1327
1328             if (entry_obj == LC_GLOBAL_LOCALE) {
1329
1330                 /* Here, we are back in the global locale.  We may never have
1331                  * set PL_curlocales.  If the locale change had succeeded, the
1332                  * code would have then set them up, but since it didn't, do so
1333                  * here.  khw isn't sure if this prevents some issues or not,
1334                  * but tis is defensive coding.  The system setlocale() returns
1335                  * the desired information.  This will calculate LC_ALL's entry
1336                  * only on the final iteration */
1337                 POSIX_SETLOCALE_LOCK;
1338                 for (PERL_UINT_FAST8_T i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1339                     update_PL_curlocales_i(i,
1340                                        posix_setlocale(categories[i], NULL),
1341                                        RECALCULATE_LC_ALL_ON_FINAL_INTERATION);
1342                 }
1343                 POSIX_SETLOCALE_UNLOCK;
1344             }
1345 #    endif
1346
1347             return NULL;
1348         }
1349
1350         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1351                                "(%" LINE_Tf "): emulate_setlocale_i created %p"
1352                                " while freeing %p\n", line, new_obj, basis_obj));
1353
1354         /* Here, successfully created an object representing the desired
1355          * locale; now switch into it */
1356         if (! uselocale(new_obj)) {
1357             freelocale(new_obj);
1358             locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): emulate_setlocale_i"
1359                                           " switching into new locale failed",
1360                                           line));
1361         }
1362     }
1363
1364     /* Here, we are using 'new_obj' which matches the input 'new_locale'. */
1365     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1366              "(%" LINE_Tf "): emulate_setlocale_i now using %p\n", line, new_obj));
1367
1368     /* We are done, except for updating our records (if the system doesn't keep
1369      * them) and in the case of locale "", we don't actually know what the
1370      * locale that got switched to is, as it came from the environment.  So
1371      * have to find it */
1372
1373 #  ifdef USE_QUERYLOCALE
1374
1375     if (strEQ(new_locale, "")) {
1376         new_locale = querylocale_i(index);
1377     }
1378
1379     PERL_UNUSED_ARG(recalc_LC_ALL);
1380
1381 #  else
1382
1383     new_locale = update_PL_curlocales_i(index, new_locale, recalc_LC_ALL);
1384
1385 #  endif
1386 #  ifdef HAS_GLIBC_LC_MESSAGES_BUG
1387
1388     /* Invalidate the glibc cache of loaded translations if the locale has changed,
1389      * see [perl #134264] */
1390     if (old_messages_locale) {
1391         if (strNE(old_messages_locale, my_querylocale_c(LC_MESSAGES))) {
1392             textdomain(textdomain(NULL));
1393         }
1394     }
1395
1396 #  endif
1397
1398     return new_locale;
1399 }
1400
1401 #endif   /* End of the various implementations of the setlocale and
1402             querylocale macros used in the remainder of this program */
1403
1404 #ifdef USE_LOCALE
1405
1406 /* So far, the locale strings returned by modern 2008-compliant systems have
1407  * been fine */
1408
1409 STATIC const char *
1410 S_stdize_locale(pTHX_ const int category,
1411                       const char *input_locale,
1412                       const char **buf,
1413                       Size_t *buf_size,
1414                       const line_t caller_line)
1415 {
1416     /* The return value of setlocale() is opaque, but is required to be usable
1417      * as input to a future setlocale() to create the same state.
1418      * Unfortunately not all systems are compliant.  But most often they are of
1419      * a very restricted set of forms that this file has been coded to expect.
1420      *
1421      * There are some outliers, though, that this function tries to tame:
1422      *
1423      * 1) A new-line.  This function chomps any \n characters
1424      * 2) foo=bar.     'bar' is what is generally meant, and the foo= part is
1425      *                 stripped.  This form is legal for LC_ALL.  When found in
1426      *                 that category group, the function calls itself
1427      *                 recursively on each possible component category to make
1428      *                 sure the individual categories are ok.
1429      *
1430      * If no changes to the input were made, it is returned; otherwise the
1431      * changed version is stored into memory at *buf, with *buf_size set to its
1432      * new value, and *buf is returned.
1433      */
1434
1435     const char * first_bad;
1436     const char * retval;
1437
1438     PERL_ARGS_ASSERT_STDIZE_LOCALE;
1439
1440     if (input_locale == NULL) {
1441         return NULL;
1442     }
1443
1444     first_bad = strpbrk(input_locale, "=\n");
1445
1446     /* Most likely, there isn't a problem with the input */
1447     if (LIKELY(! first_bad)) {
1448         return input_locale;
1449     }
1450
1451 #    ifdef LC_ALL
1452
1453     /* But if there is, and the category is LC_ALL, we have to look at each
1454      * component category */
1455     if (category == LC_ALL) {
1456         const char * individ_locales[LC_ALL_INDEX_];
1457         bool made_changes = FALSE;
1458         unsigned int i;
1459
1460         for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1461             Size_t this_size = 0;
1462             individ_locales[i] = stdize_locale(categories[i],
1463                                                posix_setlocale(categories[i],
1464                                                                NULL),
1465                                                &individ_locales[i],
1466                                                &this_size,
1467                                                caller_line);
1468
1469             /* If the size didn't change, it means this category did not have
1470              * to be adjusted, and individ_locales[i] points to the buffer
1471              * returned by posix_setlocale(); we have to copy that before
1472              * it's called again in the next iteration */
1473             if (this_size == 0) {
1474                 individ_locales[i] = savepv(individ_locales[i]);
1475             }
1476             else {
1477                 made_changes = TRUE;
1478             }
1479         }
1480
1481         /* If all the individual categories were ok as-is, this was a false
1482          * alarm.  We must have seen an '=' which was a legal occurrence in
1483          * this combination locale */
1484         if (! made_changes) {
1485             retval = input_locale;  /* The input can be returned unchanged */
1486         }
1487         else {
1488             retval = save_to_buffer(querylocale_c(LC_ALL), buf, buf_size);
1489         }
1490
1491         for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1492             Safefree(individ_locales[i]);
1493         }
1494
1495         return retval;
1496     }
1497
1498 #    else   /* else no LC_ALL */
1499
1500     PERL_UNUSED_ARG(category);
1501     PERL_UNUSED_ARG(caller_line);
1502
1503 #    endif
1504
1505     /* Here, there was a problem in an individual category.  This means that at
1506      * least one adjustment will be necessary.  Create a modifiable copy */
1507     retval = save_to_buffer(input_locale, buf, buf_size);
1508
1509     if (*first_bad != '=') {
1510
1511         /* Translate the found position into terms of the copy */
1512         first_bad = retval + (first_bad - input_locale);
1513     }
1514     else { /* An '=' */
1515
1516         /* It is unlikely that the return is so screwed-up that it contains
1517          * multiple equals signs, but handle that case by stripping all of
1518          * them.  */
1519         const char * final_equals = strrchr(retval, '=');
1520
1521         /* The length passed here causes the move to include the terminating
1522          * NUL */
1523         Move(final_equals + 1, retval, strlen(final_equals), char);
1524
1525         /* See if there are additional problems; if not, we're good to return.
1526          * */
1527         first_bad = strpbrk(retval, "\n");
1528
1529         if (! first_bad) {
1530             return retval;
1531         }
1532     }
1533
1534     /* Here, the problem must be a \n.  Get rid of it and what follows.
1535      * (Originally, only a trailing \n was stripped.  Unsure what to do if not
1536      * trailing) */
1537     *((char *) first_bad) = '\0';
1538     return retval;
1539 }
1540
1541 #if defined(USE_POSIX_2008_LOCALE) || ! defined(LC_ALL)
1542
1543 STATIC
1544 const char *
1545
1546 #  ifdef USE_QUERYLOCALE
1547 S_calculate_LC_ALL(pTHX_ const locale_t cur_obj)
1548 #  else
1549 S_calculate_LC_ALL(pTHX_ const char ** individ_locales)
1550 #  endif
1551
1552 {
1553     /* For POSIX 2008, we have to figure out LC_ALL ourselves when needed.
1554      * querylocale(), on systems that have it, doesn't tend to work for LC_ALL.
1555      * So we have to construct the answer ourselves based on the passed in
1556      * data, which is either a locale_t object, for systems with querylocale(),
1557      * or an array we keep updated to the proper values, otherwise.
1558      *
1559      * This returns a mortalized string containing the locale name(s) of
1560      * LC_ALL.
1561      *
1562      * If all individual categories are the same locale, we can just set LC_ALL
1563      * to that locale.  But if not, we have to create an aggregation of all the
1564      * categories on the system.  Platforms differ as to the syntax they use
1565      * for these non-uniform locales for LC_ALL.  Some use a '/' or other
1566      * delimiter of the locales with a predetermined order of categories; a
1567      * Configure probe would be needed to tell us how to decipher those.  glibc
1568      * uses a series of name=value pairs, like
1569      *      LC_NUMERIC=C;LC_TIME=en_US.UTF-8;...
1570      * The syntax we use for our aggregation doesn't much matter, as we take
1571      * care not to use the native setlocale() function on whatever style is
1572      * chosen.  But, it would be possible for someone to call Perl_setlocale()
1573      * using a native style we don't understand.  So far no one has complained.
1574      *
1575      * For systems that have categories we don't know about, the algorithm
1576      * below won't know about those missing categories, leading to potential
1577      * bugs for code that looks at them.  If there is an environment variable
1578      * that sets that category, we won't know to look for it, and so our use of
1579      * LANG or "C" improperly overrides it.  On the other hand, if we don't do
1580      * what is done here, and there is no environment variable, the category's
1581      * locale should be set to LANG or "C".  So there is no good solution.  khw
1582      * thinks the best is to make sure we have a complete list of possible
1583      * categories, adding new ones as they show up on obscure platforms.
1584      */
1585
1586     unsigned int i;
1587     Size_t names_len = 0;
1588     bool are_all_categories_the_same_locale = TRUE;
1589     char * aggregate_locale;
1590     char * previous_start = NULL;
1591     char * this_start = NULL;
1592     Size_t entry_len = 0;
1593
1594     PERL_ARGS_ASSERT_CALCULATE_LC_ALL;
1595
1596     /* First calculate the needed size for the string listing the categories
1597      * and their locales. */
1598     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1599
1600 #  ifdef USE_QUERYLOCALE
1601         const char * entry = querylocale_l(i, cur_obj);
1602 #  else
1603         const char * entry = individ_locales[i];
1604 #  endif
1605
1606         names_len += strlen(category_names[i])
1607                   + 1                           /* '=' */
1608                   + strlen(entry)
1609                   + 1;                          /* ';' */
1610     }
1611
1612     names_len++;    /* Trailing '\0' */
1613
1614     /* Allocate enough space for the aggregated string */
1615     SAVEFREEPV(Newxz(aggregate_locale, names_len, char));
1616
1617     /* Then fill it in */
1618     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1619         Size_t new_len;
1620
1621 #  ifdef USE_QUERYLOCALE
1622         const char * entry = querylocale_l(i, cur_obj);
1623 #  else
1624         const char * entry = individ_locales[i];
1625 #  endif
1626
1627         new_len = my_strlcat(aggregate_locale, category_names[i], names_len);
1628         assert(new_len <= names_len);
1629         new_len = my_strlcat(aggregate_locale, "=", names_len);
1630         assert(new_len <= names_len);
1631
1632         this_start = aggregate_locale + strlen(aggregate_locale);
1633         entry_len = strlen(entry);
1634
1635         new_len = my_strlcat(aggregate_locale, entry, names_len);
1636         assert(new_len <= names_len);
1637         new_len = my_strlcat(aggregate_locale, ";", names_len);
1638         assert(new_len <= names_len);
1639         PERL_UNUSED_VAR(new_len);   /* Only used in DEBUGGING */
1640
1641         if (   i > 0
1642             && are_all_categories_the_same_locale
1643             && memNE(previous_start, this_start, entry_len + 1))
1644         {
1645             are_all_categories_the_same_locale = FALSE;
1646         }
1647         else {
1648             previous_start = this_start;
1649         }
1650     }
1651
1652     /* If they are all the same, just return any one of them */
1653     if (are_all_categories_the_same_locale) {
1654         aggregate_locale = this_start;
1655         aggregate_locale[entry_len] = '\0';
1656     }
1657
1658     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1659                            "calculate_LC_ALL returning '%s'\n",
1660                            aggregate_locale));
1661
1662     return aggregate_locale;
1663 }
1664
1665 #endif
1666
1667 #if defined(USE_LOCALE) && defined(DEBUGGING)
1668
1669 STATIC const char *
1670 S_get_LC_ALL_display(pTHX)
1671 {
1672
1673 #  ifdef LC_ALL
1674
1675     return querylocale_c(LC_ALL);
1676
1677 #  else
1678
1679     const char * curlocales[NOMINAL_LC_ALL_INDEX];
1680
1681     for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1682         curlocales[i] = querylocale_i(i);
1683     }
1684
1685     return calculate_LC_ALL(curlocales);
1686
1687 #  endif
1688
1689 }
1690
1691 #endif
1692
1693 STATIC void
1694 S_setlocale_failure_panic_i(pTHX_
1695                             const unsigned int cat_index,
1696                             const char * current,
1697                             const char * failed,
1698                             const line_t caller_0_line,
1699                             const line_t caller_1_line)
1700 {
1701     dSAVE_ERRNO;
1702     const int cat = categories[cat_index];
1703     const char * name = category_names[cat_index];
1704
1705     PERL_ARGS_ASSERT_SETLOCALE_FAILURE_PANIC_I;
1706
1707     if (current == NULL) {
1708         current = querylocale_i(cat_index);
1709     }
1710
1711     Perl_locale_panic(Perl_form(aTHX_ "(%" LINE_Tf
1712                                       "): Can't change locale for %s(%d)"
1713                                       " from '%s' to '%s'",
1714                                       caller_1_line, name, cat,
1715                                       current, failed),
1716                       __FILE__, caller_0_line, GET_ERRNO);
1717     NOT_REACHED; /* NOTREACHED */
1718 }
1719
1720 /* Any of these will allow us to find the RADIX */
1721 #  if defined(USE_LOCALE_NUMERIC) && (   defined(HAS_SOME_LANGINFO)         \
1722                                       || defined(HAS_SOME_LOCALECONV)       \
1723                                       || defined(HAS_SNPRINTF))
1724 #    define CAN_CALCULATE_RADIX
1725 #  endif
1726 #  ifdef USE_LOCALE_NUMERIC
1727
1728 STATIC void
1729 S_new_numeric(pTHX_ const char *newnum)
1730 {
1731     PERL_ARGS_ASSERT_NEW_NUMERIC;
1732
1733     /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell
1734      * core Perl this and that 'newnum' is the name of the new locale, and we
1735      * are switched into it.  It installs this locale as the current underlying
1736      * default, and then switches to the C locale, if necessary, so that the
1737      * code that has traditionally expected the radix character to be a dot may
1738      * continue to do so.
1739      *
1740      * The default locale and the C locale can be toggled between by use of the
1741      * set_numeric_underlying() and set_numeric_standard() functions, which
1742      * should probably not be called directly, but only via macros like
1743      * SET_NUMERIC_STANDARD() in perl.h.
1744      *
1745      * The toggling is necessary mainly so that a non-dot radix decimal point
1746      * character can be input and output, while allowing internal calculations
1747      * to use a dot.
1748      *
1749      * This sets several interpreter-level variables:
1750      * PL_numeric_name  The underlying locale's name: a copy of 'newnum'
1751      * PL_numeric_underlying  A boolean indicating if the toggled state is such
1752      *                  that the current locale is the program's underlying
1753      *                  locale
1754      * PL_numeric_standard An int indicating if the toggled state is such
1755      *                  that the current locale is the C locale or
1756      *                  indistinguishable from the C locale.  If non-zero, it
1757      *                  is in C; if > 1, it means it may not be toggled away
1758      *                  from C.
1759      * PL_numeric_underlying_is_standard   A bool kept by this function
1760      *                  indicating that the underlying locale and the standard
1761      *                  C locale are indistinguishable for the purposes of
1762      *                  LC_NUMERIC.  This happens when both of the above two
1763      *                  variables are true at the same time.  (Toggling is a
1764      *                  no-op under these circumstances.)  This variable is
1765      *                  used to avoid having to recalculate.
1766      * PL_numeric_radix_sv  Contains the string that code should use for the
1767      *                  decimal point.  It is set to either a dot or the
1768      *                  program's underlying locale's radix character string,
1769      *                  depending on the situation.
1770      * PL_underlying_radix_sv  Contains the program's underlying locale's radix
1771      *                  character string.  This is copied into
1772      *                  PL_numeric_radix_sv when the situation warrants.  It
1773      *                  exists to avoid having to recalculate it when toggling.
1774      * PL_underlying_numeric_obj = (only on POSIX 2008 platforms)  An object
1775      *                  with everything set up properly so as to avoid work on
1776      *                  such platforms.
1777      */
1778
1779     DEBUG_L( PerlIO_printf(Perl_debug_log,
1780                            "Called new_numeric with %s, PL_numeric_name=%s\n",
1781                            newnum, PL_numeric_name));
1782
1783     /* If this isn't actually a change, do nothing */
1784     if (strEQ(PL_numeric_name, newnum)) {
1785         return;
1786     }
1787
1788     Safefree(PL_numeric_name);
1789     PL_numeric_name = savepv(newnum);
1790
1791     /* Handle the trivial case.  Since this is called at process
1792      * initialization, be aware that this bit can't rely on much being
1793      * available. */
1794     if (isNAME_C_OR_POSIX(PL_numeric_name)) {
1795         PL_numeric_standard = TRUE;
1796         PL_numeric_underlying_is_standard = TRUE;
1797         PL_numeric_underlying = TRUE;
1798         sv_setpv(PL_numeric_radix_sv, C_decimal_point);
1799         sv_setpv(PL_underlying_radix_sv, C_decimal_point);
1800         return;
1801     }
1802
1803     /* We are in the underlying locale until changed at the end of this
1804      * function */
1805     PL_numeric_underlying = TRUE;
1806
1807 #  ifdef USE_POSIX_2008_LOCALE
1808
1809     /* We keep a special object for easy switching to */
1810     PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
1811                                           PL_numeric_name,
1812                                           PL_underlying_numeric_obj);
1813
1814 #    endif
1815
1816     const char * radix = NULL;
1817     utf8ness_t utf8ness = UTF8NESS_IMMATERIAL;
1818
1819     /* Find and save this locale's radix character. */
1820     my_langinfo_c(RADIXCHAR, LC_NUMERIC, PL_numeric_name,
1821                   &radix, NULL, &utf8ness);
1822     sv_setpv(PL_underlying_radix_sv, radix);
1823
1824     if (utf8ness == UTF8NESS_YES) {
1825         SvUTF8_on(PL_underlying_radix_sv);
1826     }
1827
1828     DEBUG_L(PerlIO_printf(Perl_debug_log,
1829                           "Locale radix is '%s', ?UTF-8=%d\n",
1830                           SvPVX(PL_underlying_radix_sv),
1831                           cBOOL(SvUTF8(PL_underlying_radix_sv))));
1832
1833     /* This locale is indistinguishable from C (for numeric purposes) if both
1834      * the radix character and the thousands separator are the same as C's.
1835      * Start with the radix. */
1836     PL_numeric_underlying_is_standard = strEQ(C_decimal_point, radix);
1837     Safefree(radix);
1838
1839 #    ifndef TS_W32_BROKEN_LOCALECONV
1840
1841     /* If the radix isn't the same as C's, we know it is distinguishable from
1842      * C; otherwise check the thousands separator too.  Only if both are the
1843      * same as C's is the locale indistinguishable from C.
1844      *
1845      * But on earlier Windows versions, there is a potential race.  This code
1846      * knows that localeconv() (elsewhere in this file) will be used to extract
1847      * the needed value, and localeconv() was buggy for quite a while, and that
1848      * code in this file hence uses a workaround.  And that workaround may have
1849      * an (unlikely) race.  Gathering the radix uses a different workaround on
1850      * Windows that doesn't involve a race.  It might be possible to do the
1851      * same for this (patches welcome).
1852      *
1853      * Until then khw doesn't think it's worth even the small risk of a race to
1854      * get this value, which in almost all locales is empty, and doesn't appear
1855      * to be used in any of the Micrsoft library routines anyway. */
1856
1857     const char * scratch_buffer = NULL;
1858     PL_numeric_underlying_is_standard &= strEQ(C_thousands_sep,
1859                                                my_langinfo_c(THOUSEP, LC_NUMERIC,
1860                                                              PL_numeric_name,
1861                                                              &scratch_buffer,
1862                                                              NULL, NULL));
1863     Safefree(scratch_buffer);
1864
1865 #    endif
1866
1867     PL_numeric_standard = PL_numeric_underlying_is_standard;
1868
1869     /* Keep LC_NUMERIC so that it has the C locale radix and thousands
1870      * separator.  This is for XS modules, so they don't have to worry about
1871      * the radix being a non-dot.  (Core operations that need the underlying
1872      * locale change to it temporarily). */
1873     if (! PL_numeric_standard) {
1874         set_numeric_standard();
1875     }
1876
1877 }
1878
1879 #  endif
1880
1881 void
1882 Perl_set_numeric_standard(pTHX)
1883 {
1884
1885 #  ifdef USE_LOCALE_NUMERIC
1886
1887     /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
1888      * default.
1889      *
1890      * Most code should use the macro SET_NUMERIC_STANDARD() in perl.h
1891      * instead of calling this directly.  The macro avoids calling this routine
1892      * if toggling isn't necessary according to our records (which could be
1893      * wrong if some XS code has changed the locale behind our back) */
1894
1895     DEBUG_L(PerlIO_printf(Perl_debug_log,
1896                                   "Setting LC_NUMERIC locale to standard C\n"));
1897
1898     void_setlocale_c(LC_NUMERIC, "C");
1899     PL_numeric_standard = TRUE;
1900     sv_setpv(PL_numeric_radix_sv, C_decimal_point);
1901
1902     PL_numeric_underlying = PL_numeric_underlying_is_standard;
1903
1904 #  endif /* USE_LOCALE_NUMERIC */
1905
1906 }
1907
1908 void
1909 Perl_set_numeric_underlying(pTHX)
1910 {
1911
1912 #  ifdef USE_LOCALE_NUMERIC
1913
1914     /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
1915      * default.
1916      *
1917      * Most code should use the macro SET_NUMERIC_UNDERLYING() in perl.h
1918      * instead of calling this directly.  The macro avoids calling this routine
1919      * if toggling isn't necessary according to our records (which could be
1920      * wrong if some XS code has changed the locale behind our back) */
1921
1922     DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to %s\n",
1923                                           PL_numeric_name));
1924
1925     void_setlocale_c(LC_NUMERIC, PL_numeric_name);
1926     PL_numeric_underlying = TRUE;
1927     sv_setsv_nomg(PL_numeric_radix_sv, PL_underlying_radix_sv);
1928
1929     PL_numeric_standard = PL_numeric_underlying_is_standard;
1930
1931 #  endif /* USE_LOCALE_NUMERIC */
1932
1933 }
1934
1935 #  ifdef USE_LOCALE_CTYPE
1936
1937 STATIC void
1938 S_new_ctype(pTHX_ const char *newctype)
1939 {
1940     PERL_ARGS_ASSERT_NEW_CTYPE;
1941
1942     /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
1943      * core Perl this and that 'newctype' is the name of the new locale.
1944      *
1945      * This function sets up the folding arrays for all 256 bytes, assuming
1946      * that tofold() is tolc() since fold case is not a concept in POSIX,
1947      */
1948
1949     DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering new_ctype(%s)\n", newctype));
1950
1951     /* No change means no-op */
1952     if (strEQ(PL_ctype_name, newctype)) {
1953         return;
1954     }
1955
1956     /* We will replace any bad locale warning with 1) nothing if the new one is
1957      * ok; or 2) a new warning for the bad new locale */
1958     if (PL_warn_locale) {
1959         SvREFCNT_dec_NN(PL_warn_locale);
1960         PL_warn_locale = NULL;
1961     }
1962
1963     /* Clear cache */
1964     Safefree(PL_ctype_name);
1965     PL_ctype_name = "";
1966
1967     PL_in_utf8_turkic_locale = FALSE;
1968
1969     /* For the C locale, just use the standard folds, and we know there are no
1970      * glitches possible, so return early.  Since this is called at process
1971      * initialization, be aware that this bit can't rely on much being
1972      * available. */
1973     if (isNAME_C_OR_POSIX(newctype)) {
1974         Copy(PL_fold, PL_fold_locale, 256, U8);
1975         PL_ctype_name = savepv(newctype);
1976         PL_in_utf8_CTYPE_locale = FALSE;
1977         return;
1978     }
1979
1980     /* The cache being cleared signals this to compute a new value */
1981     PL_in_utf8_CTYPE_locale = is_locale_utf8(newctype);
1982
1983     PL_ctype_name = savepv(newctype);
1984     bool maybe_utf8_turkic = FALSE;
1985
1986     /* Don't check for problems if we are suppressing the warnings */
1987     bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
1988
1989     if (PL_in_utf8_CTYPE_locale) {
1990
1991         /* A UTF-8 locale gets standard rules.  But note that code still has to
1992          * handle this specially because of the three problematic code points
1993          * */
1994         Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
1995
1996         /* UTF-8 locales can have special handling for 'I' and 'i' if they are
1997          * Turkic.  Make sure these two are the only anomalies.  (We don't
1998          * require towupper and towlower because they aren't in C89.) */
1999
2000 #    if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER)
2001
2002         if (towupper('i') == 0x130 && towlower('I') == 0x131)
2003
2004 #    else
2005
2006         if (toU8_UPPER_LC('i') == 'i' && toU8_LOWER_LC('I') == 'I')
2007
2008 #    endif
2009
2010         {
2011             /* This is how we determine it really is Turkic */
2012             check_for_problems = TRUE;
2013             maybe_utf8_turkic = TRUE;
2014         }
2015     }
2016     else {  /* Not a canned locale we know the values for.  Compute them */
2017
2018 #    ifdef DEBUGGING
2019
2020         bool has_non_ascii_fold = FALSE;
2021         bool found_unexpected = FALSE;
2022
2023         if (DEBUG_Lv_TEST) {
2024             for (unsigned i = 128; i < 256; i++) {
2025                 int j = LATIN1_TO_NATIVE(i);
2026                 if (toU8_LOWER_LC(j) != j || toU8_UPPER_LC(j) != j) {
2027                     has_non_ascii_fold = TRUE;
2028                     break;
2029                 }
2030             }
2031         }
2032
2033 #    endif
2034
2035         for (unsigned i = 0; i < 256; i++) {
2036             if (isU8_UPPER_LC(i))
2037                 PL_fold_locale[i] = (U8) toU8_LOWER_LC(i);
2038             else if (isU8_LOWER_LC(i))
2039                 PL_fold_locale[i] = (U8) toU8_UPPER_LC(i);
2040             else
2041                 PL_fold_locale[i] = (U8) i;
2042
2043 #    ifdef DEBUGGING
2044
2045             if (DEBUG_Lv_TEST) {
2046                 bool unexpected = FALSE;
2047
2048                 if (isUPPER_L1(i)) {
2049                     if (isUPPER_A(i)) {
2050                         if (PL_fold_locale[i] != toLOWER_A(i)) {
2051                             unexpected = TRUE;
2052                         }
2053                     }
2054                     else if (has_non_ascii_fold) {
2055                         if (PL_fold_locale[i] != toLOWER_L1(i)) {
2056                             unexpected = TRUE;
2057                         }
2058                     }
2059                     else if (PL_fold_locale[i] != i) {
2060                         unexpected = TRUE;
2061                     }
2062                 }
2063                 else if (   isLOWER_L1(i)
2064                          && i != LATIN_SMALL_LETTER_SHARP_S
2065                          && i != MICRO_SIGN)
2066                 {
2067                     if (isLOWER_A(i)) {
2068                         if (PL_fold_locale[i] != toUPPER_A(i)) {
2069                             unexpected = TRUE;
2070                         }
2071                     }
2072                     else if (has_non_ascii_fold) {
2073                         if (PL_fold_locale[i] != toUPPER_LATIN1_MOD(i)) {
2074                             unexpected = TRUE;
2075                         }
2076                     }
2077                     else if (PL_fold_locale[i] != i) {
2078                         unexpected = TRUE;
2079                     }
2080                 }
2081                 else if (PL_fold_locale[i] != i) {
2082                     unexpected = TRUE;
2083                 }
2084
2085                 if (unexpected) {
2086                     found_unexpected = TRUE;
2087                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2088                                            "For %s, fold of %02x is %02x\n",
2089                                            newctype, i, PL_fold_locale[i]));
2090                 }
2091             }
2092         }
2093
2094         if (found_unexpected) {
2095             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2096                                "All bytes not mentioned above either fold to"
2097                                " themselves or are the expected ASCII or"
2098                                " Latin1 ones\n"));
2099         }
2100         else {
2101             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2102                                    "No nonstandard folds were found\n"));
2103 #    endif
2104
2105         }
2106     }
2107
2108 #    ifdef MB_CUR_MAX
2109
2110     /* We only handle single-byte locales (outside of UTF-8 ones; so if this
2111      * locale requires more than one byte, there are going to be BIG problems.
2112      * */
2113
2114     if (MB_CUR_MAX > 1 && ! PL_in_utf8_CTYPE_locale
2115
2116             /* Some platforms return MB_CUR_MAX > 1 for even the "C" locale.
2117              * Just assume that the implementation for them (plus for POSIX) is
2118              * correct and the > 1 value is spurious.  (Since these are
2119              * specially handled to never be considered UTF-8 locales, as long
2120              * as this is the only problem, everything should work fine */
2121         && ! isNAME_C_OR_POSIX(newctype))
2122     {
2123         DEBUG_L(PerlIO_printf(Perl_debug_log,
2124                               "Unsupported, MB_CUR_MAX=%d\n", (int) MB_CUR_MAX));
2125
2126         Perl_ck_warner_d(aTHX_ packWARN(WARN_LOCALE),
2127                          "Locale '%s' is unsupported, and may crash the"
2128                          " interpreter.\n",
2129                          newctype);
2130     }
2131
2132 #    endif
2133
2134     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "check_for_problems=%d\n",
2135                                            check_for_problems));
2136
2137     /* We don't populate the other lists if a UTF-8 locale, but do check that
2138      * everything works as expected, unless checking turned off */
2139     if (check_for_problems) {
2140         /* Assume enough space for every character being bad.  4 spaces each
2141          * for the 94 printable characters that are output like "'x' "; and 5
2142          * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
2143          * NUL */
2144         char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
2145         unsigned int bad_count = 0;         /* Count of bad characters */
2146
2147         for (unsigned i = 0; i < 256; i++) {
2148
2149             /* If checking for locale problems, see if the native ASCII-range
2150              * printables plus \n and \t are in their expected categories in
2151              * the new locale.  If not, this could mean big trouble, upending
2152              * Perl's and most programs' assumptions, like having a
2153              * metacharacter with special meaning become a \w.  Fortunately,
2154              * it's very rare to find locales that aren't supersets of ASCII
2155              * nowadays.  It isn't a problem for most controls to be changed
2156              * into something else; we check only \n and \t, though perhaps \r
2157              * could be an issue as well. */
2158             if (isGRAPH_A(i) || isBLANK_A(i) || i == '\n') {
2159                 bool is_bad = FALSE;
2160                 char name[4] = { '\0' };
2161
2162                 /* Convert the name into a string */
2163                 if (isGRAPH_A(i)) {
2164                     name[0] = i;
2165                     name[1] = '\0';
2166                 }
2167                 else if (i == '\n') {
2168                     my_strlcpy(name, "\\n", sizeof(name));
2169                 }
2170                 else if (i == '\t') {
2171                     my_strlcpy(name, "\\t", sizeof(name));
2172                 }
2173                 else {
2174                     assert(i == ' ');
2175                     my_strlcpy(name, "' '", sizeof(name));
2176                 }
2177
2178                 /* Check each possibe class */
2179                 if (UNLIKELY(cBOOL(isU8_ALPHANUMERIC_LC(i)) != cBOOL(isALPHANUMERIC_A(i))))  {
2180                     is_bad = TRUE;
2181                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2182                                           "isalnum('%s') unexpectedly is %x\n",
2183                                           name, cBOOL(isU8_ALPHANUMERIC_LC(i))));
2184                 }
2185                 if (UNLIKELY(cBOOL(isU8_ALPHA_LC(i)) != cBOOL(isALPHA_A(i))))  {
2186                     is_bad = TRUE;
2187                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2188                                           "isalpha('%s') unexpectedly is %x\n",
2189                                           name, cBOOL(isU8_ALPHA_LC(i))));
2190                 }
2191                 if (UNLIKELY(cBOOL(isU8_DIGIT_LC(i)) != cBOOL(isDIGIT_A(i))))  {
2192                     is_bad = TRUE;
2193                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2194                                           "isdigit('%s') unexpectedly is %x\n",
2195                                           name, cBOOL(isU8_DIGIT_LC(i))));
2196                 }
2197                 if (UNLIKELY(cBOOL(isU8_GRAPH_LC(i)) != cBOOL(isGRAPH_A(i))))  {
2198                     is_bad = TRUE;
2199                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2200                                           "isgraph('%s') unexpectedly is %x\n",
2201                                           name, cBOOL(isU8_GRAPH_LC(i))));
2202                 }
2203                 if (UNLIKELY(cBOOL(isU8_LOWER_LC(i)) != cBOOL(isLOWER_A(i))))  {
2204                     is_bad = TRUE;
2205                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2206                                           "islower('%s') unexpectedly is %x\n",
2207                                           name, cBOOL(isU8_LOWER_LC(i))));
2208                 }
2209                 if (UNLIKELY(cBOOL(isU8_PRINT_LC(i)) != cBOOL(isPRINT_A(i))))  {
2210                     is_bad = TRUE;
2211                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2212                                           "isprint('%s') unexpectedly is %x\n",
2213                                           name, cBOOL(isU8_PRINT_LC(i))));
2214                 }
2215                 if (UNLIKELY(cBOOL(isU8_PUNCT_LC(i)) != cBOOL(isPUNCT_A(i))))  {
2216                     is_bad = TRUE;
2217                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2218                                           "ispunct('%s') unexpectedly is %x\n",
2219                                           name, cBOOL(isU8_PUNCT_LC(i))));
2220                 }
2221                 if (UNLIKELY(cBOOL(isU8_SPACE_LC(i)) != cBOOL(isSPACE_A(i))))  {
2222                     is_bad = TRUE;
2223                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2224                                           "isspace('%s') unexpectedly is %x\n",
2225                                           name, cBOOL(isU8_SPACE_LC(i))));
2226                 }
2227                 if (UNLIKELY(cBOOL(isU8_UPPER_LC(i)) != cBOOL(isUPPER_A(i))))  {
2228                     is_bad = TRUE;
2229                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2230                                           "isupper('%s') unexpectedly is %x\n",
2231                                           name, cBOOL(isU8_UPPER_LC(i))));
2232                 }
2233                 if (UNLIKELY(cBOOL(isU8_XDIGIT_LC(i))!= cBOOL(isXDIGIT_A(i))))  {
2234                     is_bad = TRUE;
2235                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2236                                           "isxdigit('%s') unexpectedly is %x\n",
2237                                           name, cBOOL(isU8_XDIGIT_LC(i))));
2238                 }
2239                 if (UNLIKELY(toU8_LOWER_LC(i) != (int) toLOWER_A(i))) {
2240                     is_bad = TRUE;
2241                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2242                             "tolower('%s')=0x%x instead of the expected 0x%x\n",
2243                             name, toU8_LOWER_LC(i), (int) toLOWER_A(i)));
2244                 }
2245                 if (UNLIKELY(toU8_UPPER_LC(i) != (int) toUPPER_A(i))) {
2246                     is_bad = TRUE;
2247                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2248                             "toupper('%s')=0x%x instead of the expected 0x%x\n",
2249                             name, toU8_UPPER_LC(i), (int) toUPPER_A(i)));
2250                 }
2251                 if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i))))  {
2252                     is_bad = TRUE;
2253                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2254                                 "'\\n' (=%02X) is not a control\n", (int) i));
2255                 }
2256
2257                 /* Add to the list;  Separate multiple entries with a blank */
2258                 if (is_bad) {
2259                     if (bad_count) {
2260                         my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list));
2261                     }
2262                     my_strlcat(bad_chars_list, name, sizeof(bad_chars_list));
2263                     bad_count++;
2264                 }
2265             }
2266         }
2267
2268         if (bad_count == 2 && maybe_utf8_turkic) {
2269             bad_count = 0;
2270             *bad_chars_list = '\0';
2271             PL_fold_locale['I'] = 'I';
2272             PL_fold_locale['i'] = 'i';
2273             PL_in_utf8_turkic_locale = TRUE;
2274             DEBUG_L(PerlIO_printf(Perl_debug_log, "%s is turkic\n", newctype));
2275         }
2276
2277         /* If we found problems and we want them output, do so */
2278         if (   (UNLIKELY(bad_count))
2279             && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
2280         {
2281             if (UNLIKELY(bad_count) && PL_in_utf8_CTYPE_locale) {
2282                 PL_warn_locale = Perl_newSVpvf(aTHX_
2283                      "Locale '%s' contains (at least) the following characters"
2284                      " which have\nunexpected meanings: %s\nThe Perl program"
2285                      " will use the expected meanings",
2286                       newctype, bad_chars_list);
2287             }
2288             else {
2289                 PL_warn_locale =
2290                     Perl_newSVpvf(aTHX_
2291                                   "\nThe following characters (and maybe"
2292                                   " others) may not have the same meaning as"
2293                                   " the Perl program expects: %s\n",
2294                                   bad_chars_list
2295                             );
2296             }
2297
2298 #    ifdef HAS_SOME_LANGINFO
2299
2300             const char * scratch_buffer = NULL;
2301             Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s",
2302                                  my_langinfo_c(CODESET, LC_CTYPE,
2303                                                newctype,
2304                                                &scratch_buffer, NULL,
2305                                                NULL));
2306             Safefree(scratch_buffer);
2307
2308 #  endif
2309
2310             Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n");
2311
2312             /* If we are actually in the scope of the locale or are debugging,
2313              * output the message now.  If not in that scope, we save the
2314              * message to be output at the first operation using this locale,
2315              * if that actually happens.  Most programs don't use locales, so
2316              * they are immune to bad ones.  */
2317             if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
2318
2319                 /* The '0' below suppresses a bogus gcc compiler warning */
2320                 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale),
2321                                                                             0);
2322
2323                 if (IN_LC(LC_CTYPE)) {
2324                     SvREFCNT_dec_NN(PL_warn_locale);
2325                     PL_warn_locale = NULL;
2326                 }
2327             }
2328         }
2329     }
2330 }
2331
2332 #  endif /* USE_LOCALE_CTYPE */
2333
2334 void
2335 Perl__warn_problematic_locale()
2336 {
2337
2338 #  ifdef USE_LOCALE_CTYPE
2339
2340     dTHX;
2341
2342     /* Internal-to-core function that outputs the message in PL_warn_locale,
2343      * and then NULLS it.  Should be called only through the macro
2344      * CHECK_AND_WARN_PROBLEMATIC_LOCALE_ */
2345
2346     if (PL_warn_locale) {
2347         Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2348                              SvPVX(PL_warn_locale),
2349                              0 /* dummy to avoid compiler warning */ );
2350         SvREFCNT_dec_NN(PL_warn_locale);
2351         PL_warn_locale = NULL;
2352     }
2353
2354 #  endif
2355
2356 }
2357
2358 STATIC void
2359 S_new_LC_ALL(pTHX_ const char *unused)
2360 {
2361     unsigned int i;
2362
2363     /* LC_ALL updates all the things we care about. */
2364
2365     PERL_UNUSED_ARG(unused);
2366
2367     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2368         if (update_functions[i]) {
2369             const char * this_locale = querylocale_i(i);
2370             update_functions[i](aTHX_ this_locale);
2371         }
2372     }
2373 }
2374
2375 #  ifdef USE_LOCALE_COLLATE
2376
2377 STATIC void
2378 S_new_collate(pTHX_ const char *newcoll)
2379 {
2380     PERL_ARGS_ASSERT_NEW_COLLATE;
2381
2382
2383     /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
2384      * core Perl this and that 'newcoll' is the name of the new locale.
2385      *
2386      * The design of locale collation is that every locale change is given an
2387      * index 'PL_collation_ix'.  The first time a string particpates in an
2388      * operation that requires collation while locale collation is active, it
2389      * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()).  That
2390      * magic includes the collation index, and the transformation of the string
2391      * by strxfrm(), q.v.  That transformation is used when doing comparisons,
2392      * instead of the string itself.  If a string changes, the magic is
2393      * cleared.  The next time the locale changes, the index is incremented,
2394      * and so we know during a comparison that the transformation is not
2395      * necessarily still valid, and so is recomputed.  Note that if the locale
2396      * changes enough times, the index could wrap (a U32), and it is possible
2397      * that a transformation would improperly be considered valid, leading to
2398      * an unlikely bug */
2399
2400     /* Return if the locale isn't changing */
2401     if (strEQ(PL_collation_name, newcoll)) {
2402         return;
2403     }
2404
2405     Safefree(PL_collation_name);
2406     PL_collation_name = savepv(newcoll);
2407     ++PL_collation_ix;
2408
2409     /* Set the new one up if trivial.  Since this is called at process
2410      * initialization, be aware that this bit can't rely on much being
2411      * available. */
2412     PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
2413     if (PL_collation_standard) {
2414         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Setting PL_collation name='%s'\n", PL_collation_name));
2415         PL_collxfrm_base = 0;
2416         PL_collxfrm_mult = 2;
2417         PL_in_utf8_COLLATE_locale = FALSE;
2418         PL_strxfrm_NUL_replacement = '\0';
2419         PL_strxfrm_max_cp = 0;
2420         return;
2421     }
2422
2423     /* Flag that the remainder of the set up is being deferred until first need */
2424     PL_collxfrm_mult = 0;
2425     PL_collxfrm_base = 0;
2426
2427 }
2428
2429 #  endif /* USE_LOCALE_COLLATE */
2430 #endif  /* USE_LOCALE */
2431
2432 #ifdef WIN32
2433
2434 wchar_t *
2435 Perl_Win_utf8_string_to_wstring(const char * utf8_string)
2436 {
2437     wchar_t *wstring;
2438
2439     int req_size = MultiByteToWideChar(CP_UTF8, 0, utf8_string, -1, NULL, 0);
2440     if (! req_size) {
2441         errno = EINVAL;
2442         return NULL;
2443     }
2444
2445     Newx(wstring, req_size, wchar_t);
2446
2447     if (! MultiByteToWideChar(CP_UTF8, 0, utf8_string, -1, wstring, req_size))
2448     {
2449         Safefree(wstring);
2450         errno = EINVAL;
2451         return NULL;
2452     }
2453
2454     return wstring;
2455 }
2456
2457 char *
2458 Perl_Win_wstring_to_utf8_string(const wchar_t * wstring)
2459 {
2460     char *utf8_string;
2461
2462     int req_size =
2463               WideCharToMultiByte(CP_UTF8, 0, wstring, -1, NULL, 0, NULL, NULL);
2464
2465     Newx(utf8_string, req_size, char);
2466
2467     if (! WideCharToMultiByte(CP_UTF8, 0, wstring, -1, utf8_string,
2468                                                          req_size, NULL, NULL))
2469     {
2470         Safefree(utf8_string);
2471         errno = EINVAL;
2472         return NULL;
2473     }
2474
2475     return utf8_string;
2476 }
2477
2478 #define USE_WSETLOCALE
2479
2480 #ifdef USE_WSETLOCALE
2481
2482 STATIC char *
2483 S_wrap_wsetlocale(pTHX_ int category, const char *locale) {
2484     wchar_t *wlocale = NULL;
2485     wchar_t *wresult;
2486     char *result;
2487
2488     if (locale) {
2489         wlocale = Win_utf8_string_to_wstring(locale);
2490         if (! wlocale) {
2491             return NULL;
2492         }
2493     }
2494     else {
2495         wlocale = NULL;
2496     }
2497
2498     wresult = _wsetlocale(category, wlocale);
2499     Safefree(wlocale);
2500
2501     if (! wresult) {
2502             return NULL;
2503         }
2504
2505     result = Win_wstring_to_utf8_string(wresult);
2506     SAVEFREEPV(result); /* is there something better we can do here? */
2507
2508     return result;
2509 }
2510
2511 #endif
2512
2513 STATIC char *
2514 S_win32_setlocale(pTHX_ int category, const char* locale)
2515 {
2516     /* This, for Windows, emulates POSIX setlocale() behavior.  There is no
2517      * difference between the two unless the input locale is "", which normally
2518      * means on Windows to get the machine default, which is set via the
2519      * computer's "Regional and Language Options" (or its current equivalent).
2520      * In POSIX, it instead means to find the locale from the user's
2521      * environment.  This routine changes the Windows behavior to first look in
2522      * the environment, and, if anything is found, use that instead of going to
2523      * the machine default.  If there is no environment override, the machine
2524      * default is used, by calling the real setlocale() with "".
2525      *
2526      * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
2527      * use the particular category's variable if set; otherwise to use the LANG
2528      * variable. */
2529
2530     bool override_LC_ALL = FALSE;
2531     char * result;
2532     unsigned int i;
2533
2534     if (locale && strEQ(locale, "")) {
2535
2536 #  ifdef LC_ALL
2537
2538         locale = PerlEnv_getenv("LC_ALL");
2539         if (! locale) {
2540             if (category ==  LC_ALL) {
2541                 override_LC_ALL = TRUE;
2542             }
2543             else {
2544
2545 #  endif
2546
2547                 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2548                     if (category == categories[i]) {
2549                         locale = PerlEnv_getenv(category_names[i]);
2550                         goto found_locale;
2551                     }
2552                 }
2553
2554                 locale = PerlEnv_getenv("LANG");
2555                 if (! locale) {
2556                     locale = "";
2557                 }
2558
2559               found_locale: ;
2560
2561 #  ifdef LC_ALL
2562
2563             }
2564         }
2565
2566 #  endif
2567
2568     }
2569
2570 #ifdef USE_WSETLOCALE
2571     result = S_wrap_wsetlocale(aTHX_ category, locale);
2572 #else
2573     result = setlocale(category, locale);
2574 #endif
2575     DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2576                           setlocale_debug_string_r(category, locale, result)));
2577
2578     if (! override_LC_ALL)  {
2579         return result;
2580     }
2581
2582     /* Here the input category was LC_ALL, and we have set it to what is in the
2583      * LANG variable or the system default if there is no LANG.  But these have
2584      * lower priority than the other LC_foo variables, so override it for each
2585      * one that is set.  (If they are set to "", it means to use the same thing
2586      * we just set LC_ALL to, so can skip) */
2587
2588     for (i = 0; i < LC_ALL_INDEX_; i++) {
2589         result = PerlEnv_getenv(category_names[i]);
2590         if (result && strNE(result, "")) {
2591 #ifdef USE_WSETLOCALE
2592             S_wrap_wsetlocale(aTHX_ categories[i], result);
2593 #else
2594             setlocale(categories[i], result);
2595 #endif
2596             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s\n",
2597                 setlocale_debug_string_i(i, result, "not captured")));
2598         }
2599     }
2600
2601     result = setlocale(LC_ALL, NULL);
2602     DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2603                           setlocale_debug_string_c(LC_ALL, NULL, result)));
2604
2605     return result;
2606 }
2607
2608 #endif
2609
2610 /*
2611 =for apidoc Perl_setlocale
2612
2613 This is an (almost) drop-in replacement for the system L<C<setlocale(3)>>,
2614 taking the same parameters, and returning the same information, except that it
2615 returns the correct underlying C<LC_NUMERIC> locale.  Regular C<setlocale> will
2616 instead return C<C> if the underlying locale has a non-dot decimal point
2617 character, or a non-empty thousands separator for displaying floating point
2618 numbers.  This is because perl keeps that locale category such that it has a
2619 dot and empty separator, changing the locale briefly during the operations
2620 where the underlying one is required. C<Perl_setlocale> knows about this, and
2621 compensates; regular C<setlocale> doesn't.
2622
2623 Another reason it isn't completely a drop-in replacement is that it is
2624 declared to return S<C<const char *>>, whereas the system setlocale omits the
2625 C<const> (presumably because its API was specified long ago, and can't be
2626 updated; it is illegal to change the information C<setlocale> returns; doing
2627 so leads to segfaults.)
2628
2629 Finally, C<Perl_setlocale> works under all circumstances, whereas plain
2630 C<setlocale> can be completely ineffective on some platforms under some
2631 configurations.
2632
2633 C<Perl_setlocale> should not be used to change the locale except on systems
2634 where the predefined variable C<${^SAFE_LOCALES}> is 1.  On some such systems,
2635 the system C<setlocale()> is ineffective, returning the wrong information, and
2636 failing to actually change the locale.  C<Perl_setlocale>, however works
2637 properly in all circumstances.
2638
2639 The return points to a per-thread static buffer, which is overwritten the next
2640 time C<Perl_setlocale> is called from the same thread.
2641
2642 =cut
2643
2644 */
2645
2646 #ifndef USE_LOCALE_NUMERIC
2647 #  define affects_LC_NUMERIC(cat) 0
2648 #elif defined(LC_ALL)
2649 #  define affects_LC_NUMERIC(cat) (cat == LC_NUMERIC || cat == LC_ALL)
2650 #else
2651 #  define affects_LC_NUMERIC(cat) (cat == LC_NUMERIC)
2652 #endif
2653
2654 const char *
2655 Perl_setlocale(const int category, const char * locale)
2656 {
2657     /* This wraps POSIX::setlocale() */
2658
2659 #ifndef USE_LOCALE
2660
2661     PERL_UNUSED_ARG(category);
2662     PERL_UNUSED_ARG(locale);
2663
2664     return "C";
2665
2666 #else
2667
2668     const char * retval;
2669     dTHX;
2670
2671     DEBUG_L(PerlIO_printf(Perl_debug_log,
2672                           "Entering Perl_setlocale(%d, \"%s\")\n",
2673                           category, locale));
2674
2675     /* A NULL locale means only query what the current one is. */
2676     if (locale == NULL) {
2677
2678 #  ifndef USE_LOCALE_NUMERIC
2679
2680         /* Without LC_NUMERIC, it's trivial; we just return the value */
2681         return save_to_buffer(querylocale_r(category),
2682                               &PL_setlocale_buf, &PL_setlocale_bufsize);
2683 #  else
2684
2685         /* We have the LC_NUMERIC name saved, because we are normally switched
2686          * into the C locale (or equivalent) for it. */
2687         if (category == LC_NUMERIC) {
2688             DEBUG_L(PerlIO_printf(Perl_debug_log,
2689                     "Perl_setlocale(LC_NUMERIC, NULL) returning stashed '%s'\n",
2690                     PL_numeric_name));
2691
2692             /* We don't have to copy this return value, as it is a per-thread
2693              * variable, and won't change until a future setlocale */
2694             return PL_numeric_name;
2695         }
2696
2697 #    ifndef LC_ALL
2698
2699         /* Without LC_ALL, just return the value */
2700         return save_to_buffer(querylocale_r(category),
2701                               &PL_setlocale_buf, &PL_setlocale_bufsize);
2702
2703 #    else
2704
2705         /* Here, LC_ALL is available on this platform.  It's the one
2706          * complicating category (because it can contain a toggled LC_NUMERIC
2707          * value), for all the remaining ones (we took care of LC_NUMERIC
2708          * above), just return the value */
2709         if (category != LC_ALL) {
2710             return save_to_buffer(querylocale_r(category),
2711                                   &PL_setlocale_buf, &PL_setlocale_bufsize);
2712         }
2713
2714         bool toggled = FALSE;
2715
2716         /* For an LC_ALL query, switch back to the underlying numeric locale
2717          * (if we aren't there already) so as to get the correct results.  Our
2718          * records for all the other categories are valid without switching */
2719         if (! PL_numeric_underlying) {
2720             set_numeric_underlying();
2721             toggled = TRUE;
2722         }
2723
2724         retval = querylocale_c(LC_ALL);
2725
2726         if (toggled) {
2727             set_numeric_standard();
2728         }
2729
2730         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2731                             setlocale_debug_string_r(category, locale, retval)));
2732
2733         return save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
2734
2735 #    endif      /* Has LC_ALL */
2736 #  endif        /* Has LC_NUMERIC */
2737
2738     } /* End of querying the current locale */
2739
2740
2741     /* Here, the input has a locale to change to.  First find the current
2742      * locale */
2743     unsigned int cat_index = get_category_index(category, NULL);
2744     retval = querylocale_i(cat_index);
2745
2746     /* If the new locale is the same as the current one, nothing is actually
2747      * being changed, so do nothing. */
2748     if (      strEQ(retval, locale)
2749         && (   ! affects_LC_NUMERIC(category)
2750
2751 #  ifdef USE_LOCALE_NUMERIC
2752
2753             || strEQ(locale, PL_numeric_name)
2754
2755 #  endif
2756
2757     )) {
2758         DEBUG_L(PerlIO_printf(Perl_debug_log,
2759                               "Already in requested locale: no action taken\n"));
2760         return save_to_buffer(setlocale_i(cat_index, locale),
2761                               &PL_setlocale_buf, &PL_setlocale_bufsize);
2762     }
2763
2764     /* Here, an actual change is being requested.  Do it */
2765     retval = setlocale_i(cat_index, locale);
2766
2767     if (! retval) {
2768         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2769                           setlocale_debug_string_i(cat_index, locale, "NULL")));
2770         return NULL;
2771     }
2772
2773     assert(strNE(retval, ""));
2774     retval = save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
2775
2776     /* Now that have changed locales, we have to update our records to
2777      * correspond.  Only certain categories have extra work to update. */
2778     if (update_functions[cat_index]) {
2779         update_functions[cat_index](aTHX_ retval);
2780     }
2781
2782     DEBUG_L(PerlIO_printf(Perl_debug_log, "returning '%s'\n", retval));
2783
2784     return retval;
2785
2786 #endif
2787
2788 }
2789
2790 #ifdef USE_LOCALE
2791
2792 STATIC const char *
2793 S_save_to_buffer(const char * string, const char **buf, Size_t *buf_size)
2794 {
2795     /* Copy the NUL-terminated 'string' to a buffer whose address before this
2796      * call began at *buf, and whose available length before this call was
2797      * *buf_size.
2798      *
2799      * If the length of 'string' is greater than the space available, the
2800      * buffer is grown accordingly, which may mean that it gets relocated.
2801      * *buf and *buf_size will be updated to reflect this.
2802      *
2803      * Regardless, the function returns a pointer to where 'string' is now
2804      * stored.
2805      *
2806      * 'string' may be NULL, which means no action gets taken, and NULL is
2807      * returned.
2808      *
2809      * If *buf or 'buf_size' are NULL or *buf_size is 0, the buffer is assumed
2810      * empty, and memory is malloc'd.   'buf-size' being NULL is to be used
2811      * when this is a single use buffer, which will shortly be freed by the
2812      * caller.
2813      */
2814
2815     Size_t string_size;
2816
2817     PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
2818
2819     if (! string) {
2820         return NULL;
2821     }
2822
2823     string_size = strlen(string) + 1;
2824
2825     if (buf_size == NULL) {
2826         Newx(*buf, string_size, char);
2827     }
2828     else if (*buf_size == 0) {
2829         Newx(*buf, string_size, char);
2830         *buf_size = string_size;
2831     }
2832     else if (string_size > *buf_size) {
2833         Renew(*buf, string_size, char);
2834         *buf_size = string_size;
2835     }
2836
2837     {
2838         dTHX_DEBUGGING;
2839         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2840                          "Copying '%s' to %p\n",
2841                          ((is_utf8_string((U8 *) string, 0))
2842                           ? string
2843                           :_byte_dump_string((U8 *) string, strlen(string), 0)),
2844                           *buf));
2845             }
2846
2847 #    ifdef DEBUGGING
2848
2849     /* Catch glitches.  Usually this is because LC_CTYPE needs to be the same
2850      * locale as whatever is being worked on */
2851     if (UNLIKELY(instr(string, REPLACEMENT_CHARACTER_UTF8))) {
2852         dTHX_DEBUGGING;
2853
2854         locale_panic_(Perl_form(aTHX_
2855                                 "Unexpected REPLACEMENT_CHARACTER in '%s'\n%s",
2856                                 string, get_LC_ALL_display()));
2857     }
2858
2859 #    endif
2860
2861     Copy(string, *buf, string_size, char);
2862     return *buf;
2863 }
2864
2865 STATIC utf8ness_t
2866 S_get_locale_string_utf8ness_i(pTHX_ const char * locale,
2867                                      const unsigned cat_index,
2868                                      const char * string,
2869                                      const locale_utf8ness_t known_utf8)
2870 {
2871     /* Return to indicate if 'string' in the locale given by the input
2872      * arguments should be considered UTF-8 or not.
2873      *
2874      * If the input 'locale' is not NULL, use that for the locale; otherwise
2875      * use the current locale for the category specified by 'cat_index'.
2876      */
2877
2878     Size_t len;
2879     const U8 * first_variant = NULL;
2880
2881     PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I;
2882     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
2883
2884     if (string == NULL) {
2885         return UTF8NESS_NO;
2886     }
2887
2888     if (IN_BYTES) { /* respect 'use bytes' */
2889         return UTF8NESS_NO;
2890     }
2891
2892     len = strlen(string);
2893
2894     /* UTF8ness is immaterial if the representation doesn't vary */
2895     if (is_utf8_invariant_string_loc((U8 *) string, len, &first_variant)) {
2896         return UTF8NESS_IMMATERIAL;
2897     }
2898
2899     /* Can't be UTF-8 if invalid */
2900     if (! is_utf8_string((U8 *) first_variant,
2901                          len - ((char *) first_variant - string)))
2902     {
2903         return UTF8NESS_NO;
2904     }
2905
2906     /* Here and below, we know the string is legal UTF-8, containing at least
2907      * one character requiring a sequence of two or more bytes.  It is quite
2908      * likely to be UTF-8.  But it pays to be paranoid and do further checking.
2909      *
2910      * If we already know the UTF-8ness of the locale, then we immediately know
2911      * what the string is */
2912     if (UNLIKELY(known_utf8 != LOCALE_UTF8NESS_UNKNOWN)) {
2913         if (known_utf8 == LOCALE_IS_UTF8) {
2914             return UTF8NESS_YES;
2915         }
2916         else {
2917             return UTF8NESS_NO;
2918         }
2919     }
2920
2921 #  if defined(HAS_SOME_LANGINFO) || defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
2922
2923     /* Here, we have available the libc functions that can be used to
2924      * accurately determine the UTF8ness of the underlying locale.  If it is a
2925      * UTF-8 locale, the string is UTF-8;  otherwise it was coincidental that
2926      * the string is legal UTF-8
2927      *
2928      * However, if the perl is compiled to not pay attention to the category
2929      * being passed in, you might think that that locale is essentially always
2930      * the C locale, so it would make sense to say it isn't UTF-8.  But to get
2931      * here, the string has to contain characters unknown in the C locale.  And
2932      * in fact, Windows boxes are compiled without LC_MESSAGES, as their
2933      * message catalog isn't really a part of the locale system.  But those
2934      * messages really could be UTF-8, and given that the odds are rather small
2935      * of something not being UTF-8 but being syntactically valid UTF-8, khw
2936      * has decided to call such strings as UTF-8. */
2937
2938     if (locale == NULL) {
2939         locale = querylocale_i(cat_index);
2940     }
2941     if (is_locale_utf8(locale)) {
2942         return UTF8NESS_YES;
2943     }
2944
2945     return UTF8NESS_NO;
2946
2947 #  else
2948
2949     /* Here, we have a valid UTF-8 string containing non-ASCII characters, and
2950      * don't have access to functions to check if the locale is UTF-8 or not.
2951      * Assume that it is.  khw tried adding a check that the string is entirely
2952      * in a single Unicode script, but discovered the strftime() timezone is
2953      * user-settable through the environment, which may be in a different
2954      * script than the locale-expected value. */
2955     PERL_UNUSED_ARG(locale);
2956     PERL_UNUSED_ARG(cat_index);
2957
2958     return UTF8NESS_YES;
2959
2960 #  endif
2961
2962 }
2963
2964 #  ifdef WIN32
2965
2966 bool
2967 Perl_get_win32_message_utf8ness(pTHX_ const char * string)
2968 {
2969     /* NULL => locale irrelevant, 0 => category irrelevant
2970      * so returns based on the UTF-8 legality of the input string, ignoring the
2971      * locale and category completely.
2972      *
2973      * This is because Windows doesn't have LC_MESSAGES */
2974     return get_locale_string_utf8ness_i(NULL, 0, string, LOCALE_IS_UTF8);
2975 }
2976
2977 #  endif
2978 #endif  /* USE_LOCALE */
2979
2980
2981 int
2982 Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len)
2983 {
2984
2985 #if ! defined(HAS_MBRTOWC) && ! defined(HAS_MBTOWC)
2986
2987     PERL_UNUSED_ARG(pwc);
2988     PERL_UNUSED_ARG(s);
2989     PERL_UNUSED_ARG(len);
2990     return -1;
2991
2992 #else   /* Below we have some form of mbtowc() */
2993 #   if defined(HAS_MBRTOWC)                                     \
2994    && (defined(USE_LOCALE_THREADS) || ! defined(HAS_MBTOWC))
2995 #    define USE_MBRTOWC
2996 #  else
2997 #    undef USE_MBRTOWC
2998 #  endif
2999
3000     int retval = -1;
3001
3002     if (s == NULL) { /* Initialize the shift state to all zeros in
3003                         PL_mbrtowc_ps. */
3004
3005 #  if defined(USE_MBRTOWC)
3006
3007         memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
3008         return 0;
3009
3010 #  else
3011
3012         MBTOWC_LOCK_;
3013         SETERRNO(0, 0);
3014         retval = mbtowc(NULL, NULL, 0);
3015         MBTOWC_UNLOCK_;
3016         return retval;
3017
3018 #  endif
3019
3020     }
3021
3022 #  if defined(USE_MBRTOWC)
3023
3024     SETERRNO(0, 0);
3025     retval = (SSize_t) mbrtowc((wchar_t *) pwc, s, len, &PL_mbrtowc_ps);
3026
3027 #  else
3028
3029     /* Locking prevents races, but locales can be switched out without locking,
3030      * so this isn't a cure all */
3031     MBTOWC_LOCK_;
3032     SETERRNO(0, 0);
3033     retval = mbtowc((wchar_t *) pwc, s, len);
3034     MBTOWC_UNLOCK_;
3035
3036 #  endif
3037
3038     return retval;
3039
3040 #endif
3041
3042 }
3043
3044 /*
3045 =for apidoc Perl_localeconv
3046
3047 This is a thread-safe version of the libc L<localeconv(3)>.  It is the same as
3048 L<POSIX::localeconv|POSIX/localeconv> (returning a hash of the C<localeconv()>
3049 fields), but directly callable from XS code.
3050
3051 =cut
3052 */
3053
3054 HV *
3055 Perl_localeconv(pTHX)
3056 {
3057
3058 #if  ! defined(HAS_SOME_LOCALECONV)                                     \
3059  || (! defined(USE_LOCALE_MONETARY) && ! defined(USE_LOCALE_NUMERIC))
3060
3061     return newHV();
3062
3063 #else
3064
3065     return my_localeconv(0, LOCALE_UTF8NESS_UNKNOWN);
3066
3067 #endif
3068
3069 }
3070
3071 #if  defined(HAS_SOME_LOCALECONV)                                   \
3072  && (defined(USE_LOCALE_MONETARY) || defined(USE_LOCALE_NUMERIC))
3073
3074 HV *
3075 S_my_localeconv(pTHX_ const int item, const locale_utf8ness_t locale_is_utf8)
3076 {
3077     HV * retval;
3078     locale_utf8ness_t numeric_locale_is_utf8  = LOCALE_UTF8NESS_UNKNOWN;
3079     locale_utf8ness_t monetary_locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
3080     HV * (*copy_localeconv)(pTHX_ const struct lconv *,
3081                                   const int,
3082                                   const locale_utf8ness_t,
3083                                   const locale_utf8ness_t);
3084
3085     /* A thread-safe locale_conv().  The locking mechanisms vary greatly
3086      * depending on platform capabilities.  They all share this common set up
3087      * code for the function, and then conditional compilations choose one of
3088      * several terminations.
3089      *
3090      * There are two use cases:
3091      * 1) Called from POSIX::locale_conv().  This returns lconv() copied to
3092      *    a hash, based on the current underlying locale.
3093      * 2) Certain items that nl_langinfo() provides are also derivable from
3094      *    the return of localeconv().  Windows notably doesn't have
3095      *    nl_langinfo(), so on that, and actually any platform lacking it,
3096      *    my_localeconv() is used to emulate it for those particular items.
3097      *    The code to do this is compiled only on such platforms.  Rather than
3098      *    going to the expense of creating a full hash when only one item is
3099      *    needed, just the desired item is returned, in an SV cast to an HV.
3100      *
3101      * There is a helper function to accomplish each of the two tasks.  The
3102      * function pointer just below is set to the appropriate one, and is called
3103      * from each of the various implementations, in the middle of whatever
3104      * necessary locking/locale swapping have been done. */
3105
3106 #  ifdef HAS_SOME_LANGINFO
3107
3108     PERL_UNUSED_ARG(item);
3109     PERL_UNUSED_ARG(locale_is_utf8);
3110
3111 #    ifdef USE_LOCALE_NUMERIC
3112
3113     /* When there is a nl_langinfo, we will only be called for localeconv
3114      * numeric purposes. */
3115     const bool is_localeconv_call = true;
3116
3117 #    endif
3118
3119 #  else
3120
3121     /* Note we use this sentinel; this works because this only gets compiled
3122      * when our perl_langinfo.h is used, and that uses negative numbers for all
3123      * the items */
3124     const bool is_localeconv_call = (item == 0);
3125     if (is_localeconv_call)
3126
3127 #  endif
3128
3129     {
3130         copy_localeconv = S_populate_localeconv;
3131
3132 #    ifdef USE_LOCALE_NUMERIC
3133
3134         /* Get the UTF8ness of the locales now to avoid repeating this for each
3135          * string returned by localeconv() */
3136         numeric_locale_is_utf8 = (is_locale_utf8(PL_numeric_name))
3137                                   ? LOCALE_IS_UTF8
3138                                   : LOCALE_NOT_UTF8;
3139
3140 #    endif
3141 #    ifdef USE_LOCALE_MONETARY
3142
3143         monetary_locale_is_utf8 = (is_locale_utf8(querylocale_c(LC_MONETARY)))
3144                                   ? LOCALE_IS_UTF8
3145                                   : LOCALE_NOT_UTF8;
3146
3147 #  endif
3148
3149     }
3150
3151 #  ifndef HAS_SOME_LANGINFO
3152
3153     else {
3154         copy_localeconv = S_get_nl_item_from_localeconv;
3155         numeric_locale_is_utf8 = locale_is_utf8;
3156     }
3157
3158 #  endif
3159
3160     PERL_ARGS_ASSERT_MY_LOCALECONV;
3161 /*--------------------------------------------------------------------------*/
3162 /* Here, we are done with the common beginning of all the implementations of
3163  * my_localeconv().  Below are the various terminations of the function (except
3164  * the closing '}'.  They are separated out because the preprocessor directives
3165  * were making the simple logic hard to follow.  Each implementation ends with
3166  * the same few lines.  khw decided to keep those separate because he thought
3167  * it was clearer to the reader.
3168  *
3169  * The first distinct termination (of the above common code) are the
3170  * implementations when we have locale_conv_l() and can use it.  These are the
3171  * simplest cases, without any locking needed. */
3172 #  if defined(USE_POSIX_2008_LOCALE) && defined(HAS_LOCALECONV_L)
3173
3174      /* And there are two sub-cases: First (by far the most common) is where we
3175       * are compiled to pay attention to LC_NUMERIC */
3176 #    ifdef USE_LOCALE_NUMERIC
3177
3178     const locale_t cur = use_curlocale_scratch();
3179     locale_t with_numeric = duplocale(cur);
3180
3181     /* Just create a new locale object with what we've got, but using the
3182      * underlying LC_NUMERIC locale */
3183     with_numeric = newlocale(LC_NUMERIC_MASK, PL_numeric_name, with_numeric);
3184
3185     retval = copy_localeconv(aTHX_ localeconv_l(with_numeric),
3186                                    item,
3187                                    numeric_locale_is_utf8,
3188                                    monetary_locale_is_utf8);
3189     freelocale(with_numeric);
3190
3191     return retval;
3192
3193 /*--------------------------------------------------------------------------*/
3194 #    else   /* Below not paying attention to LC_NUMERIC */
3195
3196     const locale_t cur = use_curlocale_scratch();
3197
3198     retval = copy_localeconv(aTHX_ localeconv_l(cur),
3199                                    item,
3200                                    numeric_locale_is_utf8,
3201                                    monetary_locale_is_utf8);
3202     return retval;
3203
3204 #    endif  /* Above, using lconv_l(); below plain lconv() */
3205 /*--------------------------------------------------------------------------*/
3206 #  elif ! defined(TS_W32_BROKEN_LOCALECONV)  /* Next is regular lconv() */
3207
3208     /* There are so many locks because localeconv() deals with two
3209      * categories, and returns in a single global static buffer.  Some
3210      * locks might be no-ops on this platform, but not others.  We need to
3211      * lock if any one isn't a no-op. */
3212
3213 #    ifdef USE_LOCALE_NUMERIC
3214
3215     LC_NUMERIC_LOCK(0);
3216     const char * orig_switched_locale = NULL;
3217
3218     /* When called internally, are already switched into the proper numeric
3219      * locale; otherwise must toggle to it */
3220     if (is_localeconv_call) {
3221         orig_switched_locale = toggle_locale_c(LC_NUMERIC, PL_numeric_name);
3222     }
3223
3224 #    endif
3225
3226     gwLOCALE_LOCK;
3227     retval = copy_localeconv(aTHX_ localeconv(),
3228                                    item,
3229                                    numeric_locale_is_utf8,
3230                                    monetary_locale_is_utf8);
3231     gwLOCALE_UNLOCK;
3232
3233 #    ifdef USE_LOCALE_NUMERIC
3234
3235     if (orig_switched_locale) {
3236         restore_toggled_locale_c(LC_NUMERIC, orig_switched_locale);
3237     }
3238     LC_NUMERIC_UNLOCK;
3239
3240 #    endif
3241
3242     return retval;
3243
3244 /*--------------------------------------------------------------------------*/
3245 #  else /* defined(TS_W32_BROKEN_LOCALECONV) */
3246
3247     /* Last is a workaround for the broken localeconv() on Windows with
3248      * thread-safe locales prior to VS 15.  It looks at the global locale
3249      * instead of the thread one.  As a work-around, we toggle to the global
3250      * locale; populate the return; then toggle back.  We have to use LC_ALL
3251      * instead of the individual categories because of another bug in Windows.
3252      *
3253      * This introduces a potential race with any other thread that has also
3254      * converted to use the global locale, and doesn't protect its locale calls
3255      * with mutexes.  khw can't think of any reason for a thread to do so on
3256      * Windows, as the locale API is the same regardless of thread-safety, except
3257      * if the code is ported from working on another platform where there might
3258      * be some reason to do this.  But this is typically due to some
3259      * alien-to-Perl library that thinks it owns locale setting.  Such a
3260      * library usn't likely to exist on Windows, so such an application is
3261      * unlikely to be run on Windows
3262      */
3263     bool restore_per_thread = FALSE;
3264
3265 #    ifdef USE_LOCALE_NUMERIC
3266
3267     const char * orig_switched_locale = NULL;
3268
3269     LC_NUMERIC_LOCK(0);
3270
3271     /* When called internally, are already switched into the proper numeric
3272      * locale; otherwise must toggle to it */
3273     if (is_localeconv_call) {
3274         orig_switched_locale = toggle_locale_c(LC_NUMERIC, PL_numeric_name);
3275     }
3276
3277 #    endif
3278
3279     /* Save the per-thread locale state */
3280     const char * save_thread = querylocale_c(LC_ALL);
3281
3282     /* Change to the global locale, and note if we already were there */
3283     if (_configthreadlocale(_DISABLE_PER_THREAD_LOCALE)
3284                          != _DISABLE_PER_THREAD_LOCALE)
3285     {
3286         restore_per_thread = TRUE;
3287     }
3288
3289     /* Save the state of the global locale; then convert to our desired
3290      * state.  */
3291     const char * save_global = querylocale_c(LC_ALL);
3292     void_setlocale_c(LC_ALL, save_thread);
3293
3294     /* Safely stash the desired data */
3295     gwLOCALE_LOCK;
3296     retval = copy_localeconv(aTHX_ localeconv(),
3297                                    item,
3298                                    numeric_locale_is_utf8,
3299                                    monetary_locale_is_utf8);
3300     gwLOCALE_UNLOCK;
3301
3302     /* Restore the global locale's prior state */
3303     void_setlocale_c(LC_ALL, save_global);
3304
3305     /* And back to per-thread locales */
3306     if (restore_per_thread) {
3307         _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
3308     }
3309
3310     /* Restore the per-thread locale state */
3311     void_setlocale_c(LC_ALL, save_thread);
3312
3313 #    ifdef USE_LOCALE_NUMERIC
3314
3315     if (orig_switched_locale) {
3316         restore_toggled_locale_c(LC_NUMERIC, orig_switched_locale);
3317     }
3318     LC_NUMERIC_UNLOCK;
3319
3320 #    endif
3321
3322     return retval;
3323
3324 #  endif
3325 /*--------------------------------------------------------------------------*/
3326 }
3327
3328 STATIC HV *
3329 S_populate_localeconv(pTHX_ const struct lconv *lcbuf,
3330                             const int unused,
3331                             const locale_utf8ness_t numeric_locale_is_utf8,
3332                             const locale_utf8ness_t monetary_locale_is_utf8)
3333 {
3334     /* This returns a mortalized hash containing all the elements returned by
3335      * localeconv().  It is used by Perl_localeconv() and POSIX::localeconv()
3336      */
3337     PERL_UNUSED_ARG(unused);
3338
3339     struct lconv_offset {
3340         const char *name;
3341         size_t offset;
3342     };
3343
3344     /* Create e.g.,
3345         {"thousands_sep", STRUCT_OFFSET(struct lconv, thousands_sep)},
3346      */
3347 #  define LCONV_ENTRY(name)                                         \
3348             {STRINGIFY(name), STRUCT_OFFSET(struct lconv, name)}
3349
3350     /* Set up structures containing the documented fields.  One structure for
3351      * LC_NUMERIC-controlled strings; one for LC_MONETARY ones, and a final one
3352      * of just numerics. */
3353 #  ifdef USE_LOCALE_NUMERIC
3354
3355     static const struct lconv_offset lconv_numeric_strings[] = {
3356         LCONV_ENTRY(decimal_point),
3357         LCONV_ENTRY(thousands_sep),
3358 #    ifndef NO_LOCALECONV_GROUPING
3359         LCONV_ENTRY(grouping),
3360 #    endif
3361         {NULL, 0}
3362     };
3363
3364 #  endif
3365 #  ifdef USE_LOCALE_MONETARY
3366
3367     static const struct lconv_offset lconv_monetary_strings[] = {
3368         LCONV_ENTRY(int_curr_symbol),
3369         LCONV_ENTRY(currency_symbol),
3370         LCONV_ENTRY(mon_decimal_point),
3371 #    ifndef NO_LOCALECONV_MON_THOUSANDS_SEP
3372         LCONV_ENTRY(mon_thousands_sep),
3373 #    endif
3374 #    ifndef NO_LOCALECONV_MON_GROUPING
3375         LCONV_ENTRY(mon_grouping),
3376 #    endif
3377         LCONV_ENTRY(positive_sign),
3378         LCONV_ENTRY(negative_sign),
3379         {NULL, 0}
3380     };
3381
3382 #  endif
3383
3384     static const struct lconv_offset lconv_integers[] = {
3385 #  ifdef USE_LOCALE_MONETARY
3386         LCONV_ENTRY(int_frac_digits),
3387         LCONV_ENTRY(frac_digits),
3388         LCONV_ENTRY(p_cs_precedes),
3389         LCONV_ENTRY(p_sep_by_space),
3390         LCONV_ENTRY(n_cs_precedes),
3391         LCONV_ENTRY(n_sep_by_space),
3392         LCONV_ENTRY(p_sign_posn),
3393         LCONV_ENTRY(n_sign_posn),
3394 #    ifdef HAS_LC_MONETARY_2008
3395         LCONV_ENTRY(int_p_cs_precedes),
3396         LCONV_ENTRY(int_p_sep_by_space),
3397         LCONV_ENTRY(int_n_cs_precedes),
3398         LCONV_ENTRY(int_n_sep_by_space),
3399         LCONV_ENTRY(int_p_sign_posn),
3400         LCONV_ENTRY(int_n_sign_posn),
3401 #    endif
3402 #  endif
3403         {NULL, 0}
3404     };
3405
3406     static const unsigned category_indices[] = {
3407 #  ifdef USE_LOCALE_NUMERIC
3408         LC_NUMERIC_INDEX_,
3409 #  endif
3410 #  ifdef USE_LOCALE_MONETARY
3411         LC_MONETARY_INDEX_,
3412 #  endif
3413         (unsigned) -1   /* Just so the previous element can always end with a
3414                            comma => subtract 1 below for the max loop index */
3415     };
3416
3417     const char *ptr = (const char *) lcbuf;
3418     const struct lconv_offset *integers = lconv_integers;
3419
3420     HV * retval = newHV();
3421     sv_2mortal((SV*)retval);
3422
3423     PERL_ARGS_ASSERT_POPULATE_LOCALECONV;
3424
3425     /* For each enabled category ... */
3426     for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(category_indices) - 1; i++) {
3427         const unsigned cat_index = category_indices[i];
3428         locale_utf8ness_t locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
3429         const char *locale;
3430
3431         /* ( = NULL silences a compiler warning; would segfault if it could
3432          * actually happen.) */
3433         const struct lconv_offset *strings = NULL;
3434
3435 #  ifdef USE_LOCALE_NUMERIC
3436         if (cat_index == LC_NUMERIC_INDEX_) {
3437             locale_is_utf8 = numeric_locale_is_utf8;
3438             strings = lconv_numeric_strings;
3439         }
3440 #  else
3441         PERL_UNUSED_ARG(numeric_locale_is_utf8);
3442 #  endif
3443 #  ifdef USE_LOCALE_MONETARY
3444         if (cat_index == LC_MONETARY_INDEX_) {
3445             locale_is_utf8 = monetary_locale_is_utf8;
3446             strings = lconv_monetary_strings;
3447         }
3448 #  else
3449         PERL_UNUSED_ARG(monetary_locale_is_utf8);
3450 #  endif
3451
3452         assert(locale_is_utf8 != LOCALE_UTF8NESS_UNKNOWN);
3453
3454         /* Iterate over the strings structure for this category */
3455         locale = querylocale_i(cat_index);
3456
3457         while (strings->name) {
3458             const char *value = *((const char **)(ptr + strings->offset));
3459             if (value && *value) {
3460                 bool is_utf8 =  /* Only make UTF-8 if required to */
3461                     (UTF8NESS_YES == (get_locale_string_utf8ness_i(locale,
3462                                                               cat_index,
3463                                                               value,
3464                                                               locale_is_utf8)));
3465                 (void) hv_store(retval,
3466                                 strings->name,
3467                                 strlen(strings->name),
3468                                 newSVpvn_utf8(value, strlen(value), is_utf8),
3469                                 0);
3470             }
3471
3472             strings++;
3473         }
3474     }
3475
3476     while (integers->name) {
3477         const char value = *((const char *)(ptr + integers->offset));
3478
3479         if (value != CHAR_MAX)
3480             (void) hv_store(retval, integers->name,
3481                             strlen(integers->name), newSViv(value), 0);
3482         integers++;
3483     }
3484
3485     return retval;
3486 }
3487
3488 #  ifndef HAS_SOME_LANGINFO
3489
3490 STATIC HV *
3491 S_get_nl_item_from_localeconv(pTHX_ const struct lconv *lcbuf,
3492                                     const int item,
3493                                     const locale_utf8ness_t unused1,
3494                                     const locale_utf8ness_t unused2)
3495 {
3496     /* This is a helper function for my_localeconv(), which is called from
3497      * my_langinfo() to emulate the libc nl_langinfo() function on platforms
3498      * that don't have it available.
3499      *
3500      * This function acts as an extension to my_langinfo(), the intermediate
3501      * my_localeconv() call is to set up the locks and switch into the proper
3502      * locale.  That logic exists for other reasons, and by doing it this way,
3503      * it doesn't have to be duplicated.
3504      *
3505      * This function extracts the current value of 'item' in the current locale
3506      * using the localconv() result also passed in, via 'lcbuf'.  The other
3507      * parameter is unused, a placeholder so the signature of this function
3508      * matches another that does need it, and so the two functions can be
3509      * referred to by a single function pointer, to simplify the code below */
3510
3511     const char * prefix = "";
3512     const char * temp = NULL;
3513
3514     PERL_ARGS_ASSERT_GET_NL_ITEM_FROM_LOCALECONV;
3515     PERL_UNUSED_ARG(unused1);
3516     PERL_UNUSED_ARG(unused2);
3517
3518     switch (item) {
3519       case CRNCYSTR:
3520         temp = lcbuf->currency_symbol;
3521
3522         if (lcbuf->p_cs_precedes) {
3523
3524             /* khw couldn't find any documentation that CHAR_MAX is the signal,
3525              * but cygwin uses it thusly */
3526             if (lcbuf->p_cs_precedes == CHAR_MAX) {
3527                 prefix = ".";
3528             }
3529             else {
3530                 prefix = "-";
3531             }
3532         }
3533         else {
3534             prefix = "+";
3535         }
3536
3537         break;
3538
3539       case RADIXCHAR:
3540         temp = lcbuf->decimal_point;
3541         break;
3542
3543       case THOUSEP:
3544         temp = lcbuf->thousands_sep;
3545         break;
3546
3547       default:
3548         locale_panic_(Perl_form(aTHX_
3549                     "Unexpected item passed to populate_localeconv: %d", item));
3550     }
3551
3552     return (HV *) Perl_newSVpvf(aTHX_ "%s%s", prefix, temp);
3553 }
3554
3555 #  endif    /* ! Has some form of langinfo() */
3556 #endif      /*   Has some form of localeconv() and paying attn to a category it
3557                  traffics in */
3558
3559 #ifndef HAS_SOME_LANGINFO
3560
3561 typedef int nl_item;    /* Substitute 'int' for emulated nl_langinfo() */
3562
3563 #endif
3564
3565 /*
3566
3567 =for apidoc      Perl_langinfo
3568 =for apidoc_item Perl_langinfo8
3569
3570 C<Perl_langinfo> is an (almost) drop-in replacement for the system
3571 C<L<nl_langinfo(3)>>, taking the same C<item> parameter values, and returning
3572 the same information.  But it is more thread-safe than regular
3573 C<nl_langinfo()>, and hides the quirks of Perl's locale handling from your
3574 code, and can be used on systems that lack a native C<nl_langinfo>.
3575
3576 However, you should instead use the improved version of this:
3577 L</Perl_langinfo8>, which behaves identically except for an additional
3578 parameter, a pointer to a variable declared as L</C<utf8ness_t>>, into which it
3579 returns to you how you should treat the returned string with regards to it
3580 being encoded in UTF-8 or not.
3581
3582 Concerning the differences between these and plain C<nl_langinfo()>:
3583
3584 =over
3585
3586 =item a.
3587
3588 C<Perl_langinfo8> has an extra parameter, described above.  Besides this, the
3589 other reasons they aren't quite a drop-in replacement is actually an advantage.
3590 The C<const>ness of the return allows the compiler to catch attempts to write
3591 into the returned buffer, which is illegal and could cause run-time crashes.
3592
3593 =item b.
3594
3595 They deliver the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
3596 without you having to write extra code.  The reason for the extra code would be
3597 because these are from the C<LC_NUMERIC> locale category, which is normally
3598 kept set by Perl so that the radix is a dot, and the separator is the empty
3599 string, no matter what the underlying locale is supposed to be, and so to get
3600 the expected results, you have to temporarily toggle into the underlying
3601 locale, and later toggle back.  (You could use plain C<nl_langinfo> and
3602 C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this but then you wouldn't get
3603 the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
3604 (or equivalent) locale would break a lot of CPAN, which is expecting the radix
3605 (decimal point) character to be a dot.)
3606
3607 =item c.
3608
3609 The system function they replace can have its static return buffer trashed,
3610 not only by a subsequent call to that function, but by a C<freelocale>,
3611 C<setlocale>, or other locale change.  The returned buffer of these functions
3612 is not changed until the next call to one or the other, so the buffer is never
3613 in a trashed state.
3614
3615 =item d.
3616
3617 The return buffer is per-thread, so it also is never overwritten by a call to
3618 these functions from another thread;  unlike the function it replaces.
3619
3620 =item e.
3621
3622 But most importantly, they work on systems that don't have C<nl_langinfo>, such
3623 as Windows, hence making your code more portable.  Of the fifty-some possible
3624 items specified by the POSIX 2008 standard,
3625 L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
3626 only one is completely unimplemented, though on non-Windows platforms, another
3627 significant one is not fully implemented).  They use various techniques to
3628 recover the other items, including calling C<L<localeconv(3)>>, and
3629 C<L<strftime(3)>>, both of which are specified in C89, so should be always be
3630 available.  Later C<strftime()> versions have additional capabilities; C<""> is
3631 returned for any item not available on your system.
3632
3633 It is important to note that, when called with an item that is recovered by
3634 using C<localeconv>, the buffer from any previous explicit call to
3635 C<L<localeconv(3)>> will be overwritten.  But you shouldn't be using
3636 C<localeconv> anyway because it is is very much not thread-safe, and suffers
3637 from the same problems outlined in item 'b.' above for the fields it returns that
3638 are controlled by the LC_NUMERIC locale category.  Instead, avoid all of those
3639 problems by calling L</Perl_localeconv>, which is thread-safe; or by using the
3640 methods given in L<perlcall>  to call
3641 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is also thread-safe.
3642
3643 =back
3644
3645 The details for those items which may deviate from what this emulation returns
3646 and what a native C<nl_langinfo()> would return are specified in
3647 L<I18N::Langinfo>.
3648
3649 When using C<Perl_langinfo8> (or plain C<Perl_langinfo>) on systems that don't
3650 have a native C<nl_langinfo()>, you must
3651
3652  #include "perl_langinfo.h"
3653
3654 before the C<perl.h> C<#include>.  You can replace your C<langinfo.h>
3655 C<#include> with this one.  (Doing it this way keeps out the symbols that plain
3656 C<langinfo.h> would try to import into the namespace for code that doesn't need
3657 it.)
3658
3659 =cut
3660
3661 */
3662
3663 const char *
3664 Perl_langinfo(const nl_item item)
3665 {
3666     return Perl_langinfo8(item, NULL);
3667 }
3668
3669 const char *
3670 Perl_langinfo8(const nl_item item, utf8ness_t * utf8ness)
3671 {
3672     dTHX;
3673     unsigned cat_index;
3674
3675     PERL_ARGS_ASSERT_PERL_LANGINFO8;
3676
3677     if (utf8ness) {     /* Assume for now */
3678         *utf8ness = UTF8NESS_IMMATERIAL;
3679     }
3680
3681     /* Find the locale category that controls the input 'item'.  If we are not
3682      * paying attention to that category, instead return a default value.  Also
3683      * return the default value if there is no way for us to figure out the
3684      * correct value.  If we have some form of nl_langinfo(), we can always
3685      * figure it out, but lacking that, there may be alternative methods that
3686      * can be used to recover most of the possible items.  Some of those
3687      * methods need libc functions, which may or may not be available.  If
3688      * unavailable, we can't compute the correct value, so must here return the
3689      * default. */
3690     switch (item) {
3691
3692       case CODESET:
3693
3694 #ifdef USE_LOCALE_CTYPE
3695
3696         cat_index = LC_CTYPE_INDEX_;
3697         break;
3698
3699 #else
3700         return C_codeset;
3701 #endif
3702 #if defined(USE_LOCALE_MESSAGES) && defined(HAS_SOME_LANGINFO)
3703
3704       case YESEXPR: case YESSTR: case NOEXPR: case NOSTR:
3705         cat_index = LC_MESSAGES_INDEX_;
3706         break;
3707 #else
3708       case YESEXPR:   return "^[+1yY]";
3709       case YESSTR:    return "yes";
3710       case NOEXPR:    return "^[-0nN]";
3711       case NOSTR:     return "no";
3712 #endif
3713
3714       case CRNCYSTR:
3715
3716 #if  defined(USE_LOCALE_MONETARY)                                   \
3717  && (defined(HAS_SOME_LANGINFO) || defined(HAS_SOME_LOCALECONV))
3718
3719         cat_index = LC_MONETARY_INDEX_;
3720         break;
3721 #else
3722         return "-";
3723 #endif
3724
3725       case RADIXCHAR:
3726
3727 #ifdef CAN_CALCULATE_RADIX
3728
3729         cat_index = LC_NUMERIC_INDEX_;
3730         break;
3731 #else
3732         return C_decimal_point;
3733 #endif
3734
3735       case THOUSEP:
3736
3737 #if  defined(USE_LOCALE_NUMERIC)                                    \
3738  && (defined(HAS_SOME_LANGINFO) || defined(HAS_SOME_LOCALECONV))
3739
3740         cat_index = LC_NUMERIC_INDEX_;
3741         break;
3742 #else
3743         return C_thousands_sep;
3744 #endif
3745
3746 /* The other possible items are all in LC_TIME. */
3747 #ifdef USE_LOCALE_TIME
3748
3749       default:
3750         cat_index = LC_TIME_INDEX_;
3751         break;
3752
3753 #endif
3754 #if ! defined(USE_LOCALE_TIME) || ! defined(HAS_SOME_LANGINFO)
3755
3756     /* If not using LC_TIME, hard code the rest.  Or, if there is no
3757      * nl_langinfo(), we use strftime() as an alternative, and it is missing
3758      * functionality to get every single one, so hard-code those */
3759
3760       case ERA: return "";  /* Unimplemented; for use with strftime() %E
3761                                modifier */
3762
3763       /* These formats are defined by C89, so we assume that strftime supports
3764        * them, and so are returned unconditionally; they may not be what the
3765        * locale actually says, but should give good enough results for someone
3766        * using them as formats (as opposed to trying to parse them to figure
3767        * out what the locale says).  The other format items are actually tested
3768        * to verify they work on the platform */
3769       case D_FMT:         return "%x";
3770       case T_FMT:         return "%X";
3771       case D_T_FMT:       return "%c";
3772
3773 #  if defined(WIN32) || ! defined(USE_LOCALE_TIME)
3774
3775       /* strftime() on Windows doesn't have the POSIX (beyond C89) extensions
3776        * that would allow it to recover these */
3777       case ERA_D_FMT:     return "%x";
3778       case ERA_T_FMT:     return "%X";
3779       case ERA_D_T_FMT:   return "%c";
3780       case ALT_DIGITS:    return "0";
3781
3782 #  endif
3783 #  ifndef USE_LOCALE_TIME
3784
3785       case T_FMT_AMPM:    return "%r";
3786       case ABDAY_1:       return "Sun";
3787       case ABDAY_2:       return "Mon";
3788       case ABDAY_3:       return "Tue";
3789       case ABDAY_4:       return "Wed";
3790       case ABDAY_5:       return "Thu";
3791       case ABDAY_6:       return "Fri";
3792       case ABDAY_7:       return "Sat";
3793       case AM_STR:        return "AM";
3794       case PM_STR:        return "PM";
3795       case ABMON_1:       return "Jan";
3796       case ABMON_2:       return "Feb";
3797       case ABMON_3:       return "Mar";
3798       case ABMON_4:       return "Apr";
3799       case ABMON_5:       return "May";
3800       case ABMON_6:       return "Jun";
3801       case ABMON_7:       return "Jul";
3802       case ABMON_8:       return "Aug";
3803       case ABMON_9:       return "Sep";
3804       case ABMON_10:      return "Oct";
3805       case ABMON_11:      return "Nov";
3806       case ABMON_12:      return "Dec";
3807       case DAY_1:         return "Sunday";
3808       case DAY_2:         return "Monday";
3809       case DAY_3:         return "Tuesday";
3810       case DAY_4:         return "Wednesday";
3811       case DAY_5:         return "Thursday";
3812       case DAY_6:         return "Friday";
3813       case DAY_7:         return "Saturday";
3814       case MON_1:         return "January";
3815       case MON_2:         return "February";
3816       case MON_3:         return "March";
3817       case MON_4:         return "April";
3818       case MON_5:         return "May";
3819       case MON_6:         return "June";
3820       case MON_7:         return "July";
3821       case MON_8:         return "August";
3822       case MON_9:         return "September";
3823       case MON_10:        return "October";
3824       case MON_11:        return "November";
3825       case MON_12:        return "December";
3826
3827 #  endif
3828 #endif
3829
3830     } /* End of switch on item */
3831
3832 #ifndef USE_LOCALE
3833
3834     Perl_croak_nocontext("panic: Unexpected nl_langinfo() item %d", item);
3835     NOT_REACHED; /* NOTREACHED */
3836     PERL_UNUSED_VAR(cat_index);
3837
3838 #else
3839 #  ifdef USE_LOCALE_NUMERIC
3840
3841     /* Use either the underlying numeric, or the other underlying categories */
3842     if (cat_index == LC_NUMERIC_INDEX_) {
3843         return my_langinfo_c(item, LC_NUMERIC, PL_numeric_name,
3844                              &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
3845     }
3846     else
3847
3848 #  endif
3849
3850     {
3851         return my_langinfo_i(item, cat_index, querylocale_i(cat_index),
3852                              &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
3853     }
3854
3855 #endif
3856
3857 }
3858
3859 #ifdef USE_LOCALE
3860
3861 /* There are several implementations of my_langinfo, depending on the
3862  * Configuration.  They all share the same beginning of the function */
3863 STATIC const char *
3864 S_my_langinfo_i(pTHX_
3865                 const nl_item item,           /* The item to look up */
3866                 const unsigned int cat_index, /* The locale category that
3867                                                  controls it */
3868                 /* The locale to look up 'item' in. */
3869                 const char * locale,
3870
3871                 /* Where to store the result, and where the size of that buffer
3872                  * is stored, updated on exit. retbuf_sizep may be NULL for an
3873                  * empty-on-entry, single use buffer whose size we don't need
3874                  * to keep track of */
3875                 const char ** retbufp,
3876                 Size_t * retbuf_sizep,
3877
3878                 /* If not NULL, the location to store the UTF8-ness of 'item's
3879                  * value, as documented */
3880                 utf8ness_t * utf8ness)
3881 {
3882     const char * retval = NULL;
3883
3884     PERL_ARGS_ASSERT_MY_LANGINFO_I;
3885     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
3886
3887     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3888                            "Entering my_langinfo item=%d, using locale %s\n",
3889                            item, locale));
3890 /*--------------------------------------------------------------------------*/
3891 /* Above is the common beginning to all the implementations of my_langinfo().
3892  * Below are the various completions.
3893  *
3894  * Some platforms don't deal well with non-ASCII strings in locale X when
3895  * LC_CTYPE is not in X.  (Actually it is probably when X is UTF-8 and LC_CTYPE
3896  * isn't, or vice versa).  There is explicit code to bring the categories into
3897  * sync.  This doesn't seem to be a problem with nl_langinfo(), so that
3898  * implementation doesn't currently worry about it.  But it is a problem on
3899  * Windows boxes, which don't have nl_langinfo(). */
3900
3901 #  if defined(HAS_THREAD_SAFE_NL_LANGINFO_L) && defined(USE_POSIX_2008_LOCALE)
3902
3903     /* Simplest is if we can use nl_langinfo_l()
3904      *
3905      * With it, we can change LC_CTYPE in the same call as the other category */
3906 #    ifdef USE_LOCALE_CTYPE
3907 #      define CTYPE_SAFETY_MASK LC_CTYPE_MASK
3908 #    else
3909 #      define CTYPE_SAFETY_MASK 0
3910 #    endif
3911
3912     locale_t cur = newlocale((category_masks[cat_index] | CTYPE_SAFETY_MASK),
3913                              locale, (locale_t) 0);
3914
3915     retval = save_to_buffer(nl_langinfo_l(item, cur), retbufp, retbuf_sizep);
3916     if (utf8ness) {
3917         *utf8ness = get_locale_string_utf8ness_i(locale, cat_index, retval,
3918                                                  LOCALE_UTF8NESS_UNKNOWN);
3919     }
3920
3921     freelocale(cur);
3922
3923     return retval;
3924 /*--------------------------------------------------------------------------*/
3925 #  elif defined(HAS_NL_LANGINFO) /* nl_langinfo() is available.  */
3926
3927     /* The second version of my_langinfo() is if we have plain nl_langinfo() */
3928
3929 #    ifdef USE_LOCALE_CTYPE
3930
3931     /* Ths function sorts out if things actually have to be switched or not,
3932      * for both calls. */
3933     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
3934
3935 #    endif
3936
3937     const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
3938
3939     gwLOCALE_LOCK;
3940     retval = save_to_buffer(nl_langinfo(item), retbufp, retbuf_sizep);
3941     gwLOCALE_UNLOCK;
3942
3943     if (utf8ness) {
3944         *utf8ness = get_locale_string_utf8ness_i(locale, cat_index,
3945                                                retval, LOCALE_UTF8NESS_UNKNOWN);
3946     }
3947
3948     restore_toggled_locale_i(cat_index, orig_switched_locale);
3949
3950 #    ifdef USE_LOCALE_CTYPE
3951     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
3952 #    endif
3953
3954     return retval;
3955 /*--------------------------------------------------------------------------*/
3956 #  else   /* Below, emulate nl_langinfo as best we can */
3957
3958     /* And the third and final completion is where we have to emulate
3959      * nl_langinfo().  There are various possibilities depending on the
3960      * Configuration */
3961
3962 #    ifdef USE_LOCALE_CTYPE
3963
3964     const char * orig_CTYPE_locale =  toggle_locale_c(LC_CTYPE, locale);
3965
3966 #    endif
3967
3968     const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
3969
3970     /* Here, we are in the locale we want information about */
3971
3972     /* Almost all the items will have ASCII return values.  Set that here, and
3973      * override if necessary */
3974     utf8ness_t is_utf8 = UTF8NESS_IMMATERIAL;
3975
3976     switch (item) {
3977       default:
3978         assert(item < 0);   /* Make sure using perl_langinfo.h */
3979         retval = "";
3980         break;
3981
3982       case RADIXCHAR:
3983
3984 #    if      defined(HAS_SNPRINTF)                                              \
3985        && (! defined(HAS_SOME_LOCALECONV) || defined(TS_W32_BROKEN_LOCALECONV))
3986
3987         {
3988             /* snprintf() can be used to find the radix character by outputting
3989              * a known simple floating point number to a buffer, and parsing
3990              * it, inferring the radix as the bytes separating the integer and
3991              * fractional parts.  But localeconv() is more direct, not
3992              * requiring inference, so use it instead of the code just below,
3993              * if (likely) it is available and works ok */
3994
3995             char * floatbuf = NULL;
3996             const Size_t initial_size = 10;
3997
3998             Newx(floatbuf, initial_size, char);
3999
4000             /* 1.5 is exactly representable on binary computers */
4001             Size_t needed_size = snprintf(floatbuf, initial_size, "%.1f", 1.5);
4002
4003             /* If our guess wasn't big enough, increase and try again, based on
4004              * the real number that strnprintf() is supposed to return */
4005             if (UNLIKELY(needed_size >= initial_size)) {
4006                 needed_size++;  /* insurance */
4007                 Renew(floatbuf, needed_size, char);
4008                 Size_t new_needed = snprintf(floatbuf, needed_size, "%.1f", 1.5);
4009                 assert(new_needed <= needed_size);
4010                 needed_size = new_needed;
4011             }
4012
4013             char * s = floatbuf;
4014             char * e = floatbuf + needed_size;
4015
4016             /* Find the '1' */
4017             while (s < e && *s != '1') {
4018                 s++;
4019             }
4020
4021             if (LIKELY(s < e)) {
4022                 s++;
4023             }
4024
4025             /* Find the '5' */
4026             char * item_start = s;
4027             while (s < e && *s != '5') {
4028                 s++;
4029             }
4030
4031             /* Everything in between is the radix string */
4032             if (LIKELY(s < e)) {
4033                 *s = '\0';
4034                 retval = save_to_buffer(item_start,
4035                                         (const char **) &PL_langinfo_buf,
4036                                         &PL_langinfo_bufsize);
4037                 Safefree(floatbuf);
4038
4039                 if (utf8ness) {
4040                     is_utf8 = get_locale_string_utf8ness_i(locale, cat_index,
4041                                                            retval,
4042                                                        LOCALE_UTF8NESS_UNKNOWN);
4043
4044                 }
4045
4046                 break;
4047             }
4048
4049             Safefree(floatbuf);
4050         }
4051
4052 #      ifdef HAS_SOME_LOCALECONV /* snprintf() failed; drop down to use
4053                                     localeconv() */
4054
4055         /* FALLTHROUGH */                                                           \
4056
4057 #      else                      /* snprintf() failed and no localeconv() */
4058
4059         retval = C_decimal_point;
4060         break;
4061
4062 #      endif
4063 #    endif
4064 #    ifdef HAS_SOME_LOCALECONV
4065
4066     /* These items are available from localeconv().  (To avoid using
4067      * TS_W32_BROKEN_LOCALECONV, one could use GetNumberFormat and
4068      * GetCurrencyFormat; patches welcome) */
4069
4070       case CRNCYSTR:
4071       case THOUSEP:
4072         {
4073             SV * string = (SV *) my_localeconv(item, LOCALE_UTF8NESS_UNKNOWN);
4074
4075             retval = save_to_buffer(SvPV_nolen(string), retbufp, retbuf_sizep);
4076
4077             if (utf8ness) {
4078                 is_utf8 = get_locale_string_utf8ness_i(locale, cat_index, retval,
4079                                                        LOCALE_UTF8NESS_UNKNOWN);
4080             }
4081
4082             SvREFCNT_dec_NN(string);
4083             break;
4084         }
4085
4086 #    endif  /* Some form of localeconv */
4087 #    ifdef HAS_STRFTIME
4088
4089       /* These formats are only available in later strfmtime's */
4090       case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM:
4091
4092       /* The rest can be gotten from most versions of strftime(). */
4093       case ABDAY_1: case ABDAY_2: case ABDAY_3:
4094       case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7:
4095       case ALT_DIGITS:
4096       case AM_STR: case PM_STR:
4097       case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4:
4098       case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8:
4099       case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12:
4100       case DAY_1: case DAY_2: case DAY_3: case DAY_4:
4101       case DAY_5: case DAY_6: case DAY_7:
4102       case MON_1: case MON_2: case MON_3: case MON_4:
4103       case MON_5: case MON_6: case MON_7: case MON_8:
4104       case MON_9: case MON_10: case MON_11: case MON_12:
4105         {
4106             const char * format;
4107             bool return_format = FALSE;
4108             int mon = 0;
4109             int mday = 1;
4110             int hour = 6;
4111
4112             GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
4113
4114             switch (item) {
4115               default:
4116                 locale_panic_(Perl_form(aTHX_ "switch case: %d problem", item));
4117                 NOT_REACHED; /* NOTREACHED */
4118
4119               case PM_STR: hour = 18;
4120               case AM_STR:
4121                 format = "%p";
4122                 break;
4123               case ABDAY_7: mday++;
4124               case ABDAY_6: mday++;
4125               case ABDAY_5: mday++;
4126               case ABDAY_4: mday++;
4127               case ABDAY_3: mday++;
4128               case ABDAY_2: mday++;
4129               case ABDAY_1:
4130                 format = "%a";
4131                 break;
4132               case DAY_7: mday++;
4133               case DAY_6: mday++;
4134               case DAY_5: mday++;
4135               case DAY_4: mday++;
4136               case DAY_3: mday++;
4137               case DAY_2: mday++;
4138               case DAY_1:
4139                 format = "%A";
4140                 break;
4141               case ABMON_12: mon++;
4142               case ABMON_11: mon++;
4143               case ABMON_10: mon++;
4144               case ABMON_9:  mon++;
4145               case ABMON_8:  mon++;
4146               case ABMON_7:  mon++;
4147               case ABMON_6:  mon++;
4148               case ABMON_5:  mon++;
4149               case ABMON_4:  mon++;
4150               case ABMON_3:  mon++;
4151               case ABMON_2:  mon++;
4152               case ABMON_1:
4153                 format = "%b";
4154                 break;
4155               case MON_12: mon++;
4156               case MON_11: mon++;
4157               case MON_10: mon++;
4158               case MON_9:  mon++;
4159               case MON_8:  mon++;
4160               case MON_7:  mon++;
4161               case MON_6:  mon++;
4162               case MON_5:  mon++;
4163               case MON_4:  mon++;
4164               case MON_3:  mon++;
4165               case MON_2:  mon++;
4166               case MON_1:
4167                 format = "%B";
4168                 break;
4169               case T_FMT_AMPM:
4170                 format = "%r";
4171                 return_format = TRUE;
4172                 break;
4173               case ERA_D_FMT:
4174                 format = "%Ex";
4175                 return_format = TRUE;
4176                 break;
4177               case ERA_T_FMT:
4178                 format = "%EX";
4179                 return_format = TRUE;
4180                 break;
4181               case ERA_D_T_FMT:
4182                 format = "%Ec";
4183                 return_format = TRUE;
4184                 break;
4185               case ALT_DIGITS:
4186                 format = "%Ow"; /* Find the alternate digit for 0 */
4187                 break;
4188             }
4189
4190             GCC_DIAG_RESTORE_STMT;
4191
4192             /* The year was deliberately chosen so that January 1 is on the
4193              * first day of the week.  Since we're only getting one thing at a
4194              * time, it all works */
4195             const char * temp = my_strftime8(format, 30, 30, hour, mday, mon,
4196                                              2011, 0, 0, 0, &is_utf8);
4197             retval = save_to_buffer(temp, retbufp, retbuf_sizep);
4198             Safefree(temp);
4199
4200             /* If the item is 'ALT_DIGITS', '*retbuf' contains the alternate
4201              * format for wday 0.  If the value is the same as the normal 0,
4202              * there isn't an alternate, so clear the buffer.
4203              *
4204              * (wday was chosen because its range is all a single digit.
4205              * Things like tm_sec have two digits as the minimum: '00'.) */
4206             if (item == ALT_DIGITS && strEQ(*retbufp, "0")) {
4207                 retval = "";
4208                 break;
4209             }
4210
4211             /* ALT_DIGITS is problematic.  Experiments on it showed that
4212              * strftime() did not always work properly when going from alt-9 to
4213              * alt-10.  Only a few locales have this item defined, and in all
4214              * of them on Linux that khw was able to find, nl_langinfo() merely
4215              * returned the alt-0 character, possibly doubled.  Most Unicode
4216              * digits are in blocks of 10 consecutive code points, so that is
4217              * sufficient information for such scripts, as we can infer alt-1,
4218              * alt-2, ....  But for a Japanese locale, a CJK ideographic 0 is
4219              * returned, and the CJK digits are not in code point order, so you
4220              * can't really infer anything.  The localedef for this locale did
4221              * specify the succeeding digits, so that strftime() works properly
4222              * on them, without needing to infer anything.  But the
4223              * nl_langinfo() return did not give sufficient information for the
4224              * caller to understand what's going on.  So until there is
4225              * evidence that it should work differently, this returns the alt-0
4226              * string for ALT_DIGITS. */
4227
4228             if (return_format) {
4229
4230                 /* If to return the format, not the value, overwrite the buffer
4231                  * with it.  But some strftime()s will keep the original format
4232                  * if illegal, so change those to "" */
4233                 if (strEQ(*retbufp, format)) {
4234                     retval = "";
4235                 }
4236                 else {
4237                     retval = format;
4238                 }
4239
4240                 /* A format is always in ASCII */
4241                 is_utf8 = UTF8NESS_IMMATERIAL;
4242             }
4243
4244             break;
4245         }
4246
4247 #    endif
4248
4249       case CODESET:
4250
4251         /* The trivial case */
4252         if (isNAME_C_OR_POSIX(locale)) {
4253             retval = C_codeset;
4254             break;
4255         }
4256
4257 #    ifdef WIN32
4258
4259         /* This function retrieves the code page.  It is subject to change, but
4260          * is documented and has been stable for many releases */
4261         UINT ___lc_codepage_func(void);
4262
4263         retval = save_to_buffer(Perl_form(aTHX_ "%d", ___lc_codepage_func()),
4264                                 retbufp, retbuf_sizep);
4265         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "locale='%s' cp=%s\n",
4266                                                locale, retval));
4267         break;
4268
4269 #    else
4270
4271         /* The codeset is important, but khw did not figure out a way for it to
4272          * be retrieved on non-Windows boxes without nl_langinfo().  But even
4273          * if we can't get it directly, we can usually determine if it is a
4274          * UTF-8 locale or not.  If it is UTF-8, we (correctly) use that for
4275          * the code set. */
4276
4277 #      if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4278
4279         /* If libc mbtowc() evaluates the bytes that form the REPLACEMENT
4280          * CHARACTER as that Unicode code point, this has to be a UTF-8 locale.
4281          * */
4282
4283         wchar_t wc = 0;
4284         (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0);/* Reset shift state */
4285         int mbtowc_ret = Perl_mbtowc_(aTHX_ &wc,
4286                               STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
4287         if (mbtowc_ret >= 0 && wc == UNICODE_REPLACEMENT) {
4288             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4289                                    "mbtowc returned REPLACEMENT\n"));
4290             retval = "UTF-8";
4291             break;
4292         }
4293
4294         /* Here, it isn't a UTF-8 locale. */
4295
4296 #    else   /* mbtowc() is not available. */
4297
4298         /* Sling together several possibilities, depending on platform
4299          * capabilities and what we found.
4300          *
4301          * For non-English locales or non-dollar currency locales, we likely
4302          * will find out whether a locale is UTF-8 or not */
4303
4304         utf8ness_t is_utf8 = UTF8NESS_UNKNOWN;
4305         const char * scratch_buf = NULL;
4306
4307 #      if defined(USE_LOCALE_MONETARY) && defined(HAS_SOME_LOCALECONV)
4308
4309         /* Can't use this method unless localeconv() is available, as that's
4310          * the way we find out the currency symbol. */
4311
4312         /* First try looking at the currency symbol (via a recursive call) to
4313          * see if it disambiguates things.  Often that will be in the native
4314          * script, and if the symbol isn't legal UTF-8, we know that the locale
4315          * isn't either. */
4316         (void) my_langinfo_c(CRNCYSTR, LC_MONETARY, locale, &scratch_buf, NULL,
4317                              &is_utf8);
4318         Safefree(scratch_buf);
4319
4320 #      endif
4321 #      ifdef USE_LOCALE_TIME
4322
4323         /* If we have ruled out being UTF-8, no point in checking further. */
4324         if (is_utf8 != UTF8NESS_NO) {
4325
4326             /* But otherwise do check more.  This is done even if the currency
4327              * symbol looks to be UTF-8, just in case that's a false positive.
4328              *
4329              * Look at the LC_TIME entries, like the names of the months or
4330              * weekdays.  We quit at the first one that is illegal UTF-8 */
4331
4332             utf8ness_t this_is_utf8 = UTF8NESS_UNKNOWN;
4333             const int times[] = {
4334                 DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7,
4335                 MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8,
4336                                             MON_9, MON_10, MON_11, MON_12,
4337                 ALT_DIGITS, AM_STR, PM_STR,
4338                 ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6,
4339                                                              ABDAY_7,
4340                 ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6,
4341                 ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12
4342             };
4343
4344             /* The code in the recursive call can handle switching the locales,
4345              * but by doing it here, we avoid switching each iteration of the
4346              * loop */
4347             const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
4348
4349             for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(times); i++) {
4350                 scratch_buf = NULL;
4351                 (void) my_langinfo_c(times[i], LC_TIME, locale, &scratch_buf,
4352                                      NULL, &this_is_utf8);
4353                 Safefree(scratch_buf);
4354                 if (this_is_utf8 == UTF8NESS_NO) {
4355                     is_utf8 = UTF8NESS_NO;
4356                     break;
4357                 }
4358
4359                 if (this_is_utf8 == UTF8NESS_YES) {
4360                     is_utf8 = UTF8NESS_YES;
4361                 }
4362             }
4363
4364             /* Here we have gone through all the LC_TIME elements.  is_utf8 has
4365              * been set as follows:
4366              *      UTF8NESS_NO           If any aren't legal UTF-8
4367              *      UTF8NESS_IMMMATERIAL  If all are ASCII
4368              *      UTF8NESS_YES          If all are legal UTF-8 (including
4369              *                            ASCIIi), and at least one isn't
4370              *                            ASCII. */
4371
4372             restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
4373         }
4374
4375 #      endif    /* LC_TIME */
4376
4377         /* If nothing examined above rules out it being UTF-8, and at least one
4378          * thing fits as UTF-8 (and not plain ASCII), assume the codeset is
4379          * UTF-8. */
4380         if (is_utf8 == UTF8NESS_YES) {
4381             retval = "UTF-8";
4382             break;
4383         }
4384
4385         /* Here, nothing examined indicates that the codeset is UTF-8.  But
4386          * what is it?  The other locale categories are not likely to be of
4387          * further help:
4388          *
4389          * LC_NUMERIC   Only a few locales in the world have a non-ASCII radix
4390          *              or group separator.
4391          * LC_CTYPE     This code wouldn't be compiled if mbtowc() existed and
4392          *              was reliable.  This is unlikely in C99.  There are
4393          *              other functions that could be used instead, but are
4394          *              they going to exist, and be able to distinguish between
4395          *              UTF-8 and 8859-1?  Deal with this only if it becomes
4396          *              necessary.
4397          * LC_MESSAGES  The strings returned from strerror() would seem likely
4398          *              candidates, but experience has shown that many systems
4399          *              don't actually have translations installed for them.
4400          *              They are instead always in English, so everything in
4401          *              them is ASCII, which is of no help to us.  A Configure
4402          *              probe could possibly be written to see if this platform
4403          *              has non-ASCII error messages.  But again, wait until it
4404          *              turns out to be an actual problem. */
4405
4406 #    endif    /* ! mbtowc() */
4407
4408         /* Rejoin the mbtowc available/not-available cases.
4409          *
4410          * We got here only because we haven't been able to find the codeset.
4411          * The only other option khw could think of is to see if the codeset is
4412          * part of the locale name.  This is very less than ideal; often there
4413          * is no code set in the name; and at other times they even lie.
4414          *
4415          * Find any dot in the locale name */
4416         retval = (const char *) strchr(locale, '.');
4417         if (! retval) {
4418             retval = "";  /* Alas, no dot */
4419             break;
4420         }
4421
4422         /* Use everything past the dot */
4423         retval++;
4424
4425 #      if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4426
4427         /* When these functions, are available, they were tried earlier and
4428          * indicated that the locale did not act like a proper UTF-8 one.  So
4429          * if it claims to be UTF-8, it is a lie */
4430         if (is_codeset_name_UTF8(retval)) {
4431             retval = "";
4432             break;
4433         }
4434
4435 #      endif
4436
4437         /* Otherwise the code set name is considered to be everything past the
4438          * dot. */
4439         retval = save_to_buffer(retval, retbufp, retbuf_sizep);
4440
4441         break;
4442
4443 #    endif
4444
4445     } /* Giant switch() of nl_langinfo() items */
4446
4447     restore_toggled_locale_i(cat_index, orig_switched_locale);
4448
4449 #    ifdef USE_LOCALE_CTYPE
4450     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
4451 #    endif
4452
4453     if (utf8ness) {
4454         *utf8ness = is_utf8;
4455     }
4456
4457     return retval;
4458
4459 #  endif    /* All the implementations of my_langinfo() */
4460
4461 /*--------------------------------------------------------------------------*/
4462
4463 }   /* my_langinfo() */
4464
4465 #endif      /* USE_LOCALE */
4466
4467 char *
4468 Perl_my_strftime8(pTHX_ const char *fmt, int sec, int min, int hour, int mday,
4469                          int mon, int year, int wday, int yday, int isdst,
4470                          utf8ness_t * utf8ness)
4471 {   /* Documented in util.c */
4472     char * retval = my_strftime(fmt, sec, min, hour, mday, mon, year, wday,
4473                                 yday, isdst);
4474
4475     PERL_ARGS_ASSERT_MY_STRFTIME8;
4476
4477     if (utf8ness) {
4478
4479 #ifdef USE_LOCALE_TIME
4480         *utf8ness = get_locale_string_utf8ness_i(NULL, LC_TIME_INDEX_,
4481                                                retval, LOCALE_UTF8NESS_UNKNOWN);
4482 #else
4483         *utf8ness = UTF8NESS_IMMATERIAL;
4484 #endif
4485
4486     }
4487
4488     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "fmt=%s, retval=%s", fmt,
4489                  ((is_utf8_string((U8 *) retval, 0))
4490                   ? retval
4491                   :_byte_dump_string((U8 *) retval, strlen(retval), 0)));
4492              if (utf8ness) PerlIO_printf(Perl_debug_log, "; utf8ness=%d",
4493                                                          (int) *utf8ness);
4494              PerlIO_printf(Perl_debug_log, "\n");
4495             );
4496
4497     return retval;
4498 }
4499
4500 /*
4501  * Initialize locale awareness.
4502  */
4503 int
4504 Perl_init_i18nl10n(pTHX_ int printwarn)
4505 {
4506     /* printwarn is
4507      *
4508      *    0 if not to output warning when setup locale is bad
4509      *    1 if to output warning based on value of PERL_BADLANG
4510      *    >1 if to output regardless of PERL_BADLANG
4511      *
4512      * returns
4513      *    1 = set ok or not applicable,
4514      *    0 = fallback to a locale of lower priority
4515      *   -1 = fallback to all locales failed, not even to the C locale
4516      *
4517      * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
4518      * set, debugging information is output.
4519      *
4520      * This looks more complicated than it is, mainly due to the #ifdefs and
4521      * error handling.
4522      *
4523      * Besides some asserts, data structure initialization, and specific
4524      * platform complications, this routine is effectively represented by this
4525      * pseudo-code:
4526      *
4527      *      setlocale(LC_ALL, "");                                            x
4528      *      foreach (subcategory) {                                           x
4529      *          curlocales[f(subcategory)] = setlocale(subcategory, NULL);    x
4530      *      }                                                                 x
4531      *      if (platform_so_requires) {
4532      *          foreach (subcategory) {
4533      *            PL_curlocales[f(subcategory)] = curlocales[f(subcategory)]
4534      *          }
4535      *      }
4536      *      foreach (subcategory) {
4537      *          if (needs_special_handling[f(subcategory)] &this_subcat_handler
4538      *      }
4539      *
4540      * This sets all the categories to the values in the current environment,
4541      * saves them temporarily in curlocales[] until they can be handled and/or
4542      * on some platforms saved in a per-thread array PL_curlocales[].
4543      *
4544      * f(foo) is a mapping from the opaque system category numbers to small
4545      * non-negative integers used most everywhere in this file as indices into
4546      * arrays (such as curlocales[]) so the program doesn't have to otherwise
4547      * deal with the opaqueness.
4548      *
4549      * If the platform doesn't have LC_ALL, the lines marked 'x' above are
4550      * effectively replaced by:
4551      *      foreach (subcategory) {                                           y
4552      *          curlocales[f(subcategory)] = setlocale(subcategory, "");      y
4553      *      }                                                                 y
4554      *
4555      * The only differences being the lack of an LC_ALL call, and using ""
4556      * instead of NULL in the setlocale calls.
4557      *
4558      * But there are, of course, complications.
4559      *
4560      * it has to deal with if this is an embedded perl, whose locale doesn't
4561      * come from the environment, but has been set up by the caller.  This is
4562      * pretty simply handled: the "" in the setlocale calls is not a string
4563      * constant, but a variable which is set to NULL in the embedded case.
4564      *
4565      * But the major complication is handling failure and doing fallback.  All
4566      * the code marked 'x' or 'y' above is actually enclosed in an outer loop,
4567      * using the array trial_locales[].  On entry, trial_locales[] is
4568      * initialized to just one entry, containing the NULL or "" locale argument
4569      * shown above.  If, as is almost always the case, everything works, it
4570      * exits after just the one iteration, going on to the next step.
4571      *
4572      * But if there is a failure, the code tries its best to honor the
4573      * environment as much as possible.  It self-modifies trial_locales[] to
4574      * have more elements, one for each of the POSIX-specified settings from
4575      * the environment, such as LANG, ending in the ultimate fallback, the C
4576      * locale.  Thus if there is something bogus with a higher priority
4577      * environment variable, it will try with the next highest, until something
4578      * works.  If everything fails, it limps along with whatever state it got
4579      * to.
4580      *
4581      * A further complication is that Windows has an additional fallback, the
4582      * user-default ANSI code page obtained from the operating system.  This is
4583      * added as yet another loop iteration, just before the final "C"
4584      *
4585      * A slight complication is that in embedded Perls, the locale may already
4586      * be set-up, and we don't want to get it from the normal environment
4587      * variables.  This is handled by having a special environment variable
4588      * indicate we're in this situation.  We simply set setlocale's 2nd
4589      * parameter to be a NULL instead of "".  That indicates to setlocale that
4590      * it is not to change anything, but to return the current value,
4591      * effectively initializing perl's db to what the locale already is.
4592      *
4593      * We play the same trick with NULL if a LC_ALL succeeds.  We call
4594      * setlocale() on the individual categores with NULL to get their existing
4595      * values for our db, instead of trying to change them.
4596      * */
4597
4598     int ok = 1;
4599
4600 #ifndef USE_LOCALE
4601
4602     PERL_UNUSED_ARG(printwarn);
4603
4604 #else  /* USE_LOCALE */
4605 #  ifdef __GLIBC__
4606
4607     const char * const language = PerlEnv_getenv("LANGUAGE");
4608
4609 #  endif
4610
4611     /* NULL uses the existing already set up locale */
4612     const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
4613                                         ? NULL
4614                                         : "";
4615     typedef struct trial_locales_struct_s {
4616         const char* trial_locale;
4617         const char* fallback_desc;
4618         const char* fallback_name;
4619     } trial_locales_struct;
4620     /* 5 = 1 each for "", LC_ALL, LANG, (Win32) system default locale, C */
4621     trial_locales_struct trial_locales[5];
4622     unsigned int trial_locales_count;
4623     const char * const lc_all     = PerlEnv_getenv("LC_ALL");
4624     const char * const lang       = PerlEnv_getenv("LANG");
4625     bool setlocale_failure = FALSE;
4626     unsigned int i;
4627
4628     /* A later getenv() could zap this, so only use here */
4629     const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
4630
4631     const bool locwarn = (printwarn > 1
4632                           || (          printwarn
4633                               && (    ! bad_lang_use_once
4634                                   || (
4635                                          /* disallow with "" or "0" */
4636                                          *bad_lang_use_once
4637                                        && strNE("0", bad_lang_use_once)))));
4638
4639     /* current locale for given category; should have been copied so aren't
4640      * volatile */
4641     const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
4642
4643 #  ifndef DEBUGGING
4644 #    define DEBUG_LOCALE_INIT(a,b,c)
4645 #  else
4646
4647     DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
4648
4649 #    define DEBUG_LOCALE_INIT(cat_index, locale, result)                    \
4650         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",                       \
4651                     setlocale_debug_string_i(cat_index, locale, result)));
4652
4653 /* Make sure the parallel arrays are properly set up */
4654 #    ifdef USE_LOCALE_NUMERIC
4655     assert(categories[LC_NUMERIC_INDEX_] == LC_NUMERIC);
4656     assert(strEQ(category_names[LC_NUMERIC_INDEX_], "LC_NUMERIC"));
4657 #      ifdef USE_POSIX_2008_LOCALE
4658     assert(category_masks[LC_NUMERIC_INDEX_] == LC_NUMERIC_MASK);
4659 #      endif
4660 #    endif
4661 #    ifdef USE_LOCALE_CTYPE
4662     assert(categories[LC_CTYPE_INDEX_] == LC_CTYPE);
4663     assert(strEQ(category_names[LC_CTYPE_INDEX_], "LC_CTYPE"));
4664 #      ifdef USE_POSIX_2008_LOCALE
4665     assert(category_masks[LC_CTYPE_INDEX_] == LC_CTYPE_MASK);
4666 #      endif
4667 #    endif
4668 #    ifdef USE_LOCALE_COLLATE
4669     assert(categories[LC_COLLATE_INDEX_] == LC_COLLATE);
4670     assert(strEQ(category_names[LC_COLLATE_INDEX_], "LC_COLLATE"));
4671 #      ifdef USE_POSIX_2008_LOCALE
4672     assert(category_masks[LC_COLLATE_INDEX_] == LC_COLLATE_MASK);
4673 #      endif
4674 #    endif
4675 #    ifdef USE_LOCALE_TIME
4676     assert(categories[LC_TIME_INDEX_] == LC_TIME);
4677     assert(strEQ(category_names[LC_TIME_INDEX_], "LC_TIME"));
4678 #      ifdef USE_POSIX_2008_LOCALE
4679     assert(category_masks[LC_TIME_INDEX_] == LC_TIME_MASK);
4680 #      endif
4681 #    endif
4682 #    ifdef USE_LOCALE_MESSAGES
4683     assert(categories[LC_MESSAGES_INDEX_] == LC_MESSAGES);
4684     assert(strEQ(category_names[LC_MESSAGES_INDEX_], "LC_MESSAGES"));
4685 #      ifdef USE_POSIX_2008_LOCALE
4686     assert(category_masks[LC_MESSAGES_INDEX_] == LC_MESSAGES_MASK);
4687 #      endif
4688 #    endif
4689 #    ifdef USE_LOCALE_MONETARY
4690     assert(categories[LC_MONETARY_INDEX_] == LC_MONETARY);
4691     assert(strEQ(category_names[LC_MONETARY_INDEX_], "LC_MONETARY"));
4692 #      ifdef USE_POSIX_2008_LOCALE
4693     assert(category_masks[LC_MONETARY_INDEX_] == LC_MONETARY_MASK);
4694 #      endif
4695 #    endif
4696 #    ifdef USE_LOCALE_ADDRESS
4697     assert(categories[LC_ADDRESS_INDEX_] == LC_ADDRESS);
4698     assert(strEQ(category_names[LC_ADDRESS_INDEX_], "LC_ADDRESS"));
4699 #      ifdef USE_POSIX_2008_LOCALE
4700     assert(category_masks[LC_ADDRESS_INDEX_] == LC_ADDRESS_MASK);
4701 #      endif
4702 #    endif
4703 #    ifdef USE_LOCALE_IDENTIFICATION
4704     assert(categories[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION);
4705     assert(strEQ(category_names[LC_IDENTIFICATION_INDEX_], "LC_IDENTIFICATION"));
4706 #      ifdef USE_POSIX_2008_LOCALE
4707     assert(category_masks[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION_MASK);
4708 #      endif
4709 #    endif
4710 #    ifdef USE_LOCALE_MEASUREMENT
4711     assert(categories[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT);
4712     assert(strEQ(category_names[LC_MEASUREMENT_INDEX_], "LC_MEASUREMENT"));
4713 #      ifdef USE_POSIX_2008_LOCALE
4714     assert(category_masks[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT_MASK);
4715 #      endif
4716 #    endif
4717 #    ifdef USE_LOCALE_PAPER
4718     assert(categories[LC_PAPER_INDEX_] == LC_PAPER);
4719     assert(strEQ(category_names[LC_PAPER_INDEX_], "LC_PAPER"));
4720 #      ifdef USE_POSIX_2008_LOCALE
4721     assert(category_masks[LC_PAPER_INDEX_] == LC_PAPER_MASK);
4722 #      endif
4723 #    endif
4724 #    ifdef USE_LOCALE_TELEPHONE
4725     assert(categories[LC_TELEPHONE_INDEX_] == LC_TELEPHONE);
4726     assert(strEQ(category_names[LC_TELEPHONE_INDEX_], "LC_TELEPHONE"));
4727 #      ifdef USE_POSIX_2008_LOCALE
4728     assert(category_masks[LC_TELEPHONE_INDEX_] == LC_TELEPHONE_MASK);
4729 #      endif
4730 #    endif
4731 #    ifdef USE_LOCALE_SYNTAX
4732     assert(categories[LC_SYNTAX_INDEX_] == LC_SYNTAX);
4733     assert(strEQ(category_names[LC_SYNTAX_INDEX_], "LC_SYNTAX"));
4734 #      ifdef USE_POSIX_2008_LOCALE
4735     assert(category_masks[LC_SYNTAX_INDEX_] == LC_SYNTAX_MASK);
4736 #      endif
4737 #    endif
4738 #    ifdef USE_LOCALE_TOD
4739     assert(categories[LC_TOD_INDEX_] == LC_TOD);
4740     assert(strEQ(category_names[LC_TOD_INDEX_], "LC_TOD"));
4741 #      ifdef USE_POSIX_2008_LOCALE
4742     assert(category_masks[LC_TOD_INDEX_] == LC_TOD_MASK);
4743 #      endif
4744 #    endif
4745 #    ifdef LC_ALL
4746     assert(categories[LC_ALL_INDEX_] == LC_ALL);
4747     assert(strEQ(category_names[LC_ALL_INDEX_], "LC_ALL"));
4748     STATIC_ASSERT_STMT(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX_);
4749 #      ifdef USE_POSIX_2008_LOCALE
4750     assert(category_masks[LC_ALL_INDEX_] == LC_ALL_MASK);
4751 #      endif
4752 #    endif
4753 #  endif    /* DEBUGGING */
4754
4755     /* Initialize the per-thread mbrFOO() state variables.  See POSIX.xs for
4756      * why these particular incantations are used. */
4757 #  ifdef HAS_MBRLEN
4758     memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
4759 #  endif
4760 #  ifdef HAS_MBRTOWC
4761     memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
4762 #  endif
4763 #  ifdef HAS_WCTOMBR
4764     wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
4765 #  endif
4766 #  ifdef USE_THREAD_SAFE_LOCALE
4767 #    ifdef WIN32
4768
4769     _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
4770
4771 #    endif
4772 #  endif
4773 #  ifdef USE_POSIX_2008_LOCALE
4774
4775     PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0);
4776     if (! PL_C_locale_obj) {
4777         locale_panic_(Perl_form(aTHX_
4778                                 "Cannot create POSIX 2008 C locale object"));
4779     }
4780
4781     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "created C object %p\n",
4782                            PL_C_locale_obj));
4783
4784 #    ifdef USE_LOCALE_NUMERIC
4785
4786     PL_underlying_numeric_obj = duplocale(PL_C_locale_obj);
4787
4788 #    endif
4789 #  endif
4790 #  ifdef USE_LOCALE_NUMERIC
4791
4792     PL_numeric_radix_sv    = newSV(1);
4793     PL_underlying_radix_sv = newSV(1);
4794     Newxz(PL_numeric_name, 1, char);    /* Single NUL character */
4795     new_numeric("C");
4796
4797 #  endif
4798 #  ifdef USE_LOCALE_COLLATE
4799
4800     Newxz(PL_collation_name, 1, char);
4801     new_collate("C");
4802
4803 #  endif
4804 #  ifdef USE_LOCALE_CTYPE
4805
4806     Newxz(PL_ctype_name, 1, char);
4807     new_ctype("C");
4808
4809 #  endif
4810 #  ifdef USE_PL_CURLOCALES
4811
4812     /* Initialize our records. */
4813     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
4814         (void) emulate_setlocale_i(i, posix_setlocale(categories[i], NULL),
4815                                    RECALCULATE_LC_ALL_ON_FINAL_INTERATION,
4816                                    __LINE__);
4817     }
4818
4819 #  endif
4820
4821     /* We try each locale in the list until we get one that works, or exhaust
4822      * the list.  Normally the loop is executed just once.  But if setting the
4823      * locale fails, inside the loop we add fallback trials to the array and so
4824      * will execute the loop multiple times */
4825     trial_locales[0] = (trial_locales_struct) {
4826         .trial_locale = setlocale_init,
4827         .fallback_desc = NULL,
4828         .fallback_name = NULL,
4829     };
4830     trial_locales_count = 1;
4831
4832     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
4833         curlocales[i] = NULL;
4834     }
4835
4836     for (i= 0; i < trial_locales_count; i++) {
4837         const char * trial_locale = trial_locales[i].trial_locale;
4838         setlocale_failure = FALSE;
4839
4840 #  ifdef LC_ALL
4841
4842         /* setlocale() return vals; not copied so must be looked at
4843          * immediately. */
4844         const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
4845         sl_result[LC_ALL_INDEX_] = stdized_setlocale(LC_ALL, trial_locale);
4846         DEBUG_LOCALE_INIT(LC_ALL_INDEX_, trial_locale, sl_result[LC_ALL_INDEX_]);
4847         if (! sl_result[LC_ALL_INDEX_]) {
4848             setlocale_failure = TRUE;
4849         }
4850         else {
4851             /* Since LC_ALL succeeded, it should have changed all the other
4852              * categories it can to its value; so we massage things so that the
4853              * setlocales below just return their category's current values.
4854              * This adequately handles the case in NetBSD where LC_COLLATE may
4855              * not be defined for a locale, and setting it individually will
4856              * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
4857              * the POSIX locale. */
4858             trial_locale = NULL;
4859         }
4860
4861 #  endif /* LC_ALL */
4862
4863         if (! setlocale_failure) {
4864             unsigned int j;
4865             for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
4866                 curlocales[j] = stdized_setlocale(categories[j], trial_locale);
4867                 if (! curlocales[j]) {
4868                     setlocale_failure = TRUE;
4869                 }
4870                 curlocales[j] = savepv(curlocales[j]);
4871                 DEBUG_LOCALE_INIT(j, trial_locale, curlocales[j]);
4872             }
4873
4874             if (LIKELY(! setlocale_failure)) {  /* All succeeded */
4875                 break;  /* Exit trial_locales loop */
4876             }
4877         }
4878
4879         /* Here, something failed; will need to try a fallback. */
4880         ok = 0;
4881
4882         if (i == 0) {
4883             unsigned int j;
4884
4885             if (locwarn) { /* Output failure info only on the first one */
4886
4887 #  ifdef LC_ALL
4888
4889                 PerlIO_printf(Perl_error_log,
4890                 "perl: warning: Setting locale failed.\n");
4891
4892 #  else /* !LC_ALL */
4893
4894                 PerlIO_printf(Perl_error_log,
4895                 "perl: warning: Setting locale failed for the categories:\n");
4896
4897                 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
4898                     if (! curlocales[j]) {
4899                         PerlIO_printf(Perl_error_log, "\t%s\n", category_names[j]);
4900                     }
4901                 }
4902
4903 #  endif /* LC_ALL */
4904
4905                 PerlIO_printf(Perl_error_log,
4906                     "perl: warning: Please check that your locale settings:\n");
4907
4908 #  ifdef __GLIBC__
4909
4910                 PerlIO_printf(Perl_error_log,
4911                             "\tLANGUAGE = %c%s%c,\n",
4912                             language ? '"' : '(',
4913                             language ? language : "unset",
4914                             language ? '"' : ')');
4915 #  endif
4916
4917                 PerlIO_printf(Perl_error_log,
4918                             "\tLC_ALL = %c%s%c,\n",
4919                             lc_all ? '"' : '(',
4920                             lc_all ? lc_all : "unset",
4921                             lc_all ? '"' : ')');
4922
4923 #  if defined(USE_ENVIRON_ARRAY)
4924
4925                 {
4926                     char **e;
4927
4928                     /* Look through the environment for any variables of the
4929                      * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
4930                      * already handled above.  These are assumed to be locale
4931                      * settings.  Output them and their values. */
4932                     for (e = environ; *e; e++) {
4933                         const STRLEN prefix_len = sizeof("LC_") - 1;
4934                         STRLEN uppers_len;
4935
4936                         if (     strBEGINs(*e, "LC_")
4937                             && ! strBEGINs(*e, "LC_ALL=")
4938                             && (uppers_len = strspn(*e + prefix_len,
4939                                              "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
4940                             && ((*e)[prefix_len + uppers_len] == '='))
4941                         {
4942                             PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
4943                                 (int) (prefix_len + uppers_len), *e,
4944                                 *e + prefix_len + uppers_len + 1);
4945                         }
4946                     }
4947                 }
4948
4949 #  else
4950
4951                 PerlIO_printf(Perl_error_log,
4952                             "\t(possibly more locale environment variables)\n");
4953
4954 #  endif
4955
4956                 PerlIO_printf(Perl_error_log,
4957                             "\tLANG = %c%s%c\n",
4958                             lang ? '"' : '(',
4959                             lang ? lang : "unset",
4960                             lang ? '"' : ')');
4961
4962                 PerlIO_printf(Perl_error_log,
4963                             "    are supported and installed on your system.\n");
4964             }
4965
4966             /* Calculate what fallback locales to try.  We have avoided this
4967              * until we have to, because failure is quite unlikely.  This will
4968              * usually change the upper bound of the loop we are in.
4969              *
4970              * Since the system's default way of setting the locale has not
4971              * found one that works, We use Perl's defined ordering: LC_ALL,
4972              * LANG, and the C locale.  We don't try the same locale twice, so
4973              * don't add to the list if already there.  (On POSIX systems, the
4974              * LC_ALL element will likely be a repeat of the 0th element "",
4975              * but there's no harm done by doing it explicitly.
4976              *
4977              * Note that this tries the LC_ALL environment variable even on
4978              * systems which have no LC_ALL locale setting.  This may or may
4979              * not have been originally intentional, but there's no real need
4980              * to change the behavior. */
4981             if (lc_all) {
4982                 for (j = 0; j < trial_locales_count; j++) {
4983                     if (strEQ(lc_all, trial_locales[j].trial_locale)) {
4984                         goto done_lc_all;
4985                     }
4986                 }
4987                 trial_locales[trial_locales_count++] = (trial_locales_struct) {
4988                     .trial_locale = lc_all,
4989                     .fallback_desc = (strEQ(lc_all, "C")
4990                                       ? "the standard locale"
4991                                       : "a fallback locale"),
4992                     .fallback_name = lc_all,
4993                 };
4994             }
4995           done_lc_all:
4996
4997             if (lang) {
4998                 for (j = 0; j < trial_locales_count; j++) {
4999                     if (strEQ(lang, trial_locales[j].trial_locale)) {
5000                         goto done_lang;
5001                     }
5002                 }
5003                 trial_locales[trial_locales_count++] = (trial_locales_struct) {
5004                     .trial_locale = lang,
5005                     .fallback_desc = (strEQ(lang, "C")
5006                                       ? "the standard locale"
5007                                       : "a fallback locale"),
5008                     .fallback_name = lang,
5009                 };
5010             }
5011           done_lang:
5012
5013 #  if defined(WIN32) && defined(LC_ALL)
5014
5015             /* For Windows, we also try the system default locale before "C".
5016              * (If there exists a Windows without LC_ALL we skip this because
5017              * it gets too complicated.  For those, the "C" is the next
5018              * fallback possibility). */
5019             {
5020                 /* Note that this may change the locale, but we are going to do
5021                  * that anyway.
5022                  *
5023                  * Our normal Windows setlocale() implementation ignores the
5024                  * system default locale to make things work like POSIX.  This
5025                  * is the only place where we want to consider it, so have to
5026                  * use wrap_wsetlocale(). */
5027                 const char *system_default_locale =
5028                                     stdize_locale(LC_ALL,
5029                                                   S_wrap_wsetlocale(aTHX_ LC_ALL, ""),
5030                                                   &PL_stdize_locale_buf,
5031                                                   &PL_stdize_locale_bufsize,
5032                                                   __LINE__);
5033                 DEBUG_LOCALE_INIT(LC_ALL_INDEX_, "", system_default_locale);
5034
5035                 /* Skip if invalid or if it's already on the list of locales to
5036                  * try */
5037                 if (! system_default_locale) {
5038                     goto done_system_default;
5039                 }
5040                 for (j = 0; j < trial_locales_count; j++) {
5041                     if (strEQ(system_default_locale, trial_locales[j].trial_locale)) {
5042                         goto done_system_default;
5043                     }
5044                 }
5045
5046                 trial_locales[trial_locales_count++] = (trial_locales_struct) {
5047                     .trial_locale = system_default_locale,
5048                     .fallback_desc = (strEQ(system_default_locale, "C")
5049                                       ? "the standard locale"
5050                                       : "the system default locale"),
5051                     .fallback_name = system_default_locale,
5052                 };
5053             }
5054           done_system_default:
5055
5056 #  endif
5057
5058             for (j = 0; j < trial_locales_count; j++) {
5059                 if (strEQ("C", trial_locales[j].trial_locale)) {
5060                     goto done_C;
5061                 }
5062             }
5063             trial_locales[trial_locales_count++] = (trial_locales_struct) {
5064                 .trial_locale = "C",
5065                 .fallback_desc = "the standard locale",
5066                 .fallback_name = "C",
5067             };
5068
5069           done_C: ;
5070         }   /* end of first time through the loop */
5071
5072 #  ifdef WIN32
5073
5074       next_iteration: ;
5075
5076 #  endif
5077
5078     }   /* end of looping through the trial locales */
5079
5080     if (ok < 1) {   /* If we tried to fallback */
5081         const char* msg;
5082         if (! setlocale_failure) {  /* fallback succeeded */
5083            msg = "Falling back to";
5084         }
5085         else {  /* fallback failed */
5086             unsigned int j;
5087
5088             /* We dropped off the end of the loop, so have to decrement i to
5089              * get back to the value the last time through */
5090             i--;
5091
5092             ok = -1;
5093             msg = "Failed to fall back to";
5094
5095             /* To continue, we should use whatever values we've got */
5096
5097             for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
5098                 Safefree(curlocales[j]);
5099                 curlocales[j] = savepv(stdized_setlocale(categories[j], NULL));
5100                 DEBUG_LOCALE_INIT(j, NULL, curlocales[j]);
5101             }
5102         }
5103
5104         if (locwarn) {
5105             const char * description = trial_locales[i].fallback_desc;
5106             const char * name = trial_locales[i].fallback_name;
5107
5108             if (name && strNE(name, "")) {
5109                 PerlIO_printf(Perl_error_log,
5110                     "perl: warning: %s %s (\"%s\").\n", msg, description, name);
5111             }
5112             else {
5113                 PerlIO_printf(Perl_error_log,
5114                                    "perl: warning: %s %s.\n", msg, description);
5115             }
5116         }
5117     } /* End of tried to fallback */
5118
5119 #  ifdef USE_POSIX_2008_LOCALE
5120
5121     /* The stdized setlocales haven't affected the P2008 locales.  Initialize
5122      * them now, calculating LC_ALL only on the final go round, when all have
5123      * been set. */
5124     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
5125         (void) emulate_setlocale_i(i, curlocales[i],
5126                                    RECALCULATE_LC_ALL_ON_FINAL_INTERATION,
5127                                    __LINE__);
5128     }
5129
5130 #  endif
5131
5132     /* Done with finding the locales; update the auxiliary records */
5133     new_LC_ALL(NULL);
5134
5135 #  if defined(USE_POSIX_2008_LOCALE) && defined(USE_LOCALE_NUMERIC)
5136
5137     /* This is a temporary workaround for #20155, to avoid issues where the
5138      * global locale wants a radix different from the per-thread one.  This
5139      * restores behavior for LC_NUMERIC to what it was before a7ff7ac. */
5140     posix_setlocale(LC_NUMERIC, "C");
5141
5142 #  endif
5143
5144     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
5145         Safefree(curlocales[i]);
5146     }
5147
5148 #  if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
5149
5150     /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
5151      * locale is UTF-8.  The call to new_ctype() just above has already
5152      * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
5153      * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
5154      * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
5155      * STDIN, STDOUT, STDERR, _and_ the default open discipline.  */
5156     PL_utf8locale = PL_in_utf8_CTYPE_locale;
5157
5158     /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
5159        This is an alternative to using the -C command line switch
5160        (the -C if present will override this). */
5161     {
5162          const char *p = PerlEnv_getenv("PERL_UNICODE");
5163          PL_unicode = p ? parse_unicode_opts(&p) : 0;
5164          if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
5165              PL_utf8cache = -1;
5166     }
5167
5168 #  endif
5169 #endif /* USE_LOCALE */
5170
5171     /* So won't continue to output stuff */
5172     DEBUG_INITIALIZATION_set(FALSE);
5173
5174     return ok;
5175 }
5176
5177 #ifdef USE_LOCALE_COLLATE
5178
5179 STATIC void
5180 S_compute_collxfrm_coefficients(pTHX)
5181 {
5182
5183         PL_in_utf8_COLLATE_locale = (PL_collation_standard)
5184                                     ? 0
5185                                     : is_locale_utf8(PL_collation_name);
5186         PL_strxfrm_NUL_replacement = '\0';
5187         PL_strxfrm_max_cp = 0;
5188
5189         /* A locale collation definition includes primary, secondary, tertiary,
5190          * etc. weights for each character.  To sort, the primary weights are
5191          * used, and only if they compare equal, then the secondary weights are
5192          * used, and only if they compare equal, then the tertiary, etc.
5193          *
5194          * strxfrm() works by taking the input string, say ABC, and creating an
5195          * output transformed string consisting of first the primary weights,
5196          * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
5197          * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ ....  Some characters
5198          * may not have weights at every level.  In our example, let's say B
5199          * doesn't have a tertiary weight, and A doesn't have a secondary
5200          * weight.  The constructed string is then going to be
5201          *  A¹B¹C¹ B²C² A³C³ ....
5202          * This has the desired effect that strcmp() will look at the secondary
5203          * or tertiary weights only if the strings compare equal at all higher
5204          * priority weights.  The spaces shown here, like in
5205          *  "A¹B¹C¹ A²B²C² "
5206          * are not just for readability.  In the general case, these must
5207          * actually be bytes, which we will call here 'separator weights'; and
5208          * they must be smaller than any other weight value, but since these
5209          * are C strings, only the terminating one can be a NUL (some
5210          * implementations may include a non-NUL separator weight just before
5211          * the NUL).  Implementations tend to reserve 01 for the separator
5212          * weights.  They are needed so that a shorter string's secondary
5213          * weights won't be misconstrued as primary weights of a longer string,
5214          * etc.  By making them smaller than any other weight, the shorter
5215          * string will sort first.  (Actually, if all secondary weights are
5216          * smaller than all primary ones, there is no need for a separator
5217          * weight between those two levels, etc.)
5218          *
5219          * The length of the transformed string is roughly a linear function of
5220          * the input string.  It's not exactly linear because some characters
5221          * don't have weights at all levels.  When we call strxfrm() we have to
5222          * allocate some memory to hold the transformed string.  The
5223          * calculations below try to find coefficients 'm' and 'b' for this
5224          * locale so that m*x + b equals how much space we need, given the size
5225          * of the input string in 'x'.  If we calculate too small, we increase
5226          * the size as needed, and call strxfrm() again, but it is better to
5227          * get it right the first time to avoid wasted expensive string
5228          * transformations. */
5229
5230         {
5231             /* We use the string below to find how long the tranformation of it
5232              * is.  Almost all locales are supersets of ASCII, or at least the
5233              * ASCII letters.  We use all of them, half upper half lower,
5234              * because if we used fewer, we might hit just the ones that are
5235              * outliers in a particular locale.  Most of the strings being
5236              * collated will contain a preponderance of letters, and even if
5237              * they are above-ASCII, they are likely to have the same number of
5238              * weight levels as the ASCII ones.  It turns out that digits tend
5239              * to have fewer levels, and some punctuation has more, but those
5240              * are relatively sparse in text, and khw believes this gives a
5241              * reasonable result, but it could be changed if experience so
5242              * dictates. */
5243             const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
5244             char * x_longer;        /* Transformed 'longer' */
5245             Size_t x_len_longer;    /* Length of 'x_longer' */
5246
5247             char * x_shorter;   /* We also transform a substring of 'longer' */
5248             Size_t x_len_shorter;
5249
5250             /* mem_collxfrm_() is used get the transformation (though here we
5251              * are interested only in its length).  It is used because it has
5252              * the intelligence to handle all cases, but to work, it needs some
5253              * values of 'm' and 'b' to get it started.  For the purposes of
5254              * this calculation we use a very conservative estimate of 'm' and
5255              * 'b'.  This assumes a weight can be multiple bytes, enough to
5256              * hold any UV on the platform, and there are 5 levels, 4 weight
5257              * bytes, and a trailing NUL.  */
5258             PL_collxfrm_base = 5;
5259             PL_collxfrm_mult = 5 * sizeof(UV);
5260
5261             /* Find out how long the transformation really is */
5262             x_longer = mem_collxfrm_(longer,
5263                                      sizeof(longer) - 1,
5264                                      &x_len_longer,
5265
5266                                      /* We avoid converting to UTF-8 in the
5267                                       * called function by telling it the
5268                                       * string is in UTF-8 if the locale is a
5269                                       * UTF-8 one.  Since the string passed
5270                                       * here is invariant under UTF-8, we can
5271                                       * claim it's UTF-8 even though it isn't.
5272                                       * */
5273                                      PL_in_utf8_COLLATE_locale);
5274             Safefree(x_longer);
5275
5276             /* Find out how long the transformation of a substring of 'longer'
5277              * is.  Together the lengths of these transformations are
5278              * sufficient to calculate 'm' and 'b'.  The substring is all of
5279              * 'longer' except the first character.  This minimizes the chances
5280              * of being swayed by outliers */
5281             x_shorter = mem_collxfrm_(longer + 1,
5282                                       sizeof(longer) - 2,
5283                                       &x_len_shorter,
5284                                       PL_in_utf8_COLLATE_locale);
5285             Safefree(x_shorter);
5286
5287             /* If the results are nonsensical for this simple test, the whole
5288              * locale definition is suspect.  Mark it so that locale collation
5289              * is not active at all for it.  XXX Should we warn? */
5290             if (   x_len_shorter == 0
5291                 || x_len_longer == 0
5292                 || x_len_shorter >= x_len_longer)
5293             {
5294                 PL_collxfrm_mult = 0;
5295                 PL_collxfrm_base = 1;
5296                 DEBUG_L(PerlIO_printf(Perl_debug_log,
5297                         "Disabling locale collation for LC_COLLATE='%s';"
5298                         " length for shorter sample=%zu; longer=%zu\n",
5299                         PL_collation_name, x_len_shorter, x_len_longer));
5300             }
5301             else {
5302                 SSize_t base;       /* Temporary */
5303
5304                 /* We have both:    m * strlen(longer)  + b = x_len_longer
5305                  *                  m * strlen(shorter) + b = x_len_shorter;
5306                  * subtracting yields:
5307                  *          m * (strlen(longer) - strlen(shorter))
5308                  *                             = x_len_longer - x_len_shorter
5309                  * But we have set things up so that 'shorter' is 1 byte smaller
5310                  * than 'longer'.  Hence:
5311                  *          m = x_len_longer - x_len_shorter
5312                  *
5313                  * But if something went wrong, make sure the multiplier is at
5314                  * least 1.
5315                  */
5316                 if (x_len_longer > x_len_shorter) {
5317                     PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
5318                 }
5319                 else {
5320                     PL_collxfrm_mult = 1;
5321                 }
5322
5323                 /*     mx + b = len
5324                  * so:      b = len - mx
5325                  * but in case something has gone wrong, make sure it is
5326                  * non-negative */
5327                 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
5328                 if (base < 0) {
5329                     base = 0;
5330                 }
5331
5332                 /* Add 1 for the trailing NUL */
5333                 PL_collxfrm_base = base + 1;
5334             }
5335
5336             DEBUG_L(PerlIO_printf(Perl_debug_log,
5337                                   "?UTF-8 locale=%d; x_len_shorter=%zu, "
5338                     "x_len_longer=%zu,"
5339                     " collate multipler=%zu, collate base=%zu\n",
5340                     PL_in_utf8_COLLATE_locale,
5341                     x_len_shorter, x_len_longer,
5342                                   PL_collxfrm_mult, PL_collxfrm_base));
5343         }
5344 }
5345
5346 char *
5347 Perl_mem_collxfrm_(pTHX_ const char *input_string,
5348                          STRLEN len,    /* Length of 'input_string' */
5349                          STRLEN *xlen,  /* Set to length of returned string
5350                                            (not including the collation index
5351                                            prefix) */
5352                          bool utf8      /* Is the input in UTF-8? */
5353                    )
5354 {
5355     /* mem_collxfrm_() is like strxfrm() but with two important differences.
5356      * First, it handles embedded NULs. Second, it allocates a bit more memory
5357      * than needed for the transformed data itself.  The real transformed data
5358      * begins at offset COLLXFRM_HDR_LEN.  *xlen is set to the length of that,
5359      * and doesn't include the collation index size.
5360      *
5361      * It is the caller's responsibility to eventually free the memory returned
5362      * by this function.
5363      *
5364      * Please see sv_collxfrm() to see how this is used. */
5365
5366 #  define COLLXFRM_HDR_LEN    sizeof(PL_collation_ix)
5367
5368     char * s = (char *) input_string;
5369     STRLEN s_strlen = strlen(input_string);
5370     char *xbuf = NULL;
5371     STRLEN xAlloc;          /* xalloc is a reserved word in VC */
5372     STRLEN length_in_chars;
5373     bool first_time = TRUE; /* Cleared after first loop iteration */
5374
5375 #  ifdef USE_LOCALE_CTYPE
5376         const char * orig_CTYPE_locale = NULL;
5377 #  endif
5378
5379 #  if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
5380     locale_t constructed_locale = (locale_t) 0;
5381 #  endif
5382
5383     PERL_ARGS_ASSERT_MEM_COLLXFRM_;
5384
5385     /* Must be NUL-terminated */
5386     assert(*(input_string + len) == '\0');
5387
5388     if (PL_collxfrm_mult == 0) {     /* unknown or bad */
5389         if (PL_collxfrm_base != 0) { /* bad collation => skip */
5390             DEBUG_L(PerlIO_printf(Perl_debug_log,
5391                             "mem_collxfrm_: locale's collation is defective\n"));
5392             goto bad;
5393         }
5394
5395         S_compute_collxfrm_coefficients(aTHX);
5396     }
5397
5398     /* Replace any embedded NULs with the control that sorts before any others.
5399      * This will give as good as possible results on strings that don't
5400      * otherwise contain that character, but otherwise there may be
5401      * less-than-perfect results with that character and NUL.  This is
5402      * unavoidable unless we replace strxfrm with our own implementation. */
5403     if (UNLIKELY(s_strlen < len)) {   /* Only execute if there is an embedded
5404                                          NUL */
5405         char * e = s + len;
5406         char * sans_nuls;
5407         STRLEN sans_nuls_len;
5408         int try_non_controls;
5409         char this_replacement_char[] = "?\0";   /* Room for a two-byte string,
5410                                                    making sure 2nd byte is NUL.
5411                                                  */
5412         STRLEN this_replacement_len;
5413
5414         /* If we don't know what non-NUL control character sorts lowest for
5415          * this locale, find it */
5416         if (PL_strxfrm_NUL_replacement == '\0') {
5417             int j;
5418             char * cur_min_x = NULL;    /* The min_char's xfrm, (except it also
5419                                            includes the collation index
5420                                            prefixed. */
5421
5422             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
5423
5424             /* Unlikely, but it may be that no control will work to replace
5425              * NUL, in which case we instead look for any character.  Controls
5426              * are preferred because collation order is, in general, context
5427              * sensitive, with adjoining characters affecting the order, and
5428              * controls are less likely to have such interactions, allowing the
5429              * NUL-replacement to stand on its own.  (Another way to look at it
5430              * is to imagine what would happen if the NUL were replaced by a
5431              * combining character; it wouldn't work out all that well.) */
5432             for (try_non_controls = 0;
5433                  try_non_controls < 2;
5434                  try_non_controls++)
5435             {
5436
5437 #  ifdef USE_LOCALE_CTYPE
5438
5439                 /* In this case we use isCNTRL_LC() below, which relies on
5440                  * LC_CTYPE, so that must be switched to correspond with the
5441                  * LC_COLLATE locale */
5442                 if (! try_non_controls && ! PL_in_utf8_COLLATE_locale) {
5443                     orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
5444                 }
5445 #  endif
5446                 /* Look through all legal code points (NUL isn't) */
5447                 for (j = 1; j < 256; j++) {
5448                     char * x;       /* j's xfrm plus collation index */
5449                     STRLEN x_len;   /* length of 'x' */
5450                     STRLEN trial_len = 1;
5451                     char cur_source[] = { '\0', '\0' };
5452
5453                     /* Skip non-controls the first time through the loop.  The
5454                      * controls in a UTF-8 locale are the L1 ones */
5455                     if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
5456                                                ? ! isCNTRL_L1(j)
5457                                                : ! isCNTRL_LC(j))
5458                     {
5459                         continue;
5460                     }
5461
5462                     /* Create a 1-char string of the current code point */
5463                     cur_source[0] = (char) j;
5464
5465                     /* Then transform it */
5466                     x = mem_collxfrm_(cur_source, trial_len, &x_len,
5467                                       0 /* The string is not in UTF-8 */);
5468
5469                     /* Ignore any character that didn't successfully transform.
5470                      * */
5471                     if (! x) {
5472                         continue;
5473                     }
5474
5475                     /* If this character's transformation is lower than
5476                      * the current lowest, this one becomes the lowest */
5477                     if (   cur_min_x == NULL
5478                         || strLT(x         + COLLXFRM_HDR_LEN,
5479                                  cur_min_x + COLLXFRM_HDR_LEN))
5480                     {
5481                         PL_strxfrm_NUL_replacement = j;
5482                         Safefree(cur_min_x);
5483                         cur_min_x = x;
5484                     }
5485                     else {
5486                         Safefree(x);
5487                     }
5488                 } /* end of loop through all 255 characters */
5489
5490 #  ifdef USE_LOCALE_CTYPE
5491                 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
5492 #  endif
5493
5494                 /* Stop looking if found */
5495                 if (cur_min_x) {
5496                     break;
5497                 }
5498
5499                 /* Unlikely, but possible, if there aren't any controls that
5500                  * work in the locale, repeat the loop, looking for any
5501                  * character that works */
5502                 DEBUG_L(PerlIO_printf(Perl_debug_log,
5503                 "mem_collxfrm_: No control worked.  Trying non-controls\n"));
5504             } /* End of loop to try first the controls, then any char */
5505
5506             if (! cur_min_x) {
5507                 DEBUG_L(PerlIO_printf(Perl_debug_log,
5508                     "mem_collxfrm_: Couldn't find any character to replace"
5509                     " embedded NULs in locale %s with", PL_collation_name));
5510                 goto bad;
5511             }
5512
5513             DEBUG_L(PerlIO_printf(Perl_debug_log,
5514                     "mem_collxfrm_: Replacing embedded NULs in locale %s with "
5515                     "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
5516
5517             Safefree(cur_min_x);
5518         } /* End of determining the character that is to replace NULs */
5519
5520         /* If the replacement is variant under UTF-8, it must match the
5521          * UTF8-ness of the original */
5522         if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
5523             this_replacement_char[0] =
5524                                 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
5525             this_replacement_char[1] =
5526                                 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
5527             this_replacement_len = 2;
5528         }
5529         else {
5530             this_replacement_char[0] = PL_strxfrm_NUL_replacement;
5531             /* this_replacement_char[1] = '\0' was done at initialization */
5532             this_replacement_len = 1;
5533         }
5534
5535         /* The worst case length for the replaced string would be if every
5536          * character in it is NUL.  Multiply that by the length of each
5537          * replacement, and allow for a trailing NUL */
5538         sans_nuls_len = (len * this_replacement_len) + 1;
5539         Newx(sans_nuls, sans_nuls_len, char);
5540         *sans_nuls = '\0';
5541
5542         /* Replace each NUL with the lowest collating control.  Loop until have
5543          * exhausted all the NULs */
5544         while (s + s_strlen < e) {
5545             my_strlcat(sans_nuls, s, sans_nuls_len);
5546
5547             /* Do the actual replacement */
5548             my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
5549
5550             /* Move past the input NUL */
5551             s += s_strlen + 1;
5552             s_strlen = strlen(s);
5553         }
5554
5555         /* And add anything that trails the final NUL */
5556         my_strlcat(sans_nuls, s, sans_nuls_len);
5557
5558         /* Switch so below we transform this modified string */
5559         s = sans_nuls;
5560         len = strlen(s);
5561     } /* End of replacing NULs */
5562
5563     /* Make sure the UTF8ness of the string and locale match */
5564     if (utf8 != PL_in_utf8_COLLATE_locale) {
5565         /* XXX convert above Unicode to 10FFFF? */
5566         const char * const t = s;   /* Temporary so we can later find where the
5567                                        input was */
5568
5569         /* Here they don't match.  Change the string's to be what the locale is
5570          * expecting */
5571
5572         if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
5573             s = (char *) bytes_to_utf8((const U8 *) s, &len);
5574             utf8 = TRUE;
5575         }
5576         else {   /* locale is not UTF-8; but input is; downgrade the input */
5577
5578             s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
5579
5580             /* If the downgrade was successful we are done, but if the input
5581              * contains things that require UTF-8 to represent, have to do
5582              * damage control ... */
5583             if (UNLIKELY(utf8)) {
5584
5585                 /* What we do is construct a non-UTF-8 string with
5586                  *  1) the characters representable by a single byte converted
5587                  *     to be so (if necessary);
5588                  *  2) and the rest converted to collate the same as the
5589                  *     highest collating representable character.  That makes
5590                  *     them collate at the end.  This is similar to how we
5591                  *     handle embedded NULs, but we use the highest collating
5592                  *     code point instead of the smallest.  Like the NUL case,
5593                  *     this isn't perfect, but is the best we can reasonably
5594                  *     do.  Every above-255 code point will sort the same as
5595                  *     the highest-sorting 0-255 code point.  If that code
5596                  *     point can combine in a sequence with some other code
5597                  *     points for weight calculations, us changing something to
5598                  *     be it can adversely affect the results.  But in most
5599                  *     cases, it should work reasonably.  And note that this is
5600                  *     really an illegal situation: using code points above 255
5601                  *     on a locale where only 0-255 are valid.  If two strings
5602                  *     sort entirely equal, then the sort order for the
5603                  *     above-255 code points will be in code point order. */
5604
5605                 utf8 = FALSE;
5606
5607                 /* If we haven't calculated the code point with the maximum
5608                  * collating order for this locale, do so now */
5609                 if (! PL_strxfrm_max_cp) {
5610                     int j;
5611
5612                     /* The current transformed string that collates the
5613                      * highest (except it also includes the prefixed collation
5614                      * index. */
5615                     char * cur_max_x = NULL;
5616
5617                     /* Look through all legal code points (NUL isn't) */
5618                     for (j = 1; j < 256; j++) {
5619                         char * x;
5620                         STRLEN x_len;
5621                         char cur_source[] = { '\0', '\0' };
5622
5623                         /* Create a 1-char string of the current code point */
5624                         cur_source[0] = (char) j;
5625
5626                         /* Then transform it */
5627                         x = mem_collxfrm_(cur_source, 1, &x_len, FALSE);
5628
5629                         /* If something went wrong (which it shouldn't), just
5630                          * ignore this code point */
5631                         if (! x) {
5632                             continue;
5633                         }
5634
5635                         /* If this character's transformation is higher than
5636                          * the current highest, this one becomes the highest */
5637                         if (   cur_max_x == NULL
5638                             || strGT(x         + COLLXFRM_HDR_LEN,
5639                                      cur_max_x + COLLXFRM_HDR_LEN))
5640                         {
5641                             PL_strxfrm_max_cp = j;
5642                             Safefree(cur_max_x);
5643                             cur_max_x = x;
5644                         }
5645                         else {
5646                             Safefree(x);
5647                         }
5648                     }
5649
5650                     if (! cur_max_x) {
5651                         DEBUG_L(PerlIO_printf(Perl_debug_log,
5652                             "mem_collxfrm_: Couldn't find any character to"
5653                             " replace above-Latin1 chars in locale %s with",
5654                             PL_collation_name));
5655                         goto bad;
5656                     }
5657
5658                     DEBUG_L(PerlIO_printf(Perl_debug_log,
5659                             "mem_collxfrm_: highest 1-byte collating character"
5660                             " in locale %s is 0x%02X\n",
5661                             PL_collation_name,
5662                             PL_strxfrm_max_cp));
5663
5664                     Safefree(cur_max_x);
5665                 }
5666
5667                 /* Here we know which legal code point collates the highest.
5668                  * We are ready to construct the non-UTF-8 string.  The length
5669                  * will be at least 1 byte smaller than the input string
5670                  * (because we changed at least one 2-byte character into a
5671                  * single byte), but that is eaten up by the trailing NUL */
5672                 Newx(s, len, char);
5673
5674                 {
5675                     STRLEN i;
5676                     STRLEN d= 0;
5677                     char * e = (char *) t + len;
5678
5679                     for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
5680                         U8 cur_char = t[i];
5681                         if (UTF8_IS_INVARIANT(cur_char)) {
5682                             s[d++] = cur_char;
5683                         }
5684                         else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
5685                             s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
5686                         }
5687                         else {  /* Replace illegal cp with highest collating
5688                                    one */
5689                             s[d++] = PL_strxfrm_max_cp;
5690                         }
5691                     }
5692                     s[d++] = '\0';
5693                     Renew(s, d, char);   /* Free up unused space */
5694                 }
5695             }
5696         }
5697
5698         /* Here, we have constructed a modified version of the input.  It could
5699          * be that we already had a modified copy before we did this version.
5700          * If so, that copy is no longer needed */
5701         if (t != input_string) {
5702             Safefree(t);
5703         }
5704     }
5705
5706     length_in_chars = (utf8)
5707                       ? utf8_length((U8 *) s, (U8 *) s + len)
5708                       : len;
5709
5710     /* The first element in the output is the collation id, used by
5711      * sv_collxfrm(); then comes the space for the transformed string.  The
5712      * equation should give us a good estimate as to how much is needed */
5713     xAlloc = COLLXFRM_HDR_LEN
5714            + PL_collxfrm_base
5715            + (PL_collxfrm_mult * length_in_chars);
5716     Newx(xbuf, xAlloc, char);
5717     if (UNLIKELY(! xbuf)) {
5718         DEBUG_L(PerlIO_printf(Perl_debug_log,
5719                       "mem_collxfrm_: Couldn't malloc %zu bytes\n", xAlloc));
5720         goto bad;
5721     }
5722
5723     /* Store the collation id */
5724     *(U32*)xbuf = PL_collation_ix;
5725
5726 #  if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
5727 #    ifdef USE_LOCALE_CTYPE
5728
5729     constructed_locale = newlocale(LC_CTYPE_MASK, PL_collation_name,
5730                                    duplocale(use_curlocale_scratch()));
5731 #    else
5732
5733     constructed_locale = duplocale(use_curlocale_scratch());
5734
5735 #    endif
5736 #    define my_strxfrm(dest, src, n)  strxfrm_l(dest, src, n,           \
5737                                                 constructed_locale)
5738 #    define CLEANUP_STRXFRM                                             \
5739         STMT_START {                                                    \
5740             if (constructed_locale != (locale_t) 0)                     \
5741                 freelocale(constructed_locale);                         \
5742         } STMT_END
5743 #  else
5744 #    define my_strxfrm(dest, src, n)  strxfrm(dest, src, n)
5745 #    ifdef USE_LOCALE_CTYPE
5746
5747     orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
5748
5749 #      define CLEANUP_STRXFRM                                           \
5750                 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale)
5751 #    else
5752 #      define CLEANUP_STRXFRM  NOOP
5753 #    endif
5754 #  endif
5755
5756     /* Then the transformation of the input.  We loop until successful, or we
5757      * give up */
5758     for (;;) {
5759
5760         errno = 0;
5761         *xlen = my_strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
5762
5763         /* If the transformed string occupies less space than we told strxfrm()
5764          * was available, it means it transformed the whole string. */
5765         if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
5766
5767             /* But there still could have been a problem */
5768             if (errno != 0) {
5769                 DEBUG_L(PerlIO_printf(Perl_debug_log,
5770                        "strxfrm failed for LC_COLLATE=%s; errno=%d, input=%s\n",
5771                        PL_collation_name, errno,
5772                        _byte_dump_string((U8 *) s, len, 0)));
5773                 goto bad;
5774             }
5775
5776             /* Here, the transformation was successful.  Some systems include a
5777              * trailing NUL in the returned length.  Ignore it, using a loop in
5778              * case multiple trailing NULs are returned. */
5779             while (   (*xlen) > 0
5780                    && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
5781             {
5782                 (*xlen)--;
5783             }
5784
5785             /* If the first try didn't get it, it means our prediction was low.
5786              * Modify the coefficients so that we predict a larger value in any
5787              * future transformations */
5788             if (! first_time) {
5789                 STRLEN needed = *xlen + 1;   /* +1 For trailing NUL */
5790                 STRLEN computed_guess = PL_collxfrm_base
5791                                       + (PL_collxfrm_mult * length_in_chars);
5792
5793                 /* On zero-length input, just keep current slope instead of
5794                  * dividing by 0 */
5795                 const STRLEN new_m = (length_in_chars != 0)
5796                                      ? needed / length_in_chars
5797                                      : PL_collxfrm_mult;
5798
5799                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5800                     "initial size of %zu bytes for a length "
5801                     "%zu string was insufficient, %zu needed\n",
5802                     computed_guess, length_in_chars, needed));
5803
5804                 /* If slope increased, use it, but discard this result for
5805                  * length 1 strings, as we can't be sure that it's a real slope
5806                  * change */
5807                 if (length_in_chars > 1 && new_m  > PL_collxfrm_mult) {
5808
5809 #  ifdef DEBUGGING
5810
5811                     STRLEN old_m = PL_collxfrm_mult;
5812                     STRLEN old_b = PL_collxfrm_base;
5813
5814 #  endif
5815
5816                     PL_collxfrm_mult = new_m;
5817                     PL_collxfrm_base = 1;   /* +1 For trailing NUL */
5818                     computed_guess = PL_collxfrm_base
5819                                     + (PL_collxfrm_mult * length_in_chars);
5820                     if (computed_guess < needed) {
5821                         PL_collxfrm_base += needed - computed_guess;
5822                     }
5823
5824                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5825                                     "slope is now %zu; was %zu, base "
5826                         "is now %zu; was %zu\n",
5827                         PL_collxfrm_mult, old_m,
5828                         PL_collxfrm_base, old_b));
5829                 }
5830                 else {  /* Slope didn't change, but 'b' did */
5831                     const STRLEN new_b = needed
5832                                         - computed_guess
5833                                         + PL_collxfrm_base;
5834                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5835                         "base is now %zu; was %zu\n", new_b, PL_collxfrm_base));
5836                     PL_collxfrm_base = new_b;
5837                 }
5838             }
5839
5840             break;
5841         }
5842
5843         if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
5844             DEBUG_L(PerlIO_printf(Perl_debug_log,
5845                   "mem_collxfrm_: Needed %zu bytes, max permissible is %u\n",
5846                   *xlen, PERL_INT_MAX));
5847             goto bad;
5848         }
5849
5850         /* A well-behaved strxfrm() returns exactly how much space it needs
5851          * (usually not including the trailing NUL) when it fails due to not
5852          * enough space being provided.  Assume that this is the case unless
5853          * it's been proven otherwise */
5854         if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
5855             xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
5856         }
5857         else { /* Here, either:
5858                 *  1)  The strxfrm() has previously shown bad behavior; or
5859                 *  2)  It isn't the first time through the loop, which means
5860                 *      that the strxfrm() is now showing bad behavior, because
5861                 *      we gave it what it said was needed in the previous
5862                 *      iteration, and it came back saying it needed still more.
5863                 *      (Many versions of cygwin fit this.  When the buffer size
5864                 *      isn't sufficient, they return the input size instead of
5865                 *      how much is needed.)
5866                 * Increase the buffer size by a fixed percentage and try again.
5867                 * */
5868             xAlloc += (xAlloc / 4) + 1;
5869             PL_strxfrm_is_behaved = FALSE;
5870
5871             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5872                      "mem_collxfrm_ required more space than previously"
5873                      " calculated for locale %s, trying again with new"
5874                      " guess=%zu+%zu\n",
5875                 PL_collation_name,  COLLXFRM_HDR_LEN,
5876                      xAlloc - COLLXFRM_HDR_LEN));
5877         }
5878
5879         Renew(xbuf, xAlloc, char);
5880         if (UNLIKELY(! xbuf)) {
5881             DEBUG_L(PerlIO_printf(Perl_debug_log,
5882                       "mem_collxfrm_: Couldn't realloc %zu bytes\n", xAlloc));
5883             goto bad;
5884         }
5885
5886         first_time = FALSE;
5887     }
5888
5889     CLEANUP_STRXFRM;
5890
5891     DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8));
5892
5893     /* Free up unneeded space; retain enough for trailing NUL */
5894     Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
5895
5896     if (s != input_string) {
5897         Safefree(s);
5898     }
5899
5900     return xbuf;
5901
5902   bad:
5903
5904     CLEANUP_STRXFRM;
5905     DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8));
5906
5907     Safefree(xbuf);
5908     if (s != input_string) {
5909         Safefree(s);
5910     }
5911     *xlen = 0;
5912
5913     return NULL;
5914 }
5915
5916 #  ifdef DEBUGGING
5917
5918 STATIC void
5919 S_print_collxfrm_input_and_return(pTHX_
5920                                   const char * s,
5921                                   const char * e,
5922                                   const char * xbuf,
5923                                   const STRLEN xlen,
5924                                   const bool is_utf8)
5925 {
5926
5927     PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
5928
5929     PerlIO_printf(Perl_debug_log,
5930                   "mem_collxfrm_[ix %" UVuf "] for locale '%s':\n",
5931                   (UV) PL_collation_ix, PL_collation_name);
5932     PerlIO_printf(Perl_debug_log,  "     input=");
5933     print_bytes_for_locale(s, e, is_utf8);
5934     PerlIO_printf(Perl_debug_log, "\n    return=%s\n    return len=%zu\n",
5935            ((xbuf == NULL)
5936              ? "(null)"
5937              : _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN, xlen, 0)),
5938             xlen);
5939 }
5940
5941 #  endif    /* DEBUGGING */
5942 #endif /* USE_LOCALE_COLLATE */
5943
5944 #ifdef DEBUGGING
5945
5946 STATIC void
5947 S_print_bytes_for_locale(pTHX_
5948                     const char * const s,
5949                     const char * const e,
5950                     const bool is_utf8)
5951 {
5952     const char * t = s;
5953     bool prev_was_printable = TRUE;
5954     bool first_time = TRUE;
5955
5956     PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
5957
5958     while (t < e) {
5959         UV cp = (is_utf8)
5960                 ?  utf8_to_uvchr_buf((U8 *) t, e, NULL)
5961                 : * (U8 *) t;
5962         if (isPRINT(cp)) {
5963             if (! prev_was_printable) {
5964                 PerlIO_printf(Perl_debug_log, " ");
5965             }
5966             PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
5967             prev_was_printable = TRUE;
5968         }
5969         else {
5970             if (! first_time) {
5971                 PerlIO_printf(Perl_debug_log, " ");
5972             }
5973             PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
5974             prev_was_printable = FALSE;
5975         }
5976         t += (is_utf8) ? UTF8SKIP(t) : 1;
5977         first_time = FALSE;
5978     }
5979 }
5980
5981 #endif   /* #ifdef DEBUGGING */
5982 #ifdef USE_LOCALE
5983
5984 STATIC const char *
5985 S_toggle_locale_i(pTHX_ const unsigned cat_index,
5986                         const char * new_locale,
5987                         const line_t caller_line)
5988 {
5989     /* Changes the locale for the category specified by 'index' to 'new_locale,
5990      * if they aren't already the same.
5991      *
5992      * Returns a copy of the name of the original locale for 'cat_index'
5993      * so can be switched back to with the companion function
5994      * restore_toggled_locale_i(),  (NULL if no restoral is necessary.) */
5995
5996     const char * locale_to_restore_to = NULL;
5997
5998     PERL_ARGS_ASSERT_TOGGLE_LOCALE_I;
5999     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
6000
6001     /* Find the original locale of the category we may need to change, so that
6002      * it can be restored to later */
6003
6004     locale_to_restore_to = querylocale_i(cat_index);
6005
6006     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6007              "(%" LINE_Tf "): toggle_locale_i: index=%d(%s), wanted=%s,"
6008              " actual=%s\n",
6009              caller_line, cat_index, category_names[cat_index],
6010              new_locale, locale_to_restore_to));
6011
6012     if (! locale_to_restore_to) {
6013         locale_panic_(Perl_form(aTHX_ "Could not find current %s locale, errno=%d",
6014                                 category_names[cat_index], errno));
6015     }
6016
6017     /* If the locales are the same, there's nothing to do */
6018     if (strEQ(locale_to_restore_to, new_locale)) {
6019         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6020                                "(%d): %s locale unchanged as %s\n",
6021                                caller_line, category_names[cat_index],
6022                                new_locale));
6023
6024         return NULL;
6025     }
6026
6027     /* Finally, change the locale to the new one */
6028     void_setlocale_i(cat_index, new_locale);
6029
6030     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "(%" LINE_Tf "): %s locale switched to %s\n",
6031                            caller_line, category_names[cat_index], new_locale));
6032
6033     return locale_to_restore_to;
6034
6035 #ifndef DEBUGGING
6036     PERL_UNUSED_ARG(caller_line);
6037 #endif
6038
6039 }
6040
6041 STATIC void
6042 S_restore_toggled_locale_i(pTHX_ const unsigned int cat_index,
6043                                  const char * restore_locale,
6044                                  const line_t caller_line)
6045 {
6046     /* Restores the locale for LC_category corresponding to cat_indes to
6047      * 'restore_locale' (which is a copy that will be freed by this function),
6048      * or do nothing if the latter parameter is NULL */
6049
6050     PERL_ARGS_ASSERT_RESTORE_TOGGLED_LOCALE_I;
6051     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
6052
6053     if (restore_locale == NULL) {
6054         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6055                                "(%" LINE_Tf "): No need to restore %s\n",
6056                                caller_line, category_names[cat_index]));
6057         return;
6058     }
6059
6060     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6061                            "(%" LINE_Tf "): %s restoring locale to %s\n",
6062                            caller_line, category_names[cat_index],
6063                            restore_locale));
6064
6065     void_setlocale_i(cat_index, restore_locale);
6066
6067 #ifndef DEBUGGING
6068     PERL_UNUSED_ARG(caller_line);
6069 #endif
6070
6071 }
6072
6073 #ifdef USE_LOCALE_CTYPE
6074
6075 STATIC bool
6076 S_is_codeset_name_UTF8(const char * name)
6077 {
6078     /* Return a boolean as to if the passed-in name indicates it is a UTF-8
6079      * code set.  Several variants are possible */
6080     const Size_t len = strlen(name);
6081
6082     PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8;
6083
6084 #  ifdef WIN32
6085
6086     /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
6087     if (memENDs(name, len, "65001")) {
6088         return TRUE;
6089     }
6090
6091 #  endif
6092                /* 'UTF8' or 'UTF-8' */
6093     return (    inRANGE(len, 4, 5)
6094             &&  name[len-1] == '8'
6095             && (   memBEGINs(name, len, "UTF")
6096                 || memBEGINs(name, len, "utf"))
6097             && (len == 4 || name[3] == '-'));
6098 }
6099
6100 #endif
6101
6102 STATIC bool
6103 S_is_locale_utf8(pTHX_ const char * locale)
6104 {
6105     /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise.  It uses
6106      * my_langinfo(), which employs various methods to get this information
6107      * if nl_langinfo() isn't available, using heuristics as a last resort, in
6108      * which case, the result will very likely be correct for locales for
6109      * languages that have commonly used non-ASCII characters, but for notably
6110      * English, it comes down to if the locale's name ends in something like
6111      * "UTF-8".  It errs on the side of not being a UTF-8 locale. */
6112
6113 #  if ! defined(USE_LOCALE_CTYPE)                                             \
6114    ||   defined(EBCDIC) /* There aren't any real UTF-8 locales at this time */
6115
6116     PERL_UNUSED_ARG(locale);
6117
6118     return FALSE;
6119
6120 #  else
6121
6122     const char * scratch_buffer = NULL;
6123     const char * codeset;
6124     bool retval;
6125
6126     PERL_ARGS_ASSERT_IS_LOCALE_UTF8;
6127
6128     if (strEQ(locale, PL_ctype_name)) {
6129         return PL_in_utf8_CTYPE_locale;
6130     }
6131
6132     codeset = my_langinfo_c(CODESET, LC_CTYPE, locale,
6133                             &scratch_buffer, NULL, NULL);
6134     retval = is_codeset_name_UTF8(codeset);
6135
6136     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6137                            "found codeset=%s, is_utf8=%d\n", codeset, retval));
6138
6139     Safefree(scratch_buffer);
6140     return retval;
6141
6142 #  endif
6143
6144 }
6145
6146 #endif  /* USE_LOCALE */
6147
6148 bool
6149 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
6150 {
6151     /* Internal function which returns if we are in the scope of a pragma that
6152      * enables the locale category 'category'.  'compiling' should indicate if
6153      * this is during the compilation phase (TRUE) or not (FALSE). */
6154
6155     const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
6156
6157     SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0);
6158     if (! these_categories || these_categories == &PL_sv_placeholder) {
6159         return FALSE;
6160     }
6161
6162     /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
6163      * a valid unsigned */
6164     assert(category >= -1);
6165     return cBOOL(SvUV(these_categories) & (1U << (category + 1)));
6166 }
6167
6168 /* my_strerror() returns a mortalized copy of the text of the error message
6169  * associated with 'errnum'.
6170  *
6171  * If not called from within the scope of 'use locale', it uses the text from
6172  * the C locale.  If Perl is compiled to not pay attention to LC_CTYPE nor
6173  * LC_MESSAGES, it uses whatever strerror() returns.  Otherwise the text is
6174  * derived from the locale, LC_MESSAGES if we have that; LC_CTYPE if not.
6175  *
6176  * It returns in *utf8ness the result's UTF-8ness
6177  *
6178  * The function just calls strerror(), but temporarily switches locales, if
6179  * needed.  Many platforms require LC_CTYPE and LC_MESSAGES to be in the same
6180  * CODESET in order for the return from strerror() to not contain '?' symbols,
6181  * or worse, mojibaked.  It's cheaper to just use the stricter criteria of
6182  * being in the same locale.  So the code below uses a common locale for both
6183  * categories.  Again, that is C if not within 'use locale' scope; or the
6184  * LC_MESSAGES locale if in scope and we have that category; and LC_CTYPE if we
6185  * don't have LC_MESSAGES; and whatever strerror returns if we don't have
6186  * either category.
6187  *
6188  * There are two sets of implementations.  The first below is if we have
6189  * strerror_l().  This is the simpler.  We just use the already-built C locale
6190  * object if not in locale scope, or build up a custom one otherwise.
6191  *
6192  * When strerror_l() is not available, we may have to swap locales temporarily
6193  * to bring the two categories into sync with each other, and possibly to the C
6194  * locale.
6195  *
6196  * Because the prepropessing directives to conditionally compile this function
6197  * would greatly obscure the logic of the various implementations, the whole
6198  * function is repeated for each configuration, with some common macros. */
6199
6200 /* Used to shorten the definitions of the following implementations of
6201  * my_strerror() */
6202 #define DEBUG_STRERROR_ENTER(errnum, in_locale)                             \
6203     DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                  \
6204                            "my_strerror called with errnum %d;"             \
6205                            " Within locale scope=%d\n",                     \
6206                            errnum, in_locale))
6207 #define DEBUG_STRERROR_RETURN(errstr, utf8ness)                             \
6208     DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                  \
6209                            "Strerror returned; saving a copy: '");          \
6210        print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);          \
6211        PerlIO_printf(Perl_debug_log, "'; utf8ness=%d\n", (int) *utf8ness);)
6212
6213 /* On platforms that have precisely one of these categories (Windows
6214  * qualifies), these yield the correct one */
6215 #if defined(USE_LOCALE_CTYPE)
6216 #  define WHICH_LC_INDEX LC_CTYPE_INDEX_
6217 #elif defined(USE_LOCALE_MESSAGES)
6218 #  define WHICH_LC_INDEX LC_MESSAGES_INDEX_
6219 #endif
6220
6221 /*==========================================================================*/
6222 /* First set of implementations, when have strerror_l() */
6223
6224 #if defined(USE_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
6225
6226 #  if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
6227
6228 /* Here, neither category is defined: use the C locale */
6229 const char *
6230 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6231 {
6232     PERL_ARGS_ASSERT_MY_STRERROR;
6233
6234     DEBUG_STRERROR_ENTER(errnum, 0);
6235
6236     const char *errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
6237     *utf8ness = UTF8NESS_IMMATERIAL;
6238
6239     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6240
6241     SAVEFREEPV(errstr);
6242     return errstr;
6243 }
6244
6245 #  elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
6246
6247 /*--------------------------------------------------------------------------*/
6248
6249 /* Here one or the other of CTYPE or MESSAGES is defined, but not both.  If we
6250  * are not within 'use locale' scope of the only one defined, we use the C
6251  * locale; otherwise use the current locale object */
6252
6253 const char *
6254 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6255 {
6256     PERL_ARGS_ASSERT_MY_STRERROR;
6257
6258     DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
6259
6260     /* Use C if not within locale scope;  Otherwise, use current locale */
6261     const locale_t which_obj = (IN_LC(categories[WHICH_LC_INDEX]))
6262                                ? PL_C_locale_obj
6263                                : use_curlocale_scratch();
6264
6265     const char *errstr = savepv(strerror_l(errnum, which_obj));
6266     *utf8ness = get_locale_string_utf8ness_i(NULL, WHICH_LC_INDEX, errstr,
6267                                              LOCALE_UTF8NESS_UNKNOWN);
6268     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6269
6270     SAVEFREEPV(errstr);
6271     return errstr;
6272 }
6273
6274 /*--------------------------------------------------------------------------*/
6275 #  else     /* Are using both categories.  Place them in the same CODESET,
6276              * either C or the LC_MESSAGES locale */
6277
6278 const char *
6279 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6280 {
6281     PERL_ARGS_ASSERT_MY_STRERROR;
6282
6283     DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
6284
6285     const char *errstr;
6286     if (! IN_LC(LC_MESSAGES)) {    /* Use C if not within locale scope */
6287         errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
6288         *utf8ness = UTF8NESS_IMMATERIAL;
6289     }
6290     else {  /* Otherwise, use the LC_MESSAGES locale, making sure LC_CTYPE
6291                matches */
6292         locale_t cur = duplocale(use_curlocale_scratch());
6293
6294         cur = newlocale(LC_CTYPE_MASK, querylocale_c(LC_MESSAGES), cur);
6295         errstr = savepv(strerror_l(errnum, cur));
6296         *utf8ness = get_locale_string_utf8ness_i(NULL, LC_MESSAGES_INDEX_,
6297                                                errstr, LOCALE_UTF8NESS_UNKNOWN);
6298         freelocale(cur);
6299     }
6300
6301     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6302
6303     SAVEFREEPV(errstr);
6304     return errstr;
6305 }
6306 #  endif    /* Above is using strerror_l */
6307 /*==========================================================================*/
6308 #else       /* Below is not using strerror_l */
6309 #  if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
6310
6311 /* If not using using either of the categories, return plain, unadorned
6312  * strerror */
6313
6314 const char *
6315 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6316 {
6317     PERL_ARGS_ASSERT_MY_STRERROR;
6318
6319     DEBUG_STRERROR_ENTER(errnum, 0);
6320
6321     const char *errstr = savepv(Strerror(errnum));
6322     *utf8ness = UTF8NESS_IMMATERIAL;
6323
6324     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6325
6326     SAVEFREEPV(errstr);
6327     return errstr;
6328 }
6329
6330 /*--------------------------------------------------------------------------*/
6331 #  elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
6332
6333 /* Here one or the other of CTYPE or MESSAGES is defined, but not both.  If we
6334  * are not within 'use locale' scope of the only one defined, we use the C
6335  * locale; otherwise use the current locale */
6336
6337 const char *
6338 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6339 {
6340     PERL_ARGS_ASSERT_MY_STRERROR;
6341
6342     DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
6343
6344     const char *errstr;
6345     if (IN_LC(categories[WHICH_LC_INDEX])) {
6346         errstr = savepv(Strerror(errnum));
6347         *utf8ness = get_locale_string_utf8ness_i(NULL, WHICH_LC_INDEX, errstr,
6348                                                  LOCALE_UTF8NESS_UNKNOWN);
6349     }
6350     else {
6351
6352         SETLOCALE_LOCK;
6353
6354         const char * orig_locale = toggle_locale_i(WHICH_LC_INDEX, "C");
6355
6356         errstr = savepv(Strerror(errnum));
6357
6358         restore_toggled_locale_i(WHICH_LC_INDEX, orig_locale);
6359
6360         SETLOCALE_UNLOCK;
6361
6362         *utf8ness = UTF8NESS_IMMATERIAL;
6363
6364     }
6365
6366     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6367
6368     SAVEFREEPV(errstr);
6369     return errstr;
6370 }
6371
6372 /*--------------------------------------------------------------------------*/
6373 #  else
6374
6375 /* Below, have both LC_CTYPE and LC_MESSAGES.  Place them in the same CODESET,
6376  * either C or the LC_MESSAGES locale */
6377
6378 const char *
6379 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6380 {
6381     PERL_ARGS_ASSERT_MY_STRERROR;
6382
6383     DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
6384
6385     const char * desired_locale = (IN_LC(LC_MESSAGES))
6386                                   ? querylocale_c(LC_MESSAGES)
6387                                   : "C";
6388     /* XXX Can fail on z/OS */
6389
6390     SETLOCALE_LOCK;
6391
6392     const char* orig_CTYPE_locale    = toggle_locale_c(LC_CTYPE, desired_locale);
6393     const char* orig_MESSAGES_locale = toggle_locale_c(LC_MESSAGES,
6394                                                        desired_locale);
6395     const char *errstr = savepv(Strerror(errnum));
6396
6397     restore_toggled_locale_c(LC_MESSAGES, orig_MESSAGES_locale);
6398     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
6399
6400     SETLOCALE_UNLOCK;
6401
6402     *utf8ness = get_locale_string_utf8ness_i(NULL, LC_MESSAGES_INDEX_, errstr,
6403                                              LOCALE_UTF8NESS_UNKNOWN);
6404     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6405
6406     SAVEFREEPV(errstr);
6407     return errstr;
6408 }
6409
6410 /*--------------------------------------------------------------------------*/
6411 #  endif /* end of not using strerror_l() */
6412 #endif   /* end of all the my_strerror() implementations */
6413
6414 /*
6415
6416 =for apidoc switch_to_global_locale
6417
6418 On systems without locale support, or on typical single-threaded builds, or on
6419 platforms that do not support per-thread locale operations, this function does
6420 nothing.  On such systems that do have locale support, only a locale global to
6421 the whole program is available.
6422
6423 On multi-threaded builds on systems that do have per-thread locale operations,
6424 this function converts the thread it is running in to use the global locale.
6425 This is for code that has not yet or cannot be updated to handle multi-threaded
6426 locale operation.  As long as only a single thread is so-converted, everything
6427 works fine, as all the other threads continue to ignore the global one, so only
6428 this thread looks at it.
6429
6430 However, on Windows systems this isn't quite true prior to Visual Studio 15,
6431 at which point Microsoft fixed a bug.  A race can occur if you use the
6432 following operations on earlier Windows platforms:
6433
6434 =over
6435
6436 =item L<POSIX::localeconv|POSIX/localeconv>
6437
6438 =item L<I18N::Langinfo>, items C<CRNCYSTR> and C<THOUSEP>
6439
6440 =item L<perlapi/Perl_langinfo>, items C<CRNCYSTR> and C<THOUSEP>
6441
6442 =back
6443
6444 The first item is not fixable (except by upgrading to a later Visual Studio
6445 release), but it would be possible to work around the latter two items by using
6446 the Windows API functions C<GetNumberFormat> and C<GetCurrencyFormat>; patches
6447 welcome.
6448
6449 Without this function call, threads that use the L<C<setlocale(3)>> system
6450 function will not work properly, as all the locale-sensitive functions will
6451 look at the per-thread locale, and C<setlocale> will have no effect on this
6452 thread.
6453
6454 Perl code should convert to either call
6455 L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in for the system
6456 C<setlocale>) or use the methods given in L<perlcall> to call
6457 L<C<POSIX::setlocale>|POSIX/setlocale>.  Either one will transparently properly
6458 handle all cases of single- vs multi-thread, POSIX 2008-supported or not.
6459
6460 Non-Perl libraries, such as C<gtk>, that call the system C<setlocale> can
6461 continue to work if this function is called before transferring control to the
6462 library.
6463
6464 Upon return from the code that needs to use the global locale,
6465 L<C<sync_locale()>|perlapi/sync_locale> should be called to restore the safe
6466 multi-thread operation.
6467
6468 =cut
6469 */
6470
6471 void
6472 Perl_switch_to_global_locale()
6473 {
6474     dTHX;
6475
6476 #ifdef USE_THREAD_SAFE_LOCALE
6477 #  ifdef WIN32
6478
6479     _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
6480
6481 #  else
6482
6483     {
6484         unsigned int i;
6485
6486         for (i = 0; i < LC_ALL_INDEX_; i++) {
6487             setlocale(categories[i], querylocale_i(i));
6488         }
6489     }
6490
6491     uselocale(LC_GLOBAL_LOCALE);
6492
6493 #  endif
6494 #endif
6495
6496 }
6497
6498 /*
6499
6500 =for apidoc sync_locale
6501
6502 L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or
6503 change the locale (though changing the locale is antisocial and dangerous on
6504 multi-threaded systems that don't have multi-thread safe locale operations.
6505 (See L<perllocale/Multi-threaded operation>).  Using the system
6506 L<C<setlocale(3)>> should be avoided.  Nevertheless, certain non-Perl libraries
6507 called from XS, such as C<Gtk> do so, and this can't be changed.  When the
6508 locale is changed by XS code that didn't use
6509 L<C<Perl_setlocale>|perlapi/Perl_setlocale>, Perl needs to be told that the
6510 locale has changed.  Use this function to do so, before returning to Perl.
6511
6512 The return value is a boolean: TRUE if the global locale at the time of call
6513 was in effect; and FALSE if a per-thread locale was in effect.  This can be
6514 used by the caller that needs to restore things as-they-were to decide whether
6515 or not to call
6516 L<C<Perl_switch_to_global_locale>|perlapi/switch_to_global_locale>.
6517
6518 =cut
6519 */
6520
6521 bool
6522 Perl_sync_locale()
6523 {
6524
6525 #ifndef USE_LOCALE
6526
6527     return TRUE;
6528
6529 #else
6530
6531     const char * newlocale;
6532     dTHX;
6533
6534 #  ifdef USE_POSIX_2008_LOCALE
6535
6536     bool was_in_global_locale = FALSE;
6537     locale_t cur_obj = uselocale((locale_t) 0);
6538
6539     /* On Windows, unless the foreign code has turned off the thread-safe
6540      * locale setting, any plain setlocale() will have affected what we see, so
6541      * no need to worry.  Otherwise, If the foreign code has done a plain
6542      * setlocale(), it will only affect the global locale on POSIX systems, but
6543      * will affect the */
6544     if (cur_obj == LC_GLOBAL_LOCALE) {
6545
6546 #    ifdef HAS_QUERY_LOCALE
6547
6548         void_setlocale_c(LC_ALL, querylocale_c(LC_ALL));
6549
6550 #    else
6551
6552         unsigned int i;
6553
6554         /* We can't trust that we can read the LC_ALL format on the
6555          * platform, so do them individually */
6556         for (i = 0; i < LC_ALL_INDEX_; i++) {
6557             void_setlocale_i(i, querylocale_i(i));
6558         }
6559
6560 #    endif
6561
6562         was_in_global_locale = TRUE;
6563     }
6564
6565 #  else
6566
6567     bool was_in_global_locale = TRUE;
6568
6569 #  endif
6570 #  ifdef USE_LOCALE_CTYPE
6571
6572     newlocale = querylocale_c(LC_CTYPE);
6573     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6574                   "%s\n", setlocale_debug_string_c(LC_CTYPE, NULL, newlocale)));
6575     new_ctype(newlocale);
6576
6577 #  endif /* USE_LOCALE_CTYPE */
6578 #  ifdef USE_LOCALE_COLLATE
6579
6580     newlocale = querylocale_c(LC_COLLATE);
6581     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6582                 "%s\n", setlocale_debug_string_c(LC_COLLATE, NULL, newlocale)));
6583     new_collate(newlocale);
6584
6585 #  endif
6586 #  ifdef USE_LOCALE_NUMERIC
6587
6588     newlocale = querylocale_c(LC_NUMERIC);
6589     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6590                 "%s\n", setlocale_debug_string_c(LC_NUMERIC, NULL, newlocale)));
6591     new_numeric(newlocale);
6592
6593 #  endif /* USE_LOCALE_NUMERIC */
6594
6595     return was_in_global_locale;
6596
6597 #endif
6598
6599 }
6600
6601 #if defined(DEBUGGING) && defined(USE_LOCALE)
6602
6603 STATIC char *
6604 S_my_setlocale_debug_string_i(pTHX_
6605                               const unsigned cat_index,
6606                               const char* locale, /* Optional locale name */
6607
6608                               /* return value from setlocale() when attempting
6609                                * to set 'category' to 'locale' */
6610                               const char* retval,
6611
6612                               const line_t line)
6613 {
6614     /* Returns a pointer to a NUL-terminated string in static storage with
6615      * added text about the info passed in.  This is not thread safe and will
6616      * be overwritten by the next call, so this should be used just to
6617      * formulate a string to immediately print or savepv() on. */
6618
6619     const char * locale_quote;
6620     const char * retval_quote;
6621
6622     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
6623
6624     if (locale == NULL) {
6625         locale_quote = "";
6626         locale = "NULL";
6627     }
6628     else {
6629         locale_quote = "\"";
6630     }
6631
6632     if (retval == NULL) {
6633         retval_quote = "";
6634         retval = "NULL";
6635     }
6636     else {
6637         retval_quote = "\"";
6638     }
6639
6640 #  ifdef USE_LOCALE_THREADS
6641 #    define THREAD_FORMAT "%p:"
6642 #    define THREAD_ARGUMENT aTHX_
6643 #  else
6644 #    define THREAD_FORMAT
6645 #    define THREAD_ARGUMENT
6646 #  endif
6647
6648     return Perl_form(aTHX_
6649                      "%s:%" LINE_Tf ":" THREAD_FORMAT
6650                      " setlocale(%s[%d], %s%s%s) returned %s%s%s\n",
6651
6652                      __FILE__, line, THREAD_ARGUMENT
6653                      category_names[cat_index], categories[cat_index],
6654                      locale_quote, locale, locale_quote,
6655                      retval_quote, retval, retval_quote);
6656 }
6657
6658 #endif
6659
6660 void
6661 Perl_thread_locale_init()
6662 {
6663     /* Called from a thread on startup*/
6664
6665 #ifdef USE_THREAD_SAFE_LOCALE
6666
6667     dTHX_DEBUGGING;
6668
6669      DEBUG_L(PerlIO_printf(Perl_debug_log,
6670                            "new thread, initial locale is %s;"
6671                            " calling setlocale(LC_ALL, \"C\")\n",
6672                            get_LC_ALL_display()));
6673 #  ifdef WIN32
6674
6675     /* On Windows, make sure new thread has per-thread locales enabled */
6676     _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
6677
6678 #  endif
6679 #  if defined(LC_ALL)
6680
6681     /* This thread starts off in the C locale.  Use the full Perl_setlocale()
6682      * to make sure no ill-advised shortcuts get taken on this new thread, */
6683     Perl_setlocale(LC_ALL, "C");
6684
6685 #  else
6686
6687     for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
6688         Perl_setlocale(categories[i], "C");
6689     }
6690
6691 #  endif
6692 #endif
6693
6694 }
6695
6696 void
6697 Perl_thread_locale_term()
6698 {
6699     /* Called from a thread as it gets ready to terminate */
6700
6701 #ifdef USE_POSIX_2008_LOCALE
6702
6703     /* C starts the new thread in the global C locale.  If we are thread-safe,
6704      * we want to not be in the global locale */
6705
6706     {   /* Free up */
6707         locale_t cur_obj = uselocale(LC_GLOBAL_LOCALE);
6708         if (cur_obj != LC_GLOBAL_LOCALE && cur_obj != PL_C_locale_obj) {
6709             freelocale(cur_obj);
6710         }
6711     }
6712
6713 #endif
6714
6715 }
6716
6717 /*
6718  * ex: set ts=8 sts=4 sw=4 et:
6719  */