locale.c

   1 /*    locale.c
   2  *
   3  *    Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
   4  *    2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  *      A Elbereth Gilthoniel,
  13  *      silivren penna míriel
  14  *      o menel aglar elenath!
  15  *      Na-chaered palan-díriel
  16  *      o galadhremmin ennorath,
  17  *      Fanuilos, le linnathon
  18  *      nef aear, si nef aearon!
  19  *
  20  *     [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
  21  */
  22
  23 /* utility functions for handling locale-specific stuff like what
  24  * character represents the decimal point.
  25  *
  26  * All C programs have an underlying locale.  Perl code generally doesn't pay
  27  * any attention to it except within the scope of a 'use locale'.  For most
  28  * categories, it accomplishes this by just using different operations if it is
  29  * in such scope than if not.  However, various libc functions called by Perl
  30  * are affected by the LC_NUMERIC category, so there are macros in perl.h that
  31  * are used to toggle between the current locale and the C locale depending on
  32  * the desired behavior of those functions at the moment.  And, LC_MESSAGES is
  33  * switched to the C locale for outputting the message unless within the scope
  34  * of 'use locale'.
  35  *
  36  * This code now has multi-thread-safe locale handling on systems that support
  37  * that.  This is completely transparent to most XS code.  On earlier systems,
  38  * it would be possible to emulate thread-safe locales, but this likely would
  39  * involve a lot of locale switching, and would require XS code changes.
  40  * Macros could be written so that the code wouldn't have to know which type of
  41  * system is being used.
  42  *
  43  * Table-driven code is used for simplicity and clarity, as many operations
  44  * differ only in which category is being worked on.  However the system
  45  * categories need not be small contiguous integers, so do not lend themselves
  46  * to table lookup.  Instead we have created our own equivalent values which
  47  * are all small contiguous non-negative integers, and translation functions
  48  * between the two sets.  For category 'LC_foo', the name of our index is
  49  * LC_foo_INDEX_.  Various parallel tables, indexed by these, are used.
  50  *
  51  * Many of the macros and functions in this file have one of the suffixes '_c',
  52  * '_r', or '_i'.  khw found these useful in remembering what type of locale
  53  * category to use as their parameter.  '_r' takes an int category number as
  54  * passed to setlocale(), like LC_ALL, LC_CTYPE, etc.  The 'r' indicates that
  55  * the value isn't known until runtime.  '_c' also indicates such a category
  56  * number, but its value is known at compile time.  These are both converted
  57  * into unsigned indexes into various tables of category information, where the
  58  * real work is generally done.  The tables are generated at compile-time based
  59  * on platform characteristics and Configure options.  They hide from the code
  60  * many of the vagaries of the different locale implementations out there.  You
  61  * may have already guessed that '_i' indicates the parameter is such an
  62  * unsigned index.  Converting from '_r' to '_i' requires run-time lookup.
  63  * '_c' is used to get cpp to do this at compile time.  To avoid the runtime
  64  * expense, the code is structured to use '_r' at the API level, and once
  65  * converted, everything possible is done using the table indexes.
  66  *
  67  * On unthreaded perls, most operations expand out to just the basic
  68  * setlocale() calls.  The same is true on threaded perls on modern Windows
  69  * systems where the same API, after set up, is used for thread-safe locale
  70  * handling.  On other systems, there is a completely different API, specified
  71  * in POSIX 2008, to do thread-safe locales.  On these systems, our
  72  * emulate_setlocale_i() function is used to hide the different API from the
  73  * outside.  This makes it completely transparent to most XS code.
  74  *
  75  * A huge complicating factor is that the LC_NUMERIC category is normally held
  76  * in the C locale, except during those relatively rare times when it needs to
  77  * be in the underlying locale.  There is a bunch of code to accomplish this,
  78  * and to allow easy switches from one state to the other.
  79  *
  80  * In addition, the setlocale equivalents have versions for the return context,
  81  * 'void' and 'bool', besides the full return value.  This can present
  82  * opportunities for avoiding work.  We don't have to necessarily create a safe
  83  * copy to return if no return is desired.
  84  *
  85  * There are 3.5 major implementations here; which one chosen depends on what
  86  * the platform has available, and Configuration options.
  87  *
  88  * 1) Raw my_setlocale().  Here the layer adds nothing.  This is used for
  89  *    unthreaded perls, and when the API for safe locale threading is identical
  90  *    to the unsafe API (Windows, currently).
  91  *
  92  * 2) A minimal layer that makes my_setlocale() uninterruptible and returns a
  93  *    per-thread/per-category value.
  94  *
  95  * 3a and 3b) A layer that implements POSIX 2008 thread-safe locale handling,
  96  *    mapping the setlocale() API to them.  This automatically makes almost all
  97  *    code thread-safe without need for changes.  This layer is chosen on
  98  *    threaded perls when the platform supports the POSIX 2008 functions, and
  99  *    when there is no manual override in Configure.
 100  *
 101  *    3a) is when the platform has a reliable querylocale() function or
 102  *        equivalent that is selected to be used.
 103  *    3b) is when we have to emulate that functionality.
 104  *
 105  * z/OS (os390) is an outlier.  Locales really don't work under threads when
 106  * either the radix character isn't a dot, or attempts are made to change
 107  * locales after the first thread is created.  The reason is that IBM has made
 108  * it thread-safe by refusing to change locales (returning failure if
 109  * attempted) any time after an application has called pthread_create() to
 110  * create another thread.  The expectation is that an application will set up
 111  * its locale information before the first fork, and be stable thereafter.  But
 112  * perl toggles LC_NUMERIC if the locale's radix character isn't a dot, as do
 113  * the other toggles, which are less common.
 114  */
 115
 116 /* If the environment says to, we can output debugging information during
 117  * initialization.  This is done before option parsing, and before any thread
 118  * creation, so can be a file-level static.  (Must come before #including
 119  * perl.h) */
 120 #ifdef DEBUGGING
 121 static int debug_initialization = 0;
 122 #  define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
 123 #  define DEBUG_LOCALE_INITIALIZATION_  debug_initialization
 124 #else
 125 #  define debug_initialization 0
 126 #  define DEBUG_INITIALIZATION_set(v)
 127 #endif
 128
 129 #define DEBUG_PRE_STMTS   dSAVE_ERRNO;                                        \
 130     PerlIO_printf(Perl_debug_log, "%s: %" LINE_Tf ": ", __FILE__, __LINE__);
 131 #define DEBUG_POST_STMTS  RESTORE_ERRNO;
 132
 133 #include "EXTERN.h"
 134 #define PERL_IN_LOCALE_C
 135 #include "perl.h"
 136
 137 #include "reentr.h"
 138
 139 #ifdef I_WCHAR
 140 #  include <wchar.h>
 141 #endif
 142 #ifdef I_WCTYPE
 143 #  include <wctype.h>
 144 #endif
 145
 146 #ifdef USE_LOCALE
 147
 148 PERL_STATIC_INLINE const char *
 149 S_mortalized_pv_copy(pTHX_ const char * const pv)
 150 {
 151     PERL_ARGS_ASSERT_MORTALIZED_PV_COPY;
 152
 153     /* Copies the input pv, and arranges for it to be freed at an unspecified
 154      * later time. */
 155
 156     if (pv == NULL) {
 157         return NULL;
 158     }
 159
 160     const char * copy = savepv(pv);
 161     SAVEFREEPV(copy);
 162     return copy;
 163 }
 164
 165 #endif
 166
 167 /* Returns the Unix errno portion; ignoring any others.  This is a macro here
 168  * instead of putting it into perl.h, because unclear to khw what should be
 169  * done generally. */
 170 #define GET_ERRNO   saved_errno
 171
 172 /* Default values come from the C locale */
 173 #define C_codeset "ANSI_X3.4-1968" /* Only in some Configurations, and usually
 174                                       a single instance, so is a #define */
 175 static const char C_decimal_point[] = ".";
 176 static const char C_thousands_sep[] = "";
 177
 178 /* Is the C string input 'name' "C" or "POSIX"?  If so, and 'name' is the
 179  * return of setlocale(), then this is extremely likely to be the C or POSIX
 180  * locale.  However, the output of setlocale() is documented to be opaque, but
 181  * the odds are extremely small that it would return these two strings for some
 182  * other locale.  Note that VMS in these two locales includes many non-ASCII
 183  * characters as controls and punctuation (below are hex bytes):
 184  *   cntrl:  84-97 9B-9F
 185  *   punct:  A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
 186  * Oddly, none there are listed as alphas, though some represent alphabetics
 187  * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
 188 #define isNAME_C_OR_POSIX(name)                                              \
 189                              (   (name) != NULL                              \
 190                               && (( *(name) == 'C' && (*(name + 1)) == '\0') \
 191                                    || strEQ((name), "POSIX")))
 192
 193 #if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO)
 194 #  define HAS_SOME_LANGINFO
 195 #endif
 196 #if defined(HAS_LOCALECONV) || defined(HAS_LOCALECONV_L)
 197 #  define HAS_SOME_LOCALECONV
 198 #endif
 199
 200 #define my_langinfo_c(item, category, locale, retbufp, retbuf_sizep, utf8ness) \
 201             my_langinfo_i(item, category##_INDEX_, locale, retbufp,            \
 202                                                       retbuf_sizep,  utf8ness)
 203
 204 #ifdef USE_LOCALE
 205
 206 #  ifdef DEBUGGING
 207 #    define setlocale_debug_string_i(index, locale, result)                 \
 208             my_setlocale_debug_string_i(index, locale, result, __LINE__)
 209 #    define setlocale_debug_string_c(category, locale, result)              \
 210                 setlocale_debug_string_i(category##_INDEX_, locale, result)
 211 #    define setlocale_debug_string_r(category, locale, result)              \
 212              setlocale_debug_string_i(get_category_index(category, locale), \
 213                                       locale, result)
 214 #  endif
 215
 216 #  define toggle_locale_i(index, locale)                                    \
 217                  S_toggle_locale_i(aTHX_ index, locale, __LINE__)
 218 #  define toggle_locale_c(cat, locale)  toggle_locale_i(cat##_INDEX_, locale)
 219 #  define restore_toggled_locale_i(index, locale)                           \
 220                  S_restore_toggled_locale_i(aTHX_ index, locale, __LINE__)
 221 #  define restore_toggled_locale_c(cat, locale)                             \
 222                              restore_toggled_locale_i(cat##_INDEX_, locale)
 223
 224 /* Two parallel arrays indexed by our mapping of category numbers into small
 225  * non-negative indexes; first the locale categories Perl uses on this system,
 226  * used to do the inverse mapping.  The second array is their names.  These
 227  * arrays are in mostly arbitrary order. */
 228
 229 STATIC const int categories[] = {
 230
 231 #    ifdef USE_LOCALE_CTYPE
 232                              LC_CTYPE,
 233 #    endif
 234 #  ifdef USE_LOCALE_NUMERIC
 235                              LC_NUMERIC,
 236 #  endif
 237 #    ifdef USE_LOCALE_COLLATE
 238                              LC_COLLATE,
 239 #    endif
 240 #    ifdef USE_LOCALE_TIME
 241                              LC_TIME,
 242 #    endif
 243 #    ifdef USE_LOCALE_MESSAGES
 244                              LC_MESSAGES,
 245 #    endif
 246 #    ifdef USE_LOCALE_MONETARY
 247                              LC_MONETARY,
 248 #    endif
 249 #    ifdef USE_LOCALE_ADDRESS
 250                              LC_ADDRESS,
 251 #    endif
 252 #    ifdef USE_LOCALE_IDENTIFICATION
 253                              LC_IDENTIFICATION,
 254 #    endif
 255 #    ifdef USE_LOCALE_MEASUREMENT
 256                              LC_MEASUREMENT,
 257 #    endif
 258 #    ifdef USE_LOCALE_PAPER
 259                              LC_PAPER,
 260 #    endif
 261 #    ifdef USE_LOCALE_TELEPHONE
 262                              LC_TELEPHONE,
 263 #    endif
 264 #    ifdef USE_LOCALE_SYNTAX
 265                              LC_SYNTAX,
 266 #    endif
 267 #    ifdef USE_LOCALE_TOD
 268                              LC_TOD,
 269 #    endif
 270 #    ifdef LC_ALL
 271                              LC_ALL,
 272 #    endif
 273
 274    /* Placeholder as a precaution if code fails to check the return of
 275     * get_category_index(), which returns this element to indicate an error */
 276                             -1
 277 };
 278
 279 /* The top-most real element is LC_ALL */
 280
 281 STATIC const char * const category_names[] = {
 282
 283 #    ifdef USE_LOCALE_CTYPE
 284                                  "LC_CTYPE",
 285 #    endif
 286 #  ifdef USE_LOCALE_NUMERIC
 287                                  "LC_NUMERIC",
 288 #  endif
 289 #    ifdef USE_LOCALE_COLLATE
 290                                  "LC_COLLATE",
 291 #    endif
 292 #    ifdef USE_LOCALE_TIME
 293                                  "LC_TIME",
 294 #    endif
 295 #    ifdef USE_LOCALE_MESSAGES
 296                                  "LC_MESSAGES",
 297 #    endif
 298 #    ifdef USE_LOCALE_MONETARY
 299                                  "LC_MONETARY",
 300 #    endif
 301 #    ifdef USE_LOCALE_ADDRESS
 302                                  "LC_ADDRESS",
 303 #    endif
 304 #    ifdef USE_LOCALE_IDENTIFICATION
 305                                  "LC_IDENTIFICATION",
 306 #    endif
 307 #    ifdef USE_LOCALE_MEASUREMENT
 308                                  "LC_MEASUREMENT",
 309 #    endif
 310 #    ifdef USE_LOCALE_PAPER
 311                                  "LC_PAPER",
 312 #    endif
 313 #    ifdef USE_LOCALE_TELEPHONE
 314                                  "LC_TELEPHONE",
 315 #    endif
 316 #    ifdef USE_LOCALE_SYNTAX
 317                                  "LC_SYNTAX",
 318 #    endif
 319 #    ifdef USE_LOCALE_TOD
 320                                  "LC_TOD",
 321 #    endif
 322 #    ifdef LC_ALL
 323                                  "LC_ALL",
 324 #    endif
 325
 326    /* Placeholder as a precaution if code fails to check the return of
 327     * get_category_index(), which returns this element to indicate an error */
 328                                  NULL
 329 };
 330
 331 /* A few categories require additional setup when they are changed.  This table
 332  * points to the functions that do that setup */
 333 STATIC void (*update_functions[]) (pTHX_ const char *) = {
 334 #  ifdef USE_LOCALE_CTYPE
 335                                 S_new_ctype,
 336 #  endif
 337 #  ifdef USE_LOCALE_NUMERIC
 338                                 S_new_numeric,
 339 #  endif
 340 #  ifdef USE_LOCALE_COLLATE
 341                                 S_new_collate,
 342 #  endif
 343 #  ifdef USE_LOCALE_TIME
 344                                 NULL,
 345 #  endif
 346 #  ifdef USE_LOCALE_MESSAGES
 347                                 NULL,
 348 #  endif
 349 #  ifdef USE_LOCALE_MONETARY
 350                                 NULL,
 351 #  endif
 352 #  ifdef USE_LOCALE_ADDRESS
 353                                 NULL,
 354 #  endif
 355 #  ifdef USE_LOCALE_IDENTIFICATION
 356                                 NULL,
 357 #  endif
 358 #  ifdef USE_LOCALE_MEASUREMENT
 359                                 NULL,
 360 #  endif
 361 #  ifdef USE_LOCALE_PAPER
 362                                 NULL,
 363 #  endif
 364 #  ifdef USE_LOCALE_TELEPHONE
 365                                 NULL,
 366 #  endif
 367 #  ifdef USE_LOCALE_SYNTAX
 368                                 NULL,
 369 #  endif
 370 #  ifdef USE_LOCALE_TOD
 371                                 NULL,
 372 #  endif
 373     /* No harm done to have this even without an LC_ALL */
 374                                 S_new_LC_ALL,
 375
 376    /* Placeholder as a precaution if code fails to check the return of
 377     * get_category_index(), which returns this element to indicate an error */
 378                                 NULL
 379 };
 380
 381 #  ifdef LC_ALL
 382
 383     /* On systems with LC_ALL, it is kept in the highest index position.  (-2
 384      * to account for the final unused placeholder element.) */
 385 #    define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
 386 #  else
 387
 388     /* On systems without LC_ALL, we pretend it is there, one beyond the real
 389      * top element, hence in the unused placeholder element. */
 390 #    define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
 391 #  endif
 392
 393 /* Pretending there is an LC_ALL element just above allows us to avoid most
 394  * special cases.  Most loops through these arrays in the code below are
 395  * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'.  They will work
 396  * on either type of system.  But the code must be written to not access the
 397  * element at 'LC_ALL_INDEX_' except on platforms that have it.  This can be
 398  * checked for at compile time by using the #define LC_ALL_INDEX_ which is only
 399  * defined if we do have LC_ALL. */
 400
 401 STATIC unsigned int
 402 S_get_category_index(const int category, const char * locale)
 403 {
 404     /* Given a category, return the equivalent internal index we generally use
 405      * instead.
 406      *
 407      * 'locale' is for use in any generated diagnostics, and may be NULL
 408      *
 409      * Some sort of hash could be used instead of this loop, but the number of
 410      * elements is so far at most 12 */
 411
 412     unsigned int i;
 413     const char * conditional_warn_text = "; can't set it to ";
 414
 415     PERL_ARGS_ASSERT_GET_CATEGORY_INDEX;
 416
 417 #  ifdef LC_ALL
 418     for (i = 0; i <=         LC_ALL_INDEX_; i++)
 419 #  else
 420     for (i = 0; i <  NOMINAL_LC_ALL_INDEX;  i++)
 421 #  endif
 422     {
 423         if (category == categories[i]) {
 424             dTHX_DEBUGGING;
 425             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
 426                                    "index of category %d (%s) is %d\n",
 427                                    category, category_names[i], i));
 428             return i;
 429         }
 430     }
 431
 432     /* Here, we don't know about this category, so can't handle it. */
 433
 434     if (! locale) {
 435         locale = "";
 436         conditional_warn_text = "";
 437     }
 438
 439     /* diag_listed_as: Unknown locale category %d; can't set it to %s */
 440     Perl_warner_nocontext(packWARN(WARN_LOCALE),
 441                           "Unknown locale category %d%s%s",
 442                           category, conditional_warn_text, locale);
 443
 444 #  ifdef EINVAL
 445
 446     SETERRNO(EINVAL, LIB_INVARG);
 447
 448 #  endif
 449
 450     /* Return an out-of-bounds value */
 451     return NOMINAL_LC_ALL_INDEX + 1;
 452 }
 453
 454 #endif /* ifdef USE_LOCALE */
 455
 456 void
 457 Perl_force_locale_unlock()
 458 {
 459
 460 #if defined(USE_LOCALE_THREADS)
 461
 462     dTHX;
 463
 464     /* If recursively locked, clear all at once */
 465     if (PL_locale_mutex_depth > 1) {
 466         PL_locale_mutex_depth = 1;
 467     }
 468
 469     if (PL_locale_mutex_depth > 0) {
 470         LOCALE_UNLOCK_;
 471     }
 472
 473 #endif
 474
 475 }
 476
 477 #ifdef USE_POSIX_2008_LOCALE
 478
 479 STATIC locale_t
 480 S_use_curlocale_scratch(pTHX)
 481 {
 482     /* This function is used to hide from the caller the case where the current
 483      * locale_t object in POSIX 2008 is the global one, which is illegal in
 484      * many of the P2008 API calls.  This checks for that and, if necessary
 485      * creates a proper P2008 object.  Any prior object is deleted, as is any
 486      * remaining object during global destruction. */
 487
 488     locale_t cur = uselocale((locale_t) 0);
 489
 490     if (cur != LC_GLOBAL_LOCALE) {
 491         return cur;
 492     }
 493
 494     if (PL_scratch_locale_obj) {
 495         freelocale(PL_scratch_locale_obj);
 496     }
 497
 498     PL_scratch_locale_obj = duplocale(LC_GLOBAL_LOCALE);
 499     return PL_scratch_locale_obj;
 500 }
 501
 502 #endif
 503
 504 void
 505 Perl_locale_panic(const char * msg,
 506                   const char * file_name,
 507                   const line_t line,
 508                   const int errnum)
 509 {
 510     dTHX;
 511
 512     PERL_ARGS_ASSERT_LOCALE_PANIC;
 513
 514     force_locale_unlock();
 515
 516 #ifdef USE_C_BACKTRACE
 517     dump_c_backtrace(Perl_debug_log, 20, 1);
 518 #endif
 519
 520     /* diag_listed_as: panic: %s */
 521     Perl_croak(aTHX_ "%s: %d: panic: %s; errno=%d\n",
 522                      file_name, line, msg, errnum);
 523 }
 524
 525 #define setlocale_failure_panic_c(                                          \
 526                         cat, current, failed, caller_0_line, caller_1_line) \
 527         setlocale_failure_panic_i(cat##_INDEX_, current, failed,            \
 528                         caller_0_line, caller_1_line)
 529
 530 /* posix_setlocale() presents a consistent POSIX-compliant interface to
 531  * setlocale().   Windows requres a customized base-level setlocale() */
 532 #ifdef WIN32
 533 #  define posix_setlocale(cat, locale) win32_setlocale(cat, locale)
 534 #else
 535 #  define posix_setlocale(cat, locale) ((const char *) setlocale(cat, locale))
 536 #endif
 537
 538 /* The next layer up is to catch vagaries and bugs in the libc setlocale return
 539  * value */
 540 #ifdef stdize_locale
 541 #  define stdized_setlocale(cat, locale)                                       \
 542      stdize_locale(cat, posix_setlocale(cat, locale),                          \
 543                    &PL_stdize_locale_buf, &PL_stdize_locale_bufsize, __LINE__)
 544 #else
 545 #  define stdized_setlocale(cat, locale)  posix_setlocale(cat, locale)
 546 #endif
 547
 548 /* The next many lines form a layer above the close-to-the-metal 'posix'
 549  * and 'stdized' macros.  They are used to present a uniform API to the rest of
 550  * the code in this file in spite of the disparate underlying implementations.
 551  * */
 552
 553 #if    (! defined(USE_LOCALE_THREADS) && ! defined(USE_POSIX_2008_LOCALE))    \
 554     || (  defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE))
 555
 556 /* For non-threaded perls (which we are not to use the POSIX 2008 API on), or a
 557  * thread-safe Windows one in which threading is invisible to us, the added
 558  * layer just expands to the base-level functions.  See the introductory
 559  * comments in this file for the meaning of the suffixes '_c', '_r', '_i'. */
 560
 561 #  define setlocale_r(cat, locale)        stdized_setlocale(cat, locale)
 562 #  define setlocale_i(i, locale)      setlocale_r(categories[i], locale)
 563 #  define setlocale_c(cat, locale)              setlocale_r(cat, locale)
 564
 565 #  define void_setlocale_i(i, locale)                                       \
 566     STMT_START {                                                            \
 567         if (! posix_setlocale(categories[i], locale)) {                     \
 568             setlocale_failure_panic_i(i, NULL, locale, __LINE__, 0);        \
 569             NOT_REACHED; /* NOTREACHED */                                   \
 570         }                                                                   \
 571     } STMT_END
 572 #  define void_setlocale_c(cat, locale)                                     \
 573                                   void_setlocale_i(cat##_INDEX_, locale)
 574 #  define void_setlocale_r(cat, locale)                                     \
 575                void_setlocale_i(get_category_index(cat, locale), locale)
 576
 577 #  define bool_setlocale_r(cat, locale) cBOOL(posix_setlocale(cat, locale))
 578 #  define bool_setlocale_i(i, locale)                                       \
 579                                  bool_setlocale_c(categories[i], locale)
 580 #  define bool_setlocale_c(cat, locale)    bool_setlocale_r(cat, locale)
 581
 582 /* All the querylocale...() forms return a mortalized copy.  If you need
 583  * something stable across calls, you need to savepv() the result yourself */
 584
 585 #  define querylocale_r(cat)        mortalized_pv_copy(setlocale_r(cat, NULL))
 586 #  define querylocale_c(cat)        querylocale_r(cat)
 587 #  define querylocale_i(i)          querylocale_c(categories[i])
 588
 589 #elif   defined(USE_LOCALE_THREADS)                 \
 590    && ! defined(USE_THREAD_SAFE_LOCALE)
 591
 592    /* Here, there are threads, and there is no support for thread-safe
 593     * operation.  This is a dangerous situation, which perl is documented as
 594     * not supporting, but it arises in practice.  We can do a modicum of
 595     * automatic mitigation by making sure there is a per-thread return from
 596     * setlocale(), and that a mutex protects it from races */
 597 STATIC const char *
 598 S_less_dicey_setlocale_r(pTHX_ const int category, const char * locale)
 599 {
 600     const char * retval;
 601
 602     PERL_ARGS_ASSERT_LESS_DICEY_SETLOCALE_R;
 603
 604     POSIX_SETLOCALE_LOCK;
 605
 606     retval = stdized_setlocale(category, locale);
 607
 608     /* We reuse PL_stdize_locale_buf as it doesn't conflict, but the call may
 609      * already have used it, in which case we don't have to do anything further
 610      * */
 611     retval = save_to_buffer(retval,
 612                             &PL_stdize_locale_buf, &PL_stdize_locale_bufsize);
 613
 614     POSIX_SETLOCALE_UNLOCK;
 615
 616     return retval;
 617 }
 618
 619 #  define setlocale_r(cat, locale)  less_dicey_setlocale_r(cat, locale)
 620 #  define setlocale_c(cat, locale)             setlocale_r(cat, locale)
 621 #  define setlocale_i(i, locale)     setlocale_r(categories[i], locale)
 622
 623 #  define querylocale_r(cat)  mortalized_pv_copy(setlocale_r(cat, NULL))
 624 #  define querylocale_c(cat)                   querylocale_r(cat)
 625 #  define querylocale_i(i)                     querylocale_r(categories[i])
 626
 627 STATIC void
 628 S_less_dicey_void_setlocale_i(pTHX_ const unsigned cat_index,
 629                                     const char * locale,
 630                                     const line_t line)
 631 {
 632     PERL_ARGS_ASSERT_LESS_DICEY_VOID_SETLOCALE_I;
 633
 634     POSIX_SETLOCALE_LOCK;
 635     if (! posix_setlocale(categories[cat_index], locale)) {
 636         POSIX_SETLOCALE_UNLOCK;
 637         setlocale_failure_panic_i(cat_index, NULL, locale, __LINE__, line);
 638     }
 639     POSIX_SETLOCALE_UNLOCK;
 640 }
 641
 642 #  define void_setlocale_i(i, locale)                                       \
 643                           less_dicey_void_setlocale_i(i, locale, __LINE__)
 644 #  define void_setlocale_c(cat, locale)                                     \
 645                           void_setlocale_i(cat##_INDEX_, locale)
 646 #  define void_setlocale_r(cat, locale)                                     \
 647        void_setlocale_i(get_category_index(cat, locale), locale)
 648
 649 #  if 0     /* Not currently used */
 650
 651 STATIC bool
 652 S_less_dicey_bool_setlocale_r(pTHX_ const int cat, const char * locale)
 653 {
 654     bool retval;
 655
 656     PERL_ARGS_ASSERT_LESS_DICEY_BOOL_SETLOCALE_R;
 657
 658     POSIX_SETLOCALE_LOCK;
 659     retval = cBOOL(posix_setlocale(cat, locale));
 660     POSIX_SETLOCALE_UNLOCK;
 661
 662     return retval;
 663 }
 664
 665 #  endif
 666 #  define bool_setlocale_r(cat, locale)                                 \
 667                                less_dicey_bool_setlocale_r(cat, locale)
 668 #  define bool_setlocale_i(i, locale)                                   \
 669                                 bool_setlocale_r(categories[i], locale)
 670 #  define bool_setlocale_c(cat, locale) bool_setlocale_r(cat, locale)
 671 #else
 672
 673 /* Here, there is a completely different API to get thread-safe locales.  We
 674  * emulate the setlocale() API with our own function(s).  setlocale categories,
 675  * like LC_NUMERIC, are not valid here for the POSIX 2008 API.  Instead, there
 676  * are equivalents, like LC_NUMERIC_MASK, which we use instead, converting to
 677  * by using get_category_index() followed by table lookup. */
 678
 679 #  define emulate_setlocale_c(cat, locale, recalc_LC_ALL, line)             \
 680            emulate_setlocale_i(cat##_INDEX_, locale, recalc_LC_ALL, line)
 681
 682      /* A wrapper for the macros below. */
 683 #  define common_emulate_setlocale(i, locale)                               \
 684                  emulate_setlocale_i(i, locale, YES_RECALC_LC_ALL, __LINE__)
 685
 686 #  define setlocale_i(i, locale)                                            \
 687      save_to_buffer(common_emulate_setlocale(i, locale),                    \
 688                                              &PL_stdize_locale_buf,         \
 689                                              &PL_stdize_locale_bufsize)
 690 #  define setlocale_c(cat, locale)     setlocale_i(cat##_INDEX_, locale)
 691 #  define setlocale_r(cat, locale)                                          \
 692                     setlocale_i(get_category_index(cat, locale), locale)
 693
 694 #  define void_setlocale_i(i, locale)                                       \
 695                              ((void) common_emulate_setlocale(i, locale))
 696 #  define void_setlocale_c(cat, locale)                                     \
 697                                   void_setlocale_i(cat##_INDEX_, locale)
 698 #  define void_setlocale_r(cat, locale) ((void) setlocale_r(cat, locale))
 699
 700 #  define bool_setlocale_i(i, locale)                                       \
 701                                cBOOL(common_emulate_setlocale(i, locale))
 702 #  define bool_setlocale_c(cat, locale)                                     \
 703                                   bool_setlocale_i(cat##_INDEX_, locale)
 704 #  define bool_setlocale_r(cat, locale)   cBOOL(setlocale_r(cat, locale))
 705
 706 #  define querylocale_i(i)      mortalized_pv_copy(my_querylocale_i(i))
 707 #  define querylocale_c(cat)    querylocale_i(cat##_INDEX_)
 708 #  define querylocale_r(cat)    querylocale_i(get_category_index(cat,NULL))
 709
 710 #  ifdef USE_QUERYLOCALE
 711 #    define isSINGLE_BIT_SET(mask) isPOWER_OF_2(mask)
 712
 713      /* This code used to think querylocale() was valid on LC_ALL.  Make sure
 714       * all instances of that have been removed */
 715 #    define QUERYLOCALE_ASSERT(index)                                       \
 716                         __ASSERT_(isSINGLE_BIT_SET(category_masks[index]))
 717 #    if ! defined(HAS_QUERYLOCALE) && defined(_NL_LOCALE_NAME)
 718 #      define querylocale_l(index, locale_obj)                              \
 719             (QUERYLOCALE_ASSERT(index)                                      \
 720              mortalized_pv_copy(nl_langinfo_l(                              \
 721                          _NL_LOCALE_NAME(categories[index]), locale_obj)))
 722 #    else
 723 #      define querylocale_l(index, locale_obj)                              \
 724         (QUERYLOCALE_ASSERT(index)                                          \
 725          mortalized_pv_copy(querylocale(category_masks[index], locale_obj)))
 726 #    endif
 727 #  endif
 728 #  if defined(__GLIBC__) && defined(USE_LOCALE_MESSAGES)
 729 #    define HAS_GLIBC_LC_MESSAGES_BUG
 730 #    include <libintl.h>
 731 #  endif
 732
 733 /* A fourth array, parallel to the ones above to map from category to its
 734  * equivalent mask */
 735 STATIC const int category_masks[] = {
 736 #  ifdef USE_LOCALE_CTYPE
 737                                 LC_CTYPE_MASK,
 738 #  endif
 739 #  ifdef USE_LOCALE_NUMERIC
 740                                 LC_NUMERIC_MASK,
 741 #  endif
 742 #  ifdef USE_LOCALE_COLLATE
 743                                 LC_COLLATE_MASK,
 744 #  endif
 745 #  ifdef USE_LOCALE_TIME
 746                                 LC_TIME_MASK,
 747 #  endif
 748 #  ifdef USE_LOCALE_MESSAGES
 749                                 LC_MESSAGES_MASK,
 750 #  endif
 751 #  ifdef USE_LOCALE_MONETARY
 752                                 LC_MONETARY_MASK,
 753 #  endif
 754 #  ifdef USE_LOCALE_ADDRESS
 755                                 LC_ADDRESS_MASK,
 756 #  endif
 757 #  ifdef USE_LOCALE_IDENTIFICATION
 758                                 LC_IDENTIFICATION_MASK,
 759 #  endif
 760 #  ifdef USE_LOCALE_MEASUREMENT
 761                                 LC_MEASUREMENT_MASK,
 762 #  endif
 763 #  ifdef USE_LOCALE_PAPER
 764                                 LC_PAPER_MASK,
 765 #  endif
 766 #  ifdef USE_LOCALE_TELEPHONE
 767                                 LC_TELEPHONE_MASK,
 768 #  endif
 769 #  ifdef USE_LOCALE_SYNTAX
 770                                 LC_SYNTAX_MASK,
 771 #  endif
 772 #  ifdef USE_LOCALE_TOD
 773                                 LC_TOD_MASK,
 774 #  endif
 775                                 /* LC_ALL can't be turned off by a Configure
 776                                  * option, and in Posix 2008, should always be
 777                                  * here, so compile it in unconditionally.
 778                                  * This could catch some glitches at compile
 779                                  * time */
 780                                 LC_ALL_MASK,
 781
 782    /* Placeholder as a precaution if code fails to check the return of
 783     * get_category_index(), which returns this element to indicate an error */
 784                                 0
 785 };
 786
 787 #  define my_querylocale_c(cat) my_querylocale_i(cat##_INDEX_)
 788
 789 STATIC const char *
 790 S_my_querylocale_i(pTHX_ const unsigned int index)
 791 {
 792     /* This function returns the name of the locale category given by the input
 793      * index into our parallel tables of them.
 794      *
 795      * POSIX 2008, for some sick reason, chose not to provide a method to find
 796      * the category name of a locale, discarding a basic linguistic tenet that
 797      * for any object, people will create a name for it.  Some vendors have
 798      * created a querylocale() function to do just that.  This function is a
 799      * lot simpler to implement on systems that have this.  Otherwise, we have
 800      * to keep track of what the locale has been set to, so that we can return
 801      * its name so as to emulate setlocale().  It's also possible for C code in
 802      * some library to change the locale without us knowing it, though as of
 803      * September 2017, there are no occurrences in CPAN of uselocale().  Some
 804      * libraries do use setlocale(), but that changes the global locale, and
 805      * threads using per-thread locales will just ignore those changes. */
 806
 807     int category;
 808     const locale_t cur_obj = uselocale((locale_t) 0);
 809     const char * retval;
 810
 811     PERL_ARGS_ASSERT_MY_QUERYLOCALE_I;
 812     assert(index <= NOMINAL_LC_ALL_INDEX);
 813
 814     category = categories[index];
 815
 816     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "my_querylocale_i(%s) on %p\n",
 817                                            category_names[index], cur_obj));
 818     if (cur_obj == LC_GLOBAL_LOCALE) {
 819         POSIX_SETLOCALE_LOCK;
 820         retval = posix_setlocale(category, NULL);
 821         POSIX_SETLOCALE_UNLOCK;
 822     }
 823     else {
 824
 825 #  ifdef USE_QUERYLOCALE
 826
 827         /* We don't currently keep records when there is querylocale(), so have
 828          * to get it anew each time */
 829         retval = (index == LC_ALL_INDEX_)
 830                  ? calculate_LC_ALL(cur_obj)
 831                  : querylocale_l(index, cur_obj);
 832
 833 #  else
 834
 835         /* But we do have up-to-date values when we keep our own records
 836          * (except some times in initialization, where we get the value from
 837          * the system. */
 838         if (PL_curlocales[index] == NULL) {
 839             retval = stdized_setlocale(category, NULL);
 840             PL_curlocales[index] = savepv(retval);
 841         }
 842         else {
 843             retval = PL_curlocales[index];
 844         }
 845
 846 #  endif
 847
 848             }
 849
 850                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
 851                            "my_querylocale_i(%s) returning '%s'\n",
 852                            category_names[index], retval));
 853     assert(strNE(retval, ""));
 854     return retval;
 855 }
 856
 857 #  ifdef USE_PL_CURLOCALES
 858
 859 STATIC const char *
 860 S_update_PL_curlocales_i(pTHX_
 861                          const unsigned int index,
 862                          const char * new_locale,
 863                          recalc_lc_all_t recalc_LC_ALL)
 864 {
 865     /* This is a helper function for emulate_setlocale_i(), mostly used to
 866      * make that function easier to read. */
 867
 868     PERL_ARGS_ASSERT_UPDATE_PL_CURLOCALES_I;
 869     assert(index <= NOMINAL_LC_ALL_INDEX);
 870
 871     if (index == LC_ALL_INDEX_) {
 872         unsigned int i;
 873
 874         /* For LC_ALL, we change all individual categories to correspond */
 875                          /* PL_curlocales is a parallel array, so has same
 876                           * length as 'categories' */
 877         for (i = 0; i < LC_ALL_INDEX_; i++) {
 878             Safefree(PL_curlocales[i]);
 879             PL_curlocales[i] = savepv(new_locale);
 880         }
 881
 882         recalc_LC_ALL = YES_RECALC_LC_ALL;
 883     }
 884     else {
 885
 886         /* Update the single category's record */
 887         Safefree(PL_curlocales[index]);
 888         PL_curlocales[index] = savepv(new_locale);
 889
 890         if (recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION) {
 891             recalc_LC_ALL = (index == NOMINAL_LC_ALL_INDEX - 1)
 892                             ? YES_RECALC_LC_ALL
 893                             : DONT_RECALC_LC_ALL;
 894         }
 895     }
 896
 897     if (recalc_LC_ALL == YES_RECALC_LC_ALL) {
 898         Safefree(PL_curlocales[LC_ALL_INDEX_]);
 899         PL_curlocales[LC_ALL_INDEX_] =
 900                                     savepv(calculate_LC_ALL(PL_curlocales));
 901     }
 902
 903     return PL_curlocales[index];
 904 }
 905
 906 #  endif  /* Need PL_curlocales[] */
 907
 908 STATIC const char *
 909 S_setlocale_from_aggregate_LC_ALL(pTHX_ const char * locale, const line_t line)
 910 {
 911     /* This function parses the value of the LC_ALL locale, assuming glibc
 912      * syntax, and sets each individual category on the system to the proper
 913      * value.
 914      *
 915      * This is likely to only ever be called from one place, so exists to make
 916      * the calling function easier to read by moving this ancillary code out of
 917      * the main line.
 918      *
 919      * The locale for each category is independent of the other categories.
 920      * Often, they are all the same, but certainly not always.  Perl, in fact,
 921      * usually keeps LC_NUMERIC in the C locale, regardless of the underlying
 922      * locale.  LC_ALL has to be able to represent the case of when there are
 923      * varying locales.  Platforms have differing ways of representing this.
 924      * Because of this, the code in this file goes to lengths to avoid the
 925      * issue, generally looping over the component categories instead of
 926      * referring to them in the aggregate, wherever possible.  However, there
 927      * are cases where we have to parse our own constructed aggregates, which use
 928      * the glibc syntax. */
 929
 930     const char * locale_on_entry = querylocale_c(LC_ALL);
 931
 932     PERL_ARGS_ASSERT_SETLOCALE_FROM_AGGREGATE_LC_ALL;
 933
 934     /* If the string that gives what to set doesn't include all categories,
 935      * the omitted ones get set to "C".  To get this behavior, first set
 936      * all the individual categories to "C", and override the furnished
 937      * ones below.  FALSE => No need to recalculate LC_ALL, as this is a
 938      * temporary state */
 939     if (! emulate_setlocale_c(LC_ALL, "C", DONT_RECALC_LC_ALL, line)) {
 940         setlocale_failure_panic_c(LC_ALL, locale_on_entry,
 941                                   "C", __LINE__, line);
 942         NOT_REACHED; /* NOTREACHED */
 943     }
 944
 945     const char * s = locale;
 946     const char * e = locale + strlen(locale);
 947     while (s < e) {
 948         const char * p = s;
 949
 950         /* Parse through the category */
 951         while (isWORDCHAR(*p)) {
 952             p++;
 953         }
 954
 955         const char * category_end = p;
 956
 957         if (*p++ != '=') {
 958             locale_panic_(Perl_form(aTHX_
 959                           "Unexpected character in locale category name '%s"
 960                           "<-- HERE",
 961                           get_displayable_string(s, p - 1, 0)));
 962         }
 963
 964         /* Parse through the locale name */
 965         const char * name_start = p;
 966         while (p < e && *p != ';') {
 967             if (! isGRAPH(*p)) {
 968                 locale_panic_(Perl_form(aTHX_
 969                               "Unexpected character in locale name '%02X", *p));
 970             }
 971             p++;
 972         }
 973
 974         const char * name_end = p;
 975
 976         /* Space past the semi-colon */
 977         if (p < e) {
 978             p++;
 979         }
 980
 981         /* Find the index of the category name in our lists */
 982         for (PERL_UINT_FAST8_T i = 0; i < LC_ALL_INDEX_; i++) {
 983
 984             /* Keep going if this index doesn't point to the category being
 985              * parsed.  The strnNE() avoids a Perl_form(), but would fail if
 986              * ever a category name could be a substring of another one, e.g.,
 987              * if there were a "LC_TIME_DATE" */
 988             if strnNE(s, category_names[i], category_end - s) {
 989                 continue;
 990             }
 991
 992             /* Here i points to the category being parsed.  Now isolate the
 993              * locale it is being changed to */
 994             const char * individ_locale = Perl_form(aTHX_ "%.*s",
 995                                 (int) (name_end - name_start), name_start);
 996
 997             /* And do the change.  FALSE => Don't recalculate LC_ALL; we'll do
 998              * it ourselves after the loop */
 999             if (! emulate_setlocale_i(i, individ_locale,
1000                                       DONT_RECALC_LC_ALL, line))
1001             {
1002
1003                 /* But if we have to back out, do fix up LC_ALL */
1004                 if (! emulate_setlocale_c(LC_ALL, locale_on_entry,
1005                                           YES_RECALC_LC_ALL, line))
1006                 {
1007                     setlocale_failure_panic_i(i, individ_locale,
1008                                               locale, __LINE__, line);
1009                     NOT_REACHED; /* NOTREACHED */
1010                 }
1011
1012                 /* Reverting to the entry value succeeded, but the operation
1013                  * failed to go to the requested locale. */
1014                 return NULL;
1015             }
1016
1017             /* Found and handled the desired category.  Quit the inner loop to
1018              * try the next category */
1019             break;
1020         }
1021
1022         /* Finished with this category; iterate to the next one in the input */
1023         s = p;
1024     }
1025
1026 #    ifdef USE_PL_CURLOCALES
1027
1028     /* Here we have set all the individual categories.  Update the LC_ALL entry
1029      * as well.  We can't just use the input 'locale' as the value may omit
1030      * categories whose locale is 'C'.  khw thinks it's better to store a
1031      * complete LC_ALL.  So calculate it. */
1032     const char * retval = savepv(calculate_LC_ALL(PL_curlocales));
1033     Safefree(PL_curlocales[LC_ALL_INDEX_]);
1034     PL_curlocales[LC_ALL_INDEX_] = retval;
1035
1036 #    else
1037
1038     const char * retval = querylocale_c(LC_ALL);
1039
1040 #    endif
1041
1042     return retval;
1043 }
1044
1045 #  ifndef USE_QUERYLOCALE
1046
1047 STATIC const char *
1048 S_find_locale_from_environment(pTHX_ const unsigned int index)
1049 {
1050     /* On systems without querylocale(), it is problematic getting the results
1051      * of the POSIX 2008 equivalent of setlocale(category, "") (which gets the
1052      * locale from the environment).
1053      *
1054      * To ensure that we know exactly what those values are, we do the setting
1055      * ourselves, using the documented algorithm (assuming the documentation is
1056      * correct) rather than use "" as the locale.  This will lead to results
1057      * that differ from native behavior if the native behavior differs from the
1058      * standard documented value, but khw believes it is better to know what's
1059      * going on, even if different from native, than to just guess.
1060      *
1061      * Another option would be, in a critical section, to save the global
1062      * locale's current value, and do a straight setlocale(LC_ALL, "").  That
1063      * would return our desired values, destroying the global locale's, which
1064      * we would then restore.  But that could cause races with any other thread
1065      * that is using the global locale and isn't using the mutex.  And, the
1066      * only reason someone would have done that is because they are calling a
1067      * library function, like in gtk, that calls setlocale(), and which can't
1068      * be changed to use the mutex.  That wouldn't be a problem if this were to
1069      * be done before any threads had switched, say during perl construction
1070      * time.  But this code would still be needed for the general case. */
1071
1072     const char * default_name;
1073     unsigned int i;
1074     const char * locale_names[LC_ALL_INDEX_];
1075
1076     /* We rely on PerlEnv_getenv() returning a mortalized copy */
1077     const char * const lc_all = PerlEnv_getenv("LC_ALL");
1078
1079     /* Use any "LC_ALL" environment variable, as it overrides everything
1080      * else. */
1081     if (lc_all && strNE(lc_all, "")) {
1082         return lc_all;
1083     }
1084
1085     /* Otherwise, we need to dig deeper.  Unless overridden, the default is
1086      * the LANG environment variable; "C" if it doesn't exist. */
1087     default_name = PerlEnv_getenv("LANG");
1088     if (! default_name || strEQ(default_name, "")) {
1089         default_name = "C";
1090     }
1091
1092     /* If setting an individual category, use its corresponding value found in
1093      * the environment, if any; otherwise use the default we already
1094      * calculated. */
1095     if (index != LC_ALL_INDEX_) {
1096         const char * const new_value = PerlEnv_getenv(category_names[index]);
1097
1098         return (new_value && strNE(new_value, ""))
1099                 ? new_value
1100                 : default_name;
1101     }
1102
1103     /* Here, we are getting LC_ALL.  Any categories that don't have a
1104      * corresponding environment variable set should be set to 'default_name'
1105      *
1106      * Simply find the values for all categories, and call the function to
1107      * compute LC_ALL. */
1108     for (i = 0; i < LC_ALL_INDEX_; i++) {
1109         const char * const env_override = PerlEnv_getenv(category_names[i]);
1110
1111         locale_names[i] = (env_override && strNE(env_override, ""))
1112                           ? env_override
1113                           : default_name;
1114
1115         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1116                  "find_locale_from_environment i=%d, name=%s, locale=%s\n",
1117                  i, category_names[i], locale_names[i]));
1118     }
1119
1120     return calculate_LC_ALL(locale_names);
1121 }
1122
1123 #  endif
1124
1125 STATIC const char *
1126 S_emulate_setlocale_i(pTHX_
1127
1128         /* Our internal index of the 'category' setlocale is
1129            called with */
1130         const unsigned int index,
1131
1132         const char * new_locale, /* The locale to set the category to */
1133         const recalc_lc_all_t recalc_LC_ALL,  /* Explained below */
1134         const line_t line     /* Called from this line number */
1135        )
1136 {
1137     PERL_ARGS_ASSERT_EMULATE_SETLOCALE_I;
1138     assert(index <= NOMINAL_LC_ALL_INDEX);
1139
1140     /* Otherwise could have undefined behavior, as the return of this function
1141      * may be copied to this buffer, which this function could change in the
1142      * middle of its work */
1143     assert(new_locale != PL_stdize_locale_buf);
1144
1145     /* This function effectively performs a setlocale() on just the current
1146      * thread; thus it is thread-safe.  It does this by using the POSIX 2008
1147      * locale functions to emulate the behavior of setlocale().  Similar to
1148      * regular setlocale(), the return from this function points to memory that
1149      * can be overwritten by other system calls, so needs to be copied
1150      * immediately if you need to retain it.  The difference here is that
1151      * system calls besides another setlocale() can overwrite it.
1152      *
1153      * By doing this, most locale-sensitive functions become thread-safe.  The
1154      * exceptions are mostly those that return a pointer to static memory.
1155      *
1156      * This function may be called in a tight loop that iterates over all
1157      * categories.  Because LC_ALL is not a "real" category, but merely the sum
1158      * of all the other ones, such loops don't include LC_ALL.  On systems that
1159      * have querylocale() or similar, the current LC_ALL value is immediately
1160      * retrievable; on systems lacking that feature, we have to keep track of
1161      * LC_ALL ourselves.  We could do that on each iteration, only to throw it
1162      * away on the next, but the calculation is more than a trivial amount of
1163      * work.  Instead, the 'recalc_LC_ALL' parameter is set to
1164      * RECALCULATE_LC_ALL_ON_FINAL_INTERATION to only do the calculation once.
1165      * This function calls itself recursively in such a loop.
1166      *
1167      * When not in such a loop, the parameter is set to the other enum values
1168      * DONT_RECALC_LC_ALL or YES_RECALC_LC_ALL. */
1169
1170     int mask = category_masks[index];
1171     const locale_t entry_obj = uselocale((locale_t) 0);
1172     const char * locale_on_entry = querylocale_i(index);
1173
1174     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1175              "emulate_setlocale_i input=%d (%s), mask=0x%x,"
1176              " new locale=\"%s\", current locale=\"%s\","
1177              "index=%d, object=%p\n",
1178              categories[index], category_names[index], mask,
1179              ((new_locale == NULL) ? "(nil)" : new_locale),
1180              locale_on_entry, index, entry_obj));
1181
1182     /* Return the already-calculated info if just querying what the existing
1183      * locale is */
1184     if (new_locale == NULL) {
1185         return locale_on_entry;
1186     }
1187
1188     /* Here, trying to change the locale, but it is a no-op if the new boss is
1189      * the same as the old boss.  Except this routine is called when converting
1190      * from the global locale, so in that case we will create a per-thread
1191      * locale below (with the current values).  Bitter experience also
1192      * indicates that newlocale() can free up the basis locale memory if we
1193      * call it with the new and old being the same. */
1194     if (   entry_obj != LC_GLOBAL_LOCALE
1195         && locale_on_entry
1196         && strEQ(new_locale, locale_on_entry))
1197     {
1198         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1199                  "(%" LINE_Tf "): emulate_setlocale_i"
1200                  " no-op to change to what it already was\n",
1201                  line));
1202
1203 #  ifdef USE_PL_CURLOCALES
1204
1205        /* On the final iteration of a loop that needs to recalculate LC_ALL, do
1206         * so.  If no iteration changed anything, LC_ALL also doesn't change,
1207         * but khw believes the complexity needed to keep track of that isn't
1208         * worth it. */
1209         if (UNLIKELY(   recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION
1210                      && index == NOMINAL_LC_ALL_INDEX - 1))
1211         {
1212             Safefree(PL_curlocales[LC_ALL_INDEX_]);
1213             PL_curlocales[LC_ALL_INDEX_] =
1214                                         savepv(calculate_LC_ALL(PL_curlocales));
1215         }
1216
1217 #  endif
1218
1219         return locale_on_entry;
1220     }
1221
1222 #  ifndef USE_QUERYLOCALE
1223
1224     /* Without a querylocale() mechanism, we have to figure out ourselves what
1225      * happens with setting a locale to "" */
1226     if (strEQ(new_locale, "")) {
1227         new_locale = find_locale_from_environment(index);
1228     }
1229
1230 #  endif
1231
1232     /* So far, it has worked that a semi-colon in the locale name means that
1233      * the category is LC_ALL and it subsumes categories which don't all have
1234      * the same locale.  This is the glibc syntax. */
1235     if (strchr(new_locale, ';')) {
1236         assert(index == LC_ALL_INDEX_);
1237         return setlocale_from_aggregate_LC_ALL(new_locale, line);
1238     }
1239
1240 #  ifdef HAS_GLIBC_LC_MESSAGES_BUG
1241
1242     /* For this bug, if the LC_MESSAGES locale changes, we have to do an
1243      * expensive workaround.  Save the current value so we can later determine
1244      * if it changed. */
1245     const char * old_messages_locale = NULL;
1246     if (   (index == LC_MESSAGES_INDEX_ || index == LC_ALL_INDEX_)
1247         &&  LIKELY(PL_phase != PERL_PHASE_CONSTRUCT))
1248     {
1249         old_messages_locale = querylocale_c(LC_MESSAGES);
1250     }
1251
1252 #  endif
1253
1254     assert(PL_C_locale_obj);
1255
1256     /* Now ready to switch to the input 'new_locale' */
1257
1258     /* Switching locales generally entails freeing the current one's space (at
1259      * the C library's discretion), hence we can't be using that locale at the
1260      * time of the switch (this wasn't obvious to khw from the man pages).  So
1261      * switch to a known locale object that we don't otherwise mess with. */
1262     if (! uselocale(PL_C_locale_obj)) {
1263
1264         /* Not being able to change to the C locale is severe; don't keep
1265          * going.  */
1266         setlocale_failure_panic_i(index, locale_on_entry, "C", __LINE__, line);
1267         NOT_REACHED; /* NOTREACHED */
1268     }
1269
1270     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1271              "(%" LINE_Tf "): emulate_setlocale_i now using C"
1272              " object=%p\n", line, PL_C_locale_obj));
1273
1274     locale_t new_obj;
1275
1276     /* We created a (never changing) object at start-up for LC_ALL being in the
1277      * C locale.  If this call is to switch to LC_ALL=>C, simply use that
1278      * object.  But in fact, we already have switched to it just above, in
1279      * preparation for the general case.  Since we're already there, no need to
1280      * do further switching. */
1281     if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(new_locale)) {
1282         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "(%" LINE_Tf "):"
1283                                                " emulate_setlocale_i will stay"
1284                                                " in C object\n", line));
1285         new_obj = PL_C_locale_obj;
1286
1287         /* And free the old object if it isn't a special one */
1288         if (entry_obj != LC_GLOBAL_LOCALE && entry_obj != PL_C_locale_obj) {
1289             freelocale(entry_obj);
1290         }
1291     }
1292     else {  /* Here is the general case, not to LC_ALL=>C */
1293         locale_t basis_obj = entry_obj;
1294
1295         /* Specially handle two objects */
1296         if (basis_obj == LC_GLOBAL_LOCALE || basis_obj == PL_C_locale_obj) {
1297
1298             /* For these two objects, we make duplicates to hand to newlocale()
1299              * below.  For LC_GLOBAL_LOCALE, this is because newlocale()
1300              * doesn't necessarily accept it as input (the results are
1301              * undefined).  For PL_C_locale_obj, it is so that it never gets
1302              * modified, as otherwise newlocale() is free to do so */
1303             basis_obj = duplocale(basis_obj);
1304             if (! basis_obj) {
1305                 locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): duplocale failed",
1306                                               line));
1307                 NOT_REACHED; /* NOTREACHED */
1308             }
1309
1310             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1311                                    "(%" LINE_Tf "): emulate_setlocale_i"
1312                                    " created %p by duping the input\n",
1313                                    line, basis_obj));
1314         }
1315
1316         /* Ready to create a new locale by modification of the exising one */
1317         new_obj = newlocale(mask, new_locale, basis_obj);
1318
1319         if (! new_obj) {
1320             DEBUG_L(PerlIO_printf(Perl_debug_log,
1321                                   " (%" LINE_Tf "): emulate_setlocale_i"
1322                                   " creating new object from %p failed:"
1323                                   " errno=%d\n",
1324                                   line, basis_obj, GET_ERRNO));
1325
1326             /* Failed.  Likely this is because the proposed new locale isn't
1327              * valid on this system.  But we earlier switched to the LC_ALL=>C
1328              * locale in anticipation of it succeeding,  Now have to switch
1329              * back to the state upon entry */
1330             if (! uselocale(entry_obj)) {
1331                 setlocale_failure_panic_i(index, "switching back to",
1332                                           locale_on_entry, __LINE__, line);
1333                 NOT_REACHED; /* NOTREACHED */
1334             }
1335
1336 #    ifdef USE_PL_CURLOCALES
1337
1338             if (entry_obj == LC_GLOBAL_LOCALE) {
1339
1340                 /* Here, we are back in the global locale.  We may never have
1341                  * set PL_curlocales.  If the locale change had succeeded, the
1342                  * code would have then set them up, but since it didn't, do so
1343                  * here.  khw isn't sure if this prevents some issues or not,
1344                  * but tis is defensive coding.  The system setlocale() returns
1345                  * the desired information.  This will calculate LC_ALL's entry
1346                  * only on the final iteration */
1347                 POSIX_SETLOCALE_LOCK;
1348                 for (PERL_UINT_FAST8_T i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1349                     update_PL_curlocales_i(i,
1350                                        posix_setlocale(categories[i], NULL),
1351                                        RECALCULATE_LC_ALL_ON_FINAL_INTERATION);
1352                 }
1353                 POSIX_SETLOCALE_UNLOCK;
1354             }
1355 #    endif
1356
1357             return NULL;
1358         }
1359
1360         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1361                                "(%" LINE_Tf "): emulate_setlocale_i created %p"
1362                                " while freeing %p\n", line, new_obj, basis_obj));
1363
1364         /* Here, successfully created an object representing the desired
1365          * locale; now switch into it */
1366         if (! uselocale(new_obj)) {
1367             freelocale(new_obj);
1368             locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): emulate_setlocale_i"
1369                                           " switching into new locale failed",
1370                                           line));
1371         }
1372     }
1373
1374     /* Here, we are using 'new_obj' which matches the input 'new_locale'. */
1375     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1376              "(%" LINE_Tf "): emulate_setlocale_i now using %p\n", line, new_obj));
1377
1378     /* We are done, except for updating our records (if the system doesn't keep
1379      * them) and in the case of locale "", we don't actually know what the
1380      * locale that got switched to is, as it came from the environment.  So
1381      * have to find it */
1382
1383 #  ifdef USE_QUERYLOCALE
1384
1385     if (strEQ(new_locale, "")) {
1386         new_locale = querylocale_i(index);
1387     }
1388
1389     PERL_UNUSED_ARG(recalc_LC_ALL);
1390
1391 #  else
1392
1393     new_locale = update_PL_curlocales_i(index, new_locale, recalc_LC_ALL);
1394
1395 #  endif
1396 #  ifdef HAS_GLIBC_LC_MESSAGES_BUG
1397
1398     /* Invalidate the glibc cache of loaded translations if the locale has changed,
1399      * see [perl #134264] */
1400     if (old_messages_locale) {
1401         if (strNE(old_messages_locale, my_querylocale_c(LC_MESSAGES))) {
1402             textdomain(textdomain(NULL));
1403         }
1404     }
1405
1406 #  endif
1407
1408     return new_locale;
1409 }
1410
1411 #endif   /* End of the various implementations of the setlocale and
1412             querylocale macros used in the remainder of this program */
1413
1414 #ifdef USE_LOCALE
1415
1416 /* So far, the locale strings returned by modern 2008-compliant systems have
1417  * been fine */
1418
1419 STATIC const char *
1420 S_stdize_locale(pTHX_ const int category,
1421                       const char *input_locale,
1422                       const char **buf,
1423                       Size_t *buf_size,
1424                       const line_t caller_line)
1425 {
1426     /* The return value of setlocale() is opaque, but is required to be usable
1427      * as input to a future setlocale() to create the same state.
1428      * Unfortunately not all systems are compliant.  But most often they are of
1429      * a very restricted set of forms that this file has been coded to expect.
1430      *
1431      * There are some outliers, though, that this function tries to tame:
1432      *
1433      * 1) A new-line.  This function chomps any \n characters
1434      * 2) foo=bar.     'bar' is what is generally meant, and the foo= part is
1435      *                 stripped.  This form is legal for LC_ALL.  When found in
1436      *                 that category group, the function calls itself
1437      *                 recursively on each possible component category to make
1438      *                 sure the individual categories are ok.
1439      *
1440      * If no changes to the input were made, it is returned; otherwise the
1441      * changed version is stored into memory at *buf, with *buf_size set to its
1442      * new value, and *buf is returned.
1443      */
1444
1445     const char * first_bad;
1446     const char * retval;
1447
1448     PERL_ARGS_ASSERT_STDIZE_LOCALE;
1449
1450     if (input_locale == NULL) {
1451         return NULL;
1452     }
1453
1454     first_bad = strpbrk(input_locale, "=\n");
1455
1456     /* Most likely, there isn't a problem with the input */
1457     if (LIKELY(! first_bad)) {
1458         return input_locale;
1459     }
1460
1461 #    ifdef LC_ALL
1462
1463     /* But if there is, and the category is LC_ALL, we have to look at each
1464      * component category */
1465     if (category == LC_ALL) {
1466         const char * individ_locales[LC_ALL_INDEX_];
1467         bool made_changes = FALSE;
1468         unsigned int i;
1469
1470         for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1471             Size_t this_size = 0;
1472             individ_locales[i] = stdize_locale(categories[i],
1473                                                posix_setlocale(categories[i],
1474                                                                NULL),
1475                                                &individ_locales[i],
1476                                                &this_size,
1477                                                caller_line);
1478
1479             /* If the size didn't change, it means this category did not have
1480              * to be adjusted, and individ_locales[i] points to the buffer
1481              * returned by posix_setlocale(); we have to copy that before
1482              * it's called again in the next iteration */
1483             if (this_size == 0) {
1484                 individ_locales[i] = savepv(individ_locales[i]);
1485             }
1486             else {
1487                 made_changes = TRUE;
1488             }
1489         }
1490
1491         /* If all the individual categories were ok as-is, this was a false
1492          * alarm.  We must have seen an '=' which was a legal occurrence in
1493          * this combination locale */
1494         if (! made_changes) {
1495             retval = input_locale;  /* The input can be returned unchanged */
1496         }
1497         else {
1498             retval = save_to_buffer(querylocale_c(LC_ALL), buf, buf_size);
1499         }
1500
1501         for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1502             Safefree(individ_locales[i]);
1503         }
1504
1505         return retval;
1506     }
1507
1508 #    else   /* else no LC_ALL */
1509
1510     PERL_UNUSED_ARG(category);
1511     PERL_UNUSED_ARG(caller_line);
1512
1513 #    endif
1514
1515     /* Here, there was a problem in an individual category.  This means that at
1516      * least one adjustment will be necessary.  Create a modifiable copy */
1517     retval = save_to_buffer(input_locale, buf, buf_size);
1518
1519     if (*first_bad != '=') {
1520
1521         /* Translate the found position into terms of the copy */
1522         first_bad = retval + (first_bad - input_locale);
1523     }
1524     else { /* An '=' */
1525
1526         /* It is unlikely that the return is so screwed-up that it contains
1527          * multiple equals signs, but handle that case by stripping all of
1528          * them.  */
1529         const char * final_equals = strrchr(retval, '=');
1530
1531         /* The length passed here causes the move to include the terminating
1532          * NUL */
1533         Move(final_equals + 1, retval, strlen(final_equals), char);
1534
1535         /* See if there are additional problems; if not, we're good to return.
1536          * */
1537         first_bad = strpbrk(retval, "\n");
1538
1539         if (! first_bad) {
1540             return retval;
1541         }
1542     }
1543
1544     /* Here, the problem must be a \n.  Get rid of it and what follows.
1545      * (Originally, only a trailing \n was stripped.  Unsure what to do if not
1546      * trailing) */
1547     *((char *) first_bad) = '\0';
1548     return retval;
1549 }
1550
1551 #if defined(USE_POSIX_2008_LOCALE) || ! defined(LC_ALL)
1552
1553 STATIC
1554 const char *
1555
1556 #  ifdef USE_QUERYLOCALE
1557 S_calculate_LC_ALL(pTHX_ const locale_t cur_obj)
1558 #  else
1559 S_calculate_LC_ALL(pTHX_ const char ** individ_locales)
1560 #  endif
1561
1562 {
1563     /* For POSIX 2008, we have to figure out LC_ALL ourselves when needed.
1564      * querylocale(), on systems that have it, doesn't tend to work for LC_ALL.
1565      * So we have to construct the answer ourselves based on the passed in
1566      * data, which is either a locale_t object, for systems with querylocale(),
1567      * or an array we keep updated to the proper values, otherwise.
1568      *
1569      * This returns a mortalized string containing the locale name(s) of
1570      * LC_ALL.
1571      *
1572      * If all individual categories are the same locale, we can just set LC_ALL
1573      * to that locale.  But if not, we have to create an aggregation of all the
1574      * categories on the system.  Platforms differ as to the syntax they use
1575      * for these non-uniform locales for LC_ALL.  Some use a '/' or other
1576      * delimiter of the locales with a predetermined order of categories; a
1577      * Configure probe would be needed to tell us how to decipher those.  glibc
1578      * uses a series of name=value pairs, like
1579      *      LC_NUMERIC=C;LC_TIME=en_US.UTF-8;...
1580      * The syntax we use for our aggregation doesn't much matter, as we take
1581      * care not to use the native setlocale() function on whatever style is
1582      * chosen.  But, it would be possible for someone to call Perl_setlocale()
1583      * using a native style we don't understand.  So far no one has complained.
1584      *
1585      * For systems that have categories we don't know about, the algorithm
1586      * below won't know about those missing categories, leading to potential
1587      * bugs for code that looks at them.  If there is an environment variable
1588      * that sets that category, we won't know to look for it, and so our use of
1589      * LANG or "C" improperly overrides it.  On the other hand, if we don't do
1590      * what is done here, and there is no environment variable, the category's
1591      * locale should be set to LANG or "C".  So there is no good solution.  khw
1592      * thinks the best is to make sure we have a complete list of possible
1593      * categories, adding new ones as they show up on obscure platforms.
1594      */
1595
1596     unsigned int i;
1597     Size_t names_len = 0;
1598     bool are_all_categories_the_same_locale = TRUE;
1599     char * aggregate_locale;
1600     char * previous_start = NULL;
1601     char * this_start = NULL;
1602     Size_t entry_len = 0;
1603
1604     PERL_ARGS_ASSERT_CALCULATE_LC_ALL;
1605
1606     /* First calculate the needed size for the string listing the categories
1607      * and their locales. */
1608     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1609
1610 #  ifdef USE_QUERYLOCALE
1611         const char * entry = querylocale_l(i, cur_obj);
1612 #  else
1613         const char * entry = individ_locales[i];
1614 #  endif
1615
1616         names_len += strlen(category_names[i])
1617                   + 1                           /* '=' */
1618                   + strlen(entry)
1619                   + 1;                          /* ';' */
1620     }
1621
1622     names_len++;    /* Trailing '\0' */
1623
1624     /* Allocate enough space for the aggregated string */
1625     SAVEFREEPV(Newxz(aggregate_locale, names_len, char));
1626
1627     /* Then fill it in */
1628     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1629         Size_t new_len;
1630
1631 #  ifdef USE_QUERYLOCALE
1632         const char * entry = querylocale_l(i, cur_obj);
1633 #  else
1634         const char * entry = individ_locales[i];
1635 #  endif
1636
1637         new_len = my_strlcat(aggregate_locale, category_names[i], names_len);
1638         assert(new_len <= names_len);
1639         new_len = my_strlcat(aggregate_locale, "=", names_len);
1640         assert(new_len <= names_len);
1641
1642         this_start = aggregate_locale + strlen(aggregate_locale);
1643         entry_len = strlen(entry);
1644
1645         new_len = my_strlcat(aggregate_locale, entry, names_len);
1646         assert(new_len <= names_len);
1647         new_len = my_strlcat(aggregate_locale, ";", names_len);
1648         assert(new_len <= names_len);
1649         PERL_UNUSED_VAR(new_len);   /* Only used in DEBUGGING */
1650
1651         if (   i > 0
1652             && are_all_categories_the_same_locale
1653             && memNE(previous_start, this_start, entry_len + 1))
1654         {
1655             are_all_categories_the_same_locale = FALSE;
1656         }
1657         else {
1658             previous_start = this_start;
1659         }
1660     }
1661
1662     /* If they are all the same, just return any one of them */
1663     if (are_all_categories_the_same_locale) {
1664         aggregate_locale = this_start;
1665         aggregate_locale[entry_len] = '\0';
1666     }
1667
1668     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1669                            "calculate_LC_ALL returning '%s'\n",
1670                            aggregate_locale));
1671
1672     return aggregate_locale;
1673 }
1674
1675 #endif
1676
1677 #if defined(USE_LOCALE) && defined(DEBUGGING)
1678
1679 STATIC const char *
1680 S_get_LC_ALL_display(pTHX)
1681 {
1682
1683 #  ifdef LC_ALL
1684
1685     return querylocale_c(LC_ALL);
1686
1687 #  else
1688
1689     const char * curlocales[NOMINAL_LC_ALL_INDEX];
1690
1691     for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1692         curlocales[i] = querylocale_i(i);
1693     }
1694
1695     return calculate_LC_ALL(curlocales);
1696
1697 #  endif
1698
1699 }
1700
1701 #endif
1702
1703 STATIC void
1704 S_setlocale_failure_panic_i(pTHX_
1705                             const unsigned int cat_index,
1706                             const char * current,
1707                             const char * failed,
1708                             const line_t caller_0_line,
1709                             const line_t caller_1_line)
1710 {
1711     dSAVE_ERRNO;
1712     const int cat = categories[cat_index];
1713     const char * name = category_names[cat_index];
1714
1715     PERL_ARGS_ASSERT_SETLOCALE_FAILURE_PANIC_I;
1716
1717     if (current == NULL) {
1718         current = querylocale_i(cat_index);
1719     }
1720
1721     Perl_locale_panic(Perl_form(aTHX_ "(%" LINE_Tf
1722                                       "): Can't change locale for %s(%d)"
1723                                       " from '%s' to '%s'",
1724                                       caller_1_line, name, cat,
1725                                       current, failed),
1726                       __FILE__, caller_0_line, GET_ERRNO);
1727     NOT_REACHED; /* NOTREACHED */
1728 }
1729
1730 /* Any of these will allow us to find the RADIX */
1731 #  if defined(USE_LOCALE_NUMERIC) && (   defined(HAS_SOME_LANGINFO)         \
1732                                       || defined(HAS_SOME_LOCALECONV)       \
1733                                       || defined(HAS_SNPRINTF))
1734 #    define CAN_CALCULATE_RADIX
1735 #  endif
1736 #  ifdef USE_LOCALE_NUMERIC
1737
1738 STATIC void
1739 S_new_numeric(pTHX_ const char *newnum)
1740 {
1741     PERL_ARGS_ASSERT_NEW_NUMERIC;
1742
1743     /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell
1744      * core Perl this and that 'newnum' is the name of the new locale, and we
1745      * are switched into it.  It installs this locale as the current underlying
1746      * default, and then switches to the C locale, if necessary, so that the
1747      * code that has traditionally expected the radix character to be a dot may
1748      * continue to do so.
1749      *
1750      * The default locale and the C locale can be toggled between by use of the
1751      * set_numeric_underlying() and set_numeric_standard() functions, which
1752      * should probably not be called directly, but only via macros like
1753      * SET_NUMERIC_STANDARD() in perl.h.
1754      *
1755      * The toggling is necessary mainly so that a non-dot radix decimal point
1756      * character can be input and output, while allowing internal calculations
1757      * to use a dot.
1758      *
1759      * This sets several interpreter-level variables:
1760      * PL_numeric_name  The underlying locale's name: a copy of 'newnum'
1761      * PL_numeric_underlying  A boolean indicating if the toggled state is such
1762      *                  that the current locale is the program's underlying
1763      *                  locale
1764      * PL_numeric_standard An int indicating if the toggled state is such
1765      *                  that the current locale is the C locale or
1766      *                  indistinguishable from the C locale.  If non-zero, it
1767      *                  is in C; if > 1, it means it may not be toggled away
1768      *                  from C.
1769      * PL_numeric_underlying_is_standard   A bool kept by this function
1770      *                  indicating that the underlying locale and the standard
1771      *                  C locale are indistinguishable for the purposes of
1772      *                  LC_NUMERIC.  This happens when both of the above two
1773      *                  variables are true at the same time.  (Toggling is a
1774      *                  no-op under these circumstances.)  This variable is
1775      *                  used to avoid having to recalculate.
1776      * PL_numeric_radix_sv  Contains the string that code should use for the
1777      *                  decimal point.  It is set to either a dot or the
1778      *                  program's underlying locale's radix character string,
1779      *                  depending on the situation.
1780      * PL_underlying_radix_sv  Contains the program's underlying locale's radix
1781      *                  character string.  This is copied into
1782      *                  PL_numeric_radix_sv when the situation warrants.  It
1783      *                  exists to avoid having to recalculate it when toggling.
1784      * PL_underlying_numeric_obj = (only on POSIX 2008 platforms)  An object
1785      *                  with everything set up properly so as to avoid work on
1786      *                  such platforms.
1787      */
1788
1789     DEBUG_L( PerlIO_printf(Perl_debug_log,
1790                            "Called new_numeric with %s, PL_numeric_name=%s\n",
1791                            newnum, PL_numeric_name));
1792
1793     /* If this isn't actually a change, do nothing */
1794     if (strEQ(PL_numeric_name, newnum)) {
1795         return;
1796     }
1797
1798     Safefree(PL_numeric_name);
1799     PL_numeric_name = savepv(newnum);
1800
1801     /* Handle the trivial case.  Since this is called at process
1802      * initialization, be aware that this bit can't rely on much being
1803      * available. */
1804     if (isNAME_C_OR_POSIX(PL_numeric_name)) {
1805         PL_numeric_standard = TRUE;
1806         PL_numeric_underlying_is_standard = TRUE;
1807         PL_numeric_underlying = TRUE;
1808         sv_setpv(PL_numeric_radix_sv, C_decimal_point);
1809         sv_setpv(PL_underlying_radix_sv, C_decimal_point);
1810         return;
1811     }
1812
1813     /* We are in the underlying locale until changed at the end of this
1814      * function */
1815     PL_numeric_underlying = TRUE;
1816
1817 #  ifdef USE_POSIX_2008_LOCALE
1818
1819     /* We keep a special object for easy switching to */
1820     PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
1821                                           PL_numeric_name,
1822                                           PL_underlying_numeric_obj);
1823
1824 #    endif
1825
1826     const char * radix = NULL;
1827     utf8ness_t utf8ness = UTF8NESS_IMMATERIAL;
1828
1829     /* Find and save this locale's radix character. */
1830     my_langinfo_c(RADIXCHAR, LC_NUMERIC, PL_numeric_name,
1831                   &radix, NULL, &utf8ness);
1832     sv_setpv(PL_underlying_radix_sv, radix);
1833
1834     if (utf8ness == UTF8NESS_YES) {
1835         SvUTF8_on(PL_underlying_radix_sv);
1836     }
1837
1838     DEBUG_L(PerlIO_printf(Perl_debug_log,
1839                           "Locale radix is '%s', ?UTF-8=%d\n",
1840                           SvPVX(PL_underlying_radix_sv),
1841                           cBOOL(SvUTF8(PL_underlying_radix_sv))));
1842
1843     /* This locale is indistinguishable from C (for numeric purposes) if both
1844      * the radix character and the thousands separator are the same as C's.
1845      * Start with the radix. */
1846     PL_numeric_underlying_is_standard = strEQ(C_decimal_point, radix);
1847     Safefree(radix);
1848
1849 #    ifndef TS_W32_BROKEN_LOCALECONV
1850
1851     /* If the radix isn't the same as C's, we know it is distinguishable from
1852      * C; otherwise check the thousands separator too.  Only if both are the
1853      * same as C's is the locale indistinguishable from C.
1854      *
1855      * But on earlier Windows versions, there is a potential race.  This code
1856      * knows that localeconv() (elsewhere in this file) will be used to extract
1857      * the needed value, and localeconv() was buggy for quite a while, and that
1858      * code in this file hence uses a workaround.  And that workaround may have
1859      * an (unlikely) race.  Gathering the radix uses a different workaround on
1860      * Windows that doesn't involve a race.  It might be possible to do the
1861      * same for this (patches welcome).
1862      *
1863      * Until then khw doesn't think it's worth even the small risk of a race to
1864      * get this value, which in almost all locales is empty, and doesn't appear
1865      * to be used in any of the Micrsoft library routines anyway. */
1866
1867     const char * scratch_buffer = NULL;
1868     PL_numeric_underlying_is_standard &= strEQ(C_thousands_sep,
1869                                                my_langinfo_c(THOUSEP, LC_NUMERIC,
1870                                                              PL_numeric_name,
1871                                                              &scratch_buffer,
1872                                                              NULL, NULL));
1873     Safefree(scratch_buffer);
1874
1875 #    endif
1876
1877     PL_numeric_standard = PL_numeric_underlying_is_standard;
1878
1879     /* Keep LC_NUMERIC so that it has the C locale radix and thousands
1880      * separator.  This is for XS modules, so they don't have to worry about
1881      * the radix being a non-dot.  (Core operations that need the underlying
1882      * locale change to it temporarily). */
1883     if (! PL_numeric_standard) {
1884         set_numeric_standard();
1885     }
1886
1887 }
1888
1889 #  endif
1890
1891 void
1892 Perl_set_numeric_standard(pTHX)
1893 {
1894
1895 #  ifdef USE_LOCALE_NUMERIC
1896
1897     /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
1898      * default.
1899      *
1900      * Most code should use the macro SET_NUMERIC_STANDARD() in perl.h
1901      * instead of calling this directly.  The macro avoids calling this routine
1902      * if toggling isn't necessary according to our records (which could be
1903      * wrong if some XS code has changed the locale behind our back) */
1904
1905     DEBUG_L(PerlIO_printf(Perl_debug_log,
1906                                   "Setting LC_NUMERIC locale to standard C\n"));
1907
1908     void_setlocale_c(LC_NUMERIC, "C");
1909     PL_numeric_standard = TRUE;
1910     sv_setpv(PL_numeric_radix_sv, C_decimal_point);
1911
1912     PL_numeric_underlying = PL_numeric_underlying_is_standard;
1913
1914 #  endif /* USE_LOCALE_NUMERIC */
1915
1916 }
1917
1918 void
1919 Perl_set_numeric_underlying(pTHX)
1920 {
1921
1922 #  ifdef USE_LOCALE_NUMERIC
1923
1924     /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
1925      * default.
1926      *
1927      * Most code should use the macro SET_NUMERIC_UNDERLYING() in perl.h
1928      * instead of calling this directly.  The macro avoids calling this routine
1929      * if toggling isn't necessary according to our records (which could be
1930      * wrong if some XS code has changed the locale behind our back) */
1931
1932     DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to %s\n",
1933                                           PL_numeric_name));
1934
1935     void_setlocale_c(LC_NUMERIC, PL_numeric_name);
1936     PL_numeric_underlying = TRUE;
1937     sv_setsv_nomg(PL_numeric_radix_sv, PL_underlying_radix_sv);
1938
1939     PL_numeric_standard = PL_numeric_underlying_is_standard;
1940
1941 #  endif /* USE_LOCALE_NUMERIC */
1942
1943 }
1944
1945 #  ifdef USE_LOCALE_CTYPE
1946
1947 STATIC void
1948 S_new_ctype(pTHX_ const char *newctype)
1949 {
1950     PERL_ARGS_ASSERT_NEW_CTYPE;
1951
1952     /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
1953      * core Perl this and that 'newctype' is the name of the new locale.
1954      *
1955      * This function sets up the folding arrays for all 256 bytes, assuming
1956      * that tofold() is tolc() since fold case is not a concept in POSIX,
1957      */
1958
1959     DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering new_ctype(%s)\n", newctype));
1960
1961     /* No change means no-op */
1962     if (strEQ(PL_ctype_name, newctype)) {
1963         return;
1964     }
1965
1966     /* We will replace any bad locale warning with 1) nothing if the new one is
1967      * ok; or 2) a new warning for the bad new locale */
1968     if (PL_warn_locale) {
1969         SvREFCNT_dec_NN(PL_warn_locale);
1970         PL_warn_locale = NULL;
1971     }
1972
1973     /* Clear cache */
1974     Safefree(PL_ctype_name);
1975     PL_ctype_name = "";
1976
1977     PL_in_utf8_turkic_locale = FALSE;
1978
1979     /* For the C locale, just use the standard folds, and we know there are no
1980      * glitches possible, so return early.  Since this is called at process
1981      * initialization, be aware that this bit can't rely on much being
1982      * available. */
1983     if (isNAME_C_OR_POSIX(newctype)) {
1984         Copy(PL_fold, PL_fold_locale, 256, U8);
1985         PL_ctype_name = savepv(newctype);
1986         PL_in_utf8_CTYPE_locale = FALSE;
1987         return;
1988     }
1989
1990     /* The cache being cleared signals this to compute a new value */
1991     PL_in_utf8_CTYPE_locale = is_locale_utf8(newctype);
1992
1993     PL_ctype_name = savepv(newctype);
1994     bool maybe_utf8_turkic = FALSE;
1995
1996     /* Don't check for problems if we are suppressing the warnings */
1997     bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
1998
1999     if (PL_in_utf8_CTYPE_locale) {
2000
2001         /* A UTF-8 locale gets standard rules.  But note that code still has to
2002          * handle this specially because of the three problematic code points
2003          * */
2004         Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
2005
2006         /* UTF-8 locales can have special handling for 'I' and 'i' if they are
2007          * Turkic.  Make sure these two are the only anomalies.  (We don't
2008          * require towupper and towlower because they aren't in C89.) */
2009
2010 #    if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER)
2011
2012         if (towupper('i') == 0x130 && towlower('I') == 0x131)
2013
2014 #    else
2015
2016         if (toU8_UPPER_LC('i') == 'i' && toU8_LOWER_LC('I') == 'I')
2017
2018 #    endif
2019
2020         {
2021             /* This is how we determine it really is Turkic */
2022             check_for_problems = TRUE;
2023             maybe_utf8_turkic = TRUE;
2024         }
2025     }
2026     else {  /* Not a canned locale we know the values for.  Compute them */
2027
2028 #    ifdef DEBUGGING
2029
2030         bool has_non_ascii_fold = FALSE;
2031         bool found_unexpected = FALSE;
2032
2033         if (DEBUG_Lv_TEST) {
2034             for (unsigned i = 128; i < 256; i++) {
2035                 int j = LATIN1_TO_NATIVE(i);
2036                 if (toU8_LOWER_LC(j) != j || toU8_UPPER_LC(j) != j) {
2037                     has_non_ascii_fold = TRUE;
2038                     break;
2039                 }
2040             }
2041         }
2042
2043 #    endif
2044
2045         for (unsigned i = 0; i < 256; i++) {
2046             if (isU8_UPPER_LC(i))
2047                 PL_fold_locale[i] = (U8) toU8_LOWER_LC(i);
2048             else if (isU8_LOWER_LC(i))
2049                 PL_fold_locale[i] = (U8) toU8_UPPER_LC(i);
2050             else
2051                 PL_fold_locale[i] = (U8) i;
2052
2053 #    ifdef DEBUGGING
2054
2055             if (DEBUG_Lv_TEST) {
2056                 bool unexpected = FALSE;
2057
2058                 if (isUPPER_L1(i)) {
2059                     if (isUPPER_A(i)) {
2060                         if (PL_fold_locale[i] != toLOWER_A(i)) {
2061                             unexpected = TRUE;
2062                         }
2063                     }
2064                     else if (has_non_ascii_fold) {
2065                         if (PL_fold_locale[i] != toLOWER_L1(i)) {
2066                             unexpected = TRUE;
2067                         }
2068                     }
2069                     else if (PL_fold_locale[i] != i) {
2070                         unexpected = TRUE;
2071                     }
2072                 }
2073                 else if (   isLOWER_L1(i)
2074                          && i != LATIN_SMALL_LETTER_SHARP_S
2075                          && i != MICRO_SIGN)
2076                 {
2077                     if (isLOWER_A(i)) {
2078                         if (PL_fold_locale[i] != toUPPER_A(i)) {
2079                             unexpected = TRUE;
2080                         }
2081                     }
2082                     else if (has_non_ascii_fold) {
2083                         if (PL_fold_locale[i] != toUPPER_LATIN1_MOD(i)) {
2084                             unexpected = TRUE;
2085                         }
2086                     }
2087                     else if (PL_fold_locale[i] != i) {
2088                         unexpected = TRUE;
2089                     }
2090                 }
2091                 else if (PL_fold_locale[i] != i) {
2092                     unexpected = TRUE;
2093                 }
2094
2095                 if (unexpected) {
2096                     found_unexpected = TRUE;
2097                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2098                                            "For %s, fold of %02x is %02x\n",
2099                                            newctype, i, PL_fold_locale[i]));
2100                 }
2101             }
2102         }
2103
2104         if (found_unexpected) {
2105             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2106                                "All bytes not mentioned above either fold to"
2107                                " themselves or are the expected ASCII or"
2108                                " Latin1 ones\n"));
2109         }
2110         else {
2111             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2112                                    "No nonstandard folds were found\n"));
2113 #    endif
2114
2115         }
2116     }
2117
2118 #    ifdef MB_CUR_MAX
2119
2120     /* We only handle single-byte locales (outside of UTF-8 ones; so if this
2121      * locale requires more than one byte, there are going to be BIG problems.
2122      * */
2123
2124     if (MB_CUR_MAX > 1 && ! PL_in_utf8_CTYPE_locale
2125
2126             /* Some platforms return MB_CUR_MAX > 1 for even the "C" locale.
2127              * Just assume that the implementation for them (plus for POSIX) is
2128              * correct and the > 1 value is spurious.  (Since these are
2129              * specially handled to never be considered UTF-8 locales, as long
2130              * as this is the only problem, everything should work fine */
2131         && ! isNAME_C_OR_POSIX(newctype))
2132     {
2133         DEBUG_L(PerlIO_printf(Perl_debug_log,
2134                               "Unsupported, MB_CUR_MAX=%d\n", (int) MB_CUR_MAX));
2135
2136         Perl_ck_warner_d(aTHX_ packWARN(WARN_LOCALE),
2137                          "Locale '%s' is unsupported, and may crash the"
2138                          " interpreter.\n",
2139                          newctype);
2140     }
2141
2142 #    endif
2143
2144     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "check_for_problems=%d\n",
2145                                            check_for_problems));
2146
2147     /* We don't populate the other lists if a UTF-8 locale, but do check that
2148      * everything works as expected, unless checking turned off */
2149     if (check_for_problems) {
2150         /* Assume enough space for every character being bad.  4 spaces each
2151          * for the 94 printable characters that are output like "'x' "; and 5
2152          * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
2153          * NUL */
2154         char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
2155         unsigned int bad_count = 0;         /* Count of bad characters */
2156
2157         for (unsigned i = 0; i < 256; i++) {
2158
2159             /* If checking for locale problems, see if the native ASCII-range
2160              * printables plus \n and \t are in their expected categories in
2161              * the new locale.  If not, this could mean big trouble, upending
2162              * Perl's and most programs' assumptions, like having a
2163              * metacharacter with special meaning become a \w.  Fortunately,
2164              * it's very rare to find locales that aren't supersets of ASCII
2165              * nowadays.  It isn't a problem for most controls to be changed
2166              * into something else; we check only \n and \t, though perhaps \r
2167              * could be an issue as well. */
2168             if (isGRAPH_A(i) || isBLANK_A(i) || i == '\n') {
2169                 bool is_bad = FALSE;
2170                 char name[4] = { '\0' };
2171
2172                 /* Convert the name into a string */
2173                 if (isGRAPH_A(i)) {
2174                     name[0] = i;
2175                     name[1] = '\0';
2176                 }
2177                 else if (i == '\n') {
2178                     my_strlcpy(name, "\\n", sizeof(name));
2179                 }
2180                 else if (i == '\t') {
2181                     my_strlcpy(name, "\\t", sizeof(name));
2182                 }
2183                 else {
2184                     assert(i == ' ');
2185                     my_strlcpy(name, "' '", sizeof(name));
2186                 }
2187
2188                 /* Check each possibe class */
2189                 if (UNLIKELY(cBOOL(isU8_ALPHANUMERIC_LC(i)) != cBOOL(isALPHANUMERIC_A(i))))  {
2190                     is_bad = TRUE;
2191                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2192                                           "isalnum('%s') unexpectedly is %x\n",
2193                                           name, cBOOL(isU8_ALPHANUMERIC_LC(i))));
2194                 }
2195                 if (UNLIKELY(cBOOL(isU8_ALPHA_LC(i)) != cBOOL(isALPHA_A(i))))  {
2196                     is_bad = TRUE;
2197                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2198                                           "isalpha('%s') unexpectedly is %x\n",
2199                                           name, cBOOL(isU8_ALPHA_LC(i))));
2200                 }
2201                 if (UNLIKELY(cBOOL(isU8_DIGIT_LC(i)) != cBOOL(isDIGIT_A(i))))  {
2202                     is_bad = TRUE;
2203                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2204                                           "isdigit('%s') unexpectedly is %x\n",
2205                                           name, cBOOL(isU8_DIGIT_LC(i))));
2206                 }
2207                 if (UNLIKELY(cBOOL(isU8_GRAPH_LC(i)) != cBOOL(isGRAPH_A(i))))  {
2208                     is_bad = TRUE;
2209                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2210                                           "isgraph('%s') unexpectedly is %x\n",
2211                                           name, cBOOL(isU8_GRAPH_LC(i))));
2212                 }
2213                 if (UNLIKELY(cBOOL(isU8_LOWER_LC(i)) != cBOOL(isLOWER_A(i))))  {
2214                     is_bad = TRUE;
2215                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2216                                           "islower('%s') unexpectedly is %x\n",
2217                                           name, cBOOL(isU8_LOWER_LC(i))));
2218                 }
2219                 if (UNLIKELY(cBOOL(isU8_PRINT_LC(i)) != cBOOL(isPRINT_A(i))))  {
2220                     is_bad = TRUE;
2221                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2222                                           "isprint('%s') unexpectedly is %x\n",
2223                                           name, cBOOL(isU8_PRINT_LC(i))));
2224                 }
2225                 if (UNLIKELY(cBOOL(isU8_PUNCT_LC(i)) != cBOOL(isPUNCT_A(i))))  {
2226                     is_bad = TRUE;
2227                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2228                                           "ispunct('%s') unexpectedly is %x\n",
2229                                           name, cBOOL(isU8_PUNCT_LC(i))));
2230                 }
2231                 if (UNLIKELY(cBOOL(isU8_SPACE_LC(i)) != cBOOL(isSPACE_A(i))))  {
2232                     is_bad = TRUE;
2233                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2234                                           "isspace('%s') unexpectedly is %x\n",
2235                                           name, cBOOL(isU8_SPACE_LC(i))));
2236                 }
2237                 if (UNLIKELY(cBOOL(isU8_UPPER_LC(i)) != cBOOL(isUPPER_A(i))))  {
2238                     is_bad = TRUE;
2239                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2240                                           "isupper('%s') unexpectedly is %x\n",
2241                                           name, cBOOL(isU8_UPPER_LC(i))));
2242                 }
2243                 if (UNLIKELY(cBOOL(isU8_XDIGIT_LC(i))!= cBOOL(isXDIGIT_A(i))))  {
2244                     is_bad = TRUE;
2245                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2246                                           "isxdigit('%s') unexpectedly is %x\n",
2247                                           name, cBOOL(isU8_XDIGIT_LC(i))));
2248                 }
2249                 if (UNLIKELY(toU8_LOWER_LC(i) != (int) toLOWER_A(i))) {
2250                     is_bad = TRUE;
2251                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2252                             "tolower('%s')=0x%x instead of the expected 0x%x\n",
2253                             name, toU8_LOWER_LC(i), (int) toLOWER_A(i)));
2254                 }
2255                 if (UNLIKELY(toU8_UPPER_LC(i) != (int) toUPPER_A(i))) {
2256                     is_bad = TRUE;
2257                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2258                             "toupper('%s')=0x%x instead of the expected 0x%x\n",
2259                             name, toU8_UPPER_LC(i), (int) toUPPER_A(i)));
2260                 }
2261                 if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i))))  {
2262                     is_bad = TRUE;
2263                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2264                                 "'\\n' (=%02X) is not a control\n", (int) i));
2265                 }
2266
2267                 /* Add to the list;  Separate multiple entries with a blank */
2268                 if (is_bad) {
2269                     if (bad_count) {
2270                         my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list));
2271                     }
2272                     my_strlcat(bad_chars_list, name, sizeof(bad_chars_list));
2273                     bad_count++;
2274                 }
2275             }
2276         }
2277
2278         if (bad_count == 2 && maybe_utf8_turkic) {
2279             bad_count = 0;
2280             *bad_chars_list = '\0';
2281             PL_fold_locale['I'] = 'I';
2282             PL_fold_locale['i'] = 'i';
2283             PL_in_utf8_turkic_locale = TRUE;
2284             DEBUG_L(PerlIO_printf(Perl_debug_log, "%s is turkic\n", newctype));
2285         }
2286
2287         /* If we found problems and we want them output, do so */
2288         if (   (UNLIKELY(bad_count))
2289             && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
2290         {
2291             if (UNLIKELY(bad_count) && PL_in_utf8_CTYPE_locale) {
2292                 PL_warn_locale = Perl_newSVpvf(aTHX_
2293                      "Locale '%s' contains (at least) the following characters"
2294                      " which have\nunexpected meanings: %s\nThe Perl program"
2295                      " will use the expected meanings",
2296                       newctype, bad_chars_list);
2297             }
2298             else {
2299                 PL_warn_locale =
2300                     Perl_newSVpvf(aTHX_
2301                                   "\nThe following characters (and maybe"
2302                                   " others) may not have the same meaning as"
2303                                   " the Perl program expects: %s\n",
2304                                   bad_chars_list
2305                             );
2306             }
2307
2308 #    ifdef HAS_SOME_LANGINFO
2309
2310             const char * scratch_buffer = NULL;
2311             Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s",
2312                                  my_langinfo_c(CODESET, LC_CTYPE,
2313                                                newctype,
2314                                                &scratch_buffer, NULL,
2315                                                NULL));
2316             Safefree(scratch_buffer);
2317
2318 #  endif
2319
2320             Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n");
2321
2322             /* If we are actually in the scope of the locale or are debugging,
2323              * output the message now.  If not in that scope, we save the
2324              * message to be output at the first operation using this locale,
2325              * if that actually happens.  Most programs don't use locales, so
2326              * they are immune to bad ones.  */
2327             if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
2328
2329                 /* The '0' below suppresses a bogus gcc compiler warning */
2330                 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale),
2331                                                                             0);
2332
2333                 if (IN_LC(LC_CTYPE)) {
2334                     SvREFCNT_dec_NN(PL_warn_locale);
2335                     PL_warn_locale = NULL;
2336                 }
2337             }
2338         }
2339     }
2340 }
2341
2342 #  endif /* USE_LOCALE_CTYPE */
2343
2344 void
2345 Perl__warn_problematic_locale()
2346 {
2347
2348 #  ifdef USE_LOCALE_CTYPE
2349
2350     dTHX;
2351
2352     /* Internal-to-core function that outputs the message in PL_warn_locale,
2353      * and then NULLS it.  Should be called only through the macro
2354      * CHECK_AND_WARN_PROBLEMATIC_LOCALE_ */
2355
2356     if (PL_warn_locale) {
2357         Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2358                              SvPVX(PL_warn_locale),
2359                              0 /* dummy to avoid compiler warning */ );
2360         SvREFCNT_dec_NN(PL_warn_locale);
2361         PL_warn_locale = NULL;
2362     }
2363
2364 #  endif
2365
2366 }
2367
2368 STATIC void
2369 S_new_LC_ALL(pTHX_ const char *unused)
2370 {
2371     unsigned int i;
2372
2373     /* LC_ALL updates all the things we care about. */
2374
2375     PERL_UNUSED_ARG(unused);
2376
2377     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2378         if (update_functions[i]) {
2379             const char * this_locale = querylocale_i(i);
2380             update_functions[i](aTHX_ this_locale);
2381         }
2382     }
2383 }
2384
2385 #  ifdef USE_LOCALE_COLLATE
2386
2387 STATIC void
2388 S_new_collate(pTHX_ const char *newcoll)
2389 {
2390     PERL_ARGS_ASSERT_NEW_COLLATE;
2391
2392
2393     /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
2394      * core Perl this and that 'newcoll' is the name of the new locale.
2395      *
2396      * The design of locale collation is that every locale change is given an
2397      * index 'PL_collation_ix'.  The first time a string particpates in an
2398      * operation that requires collation while locale collation is active, it
2399      * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()).  That
2400      * magic includes the collation index, and the transformation of the string
2401      * by strxfrm(), q.v.  That transformation is used when doing comparisons,
2402      * instead of the string itself.  If a string changes, the magic is
2403      * cleared.  The next time the locale changes, the index is incremented,
2404      * and so we know during a comparison that the transformation is not
2405      * necessarily still valid, and so is recomputed.  Note that if the locale
2406      * changes enough times, the index could wrap (a U32), and it is possible
2407      * that a transformation would improperly be considered valid, leading to
2408      * an unlikely bug */
2409
2410     /* Return if the locale isn't changing */
2411     if (strEQ(PL_collation_name, newcoll)) {
2412         return;
2413     }
2414
2415     Safefree(PL_collation_name);
2416     PL_collation_name = savepv(newcoll);
2417     ++PL_collation_ix;
2418
2419     /* Set the new one up if trivial.  Since this is called at process
2420      * initialization, be aware that this bit can't rely on much being
2421      * available. */
2422     PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
2423     if (PL_collation_standard) {
2424         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Setting PL_collation name='%s'\n", PL_collation_name));
2425         PL_collxfrm_base = 0;
2426         PL_collxfrm_mult = 2;
2427         PL_in_utf8_COLLATE_locale = FALSE;
2428         PL_strxfrm_NUL_replacement = '\0';
2429         PL_strxfrm_max_cp = 0;
2430         return;
2431     }
2432
2433     /* Flag that the remainder of the set up is being deferred until first need */
2434     PL_collxfrm_mult = 0;
2435     PL_collxfrm_base = 0;
2436
2437 }
2438
2439 #  endif /* USE_LOCALE_COLLATE */
2440 #endif  /* USE_LOCALE */
2441
2442 #ifdef WIN32
2443
2444 wchar_t *
2445 Perl_Win_utf8_string_to_wstring(const char * utf8_string)
2446 {
2447     wchar_t *wstring;
2448
2449     int req_size = MultiByteToWideChar(CP_UTF8, 0, utf8_string, -1, NULL, 0);
2450     if (! req_size) {
2451         errno = EINVAL;
2452         return NULL;
2453     }
2454
2455     Newx(wstring, req_size, wchar_t);
2456
2457     if (! MultiByteToWideChar(CP_UTF8, 0, utf8_string, -1, wstring, req_size))
2458     {
2459         Safefree(wstring);
2460         errno = EINVAL;
2461         return NULL;
2462     }
2463
2464     return wstring;
2465 }
2466
2467 char *
2468 Perl_Win_wstring_to_utf8_string(const wchar_t * wstring)
2469 {
2470     char *utf8_string;
2471
2472     int req_size =
2473               WideCharToMultiByte(CP_UTF8, 0, wstring, -1, NULL, 0, NULL, NULL);
2474
2475     Newx(utf8_string, req_size, char);
2476
2477     if (! WideCharToMultiByte(CP_UTF8, 0, wstring, -1, utf8_string,
2478                                                          req_size, NULL, NULL))
2479     {
2480         Safefree(utf8_string);
2481         errno = EINVAL;
2482         return NULL;
2483     }
2484
2485     return utf8_string;
2486 }
2487
2488 #define USE_WSETLOCALE
2489
2490 #ifdef USE_WSETLOCALE
2491
2492 STATIC char *
2493 S_wrap_wsetlocale(pTHX_ int category, const char *locale) {
2494     wchar_t *wlocale = NULL;
2495     wchar_t *wresult;
2496     char *result;
2497
2498     if (locale) {
2499         wlocale = Win_utf8_string_to_wstring(locale);
2500         if (! wlocale) {
2501             return NULL;
2502         }
2503     }
2504     else {
2505         wlocale = NULL;
2506     }
2507
2508     wresult = _wsetlocale(category, wlocale);
2509     Safefree(wlocale);
2510
2511     if (! wresult) {
2512             return NULL;
2513         }
2514
2515     result = Win_wstring_to_utf8_string(wresult);
2516     SAVEFREEPV(result); /* is there something better we can do here? */
2517
2518     return result;
2519 }
2520
2521 #endif
2522
2523 STATIC char *
2524 S_win32_setlocale(pTHX_ int category, const char* locale)
2525 {
2526     /* This, for Windows, emulates POSIX setlocale() behavior.  There is no
2527      * difference between the two unless the input locale is "", which normally
2528      * means on Windows to get the machine default, which is set via the
2529      * computer's "Regional and Language Options" (or its current equivalent).
2530      * In POSIX, it instead means to find the locale from the user's
2531      * environment.  This routine changes the Windows behavior to first look in
2532      * the environment, and, if anything is found, use that instead of going to
2533      * the machine default.  If there is no environment override, the machine
2534      * default is used, by calling the real setlocale() with "".
2535      *
2536      * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
2537      * use the particular category's variable if set; otherwise to use the LANG
2538      * variable. */
2539
2540     bool override_LC_ALL = FALSE;
2541     char * result;
2542     unsigned int i;
2543
2544     if (locale && strEQ(locale, "")) {
2545
2546 #  ifdef LC_ALL
2547
2548         locale = PerlEnv_getenv("LC_ALL");
2549         if (! locale) {
2550             if (category ==  LC_ALL) {
2551                 override_LC_ALL = TRUE;
2552             }
2553             else {
2554
2555 #  endif
2556
2557                 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2558                     if (category == categories[i]) {
2559                         locale = PerlEnv_getenv(category_names[i]);
2560                         goto found_locale;
2561                     }
2562                 }
2563
2564                 locale = PerlEnv_getenv("LANG");
2565                 if (! locale) {
2566                     locale = "";
2567                 }
2568
2569               found_locale: ;
2570
2571 #  ifdef LC_ALL
2572
2573             }
2574         }
2575
2576 #  endif
2577
2578     }
2579
2580 #ifdef USE_WSETLOCALE
2581     result = S_wrap_wsetlocale(aTHX_ category, locale);
2582 #else
2583     result = setlocale(category, locale);
2584 #endif
2585     DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2586                           setlocale_debug_string_r(category, locale, result)));
2587
2588     if (! override_LC_ALL)  {
2589         return result;
2590     }
2591
2592     /* Here the input category was LC_ALL, and we have set it to what is in the
2593      * LANG variable or the system default if there is no LANG.  But these have
2594      * lower priority than the other LC_foo variables, so override it for each
2595      * one that is set.  (If they are set to "", it means to use the same thing
2596      * we just set LC_ALL to, so can skip) */
2597
2598     for (i = 0; i < LC_ALL_INDEX_; i++) {
2599         result = PerlEnv_getenv(category_names[i]);
2600         if (result && strNE(result, "")) {
2601 #ifdef USE_WSETLOCALE
2602             S_wrap_wsetlocale(aTHX_ categories[i], result);
2603 #else
2604             setlocale(categories[i], result);
2605 #endif
2606             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s\n",
2607                 setlocale_debug_string_i(i, result, "not captured")));
2608         }
2609     }
2610
2611     result = setlocale(LC_ALL, NULL);
2612     DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2613                           setlocale_debug_string_c(LC_ALL, NULL, result)));
2614
2615     return result;
2616 }
2617
2618 #endif
2619
2620 /*
2621 =for apidoc Perl_setlocale
2622
2623 This is an (almost) drop-in replacement for the system L<C<setlocale(3)>>,
2624 taking the same parameters, and returning the same information, except that it
2625 returns the correct underlying C<LC_NUMERIC> locale.  Regular C<setlocale> will
2626 instead return C<C> if the underlying locale has a non-dot decimal point
2627 character, or a non-empty thousands separator for displaying floating point
2628 numbers.  This is because perl keeps that locale category such that it has a
2629 dot and empty separator, changing the locale briefly during the operations
2630 where the underlying one is required. C<Perl_setlocale> knows about this, and
2631 compensates; regular C<setlocale> doesn't.
2632
2633 Another reason it isn't completely a drop-in replacement is that it is
2634 declared to return S<C<const char *>>, whereas the system setlocale omits the
2635 C<const> (presumably because its API was specified long ago, and can't be
2636 updated; it is illegal to change the information C<setlocale> returns; doing
2637 so leads to segfaults.)
2638
2639 Finally, C<Perl_setlocale> works under all circumstances, whereas plain
2640 C<setlocale> can be completely ineffective on some platforms under some
2641 configurations.
2642
2643 C<Perl_setlocale> should not be used to change the locale except on systems
2644 where the predefined variable C<${^SAFE_LOCALES}> is 1.  On some such systems,
2645 the system C<setlocale()> is ineffective, returning the wrong information, and
2646 failing to actually change the locale.  C<Perl_setlocale>, however works
2647 properly in all circumstances.
2648
2649 The return points to a per-thread static buffer, which is overwritten the next
2650 time C<Perl_setlocale> is called from the same thread.
2651
2652 =cut
2653
2654 */
2655
2656 #ifndef USE_LOCALE_NUMERIC
2657 #  define affects_LC_NUMERIC(cat) 0
2658 #elif defined(LC_ALL)
2659 #  define affects_LC_NUMERIC(cat) (cat == LC_NUMERIC || cat == LC_ALL)
2660 #else
2661 #  define affects_LC_NUMERIC(cat) (cat == LC_NUMERIC)
2662 #endif
2663
2664 const char *
2665 Perl_setlocale(const int category, const char * locale)
2666 {
2667     /* This wraps POSIX::setlocale() */
2668
2669 #ifndef USE_LOCALE
2670
2671     PERL_UNUSED_ARG(category);
2672     PERL_UNUSED_ARG(locale);
2673
2674     return "C";
2675
2676 #else
2677
2678     const char * retval;
2679     dTHX;
2680
2681     DEBUG_L(PerlIO_printf(Perl_debug_log,
2682                           "Entering Perl_setlocale(%d, \"%s\")\n",
2683                           category, locale));
2684
2685     /* A NULL locale means only query what the current one is. */
2686     if (locale == NULL) {
2687
2688 #  ifndef USE_LOCALE_NUMERIC
2689
2690         /* Without LC_NUMERIC, it's trivial; we just return the value */
2691         return save_to_buffer(querylocale_r(category),
2692                               &PL_setlocale_buf, &PL_setlocale_bufsize);
2693 #  else
2694
2695         /* We have the LC_NUMERIC name saved, because we are normally switched
2696          * into the C locale (or equivalent) for it. */
2697         if (category == LC_NUMERIC) {
2698             DEBUG_L(PerlIO_printf(Perl_debug_log,
2699                     "Perl_setlocale(LC_NUMERIC, NULL) returning stashed '%s'\n",
2700                     PL_numeric_name));
2701
2702             /* We don't have to copy this return value, as it is a per-thread
2703              * variable, and won't change until a future setlocale */
2704             return PL_numeric_name;
2705         }
2706
2707 #    ifndef LC_ALL
2708
2709         /* Without LC_ALL, just return the value */
2710         return save_to_buffer(querylocale_r(category),
2711                               &PL_setlocale_buf, &PL_setlocale_bufsize);
2712
2713 #    else
2714
2715         /* Here, LC_ALL is available on this platform.  It's the one
2716          * complicating category (because it can contain a toggled LC_NUMERIC
2717          * value), for all the remaining ones (we took care of LC_NUMERIC
2718          * above), just return the value */
2719         if (category != LC_ALL) {
2720             return save_to_buffer(querylocale_r(category),
2721                                   &PL_setlocale_buf, &PL_setlocale_bufsize);
2722         }
2723
2724         bool toggled = FALSE;
2725
2726         /* For an LC_ALL query, switch back to the underlying numeric locale
2727          * (if we aren't there already) so as to get the correct results.  Our
2728          * records for all the other categories are valid without switching */
2729         if (! PL_numeric_underlying) {
2730             set_numeric_underlying();
2731             toggled = TRUE;
2732         }
2733
2734         retval = querylocale_c(LC_ALL);
2735
2736         if (toggled) {
2737             set_numeric_standard();
2738         }
2739
2740         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2741                             setlocale_debug_string_r(category, locale, retval)));
2742
2743         return save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
2744
2745 #    endif      /* Has LC_ALL */
2746 #  endif        /* Has LC_NUMERIC */
2747
2748     } /* End of querying the current locale */
2749
2750
2751     /* Here, the input has a locale to change to.  First find the current
2752      * locale */
2753     unsigned int cat_index = get_category_index(category, NULL);
2754     retval = querylocale_i(cat_index);
2755
2756     /* If the new locale is the same as the current one, nothing is actually
2757      * being changed, so do nothing. */
2758     if (      strEQ(retval, locale)
2759         && (   ! affects_LC_NUMERIC(category)
2760
2761 #  ifdef USE_LOCALE_NUMERIC
2762
2763             || strEQ(locale, PL_numeric_name)
2764
2765 #  endif
2766
2767     )) {
2768         DEBUG_L(PerlIO_printf(Perl_debug_log,
2769                               "Already in requested locale: no action taken\n"));
2770         return save_to_buffer(setlocale_i(cat_index, locale),
2771                               &PL_setlocale_buf, &PL_setlocale_bufsize);
2772     }
2773
2774     /* Here, an actual change is being requested.  Do it */
2775     retval = setlocale_i(cat_index, locale);
2776
2777     if (! retval) {
2778         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2779                           setlocale_debug_string_i(cat_index, locale, "NULL")));
2780         return NULL;
2781     }
2782
2783     assert(strNE(retval, ""));
2784     retval = save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
2785
2786     /* Now that have changed locales, we have to update our records to
2787      * correspond.  Only certain categories have extra work to update. */
2788     if (update_functions[cat_index]) {
2789         update_functions[cat_index](aTHX_ retval);
2790     }
2791
2792     DEBUG_L(PerlIO_printf(Perl_debug_log, "returning '%s'\n", retval));
2793
2794     return retval;
2795
2796 #endif
2797
2798 }
2799
2800 #ifdef USE_LOCALE
2801
2802 STATIC const char *
2803 S_save_to_buffer(const char * string, const char **buf, Size_t *buf_size)
2804 {
2805     /* Copy the NUL-terminated 'string' to a buffer whose address before this
2806      * call began at *buf, and whose available length before this call was
2807      * *buf_size.
2808      *
2809      * If the length of 'string' is greater than the space available, the
2810      * buffer is grown accordingly, which may mean that it gets relocated.
2811      * *buf and *buf_size will be updated to reflect this.
2812      *
2813      * Regardless, the function returns a pointer to where 'string' is now
2814      * stored.
2815      *
2816      * 'string' may be NULL, which means no action gets taken, and NULL is
2817      * returned.
2818      *
2819      * If *buf or 'buf_size' are NULL or *buf_size is 0, the buffer is assumed
2820      * empty, and memory is malloc'd.   'buf-size' being NULL is to be used
2821      * when this is a single use buffer, which will shortly be freed by the
2822      * caller.
2823      */
2824
2825     Size_t string_size;
2826
2827     PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
2828
2829     if (! string) {
2830         return NULL;
2831     }
2832
2833     /* No-op to copy over oneself */
2834     if (string == *buf) {
2835         return string;
2836     }
2837
2838     string_size = strlen(string) + 1;
2839
2840     if (buf_size == NULL) {
2841         Newx(*buf, string_size, char);
2842     }
2843     else if (*buf_size == 0) {
2844         Newx(*buf, string_size, char);
2845         *buf_size = string_size;
2846     }
2847     else if (string_size > *buf_size) {
2848         Renew(*buf, string_size, char);
2849         *buf_size = string_size;
2850     }
2851
2852     {
2853         dTHX_DEBUGGING;
2854         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2855                          "Copying '%s' to %p\n",
2856                          ((is_utf8_string((U8 *) string, 0))
2857                           ? string
2858                           :_byte_dump_string((U8 *) string, strlen(string), 0)),
2859                           *buf));
2860             }
2861
2862 #    ifdef DEBUGGING
2863
2864     /* Catch glitches.  Usually this is because LC_CTYPE needs to be the same
2865      * locale as whatever is being worked on */
2866     if (UNLIKELY(instr(string, REPLACEMENT_CHARACTER_UTF8))) {
2867         dTHX_DEBUGGING;
2868
2869         locale_panic_(Perl_form(aTHX_
2870                                 "Unexpected REPLACEMENT_CHARACTER in '%s'\n%s",
2871                                 string, get_LC_ALL_display()));
2872     }
2873
2874 #    endif
2875
2876     Copy(string, *buf, string_size, char);
2877     return *buf;
2878 }
2879
2880 STATIC utf8ness_t
2881 S_get_locale_string_utf8ness_i(pTHX_ const char * locale,
2882                                      const unsigned cat_index,
2883                                      const char * string,
2884                                      const locale_utf8ness_t known_utf8)
2885 {
2886     /* Return to indicate if 'string' in the locale given by the input
2887      * arguments should be considered UTF-8 or not.
2888      *
2889      * If the input 'locale' is not NULL, use that for the locale; otherwise
2890      * use the current locale for the category specified by 'cat_index'.
2891      */
2892
2893     Size_t len;
2894     const U8 * first_variant = NULL;
2895
2896     PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I;
2897     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
2898
2899     if (string == NULL) {
2900         return UTF8NESS_NO;
2901     }
2902
2903     if (IN_BYTES) { /* respect 'use bytes' */
2904         return UTF8NESS_NO;
2905     }
2906
2907     len = strlen(string);
2908
2909     /* UTF8ness is immaterial if the representation doesn't vary */
2910     if (is_utf8_invariant_string_loc((U8 *) string, len, &first_variant)) {
2911         return UTF8NESS_IMMATERIAL;
2912     }
2913
2914     /* Can't be UTF-8 if invalid */
2915     if (! is_utf8_string((U8 *) first_variant,
2916                          len - ((char *) first_variant - string)))
2917     {
2918         return UTF8NESS_NO;
2919     }
2920
2921     /* Here and below, we know the string is legal UTF-8, containing at least
2922      * one character requiring a sequence of two or more bytes.  It is quite
2923      * likely to be UTF-8.  But it pays to be paranoid and do further checking.
2924      *
2925      * If we already know the UTF-8ness of the locale, then we immediately know
2926      * what the string is */
2927     if (UNLIKELY(known_utf8 != LOCALE_UTF8NESS_UNKNOWN)) {
2928         if (known_utf8 == LOCALE_IS_UTF8) {
2929             return UTF8NESS_YES;
2930         }
2931         else {
2932             return UTF8NESS_NO;
2933         }
2934     }
2935
2936 #  if defined(HAS_SOME_LANGINFO) || defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
2937
2938     /* Here, we have available the libc functions that can be used to
2939      * accurately determine the UTF8ness of the underlying locale.  If it is a
2940      * UTF-8 locale, the string is UTF-8;  otherwise it was coincidental that
2941      * the string is legal UTF-8
2942      *
2943      * However, if the perl is compiled to not pay attention to the category
2944      * being passed in, you might think that that locale is essentially always
2945      * the C locale, so it would make sense to say it isn't UTF-8.  But to get
2946      * here, the string has to contain characters unknown in the C locale.  And
2947      * in fact, Windows boxes are compiled without LC_MESSAGES, as their
2948      * message catalog isn't really a part of the locale system.  But those
2949      * messages really could be UTF-8, and given that the odds are rather small
2950      * of something not being UTF-8 but being syntactically valid UTF-8, khw
2951      * has decided to call such strings as UTF-8. */
2952
2953     if (locale == NULL) {
2954         locale = querylocale_i(cat_index);
2955     }
2956     if (is_locale_utf8(locale)) {
2957         return UTF8NESS_YES;
2958     }
2959
2960     return UTF8NESS_NO;
2961
2962 #  else
2963
2964     /* Here, we have a valid UTF-8 string containing non-ASCII characters, and
2965      * don't have access to functions to check if the locale is UTF-8 or not.
2966      * Assume that it is.  khw tried adding a check that the string is entirely
2967      * in a single Unicode script, but discovered the strftime() timezone is
2968      * user-settable through the environment, which may be in a different
2969      * script than the locale-expected value. */
2970     PERL_UNUSED_ARG(locale);
2971     PERL_UNUSED_ARG(cat_index);
2972
2973     return UTF8NESS_YES;
2974
2975 #  endif
2976
2977 }
2978
2979 #  ifdef WIN32
2980
2981 bool
2982 Perl_get_win32_message_utf8ness(pTHX_ const char * string)
2983 {
2984     /* NULL => locale irrelevant, 0 => category irrelevant
2985      * so returns based on the UTF-8 legality of the input string, ignoring the
2986      * locale and category completely.
2987      *
2988      * This is because Windows doesn't have LC_MESSAGES */
2989     return get_locale_string_utf8ness_i(NULL, 0, string, LOCALE_IS_UTF8);
2990 }
2991
2992 #  endif
2993 #endif  /* USE_LOCALE */
2994
2995
2996 int
2997 Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len)
2998 {
2999
3000 #if ! defined(HAS_MBRTOWC) && ! defined(HAS_MBTOWC)
3001
3002     PERL_UNUSED_ARG(pwc);
3003     PERL_UNUSED_ARG(s);
3004     PERL_UNUSED_ARG(len);
3005     return -1;
3006
3007 #else   /* Below we have some form of mbtowc() */
3008 #   if defined(HAS_MBRTOWC)                                     \
3009    && (defined(USE_LOCALE_THREADS) || ! defined(HAS_MBTOWC))
3010 #    define USE_MBRTOWC
3011 #  else
3012 #    undef USE_MBRTOWC
3013 #  endif
3014
3015     int retval = -1;
3016
3017     if (s == NULL) { /* Initialize the shift state to all zeros in
3018                         PL_mbrtowc_ps. */
3019
3020 #  if defined(USE_MBRTOWC)
3021
3022         memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
3023         return 0;
3024
3025 #  else
3026
3027         MBTOWC_LOCK_;
3028         SETERRNO(0, 0);
3029         retval = mbtowc(NULL, NULL, 0);
3030         MBTOWC_UNLOCK_;
3031         return retval;
3032
3033 #  endif
3034
3035     }
3036
3037 #  if defined(USE_MBRTOWC)
3038
3039     SETERRNO(0, 0);
3040     retval = (SSize_t) mbrtowc((wchar_t *) pwc, s, len, &PL_mbrtowc_ps);
3041
3042 #  else
3043
3044     /* Locking prevents races, but locales can be switched out without locking,
3045      * so this isn't a cure all */
3046     MBTOWC_LOCK_;
3047     SETERRNO(0, 0);
3048     retval = mbtowc((wchar_t *) pwc, s, len);
3049     MBTOWC_UNLOCK_;
3050
3051 #  endif
3052
3053     return retval;
3054
3055 #endif
3056
3057 }
3058
3059 /*
3060 =for apidoc Perl_localeconv
3061
3062 This is a thread-safe version of the libc L<localeconv(3)>.  It is the same as
3063 L<POSIX::localeconv|POSIX/localeconv> (returning a hash of the C<localeconv()>
3064 fields), but directly callable from XS code.
3065
3066 =cut
3067 */
3068
3069 HV *
3070 Perl_localeconv(pTHX)
3071 {
3072
3073 #if  ! defined(HAS_SOME_LOCALECONV)                                     \
3074  || (! defined(USE_LOCALE_MONETARY) && ! defined(USE_LOCALE_NUMERIC))
3075
3076     return newHV();
3077
3078 #else
3079
3080     return my_localeconv(0, LOCALE_UTF8NESS_UNKNOWN);
3081
3082 #endif
3083
3084 }
3085
3086 #if  defined(HAS_SOME_LOCALECONV)                                   \
3087  && (defined(USE_LOCALE_MONETARY) || defined(USE_LOCALE_NUMERIC))
3088
3089 HV *
3090 S_my_localeconv(pTHX_ const int item, const locale_utf8ness_t locale_is_utf8)
3091 {
3092     HV * retval;
3093     locale_utf8ness_t numeric_locale_is_utf8  = LOCALE_UTF8NESS_UNKNOWN;
3094     locale_utf8ness_t monetary_locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
3095     HV * (*copy_localeconv)(pTHX_ const struct lconv *,
3096                                   const int,
3097                                   const locale_utf8ness_t,
3098                                   const locale_utf8ness_t);
3099
3100     /* A thread-safe locale_conv().  The locking mechanisms vary greatly
3101      * depending on platform capabilities.  They all share this common set up
3102      * code for the function, and then conditional compilations choose one of
3103      * several terminations.
3104      *
3105      * There are two use cases:
3106      * 1) Called from POSIX::locale_conv().  This returns lconv() copied to
3107      *    a hash, based on the current underlying locale.
3108      * 2) Certain items that nl_langinfo() provides are also derivable from
3109      *    the return of localeconv().  Windows notably doesn't have
3110      *    nl_langinfo(), so on that, and actually any platform lacking it,
3111      *    my_localeconv() is used to emulate it for those particular items.
3112      *    The code to do this is compiled only on such platforms.  Rather than
3113      *    going to the expense of creating a full hash when only one item is
3114      *    needed, just the desired item is returned, in an SV cast to an HV.
3115      *
3116      * There is a helper function to accomplish each of the two tasks.  The
3117      * function pointer just below is set to the appropriate one, and is called
3118      * from each of the various implementations, in the middle of whatever
3119      * necessary locking/locale swapping have been done. */
3120
3121 #  ifdef HAS_SOME_LANGINFO
3122
3123     PERL_UNUSED_ARG(item);
3124     PERL_UNUSED_ARG(locale_is_utf8);
3125
3126 #    ifdef USE_LOCALE_NUMERIC
3127
3128     /* When there is a nl_langinfo, we will only be called for localeconv
3129      * numeric purposes. */
3130     const bool is_localeconv_call = true;
3131
3132 #    endif
3133
3134 #  else
3135
3136     /* Note we use this sentinel; this works because this only gets compiled
3137      * when our perl_langinfo.h is used, and that uses negative numbers for all
3138      * the items */
3139     const bool is_localeconv_call = (item == 0);
3140     if (is_localeconv_call)
3141
3142 #  endif
3143
3144     {
3145         copy_localeconv = S_populate_localeconv;
3146
3147 #    ifdef USE_LOCALE_NUMERIC
3148
3149         /* Get the UTF8ness of the locales now to avoid repeating this for each
3150          * string returned by localeconv() */
3151         numeric_locale_is_utf8 = (is_locale_utf8(PL_numeric_name))
3152                                   ? LOCALE_IS_UTF8
3153                                   : LOCALE_NOT_UTF8;
3154
3155 #    endif
3156 #    ifdef USE_LOCALE_MONETARY
3157
3158         monetary_locale_is_utf8 = (is_locale_utf8(querylocale_c(LC_MONETARY)))
3159                                   ? LOCALE_IS_UTF8
3160                                   : LOCALE_NOT_UTF8;
3161
3162 #  endif
3163
3164     }
3165
3166 #  ifndef HAS_SOME_LANGINFO
3167
3168     else {
3169         copy_localeconv = S_get_nl_item_from_localeconv;
3170         numeric_locale_is_utf8 = locale_is_utf8;
3171     }
3172
3173 #  endif
3174
3175     PERL_ARGS_ASSERT_MY_LOCALECONV;
3176 /*--------------------------------------------------------------------------*/
3177 /* Here, we are done with the common beginning of all the implementations of
3178  * my_localeconv().  Below are the various terminations of the function (except
3179  * the closing '}'.  They are separated out because the preprocessor directives
3180  * were making the simple logic hard to follow.  Each implementation ends with
3181  * the same few lines.  khw decided to keep those separate because he thought
3182  * it was clearer to the reader.
3183  *
3184  * The first distinct termination (of the above common code) are the
3185  * implementations when we have locale_conv_l() and can use it.  These are the
3186  * simplest cases, without any locking needed. */
3187 #  if defined(USE_POSIX_2008_LOCALE) && defined(HAS_LOCALECONV_L)
3188
3189      /* And there are two sub-cases: First (by far the most common) is where we
3190       * are compiled to pay attention to LC_NUMERIC */
3191 #    ifdef USE_LOCALE_NUMERIC
3192
3193     const locale_t cur = use_curlocale_scratch();
3194     locale_t with_numeric = duplocale(cur);
3195
3196     /* Just create a new locale object with what we've got, but using the
3197      * underlying LC_NUMERIC locale */
3198     with_numeric = newlocale(LC_NUMERIC_MASK, PL_numeric_name, with_numeric);
3199
3200     retval = copy_localeconv(aTHX_ localeconv_l(with_numeric),
3201                                    item,
3202                                    numeric_locale_is_utf8,
3203                                    monetary_locale_is_utf8);
3204     freelocale(with_numeric);
3205
3206     return retval;
3207
3208 /*--------------------------------------------------------------------------*/
3209 #    else   /* Below not paying attention to LC_NUMERIC */
3210
3211     const locale_t cur = use_curlocale_scratch();
3212
3213     retval = copy_localeconv(aTHX_ localeconv_l(cur),
3214                                    item,
3215                                    numeric_locale_is_utf8,
3216                                    monetary_locale_is_utf8);
3217     return retval;
3218
3219 #    endif  /* Above, using lconv_l(); below plain lconv() */
3220 /*--------------------------------------------------------------------------*/
3221 #  elif ! defined(TS_W32_BROKEN_LOCALECONV)  /* Next is regular lconv() */
3222
3223     /* There are so many locks because localeconv() deals with two
3224      * categories, and returns in a single global static buffer.  Some
3225      * locks might be no-ops on this platform, but not others.  We need to
3226      * lock if any one isn't a no-op. */
3227
3228 #    ifdef USE_LOCALE_NUMERIC
3229
3230     LC_NUMERIC_LOCK(0);
3231     const char * orig_switched_locale = NULL;
3232
3233     /* When called internally, are already switched into the proper numeric
3234      * locale; otherwise must toggle to it */
3235     if (is_localeconv_call) {
3236         orig_switched_locale = toggle_locale_c(LC_NUMERIC, PL_numeric_name);
3237     }
3238
3239 #    endif
3240
3241     gwLOCALE_LOCK;
3242     retval = copy_localeconv(aTHX_ localeconv(),
3243                                    item,
3244                                    numeric_locale_is_utf8,
3245                                    monetary_locale_is_utf8);
3246     gwLOCALE_UNLOCK;
3247
3248 #    ifdef USE_LOCALE_NUMERIC
3249
3250     if (orig_switched_locale) {
3251         restore_toggled_locale_c(LC_NUMERIC, orig_switched_locale);
3252     }
3253     LC_NUMERIC_UNLOCK;
3254
3255 #    endif
3256
3257     return retval;
3258
3259 /*--------------------------------------------------------------------------*/
3260 #  else /* defined(TS_W32_BROKEN_LOCALECONV) */
3261
3262     /* Last is a workaround for the broken localeconv() on Windows with
3263      * thread-safe locales prior to VS 15.  It looks at the global locale
3264      * instead of the thread one.  As a work-around, we toggle to the global
3265      * locale; populate the return; then toggle back.  We have to use LC_ALL
3266      * instead of the individual categories because of another bug in Windows.
3267      *
3268      * This introduces a potential race with any other thread that has also
3269      * converted to use the global locale, and doesn't protect its locale calls
3270      * with mutexes.  khw can't think of any reason for a thread to do so on
3271      * Windows, as the locale API is the same regardless of thread-safety, except
3272      * if the code is ported from working on another platform where there might
3273      * be some reason to do this.  But this is typically due to some
3274      * alien-to-Perl library that thinks it owns locale setting.  Such a
3275      * library usn't likely to exist on Windows, so such an application is
3276      * unlikely to be run on Windows
3277      */
3278     bool restore_per_thread = FALSE;
3279
3280 #    ifdef USE_LOCALE_NUMERIC
3281
3282     const char * orig_switched_locale = NULL;
3283
3284     LC_NUMERIC_LOCK(0);
3285
3286     /* When called internally, are already switched into the proper numeric
3287      * locale; otherwise must toggle to it */
3288     if (is_localeconv_call) {
3289         orig_switched_locale = toggle_locale_c(LC_NUMERIC, PL_numeric_name);
3290     }
3291
3292 #    endif
3293
3294     /* Save the per-thread locale state */
3295     const char * save_thread = querylocale_c(LC_ALL);
3296
3297     /* Change to the global locale, and note if we already were there */
3298     if (_configthreadlocale(_DISABLE_PER_THREAD_LOCALE)
3299                          != _DISABLE_PER_THREAD_LOCALE)
3300     {
3301         restore_per_thread = TRUE;
3302     }
3303
3304     /* Save the state of the global locale; then convert to our desired
3305      * state.  */
3306     const char * save_global = querylocale_c(LC_ALL);
3307     void_setlocale_c(LC_ALL, save_thread);
3308
3309     /* Safely stash the desired data */
3310     gwLOCALE_LOCK;
3311     retval = copy_localeconv(aTHX_ localeconv(),
3312                                    item,
3313                                    numeric_locale_is_utf8,
3314                                    monetary_locale_is_utf8);
3315     gwLOCALE_UNLOCK;
3316
3317     /* Restore the global locale's prior state */
3318     void_setlocale_c(LC_ALL, save_global);
3319
3320     /* And back to per-thread locales */
3321     if (restore_per_thread) {
3322         _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
3323     }
3324
3325     /* Restore the per-thread locale state */
3326     void_setlocale_c(LC_ALL, save_thread);
3327
3328 #    ifdef USE_LOCALE_NUMERIC
3329
3330     if (orig_switched_locale) {
3331         restore_toggled_locale_c(LC_NUMERIC, orig_switched_locale);
3332     }
3333     LC_NUMERIC_UNLOCK;
3334
3335 #    endif
3336
3337     return retval;
3338
3339 #  endif
3340 /*--------------------------------------------------------------------------*/
3341 }
3342
3343 STATIC HV *
3344 S_populate_localeconv(pTHX_ const struct lconv *lcbuf,
3345                             const int unused,
3346                             const locale_utf8ness_t numeric_locale_is_utf8,
3347                             const locale_utf8ness_t monetary_locale_is_utf8)
3348 {
3349     /* This returns a mortalized hash containing all the elements returned by
3350      * localeconv().  It is used by Perl_localeconv() and POSIX::localeconv()
3351      */
3352     PERL_UNUSED_ARG(unused);
3353
3354     struct lconv_offset {
3355         const char *name;
3356         size_t offset;
3357     };
3358
3359     /* Create e.g.,
3360         {"thousands_sep", STRUCT_OFFSET(struct lconv, thousands_sep)},
3361      */
3362 #  define LCONV_ENTRY(name)                                         \
3363             {STRINGIFY(name), STRUCT_OFFSET(struct lconv, name)}
3364
3365     /* Set up structures containing the documented fields.  One structure for
3366      * LC_NUMERIC-controlled strings; one for LC_MONETARY ones, and a final one
3367      * of just numerics. */
3368 #  ifdef USE_LOCALE_NUMERIC
3369
3370     static const struct lconv_offset lconv_numeric_strings[] = {
3371         LCONV_ENTRY(decimal_point),
3372         LCONV_ENTRY(thousands_sep),
3373 #    ifndef NO_LOCALECONV_GROUPING
3374         LCONV_ENTRY(grouping),
3375 #    endif
3376         {NULL, 0}
3377     };
3378
3379 #  endif
3380 #  ifdef USE_LOCALE_MONETARY
3381
3382     static const struct lconv_offset lconv_monetary_strings[] = {
3383         LCONV_ENTRY(int_curr_symbol),
3384         LCONV_ENTRY(currency_symbol),
3385         LCONV_ENTRY(mon_decimal_point),
3386 #    ifndef NO_LOCALECONV_MON_THOUSANDS_SEP
3387         LCONV_ENTRY(mon_thousands_sep),
3388 #    endif
3389 #    ifndef NO_LOCALECONV_MON_GROUPING
3390         LCONV_ENTRY(mon_grouping),
3391 #    endif
3392         LCONV_ENTRY(positive_sign),
3393         LCONV_ENTRY(negative_sign),
3394         {NULL, 0}
3395     };
3396
3397 #  endif
3398
3399     static const struct lconv_offset lconv_integers[] = {
3400 #  ifdef USE_LOCALE_MONETARY
3401         LCONV_ENTRY(int_frac_digits),
3402         LCONV_ENTRY(frac_digits),
3403         LCONV_ENTRY(p_cs_precedes),
3404         LCONV_ENTRY(p_sep_by_space),
3405         LCONV_ENTRY(n_cs_precedes),
3406         LCONV_ENTRY(n_sep_by_space),
3407         LCONV_ENTRY(p_sign_posn),
3408         LCONV_ENTRY(n_sign_posn),
3409 #    ifdef HAS_LC_MONETARY_2008
3410         LCONV_ENTRY(int_p_cs_precedes),
3411         LCONV_ENTRY(int_p_sep_by_space),
3412         LCONV_ENTRY(int_n_cs_precedes),
3413         LCONV_ENTRY(int_n_sep_by_space),
3414         LCONV_ENTRY(int_p_sign_posn),
3415         LCONV_ENTRY(int_n_sign_posn),
3416 #    endif
3417 #  endif
3418         {NULL, 0}
3419     };
3420
3421     static const unsigned category_indices[] = {
3422 #  ifdef USE_LOCALE_NUMERIC
3423         LC_NUMERIC_INDEX_,
3424 #  endif
3425 #  ifdef USE_LOCALE_MONETARY
3426         LC_MONETARY_INDEX_,
3427 #  endif
3428         (unsigned) -1   /* Just so the previous element can always end with a
3429                            comma => subtract 1 below for the max loop index */
3430     };
3431
3432     const char *ptr = (const char *) lcbuf;
3433     const struct lconv_offset *integers = lconv_integers;
3434
3435     HV * retval = newHV();
3436     sv_2mortal((SV*)retval);
3437
3438     PERL_ARGS_ASSERT_POPULATE_LOCALECONV;
3439
3440     /* For each enabled category ... */
3441     for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(category_indices) - 1; i++) {
3442         const unsigned cat_index = category_indices[i];
3443         locale_utf8ness_t locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
3444         const char *locale;
3445
3446         /* ( = NULL silences a compiler warning; would segfault if it could
3447          * actually happen.) */
3448         const struct lconv_offset *strings = NULL;
3449
3450 #  ifdef USE_LOCALE_NUMERIC
3451         if (cat_index == LC_NUMERIC_INDEX_) {
3452             locale_is_utf8 = numeric_locale_is_utf8;
3453             strings = lconv_numeric_strings;
3454         }
3455 #  else
3456         PERL_UNUSED_ARG(numeric_locale_is_utf8);
3457 #  endif
3458 #  ifdef USE_LOCALE_MONETARY
3459         if (cat_index == LC_MONETARY_INDEX_) {
3460             locale_is_utf8 = monetary_locale_is_utf8;
3461             strings = lconv_monetary_strings;
3462         }
3463 #  else
3464         PERL_UNUSED_ARG(monetary_locale_is_utf8);
3465 #  endif
3466
3467         assert(locale_is_utf8 != LOCALE_UTF8NESS_UNKNOWN);
3468
3469         /* Iterate over the strings structure for this category */
3470         locale = querylocale_i(cat_index);
3471
3472         while (strings->name) {
3473             const char *value = *((const char **)(ptr + strings->offset));
3474             if (value && *value) {
3475                 bool is_utf8 =  /* Only make UTF-8 if required to */
3476                     (UTF8NESS_YES == (get_locale_string_utf8ness_i(locale,
3477                                                               cat_index,
3478                                                               value,
3479                                                               locale_is_utf8)));
3480                 (void) hv_store(retval,
3481                                 strings->name,
3482                                 strlen(strings->name),
3483                                 newSVpvn_utf8(value, strlen(value), is_utf8),
3484                                 0);
3485             }
3486
3487             strings++;
3488         }
3489     }
3490
3491     while (integers->name) {
3492         const char value = *((const char *)(ptr + integers->offset));
3493
3494         if (value != CHAR_MAX)
3495             (void) hv_store(retval, integers->name,
3496                             strlen(integers->name), newSViv(value), 0);
3497         integers++;
3498     }
3499
3500     return retval;
3501 }
3502
3503 #  ifndef HAS_SOME_LANGINFO
3504
3505 STATIC HV *
3506 S_get_nl_item_from_localeconv(pTHX_ const struct lconv *lcbuf,
3507                                     const int item,
3508                                     const locale_utf8ness_t unused1,
3509                                     const locale_utf8ness_t unused2)
3510 {
3511     /* This is a helper function for my_localeconv(), which is called from
3512      * my_langinfo() to emulate the libc nl_langinfo() function on platforms
3513      * that don't have it available.
3514      *
3515      * This function acts as an extension to my_langinfo(), the intermediate
3516      * my_localeconv() call is to set up the locks and switch into the proper
3517      * locale.  That logic exists for other reasons, and by doing it this way,
3518      * it doesn't have to be duplicated.
3519      *
3520      * This function extracts the current value of 'item' in the current locale
3521      * using the localconv() result also passed in, via 'lcbuf'.  The other
3522      * parameter is unused, a placeholder so the signature of this function
3523      * matches another that does need it, and so the two functions can be
3524      * referred to by a single function pointer, to simplify the code below */
3525
3526     const char * prefix = "";
3527     const char * temp = NULL;
3528
3529     PERL_ARGS_ASSERT_GET_NL_ITEM_FROM_LOCALECONV;
3530     PERL_UNUSED_ARG(unused1);
3531     PERL_UNUSED_ARG(unused2);
3532
3533     switch (item) {
3534       case CRNCYSTR:
3535         temp = lcbuf->currency_symbol;
3536
3537         if (lcbuf->p_cs_precedes) {
3538
3539             /* khw couldn't find any documentation that CHAR_MAX is the signal,
3540              * but cygwin uses it thusly */
3541             if (lcbuf->p_cs_precedes == CHAR_MAX) {
3542                 prefix = ".";
3543             }
3544             else {
3545                 prefix = "-";
3546             }
3547         }
3548         else {
3549             prefix = "+";
3550         }
3551
3552         break;
3553
3554       case RADIXCHAR:
3555         temp = lcbuf->decimal_point;
3556         break;
3557
3558       case THOUSEP:
3559         temp = lcbuf->thousands_sep;
3560         break;
3561
3562       default:
3563         locale_panic_(Perl_form(aTHX_
3564                     "Unexpected item passed to populate_localeconv: %d", item));
3565     }
3566
3567     return (HV *) Perl_newSVpvf(aTHX_ "%s%s", prefix, temp);
3568 }
3569
3570 #  endif    /* ! Has some form of langinfo() */
3571 #endif      /*   Has some form of localeconv() and paying attn to a category it
3572                  traffics in */
3573
3574 #ifndef HAS_SOME_LANGINFO
3575
3576 typedef int nl_item;    /* Substitute 'int' for emulated nl_langinfo() */
3577
3578 #endif
3579
3580 /*
3581
3582 =for apidoc      Perl_langinfo
3583 =for apidoc_item Perl_langinfo8
3584
3585 C<Perl_langinfo> is an (almost) drop-in replacement for the system
3586 C<L<nl_langinfo(3)>>, taking the same C<item> parameter values, and returning
3587 the same information.  But it is more thread-safe than regular
3588 C<nl_langinfo()>, and hides the quirks of Perl's locale handling from your
3589 code, and can be used on systems that lack a native C<nl_langinfo>.
3590
3591 However, you should instead use the improved version of this:
3592 L</Perl_langinfo8>, which behaves identically except for an additional
3593 parameter, a pointer to a variable declared as L</C<utf8ness_t>>, into which it
3594 returns to you how you should treat the returned string with regards to it
3595 being encoded in UTF-8 or not.
3596
3597 Concerning the differences between these and plain C<nl_langinfo()>:
3598
3599 =over
3600
3601 =item a.
3602
3603 C<Perl_langinfo8> has an extra parameter, described above.  Besides this, the
3604 other reasons they aren't quite a drop-in replacement is actually an advantage.
3605 The C<const>ness of the return allows the compiler to catch attempts to write
3606 into the returned buffer, which is illegal and could cause run-time crashes.
3607
3608 =item b.
3609
3610 They deliver the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
3611 without you having to write extra code.  The reason for the extra code would be
3612 because these are from the C<LC_NUMERIC> locale category, which is normally
3613 kept set by Perl so that the radix is a dot, and the separator is the empty
3614 string, no matter what the underlying locale is supposed to be, and so to get
3615 the expected results, you have to temporarily toggle into the underlying
3616 locale, and later toggle back.  (You could use plain C<nl_langinfo> and
3617 C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this but then you wouldn't get
3618 the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
3619 (or equivalent) locale would break a lot of CPAN, which is expecting the radix
3620 (decimal point) character to be a dot.)
3621
3622 =item c.
3623
3624 The system function they replace can have its static return buffer trashed,
3625 not only by a subsequent call to that function, but by a C<freelocale>,
3626 C<setlocale>, or other locale change.  The returned buffer of these functions
3627 is not changed until the next call to one or the other, so the buffer is never
3628 in a trashed state.
3629
3630 =item d.
3631
3632 The return buffer is per-thread, so it also is never overwritten by a call to
3633 these functions from another thread;  unlike the function it replaces.
3634
3635 =item e.
3636
3637 But most importantly, they work on systems that don't have C<nl_langinfo>, such
3638 as Windows, hence making your code more portable.  Of the fifty-some possible
3639 items specified by the POSIX 2008 standard,
3640 L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
3641 only one is completely unimplemented, though on non-Windows platforms, another
3642 significant one is not fully implemented).  They use various techniques to
3643 recover the other items, including calling C<L<localeconv(3)>>, and
3644 C<L<strftime(3)>>, both of which are specified in C89, so should be always be
3645 available.  Later C<strftime()> versions have additional capabilities; C<""> is
3646 returned for any item not available on your system.
3647
3648 It is important to note that, when called with an item that is recovered by
3649 using C<localeconv>, the buffer from any previous explicit call to
3650 C<L<localeconv(3)>> will be overwritten.  But you shouldn't be using
3651 C<localeconv> anyway because it is is very much not thread-safe, and suffers
3652 from the same problems outlined in item 'b.' above for the fields it returns that
3653 are controlled by the LC_NUMERIC locale category.  Instead, avoid all of those
3654 problems by calling L</Perl_localeconv>, which is thread-safe; or by using the
3655 methods given in L<perlcall>  to call
3656 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is also thread-safe.
3657
3658 =back
3659
3660 The details for those items which may deviate from what this emulation returns
3661 and what a native C<nl_langinfo()> would return are specified in
3662 L<I18N::Langinfo>.
3663
3664 When using C<Perl_langinfo8> (or plain C<Perl_langinfo>) on systems that don't
3665 have a native C<nl_langinfo()>, you must
3666
3667  #include "perl_langinfo.h"
3668
3669 before the C<perl.h> C<#include>.  You can replace your C<langinfo.h>
3670 C<#include> with this one.  (Doing it this way keeps out the symbols that plain
3671 C<langinfo.h> would try to import into the namespace for code that doesn't need
3672 it.)
3673
3674 =cut
3675
3676 */
3677
3678 const char *
3679 Perl_langinfo(const nl_item item)
3680 {
3681     return Perl_langinfo8(item, NULL);
3682 }
3683
3684 const char *
3685 Perl_langinfo8(const nl_item item, utf8ness_t * utf8ness)
3686 {
3687     dTHX;
3688     unsigned cat_index;
3689
3690     PERL_ARGS_ASSERT_PERL_LANGINFO8;
3691
3692     if (utf8ness) {     /* Assume for now */
3693         *utf8ness = UTF8NESS_IMMATERIAL;
3694     }
3695
3696     /* Find the locale category that controls the input 'item'.  If we are not
3697      * paying attention to that category, instead return a default value.  Also
3698      * return the default value if there is no way for us to figure out the
3699      * correct value.  If we have some form of nl_langinfo(), we can always
3700      * figure it out, but lacking that, there may be alternative methods that
3701      * can be used to recover most of the possible items.  Some of those
3702      * methods need libc functions, which may or may not be available.  If
3703      * unavailable, we can't compute the correct value, so must here return the
3704      * default. */
3705     switch (item) {
3706
3707       case CODESET:
3708
3709 #ifdef USE_LOCALE_CTYPE
3710
3711         cat_index = LC_CTYPE_INDEX_;
3712         break;
3713
3714 #else
3715         return C_codeset;
3716 #endif
3717 #if defined(USE_LOCALE_MESSAGES) && defined(HAS_SOME_LANGINFO)
3718
3719       case YESEXPR: case YESSTR: case NOEXPR: case NOSTR:
3720         cat_index = LC_MESSAGES_INDEX_;
3721         break;
3722 #else
3723       case YESEXPR:   return "^[+1yY]";
3724       case YESSTR:    return "yes";
3725       case NOEXPR:    return "^[-0nN]";
3726       case NOSTR:     return "no";
3727 #endif
3728
3729       case CRNCYSTR:
3730
3731 #if  defined(USE_LOCALE_MONETARY)                                   \
3732  && (defined(HAS_SOME_LANGINFO) || defined(HAS_SOME_LOCALECONV))
3733
3734         cat_index = LC_MONETARY_INDEX_;
3735         break;
3736 #else
3737         return "-";
3738 #endif
3739
3740       case RADIXCHAR:
3741
3742 #ifdef CAN_CALCULATE_RADIX
3743
3744         cat_index = LC_NUMERIC_INDEX_;
3745         break;
3746 #else
3747         return C_decimal_point;
3748 #endif
3749
3750       case THOUSEP:
3751
3752 #if  defined(USE_LOCALE_NUMERIC)                                    \
3753  && (defined(HAS_SOME_LANGINFO) || defined(HAS_SOME_LOCALECONV))
3754
3755         cat_index = LC_NUMERIC_INDEX_;
3756         break;
3757 #else
3758         return C_thousands_sep;
3759 #endif
3760
3761 /* The other possible items are all in LC_TIME. */
3762 #ifdef USE_LOCALE_TIME
3763
3764       default:
3765         cat_index = LC_TIME_INDEX_;
3766         break;
3767
3768 #endif
3769 #if ! defined(USE_LOCALE_TIME) || ! defined(HAS_SOME_LANGINFO)
3770
3771     /* If not using LC_TIME, hard code the rest.  Or, if there is no
3772      * nl_langinfo(), we use strftime() as an alternative, and it is missing
3773      * functionality to get every single one, so hard-code those */
3774
3775       case ERA: return "";  /* Unimplemented; for use with strftime() %E
3776                                modifier */
3777
3778       /* These formats are defined by C89, so we assume that strftime supports
3779        * them, and so are returned unconditionally; they may not be what the
3780        * locale actually says, but should give good enough results for someone
3781        * using them as formats (as opposed to trying to parse them to figure
3782        * out what the locale says).  The other format items are actually tested
3783        * to verify they work on the platform */
3784       case D_FMT:         return "%x";
3785       case T_FMT:         return "%X";
3786       case D_T_FMT:       return "%c";
3787
3788 #  if defined(WIN32) || ! defined(USE_LOCALE_TIME)
3789
3790       /* strftime() on Windows doesn't have the POSIX (beyond C89) extensions
3791        * that would allow it to recover these */
3792       case ERA_D_FMT:     return "%x";
3793       case ERA_T_FMT:     return "%X";
3794       case ERA_D_T_FMT:   return "%c";
3795       case ALT_DIGITS:    return "0";
3796
3797 #  endif
3798 #  ifndef USE_LOCALE_TIME
3799
3800       case T_FMT_AMPM:    return "%r";
3801       case ABDAY_1:       return "Sun";
3802       case ABDAY_2:       return "Mon";
3803       case ABDAY_3:       return "Tue";
3804       case ABDAY_4:       return "Wed";
3805       case ABDAY_5:       return "Thu";
3806       case ABDAY_6:       return "Fri";
3807       case ABDAY_7:       return "Sat";
3808       case AM_STR:        return "AM";
3809       case PM_STR:        return "PM";
3810       case ABMON_1:       return "Jan";
3811       case ABMON_2:       return "Feb";
3812       case ABMON_3:       return "Mar";
3813       case ABMON_4:       return "Apr";
3814       case ABMON_5:       return "May";
3815       case ABMON_6:       return "Jun";
3816       case ABMON_7:       return "Jul";
3817       case ABMON_8:       return "Aug";
3818       case ABMON_9:       return "Sep";
3819       case ABMON_10:      return "Oct";
3820       case ABMON_11:      return "Nov";
3821       case ABMON_12:      return "Dec";
3822       case DAY_1:         return "Sunday";
3823       case DAY_2:         return "Monday";
3824       case DAY_3:         return "Tuesday";
3825       case DAY_4:         return "Wednesday";
3826       case DAY_5:         return "Thursday";
3827       case DAY_6:         return "Friday";
3828       case DAY_7:         return "Saturday";
3829       case MON_1:         return "January";
3830       case MON_2:         return "February";
3831       case MON_3:         return "March";
3832       case MON_4:         return "April";
3833       case MON_5:         return "May";
3834       case MON_6:         return "June";
3835       case MON_7:         return "July";
3836       case MON_8:         return "August";
3837       case MON_9:         return "September";
3838       case MON_10:        return "October";
3839       case MON_11:        return "November";
3840       case MON_12:        return "December";
3841
3842 #  endif
3843 #endif
3844
3845     } /* End of switch on item */
3846
3847 #ifndef USE_LOCALE
3848
3849     Perl_croak_nocontext("panic: Unexpected nl_langinfo() item %d", item);
3850     NOT_REACHED; /* NOTREACHED */
3851     PERL_UNUSED_VAR(cat_index);
3852
3853 #else
3854 #  ifdef USE_LOCALE_NUMERIC
3855
3856     /* Use either the underlying numeric, or the other underlying categories */
3857     if (cat_index == LC_NUMERIC_INDEX_) {
3858         return my_langinfo_c(item, LC_NUMERIC, PL_numeric_name,
3859                              &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
3860     }
3861     else
3862
3863 #  endif
3864
3865     {
3866         return my_langinfo_i(item, cat_index, querylocale_i(cat_index),
3867                              &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
3868     }
3869
3870 #endif
3871
3872 }
3873
3874 #ifdef USE_LOCALE
3875
3876 /* There are several implementations of my_langinfo, depending on the
3877  * Configuration.  They all share the same beginning of the function */
3878 STATIC const char *
3879 S_my_langinfo_i(pTHX_
3880                 const nl_item item,           /* The item to look up */
3881                 const unsigned int cat_index, /* The locale category that
3882                                                  controls it */
3883                 /* The locale to look up 'item' in. */
3884                 const char * locale,
3885
3886                 /* Where to store the result, and where the size of that buffer
3887                  * is stored, updated on exit. retbuf_sizep may be NULL for an
3888                  * empty-on-entry, single use buffer whose size we don't need
3889                  * to keep track of */
3890                 const char ** retbufp,
3891                 Size_t * retbuf_sizep,
3892
3893                 /* If not NULL, the location to store the UTF8-ness of 'item's
3894                  * value, as documented */
3895                 utf8ness_t * utf8ness)
3896 {
3897     const char * retval = NULL;
3898
3899     PERL_ARGS_ASSERT_MY_LANGINFO_I;
3900     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
3901
3902     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3903                            "Entering my_langinfo item=%d, using locale %s\n",
3904                            item, locale));
3905 /*--------------------------------------------------------------------------*/
3906 /* Above is the common beginning to all the implementations of my_langinfo().
3907  * Below are the various completions.
3908  *
3909  * Some platforms don't deal well with non-ASCII strings in locale X when
3910  * LC_CTYPE is not in X.  (Actually it is probably when X is UTF-8 and LC_CTYPE
3911  * isn't, or vice versa).  There is explicit code to bring the categories into
3912  * sync.  This doesn't seem to be a problem with nl_langinfo(), so that
3913  * implementation doesn't currently worry about it.  But it is a problem on
3914  * Windows boxes, which don't have nl_langinfo(). */
3915
3916 #  if defined(HAS_THREAD_SAFE_NL_LANGINFO_L) && defined(USE_POSIX_2008_LOCALE)
3917
3918     /* Simplest is if we can use nl_langinfo_l()
3919      *
3920      * With it, we can change LC_CTYPE in the same call as the other category */
3921 #    ifdef USE_LOCALE_CTYPE
3922 #      define CTYPE_SAFETY_MASK LC_CTYPE_MASK
3923 #    else
3924 #      define CTYPE_SAFETY_MASK 0
3925 #    endif
3926
3927     locale_t cur = newlocale((category_masks[cat_index] | CTYPE_SAFETY_MASK),
3928                              locale, (locale_t) 0);
3929
3930     retval = save_to_buffer(nl_langinfo_l(item, cur), retbufp, retbuf_sizep);
3931     if (utf8ness) {
3932         *utf8ness = get_locale_string_utf8ness_i(locale, cat_index, retval,
3933                                                  LOCALE_UTF8NESS_UNKNOWN);
3934     }
3935
3936     freelocale(cur);
3937
3938     return retval;
3939 /*--------------------------------------------------------------------------*/
3940 #  elif defined(HAS_NL_LANGINFO) /* nl_langinfo() is available.  */
3941
3942     /* The second version of my_langinfo() is if we have plain nl_langinfo() */
3943
3944 #    ifdef USE_LOCALE_CTYPE
3945
3946     /* Ths function sorts out if things actually have to be switched or not,
3947      * for both calls. */
3948     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
3949
3950 #    endif
3951
3952     const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
3953
3954     gwLOCALE_LOCK;
3955     retval = save_to_buffer(nl_langinfo(item), retbufp, retbuf_sizep);
3956     gwLOCALE_UNLOCK;
3957
3958     if (utf8ness) {
3959         *utf8ness = get_locale_string_utf8ness_i(locale, cat_index,
3960                                                retval, LOCALE_UTF8NESS_UNKNOWN);
3961     }
3962
3963     restore_toggled_locale_i(cat_index, orig_switched_locale);
3964
3965 #    ifdef USE_LOCALE_CTYPE
3966     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
3967 #    endif
3968
3969     return retval;
3970 /*--------------------------------------------------------------------------*/
3971 #  else   /* Below, emulate nl_langinfo as best we can */
3972
3973     /* And the third and final completion is where we have to emulate
3974      * nl_langinfo().  There are various possibilities depending on the
3975      * Configuration */
3976
3977 #    ifdef USE_LOCALE_CTYPE
3978
3979     const char * orig_CTYPE_locale =  toggle_locale_c(LC_CTYPE, locale);
3980
3981 #    endif
3982
3983     const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
3984
3985     /* Here, we are in the locale we want information about */
3986
3987     /* Almost all the items will have ASCII return values.  Set that here, and
3988      * override if necessary */
3989     utf8ness_t is_utf8 = UTF8NESS_IMMATERIAL;
3990
3991     switch (item) {
3992       default:
3993         assert(item < 0);   /* Make sure using perl_langinfo.h */
3994         retval = "";
3995         break;
3996
3997       case RADIXCHAR:
3998
3999 #    if      defined(HAS_SNPRINTF)                                              \
4000        && (! defined(HAS_SOME_LOCALECONV) || defined(TS_W32_BROKEN_LOCALECONV))
4001
4002         {
4003             /* snprintf() can be used to find the radix character by outputting
4004              * a known simple floating point number to a buffer, and parsing
4005              * it, inferring the radix as the bytes separating the integer and
4006              * fractional parts.  But localeconv() is more direct, not
4007              * requiring inference, so use it instead of the code just below,
4008              * if (likely) it is available and works ok */
4009
4010             char * floatbuf = NULL;
4011             const Size_t initial_size = 10;
4012
4013             Newx(floatbuf, initial_size, char);
4014
4015             /* 1.5 is exactly representable on binary computers */
4016             Size_t needed_size = snprintf(floatbuf, initial_size, "%.1f", 1.5);
4017
4018             /* If our guess wasn't big enough, increase and try again, based on
4019              * the real number that strnprintf() is supposed to return */
4020             if (UNLIKELY(needed_size >= initial_size)) {
4021                 needed_size++;  /* insurance */
4022                 Renew(floatbuf, needed_size, char);
4023                 Size_t new_needed = snprintf(floatbuf, needed_size, "%.1f", 1.5);
4024                 assert(new_needed <= needed_size);
4025                 needed_size = new_needed;
4026             }
4027
4028             char * s = floatbuf;
4029             char * e = floatbuf + needed_size;
4030
4031             /* Find the '1' */
4032             while (s < e && *s != '1') {
4033                 s++;
4034             }
4035
4036             if (LIKELY(s < e)) {
4037                 s++;
4038             }
4039
4040             /* Find the '5' */
4041             char * item_start = s;
4042             while (s < e && *s != '5') {
4043                 s++;
4044             }
4045
4046             /* Everything in between is the radix string */
4047             if (LIKELY(s < e)) {
4048                 *s = '\0';
4049                 retval = save_to_buffer(item_start,
4050                                         (const char **) &PL_langinfo_buf,
4051                                         &PL_langinfo_bufsize);
4052                 Safefree(floatbuf);
4053
4054                 if (utf8ness) {
4055                     is_utf8 = get_locale_string_utf8ness_i(locale, cat_index,
4056                                                            retval,
4057                                                        LOCALE_UTF8NESS_UNKNOWN);
4058
4059                 }
4060
4061                 break;
4062             }
4063
4064             Safefree(floatbuf);
4065         }
4066
4067 #      ifdef HAS_SOME_LOCALECONV /* snprintf() failed; drop down to use
4068                                     localeconv() */
4069
4070         /* FALLTHROUGH */                                                           \
4071
4072 #      else                      /* snprintf() failed and no localeconv() */
4073
4074         retval = C_decimal_point;
4075         break;
4076
4077 #      endif
4078 #    endif
4079 #    ifdef HAS_SOME_LOCALECONV
4080
4081     /* These items are available from localeconv().  (To avoid using
4082      * TS_W32_BROKEN_LOCALECONV, one could use GetNumberFormat and
4083      * GetCurrencyFormat; patches welcome) */
4084
4085       case CRNCYSTR:
4086       case THOUSEP:
4087         {
4088             SV * string = (SV *) my_localeconv(item, LOCALE_UTF8NESS_UNKNOWN);
4089
4090             retval = save_to_buffer(SvPV_nolen(string), retbufp, retbuf_sizep);
4091
4092             if (utf8ness) {
4093                 is_utf8 = get_locale_string_utf8ness_i(locale, cat_index, retval,
4094                                                        LOCALE_UTF8NESS_UNKNOWN);
4095             }
4096
4097             SvREFCNT_dec_NN(string);
4098             break;
4099         }
4100
4101 #    endif  /* Some form of localeconv */
4102 #    ifdef HAS_STRFTIME
4103
4104       /* These formats are only available in later strfmtime's */
4105       case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM:
4106
4107       /* The rest can be gotten from most versions of strftime(). */
4108       case ABDAY_1: case ABDAY_2: case ABDAY_3:
4109       case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7:
4110       case ALT_DIGITS:
4111       case AM_STR: case PM_STR:
4112       case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4:
4113       case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8:
4114       case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12:
4115       case DAY_1: case DAY_2: case DAY_3: case DAY_4:
4116       case DAY_5: case DAY_6: case DAY_7:
4117       case MON_1: case MON_2: case MON_3: case MON_4:
4118       case MON_5: case MON_6: case MON_7: case MON_8:
4119       case MON_9: case MON_10: case MON_11: case MON_12:
4120         {
4121             const char * format;
4122             bool return_format = FALSE;
4123             int mon = 0;
4124             int mday = 1;
4125             int hour = 6;
4126
4127             GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
4128
4129             switch (item) {
4130               default:
4131                 locale_panic_(Perl_form(aTHX_ "switch case: %d problem", item));
4132                 NOT_REACHED; /* NOTREACHED */
4133
4134               case PM_STR: hour = 18;
4135               case AM_STR:
4136                 format = "%p";
4137                 break;
4138               case ABDAY_7: mday++;
4139               case ABDAY_6: mday++;
4140               case ABDAY_5: mday++;
4141               case ABDAY_4: mday++;
4142               case ABDAY_3: mday++;
4143               case ABDAY_2: mday++;
4144               case ABDAY_1:
4145                 format = "%a";
4146                 break;
4147               case DAY_7: mday++;
4148               case DAY_6: mday++;
4149               case DAY_5: mday++;
4150               case DAY_4: mday++;
4151               case DAY_3: mday++;
4152               case DAY_2: mday++;
4153               case DAY_1:
4154                 format = "%A";
4155                 break;
4156               case ABMON_12: mon++;
4157               case ABMON_11: mon++;
4158               case ABMON_10: mon++;
4159               case ABMON_9:  mon++;
4160               case ABMON_8:  mon++;
4161               case ABMON_7:  mon++;
4162               case ABMON_6:  mon++;
4163               case ABMON_5:  mon++;
4164               case ABMON_4:  mon++;
4165               case ABMON_3:  mon++;
4166               case ABMON_2:  mon++;
4167               case ABMON_1:
4168                 format = "%b";
4169                 break;
4170               case MON_12: mon++;
4171               case MON_11: mon++;
4172               case MON_10: mon++;
4173               case MON_9:  mon++;
4174               case MON_8:  mon++;
4175               case MON_7:  mon++;
4176               case MON_6:  mon++;
4177               case MON_5:  mon++;
4178               case MON_4:  mon++;
4179               case MON_3:  mon++;
4180               case MON_2:  mon++;
4181               case MON_1:
4182                 format = "%B";
4183                 break;
4184               case T_FMT_AMPM:
4185                 format = "%r";
4186                 return_format = TRUE;
4187                 break;
4188               case ERA_D_FMT:
4189                 format = "%Ex";
4190                 return_format = TRUE;
4191                 break;
4192               case ERA_T_FMT:
4193                 format = "%EX";
4194                 return_format = TRUE;
4195                 break;
4196               case ERA_D_T_FMT:
4197                 format = "%Ec";
4198                 return_format = TRUE;
4199                 break;
4200               case ALT_DIGITS:
4201                 format = "%Ow"; /* Find the alternate digit for 0 */
4202                 break;
4203             }
4204
4205             GCC_DIAG_RESTORE_STMT;
4206
4207             /* The year was deliberately chosen so that January 1 is on the
4208              * first day of the week.  Since we're only getting one thing at a
4209              * time, it all works */
4210             const char * temp = my_strftime8(format, 30, 30, hour, mday, mon,
4211                                              2011, 0, 0, 0, &is_utf8);
4212             retval = save_to_buffer(temp, retbufp, retbuf_sizep);
4213             Safefree(temp);
4214
4215             /* If the item is 'ALT_DIGITS', '*retbuf' contains the alternate
4216              * format for wday 0.  If the value is the same as the normal 0,
4217              * there isn't an alternate, so clear the buffer.
4218              *
4219              * (wday was chosen because its range is all a single digit.
4220              * Things like tm_sec have two digits as the minimum: '00'.) */
4221             if (item == ALT_DIGITS && strEQ(*retbufp, "0")) {
4222                 retval = "";
4223                 break;
4224             }
4225
4226             /* ALT_DIGITS is problematic.  Experiments on it showed that
4227              * strftime() did not always work properly when going from alt-9 to
4228              * alt-10.  Only a few locales have this item defined, and in all
4229              * of them on Linux that khw was able to find, nl_langinfo() merely
4230              * returned the alt-0 character, possibly doubled.  Most Unicode
4231              * digits are in blocks of 10 consecutive code points, so that is
4232              * sufficient information for such scripts, as we can infer alt-1,
4233              * alt-2, ....  But for a Japanese locale, a CJK ideographic 0 is
4234              * returned, and the CJK digits are not in code point order, so you
4235              * can't really infer anything.  The localedef for this locale did
4236              * specify the succeeding digits, so that strftime() works properly
4237              * on them, without needing to infer anything.  But the
4238              * nl_langinfo() return did not give sufficient information for the
4239              * caller to understand what's going on.  So until there is
4240              * evidence that it should work differently, this returns the alt-0
4241              * string for ALT_DIGITS. */
4242
4243             if (return_format) {
4244
4245                 /* If to return the format, not the value, overwrite the buffer
4246                  * with it.  But some strftime()s will keep the original format
4247                  * if illegal, so change those to "" */
4248                 if (strEQ(*retbufp, format)) {
4249                     retval = "";
4250                 }
4251                 else {
4252                     retval = format;
4253                 }
4254
4255                 /* A format is always in ASCII */
4256                 is_utf8 = UTF8NESS_IMMATERIAL;
4257             }
4258
4259             break;
4260         }
4261
4262 #    endif
4263
4264       case CODESET:
4265
4266         /* The trivial case */
4267         if (isNAME_C_OR_POSIX(locale)) {
4268             retval = C_codeset;
4269             break;
4270         }
4271
4272 #    ifdef WIN32
4273
4274         /* This function retrieves the code page.  It is subject to change, but
4275          * is documented and has been stable for many releases */
4276         UINT ___lc_codepage_func(void);
4277
4278         retval = save_to_buffer(Perl_form(aTHX_ "%d", ___lc_codepage_func()),
4279                                 retbufp, retbuf_sizep);
4280         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "locale='%s' cp=%s\n",
4281                                                locale, retval));
4282         break;
4283
4284 #    else
4285
4286         /* The codeset is important, but khw did not figure out a way for it to
4287          * be retrieved on non-Windows boxes without nl_langinfo().  But even
4288          * if we can't get it directly, we can usually determine if it is a
4289          * UTF-8 locale or not.  If it is UTF-8, we (correctly) use that for
4290          * the code set. */
4291
4292 #      if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4293
4294         /* If libc mbtowc() evaluates the bytes that form the REPLACEMENT
4295          * CHARACTER as that Unicode code point, this has to be a UTF-8 locale.
4296          * */
4297
4298         wchar_t wc = 0;
4299         (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0);/* Reset shift state */
4300         int mbtowc_ret = Perl_mbtowc_(aTHX_ &wc,
4301                               STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
4302         if (mbtowc_ret >= 0 && wc == UNICODE_REPLACEMENT) {
4303             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4304                                    "mbtowc returned REPLACEMENT\n"));
4305             retval = "UTF-8";
4306             break;
4307         }
4308
4309         /* Here, it isn't a UTF-8 locale. */
4310
4311 #    else   /* mbtowc() is not available. */
4312
4313         /* Sling together several possibilities, depending on platform
4314          * capabilities and what we found.
4315          *
4316          * For non-English locales or non-dollar currency locales, we likely
4317          * will find out whether a locale is UTF-8 or not */
4318
4319         utf8ness_t is_utf8 = UTF8NESS_UNKNOWN;
4320         const char * scratch_buf = NULL;
4321
4322 #      if defined(USE_LOCALE_MONETARY) && defined(HAS_SOME_LOCALECONV)
4323
4324         /* Can't use this method unless localeconv() is available, as that's
4325          * the way we find out the currency symbol. */
4326
4327         /* First try looking at the currency symbol (via a recursive call) to
4328          * see if it disambiguates things.  Often that will be in the native
4329          * script, and if the symbol isn't legal UTF-8, we know that the locale
4330          * isn't either. */
4331         (void) my_langinfo_c(CRNCYSTR, LC_MONETARY, locale, &scratch_buf, NULL,
4332                              &is_utf8);
4333         Safefree(scratch_buf);
4334
4335 #      endif
4336 #      ifdef USE_LOCALE_TIME
4337
4338         /* If we have ruled out being UTF-8, no point in checking further. */
4339         if (is_utf8 != UTF8NESS_NO) {
4340
4341             /* But otherwise do check more.  This is done even if the currency
4342              * symbol looks to be UTF-8, just in case that's a false positive.
4343              *
4344              * Look at the LC_TIME entries, like the names of the months or
4345              * weekdays.  We quit at the first one that is illegal UTF-8 */
4346
4347             utf8ness_t this_is_utf8 = UTF8NESS_UNKNOWN;
4348             const int times[] = {
4349                 DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7,
4350                 MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8,
4351                                             MON_9, MON_10, MON_11, MON_12,
4352                 ALT_DIGITS, AM_STR, PM_STR,
4353                 ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6,
4354                                                              ABDAY_7,
4355                 ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6,
4356                 ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12
4357             };
4358
4359             /* The code in the recursive call can handle switching the locales,
4360              * but by doing it here, we avoid switching each iteration of the
4361              * loop */
4362             const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
4363
4364             for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(times); i++) {
4365                 scratch_buf = NULL;
4366                 (void) my_langinfo_c(times[i], LC_TIME, locale, &scratch_buf,
4367                                      NULL, &this_is_utf8);
4368                 Safefree(scratch_buf);
4369                 if (this_is_utf8 == UTF8NESS_NO) {
4370                     is_utf8 = UTF8NESS_NO;
4371                     break;
4372                 }
4373
4374                 if (this_is_utf8 == UTF8NESS_YES) {
4375                     is_utf8 = UTF8NESS_YES;
4376                 }
4377             }
4378
4379             /* Here we have gone through all the LC_TIME elements.  is_utf8 has
4380              * been set as follows:
4381              *      UTF8NESS_NO           If any aren't legal UTF-8
4382              *      UTF8NESS_IMMMATERIAL  If all are ASCII
4383              *      UTF8NESS_YES          If all are legal UTF-8 (including
4384              *                            ASCIIi), and at least one isn't
4385              *                            ASCII. */
4386
4387             restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
4388         }
4389
4390 #      endif    /* LC_TIME */
4391
4392         /* If nothing examined above rules out it being UTF-8, and at least one
4393          * thing fits as UTF-8 (and not plain ASCII), assume the codeset is
4394          * UTF-8. */
4395         if (is_utf8 == UTF8NESS_YES) {
4396             retval = "UTF-8";
4397             break;
4398         }
4399
4400         /* Here, nothing examined indicates that the codeset is UTF-8.  But
4401          * what is it?  The other locale categories are not likely to be of
4402          * further help:
4403          *
4404          * LC_NUMERIC   Only a few locales in the world have a non-ASCII radix
4405          *              or group separator.
4406          * LC_CTYPE     This code wouldn't be compiled if mbtowc() existed and
4407          *              was reliable.  This is unlikely in C99.  There are
4408          *              other functions that could be used instead, but are
4409          *              they going to exist, and be able to distinguish between
4410          *              UTF-8 and 8859-1?  Deal with this only if it becomes
4411          *              necessary.
4412          * LC_MESSAGES  The strings returned from strerror() would seem likely
4413          *              candidates, but experience has shown that many systems
4414          *              don't actually have translations installed for them.
4415          *              They are instead always in English, so everything in
4416          *              them is ASCII, which is of no help to us.  A Configure
4417          *              probe could possibly be written to see if this platform
4418          *              has non-ASCII error messages.  But again, wait until it
4419          *              turns out to be an actual problem. */
4420
4421 #    endif    /* ! mbtowc() */
4422
4423         /* Rejoin the mbtowc available/not-available cases.
4424          *
4425          * We got here only because we haven't been able to find the codeset.
4426          * The only other option khw could think of is to see if the codeset is
4427          * part of the locale name.  This is very less than ideal; often there
4428          * is no code set in the name; and at other times they even lie.
4429          *
4430          * Find any dot in the locale name */
4431         retval = (const char *) strchr(locale, '.');
4432         if (! retval) {
4433             retval = "";  /* Alas, no dot */
4434             break;
4435         }
4436
4437         /* Use everything past the dot */
4438         retval++;
4439
4440 #      if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4441
4442         /* When these functions, are available, they were tried earlier and
4443          * indicated that the locale did not act like a proper UTF-8 one.  So
4444          * if it claims to be UTF-8, it is a lie */
4445         if (is_codeset_name_UTF8(retval)) {
4446             retval = "";
4447             break;
4448         }
4449
4450 #      endif
4451
4452         /* Otherwise the code set name is considered to be everything past the
4453          * dot. */
4454         retval = save_to_buffer(retval, retbufp, retbuf_sizep);
4455
4456         break;
4457
4458 #    endif
4459
4460     } /* Giant switch() of nl_langinfo() items */
4461
4462     restore_toggled_locale_i(cat_index, orig_switched_locale);
4463
4464 #    ifdef USE_LOCALE_CTYPE
4465     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
4466 #    endif
4467
4468     if (utf8ness) {
4469         *utf8ness = is_utf8;
4470     }
4471
4472     return retval;
4473
4474 #  endif    /* All the implementations of my_langinfo() */
4475
4476 /*--------------------------------------------------------------------------*/
4477
4478 }   /* my_langinfo() */
4479
4480 #endif      /* USE_LOCALE */
4481
4482 char *
4483 Perl_my_strftime8(pTHX_ const char *fmt, int sec, int min, int hour, int mday,
4484                          int mon, int year, int wday, int yday, int isdst,
4485                          utf8ness_t * utf8ness)
4486 {   /* Documented in util.c */
4487     char * retval = my_strftime(fmt, sec, min, hour, mday, mon, year, wday,
4488                                 yday, isdst);
4489
4490     PERL_ARGS_ASSERT_MY_STRFTIME8;
4491
4492     if (utf8ness) {
4493
4494 #ifdef USE_LOCALE_TIME
4495         *utf8ness = get_locale_string_utf8ness_i(NULL, LC_TIME_INDEX_,
4496                                                retval, LOCALE_UTF8NESS_UNKNOWN);
4497 #else
4498         *utf8ness = UTF8NESS_IMMATERIAL;
4499 #endif
4500
4501     }
4502
4503     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "fmt=%s, retval=%s", fmt,
4504                  ((is_utf8_string((U8 *) retval, 0))
4505                   ? retval
4506                   :_byte_dump_string((U8 *) retval, strlen(retval), 0)));
4507              if (utf8ness) PerlIO_printf(Perl_debug_log, "; utf8ness=%d",
4508                                                          (int) *utf8ness);
4509              PerlIO_printf(Perl_debug_log, "\n");
4510             );
4511
4512     return retval;
4513 }
4514
4515 /*
4516  * Initialize locale awareness.
4517  */
4518 int
4519 Perl_init_i18nl10n(pTHX_ int printwarn)
4520 {
4521     /* printwarn is
4522      *
4523      *    0 if not to output warning when setup locale is bad
4524      *    1 if to output warning based on value of PERL_BADLANG
4525      *    >1 if to output regardless of PERL_BADLANG
4526      *
4527      * returns
4528      *    1 = set ok or not applicable,
4529      *    0 = fallback to a locale of lower priority
4530      *   -1 = fallback to all locales failed, not even to the C locale
4531      *
4532      * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
4533      * set, debugging information is output.
4534      *
4535      * This looks more complicated than it is, mainly due to the #ifdefs and
4536      * error handling.
4537      *
4538      * Besides some asserts, data structure initialization, and specific
4539      * platform complications, this routine is effectively represented by this
4540      * pseudo-code:
4541      *
4542      *      setlocale(LC_ALL, "");                                            x
4543      *      foreach (subcategory) {                                           x
4544      *          curlocales[f(subcategory)] = setlocale(subcategory, NULL);    x
4545      *      }                                                                 x
4546      *      if (platform_so_requires) {
4547      *          foreach (subcategory) {
4548      *            PL_curlocales[f(subcategory)] = curlocales[f(subcategory)]
4549      *          }
4550      *      }
4551      *      foreach (subcategory) {
4552      *          if (needs_special_handling[f(subcategory)] &this_subcat_handler
4553      *      }
4554      *
4555      * This sets all the categories to the values in the current environment,
4556      * saves them temporarily in curlocales[] until they can be handled and/or
4557      * on some platforms saved in a per-thread array PL_curlocales[].
4558      *
4559      * f(foo) is a mapping from the opaque system category numbers to small
4560      * non-negative integers used most everywhere in this file as indices into
4561      * arrays (such as curlocales[]) so the program doesn't have to otherwise
4562      * deal with the opaqueness.
4563      *
4564      * If the platform doesn't have LC_ALL, the lines marked 'x' above are
4565      * effectively replaced by:
4566      *      foreach (subcategory) {                                           y
4567      *          curlocales[f(subcategory)] = setlocale(subcategory, "");      y
4568      *      }                                                                 y
4569      *
4570      * The only differences being the lack of an LC_ALL call, and using ""
4571      * instead of NULL in the setlocale calls.
4572      *
4573      * But there are, of course, complications.
4574      *
4575      * it has to deal with if this is an embedded perl, whose locale doesn't
4576      * come from the environment, but has been set up by the caller.  This is
4577      * pretty simply handled: the "" in the setlocale calls is not a string
4578      * constant, but a variable which is set to NULL in the embedded case.
4579      *
4580      * But the major complication is handling failure and doing fallback.  All
4581      * the code marked 'x' or 'y' above is actually enclosed in an outer loop,
4582      * using the array trial_locales[].  On entry, trial_locales[] is
4583      * initialized to just one entry, containing the NULL or "" locale argument
4584      * shown above.  If, as is almost always the case, everything works, it
4585      * exits after just the one iteration, going on to the next step.
4586      *
4587      * But if there is a failure, the code tries its best to honor the
4588      * environment as much as possible.  It self-modifies trial_locales[] to
4589      * have more elements, one for each of the POSIX-specified settings from
4590      * the environment, such as LANG, ending in the ultimate fallback, the C
4591      * locale.  Thus if there is something bogus with a higher priority
4592      * environment variable, it will try with the next highest, until something
4593      * works.  If everything fails, it limps along with whatever state it got
4594      * to.
4595      *
4596      * A further complication is that Windows has an additional fallback, the
4597      * user-default ANSI code page obtained from the operating system.  This is
4598      * added as yet another loop iteration, just before the final "C"
4599      *
4600      * A slight complication is that in embedded Perls, the locale may already
4601      * be set-up, and we don't want to get it from the normal environment
4602      * variables.  This is handled by having a special environment variable
4603      * indicate we're in this situation.  We simply set setlocale's 2nd
4604      * parameter to be a NULL instead of "".  That indicates to setlocale that
4605      * it is not to change anything, but to return the current value,
4606      * effectively initializing perl's db to what the locale already is.
4607      *
4608      * We play the same trick with NULL if a LC_ALL succeeds.  We call
4609      * setlocale() on the individual categores with NULL to get their existing
4610      * values for our db, instead of trying to change them.
4611      * */
4612
4613     int ok = 1;
4614
4615 #ifndef USE_LOCALE
4616
4617     PERL_UNUSED_ARG(printwarn);
4618
4619 #else  /* USE_LOCALE */
4620 #  ifdef __GLIBC__
4621
4622     const char * const language = PerlEnv_getenv("LANGUAGE");
4623
4624 #  endif
4625
4626     /* NULL uses the existing already set up locale */
4627     const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
4628                                         ? NULL
4629                                         : "";
4630     typedef struct trial_locales_struct_s {
4631         const char* trial_locale;
4632         const char* fallback_desc;
4633         const char* fallback_name;
4634     } trial_locales_struct;
4635     /* 5 = 1 each for "", LC_ALL, LANG, (Win32) system default locale, C */
4636     trial_locales_struct trial_locales[5];
4637     unsigned int trial_locales_count;
4638     const char * const lc_all     = PerlEnv_getenv("LC_ALL");
4639     const char * const lang       = PerlEnv_getenv("LANG");
4640     bool setlocale_failure = FALSE;
4641     unsigned int i;
4642
4643     /* A later getenv() could zap this, so only use here */
4644     const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
4645
4646     const bool locwarn = (printwarn > 1
4647                           || (          printwarn
4648                               && (    ! bad_lang_use_once
4649                                   || (
4650                                          /* disallow with "" or "0" */
4651                                          *bad_lang_use_once
4652                                        && strNE("0", bad_lang_use_once)))));
4653
4654     /* current locale for given category; should have been copied so aren't
4655      * volatile */
4656     const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
4657
4658 #  ifndef DEBUGGING
4659 #    define DEBUG_LOCALE_INIT(a,b,c)
4660 #  else
4661
4662     DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
4663
4664 #    define DEBUG_LOCALE_INIT(cat_index, locale, result)                    \
4665         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",                       \
4666                     setlocale_debug_string_i(cat_index, locale, result)));
4667
4668 /* Make sure the parallel arrays are properly set up */
4669 #    ifdef USE_LOCALE_NUMERIC
4670     assert(categories[LC_NUMERIC_INDEX_] == LC_NUMERIC);
4671     assert(strEQ(category_names[LC_NUMERIC_INDEX_], "LC_NUMERIC"));
4672 #      ifdef USE_POSIX_2008_LOCALE
4673     assert(category_masks[LC_NUMERIC_INDEX_] == LC_NUMERIC_MASK);
4674 #      endif
4675 #    endif
4676 #    ifdef USE_LOCALE_CTYPE
4677     assert(categories[LC_CTYPE_INDEX_] == LC_CTYPE);
4678     assert(strEQ(category_names[LC_CTYPE_INDEX_], "LC_CTYPE"));
4679 #      ifdef USE_POSIX_2008_LOCALE
4680     assert(category_masks[LC_CTYPE_INDEX_] == LC_CTYPE_MASK);
4681 #      endif
4682 #    endif
4683 #    ifdef USE_LOCALE_COLLATE
4684     assert(categories[LC_COLLATE_INDEX_] == LC_COLLATE);
4685     assert(strEQ(category_names[LC_COLLATE_INDEX_], "LC_COLLATE"));
4686 #      ifdef USE_POSIX_2008_LOCALE
4687     assert(category_masks[LC_COLLATE_INDEX_] == LC_COLLATE_MASK);
4688 #      endif
4689 #    endif
4690 #    ifdef USE_LOCALE_TIME
4691     assert(categories[LC_TIME_INDEX_] == LC_TIME);
4692     assert(strEQ(category_names[LC_TIME_INDEX_], "LC_TIME"));
4693 #      ifdef USE_POSIX_2008_LOCALE
4694     assert(category_masks[LC_TIME_INDEX_] == LC_TIME_MASK);
4695 #      endif
4696 #    endif
4697 #    ifdef USE_LOCALE_MESSAGES
4698     assert(categories[LC_MESSAGES_INDEX_] == LC_MESSAGES);
4699     assert(strEQ(category_names[LC_MESSAGES_INDEX_], "LC_MESSAGES"));
4700 #      ifdef USE_POSIX_2008_LOCALE
4701     assert(category_masks[LC_MESSAGES_INDEX_] == LC_MESSAGES_MASK);
4702 #      endif
4703 #    endif
4704 #    ifdef USE_LOCALE_MONETARY
4705     assert(categories[LC_MONETARY_INDEX_] == LC_MONETARY);
4706     assert(strEQ(category_names[LC_MONETARY_INDEX_], "LC_MONETARY"));
4707 #      ifdef USE_POSIX_2008_LOCALE
4708     assert(category_masks[LC_MONETARY_INDEX_] == LC_MONETARY_MASK);
4709 #      endif
4710 #    endif
4711 #    ifdef USE_LOCALE_ADDRESS
4712     assert(categories[LC_ADDRESS_INDEX_] == LC_ADDRESS);
4713     assert(strEQ(category_names[LC_ADDRESS_INDEX_], "LC_ADDRESS"));
4714 #      ifdef USE_POSIX_2008_LOCALE
4715     assert(category_masks[LC_ADDRESS_INDEX_] == LC_ADDRESS_MASK);
4716 #      endif
4717 #    endif
4718 #    ifdef USE_LOCALE_IDENTIFICATION
4719     assert(categories[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION);
4720     assert(strEQ(category_names[LC_IDENTIFICATION_INDEX_], "LC_IDENTIFICATION"));
4721 #      ifdef USE_POSIX_2008_LOCALE
4722     assert(category_masks[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION_MASK);
4723 #      endif
4724 #    endif
4725 #    ifdef USE_LOCALE_MEASUREMENT
4726     assert(categories[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT);
4727     assert(strEQ(category_names[LC_MEASUREMENT_INDEX_], "LC_MEASUREMENT"));
4728 #      ifdef USE_POSIX_2008_LOCALE
4729     assert(category_masks[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT_MASK);
4730 #      endif
4731 #    endif
4732 #    ifdef USE_LOCALE_PAPER
4733     assert(categories[LC_PAPER_INDEX_] == LC_PAPER);
4734     assert(strEQ(category_names[LC_PAPER_INDEX_], "LC_PAPER"));
4735 #      ifdef USE_POSIX_2008_LOCALE
4736     assert(category_masks[LC_PAPER_INDEX_] == LC_PAPER_MASK);
4737 #      endif
4738 #    endif
4739 #    ifdef USE_LOCALE_TELEPHONE
4740     assert(categories[LC_TELEPHONE_INDEX_] == LC_TELEPHONE);
4741     assert(strEQ(category_names[LC_TELEPHONE_INDEX_], "LC_TELEPHONE"));
4742 #      ifdef USE_POSIX_2008_LOCALE
4743     assert(category_masks[LC_TELEPHONE_INDEX_] == LC_TELEPHONE_MASK);
4744 #      endif
4745 #    endif
4746 #    ifdef USE_LOCALE_SYNTAX
4747     assert(categories[LC_SYNTAX_INDEX_] == LC_SYNTAX);
4748     assert(strEQ(category_names[LC_SYNTAX_INDEX_], "LC_SYNTAX"));
4749 #      ifdef USE_POSIX_2008_LOCALE
4750     assert(category_masks[LC_SYNTAX_INDEX_] == LC_SYNTAX_MASK);
4751 #      endif
4752 #    endif
4753 #    ifdef USE_LOCALE_TOD
4754     assert(categories[LC_TOD_INDEX_] == LC_TOD);
4755     assert(strEQ(category_names[LC_TOD_INDEX_], "LC_TOD"));
4756 #      ifdef USE_POSIX_2008_LOCALE
4757     assert(category_masks[LC_TOD_INDEX_] == LC_TOD_MASK);
4758 #      endif
4759 #    endif
4760 #    ifdef LC_ALL
4761     assert(categories[LC_ALL_INDEX_] == LC_ALL);
4762     assert(strEQ(category_names[LC_ALL_INDEX_], "LC_ALL"));
4763     STATIC_ASSERT_STMT(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX_);
4764 #      ifdef USE_POSIX_2008_LOCALE
4765     assert(category_masks[LC_ALL_INDEX_] == LC_ALL_MASK);
4766 #      endif
4767 #    endif
4768 #  endif    /* DEBUGGING */
4769
4770     /* Initialize the per-thread mbrFOO() state variables.  See POSIX.xs for
4771      * why these particular incantations are used. */
4772 #  ifdef HAS_MBRLEN
4773     memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
4774 #  endif
4775 #  ifdef HAS_MBRTOWC
4776     memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
4777 #  endif
4778 #  ifdef HAS_WCTOMBR
4779     wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
4780 #  endif
4781 #  ifdef USE_THREAD_SAFE_LOCALE
4782 #    ifdef WIN32
4783
4784     _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
4785
4786 #    endif
4787 #  endif
4788 #  ifdef USE_POSIX_2008_LOCALE
4789
4790     PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0);
4791     if (! PL_C_locale_obj) {
4792         locale_panic_(Perl_form(aTHX_
4793                                 "Cannot create POSIX 2008 C locale object"));
4794     }
4795
4796     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "created C object %p\n",
4797                            PL_C_locale_obj));
4798
4799 #    ifdef USE_LOCALE_NUMERIC
4800
4801     PL_underlying_numeric_obj = duplocale(PL_C_locale_obj);
4802
4803 #    endif
4804 #  endif
4805 #  ifdef USE_LOCALE_NUMERIC
4806
4807     PL_numeric_radix_sv    = newSV(1);
4808     PL_underlying_radix_sv = newSV(1);
4809     Newxz(PL_numeric_name, 1, char);    /* Single NUL character */
4810     new_numeric("C");
4811
4812 #  endif
4813 #  ifdef USE_LOCALE_COLLATE
4814
4815     Newxz(PL_collation_name, 1, char);
4816     new_collate("C");
4817
4818 #  endif
4819 #  ifdef USE_LOCALE_CTYPE
4820
4821     Newxz(PL_ctype_name, 1, char);
4822     new_ctype("C");
4823
4824 #  endif
4825 #  ifdef USE_PL_CURLOCALES
4826
4827     /* Initialize our records. */
4828     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
4829         (void) emulate_setlocale_i(i, posix_setlocale(categories[i], NULL),
4830                                    RECALCULATE_LC_ALL_ON_FINAL_INTERATION,
4831                                    __LINE__);
4832     }
4833
4834 #  endif
4835
4836     /* We try each locale in the list until we get one that works, or exhaust
4837      * the list.  Normally the loop is executed just once.  But if setting the
4838      * locale fails, inside the loop we add fallback trials to the array and so
4839      * will execute the loop multiple times */
4840     trial_locales[0] = (trial_locales_struct) {
4841         .trial_locale = setlocale_init,
4842         .fallback_desc = NULL,
4843         .fallback_name = NULL,
4844     };
4845     trial_locales_count = 1;
4846
4847     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
4848         curlocales[i] = NULL;
4849     }
4850
4851     for (i= 0; i < trial_locales_count; i++) {
4852         const char * trial_locale = trial_locales[i].trial_locale;
4853         setlocale_failure = FALSE;
4854
4855 #  ifdef LC_ALL
4856
4857         /* setlocale() return vals; not copied so must be looked at
4858          * immediately. */
4859         const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
4860         sl_result[LC_ALL_INDEX_] = stdized_setlocale(LC_ALL, trial_locale);
4861         DEBUG_LOCALE_INIT(LC_ALL_INDEX_, trial_locale, sl_result[LC_ALL_INDEX_]);
4862         if (! sl_result[LC_ALL_INDEX_]) {
4863             setlocale_failure = TRUE;
4864         }
4865         else {
4866             /* Since LC_ALL succeeded, it should have changed all the other
4867              * categories it can to its value; so we massage things so that the
4868              * setlocales below just return their category's current values.
4869              * This adequately handles the case in NetBSD where LC_COLLATE may
4870              * not be defined for a locale, and setting it individually will
4871              * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
4872              * the POSIX locale. */
4873             trial_locale = NULL;
4874         }
4875
4876 #  endif /* LC_ALL */
4877
4878         if (! setlocale_failure) {
4879             unsigned int j;
4880             for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
4881                 curlocales[j] = stdized_setlocale(categories[j], trial_locale);
4882                 if (! curlocales[j]) {
4883                     setlocale_failure = TRUE;
4884                 }
4885                 curlocales[j] = savepv(curlocales[j]);
4886                 DEBUG_LOCALE_INIT(j, trial_locale, curlocales[j]);
4887             }
4888
4889             if (LIKELY(! setlocale_failure)) {  /* All succeeded */
4890                 break;  /* Exit trial_locales loop */
4891             }
4892         }
4893
4894         /* Here, something failed; will need to try a fallback. */
4895         ok = 0;
4896
4897         if (i == 0) {
4898             unsigned int j;
4899
4900             if (locwarn) { /* Output failure info only on the first one */
4901
4902 #  ifdef LC_ALL
4903
4904                 PerlIO_printf(Perl_error_log,
4905                 "perl: warning: Setting locale failed.\n");
4906
4907 #  else /* !LC_ALL */
4908
4909                 PerlIO_printf(Perl_error_log,
4910                 "perl: warning: Setting locale failed for the categories:\n");
4911
4912                 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
4913                     if (! curlocales[j]) {
4914                         PerlIO_printf(Perl_error_log, "\t%s\n", category_names[j]);
4915                     }
4916                 }
4917
4918 #  endif /* LC_ALL */
4919
4920                 PerlIO_printf(Perl_error_log,
4921                     "perl: warning: Please check that your locale settings:\n");
4922
4923 #  ifdef __GLIBC__
4924
4925                 PerlIO_printf(Perl_error_log,
4926                             "\tLANGUAGE = %c%s%c,\n",
4927                             language ? '"' : '(',
4928                             language ? language : "unset",
4929                             language ? '"' : ')');
4930 #  endif
4931
4932                 PerlIO_printf(Perl_error_log,
4933                             "\tLC_ALL = %c%s%c,\n",
4934                             lc_all ? '"' : '(',
4935                             lc_all ? lc_all : "unset",
4936                             lc_all ? '"' : ')');
4937
4938 #  if defined(USE_ENVIRON_ARRAY)
4939
4940                 {
4941                     char **e;
4942
4943                     /* Look through the environment for any variables of the
4944                      * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
4945                      * already handled above.  These are assumed to be locale
4946                      * settings.  Output them and their values. */
4947                     for (e = environ; *e; e++) {
4948                         const STRLEN prefix_len = sizeof("LC_") - 1;
4949                         STRLEN uppers_len;
4950
4951                         if (     strBEGINs(*e, "LC_")
4952                             && ! strBEGINs(*e, "LC_ALL=")
4953                             && (uppers_len = strspn(*e + prefix_len,
4954                                              "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
4955                             && ((*e)[prefix_len + uppers_len] == '='))
4956                         {
4957                             PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
4958                                 (int) (prefix_len + uppers_len), *e,
4959                                 *e + prefix_len + uppers_len + 1);
4960                         }
4961                     }
4962                 }
4963
4964 #  else
4965
4966                 PerlIO_printf(Perl_error_log,
4967                             "\t(possibly more locale environment variables)\n");
4968
4969 #  endif
4970
4971                 PerlIO_printf(Perl_error_log,
4972                             "\tLANG = %c%s%c\n",
4973                             lang ? '"' : '(',
4974                             lang ? lang : "unset",
4975                             lang ? '"' : ')');
4976
4977                 PerlIO_printf(Perl_error_log,
4978                             "    are supported and installed on your system.\n");
4979             }
4980
4981             /* Calculate what fallback locales to try.  We have avoided this
4982              * until we have to, because failure is quite unlikely.  This will
4983              * usually change the upper bound of the loop we are in.
4984              *
4985              * Since the system's default way of setting the locale has not
4986              * found one that works, We use Perl's defined ordering: LC_ALL,
4987              * LANG, and the C locale.  We don't try the same locale twice, so
4988              * don't add to the list if already there.  (On POSIX systems, the
4989              * LC_ALL element will likely be a repeat of the 0th element "",
4990              * but there's no harm done by doing it explicitly.
4991              *
4992              * Note that this tries the LC_ALL environment variable even on
4993              * systems which have no LC_ALL locale setting.  This may or may
4994              * not have been originally intentional, but there's no real need
4995              * to change the behavior. */
4996             if (lc_all) {
4997                 for (j = 0; j < trial_locales_count; j++) {
4998                     if (strEQ(lc_all, trial_locales[j].trial_locale)) {
4999                         goto done_lc_all;
5000                     }
5001                 }
5002                 trial_locales[trial_locales_count++] = (trial_locales_struct) {
5003                     .trial_locale = lc_all,
5004                     .fallback_desc = (strEQ(lc_all, "C")
5005                                       ? "the standard locale"
5006                                       : "a fallback locale"),
5007                     .fallback_name = lc_all,
5008                 };
5009             }
5010           done_lc_all:
5011
5012             if (lang) {
5013                 for (j = 0; j < trial_locales_count; j++) {
5014                     if (strEQ(lang, trial_locales[j].trial_locale)) {
5015                         goto done_lang;
5016                     }
5017                 }
5018                 trial_locales[trial_locales_count++] = (trial_locales_struct) {
5019                     .trial_locale = lang,
5020                     .fallback_desc = (strEQ(lang, "C")
5021                                       ? "the standard locale"
5022                                       : "a fallback locale"),
5023                     .fallback_name = lang,
5024                 };
5025             }
5026           done_lang:
5027
5028 #  if defined(WIN32) && defined(LC_ALL)
5029
5030             /* For Windows, we also try the system default locale before "C".
5031              * (If there exists a Windows without LC_ALL we skip this because
5032              * it gets too complicated.  For those, the "C" is the next
5033              * fallback possibility). */
5034             {
5035                 /* Note that this may change the locale, but we are going to do
5036                  * that anyway.
5037                  *
5038                  * Our normal Windows setlocale() implementation ignores the
5039                  * system default locale to make things work like POSIX.  This
5040                  * is the only place where we want to consider it, so have to
5041                  * use wrap_wsetlocale(). */
5042                 const char *system_default_locale =
5043                                     stdize_locale(LC_ALL,
5044                                                   S_wrap_wsetlocale(aTHX_ LC_ALL, ""),
5045                                                   &PL_stdize_locale_buf,
5046                                                   &PL_stdize_locale_bufsize,
5047                                                   __LINE__);
5048                 DEBUG_LOCALE_INIT(LC_ALL_INDEX_, "", system_default_locale);
5049
5050                 /* Skip if invalid or if it's already on the list of locales to
5051                  * try */
5052                 if (! system_default_locale) {
5053                     goto done_system_default;
5054                 }
5055                 for (j = 0; j < trial_locales_count; j++) {
5056                     if (strEQ(system_default_locale, trial_locales[j].trial_locale)) {
5057                         goto done_system_default;
5058                     }
5059                 }
5060
5061                 trial_locales[trial_locales_count++] = (trial_locales_struct) {
5062                     .trial_locale = system_default_locale,
5063                     .fallback_desc = (strEQ(system_default_locale, "C")
5064                                       ? "the standard locale"
5065                                       : "the system default locale"),
5066                     .fallback_name = system_default_locale,
5067                 };
5068             }
5069           done_system_default:
5070
5071 #  endif
5072
5073             for (j = 0; j < trial_locales_count; j++) {
5074                 if (strEQ("C", trial_locales[j].trial_locale)) {
5075                     goto done_C;
5076                 }
5077             }
5078             trial_locales[trial_locales_count++] = (trial_locales_struct) {
5079                 .trial_locale = "C",
5080                 .fallback_desc = "the standard locale",
5081                 .fallback_name = "C",
5082             };
5083
5084           done_C: ;
5085         }   /* end of first time through the loop */
5086
5087 #  ifdef WIN32
5088
5089       next_iteration: ;
5090
5091 #  endif
5092
5093     }   /* end of looping through the trial locales */
5094
5095     if (ok < 1) {   /* If we tried to fallback */
5096         const char* msg;
5097         if (! setlocale_failure) {  /* fallback succeeded */
5098            msg = "Falling back to";
5099         }
5100         else {  /* fallback failed */
5101             unsigned int j;
5102
5103             /* We dropped off the end of the loop, so have to decrement i to
5104              * get back to the value the last time through */
5105             i--;
5106
5107             ok = -1;
5108             msg = "Failed to fall back to";
5109
5110             /* To continue, we should use whatever values we've got */
5111
5112             for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
5113                 Safefree(curlocales[j]);
5114                 curlocales[j] = savepv(stdized_setlocale(categories[j], NULL));
5115                 DEBUG_LOCALE_INIT(j, NULL, curlocales[j]);
5116             }
5117         }
5118
5119         if (locwarn) {
5120             const char * description = trial_locales[i].fallback_desc;
5121             const char * name = trial_locales[i].fallback_name;
5122
5123             if (name && strNE(name, "")) {
5124                 PerlIO_printf(Perl_error_log,
5125                     "perl: warning: %s %s (\"%s\").\n", msg, description, name);
5126             }
5127             else {
5128                 PerlIO_printf(Perl_error_log,
5129                                    "perl: warning: %s %s.\n", msg, description);
5130             }
5131         }
5132     } /* End of tried to fallback */
5133
5134 #  ifdef USE_POSIX_2008_LOCALE
5135
5136     /* The stdized setlocales haven't affected the P2008 locales.  Initialize
5137      * them now, calculating LC_ALL only on the final go round, when all have
5138      * been set. */
5139     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
5140         (void) emulate_setlocale_i(i, curlocales[i],
5141                                    RECALCULATE_LC_ALL_ON_FINAL_INTERATION,
5142                                    __LINE__);
5143     }
5144
5145 #  endif
5146
5147     /* Done with finding the locales; update the auxiliary records */
5148     new_LC_ALL(NULL);
5149
5150 #  if defined(USE_POSIX_2008_LOCALE) && defined(USE_LOCALE_NUMERIC)
5151
5152     /* This is a temporary workaround for #20155, to avoid issues where the
5153      * global locale wants a radix different from the per-thread one.  This
5154      * restores behavior for LC_NUMERIC to what it was before a7ff7ac. */
5155     posix_setlocale(LC_NUMERIC, "C");
5156
5157 #  endif
5158
5159     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
5160         Safefree(curlocales[i]);
5161     }
5162
5163 #  if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
5164
5165     /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
5166      * locale is UTF-8.  The call to new_ctype() just above has already
5167      * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
5168      * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
5169      * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
5170      * STDIN, STDOUT, STDERR, _and_ the default open discipline.  */
5171     PL_utf8locale = PL_in_utf8_CTYPE_locale;
5172
5173     /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
5174        This is an alternative to using the -C command line switch
5175        (the -C if present will override this). */
5176     {
5177          const char *p = PerlEnv_getenv("PERL_UNICODE");
5178          PL_unicode = p ? parse_unicode_opts(&p) : 0;
5179          if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
5180              PL_utf8cache = -1;
5181     }
5182
5183 #  endif
5184 #endif /* USE_LOCALE */
5185
5186     /* So won't continue to output stuff */
5187     DEBUG_INITIALIZATION_set(FALSE);
5188
5189     return ok;
5190 }
5191
5192 #ifdef USE_LOCALE_COLLATE
5193
5194 STATIC void
5195 S_compute_collxfrm_coefficients(pTHX)
5196 {
5197
5198         PL_in_utf8_COLLATE_locale = (PL_collation_standard)
5199                                     ? 0
5200                                     : is_locale_utf8(PL_collation_name);
5201         PL_strxfrm_NUL_replacement = '\0';
5202         PL_strxfrm_max_cp = 0;
5203
5204         /* A locale collation definition includes primary, secondary, tertiary,
5205          * etc. weights for each character.  To sort, the primary weights are
5206          * used, and only if they compare equal, then the secondary weights are
5207          * used, and only if they compare equal, then the tertiary, etc.
5208          *
5209          * strxfrm() works by taking the input string, say ABC, and creating an
5210          * output transformed string consisting of first the primary weights,
5211          * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
5212          * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ ....  Some characters
5213          * may not have weights at every level.  In our example, let's say B
5214          * doesn't have a tertiary weight, and A doesn't have a secondary
5215          * weight.  The constructed string is then going to be
5216          *  A¹B¹C¹ B²C² A³C³ ....
5217          * This has the desired effect that strcmp() will look at the secondary
5218          * or tertiary weights only if the strings compare equal at all higher
5219          * priority weights.  The spaces shown here, like in
5220          *  "A¹B¹C¹ A²B²C² "
5221          * are not just for readability.  In the general case, these must
5222          * actually be bytes, which we will call here 'separator weights'; and
5223          * they must be smaller than any other weight value, but since these
5224          * are C strings, only the terminating one can be a NUL (some
5225          * implementations may include a non-NUL separator weight just before
5226          * the NUL).  Implementations tend to reserve 01 for the separator
5227          * weights.  They are needed so that a shorter string's secondary
5228          * weights won't be misconstrued as primary weights of a longer string,
5229          * etc.  By making them smaller than any other weight, the shorter
5230          * string will sort first.  (Actually, if all secondary weights are
5231          * smaller than all primary ones, there is no need for a separator
5232          * weight between those two levels, etc.)
5233          *
5234          * The length of the transformed string is roughly a linear function of
5235          * the input string.  It's not exactly linear because some characters
5236          * don't have weights at all levels.  When we call strxfrm() we have to
5237          * allocate some memory to hold the transformed string.  The
5238          * calculations below try to find coefficients 'm' and 'b' for this
5239          * locale so that m*x + b equals how much space we need, given the size
5240          * of the input string in 'x'.  If we calculate too small, we increase
5241          * the size as needed, and call strxfrm() again, but it is better to
5242          * get it right the first time to avoid wasted expensive string
5243          * transformations. */
5244
5245         {
5246             /* We use the string below to find how long the tranformation of it
5247              * is.  Almost all locales are supersets of ASCII, or at least the
5248              * ASCII letters.  We use all of them, half upper half lower,
5249              * because if we used fewer, we might hit just the ones that are
5250              * outliers in a particular locale.  Most of the strings being
5251              * collated will contain a preponderance of letters, and even if
5252              * they are above-ASCII, they are likely to have the same number of
5253              * weight levels as the ASCII ones.  It turns out that digits tend
5254              * to have fewer levels, and some punctuation has more, but those
5255              * are relatively sparse in text, and khw believes this gives a
5256              * reasonable result, but it could be changed if experience so
5257              * dictates. */
5258             const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
5259             char * x_longer;        /* Transformed 'longer' */
5260             Size_t x_len_longer;    /* Length of 'x_longer' */
5261
5262             char * x_shorter;   /* We also transform a substring of 'longer' */
5263             Size_t x_len_shorter;
5264
5265             /* mem_collxfrm_() is used get the transformation (though here we
5266              * are interested only in its length).  It is used because it has
5267              * the intelligence to handle all cases, but to work, it needs some
5268              * values of 'm' and 'b' to get it started.  For the purposes of
5269              * this calculation we use a very conservative estimate of 'm' and
5270              * 'b'.  This assumes a weight can be multiple bytes, enough to
5271              * hold any UV on the platform, and there are 5 levels, 4 weight
5272              * bytes, and a trailing NUL.  */
5273             PL_collxfrm_base = 5;
5274             PL_collxfrm_mult = 5 * sizeof(UV);
5275
5276             /* Find out how long the transformation really is */
5277             x_longer = mem_collxfrm_(longer,
5278                                      sizeof(longer) - 1,
5279                                      &x_len_longer,
5280
5281                                      /* We avoid converting to UTF-8 in the
5282                                       * called function by telling it the
5283                                       * string is in UTF-8 if the locale is a
5284                                       * UTF-8 one.  Since the string passed
5285                                       * here is invariant under UTF-8, we can
5286                                       * claim it's UTF-8 even though it isn't.
5287                                       * */
5288                                      PL_in_utf8_COLLATE_locale);
5289             Safefree(x_longer);
5290
5291             /* Find out how long the transformation of a substring of 'longer'
5292              * is.  Together the lengths of these transformations are
5293              * sufficient to calculate 'm' and 'b'.  The substring is all of
5294              * 'longer' except the first character.  This minimizes the chances
5295              * of being swayed by outliers */
5296             x_shorter = mem_collxfrm_(longer + 1,
5297                                       sizeof(longer) - 2,
5298                                       &x_len_shorter,
5299                                       PL_in_utf8_COLLATE_locale);
5300             Safefree(x_shorter);
5301
5302             /* If the results are nonsensical for this simple test, the whole
5303              * locale definition is suspect.  Mark it so that locale collation
5304              * is not active at all for it.  XXX Should we warn? */
5305             if (   x_len_shorter == 0
5306                 || x_len_longer == 0
5307                 || x_len_shorter >= x_len_longer)
5308             {
5309                 PL_collxfrm_mult = 0;
5310                 PL_collxfrm_base = 1;
5311                 DEBUG_L(PerlIO_printf(Perl_debug_log,
5312                         "Disabling locale collation for LC_COLLATE='%s';"
5313                         " length for shorter sample=%zu; longer=%zu\n",
5314                         PL_collation_name, x_len_shorter, x_len_longer));
5315             }
5316             else {
5317                 SSize_t base;       /* Temporary */
5318
5319                 /* We have both:    m * strlen(longer)  + b = x_len_longer
5320                  *                  m * strlen(shorter) + b = x_len_shorter;
5321                  * subtracting yields:
5322                  *          m * (strlen(longer) - strlen(shorter))
5323                  *                             = x_len_longer - x_len_shorter
5324                  * But we have set things up so that 'shorter' is 1 byte smaller
5325                  * than 'longer'.  Hence:
5326                  *          m = x_len_longer - x_len_shorter
5327                  *
5328                  * But if something went wrong, make sure the multiplier is at
5329                  * least 1.
5330                  */
5331                 if (x_len_longer > x_len_shorter) {
5332                     PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
5333                 }
5334                 else {
5335                     PL_collxfrm_mult = 1;
5336                 }
5337
5338                 /*     mx + b = len
5339                  * so:      b = len - mx
5340                  * but in case something has gone wrong, make sure it is
5341                  * non-negative */
5342                 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
5343                 if (base < 0) {
5344                     base = 0;
5345                 }
5346
5347                 /* Add 1 for the trailing NUL */
5348                 PL_collxfrm_base = base + 1;
5349             }
5350
5351             DEBUG_L(PerlIO_printf(Perl_debug_log,
5352                                   "?UTF-8 locale=%d; x_len_shorter=%zu, "
5353                     "x_len_longer=%zu,"
5354                     " collate multipler=%zu, collate base=%zu\n",
5355                     PL_in_utf8_COLLATE_locale,
5356                     x_len_shorter, x_len_longer,
5357                                   PL_collxfrm_mult, PL_collxfrm_base));
5358         }
5359 }
5360
5361 char *
5362 Perl_mem_collxfrm_(pTHX_ const char *input_string,
5363                          STRLEN len,    /* Length of 'input_string' */
5364                          STRLEN *xlen,  /* Set to length of returned string
5365                                            (not including the collation index
5366                                            prefix) */
5367                          bool utf8      /* Is the input in UTF-8? */
5368                    )
5369 {
5370     /* mem_collxfrm_() is like strxfrm() but with two important differences.
5371      * First, it handles embedded NULs. Second, it allocates a bit more memory
5372      * than needed for the transformed data itself.  The real transformed data
5373      * begins at offset COLLXFRM_HDR_LEN.  *xlen is set to the length of that,
5374      * and doesn't include the collation index size.
5375      *
5376      * It is the caller's responsibility to eventually free the memory returned
5377      * by this function.
5378      *
5379      * Please see sv_collxfrm() to see how this is used. */
5380
5381 #  define COLLXFRM_HDR_LEN    sizeof(PL_collation_ix)
5382
5383     char * s = (char *) input_string;
5384     STRLEN s_strlen = strlen(input_string);
5385     char *xbuf = NULL;
5386     STRLEN xAlloc;          /* xalloc is a reserved word in VC */
5387     STRLEN length_in_chars;
5388     bool first_time = TRUE; /* Cleared after first loop iteration */
5389
5390 #  ifdef USE_LOCALE_CTYPE
5391         const char * orig_CTYPE_locale = NULL;
5392 #  endif
5393
5394 #  if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
5395     locale_t constructed_locale = (locale_t) 0;
5396 #  endif
5397
5398     PERL_ARGS_ASSERT_MEM_COLLXFRM_;
5399
5400     /* Must be NUL-terminated */
5401     assert(*(input_string + len) == '\0');
5402
5403     if (PL_collxfrm_mult == 0) {     /* unknown or bad */
5404         if (PL_collxfrm_base != 0) { /* bad collation => skip */
5405             DEBUG_L(PerlIO_printf(Perl_debug_log,
5406                             "mem_collxfrm_: locale's collation is defective\n"));
5407             goto bad;
5408         }
5409
5410         S_compute_collxfrm_coefficients(aTHX);
5411     }
5412
5413     /* Replace any embedded NULs with the control that sorts before any others.
5414      * This will give as good as possible results on strings that don't
5415      * otherwise contain that character, but otherwise there may be
5416      * less-than-perfect results with that character and NUL.  This is
5417      * unavoidable unless we replace strxfrm with our own implementation. */
5418     if (UNLIKELY(s_strlen < len)) {   /* Only execute if there is an embedded
5419                                          NUL */
5420         char * e = s + len;
5421         char * sans_nuls;
5422         STRLEN sans_nuls_len;
5423         int try_non_controls;
5424         char this_replacement_char[] = "?\0";   /* Room for a two-byte string,
5425                                                    making sure 2nd byte is NUL.
5426                                                  */
5427         STRLEN this_replacement_len;
5428
5429         /* If we don't know what non-NUL control character sorts lowest for
5430          * this locale, find it */
5431         if (PL_strxfrm_NUL_replacement == '\0') {
5432             int j;
5433             char * cur_min_x = NULL;    /* The min_char's xfrm, (except it also
5434                                            includes the collation index
5435                                            prefixed. */
5436
5437             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
5438
5439             /* Unlikely, but it may be that no control will work to replace
5440              * NUL, in which case we instead look for any character.  Controls
5441              * are preferred because collation order is, in general, context
5442              * sensitive, with adjoining characters affecting the order, and
5443              * controls are less likely to have such interactions, allowing the
5444              * NUL-replacement to stand on its own.  (Another way to look at it
5445              * is to imagine what would happen if the NUL were replaced by a
5446              * combining character; it wouldn't work out all that well.) */
5447             for (try_non_controls = 0;
5448                  try_non_controls < 2;
5449                  try_non_controls++)
5450             {
5451
5452 #  ifdef USE_LOCALE_CTYPE
5453
5454                 /* In this case we use isCNTRL_LC() below, which relies on
5455                  * LC_CTYPE, so that must be switched to correspond with the
5456                  * LC_COLLATE locale */
5457                 if (! try_non_controls && ! PL_in_utf8_COLLATE_locale) {
5458                     orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
5459                 }
5460 #  endif
5461                 /* Look through all legal code points (NUL isn't) */
5462                 for (j = 1; j < 256; j++) {
5463                     char * x;       /* j's xfrm plus collation index */
5464                     STRLEN x_len;   /* length of 'x' */
5465                     STRLEN trial_len = 1;
5466                     char cur_source[] = { '\0', '\0' };
5467
5468                     /* Skip non-controls the first time through the loop.  The
5469                      * controls in a UTF-8 locale are the L1 ones */
5470                     if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
5471                                                ? ! isCNTRL_L1(j)
5472                                                : ! isCNTRL_LC(j))
5473                     {
5474                         continue;
5475                     }
5476
5477                     /* Create a 1-char string of the current code point */
5478                     cur_source[0] = (char) j;
5479
5480                     /* Then transform it */
5481                     x = mem_collxfrm_(cur_source, trial_len, &x_len,
5482                                       0 /* The string is not in UTF-8 */);
5483
5484                     /* Ignore any character that didn't successfully transform.
5485                      * */
5486                     if (! x) {
5487                         continue;
5488                     }
5489
5490                     /* If this character's transformation is lower than
5491                      * the current lowest, this one becomes the lowest */
5492                     if (   cur_min_x == NULL
5493                         || strLT(x         + COLLXFRM_HDR_LEN,
5494                                  cur_min_x + COLLXFRM_HDR_LEN))
5495                     {
5496                         PL_strxfrm_NUL_replacement = j;
5497                         Safefree(cur_min_x);
5498                         cur_min_x = x;
5499                     }
5500                     else {
5501                         Safefree(x);
5502                     }
5503                 } /* end of loop through all 255 characters */
5504
5505 #  ifdef USE_LOCALE_CTYPE
5506                 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
5507 #  endif
5508
5509                 /* Stop looking if found */
5510                 if (cur_min_x) {
5511                     break;
5512                 }
5513
5514                 /* Unlikely, but possible, if there aren't any controls that
5515                  * work in the locale, repeat the loop, looking for any
5516                  * character that works */
5517                 DEBUG_L(PerlIO_printf(Perl_debug_log,
5518                 "mem_collxfrm_: No control worked.  Trying non-controls\n"));
5519             } /* End of loop to try first the controls, then any char */
5520
5521             if (! cur_min_x) {
5522                 DEBUG_L(PerlIO_printf(Perl_debug_log,
5523                     "mem_collxfrm_: Couldn't find any character to replace"
5524                     " embedded NULs in locale %s with", PL_collation_name));
5525                 goto bad;
5526             }
5527
5528             DEBUG_L(PerlIO_printf(Perl_debug_log,
5529                     "mem_collxfrm_: Replacing embedded NULs in locale %s with "
5530                     "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
5531
5532             Safefree(cur_min_x);
5533         } /* End of determining the character that is to replace NULs */
5534
5535         /* If the replacement is variant under UTF-8, it must match the
5536          * UTF8-ness of the original */
5537         if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
5538             this_replacement_char[0] =
5539                                 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
5540             this_replacement_char[1] =
5541                                 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
5542             this_replacement_len = 2;
5543         }
5544         else {
5545             this_replacement_char[0] = PL_strxfrm_NUL_replacement;
5546             /* this_replacement_char[1] = '\0' was done at initialization */
5547             this_replacement_len = 1;
5548         }
5549
5550         /* The worst case length for the replaced string would be if every
5551          * character in it is NUL.  Multiply that by the length of each
5552          * replacement, and allow for a trailing NUL */
5553         sans_nuls_len = (len * this_replacement_len) + 1;
5554         Newx(sans_nuls, sans_nuls_len, char);
5555         *sans_nuls = '\0';
5556
5557         /* Replace each NUL with the lowest collating control.  Loop until have
5558          * exhausted all the NULs */
5559         while (s + s_strlen < e) {
5560             my_strlcat(sans_nuls, s, sans_nuls_len);
5561
5562             /* Do the actual replacement */
5563             my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
5564
5565             /* Move past the input NUL */
5566             s += s_strlen + 1;
5567             s_strlen = strlen(s);
5568         }
5569
5570         /* And add anything that trails the final NUL */
5571         my_strlcat(sans_nuls, s, sans_nuls_len);
5572
5573         /* Switch so below we transform this modified string */
5574         s = sans_nuls;
5575         len = strlen(s);
5576     } /* End of replacing NULs */
5577
5578     /* Make sure the UTF8ness of the string and locale match */
5579     if (utf8 != PL_in_utf8_COLLATE_locale) {
5580         /* XXX convert above Unicode to 10FFFF? */
5581         const char * const t = s;   /* Temporary so we can later find where the
5582                                        input was */
5583
5584         /* Here they don't match.  Change the string's to be what the locale is
5585          * expecting */
5586
5587         if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
5588             s = (char *) bytes_to_utf8((const U8 *) s, &len);
5589             utf8 = TRUE;
5590         }
5591         else {   /* locale is not UTF-8; but input is; downgrade the input */
5592
5593             s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
5594
5595             /* If the downgrade was successful we are done, but if the input
5596              * contains things that require UTF-8 to represent, have to do
5597              * damage control ... */
5598             if (UNLIKELY(utf8)) {
5599
5600                 /* What we do is construct a non-UTF-8 string with
5601                  *  1) the characters representable by a single byte converted
5602                  *     to be so (if necessary);
5603                  *  2) and the rest converted to collate the same as the
5604                  *     highest collating representable character.  That makes
5605                  *     them collate at the end.  This is similar to how we
5606                  *     handle embedded NULs, but we use the highest collating
5607                  *     code point instead of the smallest.  Like the NUL case,
5608                  *     this isn't perfect, but is the best we can reasonably
5609                  *     do.  Every above-255 code point will sort the same as
5610                  *     the highest-sorting 0-255 code point.  If that code
5611                  *     point can combine in a sequence with some other code
5612                  *     points for weight calculations, us changing something to
5613                  *     be it can adversely affect the results.  But in most
5614                  *     cases, it should work reasonably.  And note that this is
5615                  *     really an illegal situation: using code points above 255
5616                  *     on a locale where only 0-255 are valid.  If two strings
5617                  *     sort entirely equal, then the sort order for the
5618                  *     above-255 code points will be in code point order. */
5619
5620                 utf8 = FALSE;
5621
5622                 /* If we haven't calculated the code point with the maximum
5623                  * collating order for this locale, do so now */
5624                 if (! PL_strxfrm_max_cp) {
5625                     int j;
5626
5627                     /* The current transformed string that collates the
5628                      * highest (except it also includes the prefixed collation
5629                      * index. */
5630                     char * cur_max_x = NULL;
5631
5632                     /* Look through all legal code points (NUL isn't) */
5633                     for (j = 1; j < 256; j++) {
5634                         char * x;
5635                         STRLEN x_len;
5636                         char cur_source[] = { '\0', '\0' };
5637
5638                         /* Create a 1-char string of the current code point */
5639                         cur_source[0] = (char) j;
5640
5641                         /* Then transform it */
5642                         x = mem_collxfrm_(cur_source, 1, &x_len, FALSE);
5643
5644                         /* If something went wrong (which it shouldn't), just
5645                          * ignore this code point */
5646                         if (! x) {
5647                             continue;
5648                         }
5649
5650                         /* If this character's transformation is higher than
5651                          * the current highest, this one becomes the highest */
5652                         if (   cur_max_x == NULL
5653                             || strGT(x         + COLLXFRM_HDR_LEN,
5654                                      cur_max_x + COLLXFRM_HDR_LEN))
5655                         {
5656                             PL_strxfrm_max_cp = j;
5657                             Safefree(cur_max_x);
5658                             cur_max_x = x;
5659                         }
5660                         else {
5661                             Safefree(x);
5662                         }
5663                     }
5664
5665                     if (! cur_max_x) {
5666                         DEBUG_L(PerlIO_printf(Perl_debug_log,
5667                             "mem_collxfrm_: Couldn't find any character to"
5668                             " replace above-Latin1 chars in locale %s with",
5669                             PL_collation_name));
5670                         goto bad;
5671                     }
5672
5673                     DEBUG_L(PerlIO_printf(Perl_debug_log,
5674                             "mem_collxfrm_: highest 1-byte collating character"
5675                             " in locale %s is 0x%02X\n",
5676                             PL_collation_name,
5677                             PL_strxfrm_max_cp));
5678
5679                     Safefree(cur_max_x);
5680                 }
5681
5682                 /* Here we know which legal code point collates the highest.
5683                  * We are ready to construct the non-UTF-8 string.  The length
5684                  * will be at least 1 byte smaller than the input string
5685                  * (because we changed at least one 2-byte character into a
5686                  * single byte), but that is eaten up by the trailing NUL */
5687                 Newx(s, len, char);
5688
5689                 {
5690                     STRLEN i;
5691                     STRLEN d= 0;
5692                     char * e = (char *) t + len;
5693
5694                     for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
5695                         U8 cur_char = t[i];
5696                         if (UTF8_IS_INVARIANT(cur_char)) {
5697                             s[d++] = cur_char;
5698                         }
5699                         else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
5700                             s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
5701                         }
5702                         else {  /* Replace illegal cp with highest collating
5703                                    one */
5704                             s[d++] = PL_strxfrm_max_cp;
5705                         }
5706                     }
5707                     s[d++] = '\0';
5708                     Renew(s, d, char);   /* Free up unused space */
5709                 }
5710             }
5711         }
5712
5713         /* Here, we have constructed a modified version of the input.  It could
5714          * be that we already had a modified copy before we did this version.
5715          * If so, that copy is no longer needed */
5716         if (t != input_string) {
5717             Safefree(t);
5718         }
5719     }
5720
5721     length_in_chars = (utf8)
5722                       ? utf8_length((U8 *) s, (U8 *) s + len)
5723                       : len;
5724
5725     /* The first element in the output is the collation id, used by
5726      * sv_collxfrm(); then comes the space for the transformed string.  The
5727      * equation should give us a good estimate as to how much is needed */
5728     xAlloc = COLLXFRM_HDR_LEN
5729            + PL_collxfrm_base
5730            + (PL_collxfrm_mult * length_in_chars);
5731     Newx(xbuf, xAlloc, char);
5732     if (UNLIKELY(! xbuf)) {
5733         DEBUG_L(PerlIO_printf(Perl_debug_log,
5734                       "mem_collxfrm_: Couldn't malloc %zu bytes\n", xAlloc));
5735         goto bad;
5736     }
5737
5738     /* Store the collation id */
5739     *(U32*)xbuf = PL_collation_ix;
5740
5741 #  if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
5742 #    ifdef USE_LOCALE_CTYPE
5743
5744     constructed_locale = newlocale(LC_CTYPE_MASK, PL_collation_name,
5745                                    duplocale(use_curlocale_scratch()));
5746 #    else
5747
5748     constructed_locale = duplocale(use_curlocale_scratch());
5749
5750 #    endif
5751 #    define my_strxfrm(dest, src, n)  strxfrm_l(dest, src, n,           \
5752                                                 constructed_locale)
5753 #    define CLEANUP_STRXFRM                                             \
5754         STMT_START {                                                    \
5755             if (constructed_locale != (locale_t) 0)                     \
5756                 freelocale(constructed_locale);                         \
5757         } STMT_END
5758 #  else
5759 #    define my_strxfrm(dest, src, n)  strxfrm(dest, src, n)
5760 #    ifdef USE_LOCALE_CTYPE
5761
5762     orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
5763
5764 #      define CLEANUP_STRXFRM                                           \
5765                 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale)
5766 #    else
5767 #      define CLEANUP_STRXFRM  NOOP
5768 #    endif
5769 #  endif
5770
5771     /* Then the transformation of the input.  We loop until successful, or we
5772      * give up */
5773     for (;;) {
5774
5775         errno = 0;
5776         *xlen = my_strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
5777
5778         /* If the transformed string occupies less space than we told strxfrm()
5779          * was available, it means it transformed the whole string. */
5780         if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
5781
5782             /* But there still could have been a problem */
5783             if (errno != 0) {
5784                 DEBUG_L(PerlIO_printf(Perl_debug_log,
5785                        "strxfrm failed for LC_COLLATE=%s; errno=%d, input=%s\n",
5786                        PL_collation_name, errno,
5787                        _byte_dump_string((U8 *) s, len, 0)));
5788                 goto bad;
5789             }
5790
5791             /* Here, the transformation was successful.  Some systems include a
5792              * trailing NUL in the returned length.  Ignore it, using a loop in
5793              * case multiple trailing NULs are returned. */
5794             while (   (*xlen) > 0
5795                    && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
5796             {
5797                 (*xlen)--;
5798             }
5799
5800             /* If the first try didn't get it, it means our prediction was low.
5801              * Modify the coefficients so that we predict a larger value in any
5802              * future transformations */
5803             if (! first_time) {
5804                 STRLEN needed = *xlen + 1;   /* +1 For trailing NUL */
5805                 STRLEN computed_guess = PL_collxfrm_base
5806                                       + (PL_collxfrm_mult * length_in_chars);
5807
5808                 /* On zero-length input, just keep current slope instead of
5809                  * dividing by 0 */
5810                 const STRLEN new_m = (length_in_chars != 0)
5811                                      ? needed / length_in_chars
5812                                      : PL_collxfrm_mult;
5813
5814                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5815                     "initial size of %zu bytes for a length "
5816                     "%zu string was insufficient, %zu needed\n",
5817                     computed_guess, length_in_chars, needed));
5818
5819                 /* If slope increased, use it, but discard this result for
5820                  * length 1 strings, as we can't be sure that it's a real slope
5821                  * change */
5822                 if (length_in_chars > 1 && new_m  > PL_collxfrm_mult) {
5823
5824 #  ifdef DEBUGGING
5825
5826                     STRLEN old_m = PL_collxfrm_mult;
5827                     STRLEN old_b = PL_collxfrm_base;
5828
5829 #  endif
5830
5831                     PL_collxfrm_mult = new_m;
5832                     PL_collxfrm_base = 1;   /* +1 For trailing NUL */
5833                     computed_guess = PL_collxfrm_base
5834                                     + (PL_collxfrm_mult * length_in_chars);
5835                     if (computed_guess < needed) {
5836                         PL_collxfrm_base += needed - computed_guess;
5837                     }
5838
5839                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5840                                     "slope is now %zu; was %zu, base "
5841                         "is now %zu; was %zu\n",
5842                         PL_collxfrm_mult, old_m,
5843                         PL_collxfrm_base, old_b));
5844                 }
5845                 else {  /* Slope didn't change, but 'b' did */
5846                     const STRLEN new_b = needed
5847                                         - computed_guess
5848                                         + PL_collxfrm_base;
5849                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5850                         "base is now %zu; was %zu\n", new_b, PL_collxfrm_base));
5851                     PL_collxfrm_base = new_b;
5852                 }
5853             }
5854
5855             break;
5856         }
5857
5858         if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
5859             DEBUG_L(PerlIO_printf(Perl_debug_log,
5860                   "mem_collxfrm_: Needed %zu bytes, max permissible is %u\n",
5861                   *xlen, PERL_INT_MAX));
5862             goto bad;
5863         }
5864
5865         /* A well-behaved strxfrm() returns exactly how much space it needs
5866          * (usually not including the trailing NUL) when it fails due to not
5867          * enough space being provided.  Assume that this is the case unless
5868          * it's been proven otherwise */
5869         if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
5870             xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
5871         }
5872         else { /* Here, either:
5873                 *  1)  The strxfrm() has previously shown bad behavior; or
5874                 *  2)  It isn't the first time through the loop, which means
5875                 *      that the strxfrm() is now showing bad behavior, because
5876                 *      we gave it what it said was needed in the previous
5877                 *      iteration, and it came back saying it needed still more.
5878                 *      (Many versions of cygwin fit this.  When the buffer size
5879                 *      isn't sufficient, they return the input size instead of
5880                 *      how much is needed.)
5881                 * Increase the buffer size by a fixed percentage and try again.
5882                 * */
5883             xAlloc += (xAlloc / 4) + 1;
5884             PL_strxfrm_is_behaved = FALSE;
5885
5886             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5887                      "mem_collxfrm_ required more space than previously"
5888                      " calculated for locale %s, trying again with new"
5889                      " guess=%zu+%zu\n",
5890                 PL_collation_name,  COLLXFRM_HDR_LEN,
5891                      xAlloc - COLLXFRM_HDR_LEN));
5892         }
5893
5894         Renew(xbuf, xAlloc, char);
5895         if (UNLIKELY(! xbuf)) {
5896             DEBUG_L(PerlIO_printf(Perl_debug_log,
5897                       "mem_collxfrm_: Couldn't realloc %zu bytes\n", xAlloc));
5898             goto bad;
5899         }
5900
5901         first_time = FALSE;
5902     }
5903
5904     CLEANUP_STRXFRM;
5905
5906     DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8));
5907
5908     /* Free up unneeded space; retain enough for trailing NUL */
5909     Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
5910
5911     if (s != input_string) {
5912         Safefree(s);
5913     }
5914
5915     return xbuf;
5916
5917   bad:
5918
5919     CLEANUP_STRXFRM;
5920     DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8));
5921
5922     Safefree(xbuf);
5923     if (s != input_string) {
5924         Safefree(s);
5925     }
5926     *xlen = 0;
5927
5928     return NULL;
5929 }
5930
5931 #  ifdef DEBUGGING
5932
5933 STATIC void
5934 S_print_collxfrm_input_and_return(pTHX_
5935                                   const char * s,
5936                                   const char * e,
5937                                   const char * xbuf,
5938                                   const STRLEN xlen,
5939                                   const bool is_utf8)
5940 {
5941
5942     PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
5943
5944     PerlIO_printf(Perl_debug_log,
5945                   "mem_collxfrm_[ix %" UVuf "] for locale '%s':\n"
5946                   "     input=%s\n    return=%s\n    return len=%zu\n",
5947                   (UV) PL_collation_ix, PL_collation_name,
5948                   get_displayable_string(s, e, is_utf8),
5949                   ((xbuf == NULL)
5950                    ? "(null)"
5951                    : _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN, xlen, 0)),
5952                   xlen);
5953 }
5954
5955 #  endif    /* DEBUGGING */
5956 #endif /* USE_LOCALE_COLLATE */
5957
5958 STATIC const char *
5959 S_get_displayable_string(pTHX_
5960                          const char * const s,
5961                          const char * const e,
5962                          const bool is_utf8)
5963 {
5964     PERL_ARGS_ASSERT_GET_DISPLAYABLE_STRING;
5965
5966     const char * t = s;
5967     bool prev_was_printable = TRUE;
5968     bool first_time = TRUE;
5969     char * ret;
5970
5971     /* Worst case scenario: All are non-printable so have a blank between each.
5972      * If UTF-8, all are the largest possible code point; otherwise all are a
5973      * single byte.  '(2 + 1)'  is from each byte takes 2 characters to
5974      * display, and a blank (or NUL for the final one) after it */
5975     SAVEFREEPV(Newxz(ret,
5976                      (e - s) * (2 + 1)
5977                              * ((is_utf8) ? UVSIZE : 1),
5978                      char));
5979
5980     while (t < e) {
5981         UV cp = (is_utf8)
5982                 ?  utf8_to_uvchr_buf((U8 *) t, e, NULL)
5983                 : * (U8 *) t;
5984         if (isPRINT(cp)) {
5985             if (! prev_was_printable) {
5986                 my_strlcat(ret, " ", sizeof(ret));
5987             }
5988
5989             /* Escape these to avoid any ambiguity */
5990             if (cp == ' ' || cp == '\\') {
5991                 my_strlcat(ret, "\\", sizeof(ret));
5992             }
5993             my_strlcat(ret, Perl_form(aTHX_ "%c", (U8) cp), sizeof(ret));
5994             prev_was_printable = TRUE;
5995         }
5996         else {
5997             if (! first_time) {
5998                 my_strlcat(ret, " ", sizeof(ret));
5999             }
6000             my_strlcat(ret, Perl_form(aTHX_ "%02" UVXf, cp), sizeof(ret));
6001             prev_was_printable = FALSE;
6002         }
6003         t += (is_utf8) ? UTF8SKIP(t) : 1;
6004         first_time = FALSE;
6005     }
6006
6007     return ret;
6008 }
6009
6010 #ifdef USE_LOCALE
6011
6012 STATIC const char *
6013 S_toggle_locale_i(pTHX_ const unsigned cat_index,
6014                         const char * new_locale,
6015                         const line_t caller_line)
6016 {
6017     /* Changes the locale for the category specified by 'index' to 'new_locale,
6018      * if they aren't already the same.
6019      *
6020      * Returns a copy of the name of the original locale for 'cat_index'
6021      * so can be switched back to with the companion function
6022      * restore_toggled_locale_i(),  (NULL if no restoral is necessary.) */
6023
6024     const char * locale_to_restore_to = NULL;
6025
6026     PERL_ARGS_ASSERT_TOGGLE_LOCALE_I;
6027     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
6028
6029     /* Find the original locale of the category we may need to change, so that
6030      * it can be restored to later */
6031
6032     locale_to_restore_to = querylocale_i(cat_index);
6033
6034     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6035              "(%" LINE_Tf "): toggle_locale_i: index=%d(%s), wanted=%s,"
6036              " actual=%s\n",
6037              caller_line, cat_index, category_names[cat_index],
6038              new_locale, locale_to_restore_to));
6039
6040     if (! locale_to_restore_to) {
6041         locale_panic_(Perl_form(aTHX_ "Could not find current %s locale, errno=%d",
6042                                 category_names[cat_index], errno));
6043     }
6044
6045     /* If the locales are the same, there's nothing to do */
6046     if (strEQ(locale_to_restore_to, new_locale)) {
6047         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6048                                "(%d): %s locale unchanged as %s\n",
6049                                caller_line, category_names[cat_index],
6050                                new_locale));
6051
6052         return NULL;
6053     }
6054
6055     /* Finally, change the locale to the new one */
6056     void_setlocale_i(cat_index, new_locale);
6057
6058     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "(%" LINE_Tf "): %s locale switched to %s\n",
6059                            caller_line, category_names[cat_index], new_locale));
6060
6061     return locale_to_restore_to;
6062
6063 #ifndef DEBUGGING
6064     PERL_UNUSED_ARG(caller_line);
6065 #endif
6066
6067 }
6068
6069 STATIC void
6070 S_restore_toggled_locale_i(pTHX_ const unsigned int cat_index,
6071                                  const char * restore_locale,
6072                                  const line_t caller_line)
6073 {
6074     /* Restores the locale for LC_category corresponding to cat_indes to
6075      * 'restore_locale' (which is a copy that will be freed by this function),
6076      * or do nothing if the latter parameter is NULL */
6077
6078     PERL_ARGS_ASSERT_RESTORE_TOGGLED_LOCALE_I;
6079     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
6080
6081     if (restore_locale == NULL) {
6082         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6083                                "(%" LINE_Tf "): No need to restore %s\n",
6084                                caller_line, category_names[cat_index]));
6085         return;
6086     }
6087
6088     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6089                            "(%" LINE_Tf "): %s restoring locale to %s\n",
6090                            caller_line, category_names[cat_index],
6091                            restore_locale));
6092
6093     void_setlocale_i(cat_index, restore_locale);
6094
6095 #ifndef DEBUGGING
6096     PERL_UNUSED_ARG(caller_line);
6097 #endif
6098
6099 }
6100
6101 #ifdef USE_LOCALE_CTYPE
6102
6103 STATIC bool
6104 S_is_codeset_name_UTF8(const char * name)
6105 {
6106     /* Return a boolean as to if the passed-in name indicates it is a UTF-8
6107      * code set.  Several variants are possible */
6108     const Size_t len = strlen(name);
6109
6110     PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8;
6111
6112 #  ifdef WIN32
6113
6114     /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
6115     if (memENDs(name, len, "65001")) {
6116         return TRUE;
6117     }
6118
6119 #  endif
6120                /* 'UTF8' or 'UTF-8' */
6121     return (    inRANGE(len, 4, 5)
6122             &&  name[len-1] == '8'
6123             && (   memBEGINs(name, len, "UTF")
6124                 || memBEGINs(name, len, "utf"))
6125             && (len == 4 || name[3] == '-'));
6126 }
6127
6128 #endif
6129
6130 STATIC bool
6131 S_is_locale_utf8(pTHX_ const char * locale)
6132 {
6133     /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise.  It uses
6134      * my_langinfo(), which employs various methods to get this information
6135      * if nl_langinfo() isn't available, using heuristics as a last resort, in
6136      * which case, the result will very likely be correct for locales for
6137      * languages that have commonly used non-ASCII characters, but for notably
6138      * English, it comes down to if the locale's name ends in something like
6139      * "UTF-8".  It errs on the side of not being a UTF-8 locale. */
6140
6141 #  if ! defined(USE_LOCALE_CTYPE)                                             \
6142    ||   defined(EBCDIC) /* There aren't any real UTF-8 locales at this time */
6143
6144     PERL_UNUSED_ARG(locale);
6145
6146     return FALSE;
6147
6148 #  else
6149
6150     const char * scratch_buffer = NULL;
6151     const char * codeset;
6152     bool retval;
6153
6154     PERL_ARGS_ASSERT_IS_LOCALE_UTF8;
6155
6156     if (strEQ(locale, PL_ctype_name)) {
6157         return PL_in_utf8_CTYPE_locale;
6158     }
6159
6160     codeset = my_langinfo_c(CODESET, LC_CTYPE, locale,
6161                             &scratch_buffer, NULL, NULL);
6162     retval = is_codeset_name_UTF8(codeset);
6163
6164     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6165                            "found codeset=%s, is_utf8=%d\n", codeset, retval));
6166
6167     Safefree(scratch_buffer);
6168     return retval;
6169
6170 #  endif
6171
6172 }
6173
6174 #endif  /* USE_LOCALE */
6175
6176 bool
6177 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
6178 {
6179     /* Internal function which returns if we are in the scope of a pragma that
6180      * enables the locale category 'category'.  'compiling' should indicate if
6181      * this is during the compilation phase (TRUE) or not (FALSE). */
6182
6183     const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
6184
6185     SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0);
6186     if (! these_categories || these_categories == &PL_sv_placeholder) {
6187         return FALSE;
6188     }
6189
6190     /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
6191      * a valid unsigned */
6192     assert(category >= -1);
6193     return cBOOL(SvUV(these_categories) & (1U << (category + 1)));
6194 }
6195
6196 /* my_strerror() returns a mortalized copy of the text of the error message
6197  * associated with 'errnum'.
6198  *
6199  * If not called from within the scope of 'use locale', it uses the text from
6200  * the C locale.  If Perl is compiled to not pay attention to LC_CTYPE nor
6201  * LC_MESSAGES, it uses whatever strerror() returns.  Otherwise the text is
6202  * derived from the locale, LC_MESSAGES if we have that; LC_CTYPE if not.
6203  *
6204  * It returns in *utf8ness the result's UTF-8ness
6205  *
6206  * The function just calls strerror(), but temporarily switches locales, if
6207  * needed.  Many platforms require LC_CTYPE and LC_MESSAGES to be in the same
6208  * CODESET in order for the return from strerror() to not contain '?' symbols,
6209  * or worse, mojibaked.  It's cheaper to just use the stricter criteria of
6210  * being in the same locale.  So the code below uses a common locale for both
6211  * categories.  Again, that is C if not within 'use locale' scope; or the
6212  * LC_MESSAGES locale if in scope and we have that category; and LC_CTYPE if we
6213  * don't have LC_MESSAGES; and whatever strerror returns if we don't have
6214  * either category.
6215  *
6216  * There are two sets of implementations.  The first below is if we have
6217  * strerror_l().  This is the simpler.  We just use the already-built C locale
6218  * object if not in locale scope, or build up a custom one otherwise.
6219  *
6220  * When strerror_l() is not available, we may have to swap locales temporarily
6221  * to bring the two categories into sync with each other, and possibly to the C
6222  * locale.
6223  *
6224  * Because the prepropessing directives to conditionally compile this function
6225  * would greatly obscure the logic of the various implementations, the whole
6226  * function is repeated for each configuration, with some common macros. */
6227
6228 /* Used to shorten the definitions of the following implementations of
6229  * my_strerror() */
6230 #define DEBUG_STRERROR_ENTER(errnum, in_locale)                             \
6231     DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                  \
6232                            "my_strerror called with errnum %d;"             \
6233                            " Within locale scope=%d\n",                     \
6234                            errnum, in_locale))
6235 #define DEBUG_STRERROR_RETURN(errstr, utf8ness)                             \
6236     DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                  \
6237                            "Strerror returned; saving a copy: '%s';"        \
6238                            " utf8ness=%d\n",                                \
6239                            get_displayable_string(errstr,                   \
6240                                                   errstr + strlen(errstr),  \
6241                                                   *utf8ness),               \
6242                            (int) *utf8ness))
6243
6244 /* On platforms that have precisely one of these categories (Windows
6245  * qualifies), these yield the correct one */
6246 #if defined(USE_LOCALE_CTYPE)
6247 #  define WHICH_LC_INDEX LC_CTYPE_INDEX_
6248 #elif defined(USE_LOCALE_MESSAGES)
6249 #  define WHICH_LC_INDEX LC_MESSAGES_INDEX_
6250 #endif
6251
6252 /*==========================================================================*/
6253 /* First set of implementations, when have strerror_l() */
6254
6255 #if defined(USE_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
6256
6257 #  if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
6258
6259 /* Here, neither category is defined: use the C locale */
6260 const char *
6261 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6262 {
6263     PERL_ARGS_ASSERT_MY_STRERROR;
6264
6265     DEBUG_STRERROR_ENTER(errnum, 0);
6266
6267     const char *errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
6268     *utf8ness = UTF8NESS_IMMATERIAL;
6269
6270     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6271
6272     SAVEFREEPV(errstr);
6273     return errstr;
6274 }
6275
6276 #  elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
6277
6278 /*--------------------------------------------------------------------------*/
6279
6280 /* Here one or the other of CTYPE or MESSAGES is defined, but not both.  If we
6281  * are not within 'use locale' scope of the only one defined, we use the C
6282  * locale; otherwise use the current locale object */
6283
6284 const char *
6285 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6286 {
6287     PERL_ARGS_ASSERT_MY_STRERROR;
6288
6289     DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
6290
6291     /* Use C if not within locale scope;  Otherwise, use current locale */
6292     const locale_t which_obj = (IN_LC(categories[WHICH_LC_INDEX]))
6293                                ? PL_C_locale_obj
6294                                : use_curlocale_scratch();
6295
6296     const char *errstr = savepv(strerror_l(errnum, which_obj));
6297     *utf8ness = get_locale_string_utf8ness_i(NULL, WHICH_LC_INDEX, errstr,
6298                                              LOCALE_UTF8NESS_UNKNOWN);
6299     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6300
6301     SAVEFREEPV(errstr);
6302     return errstr;
6303 }
6304
6305 /*--------------------------------------------------------------------------*/
6306 #  else     /* Are using both categories.  Place them in the same CODESET,
6307              * either C or the LC_MESSAGES locale */
6308
6309 const char *
6310 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6311 {
6312     PERL_ARGS_ASSERT_MY_STRERROR;
6313
6314     DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
6315
6316     const char *errstr;
6317     if (! IN_LC(LC_MESSAGES)) {    /* Use C if not within locale scope */
6318         errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
6319         *utf8ness = UTF8NESS_IMMATERIAL;
6320     }
6321     else {  /* Otherwise, use the LC_MESSAGES locale, making sure LC_CTYPE
6322                matches */
6323         locale_t cur = duplocale(use_curlocale_scratch());
6324
6325         cur = newlocale(LC_CTYPE_MASK, querylocale_c(LC_MESSAGES), cur);
6326         errstr = savepv(strerror_l(errnum, cur));
6327         *utf8ness = get_locale_string_utf8ness_i(NULL, LC_MESSAGES_INDEX_,
6328                                                errstr, LOCALE_UTF8NESS_UNKNOWN);
6329         freelocale(cur);
6330     }
6331
6332     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6333
6334     SAVEFREEPV(errstr);
6335     return errstr;
6336 }
6337 #  endif    /* Above is using strerror_l */
6338 /*==========================================================================*/
6339 #else       /* Below is not using strerror_l */
6340 #  if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
6341
6342 /* If not using using either of the categories, return plain, unadorned
6343  * strerror */
6344
6345 const char *
6346 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6347 {
6348     PERL_ARGS_ASSERT_MY_STRERROR;
6349
6350     DEBUG_STRERROR_ENTER(errnum, 0);
6351
6352     const char *errstr = savepv(Strerror(errnum));
6353     *utf8ness = UTF8NESS_IMMATERIAL;
6354
6355     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6356
6357     SAVEFREEPV(errstr);
6358     return errstr;
6359 }
6360
6361 /*--------------------------------------------------------------------------*/
6362 #  elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
6363
6364 /* Here one or the other of CTYPE or MESSAGES is defined, but not both.  If we
6365  * are not within 'use locale' scope of the only one defined, we use the C
6366  * locale; otherwise use the current locale */
6367
6368 const char *
6369 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6370 {
6371     PERL_ARGS_ASSERT_MY_STRERROR;
6372
6373     DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
6374
6375     const char *errstr;
6376     if (IN_LC(categories[WHICH_LC_INDEX])) {
6377         errstr = savepv(Strerror(errnum));
6378         *utf8ness = get_locale_string_utf8ness_i(NULL, WHICH_LC_INDEX, errstr,
6379                                                  LOCALE_UTF8NESS_UNKNOWN);
6380     }
6381     else {
6382
6383         SETLOCALE_LOCK;
6384
6385         const char * orig_locale = toggle_locale_i(WHICH_LC_INDEX, "C");
6386
6387         errstr = savepv(Strerror(errnum));
6388
6389         restore_toggled_locale_i(WHICH_LC_INDEX, orig_locale);
6390
6391         SETLOCALE_UNLOCK;
6392
6393         *utf8ness = UTF8NESS_IMMATERIAL;
6394
6395     }
6396
6397     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6398
6399     SAVEFREEPV(errstr);
6400     return errstr;
6401 }
6402
6403 /*--------------------------------------------------------------------------*/
6404 #  else
6405
6406 /* Below, have both LC_CTYPE and LC_MESSAGES.  Place them in the same CODESET,
6407  * either C or the LC_MESSAGES locale */
6408
6409 const char *
6410 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6411 {
6412     PERL_ARGS_ASSERT_MY_STRERROR;
6413
6414     DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
6415
6416     const char * desired_locale = (IN_LC(LC_MESSAGES))
6417                                   ? querylocale_c(LC_MESSAGES)
6418                                   : "C";
6419     /* XXX Can fail on z/OS */
6420
6421     SETLOCALE_LOCK;
6422
6423     const char* orig_CTYPE_locale    = toggle_locale_c(LC_CTYPE, desired_locale);
6424     const char* orig_MESSAGES_locale = toggle_locale_c(LC_MESSAGES,
6425                                                        desired_locale);
6426     const char *errstr = savepv(Strerror(errnum));
6427
6428     restore_toggled_locale_c(LC_MESSAGES, orig_MESSAGES_locale);
6429     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
6430
6431     SETLOCALE_UNLOCK;
6432
6433     *utf8ness = get_locale_string_utf8ness_i(NULL, LC_MESSAGES_INDEX_, errstr,
6434                                              LOCALE_UTF8NESS_UNKNOWN);
6435     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6436
6437     SAVEFREEPV(errstr);
6438     return errstr;
6439 }
6440
6441 /*--------------------------------------------------------------------------*/
6442 #  endif /* end of not using strerror_l() */
6443 #endif   /* end of all the my_strerror() implementations */
6444
6445 /*
6446
6447 =for apidoc switch_to_global_locale
6448
6449 This function copies the locale state of the calling thread into the program's
6450 global locale, and converts the thread to use that global locale.
6451
6452 It is intended so that Perl can safely be used with C libraries that access the
6453 global locale and which can't be converted to not access it.  Effectively, this
6454 means libraries that call C<L<setlocale(3)>> on non-Windows systems.  (For
6455 portability, it is a good idea to use it on Windows as well.)
6456
6457 A downside of using it is that it disables the services that Perl provides to
6458 hide locale gotchas from your code.  The service you most likely will miss
6459 regards the radix character (decimal point) in floating point numbers.  Code
6460 executed after this function is called can no longer just assume that this
6461 character is correct for the current circumstances.
6462
6463 To return to Perl control, and restart the gotcha prevention services, call
6464 C<L</sync_locale>>.  Behavior is undefined for any pure Perl code that executes
6465 while the switch is in effect.
6466
6467 The global locale and the per-thread locales are independent.  As long as just
6468 one thread converts to the global locale, everything works smoothly.  But if
6469 more than one does, they can easily interfere with each other, and races are
6470 likely.  On Windows systems prior to Visual Studio 15 (at which point Microsoft
6471 fixed a bug), races can occur (even if only one thread has been converted to
6472 the global locale), but only if you use the following operations:
6473
6474 =over
6475
6476 =item L<POSIX::localeconv|POSIX/localeconv>
6477
6478 =item L<I18N::Langinfo>, items C<CRNCYSTR> and C<THOUSEP>
6479
6480 =item L<perlapi/Perl_langinfo>, items C<CRNCYSTR> and C<THOUSEP>
6481
6482 =back
6483
6484 The first item is not fixable (except by upgrading to a later Visual Studio
6485 release), but it would be possible to work around the latter two items by
6486 having Perl change its algorithm for calculating these to use Windows API
6487 functions (likely C<GetNumberFormat> and C<GetCurrencyFormat>); patches
6488 welcome.
6489
6490 XS code should never call plain C<setlocale>, but should instead be converted
6491 to either call L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in
6492 for the system C<setlocale>) or use the methods given in L<perlcall> to call
6493 L<C<POSIX::setlocale>|POSIX/setlocale>.  Either one will transparently properly
6494 handle all cases of single- vs multi-thread, POSIX 2008-supported or not.
6495
6496 =cut
6497 */
6498
6499 void
6500 Perl_switch_to_global_locale(pTHX)
6501 {
6502
6503 #ifdef USE_LOCALE
6504
6505     bool perl_controls = false;
6506
6507     DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering switch_to_global; %s\n",
6508                                           get_LC_ALL_display()));
6509
6510 #  ifdef USE_THREAD_SAFE_LOCALE
6511
6512    /* In these cases, we use the system state to determine if we are in the
6513     * global locale or not. */
6514
6515 #    ifdef USE_POSIX_2008_LOCALE
6516
6517     perl_controls = LC_GLOBAL_LOCALE != uselocale((locale_t) 0);
6518
6519 #    elif defined(WIN32)
6520
6521     perl_controls = _configthreadlocale(0) == _ENABLE_PER_THREAD_LOCALE;
6522
6523 #    else
6524 #      error Unexpected Configuration
6525 #    endif
6526 #  endif
6527
6528     /* No-op if already in global */
6529     if (! perl_controls) {
6530         return;
6531     }
6532
6533 #  ifdef USE_THREAD_SAFE_LOCALE
6534 #    if defined(WIN32)
6535
6536     _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
6537
6538 #    elif defined(USE_POSIX_2008_LOCALE)
6539
6540     const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
6541
6542     /* Save each category's current state */
6543     for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
6544         curlocales[i] = querylocale_i(i);
6545     }
6546
6547     /* Switch to global */
6548     locale_t old_locale = uselocale(LC_GLOBAL_LOCALE);
6549     if (! old_locale) {
6550         locale_panic_(Perl_form(aTHX_ "Could not change to global locale"));
6551     }
6552
6553     if (old_locale != LC_GLOBAL_LOCALE && old_locale != PL_C_locale_obj) {
6554         freelocale(old_locale);
6555     }
6556
6557     /* Set the global to what was our per-thread state */
6558     POSIX_SETLOCALE_LOCK;
6559     for (unsigned int i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
6560         posix_setlocale(categories[i], curlocales[i]);
6561     }
6562     POSIX_SETLOCALE_UNLOCK;
6563
6564     for (unsigned int i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
6565         Safefree(curlocales[i]);
6566     }
6567
6568 #    else
6569 #      error Unexpected Configuration
6570 #    endif
6571 #  endif
6572 #  ifdef USE_LOCALE_NUMERIC
6573
6574     /* Switch to the underlying C numeric locale; the application is on its
6575      * own. */
6576     POSIX_SETLOCALE_LOCK;
6577     posix_setlocale(LC_NUMERIC, PL_numeric_name);
6578     POSIX_SETLOCALE_UNLOCK;
6579
6580 #  endif
6581 #endif
6582
6583 }
6584
6585 /*
6586
6587 =for apidoc sync_locale
6588
6589 This function copies the state of the program global locale into the calling
6590 thread, and converts that thread to using per-thread locales, if it wasn't
6591 already, and the platform supports them.  The LC_NUMERIC locale is toggled into
6592 the standard state (using the C locale's conventions), if not within the
6593 lexical scope of S<C<use locale>>.
6594
6595 Perl will now consider itself to have control of the locale.
6596
6597 Since unthreaded perls have only a global locale, this function is a no-op
6598 without threads.
6599
6600 This function is intended for use with C libraries that do locale manipulation.
6601 It allows Perl to accommodate the use of them.  Call this function before
6602 transferring back to Perl space so that it knows what state the C code has left
6603 things in.
6604
6605 XS code should not manipulate the locale on its own.  Instead,
6606 L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or
6607 change the locale (though changing the locale is antisocial and dangerous on
6608 multi-threaded systems that don't have multi-thread safe locale operations.
6609 (See L<perllocale/Multi-threaded operation>).
6610
6611 Using the libc L<C<setlocale(3)>> function should be avoided.  Nevertheless,
6612 certain non-Perl libraries called from XS, do call it, and their behavior may
6613 not be able to be changed.  This function, along with
6614 C<L</switch_to_global_locale>>, can be used to get seamless behavior in these
6615 circumstances, as long as only one thread is involved.
6616
6617 If the library has an option to turn off its locale manipulation, doing that is
6618 preferable to using this mechanism.  C<Gtk> is such a library.
6619
6620 The return value is a boolean: TRUE if the global locale at the time of call
6621 was in effect for the caller; and FALSE if a per-thread locale was in effect.
6622
6623 =cut
6624 */
6625
6626 bool
6627 Perl_sync_locale(pTHX)
6628 {
6629
6630 #ifndef USE_LOCALE
6631
6632     return TRUE;
6633
6634 #else
6635
6636     bool was_in_global = TRUE;
6637
6638 #  ifdef USE_THREAD_SAFE_LOCALE
6639 #    if defined(WIN32)
6640
6641     was_in_global = _configthreadlocale(_ENABLE_PER_THREAD_LOCALE)
6642                                     == _DISABLE_PER_THREAD_LOCALE;
6643
6644 #    elif defined(USE_POSIX_2008_LOCALE)
6645
6646     was_in_global = LC_GLOBAL_LOCALE == uselocale((locale_t) 0);
6647
6648 #    else
6649 #      error Unexpected Configuration
6650 #    endif
6651 #  endif    /* USE_THREAD_SAFE_LOCALE */
6652 #  ifdef LC_ALL
6653
6654     /* Use the external interface Perl_setlocale() to make sure all setup gets
6655      * done */
6656     Perl_setlocale(LC_ALL, stdized_setlocale(LC_ALL, NULL));
6657
6658 #  else
6659
6660     for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
6661         Perl_setlocale(categories[i], stdized_setlocale(categories[i], NULL);
6662     }
6663
6664 #  endif
6665
6666     return was_in_global;
6667
6668 #endif
6669
6670 }
6671
6672 #if defined(DEBUGGING) && defined(USE_LOCALE)
6673
6674 STATIC char *
6675 S_my_setlocale_debug_string_i(pTHX_
6676                               const unsigned cat_index,
6677                               const char* locale, /* Optional locale name */
6678
6679                               /* return value from setlocale() when attempting
6680                                * to set 'category' to 'locale' */
6681                               const char* retval,
6682
6683                               const line_t line)
6684 {
6685     /* Returns a pointer to a NUL-terminated string in static storage with
6686      * added text about the info passed in.  This is not thread safe and will
6687      * be overwritten by the next call, so this should be used just to
6688      * formulate a string to immediately print or savepv() on. */
6689
6690     const char * locale_quote;
6691     const char * retval_quote;
6692
6693     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
6694
6695     if (locale == NULL) {
6696         locale_quote = "";
6697         locale = "NULL";
6698     }
6699     else {
6700         locale_quote = "\"";
6701     }
6702
6703     if (retval == NULL) {
6704         retval_quote = "";
6705         retval = "NULL";
6706     }
6707     else {
6708         retval_quote = "\"";
6709     }
6710
6711 #  ifdef USE_LOCALE_THREADS
6712 #    define THREAD_FORMAT "%p:"
6713 #    define THREAD_ARGUMENT aTHX_
6714 #  else
6715 #    define THREAD_FORMAT
6716 #    define THREAD_ARGUMENT
6717 #  endif
6718
6719     return Perl_form(aTHX_
6720                      "%s:%" LINE_Tf ":" THREAD_FORMAT
6721                      " setlocale(%s[%d], %s%s%s) returned %s%s%s\n",
6722
6723                      __FILE__, line, THREAD_ARGUMENT
6724                      category_names[cat_index], categories[cat_index],
6725                      locale_quote, locale, locale_quote,
6726                      retval_quote, retval, retval_quote);
6727 }
6728
6729 #endif
6730
6731 void
6732 Perl_thread_locale_init()
6733 {
6734     /* Called from a thread on startup*/
6735
6736 #ifdef USE_THREAD_SAFE_LOCALE
6737
6738     dTHX_DEBUGGING;
6739
6740      DEBUG_L(PerlIO_printf(Perl_debug_log,
6741                            "new thread, initial locale is %s;"
6742                            " calling setlocale(LC_ALL, \"C\")\n",
6743                            get_LC_ALL_display()));
6744 #  ifdef WIN32
6745
6746     /* On Windows, make sure new thread has per-thread locales enabled */
6747     _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
6748
6749 #  endif
6750 #  if defined(LC_ALL)
6751
6752     /* This thread starts off in the C locale.  Use the full Perl_setlocale()
6753      * to make sure no ill-advised shortcuts get taken on this new thread, */
6754     Perl_setlocale(LC_ALL, "C");
6755
6756 #  else
6757
6758     for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
6759         Perl_setlocale(categories[i], "C");
6760     }
6761
6762 #  endif
6763 #endif
6764
6765 }
6766
6767 void
6768 Perl_thread_locale_term()
6769 {
6770     /* Called from a thread as it gets ready to terminate */
6771
6772 #ifdef USE_POSIX_2008_LOCALE
6773
6774     /* C starts the new thread in the global C locale.  If we are thread-safe,
6775      * we want to not be in the global locale */
6776
6777     {   /* Free up */
6778         locale_t cur_obj = uselocale(LC_GLOBAL_LOCALE);
6779         if (cur_obj != LC_GLOBAL_LOCALE && cur_obj != PL_C_locale_obj) {
6780             freelocale(cur_obj);
6781         }
6782     }
6783
6784 #endif
6785
6786 }
6787
6788 /*
6789  * ex: set ts=8 sts=4 sw=4 et:
6790  */