locale.c

   1 /*    locale.c
   2  *
   3  *    Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
   4  *    2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  *      A Elbereth Gilthoniel,
  13  *      silivren penna míriel
  14  *      o menel aglar elenath!
  15  *      Na-chaered palan-díriel
  16  *      o galadhremmin ennorath,
  17  *      Fanuilos, le linnathon
  18  *      nef aear, si nef aearon!
  19  *
  20  *     [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
  21  */
  22
  23 /* utility functions for handling locale-specific stuff like what
  24  * character represents the decimal point.
  25  *
  26  * All C programs have an underlying locale.  Perl generally doesn't pay any
  27  * attention to it except within the scope of a 'use locale'.  For most
  28  * categories, it accomplishes this by just using different operations if it is
  29  * in such scope than if not.  However, various libc functions called by Perl
  30  * are affected by the LC_NUMERIC category, so there are macros in perl.h that
  31  * are used to toggle between the current locale and the C locale depending on
  32  * the desired behavior of those functions at the moment.  And, LC_MESSAGES is
  33  * switched to the C locale for outputting the message unless within the scope
  34  * of 'use locale'.
  35  */
  36
  37 #include "EXTERN.h"
  38 #define PERL_IN_LOCALE_C
  39 #include "perl_langinfo.h"
  40 #include "perl.h"
  41
  42 #include "reentr.h"
  43
  44 /* If the environment says to, we can output debugging information during
  45  * initialization.  This is done before option parsing, and before any thread
  46  * creation, so can be a file-level static */
  47 #ifdef DEBUGGING
  48 # ifdef PERL_GLOBAL_STRUCT
  49   /* no global syms allowed */
  50 #  define debug_initialization 0
  51 #  define DEBUG_INITIALIZATION_set(v)
  52 # else
  53 static bool debug_initialization = FALSE;
  54 #  define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
  55 # endif
  56 #endif
  57
  58 #ifdef USE_LOCALE
  59
  60 /*
  61  * Standardize the locale name from a string returned by 'setlocale', possibly
  62  * modifying that string.
  63  *
  64  * The typical return value of setlocale() is either
  65  * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
  66  * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
  67  *     (the space-separated values represent the various sublocales,
  68  *      in some unspecified order).  This is not handled by this function.
  69  *
  70  * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
  71  * which is harmful for further use of the string in setlocale().  This
  72  * function removes the trailing new line and everything up through the '='
  73  *
  74  */
  75 STATIC char *
  76 S_stdize_locale(pTHX_ char *locs)
  77 {
  78     const char * const s = strchr(locs, '=');
  79     bool okay = TRUE;
  80
  81     PERL_ARGS_ASSERT_STDIZE_LOCALE;
  82
  83     if (s) {
  84         const char * const t = strchr(s, '.');
  85         okay = FALSE;
  86         if (t) {
  87             const char * const u = strchr(t, '\n');
  88             if (u && (u[1] == 0)) {
  89                 const STRLEN len = u - s;
  90                 Move(s + 1, locs, len, char);
  91                 locs[len] = 0;
  92                 okay = TRUE;
  93             }
  94         }
  95     }
  96
  97     if (!okay)
  98         Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
  99
 100     return locs;
 101 }
 102
 103 #endif
 104
 105 STATIC void
 106 S_set_numeric_radix(pTHX)
 107 {
 108 #ifdef USE_LOCALE_NUMERIC
 109 # ifdef HAS_LOCALECONV
 110     const struct lconv* const lc = localeconv();
 111
 112     if (lc && lc->decimal_point) {
 113         if (lc->decimal_point[0] == '.' && lc->decimal_point[1] == 0) {
 114             SvREFCNT_dec(PL_numeric_radix_sv);
 115             PL_numeric_radix_sv = NULL;
 116         }
 117         else {
 118             if (PL_numeric_radix_sv)
 119                 sv_setpv(PL_numeric_radix_sv, lc->decimal_point);
 120             else
 121                 PL_numeric_radix_sv = newSVpv(lc->decimal_point, 0);
 122             if (! is_utf8_invariant_string((U8 *) lc->decimal_point, 0)
 123                 && is_utf8_string((U8 *) lc->decimal_point, 0)
 124                 && _is_cur_LC_category_utf8(LC_NUMERIC))
 125             {
 126                 SvUTF8_on(PL_numeric_radix_sv);
 127             }
 128         }
 129     }
 130     else
 131         PL_numeric_radix_sv = NULL;
 132
 133 #ifdef DEBUGGING
 134     if (DEBUG_L_TEST || debug_initialization) {
 135         PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
 136                                           (PL_numeric_radix_sv)
 137                                            ? SvPVX(PL_numeric_radix_sv)
 138                                            : "NULL",
 139                                           (PL_numeric_radix_sv)
 140                                            ? cBOOL(SvUTF8(PL_numeric_radix_sv))
 141                                            : 0);
 142     }
 143 #endif
 144
 145 # endif /* HAS_LOCALECONV */
 146 #endif /* USE_LOCALE_NUMERIC */
 147 }
 148
 149 /* Is the C string input 'name' "C" or "POSIX"?  If so, and 'name' is the
 150  * return of setlocale(), then this is extremely likely to be the C or POSIX
 151  * locale.  However, the output of setlocale() is documented to be opaque, but
 152  * the odds are extremely small that it would return these two strings for some
 153  * other locale.  Note that VMS in these two locales includes many non-ASCII
 154  * characters as controls and punctuation (below are hex bytes):
 155  *   cntrl:  00-1F 7F 84-97 9B-9F
 156  *   punct:  21-2F 3A-40 5B-60 7B-7E A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
 157  * Oddly, none there are listed as alphas, though some represent alphabetics
 158  * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
 159 #define isNAME_C_OR_POSIX(name) ((name) != NULL                                 \
 160                                   && ((*(name) == 'C' && (*(name + 1)) == '\0') \
 161                                        || strEQ((name), "POSIX")))
 162
 163 void
 164 Perl_new_numeric(pTHX_ const char *newnum)
 165 {
 166 #ifdef USE_LOCALE_NUMERIC
 167
 168     /* Called after all libc setlocale() calls affecting LC_NUMERIC, to tell
 169      * core Perl this and that 'newnum' is the name of the new locale.
 170      * It installs this locale as the current underlying default.
 171      *
 172      * The default locale and the C locale can be toggled between by use of the
 173      * set_numeric_local() and set_numeric_standard() functions, which should
 174      * probably not be called directly, but only via macros like
 175      * SET_NUMERIC_STANDARD() in perl.h.
 176      *
 177      * The toggling is necessary mainly so that a non-dot radix decimal point
 178      * character can be output, while allowing internal calculations to use a
 179      * dot.
 180      *
 181      * This sets several interpreter-level variables:
 182      * PL_numeric_name  The underlying locale's name: a copy of 'newnum'
 183      * PL_numeric_local A boolean indicating if the toggled state is such
 184      *                  that the current locale is the program's underlying
 185      *                  locale
 186      * PL_numeric_standard An int indicating if the toggled state is such
 187      *                  that the current locale is the C locale.  If non-zero,
 188      *                  it is in C; if > 1, it means it may not be toggled away
 189      *                  from C.
 190      * Note that both of the last two variables can be true at the same time,
 191      * if the underlying locale is C.  (Toggling is a no-op under these
 192      * circumstances.)
 193      *
 194      * Any code changing the locale (outside this file) should use
 195      * POSIX::setlocale, which calls this function.  Therefore this function
 196      * should be called directly only from this file and from
 197      * POSIX::setlocale() */
 198
 199     char *save_newnum;
 200
 201     if (! newnum) {
 202         Safefree(PL_numeric_name);
 203         PL_numeric_name = NULL;
 204         PL_numeric_standard = TRUE;
 205         PL_numeric_local = TRUE;
 206         return;
 207     }
 208
 209     save_newnum = stdize_locale(savepv(newnum));
 210
 211     PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
 212     PL_numeric_local = TRUE;
 213
 214     if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
 215         Safefree(PL_numeric_name);
 216         PL_numeric_name = save_newnum;
 217     }
 218     else {
 219         Safefree(save_newnum);
 220     }
 221
 222     /* Keep LC_NUMERIC in the C locale.  This is for XS modules, so they don't
 223      * have to worry about the radix being a non-dot.  (Core operations that
 224      * need the underlying locale change to it temporarily). */
 225     set_numeric_standard();
 226
 227     set_numeric_radix();
 228
 229 #else
 230     PERL_UNUSED_ARG(newnum);
 231 #endif /* USE_LOCALE_NUMERIC */
 232 }
 233
 234 void
 235 Perl_set_numeric_standard(pTHX)
 236 {
 237 #ifdef USE_LOCALE_NUMERIC
 238     /* Toggle the LC_NUMERIC locale to C.  Most code should use the macros like
 239      * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly.  The
 240      * macro avoids calling this routine if toggling isn't necessary according
 241      * to our records (which could be wrong if some XS code has changed the
 242      * locale behind our back) */
 243
 244     setlocale(LC_NUMERIC, "C");
 245     PL_numeric_standard = TRUE;
 246     PL_numeric_local = isNAME_C_OR_POSIX(PL_numeric_name);
 247     set_numeric_radix();
 248 #ifdef DEBUGGING
 249     if (DEBUG_L_TEST || debug_initialization) {
 250         PerlIO_printf(Perl_debug_log,
 251                           "Underlying LC_NUMERIC locale now is C\n");
 252     }
 253 #endif
 254
 255 #endif /* USE_LOCALE_NUMERIC */
 256 }
 257
 258 void
 259 Perl_set_numeric_local(pTHX)
 260 {
 261 #ifdef USE_LOCALE_NUMERIC
 262     /* Toggle the LC_NUMERIC locale to the current underlying default.  Most
 263      * code should use the macros like SET_NUMERIC_LOCAL() in perl.h instead of
 264      * calling this directly.  The macro avoids calling this routine if
 265      * toggling isn't necessary according to our records (which could be wrong
 266      * if some XS code has changed the locale behind our back) */
 267
 268     setlocale(LC_NUMERIC, PL_numeric_name);
 269     PL_numeric_standard = isNAME_C_OR_POSIX(PL_numeric_name);
 270     PL_numeric_local = TRUE;
 271     set_numeric_radix();
 272 #ifdef DEBUGGING
 273     if (DEBUG_L_TEST || debug_initialization) {
 274         PerlIO_printf(Perl_debug_log,
 275                           "Underlying LC_NUMERIC locale now is %s\n",
 276                           PL_numeric_name);
 277     }
 278 #endif
 279
 280 #endif /* USE_LOCALE_NUMERIC */
 281 }
 282
 283 /*
 284  * Set up for a new ctype locale.
 285  */
 286 STATIC void
 287 S_new_ctype(pTHX_ const char *newctype)
 288 {
 289 #ifdef USE_LOCALE_CTYPE
 290
 291     /* Called after all libc setlocale() calls affecting LC_CTYPE, to tell
 292      * core Perl this and that 'newctype' is the name of the new locale.
 293      *
 294      * This function sets up the folding arrays for all 256 bytes, assuming
 295      * that tofold() is tolc() since fold case is not a concept in POSIX,
 296      *
 297      * Any code changing the locale (outside this file) should use
 298      * POSIX::setlocale, which calls this function.  Therefore this function
 299      * should be called directly only from this file and from
 300      * POSIX::setlocale() */
 301
 302     dVAR;
 303     UV i;
 304
 305     PERL_ARGS_ASSERT_NEW_CTYPE;
 306
 307     /* We will replace any bad locale warning with 1) nothing if the new one is
 308      * ok; or 2) a new warning for the bad new locale */
 309     if (PL_warn_locale) {
 310         SvREFCNT_dec_NN(PL_warn_locale);
 311         PL_warn_locale = NULL;
 312     }
 313
 314     PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
 315
 316     /* A UTF-8 locale gets standard rules.  But note that code still has to
 317      * handle this specially because of the three problematic code points */
 318     if (PL_in_utf8_CTYPE_locale) {
 319         Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
 320     }
 321     else {
 322         /* Assume enough space for every character being bad.  4 spaces each
 323          * for the 94 printable characters that are output like "'x' "; and 5
 324          * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
 325          * NUL */
 326         char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ];
 327
 328         /* Don't check for problems if we are suppressing the warnings */
 329         bool check_for_problems = ckWARN_d(WARN_LOCALE)
 330                                || UNLIKELY(DEBUG_L_TEST);
 331         bool multi_byte_locale = FALSE;     /* Assume is a single-byte locale
 332                                                to start */
 333         unsigned int bad_count = 0;         /* Count of bad characters */
 334
 335         for (i = 0; i < 256; i++) {
 336             if (isUPPER_LC((U8) i))
 337                 PL_fold_locale[i] = (U8) toLOWER_LC((U8) i);
 338             else if (isLOWER_LC((U8) i))
 339                 PL_fold_locale[i] = (U8) toUPPER_LC((U8) i);
 340             else
 341                 PL_fold_locale[i] = (U8) i;
 342
 343             /* If checking for locale problems, see if the native ASCII-range
 344              * printables plus \n and \t are in their expected categories in
 345              * the new locale.  If not, this could mean big trouble, upending
 346              * Perl's and most programs' assumptions, like having a
 347              * metacharacter with special meaning become a \w.  Fortunately,
 348              * it's very rare to find locales that aren't supersets of ASCII
 349              * nowadays.  It isn't a problem for most controls to be changed
 350              * into something else; we check only \n and \t, though perhaps \r
 351              * could be an issue as well. */
 352             if (check_for_problems
 353                 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
 354             {
 355                 if ((isALPHANUMERIC_A(i) && ! isALPHANUMERIC_LC(i))
 356                      || (isPUNCT_A(i) && ! isPUNCT_LC(i))
 357                      || (isBLANK_A(i) && ! isBLANK_LC(i))
 358                      || (i == '\n' && ! isCNTRL_LC(i)))
 359                 {
 360                     if (bad_count) {    /* Separate multiple entries with a
 361                                            blank */
 362                         bad_chars_list[bad_count++] = ' ';
 363                     }
 364                     bad_chars_list[bad_count++] = '\'';
 365                     if (isPRINT_A(i)) {
 366                         bad_chars_list[bad_count++] = (char) i;
 367                     }
 368                     else {
 369                         bad_chars_list[bad_count++] = '\\';
 370                         if (i == '\n') {
 371                             bad_chars_list[bad_count++] = 'n';
 372                         }
 373                         else {
 374                             assert(i == '\t');
 375                             bad_chars_list[bad_count++] = 't';
 376                         }
 377                     }
 378                     bad_chars_list[bad_count++] = '\'';
 379                     bad_chars_list[bad_count] = '\0';
 380                 }
 381             }
 382         }
 383
 384 #ifdef MB_CUR_MAX
 385         /* We only handle single-byte locales (outside of UTF-8 ones; so if
 386          * this locale requires more than one byte, there are going to be
 387          * problems. */
 388         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
 389                  "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
 390                  __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
 391
 392         if (check_for_problems && MB_CUR_MAX > 1
 393
 394                /* Some platforms return MB_CUR_MAX > 1 for even the "C"
 395                 * locale.  Just assume that the implementation for them (plus
 396                 * for POSIX) is correct and the > 1 value is spurious.  (Since
 397                 * these are specially handled to never be considered UTF-8
 398                 * locales, as long as this is the only problem, everything
 399                 * should work fine */
 400             && strNE(newctype, "C") && strNE(newctype, "POSIX"))
 401         {
 402             multi_byte_locale = TRUE;
 403         }
 404 #endif
 405
 406         if (bad_count || multi_byte_locale) {
 407             PL_warn_locale = Perl_newSVpvf(aTHX_
 408                              "Locale '%s' may not work well.%s%s%s\n",
 409                              newctype,
 410                              (multi_byte_locale)
 411                               ? "  Some characters in it are not recognized by"
 412                                 " Perl."
 413                               : "",
 414                              (bad_count)
 415                               ? "\nThe following characters (and maybe others)"
 416                                 " may not have the same meaning as the Perl"
 417                                 " program expects:\n"
 418                               : "",
 419                              (bad_count)
 420                               ? bad_chars_list
 421                               : ""
 422                             );
 423             /* If we are actually in the scope of the locale or are debugging,
 424              * output the message now.  If not in that scope, we save the
 425              * message to be output at the first operation using this locale,
 426              * if that actually happens.  Most programs don't use locales, so
 427              * they are immune to bad ones.  */
 428             if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
 429
 430                 /* We have to save 'newctype' because the setlocale() just
 431                  * below may destroy it.  The next setlocale() further down
 432                  * should restore it properly so that the intermediate change
 433                  * here is transparent to this function's caller */
 434                 const char * const badlocale = savepv(newctype);
 435
 436                 setlocale(LC_CTYPE, "C");
 437
 438                 /* The '0' below suppresses a bogus gcc compiler warning */
 439                 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
 440
 441                 setlocale(LC_CTYPE, badlocale);
 442                 Safefree(badlocale);
 443
 444                 if (IN_LC(LC_CTYPE)) {
 445                     SvREFCNT_dec_NN(PL_warn_locale);
 446                     PL_warn_locale = NULL;
 447                 }
 448             }
 449         }
 450     }
 451
 452 #endif /* USE_LOCALE_CTYPE */
 453     PERL_ARGS_ASSERT_NEW_CTYPE;
 454     PERL_UNUSED_ARG(newctype);
 455     PERL_UNUSED_CONTEXT;
 456 }
 457
 458 void
 459 Perl__warn_problematic_locale()
 460 {
 461
 462 #ifdef USE_LOCALE_CTYPE
 463
 464     dTHX;
 465
 466     /* Internal-to-core function that outputs the message in PL_warn_locale,
 467      * and then NULLS it.  Should be called only through the macro
 468      * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */
 469
 470     if (PL_warn_locale) {
 471         /*GCC_DIAG_IGNORE(-Wformat-security);   Didn't work */
 472         Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
 473                              SvPVX(PL_warn_locale),
 474                              0 /* dummy to avoid compiler warning */ );
 475         /* GCC_DIAG_RESTORE; */
 476         SvREFCNT_dec_NN(PL_warn_locale);
 477         PL_warn_locale = NULL;
 478     }
 479
 480 #endif
 481
 482 }
 483
 484 STATIC void
 485 S_new_collate(pTHX_ const char *newcoll)
 486 {
 487 #ifdef USE_LOCALE_COLLATE
 488
 489     /* Called after all libc setlocale() calls affecting LC_COLLATE, to tell
 490      * core Perl this and that 'newcoll' is the name of the new locale.
 491      *
 492      * Any code changing the locale (outside this file) should use
 493      * POSIX::setlocale, which calls this function.  Therefore this function
 494      * should be called directly only from this file and from
 495      * POSIX::setlocale().
 496      *
 497      * The design of locale collation is that every locale change is given an
 498      * index 'PL_collation_ix'.  The first time a string particpates in an
 499      * operation that requires collation while locale collation is active, it
 500      * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()).  That
 501      * magic includes the collation index, and the transformation of the string
 502      * by strxfrm(), q.v.  That transformation is used when doing comparisons,
 503      * instead of the string itself.  If a string changes, the magic is
 504      * cleared.  The next time the locale changes, the index is incremented,
 505      * and so we know during a comparison that the transformation is not
 506      * necessarily still valid, and so is recomputed.  Note that if the locale
 507      * changes enough times, the index could wrap (a U32), and it is possible
 508      * that a transformation would improperly be considered valid, leading to
 509      * an unlikely bug */
 510
 511     if (! newcoll) {
 512         if (PL_collation_name) {
 513             ++PL_collation_ix;
 514             Safefree(PL_collation_name);
 515             PL_collation_name = NULL;
 516         }
 517         PL_collation_standard = TRUE;
 518       is_standard_collation:
 519         PL_collxfrm_base = 0;
 520         PL_collxfrm_mult = 2;
 521         PL_in_utf8_COLLATE_locale = FALSE;
 522         PL_strxfrm_NUL_replacement = '\0';
 523         PL_strxfrm_max_cp = 0;
 524         return;
 525     }
 526
 527     /* If this is not the same locale as currently, set the new one up */
 528     if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
 529         ++PL_collation_ix;
 530         Safefree(PL_collation_name);
 531         PL_collation_name = stdize_locale(savepv(newcoll));
 532         PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
 533         if (PL_collation_standard) {
 534             goto is_standard_collation;
 535         }
 536
 537         PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
 538         PL_strxfrm_NUL_replacement = '\0';
 539         PL_strxfrm_max_cp = 0;
 540
 541         /* A locale collation definition includes primary, secondary, tertiary,
 542          * etc. weights for each character.  To sort, the primary weights are
 543          * used, and only if they compare equal, then the secondary weights are
 544          * used, and only if they compare equal, then the tertiary, etc.
 545          *
 546          * strxfrm() works by taking the input string, say ABC, and creating an
 547          * output transformed string consisting of first the primary weights,
 548          * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
 549          * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ ....  Some characters
 550          * may not have weights at every level.  In our example, let's say B
 551          * doesn't have a tertiary weight, and A doesn't have a secondary
 552          * weight.  The constructed string is then going to be
 553          *  A¹B¹C¹ B²C² A³C³ ....
 554          * This has the desired effect that strcmp() will look at the secondary
 555          * or tertiary weights only if the strings compare equal at all higher
 556          * priority weights.  The spaces shown here, like in
 557          *  "A¹B¹C¹ A²B²C² "
 558          * are not just for readability.  In the general case, these must
 559          * actually be bytes, which we will call here 'separator weights'; and
 560          * they must be smaller than any other weight value, but since these
 561          * are C strings, only the terminating one can be a NUL (some
 562          * implementations may include a non-NUL separator weight just before
 563          * the NUL).  Implementations tend to reserve 01 for the separator
 564          * weights.  They are needed so that a shorter string's secondary
 565          * weights won't be misconstrued as primary weights of a longer string,
 566          * etc.  By making them smaller than any other weight, the shorter
 567          * string will sort first.  (Actually, if all secondary weights are
 568          * smaller than all primary ones, there is no need for a separator
 569          * weight between those two levels, etc.)
 570          *
 571          * The length of the transformed string is roughly a linear function of
 572          * the input string.  It's not exactly linear because some characters
 573          * don't have weights at all levels.  When we call strxfrm() we have to
 574          * allocate some memory to hold the transformed string.  The
 575          * calculations below try to find coefficients 'm' and 'b' for this
 576          * locale so that m*x + b equals how much space we need, given the size
 577          * of the input string in 'x'.  If we calculate too small, we increase
 578          * the size as needed, and call strxfrm() again, but it is better to
 579          * get it right the first time to avoid wasted expensive string
 580          * transformations. */
 581
 582         {
 583             /* We use the string below to find how long the tranformation of it
 584              * is.  Almost all locales are supersets of ASCII, or at least the
 585              * ASCII letters.  We use all of them, half upper half lower,
 586              * because if we used fewer, we might hit just the ones that are
 587              * outliers in a particular locale.  Most of the strings being
 588              * collated will contain a preponderance of letters, and even if
 589              * they are above-ASCII, they are likely to have the same number of
 590              * weight levels as the ASCII ones.  It turns out that digits tend
 591              * to have fewer levels, and some punctuation has more, but those
 592              * are relatively sparse in text, and khw believes this gives a
 593              * reasonable result, but it could be changed if experience so
 594              * dictates. */
 595             const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
 596             char * x_longer;        /* Transformed 'longer' */
 597             Size_t x_len_longer;    /* Length of 'x_longer' */
 598
 599             char * x_shorter;   /* We also transform a substring of 'longer' */
 600             Size_t x_len_shorter;
 601
 602             /* _mem_collxfrm() is used get the transformation (though here we
 603              * are interested only in its length).  It is used because it has
 604              * the intelligence to handle all cases, but to work, it needs some
 605              * values of 'm' and 'b' to get it started.  For the purposes of
 606              * this calculation we use a very conservative estimate of 'm' and
 607              * 'b'.  This assumes a weight can be multiple bytes, enough to
 608              * hold any UV on the platform, and there are 5 levels, 4 weight
 609              * bytes, and a trailing NUL.  */
 610             PL_collxfrm_base = 5;
 611             PL_collxfrm_mult = 5 * sizeof(UV);
 612
 613             /* Find out how long the transformation really is */
 614             x_longer = _mem_collxfrm(longer,
 615                                      sizeof(longer) - 1,
 616                                      &x_len_longer,
 617
 618                                      /* We avoid converting to UTF-8 in the
 619                                       * called function by telling it the
 620                                       * string is in UTF-8 if the locale is a
 621                                       * UTF-8 one.  Since the string passed
 622                                       * here is invariant under UTF-8, we can
 623                                       * claim it's UTF-8 even though it isn't.
 624                                       * */
 625                                      PL_in_utf8_COLLATE_locale);
 626             Safefree(x_longer);
 627
 628             /* Find out how long the transformation of a substring of 'longer'
 629              * is.  Together the lengths of these transformations are
 630              * sufficient to calculate 'm' and 'b'.  The substring is all of
 631              * 'longer' except the first character.  This minimizes the chances
 632              * of being swayed by outliers */
 633             x_shorter = _mem_collxfrm(longer + 1,
 634                                       sizeof(longer) - 2,
 635                                       &x_len_shorter,
 636                                       PL_in_utf8_COLLATE_locale);
 637             Safefree(x_shorter);
 638
 639             /* If the results are nonsensical for this simple test, the whole
 640              * locale definition is suspect.  Mark it so that locale collation
 641              * is not active at all for it.  XXX Should we warn? */
 642             if (   x_len_shorter == 0
 643                 || x_len_longer == 0
 644                 || x_len_shorter >= x_len_longer)
 645             {
 646                 PL_collxfrm_mult = 0;
 647                 PL_collxfrm_base = 0;
 648             }
 649             else {
 650                 SSize_t base;       /* Temporary */
 651
 652                 /* We have both:    m * strlen(longer)  + b = x_len_longer
 653                  *                  m * strlen(shorter) + b = x_len_shorter;
 654                  * subtracting yields:
 655                  *          m * (strlen(longer) - strlen(shorter))
 656                  *                             = x_len_longer - x_len_shorter
 657                  * But we have set things up so that 'shorter' is 1 byte smaller
 658                  * than 'longer'.  Hence:
 659                  *          m = x_len_longer - x_len_shorter
 660                  *
 661                  * But if something went wrong, make sure the multiplier is at
 662                  * least 1.
 663                  */
 664                 if (x_len_longer > x_len_shorter) {
 665                     PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
 666                 }
 667                 else {
 668                     PL_collxfrm_mult = 1;
 669                 }
 670
 671                 /*     mx + b = len
 672                  * so:      b = len - mx
 673                  * but in case something has gone wrong, make sure it is
 674                  * non-negative */
 675                 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
 676                 if (base < 0) {
 677                     base = 0;
 678                 }
 679
 680                 /* Add 1 for the trailing NUL */
 681                 PL_collxfrm_base = base + 1;
 682             }
 683
 684 #ifdef DEBUGGING
 685             if (DEBUG_L_TEST || debug_initialization) {
 686                 PerlIO_printf(Perl_debug_log,
 687                     "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
 688                     "x_len_longer=%zu,"
 689                     " collate multipler=%zu, collate base=%zu\n",
 690                     __FILE__, __LINE__,
 691                     PL_in_utf8_COLLATE_locale,
 692                     x_len_shorter, x_len_longer,
 693                     PL_collxfrm_mult, PL_collxfrm_base);
 694             }
 695 #endif
 696         }
 697     }
 698
 699 #else
 700     PERL_UNUSED_ARG(newcoll);
 701 #endif /* USE_LOCALE_COLLATE */
 702 }
 703
 704 #ifndef WIN32 /* No wrapper except on Windows */
 705
 706 #define my_setlocale(a,b) setlocale(a,b)
 707
 708 #else   /* WIN32 */
 709
 710 STATIC char *
 711 S_my_setlocale(pTHX_ int category, const char* locale)
 712 {
 713     /* This, for Windows, emulates POSIX setlocale() behavior.  There is no
 714      * difference unless the input locale is "", which means on Windows to get
 715      * the machine default, which is set via the computer's "Regional and
 716      * Language Options" (or its current equivalent).  In POSIX, it instead
 717      * means to find the locale from the user's environment.  This routine
 718      * looks in the environment, and, if anything is found, uses that instead
 719      * of going to the machine default.  If there is no environment override,
 720      * the machine default is used, as normal, by calling the real setlocale()
 721      * with "".  The POSIX behavior is to use the LC_ALL variable if set;
 722      * otherwise to use the particular category's variable if set; otherwise to
 723      * use the LANG variable. */
 724
 725     bool override_LC_ALL = FALSE;
 726     char * result;
 727
 728     if (locale && strEQ(locale, "")) {
 729 #   ifdef LC_ALL
 730         locale = PerlEnv_getenv("LC_ALL");
 731         if (! locale) {
 732 #endif
 733             switch (category) {
 734 #   ifdef LC_ALL
 735                 case LC_ALL:
 736                     override_LC_ALL = TRUE;
 737                     break;  /* We already know its variable isn't set */
 738 #   endif
 739 #   ifdef USE_LOCALE_TIME
 740                 case LC_TIME:
 741                     locale = PerlEnv_getenv("LC_TIME");
 742                     break;
 743 #   endif
 744 #   ifdef USE_LOCALE_CTYPE
 745                 case LC_CTYPE:
 746                     locale = PerlEnv_getenv("LC_CTYPE");
 747                     break;
 748 #   endif
 749 #   ifdef USE_LOCALE_COLLATE
 750                 case LC_COLLATE:
 751                     locale = PerlEnv_getenv("LC_COLLATE");
 752                     break;
 753 #   endif
 754 #   ifdef USE_LOCALE_MONETARY
 755                 case LC_MONETARY:
 756                     locale = PerlEnv_getenv("LC_MONETARY");
 757                     break;
 758 #   endif
 759 #   ifdef USE_LOCALE_NUMERIC
 760                 case LC_NUMERIC:
 761                     locale = PerlEnv_getenv("LC_NUMERIC");
 762                     break;
 763 #   endif
 764 #   ifdef USE_LOCALE_MESSAGES
 765                 case LC_MESSAGES:
 766                     locale = PerlEnv_getenv("LC_MESSAGES");
 767                     break;
 768 #   endif
 769                 default:
 770                     /* This is a category, like PAPER_SIZE that we don't
 771                      * know about; and so can't provide a wrapper. */
 772                     break;
 773             }
 774             if (! locale) {
 775                 locale = PerlEnv_getenv("LANG");
 776                 if (! locale) {
 777                     locale = "";
 778                 }
 779             }
 780 #   ifdef LC_ALL
 781         }
 782 #   endif
 783     }
 784
 785     result = setlocale(category, locale);
 786     DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
 787                             setlocale_debug_string(category, locale, result)));
 788
 789     if (! override_LC_ALL)  {
 790         return result;
 791     }
 792
 793     /* Here the input category was LC_ALL, and we have set it to what is in the
 794      * LANG variable or the system default if there is no LANG.  But these have
 795      * lower priority than the other LC_foo variables, so override it for each
 796      * one that is set.  (If they are set to "", it means to use the same thing
 797      * we just set LC_ALL to, so can skip) */
 798 #   ifdef USE_LOCALE_TIME
 799     result = PerlEnv_getenv("LC_TIME");
 800     if (result && strNE(result, "")) {
 801         setlocale(LC_TIME, result);
 802         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
 803                     __FILE__, __LINE__,
 804                     setlocale_debug_string(LC_TIME, result, "not captured")));
 805     }
 806 #   endif
 807 #   ifdef USE_LOCALE_CTYPE
 808     result = PerlEnv_getenv("LC_CTYPE");
 809     if (result && strNE(result, "")) {
 810         setlocale(LC_CTYPE, result);
 811         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
 812                     __FILE__, __LINE__,
 813                     setlocale_debug_string(LC_CTYPE, result, "not captured")));
 814     }
 815 #   endif
 816 #   ifdef USE_LOCALE_COLLATE
 817     result = PerlEnv_getenv("LC_COLLATE");
 818     if (result && strNE(result, "")) {
 819         setlocale(LC_COLLATE, result);
 820         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
 821                   __FILE__, __LINE__,
 822                   setlocale_debug_string(LC_COLLATE, result, "not captured")));
 823     }
 824 #   endif
 825 #   ifdef USE_LOCALE_MONETARY
 826     result = PerlEnv_getenv("LC_MONETARY");
 827     if (result && strNE(result, "")) {
 828         setlocale(LC_MONETARY, result);
 829         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
 830                  __FILE__, __LINE__,
 831                  setlocale_debug_string(LC_MONETARY, result, "not captured")));
 832     }
 833 #   endif
 834 #   ifdef USE_LOCALE_NUMERIC
 835     result = PerlEnv_getenv("LC_NUMERIC");
 836     if (result && strNE(result, "")) {
 837         setlocale(LC_NUMERIC, result);
 838         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
 839                  __FILE__, __LINE__,
 840                  setlocale_debug_string(LC_NUMERIC, result, "not captured")));
 841     }
 842 #   endif
 843 #   ifdef USE_LOCALE_MESSAGES
 844     result = PerlEnv_getenv("LC_MESSAGES");
 845     if (result && strNE(result, "")) {
 846         setlocale(LC_MESSAGES, result);
 847         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
 848                  __FILE__, __LINE__,
 849                  setlocale_debug_string(LC_MESSAGES, result, "not captured")));
 850     }
 851 #   endif
 852
 853     result = setlocale(LC_ALL, NULL);
 854     DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
 855                                __FILE__, __LINE__,
 856                                setlocale_debug_string(LC_ALL, NULL, result)));
 857
 858     return result;
 859 }
 860
 861 #endif
 862
 863 char *
 864 Perl_setlocale(int category, const char * locale)
 865 {
 866     /* This wraps POSIX::setlocale() */
 867
 868     char * retval;
 869     dTHX;
 870
 871
 872 #ifdef USE_LOCALE_NUMERIC
 873
 874     /* A NULL locale means only query what the current one is.  We
 875      * have the LC_NUMERIC name saved, because we are normally switched
 876      * into the C locale for it.  Switch back so an LC_ALL query will yield
 877      * the correct results; all other categories don't require special
 878      * handling */
 879     if (locale == NULL) {
 880         if (category == LC_NUMERIC) {
 881             return savepv(PL_numeric_name);
 882         }
 883
 884 #   ifdef LC_ALL
 885
 886         else if (category == LC_ALL) {
 887             SET_NUMERIC_UNDERLYING();
 888         }
 889
 890 #   endif
 891
 892     }
 893
 894 #endif
 895
 896     retval = my_setlocale(category, locale);
 897
 898     DEBUG_L(PerlIO_printf(Perl_debug_log,
 899         "%s:%d: %s\n", __FILE__, __LINE__,
 900             setlocale_debug_string(category, locale, retval)));
 901     if (! retval) {
 902         /* Should never happen that a query would return an error, but be
 903          * sure and reset to C locale */
 904         if (locale == 0) {
 905             SET_NUMERIC_STANDARD();
 906         }
 907         return NULL;
 908     }
 909
 910     /* Save retval since subsequent setlocale() calls may overwrite it. */
 911     retval = savepv(retval);
 912
 913     /* If locale == NULL, we are just querying the state, but may have switched
 914      * to NUMERIC_UNDERLYING.  Switch back before returning. */
 915     if (locale == NULL) {
 916         SET_NUMERIC_STANDARD();
 917         return retval;
 918     }
 919     else {  /* Now that have switched locales, we have to update our records to
 920                correspond */
 921
 922 #ifdef USE_LOCALE_CTYPE
 923
 924         if (   category == LC_CTYPE
 925
 926 #    ifdef LC_ALL
 927
 928             || category == LC_ALL
 929
 930 #    endif
 931
 932             )
 933         {
 934             char *newctype;
 935
 936 #    ifdef LC_ALL
 937
 938             if (category == LC_ALL) {
 939                 newctype = setlocale(LC_CTYPE, NULL);
 940                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
 941                     "%s:%d: %s\n", __FILE__, __LINE__,
 942                     setlocale_debug_string(LC_CTYPE, NULL, newctype)));
 943             }
 944             else
 945
 946 #    endif
 947
 948                 newctype = retval;
 949             new_ctype(newctype);
 950         }
 951
 952 #endif /* USE_LOCALE_CTYPE */
 953
 954 #ifdef USE_LOCALE_COLLATE
 955
 956         if (   category == LC_COLLATE
 957
 958 #    ifdef LC_ALL
 959
 960             || category == LC_ALL
 961
 962 #    endif
 963
 964             )
 965         {
 966             char *newcoll;
 967
 968 #    ifdef LC_ALL
 969
 970             if (category == LC_ALL) {
 971                 newcoll = setlocale(LC_COLLATE, NULL);
 972                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
 973                     "%s:%d: %s\n", __FILE__, __LINE__,
 974                     setlocale_debug_string(LC_COLLATE, NULL, newcoll)));
 975             }
 976             else
 977
 978 #    endif
 979
 980                 newcoll = retval;
 981             new_collate(newcoll);
 982         }
 983
 984 #endif /* USE_LOCALE_COLLATE */
 985
 986 #ifdef USE_LOCALE_NUMERIC
 987
 988         if (   category == LC_NUMERIC
 989
 990 #    ifdef LC_ALL
 991
 992             || category == LC_ALL
 993
 994 #    endif
 995
 996             )
 997         {
 998             char *newnum;
 999
1000 #    ifdef LC_ALL
1001
1002             if (category == LC_ALL) {
1003                 newnum = setlocale(LC_NUMERIC, NULL);
1004                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1005                     "%s:%d: %s\n", __FILE__, __LINE__,
1006                     setlocale_debug_string(LC_NUMERIC, NULL, newnum)));
1007             }
1008             else
1009
1010 #    endif
1011
1012                 newnum = retval;
1013             new_numeric(newnum);
1014         }
1015
1016 #endif /* USE_LOCALE_NUMERIC */
1017
1018     }
1019
1020     return retval;
1021
1022
1023 }
1024
1025 PERL_STATIC_INLINE const char *
1026 S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset)
1027 {
1028     /* Copy the NUL-terminated 'string' to 'buf' + 'offset'.  'buf' has size 'buf_size',
1029      * growing it if necessary */
1030
1031     const Size_t string_size = strlen(string) + offset + 1;
1032
1033     PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
1034
1035     if (*buf_size == 0) {
1036         Newx(*buf, string_size, char);
1037         *buf_size = string_size;
1038     }
1039     else if (string_size > *buf_size) {
1040         Renew(*buf, string_size, char);
1041         *buf_size = string_size;
1042     }
1043
1044     Copy(string, *buf + offset, string_size - offset, char);
1045     return *buf;
1046 }
1047
1048 /*
1049
1050 =head1 Locale-related functions and macros
1051
1052 =for apidoc Perl_langinfo
1053
1054 This is an (almostª) drop-in replacement for the system C<L<nl_langinfo(3)>>,
1055 taking the same C<item> parameter values, and returning the same information.
1056 But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
1057 of Perl's locale handling from your code, and can be used on systems that lack
1058 a native C<nl_langinfo>.
1059
1060 Expanding on these:
1061
1062 =over
1063
1064 =item *
1065
1066 It delivers the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
1067 without you having to write extra code.  The reason for the extra code would be
1068 because these are from the C<LC_NUMERIC> locale category, which is normally
1069 kept set to the C locale by Perl, no matter what the underlying locale is
1070 supposed to be, and so to get the expected results, you have to temporarily
1071 toggle into the underlying locale, and later toggle back.  (You could use
1072 plain C<nl_langinfo> and C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this
1073 but then you wouldn't get the other advantages of C<Perl_langinfo()>; not
1074 keeping C<LC_NUMERIC> in the C locale would break a lot of CPAN, which is
1075 expecting the radix (decimal point) character to be a dot.)
1076
1077 =item *
1078
1079 Depending on C<item>, it works on systems that don't have C<nl_langinfo>, hence
1080 makes your code more portable.  Of the fifty-some possible items specified by
1081 the POSIX 2008 standard,
1082 L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
1083 only two are completely unimplemented.  It uses various techniques to recover
1084 the other items, including calling C<L<localeconv(3)>>, and C<L<strftime(3)>>,
1085 both of which are specified in C89, so should always be available.  Later
1086 C<strftime()> versions have additional capabilities; C<""> is returned for
1087 those not available on your system.
1088
1089 The details for those items which may differ from what this emulation returns
1090 and what a native C<nl_langinfo()> would return are:
1091
1092 =over
1093
1094 =item C<CODESET>
1095
1096 =item C<ERA>
1097
1098 Unimplemented, so returns C<"">.
1099
1100 =item C<YESEXPR>
1101
1102 =item C<NOEXPR>
1103
1104 Only the values for English are returned.  Earlier POSIX standards also
1105 specified C<YESSTR> and C<NOSTR>, but these have been removed from POSIX 2008,
1106 and aren't supported by C<Perl_langinfo>.
1107
1108 =item C<D_FMT>
1109
1110 Always evaluates to C<%x>, the locale's appropriate date representation.
1111
1112 =item C<T_FMT>
1113
1114 Always evaluates to C<%X>, the locale's appropriate time representation.
1115
1116 =item C<D_T_FMT>
1117
1118 Always evaluates to C<%c>, the locale's appropriate date and time
1119 representation.
1120
1121 =item C<CRNCYSTR>
1122
1123 The return may be incorrect for those rare locales where the currency symbol
1124 replaces the radix character.
1125 Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1126 to work differently.
1127
1128 =item C<ALT_DIGITS>
1129
1130 Currently this gives the same results as Linux does.
1131 Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1132 to work differently.
1133
1134 =item C<ERA_D_FMT>
1135
1136 =item C<ERA_T_FMT>
1137
1138 =item C<ERA_D_T_FMT>
1139
1140 =item C<T_FMT_AMPM>
1141
1142 These are derived by using C<strftime()>, and not all versions of that function
1143 know about them.  C<""> is returned for these on such systems.
1144
1145 =back
1146
1147 When using C<Perl_langinfo> on systems that don't have a native
1148 C<nl_langinfo()>, you must
1149
1150  #include "perl_langinfo.h"
1151
1152 before the C<perl.h> C<#include>.  You can replace your C<langinfo.h>
1153 C<#include> with this one.  (Doing it this way keeps out the symbols that plain
1154 C<langinfo.h> imports into the namespace for code that doesn't need it.)
1155
1156 You also should not use the bare C<langinfo.h> item names, but should preface
1157 them with C<PERL_>, so use C<PERL_RADIXCHAR> instead of plain C<RADIXCHAR>.
1158 The C<PERL_I<foo>> versions will also work for this function on systems that do
1159 have a native C<nl_langinfo>.
1160
1161 =item *
1162
1163 It is thread-friendly, returning its result in a buffer that won't be
1164 overwritten by another thread, so you don't have to code for that possibility.
1165 The buffer can be overwritten by the next call to C<nl_langinfo> or
1166 C<Perl_langinfo> in the same thread.
1167
1168 =item *
1169
1170 ªIt returns S<C<const char *>>, whereas plain C<nl_langinfo()> returns S<C<char
1171 *>>, but you are (only by documentation) forbidden to write into the buffer.
1172 By declaring this C<const>, the compiler enforces this restriction.  The extra
1173 C<const> is why this isn't an unequivocal drop-in replacement for
1174 C<nl_langinfo>.
1175
1176 =back
1177
1178 The original impetus for C<Perl_langinfo()> was so that code that needs to
1179 find out the current currency symbol, floating point radix character, or digit
1180 grouping separator can use, on all systems, the simpler and more
1181 thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
1182 pain to make thread-friendly.  For other fields returned by C<localeconv>, it
1183 is better to use the methods given in L<perlcall> to call
1184 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
1185
1186 =cut
1187
1188 */
1189
1190 const char *
1191 #ifdef HAS_NL_LANGINFO
1192 Perl_langinfo(const nl_item item)
1193 #else
1194 Perl_langinfo(const int item)
1195 #endif
1196 {
1197     bool toggle = TRUE;
1198
1199 #if defined(HAS_NL_LANGINFO)
1200 #  if ! defined(USE_ITHREADS)
1201
1202     /* Single-thread, and nl_langinfo() is available.  Call it, switching to
1203      * underlying LC_NUMERIC for those items dependent on it */
1204
1205     const char * retval;
1206
1207     if (toggle) {
1208         if (item == PERL_RADIXCHAR || item == PERL_THOUSEP) {
1209             setlocale(LC_NUMERIC, PL_numeric_name);
1210         }
1211         else {
1212             toggle = FALSE;
1213         }
1214     }
1215
1216     retval = nl_langinfo(item);
1217
1218     if (toggle) {
1219         setlocale(LC_NUMERIC, "C");
1220     }
1221
1222     return retval;
1223
1224
1225 #  else
1226
1227     /* Multi-threaded, with native nl_langinfo().  Use it, copying result to
1228      * per-thread buffer, and toggling LC_NUMERIC if necessary, all within a
1229      * critical section */
1230
1231     dTHX;
1232
1233     LOCALE_LOCK;
1234
1235     if (toggle) {
1236         if (item == PERL_RADIXCHAR || item == PERL_THOUSEP) {
1237             setlocale(LC_NUMERIC, PL_numeric_name);
1238         }
1239         else {
1240             toggle = FALSE;
1241         }
1242     }
1243
1244     save_to_buffer(nl_langinfo(item), &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1245
1246     if (toggle) {
1247         setlocale(LC_NUMERIC, "C");
1248     }
1249
1250     LOCALE_UNLOCK;
1251
1252     return PL_langinfo_buf;
1253
1254 #  endif
1255 #else   /* Below, emulate nl_langinfo as best we can */
1256
1257     dTHX;
1258
1259 #  ifdef HAS_LOCALECONV
1260
1261     const struct lconv* lc;
1262
1263 #  endif
1264 #  ifdef HAS_STRFTIME
1265
1266     struct tm tm;
1267     bool return_format = FALSE; /* Return the %format, not the value */
1268     const char * format;
1269
1270 #  endif
1271
1272     /* We copy the results to a per-thread buffer, even if not multi-threaded.
1273      * This is in part to simplify this code, and partly because we need a
1274      * buffer anyway for strftime(), and partly because a call of localeconv()
1275      * could otherwise wipe out the buffer, and the programmer would not be
1276      * expecting this, as this is a nl_langinfo() substitute after all, so s/he
1277      * might be thinking their localeconv() is safe until another localeconv()
1278      * call. */
1279
1280     switch (item) {
1281         Size_t len;
1282         const char * retval;
1283
1284         /* These 2 are unimplemented */
1285         case PERL_CODESET:
1286         case PERL_ERA:          /* For use with strftime() %E modifier */
1287
1288         default:
1289             return "";
1290
1291         /* We use only an English set, since we don't know any more */
1292         case PERL_YESEXPR:   return "^[+1yY]";
1293         case PERL_NOEXPR:    return "^[-0nN]";
1294
1295 #  ifdef HAS_LOCALECONV
1296
1297         case PERL_CRNCYSTR:
1298
1299             LOCALE_LOCK;
1300
1301             lc = localeconv();
1302             if (! lc || ! lc->currency_symbol || strEQ("", lc->currency_symbol))
1303             {
1304                 LOCALE_UNLOCK;
1305                 return "";
1306             }
1307
1308             /* Leave the first spot empty to be filled in below */
1309             save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
1310                            &PL_langinfo_bufsize, 1);
1311             if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
1312             { /*  khw couldn't figure out how the localedef specifications
1313                   would show that the $ should replace the radix; this is
1314                   just a guess as to how it might work.*/
1315                 *PL_langinfo_buf = '.';
1316             }
1317             else if (lc->p_cs_precedes) {
1318                 *PL_langinfo_buf = '-';
1319             }
1320             else {
1321                 *PL_langinfo_buf = '+';
1322             }
1323
1324             LOCALE_UNLOCK;
1325             break;
1326
1327         case PERL_RADIXCHAR:
1328         case PERL_THOUSEP:
1329
1330             LOCALE_LOCK;
1331
1332             if (toggle) {
1333                 setlocale(LC_NUMERIC, PL_numeric_name);
1334             }
1335
1336             lc = localeconv();
1337             if (! lc) {
1338                 retval = "";
1339             }
1340             else switch (item) {
1341                 case PERL_RADIXCHAR:
1342                     if (! lc->decimal_point) {
1343                         retval = "";
1344                     }
1345                     else {
1346                         retval = lc->decimal_point;
1347                     }
1348                     break;
1349
1350                 case PERL_THOUSEP:
1351                     if (! lc->thousands_sep || strEQ("", lc->thousands_sep)) {
1352                         retval = "";
1353                     }
1354                     else {
1355                         retval = lc->thousands_sep;
1356                     }
1357                     break;
1358
1359                 default:
1360                     LOCALE_UNLOCK;
1361                     Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
1362                                             __FILE__, __LINE__, item);
1363             }
1364
1365             save_to_buffer(retval, &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1366
1367             if (toggle) {
1368                 setlocale(LC_NUMERIC, "C");
1369             }
1370
1371             LOCALE_UNLOCK;
1372
1373             break;
1374
1375 #  endif
1376 #  ifdef HAS_STRFTIME
1377
1378         /* These are defined by C89, so we assume that strftime supports them,
1379          * and so are returned unconditionally; they may not be what the locale
1380          * actually says, but should give good enough results for someone using
1381          * them as formats (as opposed to trying to parse them to figure out
1382          * what the locale says).  The other format ones are actually tested to
1383          * verify they work on the platform */
1384         case PERL_D_FMT:         return "%x";
1385         case PERL_T_FMT:         return "%X";
1386         case PERL_D_T_FMT:       return "%c";
1387
1388         /* These formats are only available in later versions of strftime() */
1389         case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT:
1390         case PERL_T_FMT_AMPM:
1391
1392         /* The rest can be gotten from most versions of strftime(). */
1393         case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3:
1394         case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6:
1395         case PERL_ABDAY_7:
1396         case PERL_ALT_DIGITS:
1397         case PERL_AM_STR: case PERL_PM_STR:
1398         case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3:
1399         case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6:
1400         case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9:
1401         case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12:
1402         case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4:
1403         case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7:
1404         case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4:
1405         case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8:
1406         case PERL_MON_9: case PERL_MON_10: case PERL_MON_11: case PERL_MON_12:
1407
1408             LOCALE_LOCK;
1409
1410             init_tm(&tm);   /* Precaution against core dumps */
1411             tm.tm_sec = 30;
1412             tm.tm_min = 30;
1413             tm.tm_hour = 6;
1414             tm.tm_year = 2017 - 1900;
1415             tm.tm_wday = 0;
1416             tm.tm_mon = 0;
1417             switch (item) {
1418                 default:
1419                     LOCALE_UNLOCK;
1420                     Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
1421                                              __FILE__, __LINE__, item);
1422                     NOT_REACHED; /* NOTREACHED */
1423
1424                 case PERL_PM_STR: tm.tm_hour = 18;
1425                 case PERL_AM_STR:
1426                     format = "%p";
1427                     break;
1428
1429                 case PERL_ABDAY_7: tm.tm_wday++;
1430                 case PERL_ABDAY_6: tm.tm_wday++;
1431                 case PERL_ABDAY_5: tm.tm_wday++;
1432                 case PERL_ABDAY_4: tm.tm_wday++;
1433                 case PERL_ABDAY_3: tm.tm_wday++;
1434                 case PERL_ABDAY_2: tm.tm_wday++;
1435                 case PERL_ABDAY_1:
1436                     format = "%a";
1437                     break;
1438
1439                 case PERL_DAY_7: tm.tm_wday++;
1440                 case PERL_DAY_6: tm.tm_wday++;
1441                 case PERL_DAY_5: tm.tm_wday++;
1442                 case PERL_DAY_4: tm.tm_wday++;
1443                 case PERL_DAY_3: tm.tm_wday++;
1444                 case PERL_DAY_2: tm.tm_wday++;
1445                 case PERL_DAY_1:
1446                     format = "%A";
1447                     break;
1448
1449                 case PERL_ABMON_12: tm.tm_mon++;
1450                 case PERL_ABMON_11: tm.tm_mon++;
1451                 case PERL_ABMON_10: tm.tm_mon++;
1452                 case PERL_ABMON_9: tm.tm_mon++;
1453                 case PERL_ABMON_8: tm.tm_mon++;
1454                 case PERL_ABMON_7: tm.tm_mon++;
1455                 case PERL_ABMON_6: tm.tm_mon++;
1456                 case PERL_ABMON_5: tm.tm_mon++;
1457                 case PERL_ABMON_4: tm.tm_mon++;
1458                 case PERL_ABMON_3: tm.tm_mon++;
1459                 case PERL_ABMON_2: tm.tm_mon++;
1460                 case PERL_ABMON_1:
1461                     format = "%b";
1462                     break;
1463
1464                 case PERL_MON_12: tm.tm_mon++;
1465                 case PERL_MON_11: tm.tm_mon++;
1466                 case PERL_MON_10: tm.tm_mon++;
1467                 case PERL_MON_9: tm.tm_mon++;
1468                 case PERL_MON_8: tm.tm_mon++;
1469                 case PERL_MON_7: tm.tm_mon++;
1470                 case PERL_MON_6: tm.tm_mon++;
1471                 case PERL_MON_5: tm.tm_mon++;
1472                 case PERL_MON_4: tm.tm_mon++;
1473                 case PERL_MON_3: tm.tm_mon++;
1474                 case PERL_MON_2: tm.tm_mon++;
1475                 case PERL_MON_1:
1476                     format = "%B";
1477                     break;
1478
1479                 case PERL_T_FMT_AMPM:
1480                     format = "%r";
1481                     return_format = TRUE;
1482                     break;
1483
1484                 case PERL_ERA_D_FMT:
1485                     format = "%Ex";
1486                     return_format = TRUE;
1487                     break;
1488
1489                 case PERL_ERA_T_FMT:
1490                     format = "%EX";
1491                     return_format = TRUE;
1492                     break;
1493
1494                 case PERL_ERA_D_T_FMT:
1495                     format = "%Ec";
1496                     return_format = TRUE;
1497                     break;
1498
1499                 case PERL_ALT_DIGITS:
1500                     tm.tm_wday = 0;
1501                     format = "%Ow";     /* Find the alternate digit for 0 */
1502                     break;
1503             }
1504
1505             /* We can't use my_strftime() because it doesn't look at tm_wday  */
1506             while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
1507                                  format, &tm))
1508             {
1509                 /* A zero return means one of:
1510                  *  a)  there wasn't enough space in PL_langinfo_buf
1511                  *  b)  the format, like a plain %p, returns empty
1512                  *  c)  it was an illegal format, though some implementations of
1513                  *      strftime will just return the illegal format as a plain
1514                  *      character sequence.
1515                  *
1516                  *  To quickly test for case 'b)', try again but precede the
1517                  *  format with a plain character.  If that result is still
1518                  *  empty, the problem is either 'a)' or 'c)' */
1519
1520                 Size_t format_size = strlen(format) + 1;
1521                 Size_t mod_size = format_size + 1;
1522                 char * mod_format;
1523                 char * temp_result;
1524
1525                 Newx(mod_format, mod_size, char);
1526                 Newx(temp_result, PL_langinfo_bufsize, char);
1527                 *mod_format = '\a';
1528                 my_strlcpy(mod_format + 1, format, mod_size);
1529                 len = strftime(temp_result,
1530                                PL_langinfo_bufsize,
1531                                mod_format, &tm);
1532                 Safefree(mod_format);
1533                 Safefree(temp_result);
1534
1535                 /* If 'len' is non-zero, it means that we had a case like %p
1536                  * which means the current locale doesn't use a.m. or p.m., and
1537                  * that is valid */
1538                 if (len == 0) {
1539
1540                     /* Here, still didn't work.  If we get well beyond a
1541                      * reasonable size, bail out to prevent an infinite loop. */
1542
1543                     if (PL_langinfo_bufsize > 100 * format_size) {
1544                         *PL_langinfo_buf = '\0';
1545                     }
1546                     else { /* Double the buffer size to retry;  Add 1 in case
1547                               original was 0, so we aren't stuck at 0. */
1548                         PL_langinfo_bufsize *= 2;
1549                         PL_langinfo_bufsize++;
1550                         Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
1551                         continue;
1552                     }
1553                 }
1554
1555                 break;
1556             }
1557
1558             /* Here, we got a result.
1559              *
1560              * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
1561              * alternate format for wday 0.  If the value is the same as the
1562              * normal 0, there isn't an alternate, so clear the buffer. */
1563             if (   item == PERL_ALT_DIGITS
1564                 && strEQ(PL_langinfo_buf, "0"))
1565             {
1566                 *PL_langinfo_buf = '\0';
1567             }
1568
1569             /* ALT_DIGITS is problematic.  Experiments on it showed that
1570              * strftime() did not always work properly when going from alt-9 to
1571              * alt-10.  Only a few locales have this item defined, and in all
1572              * of them on Linux that khw was able to find, nl_langinfo() merely
1573              * returned the alt-0 character, possibly doubled.  Most Unicode
1574              * digits are in blocks of 10 consecutive code points, so that is
1575              * sufficient information for those scripts, as we can infer alt-1,
1576              * alt-2, ....  But for a Japanese locale, a CJK ideographic 0 is
1577              * returned, and the CJK digits are not in code point order, so you
1578              * can't really infer anything.  The localedef for this locale did
1579              * specify the succeeding digits, so that strftime() works properly
1580              * on them, without needing to infer anything.  But the
1581              * nl_langinfo() return did not give sufficient information for the
1582              * caller to understand what's going on.  So until there is
1583              * evidence that it should work differently, this returns the alt-0
1584              * string for ALT_DIGITS.
1585              *
1586              * wday was chosen because its range is all a single digit.  Things
1587              * like tm_sec have two digits as the minimum: '00' */
1588
1589             LOCALE_UNLOCK;
1590
1591             /* If to return the format, not the value, overwrite the buffer
1592              * with it.  But some strftime()s will keep the original format if
1593              * illegal, so change those to "" */
1594             if (return_format) {
1595                 if (strEQ(PL_langinfo_buf, format)) {
1596                     *PL_langinfo_buf = '\0';
1597                 }
1598                 else {
1599                     save_to_buffer(format, &PL_langinfo_buf,
1600                                     &PL_langinfo_bufsize, 0);
1601                 }
1602             }
1603
1604             break;
1605
1606 #  endif
1607
1608     }
1609
1610     return PL_langinfo_buf;
1611
1612 #endif
1613
1614 }
1615
1616 /*
1617  * Initialize locale awareness.
1618  */
1619 int
1620 Perl_init_i18nl10n(pTHX_ int printwarn)
1621 {
1622     /* printwarn is
1623      *
1624      *    0 if not to output warning when setup locale is bad
1625      *    1 if to output warning based on value of PERL_BADLANG
1626      *    >1 if to output regardless of PERL_BADLANG
1627      *
1628      * returns
1629      *    1 = set ok or not applicable,
1630      *    0 = fallback to a locale of lower priority
1631      *   -1 = fallback to all locales failed, not even to the C locale
1632      *
1633      * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
1634      * set, debugging information is output.
1635      *
1636      * This looks more complicated than it is, mainly due to the #ifdefs.
1637      *
1638      * We try to set LC_ALL to the value determined by the environment.  If
1639      * there is no LC_ALL on this platform, we try the individual categories we
1640      * know about.  If this works, we are done.
1641      *
1642      * But if it doesn't work, we have to do something else.  We search the
1643      * environment variables ourselves instead of relying on the system to do
1644      * it.  We look at, in order, LC_ALL, LANG, a system default locale (if we
1645      * think there is one), and the ultimate fallback "C".  This is all done in
1646      * the same loop as above to avoid duplicating code, but it makes things
1647      * more complex.  After the original failure, we add the fallback
1648      * possibilities to the list of locales to try, and iterate the loop
1649      * through them all until one succeeds.
1650      *
1651      * On Ultrix, the locale MUST come from the environment, so there is
1652      * preliminary code to set it.  I (khw) am not sure that it is necessary,
1653      * and that this couldn't be folded into the loop, but barring any real
1654      * platforms to test on, it's staying as-is
1655      *
1656      * A slight complication is that in embedded Perls, the locale may already
1657      * be set-up, and we don't want to get it from the normal environment
1658      * variables.  This is handled by having a special environment variable
1659      * indicate we're in this situation.  We simply set setlocale's 2nd
1660      * parameter to be a NULL instead of "".  That indicates to setlocale that
1661      * it is not to change anything, but to return the current value,
1662      * effectively initializing perl's db to what the locale already is.
1663      *
1664      * We play the same trick with NULL if a LC_ALL succeeds.  We call
1665      * setlocale() on the individual categores with NULL to get their existing
1666      * values for our db, instead of trying to change them.
1667      * */
1668
1669     int ok = 1;
1670
1671 #if defined(USE_LOCALE)
1672 #ifdef USE_LOCALE_CTYPE
1673     char *curctype   = NULL;
1674 #endif /* USE_LOCALE_CTYPE */
1675 #ifdef USE_LOCALE_COLLATE
1676     char *curcoll    = NULL;
1677 #endif /* USE_LOCALE_COLLATE */
1678 #ifdef USE_LOCALE_NUMERIC
1679     char *curnum     = NULL;
1680 #endif /* USE_LOCALE_NUMERIC */
1681 #ifdef __GLIBC__
1682     const char * const language   = savepv(PerlEnv_getenv("LANGUAGE"));
1683 #endif
1684
1685     /* NULL uses the existing already set up locale */
1686     const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
1687                                         ? NULL
1688                                         : "";
1689     const char* trial_locales[5];   /* 5 = 1 each for "", LC_ALL, LANG, "", C */
1690     unsigned int trial_locales_count;
1691     const char * const lc_all     = savepv(PerlEnv_getenv("LC_ALL"));
1692     const char * const lang       = savepv(PerlEnv_getenv("LANG"));
1693     bool setlocale_failure = FALSE;
1694     unsigned int i;
1695     char *p;
1696
1697     /* A later getenv() could zap this, so only use here */
1698     const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
1699
1700     const bool locwarn = (printwarn > 1
1701                           || (printwarn
1702                               && (! bad_lang_use_once
1703                                   || (
1704                                     /* disallow with "" or "0" */
1705                                     *bad_lang_use_once
1706                                     && strNE("0", bad_lang_use_once)))));
1707     bool done = FALSE;
1708     char * sl_result;   /* return from setlocale() */
1709     char * locale_param;
1710 #ifdef WIN32
1711     /* In some systems you can find out the system default locale
1712      * and use that as the fallback locale. */
1713 #   define SYSTEM_DEFAULT_LOCALE
1714 #endif
1715 #ifdef SYSTEM_DEFAULT_LOCALE
1716     const char *system_default_locale = NULL;
1717 #endif
1718
1719 #ifdef DEBUGGING
1720     DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
1721 #   define DEBUG_LOCALE_INIT(category, locale, result)                      \
1722         STMT_START {                                                        \
1723                 if (debug_initialization) {                                 \
1724                     PerlIO_printf(Perl_debug_log,                           \
1725                                   "%s:%d: %s\n",                            \
1726                                   __FILE__, __LINE__,                       \
1727                                   setlocale_debug_string(category,          \
1728                                                           locale,           \
1729                                                           result));         \
1730                 }                                                           \
1731         } STMT_END
1732 #else
1733 #   define DEBUG_LOCALE_INIT(a,b,c)
1734 #endif
1735
1736 #ifndef LOCALE_ENVIRON_REQUIRED
1737     PERL_UNUSED_VAR(done);
1738     PERL_UNUSED_VAR(locale_param);
1739 #else
1740
1741     /*
1742      * Ultrix setlocale(..., "") fails if there are no environment
1743      * variables from which to get a locale name.
1744      */
1745
1746 #   ifdef LC_ALL
1747     if (lang) {
1748         sl_result = my_setlocale(LC_ALL, setlocale_init);
1749         DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result);
1750         if (sl_result)
1751             done = TRUE;
1752         else
1753             setlocale_failure = TRUE;
1754     }
1755     if (! setlocale_failure) {
1756 #       ifdef USE_LOCALE_CTYPE
1757         locale_param = (! done && (lang || PerlEnv_getenv("LC_CTYPE")))
1758                        ? setlocale_init
1759                        : NULL;
1760         curctype = my_setlocale(LC_CTYPE, locale_param);
1761         DEBUG_LOCALE_INIT(LC_CTYPE, locale_param, sl_result);
1762         if (! curctype)
1763             setlocale_failure = TRUE;
1764         else
1765             curctype = savepv(curctype);
1766 #       endif /* USE_LOCALE_CTYPE */
1767 #       ifdef USE_LOCALE_COLLATE
1768         locale_param = (! done && (lang || PerlEnv_getenv("LC_COLLATE")))
1769                        ? setlocale_init
1770                        : NULL;
1771         curcoll = my_setlocale(LC_COLLATE, locale_param);
1772         DEBUG_LOCALE_INIT(LC_COLLATE, locale_param, sl_result);
1773         if (! curcoll)
1774             setlocale_failure = TRUE;
1775         else
1776             curcoll = savepv(curcoll);
1777 #       endif /* USE_LOCALE_COLLATE */
1778 #       ifdef USE_LOCALE_NUMERIC
1779         locale_param = (! done && (lang || PerlEnv_getenv("LC_NUMERIC")))
1780                        ? setlocale_init
1781                        : NULL;
1782         curnum = my_setlocale(LC_NUMERIC, locale_param);
1783         DEBUG_LOCALE_INIT(LC_NUMERIC, locale_param, sl_result);
1784         if (! curnum)
1785             setlocale_failure = TRUE;
1786         else
1787             curnum = savepv(curnum);
1788 #       endif /* USE_LOCALE_NUMERIC */
1789 #       ifdef USE_LOCALE_MESSAGES
1790         locale_param = (! done && (lang || PerlEnv_getenv("LC_MESSAGES")))
1791                        ? setlocale_init
1792                        : NULL;
1793         sl_result = my_setlocale(LC_MESSAGES, locale_param);
1794         DEBUG_LOCALE_INIT(LC_MESSAGES, locale_param, sl_result);
1795         if (! sl_result) {
1796             setlocale_failure = TRUE;
1797         }
1798 #       endif /* USE_LOCALE_MESSAGES */
1799 #       ifdef USE_LOCALE_MONETARY
1800         locale_param = (! done && (lang || PerlEnv_getenv("LC_MONETARY")))
1801                        ? setlocale_init
1802                        : NULL;
1803         sl_result = my_setlocale(LC_MONETARY, locale_param);
1804         DEBUG_LOCALE_INIT(LC_MONETARY, locale_param, sl_result);
1805         if (! sl_result) {
1806             setlocale_failure = TRUE;
1807         }
1808 #       endif /* USE_LOCALE_MONETARY */
1809     }
1810
1811 #   endif /* LC_ALL */
1812
1813 #endif /* !LOCALE_ENVIRON_REQUIRED */
1814
1815     /* We try each locale in the list until we get one that works, or exhaust
1816      * the list.  Normally the loop is executed just once.  But if setting the
1817      * locale fails, inside the loop we add fallback trials to the array and so
1818      * will execute the loop multiple times */
1819     trial_locales[0] = setlocale_init;
1820     trial_locales_count = 1;
1821     for (i= 0; i < trial_locales_count; i++) {
1822         const char * trial_locale = trial_locales[i];
1823
1824         if (i > 0) {
1825
1826             /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
1827              * when i==0, but I (khw) don't think that behavior makes much
1828              * sense */
1829             setlocale_failure = FALSE;
1830
1831 #ifdef SYSTEM_DEFAULT_LOCALE
1832 #  ifdef WIN32
1833             /* On Windows machines, an entry of "" after the 0th means to use
1834              * the system default locale, which we now proceed to get. */
1835             if (strEQ(trial_locale, "")) {
1836                 unsigned int j;
1837
1838                 /* Note that this may change the locale, but we are going to do
1839                  * that anyway just below */
1840                 system_default_locale = setlocale(LC_ALL, "");
1841                 DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
1842
1843                 /* Skip if invalid or it's already on the list of locales to
1844                  * try */
1845                 if (! system_default_locale) {
1846                     goto next_iteration;
1847                 }
1848                 for (j = 0; j < trial_locales_count; j++) {
1849                     if (strEQ(system_default_locale, trial_locales[j])) {
1850                         goto next_iteration;
1851                     }
1852                 }
1853
1854                 trial_locale = system_default_locale;
1855             }
1856 #  endif /* WIN32 */
1857 #endif /* SYSTEM_DEFAULT_LOCALE */
1858         }
1859
1860 #ifdef LC_ALL
1861         sl_result = my_setlocale(LC_ALL, trial_locale);
1862         DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result);
1863         if (! sl_result) {
1864             setlocale_failure = TRUE;
1865         }
1866         else {
1867             /* Since LC_ALL succeeded, it should have changed all the other
1868              * categories it can to its value; so we massage things so that the
1869              * setlocales below just return their category's current values.
1870              * This adequately handles the case in NetBSD where LC_COLLATE may
1871              * not be defined for a locale, and setting it individually will
1872              * fail, whereas setting LC_ALL suceeds, leaving LC_COLLATE set to
1873              * the POSIX locale. */
1874             trial_locale = NULL;
1875         }
1876 #endif /* LC_ALL */
1877
1878         if (!setlocale_failure) {
1879 #ifdef USE_LOCALE_CTYPE
1880             Safefree(curctype);
1881             curctype = my_setlocale(LC_CTYPE, trial_locale);
1882             DEBUG_LOCALE_INIT(LC_CTYPE, trial_locale, curctype);
1883             if (! curctype)
1884                 setlocale_failure = TRUE;
1885             else
1886                 curctype = savepv(curctype);
1887 #endif /* USE_LOCALE_CTYPE */
1888 #ifdef USE_LOCALE_COLLATE
1889             Safefree(curcoll);
1890             curcoll = my_setlocale(LC_COLLATE, trial_locale);
1891             DEBUG_LOCALE_INIT(LC_COLLATE, trial_locale, curcoll);
1892             if (! curcoll)
1893                 setlocale_failure = TRUE;
1894             else
1895                 curcoll = savepv(curcoll);
1896 #endif /* USE_LOCALE_COLLATE */
1897 #ifdef USE_LOCALE_NUMERIC
1898             Safefree(curnum);
1899             curnum = my_setlocale(LC_NUMERIC, trial_locale);
1900             DEBUG_LOCALE_INIT(LC_NUMERIC, trial_locale, curnum);
1901             if (! curnum)
1902                 setlocale_failure = TRUE;
1903             else
1904                 curnum = savepv(curnum);
1905 #endif /* USE_LOCALE_NUMERIC */
1906 #ifdef USE_LOCALE_MESSAGES
1907             sl_result = my_setlocale(LC_MESSAGES, trial_locale);
1908             DEBUG_LOCALE_INIT(LC_MESSAGES, trial_locale, sl_result);
1909             if (! (sl_result))
1910                 setlocale_failure = TRUE;
1911 #endif /* USE_LOCALE_MESSAGES */
1912 #ifdef USE_LOCALE_MONETARY
1913             sl_result = my_setlocale(LC_MONETARY, trial_locale);
1914             DEBUG_LOCALE_INIT(LC_MONETARY, trial_locale, sl_result);
1915             if (! (sl_result))
1916                 setlocale_failure = TRUE;
1917 #endif /* USE_LOCALE_MONETARY */
1918
1919             if (! setlocale_failure) {  /* Success */
1920                 break;
1921             }
1922         }
1923
1924         /* Here, something failed; will need to try a fallback. */
1925         ok = 0;
1926
1927         if (i == 0) {
1928             unsigned int j;
1929
1930             if (locwarn) { /* Output failure info only on the first one */
1931 #ifdef LC_ALL
1932
1933                 PerlIO_printf(Perl_error_log,
1934                 "perl: warning: Setting locale failed.\n");
1935
1936 #else /* !LC_ALL */
1937
1938                 PerlIO_printf(Perl_error_log,
1939                 "perl: warning: Setting locale failed for the categories:\n\t");
1940 #  ifdef USE_LOCALE_CTYPE
1941                 if (! curctype)
1942                     PerlIO_printf(Perl_error_log, "LC_CTYPE ");
1943 #  endif /* USE_LOCALE_CTYPE */
1944 #  ifdef USE_LOCALE_COLLATE
1945                 if (! curcoll)
1946                     PerlIO_printf(Perl_error_log, "LC_COLLATE ");
1947 #  endif /* USE_LOCALE_COLLATE */
1948 #  ifdef USE_LOCALE_NUMERIC
1949                 if (! curnum)
1950                     PerlIO_printf(Perl_error_log, "LC_NUMERIC ");
1951 #  endif /* USE_LOCALE_NUMERIC */
1952                 PerlIO_printf(Perl_error_log, "and possibly others\n");
1953
1954 #endif /* LC_ALL */
1955
1956                 PerlIO_printf(Perl_error_log,
1957                     "perl: warning: Please check that your locale settings:\n");
1958
1959 #ifdef __GLIBC__
1960                 PerlIO_printf(Perl_error_log,
1961                             "\tLANGUAGE = %c%s%c,\n",
1962                             language ? '"' : '(',
1963                             language ? language : "unset",
1964                             language ? '"' : ')');
1965 #endif
1966
1967                 PerlIO_printf(Perl_error_log,
1968                             "\tLC_ALL = %c%s%c,\n",
1969                             lc_all ? '"' : '(',
1970                             lc_all ? lc_all : "unset",
1971                             lc_all ? '"' : ')');
1972
1973 #if defined(USE_ENVIRON_ARRAY)
1974                 {
1975                 char **e;
1976                 for (e = environ; *e; e++) {
1977                     if (strEQs(*e, "LC_")
1978                             && strNEs(*e, "LC_ALL=")
1979                             && (p = strchr(*e, '=')))
1980                         PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
1981                                         (int)(p - *e), *e, p + 1);
1982                 }
1983                 }
1984 #else
1985                 PerlIO_printf(Perl_error_log,
1986                             "\t(possibly more locale environment variables)\n");
1987 #endif
1988
1989                 PerlIO_printf(Perl_error_log,
1990                             "\tLANG = %c%s%c\n",
1991                             lang ? '"' : '(',
1992                             lang ? lang : "unset",
1993                             lang ? '"' : ')');
1994
1995                 PerlIO_printf(Perl_error_log,
1996                             "    are supported and installed on your system.\n");
1997             }
1998
1999             /* Calculate what fallback locales to try.  We have avoided this
2000              * until we have to, because failure is quite unlikely.  This will
2001              * usually change the upper bound of the loop we are in.
2002              *
2003              * Since the system's default way of setting the locale has not
2004              * found one that works, We use Perl's defined ordering: LC_ALL,
2005              * LANG, and the C locale.  We don't try the same locale twice, so
2006              * don't add to the list if already there.  (On POSIX systems, the
2007              * LC_ALL element will likely be a repeat of the 0th element "",
2008              * but there's no harm done by doing it explicitly.
2009              *
2010              * Note that this tries the LC_ALL environment variable even on
2011              * systems which have no LC_ALL locale setting.  This may or may
2012              * not have been originally intentional, but there's no real need
2013              * to change the behavior. */
2014             if (lc_all) {
2015                 for (j = 0; j < trial_locales_count; j++) {
2016                     if (strEQ(lc_all, trial_locales[j])) {
2017                         goto done_lc_all;
2018                     }
2019                 }
2020                 trial_locales[trial_locales_count++] = lc_all;
2021             }
2022           done_lc_all:
2023
2024             if (lang) {
2025                 for (j = 0; j < trial_locales_count; j++) {
2026                     if (strEQ(lang, trial_locales[j])) {
2027                         goto done_lang;
2028                     }
2029                 }
2030                 trial_locales[trial_locales_count++] = lang;
2031             }
2032           done_lang:
2033
2034 #if defined(WIN32) && defined(LC_ALL)
2035             /* For Windows, we also try the system default locale before "C".
2036              * (If there exists a Windows without LC_ALL we skip this because
2037              * it gets too complicated.  For those, the "C" is the next
2038              * fallback possibility).  The "" is the same as the 0th element of
2039              * the array, but the code at the loop above knows to treat it
2040              * differently when not the 0th */
2041             trial_locales[trial_locales_count++] = "";
2042 #endif
2043
2044             for (j = 0; j < trial_locales_count; j++) {
2045                 if (strEQ("C", trial_locales[j])) {
2046                     goto done_C;
2047                 }
2048             }
2049             trial_locales[trial_locales_count++] = "C";
2050
2051           done_C: ;
2052         }   /* end of first time through the loop */
2053
2054 #ifdef WIN32
2055       next_iteration: ;
2056 #endif
2057
2058     }   /* end of looping through the trial locales */
2059
2060     if (ok < 1) {   /* If we tried to fallback */
2061         const char* msg;
2062         if (! setlocale_failure) {  /* fallback succeeded */
2063            msg = "Falling back to";
2064         }
2065         else {  /* fallback failed */
2066
2067             /* We dropped off the end of the loop, so have to decrement i to
2068              * get back to the value the last time through */
2069             i--;
2070
2071             ok = -1;
2072             msg = "Failed to fall back to";
2073
2074             /* To continue, we should use whatever values we've got */
2075 #ifdef USE_LOCALE_CTYPE
2076             Safefree(curctype);
2077             curctype = savepv(setlocale(LC_CTYPE, NULL));
2078             DEBUG_LOCALE_INIT(LC_CTYPE, NULL, curctype);
2079 #endif /* USE_LOCALE_CTYPE */
2080 #ifdef USE_LOCALE_COLLATE
2081             Safefree(curcoll);
2082             curcoll = savepv(setlocale(LC_COLLATE, NULL));
2083             DEBUG_LOCALE_INIT(LC_COLLATE, NULL, curcoll);
2084 #endif /* USE_LOCALE_COLLATE */
2085 #ifdef USE_LOCALE_NUMERIC
2086             Safefree(curnum);
2087             curnum = savepv(setlocale(LC_NUMERIC, NULL));
2088             DEBUG_LOCALE_INIT(LC_NUMERIC, NULL, curnum);
2089 #endif /* USE_LOCALE_NUMERIC */
2090         }
2091
2092         if (locwarn) {
2093             const char * description;
2094             const char * name = "";
2095             if (strEQ(trial_locales[i], "C")) {
2096                 description = "the standard locale";
2097                 name = "C";
2098             }
2099 #ifdef SYSTEM_DEFAULT_LOCALE
2100             else if (strEQ(trial_locales[i], "")) {
2101                 description = "the system default locale";
2102                 if (system_default_locale) {
2103                     name = system_default_locale;
2104                 }
2105             }
2106 #endif /* SYSTEM_DEFAULT_LOCALE */
2107             else {
2108                 description = "a fallback locale";
2109                 name = trial_locales[i];
2110             }
2111             if (name && strNE(name, "")) {
2112                 PerlIO_printf(Perl_error_log,
2113                     "perl: warning: %s %s (\"%s\").\n", msg, description, name);
2114             }
2115             else {
2116                 PerlIO_printf(Perl_error_log,
2117                                    "perl: warning: %s %s.\n", msg, description);
2118             }
2119         }
2120     } /* End of tried to fallback */
2121
2122 #ifdef USE_LOCALE_CTYPE
2123     new_ctype(curctype);
2124 #endif /* USE_LOCALE_CTYPE */
2125
2126 #ifdef USE_LOCALE_COLLATE
2127     new_collate(curcoll);
2128 #endif /* USE_LOCALE_COLLATE */
2129
2130 #ifdef USE_LOCALE_NUMERIC
2131     new_numeric(curnum);
2132 #endif /* USE_LOCALE_NUMERIC */
2133
2134 #if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
2135     /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
2136      * locale is UTF-8.  If PL_utf8locale and PL_unicode (set by -C or by
2137      * $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the
2138      * PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open
2139      * discipline.  */
2140     PL_utf8locale = _is_cur_LC_category_utf8(LC_CTYPE);
2141
2142     /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
2143        This is an alternative to using the -C command line switch
2144        (the -C if present will override this). */
2145     {
2146          const char *p = PerlEnv_getenv("PERL_UNICODE");
2147          PL_unicode = p ? parse_unicode_opts(&p) : 0;
2148          if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
2149              PL_utf8cache = -1;
2150     }
2151 #endif
2152
2153 #ifdef USE_LOCALE_CTYPE
2154     Safefree(curctype);
2155 #endif /* USE_LOCALE_CTYPE */
2156 #ifdef USE_LOCALE_COLLATE
2157     Safefree(curcoll);
2158 #endif /* USE_LOCALE_COLLATE */
2159 #ifdef USE_LOCALE_NUMERIC
2160     Safefree(curnum);
2161 #endif /* USE_LOCALE_NUMERIC */
2162
2163 #ifdef __GLIBC__
2164     Safefree(language);
2165 #endif
2166
2167     Safefree(lc_all);
2168     Safefree(lang);
2169
2170 #else  /* !USE_LOCALE */
2171     PERL_UNUSED_ARG(printwarn);
2172 #endif /* USE_LOCALE */
2173
2174 #ifdef DEBUGGING
2175     /* So won't continue to output stuff */
2176     DEBUG_INITIALIZATION_set(FALSE);
2177 #endif
2178
2179     return ok;
2180 }
2181
2182 #ifdef USE_LOCALE_COLLATE
2183
2184 char *
2185 Perl__mem_collxfrm(pTHX_ const char *input_string,
2186                          STRLEN len,    /* Length of 'input_string' */
2187                          STRLEN *xlen,  /* Set to length of returned string
2188                                            (not including the collation index
2189                                            prefix) */
2190                          bool utf8      /* Is the input in UTF-8? */
2191                    )
2192 {
2193
2194     /* _mem_collxfrm() is a bit like strxfrm() but with two important
2195      * differences. First, it handles embedded NULs. Second, it allocates a bit
2196      * more memory than needed for the transformed data itself.  The real
2197      * transformed data begins at offset COLLXFRM_HDR_LEN.  *xlen is set to
2198      * the length of that, and doesn't include the collation index size.
2199      * Please see sv_collxfrm() to see how this is used. */
2200
2201 #define COLLXFRM_HDR_LEN    sizeof(PL_collation_ix)
2202
2203     char * s = (char *) input_string;
2204     STRLEN s_strlen = strlen(input_string);
2205     char *xbuf = NULL;
2206     STRLEN xAlloc;          /* xalloc is a reserved word in VC */
2207     STRLEN length_in_chars;
2208     bool first_time = TRUE; /* Cleared after first loop iteration */
2209
2210     PERL_ARGS_ASSERT__MEM_COLLXFRM;
2211
2212     /* Must be NUL-terminated */
2213     assert(*(input_string + len) == '\0');
2214
2215     /* If this locale has defective collation, skip */
2216     if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
2217         DEBUG_L(PerlIO_printf(Perl_debug_log,
2218                       "_mem_collxfrm: locale's collation is defective\n"));
2219         goto bad;
2220     }
2221
2222     /* Replace any embedded NULs with the control that sorts before any others.
2223      * This will give as good as possible results on strings that don't
2224      * otherwise contain that character, but otherwise there may be
2225      * less-than-perfect results with that character and NUL.  This is
2226      * unavoidable unless we replace strxfrm with our own implementation. */
2227     if (UNLIKELY(s_strlen < len)) {   /* Only execute if there is an embedded
2228                                          NUL */
2229         char * e = s + len;
2230         char * sans_nuls;
2231         STRLEN sans_nuls_len;
2232         int try_non_controls;
2233         char this_replacement_char[] = "?\0";   /* Room for a two-byte string,
2234                                                    making sure 2nd byte is NUL.
2235                                                  */
2236         STRLEN this_replacement_len;
2237
2238         /* If we don't know what non-NUL control character sorts lowest for
2239          * this locale, find it */
2240         if (PL_strxfrm_NUL_replacement == '\0') {
2241             int j;
2242             char * cur_min_x = NULL;    /* The min_char's xfrm, (except it also
2243                                            includes the collation index
2244                                            prefixed. */
2245
2246             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
2247
2248             /* Unlikely, but it may be that no control will work to replace
2249              * NUL, in which case we instead look for any character.  Controls
2250              * are preferred because collation order is, in general, context
2251              * sensitive, with adjoining characters affecting the order, and
2252              * controls are less likely to have such interactions, allowing the
2253              * NUL-replacement to stand on its own.  (Another way to look at it
2254              * is to imagine what would happen if the NUL were replaced by a
2255              * combining character; it wouldn't work out all that well.) */
2256             for (try_non_controls = 0;
2257                  try_non_controls < 2;
2258                  try_non_controls++)
2259             {
2260                 /* Look through all legal code points (NUL isn't) */
2261                 for (j = 1; j < 256; j++) {
2262                     char * x;       /* j's xfrm plus collation index */
2263                     STRLEN x_len;   /* length of 'x' */
2264                     STRLEN trial_len = 1;
2265                     char cur_source[] = { '\0', '\0' };
2266
2267                     /* Skip non-controls the first time through the loop.  The
2268                      * controls in a UTF-8 locale are the L1 ones */
2269                     if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
2270                                                ? ! isCNTRL_L1(j)
2271                                                : ! isCNTRL_LC(j))
2272                     {
2273                         continue;
2274                     }
2275
2276                     /* Create a 1-char string of the current code point */
2277                     cur_source[0] = (char) j;
2278
2279                     /* Then transform it */
2280                     x = _mem_collxfrm(cur_source, trial_len, &x_len,
2281                                       0 /* The string is not in UTF-8 */);
2282
2283                     /* Ignore any character that didn't successfully transform.
2284                      * */
2285                     if (! x) {
2286                         continue;
2287                     }
2288
2289                     /* If this character's transformation is lower than
2290                      * the current lowest, this one becomes the lowest */
2291                     if (   cur_min_x == NULL
2292                         || strLT(x         + COLLXFRM_HDR_LEN,
2293                                  cur_min_x + COLLXFRM_HDR_LEN))
2294                     {
2295                         PL_strxfrm_NUL_replacement = j;
2296                         cur_min_x = x;
2297                     }
2298                     else {
2299                         Safefree(x);
2300                     }
2301                 } /* end of loop through all 255 characters */
2302
2303                 /* Stop looking if found */
2304                 if (cur_min_x) {
2305                     break;
2306                 }
2307
2308                 /* Unlikely, but possible, if there aren't any controls that
2309                  * work in the locale, repeat the loop, looking for any
2310                  * character that works */
2311                 DEBUG_L(PerlIO_printf(Perl_debug_log,
2312                 "_mem_collxfrm: No control worked.  Trying non-controls\n"));
2313             } /* End of loop to try first the controls, then any char */
2314
2315             if (! cur_min_x) {
2316                 DEBUG_L(PerlIO_printf(Perl_debug_log,
2317                     "_mem_collxfrm: Couldn't find any character to replace"
2318                     " embedded NULs in locale %s with", PL_collation_name));
2319                 goto bad;
2320             }
2321
2322             DEBUG_L(PerlIO_printf(Perl_debug_log,
2323                     "_mem_collxfrm: Replacing embedded NULs in locale %s with "
2324                     "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
2325
2326             Safefree(cur_min_x);
2327         } /* End of determining the character that is to replace NULs */
2328
2329         /* If the replacement is variant under UTF-8, it must match the
2330          * UTF8-ness of the original */
2331         if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
2332             this_replacement_char[0] =
2333                                 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
2334             this_replacement_char[1] =
2335                                 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
2336             this_replacement_len = 2;
2337         }
2338         else {
2339             this_replacement_char[0] = PL_strxfrm_NUL_replacement;
2340             /* this_replacement_char[1] = '\0' was done at initialization */
2341             this_replacement_len = 1;
2342         }
2343
2344         /* The worst case length for the replaced string would be if every
2345          * character in it is NUL.  Multiply that by the length of each
2346          * replacement, and allow for a trailing NUL */
2347         sans_nuls_len = (len * this_replacement_len) + 1;
2348         Newx(sans_nuls, sans_nuls_len, char);
2349         *sans_nuls = '\0';
2350
2351         /* Replace each NUL with the lowest collating control.  Loop until have
2352          * exhausted all the NULs */
2353         while (s + s_strlen < e) {
2354             my_strlcat(sans_nuls, s, sans_nuls_len);
2355
2356             /* Do the actual replacement */
2357             my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
2358
2359             /* Move past the input NUL */
2360             s += s_strlen + 1;
2361             s_strlen = strlen(s);
2362         }
2363
2364         /* And add anything that trails the final NUL */
2365         my_strlcat(sans_nuls, s, sans_nuls_len);
2366
2367         /* Switch so below we transform this modified string */
2368         s = sans_nuls;
2369         len = strlen(s);
2370     } /* End of replacing NULs */
2371
2372     /* Make sure the UTF8ness of the string and locale match */
2373     if (utf8 != PL_in_utf8_COLLATE_locale) {
2374         const char * const t = s;   /* Temporary so we can later find where the
2375                                        input was */
2376
2377         /* Here they don't match.  Change the string's to be what the locale is
2378          * expecting */
2379
2380         if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
2381             s = (char *) bytes_to_utf8((const U8 *) s, &len);
2382             utf8 = TRUE;
2383         }
2384         else {   /* locale is not UTF-8; but input is; downgrade the input */
2385
2386             s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
2387
2388             /* If the downgrade was successful we are done, but if the input
2389              * contains things that require UTF-8 to represent, have to do
2390              * damage control ... */
2391             if (UNLIKELY(utf8)) {
2392
2393                 /* What we do is construct a non-UTF-8 string with
2394                  *  1) the characters representable by a single byte converted
2395                  *     to be so (if necessary);
2396                  *  2) and the rest converted to collate the same as the
2397                  *     highest collating representable character.  That makes
2398                  *     them collate at the end.  This is similar to how we
2399                  *     handle embedded NULs, but we use the highest collating
2400                  *     code point instead of the smallest.  Like the NUL case,
2401                  *     this isn't perfect, but is the best we can reasonably
2402                  *     do.  Every above-255 code point will sort the same as
2403                  *     the highest-sorting 0-255 code point.  If that code
2404                  *     point can combine in a sequence with some other code
2405                  *     points for weight calculations, us changing something to
2406                  *     be it can adversely affect the results.  But in most
2407                  *     cases, it should work reasonably.  And note that this is
2408                  *     really an illegal situation: using code points above 255
2409                  *     on a locale where only 0-255 are valid.  If two strings
2410                  *     sort entirely equal, then the sort order for the
2411                  *     above-255 code points will be in code point order. */
2412
2413                 utf8 = FALSE;
2414
2415                 /* If we haven't calculated the code point with the maximum
2416                  * collating order for this locale, do so now */
2417                 if (! PL_strxfrm_max_cp) {
2418                     int j;
2419
2420                     /* The current transformed string that collates the
2421                      * highest (except it also includes the prefixed collation
2422                      * index). */
2423                     char * cur_max_x = NULL;
2424
2425                     /* Look through all legal code points (NUL isn't) */
2426                     for (j = 1; j < 256; j++) {
2427                         char * x;
2428                         STRLEN x_len;
2429                         char cur_source[] = { '\0', '\0' };
2430
2431                         /* Create a 1-char string of the current code point */
2432                         cur_source[0] = (char) j;
2433
2434                         /* Then transform it */
2435                         x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
2436
2437                         /* If something went wrong (which it shouldn't), just
2438                          * ignore this code point */
2439                         if (! x) {
2440                             continue;
2441                         }
2442
2443                         /* If this character's transformation is higher than
2444                          * the current highest, this one becomes the highest */
2445                         if (   cur_max_x == NULL
2446                             || strGT(x         + COLLXFRM_HDR_LEN,
2447                                      cur_max_x + COLLXFRM_HDR_LEN))
2448                         {
2449                             PL_strxfrm_max_cp = j;
2450                             cur_max_x = x;
2451                         }
2452                         else {
2453                             Safefree(x);
2454                         }
2455                     }
2456
2457                     if (! cur_max_x) {
2458                         DEBUG_L(PerlIO_printf(Perl_debug_log,
2459                             "_mem_collxfrm: Couldn't find any character to"
2460                             " replace above-Latin1 chars in locale %s with",
2461                             PL_collation_name));
2462                         goto bad;
2463                     }
2464
2465                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2466                             "_mem_collxfrm: highest 1-byte collating character"
2467                             " in locale %s is 0x%02X\n",
2468                             PL_collation_name,
2469                             PL_strxfrm_max_cp));
2470
2471                     Safefree(cur_max_x);
2472                 }
2473
2474                 /* Here we know which legal code point collates the highest.
2475                  * We are ready to construct the non-UTF-8 string.  The length
2476                  * will be at least 1 byte smaller than the input string
2477                  * (because we changed at least one 2-byte character into a
2478                  * single byte), but that is eaten up by the trailing NUL */
2479                 Newx(s, len, char);
2480
2481                 {
2482                     STRLEN i;
2483                     STRLEN d= 0;
2484                     char * e = (char *) t + len;
2485
2486                     for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
2487                         U8 cur_char = t[i];
2488                         if (UTF8_IS_INVARIANT(cur_char)) {
2489                             s[d++] = cur_char;
2490                         }
2491                         else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
2492                             s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
2493                         }
2494                         else {  /* Replace illegal cp with highest collating
2495                                    one */
2496                             s[d++] = PL_strxfrm_max_cp;
2497                         }
2498                     }
2499                     s[d++] = '\0';
2500                     Renew(s, d, char);   /* Free up unused space */
2501                 }
2502             }
2503         }
2504
2505         /* Here, we have constructed a modified version of the input.  It could
2506          * be that we already had a modified copy before we did this version.
2507          * If so, that copy is no longer needed */
2508         if (t != input_string) {
2509             Safefree(t);
2510         }
2511     }
2512
2513     length_in_chars = (utf8)
2514                       ? utf8_length((U8 *) s, (U8 *) s + len)
2515                       : len;
2516
2517     /* The first element in the output is the collation id, used by
2518      * sv_collxfrm(); then comes the space for the transformed string.  The
2519      * equation should give us a good estimate as to how much is needed */
2520     xAlloc = COLLXFRM_HDR_LEN
2521            + PL_collxfrm_base
2522            + (PL_collxfrm_mult * length_in_chars);
2523     Newx(xbuf, xAlloc, char);
2524     if (UNLIKELY(! xbuf)) {
2525         DEBUG_L(PerlIO_printf(Perl_debug_log,
2526                       "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
2527         goto bad;
2528     }
2529
2530     /* Store the collation id */
2531     *(U32*)xbuf = PL_collation_ix;
2532
2533     /* Then the transformation of the input.  We loop until successful, or we
2534      * give up */
2535     for (;;) {
2536
2537         *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
2538
2539         /* If the transformed string occupies less space than we told strxfrm()
2540          * was available, it means it successfully transformed the whole
2541          * string. */
2542         if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
2543
2544             /* Some systems include a trailing NUL in the returned length.
2545              * Ignore it, using a loop in case multiple trailing NULs are
2546              * returned. */
2547             while (   (*xlen) > 0
2548                    && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
2549             {
2550                 (*xlen)--;
2551             }
2552
2553             /* If the first try didn't get it, it means our prediction was low.
2554              * Modify the coefficients so that we predict a larger value in any
2555              * future transformations */
2556             if (! first_time) {
2557                 STRLEN needed = *xlen + 1;   /* +1 For trailing NUL */
2558                 STRLEN computed_guess = PL_collxfrm_base
2559                                       + (PL_collxfrm_mult * length_in_chars);
2560
2561                 /* On zero-length input, just keep current slope instead of
2562                  * dividing by 0 */
2563                 const STRLEN new_m = (length_in_chars != 0)
2564                                      ? needed / length_in_chars
2565                                      : PL_collxfrm_mult;
2566
2567                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2568                     "%s: %d: initial size of %zu bytes for a length "
2569                     "%zu string was insufficient, %zu needed\n",
2570                     __FILE__, __LINE__,
2571                     computed_guess, length_in_chars, needed));
2572
2573                 /* If slope increased, use it, but discard this result for
2574                  * length 1 strings, as we can't be sure that it's a real slope
2575                  * change */
2576                 if (length_in_chars > 1 && new_m  > PL_collxfrm_mult) {
2577 #ifdef DEBUGGING
2578                     STRLEN old_m = PL_collxfrm_mult;
2579                     STRLEN old_b = PL_collxfrm_base;
2580 #endif
2581                     PL_collxfrm_mult = new_m;
2582                     PL_collxfrm_base = 1;   /* +1 For trailing NUL */
2583                     computed_guess = PL_collxfrm_base
2584                                     + (PL_collxfrm_mult * length_in_chars);
2585                     if (computed_guess < needed) {
2586                         PL_collxfrm_base += needed - computed_guess;
2587                     }
2588
2589                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2590                         "%s: %d: slope is now %zu; was %zu, base "
2591                         "is now %zu; was %zu\n",
2592                         __FILE__, __LINE__,
2593                         PL_collxfrm_mult, old_m,
2594                         PL_collxfrm_base, old_b));
2595                 }
2596                 else {  /* Slope didn't change, but 'b' did */
2597                     const STRLEN new_b = needed
2598                                         - computed_guess
2599                                         + PL_collxfrm_base;
2600                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2601                         "%s: %d: base is now %zu; was %zu\n",
2602                         __FILE__, __LINE__,
2603                         new_b, PL_collxfrm_base));
2604                     PL_collxfrm_base = new_b;
2605                 }
2606             }
2607
2608             break;
2609         }
2610
2611         if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
2612             DEBUG_L(PerlIO_printf(Perl_debug_log,
2613                   "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
2614                   *xlen, PERL_INT_MAX));
2615             goto bad;
2616         }
2617
2618         /* A well-behaved strxfrm() returns exactly how much space it needs
2619          * (usually not including the trailing NUL) when it fails due to not
2620          * enough space being provided.  Assume that this is the case unless
2621          * it's been proven otherwise */
2622         if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
2623             xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
2624         }
2625         else { /* Here, either:
2626                 *  1)  The strxfrm() has previously shown bad behavior; or
2627                 *  2)  It isn't the first time through the loop, which means
2628                 *      that the strxfrm() is now showing bad behavior, because
2629                 *      we gave it what it said was needed in the previous
2630                 *      iteration, and it came back saying it needed still more.
2631                 *      (Many versions of cygwin fit this.  When the buffer size
2632                 *      isn't sufficient, they return the input size instead of
2633                 *      how much is needed.)
2634                 * Increase the buffer size by a fixed percentage and try again.
2635                 * */
2636             xAlloc += (xAlloc / 4) + 1;
2637             PL_strxfrm_is_behaved = FALSE;
2638
2639 #ifdef DEBUGGING
2640             if (DEBUG_Lv_TEST || debug_initialization) {
2641                 PerlIO_printf(Perl_debug_log,
2642                 "_mem_collxfrm required more space than previously calculated"
2643                 " for locale %s, trying again with new guess=%d+%zu\n",
2644                 PL_collation_name, (int) COLLXFRM_HDR_LEN,
2645                 xAlloc - COLLXFRM_HDR_LEN);
2646             }
2647 #endif
2648         }
2649
2650         Renew(xbuf, xAlloc, char);
2651         if (UNLIKELY(! xbuf)) {
2652             DEBUG_L(PerlIO_printf(Perl_debug_log,
2653                       "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
2654             goto bad;
2655         }
2656
2657         first_time = FALSE;
2658     }
2659
2660
2661 #ifdef DEBUGGING
2662     if (DEBUG_Lv_TEST || debug_initialization) {
2663
2664         print_collxfrm_input_and_return(s, s + len, xlen, utf8);
2665         PerlIO_printf(Perl_debug_log, "Its xfrm is:");
2666         PerlIO_printf(Perl_debug_log, "%s\n",
2667                       _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
2668                        *xlen, 1));
2669     }
2670 #endif
2671
2672     /* Free up unneeded space; retain enough for trailing NUL */
2673     Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
2674
2675     if (s != input_string) {
2676         Safefree(s);
2677     }
2678
2679     return xbuf;
2680
2681   bad:
2682     Safefree(xbuf);
2683     if (s != input_string) {
2684         Safefree(s);
2685     }
2686     *xlen = 0;
2687 #ifdef DEBUGGING
2688     if (DEBUG_Lv_TEST || debug_initialization) {
2689         print_collxfrm_input_and_return(s, s + len, NULL, utf8);
2690     }
2691 #endif
2692     return NULL;
2693 }
2694
2695 #ifdef DEBUGGING
2696
2697 STATIC void
2698 S_print_collxfrm_input_and_return(pTHX_
2699                                   const char * const s,
2700                                   const char * const e,
2701                                   const STRLEN * const xlen,
2702                                   const bool is_utf8)
2703 {
2704
2705     PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
2706
2707     PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
2708                                                         (UV)PL_collation_ix);
2709     if (xlen) {
2710         PerlIO_printf(Perl_debug_log, "%zu", *xlen);
2711     }
2712     else {
2713         PerlIO_printf(Perl_debug_log, "NULL");
2714     }
2715     PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
2716                                                             PL_collation_name);
2717     print_bytes_for_locale(s, e, is_utf8);
2718
2719     PerlIO_printf(Perl_debug_log, "'\n");
2720 }
2721
2722 STATIC void
2723 S_print_bytes_for_locale(pTHX_
2724                     const char * const s,
2725                     const char * const e,
2726                     const bool is_utf8)
2727 {
2728     const char * t = s;
2729     bool prev_was_printable = TRUE;
2730     bool first_time = TRUE;
2731
2732     PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
2733
2734     while (t < e) {
2735         UV cp = (is_utf8)
2736                 ?  utf8_to_uvchr_buf((U8 *) t, e, NULL)
2737                 : * (U8 *) t;
2738         if (isPRINT(cp)) {
2739             if (! prev_was_printable) {
2740                 PerlIO_printf(Perl_debug_log, " ");
2741             }
2742             PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
2743             prev_was_printable = TRUE;
2744         }
2745         else {
2746             if (! first_time) {
2747                 PerlIO_printf(Perl_debug_log, " ");
2748             }
2749             PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
2750             prev_was_printable = FALSE;
2751         }
2752         t += (is_utf8) ? UTF8SKIP(t) : 1;
2753         first_time = FALSE;
2754     }
2755 }
2756
2757 #endif   /* #ifdef DEBUGGING */
2758
2759 #endif /* USE_LOCALE_COLLATE */
2760
2761 #ifdef USE_LOCALE
2762
2763 bool
2764 Perl__is_cur_LC_category_utf8(pTHX_ int category)
2765 {
2766     /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
2767      * otherwise. 'category' may not be LC_ALL.  If the platform doesn't have
2768      * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
2769      * could give the wrong result.  The result will very likely be correct for
2770      * languages that have commonly used non-ASCII characters, but for notably
2771      * English, it comes down to if the locale's name ends in something like
2772      * "UTF-8".  It errs on the side of not being a UTF-8 locale. */
2773
2774     char *save_input_locale = NULL;
2775     STRLEN final_pos;
2776
2777 #ifdef LC_ALL
2778     assert(category != LC_ALL);
2779 #endif
2780
2781     /* First dispose of the trivial cases */
2782     save_input_locale = setlocale(category, NULL);
2783     if (! save_input_locale) {
2784         DEBUG_L(PerlIO_printf(Perl_debug_log,
2785                               "Could not find current locale for category %d\n",
2786                               category));
2787         return FALSE;   /* XXX maybe should croak */
2788     }
2789     save_input_locale = stdize_locale(savepv(save_input_locale));
2790     if (isNAME_C_OR_POSIX(save_input_locale)) {
2791         DEBUG_L(PerlIO_printf(Perl_debug_log,
2792                               "Current locale for category %d is %s\n",
2793                               category, save_input_locale));
2794         Safefree(save_input_locale);
2795         return FALSE;
2796     }
2797
2798 #if defined(USE_LOCALE_CTYPE)    \
2799     && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET)))
2800
2801     { /* Next try nl_langinfo or MB_CUR_MAX if available */
2802
2803         char *save_ctype_locale = NULL;
2804         bool is_utf8;
2805
2806         if (category != LC_CTYPE) { /* These work only on LC_CTYPE */
2807
2808             /* Get the current LC_CTYPE locale */
2809             save_ctype_locale = setlocale(LC_CTYPE, NULL);
2810             if (! save_ctype_locale) {
2811                 DEBUG_L(PerlIO_printf(Perl_debug_log,
2812                                "Could not find current locale for LC_CTYPE\n"));
2813                 goto cant_use_nllanginfo;
2814             }
2815             save_ctype_locale = stdize_locale(savepv(save_ctype_locale));
2816
2817             /* If LC_CTYPE and the desired category use the same locale, this
2818              * means that finding the value for LC_CTYPE is the same as finding
2819              * the value for the desired category.  Otherwise, switch LC_CTYPE
2820              * to the desired category's locale */
2821             if (strEQ(save_ctype_locale, save_input_locale)) {
2822                 Safefree(save_ctype_locale);
2823                 save_ctype_locale = NULL;
2824             }
2825             else if (! setlocale(LC_CTYPE, save_input_locale)) {
2826                 DEBUG_L(PerlIO_printf(Perl_debug_log,
2827                                     "Could not change LC_CTYPE locale to %s\n",
2828                                     save_input_locale));
2829                 Safefree(save_ctype_locale);
2830                 goto cant_use_nllanginfo;
2831             }
2832         }
2833
2834         DEBUG_L(PerlIO_printf(Perl_debug_log, "Current LC_CTYPE locale=%s\n",
2835                                               save_input_locale));
2836
2837         /* Here the current LC_CTYPE is set to the locale of the category whose
2838          * information is desired.  This means that nl_langinfo() and MB_CUR_MAX
2839          * should give the correct results */
2840
2841 #   if defined(HAS_NL_LANGINFO) && defined(CODESET)
2842         {
2843             char *codeset = nl_langinfo(CODESET);
2844             if (codeset && strNE(codeset, "")) {
2845                 codeset = savepv(codeset);
2846
2847                 /* If we switched LC_CTYPE, switch back */
2848                 if (save_ctype_locale) {
2849                     setlocale(LC_CTYPE, save_ctype_locale);
2850                     Safefree(save_ctype_locale);
2851                 }
2852
2853                 is_utf8 = foldEQ(codeset, STR_WITH_LEN("UTF-8"))
2854                         || foldEQ(codeset, STR_WITH_LEN("UTF8"));
2855
2856                 DEBUG_L(PerlIO_printf(Perl_debug_log,
2857                        "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
2858                                                      codeset,         is_utf8));
2859                 Safefree(codeset);
2860                 Safefree(save_input_locale);
2861                 return is_utf8;
2862             }
2863         }
2864
2865 #   endif
2866 #   ifdef MB_CUR_MAX
2867
2868         /* Here, either we don't have nl_langinfo, or it didn't return a
2869          * codeset.  Try MB_CUR_MAX */
2870
2871         /* Standard UTF-8 needs at least 4 bytes to represent the maximum
2872          * Unicode code point.  Since UTF-8 is the only non-single byte
2873          * encoding we handle, we just say any such encoding is UTF-8, and if
2874          * it turns out to be wrong, other things will fail */
2875         is_utf8 = MB_CUR_MAX >= 4;
2876
2877         DEBUG_L(PerlIO_printf(Perl_debug_log,
2878                               "\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n",
2879                                    (int) MB_CUR_MAX,      is_utf8));
2880
2881         Safefree(save_input_locale);
2882
2883 #       ifdef HAS_MBTOWC
2884
2885         /* ... But, most systems that have MB_CUR_MAX will also have mbtowc(),
2886          * since they are both in the C99 standard.  We can feed a known byte
2887          * string to the latter function, and check that it gives the expected
2888          * result */
2889         if (is_utf8) {
2890             wchar_t wc;
2891             PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
2892             errno = 0;
2893             if ((size_t)mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8))
2894                                                         != strlen(HYPHEN_UTF8)
2895                 || wc != (wchar_t) 0x2010)
2896             {
2897                 is_utf8 = FALSE;
2898                 DEBUG_L(PerlIO_printf(Perl_debug_log, "\thyphen=U+%x\n", (unsigned int)wc));
2899                 DEBUG_L(PerlIO_printf(Perl_debug_log,
2900                         "\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n",
2901                         mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8)), errno));
2902             }
2903         }
2904 #       endif
2905
2906         /* If we switched LC_CTYPE, switch back */
2907         if (save_ctype_locale) {
2908             setlocale(LC_CTYPE, save_ctype_locale);
2909             Safefree(save_ctype_locale);
2910         }
2911
2912         return is_utf8;
2913 #   endif
2914     }
2915
2916   cant_use_nllanginfo:
2917
2918 #else   /* nl_langinfo should work if available, so don't bother compiling this
2919            fallback code.  The final fallback of looking at the name is
2920            compiled, and will be executed if nl_langinfo fails */
2921
2922     /* nl_langinfo not available or failed somehow.  Next try looking at the
2923      * currency symbol to see if it disambiguates things.  Often that will be
2924      * in the native script, and if the symbol isn't in UTF-8, we know that the
2925      * locale isn't.  If it is non-ASCII UTF-8, we infer that the locale is
2926      * too, as the odds of a non-UTF8 string being valid UTF-8 are quite small
2927      * */
2928
2929 #ifdef HAS_LOCALECONV
2930 #   ifdef USE_LOCALE_MONETARY
2931     {
2932         char *save_monetary_locale = NULL;
2933         bool only_ascii = FALSE;
2934         bool is_utf8 = FALSE;
2935         struct lconv* lc;
2936
2937         /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of
2938          * the desired category, if it isn't that locale already */
2939
2940         if (category != LC_MONETARY) {
2941
2942             save_monetary_locale = setlocale(LC_MONETARY, NULL);
2943             if (! save_monetary_locale) {
2944                 DEBUG_L(PerlIO_printf(Perl_debug_log,
2945                             "Could not find current locale for LC_MONETARY\n"));
2946                 goto cant_use_monetary;
2947             }
2948             save_monetary_locale = stdize_locale(savepv(save_monetary_locale));
2949
2950             if (strEQ(save_monetary_locale, save_input_locale)) {
2951                 Safefree(save_monetary_locale);
2952                 save_monetary_locale = NULL;
2953             }
2954             else if (! setlocale(LC_MONETARY, save_input_locale)) {
2955                 DEBUG_L(PerlIO_printf(Perl_debug_log,
2956                             "Could not change LC_MONETARY locale to %s\n",
2957                                                         save_input_locale));
2958                 Safefree(save_monetary_locale);
2959                 goto cant_use_monetary;
2960             }
2961         }
2962
2963         /* Here the current LC_MONETARY is set to the locale of the category
2964          * whose information is desired. */
2965
2966         lc = localeconv();
2967         if (! lc
2968             || ! lc->currency_symbol
2969             || is_utf8_invariant_string((U8 *) lc->currency_symbol, 0))
2970         {
2971             DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
2972             only_ascii = TRUE;
2973         }
2974         else {
2975             is_utf8 = is_utf8_string((U8 *) lc->currency_symbol, 0);
2976         }
2977
2978         /* If we changed it, restore LC_MONETARY to its original locale */
2979         if (save_monetary_locale) {
2980             setlocale(LC_MONETARY, save_monetary_locale);
2981             Safefree(save_monetary_locale);
2982         }
2983
2984         if (! only_ascii) {
2985
2986             /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
2987              * otherwise assume the locale is UTF-8 if and only if the symbol
2988              * is non-ascii UTF-8. */
2989             DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
2990                                     save_input_locale, is_utf8));
2991             Safefree(save_input_locale);
2992             return is_utf8;
2993         }
2994     }
2995   cant_use_monetary:
2996
2997 #   endif /* USE_LOCALE_MONETARY */
2998 #endif /* HAS_LOCALECONV */
2999
3000 #if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
3001
3002 /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not.  Try
3003  * the names of the months and weekdays, timezone, and am/pm indicator */
3004     {
3005         char *save_time_locale = NULL;
3006         int hour = 10;
3007         bool is_dst = FALSE;
3008         int dom = 1;
3009         int month = 0;
3010         int i;
3011         char * formatted_time;
3012
3013
3014         /* Like above for LC_MONETARY, we set LC_TIME to the locale of the
3015          * desired category, if it isn't that locale already */
3016
3017         if (category != LC_TIME) {
3018
3019             save_time_locale = setlocale(LC_TIME, NULL);
3020             if (! save_time_locale) {
3021                 DEBUG_L(PerlIO_printf(Perl_debug_log,
3022                             "Could not find current locale for LC_TIME\n"));
3023                 goto cant_use_time;
3024             }
3025             save_time_locale = stdize_locale(savepv(save_time_locale));
3026
3027             if (strEQ(save_time_locale, save_input_locale)) {
3028                 Safefree(save_time_locale);
3029                 save_time_locale = NULL;
3030             }
3031             else if (! setlocale(LC_TIME, save_input_locale)) {
3032                 DEBUG_L(PerlIO_printf(Perl_debug_log,
3033                             "Could not change LC_TIME locale to %s\n",
3034                                                         save_input_locale));
3035                 Safefree(save_time_locale);
3036                 goto cant_use_time;
3037             }
3038         }
3039
3040         /* Here the current LC_TIME is set to the locale of the category
3041          * whose information is desired.  Look at all the days of the week and
3042          * month names, and the timezone and am/pm indicator for UTF-8 variant
3043          * characters.  The first such a one found will tell us if the locale
3044          * is UTF-8 or not */
3045
3046         for (i = 0; i < 7 + 12; i++) {  /* 7 days; 12 months */
3047             formatted_time = my_strftime("%A %B %Z %p",
3048                             0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
3049             if ( ! formatted_time
3050                 || is_utf8_invariant_string((U8 *) formatted_time, 0))
3051             {
3052
3053                 /* Here, we didn't find a non-ASCII.  Try the next time through
3054                  * with the complemented dst and am/pm, and try with the next
3055                  * weekday.  After we have gotten all weekdays, try the next
3056                  * month */
3057                 is_dst = ! is_dst;
3058                 hour = (hour + 12) % 24;
3059                 dom++;
3060                 if (i > 6) {
3061                     month++;
3062                 }
3063                 continue;
3064             }
3065
3066             /* Here, we have a non-ASCII.  Return TRUE if it is valid UTF8;
3067              * false otherwise.  But first, restore LC_TIME to its original
3068              * locale if we changed it */
3069             if (save_time_locale) {
3070                 setlocale(LC_TIME, save_time_locale);
3071                 Safefree(save_time_locale);
3072             }
3073
3074             DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
3075                                 save_input_locale,
3076                                 is_utf8_string((U8 *) formatted_time, 0)));
3077             Safefree(save_input_locale);
3078             return is_utf8_string((U8 *) formatted_time, 0);
3079         }
3080
3081         /* Falling off the end of the loop indicates all the names were just
3082          * ASCII.  Go on to the next test.  If we changed it, restore LC_TIME
3083          * to its original locale */
3084         if (save_time_locale) {
3085             setlocale(LC_TIME, save_time_locale);
3086             Safefree(save_time_locale);
3087         }
3088         DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3089     }
3090   cant_use_time:
3091
3092 #endif
3093
3094 #if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
3095
3096 /* This code is ifdefd out because it was found to not be necessary in testing
3097  * on our dromedary test machine, which has over 700 locales.  There, this
3098  * added no value to looking at the currency symbol and the time strings.  I
3099  * left it in so as to avoid rewriting it if real-world experience indicates
3100  * that dromedary is an outlier.  Essentially, instead of returning above if we
3101  * haven't found illegal utf8, we continue on and examine all the strerror()
3102  * messages on the platform for utf8ness.  If all are ASCII, we still don't
3103  * know the answer; but otherwise we have a pretty good indication of the
3104  * utf8ness.  The reason this doesn't help much is that the messages may not
3105  * have been translated into the locale.  The currency symbol and time strings
3106  * are much more likely to have been translated.  */
3107     {
3108         int e;
3109         bool is_utf8 = FALSE;
3110         bool non_ascii = FALSE;
3111         char *save_messages_locale = NULL;
3112         const char * errmsg = NULL;
3113
3114         /* Like above, we set LC_MESSAGES to the locale of the desired
3115          * category, if it isn't that locale already */
3116
3117         if (category != LC_MESSAGES) {
3118
3119             save_messages_locale = setlocale(LC_MESSAGES, NULL);
3120             if (! save_messages_locale) {
3121                 DEBUG_L(PerlIO_printf(Perl_debug_log,
3122                             "Could not find current locale for LC_MESSAGES\n"));
3123                 goto cant_use_messages;
3124             }
3125             save_messages_locale = stdize_locale(savepv(save_messages_locale));
3126
3127             if (strEQ(save_messages_locale, save_input_locale)) {
3128                 Safefree(save_messages_locale);
3129                 save_messages_locale = NULL;
3130             }
3131             else if (! setlocale(LC_MESSAGES, save_input_locale)) {
3132                 DEBUG_L(PerlIO_printf(Perl_debug_log,
3133                             "Could not change LC_MESSAGES locale to %s\n",
3134                                                         save_input_locale));
3135                 Safefree(save_messages_locale);
3136                 goto cant_use_messages;
3137             }
3138         }
3139
3140         /* Here the current LC_MESSAGES is set to the locale of the category
3141          * whose information is desired.  Look through all the messages.  We
3142          * can't use Strerror() here because it may expand to code that
3143          * segfaults in miniperl */
3144
3145         for (e = 0; e <= sys_nerr; e++) {
3146             errno = 0;
3147             errmsg = sys_errlist[e];
3148             if (errno || !errmsg) {
3149                 break;
3150             }
3151             errmsg = savepv(errmsg);
3152             if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
3153                 non_ascii = TRUE;
3154                 is_utf8 = is_utf8_string((U8 *) errmsg, 0);
3155                 break;
3156             }
3157         }
3158         Safefree(errmsg);
3159
3160         /* And, if we changed it, restore LC_MESSAGES to its original locale */
3161         if (save_messages_locale) {
3162             setlocale(LC_MESSAGES, save_messages_locale);
3163             Safefree(save_messages_locale);
3164         }
3165
3166         if (non_ascii) {
3167
3168             /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
3169              * any non-ascii means it is one; otherwise we assume it isn't */
3170             DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
3171                                 save_input_locale,
3172                                 is_utf8));
3173             Safefree(save_input_locale);
3174             return is_utf8;
3175         }
3176
3177         DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3178     }
3179   cant_use_messages:
3180
3181 #endif
3182
3183 #endif /* the code that is compiled when no nl_langinfo */
3184
3185 #ifndef EBCDIC  /* On os390, even if the name ends with "UTF-8', it isn't a
3186                    UTF-8 locale */
3187     /* As a last resort, look at the locale name to see if it matches
3188      * qr/UTF -?  * 8 /ix, or some other common locale names.  This "name", the
3189      * return of setlocale(), is actually defined to be opaque, so we can't
3190      * really rely on the absence of various substrings in the name to indicate
3191      * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
3192      * be a UTF-8 locale.  Similarly for the other common names */
3193
3194     final_pos = strlen(save_input_locale) - 1;
3195     if (final_pos >= 3) {
3196         char *name = save_input_locale;
3197
3198         /* Find next 'U' or 'u' and look from there */
3199         while ((name += strcspn(name, "Uu") + 1)
3200                                             <= save_input_locale + final_pos - 2)
3201         {
3202             if (!isALPHA_FOLD_NE(*name, 't')
3203                 || isALPHA_FOLD_NE(*(name + 1), 'f'))
3204             {
3205                 continue;
3206             }
3207             name += 2;
3208             if (*(name) == '-') {
3209                 if ((name > save_input_locale + final_pos - 1)) {
3210                     break;
3211                 }
3212                 name++;
3213             }
3214             if (*(name) == '8') {
3215                 DEBUG_L(PerlIO_printf(Perl_debug_log,
3216                                       "Locale %s ends with UTF-8 in name\n",
3217                                       save_input_locale));
3218                 Safefree(save_input_locale);
3219                 return TRUE;
3220             }
3221         }
3222         DEBUG_L(PerlIO_printf(Perl_debug_log,
3223                               "Locale %s doesn't end with UTF-8 in name\n",
3224                                 save_input_locale));
3225     }
3226 #endif
3227
3228 #ifdef WIN32
3229     /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
3230     if (final_pos >= 4
3231         && *(save_input_locale + final_pos - 0) == '1'
3232         && *(save_input_locale + final_pos - 1) == '0'
3233         && *(save_input_locale + final_pos - 2) == '0'
3234         && *(save_input_locale + final_pos - 3) == '5'
3235         && *(save_input_locale + final_pos - 4) == '6')
3236     {
3237         DEBUG_L(PerlIO_printf(Perl_debug_log,
3238                         "Locale %s ends with 10056 in name, is UTF-8 locale\n",
3239                         save_input_locale));
3240         Safefree(save_input_locale);
3241         return TRUE;
3242     }
3243 #endif
3244
3245     /* Other common encodings are the ISO 8859 series, which aren't UTF-8.  But
3246      * since we are about to return FALSE anyway, there is no point in doing
3247      * this extra work */
3248 #if 0
3249     if (instr(save_input_locale, "8859")) {
3250         DEBUG_L(PerlIO_printf(Perl_debug_log,
3251                              "Locale %s has 8859 in name, not UTF-8 locale\n",
3252                              save_input_locale));
3253         Safefree(save_input_locale);
3254         return FALSE;
3255     }
3256 #endif
3257
3258     DEBUG_L(PerlIO_printf(Perl_debug_log,
3259                           "Assuming locale %s is not a UTF-8 locale\n",
3260                                     save_input_locale));
3261     Safefree(save_input_locale);
3262     return FALSE;
3263 }
3264
3265 #endif
3266
3267
3268 bool
3269 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
3270 {
3271     dVAR;
3272     /* Internal function which returns if we are in the scope of a pragma that
3273      * enables the locale category 'category'.  'compiling' should indicate if
3274      * this is during the compilation phase (TRUE) or not (FALSE). */
3275
3276     const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
3277
3278     SV *categories = cop_hints_fetch_pvs(cop, "locale", 0);
3279     if (! categories || categories == &PL_sv_placeholder) {
3280         return FALSE;
3281     }
3282
3283     /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
3284      * a valid unsigned */
3285     assert(category >= -1);
3286     return cBOOL(SvUV(categories) & (1U << (category + 1)));
3287 }
3288
3289 char *
3290 Perl_my_strerror(pTHX_ const int errnum)
3291 {
3292     /* Returns a mortalized copy of the text of the error message associated
3293      * with 'errnum'.  It uses the current locale's text unless the platform
3294      * doesn't have the LC_MESSAGES category or we are not being called from
3295      * within the scope of 'use locale'.  In the former case, it uses whatever
3296      * strerror returns; in the latter case it uses the text from the C locale.
3297      *
3298      * The function just calls strerror(), but temporarily switches, if needed,
3299      * to the C locale */
3300
3301     char *errstr;
3302     dVAR;
3303
3304 #ifndef USE_LOCALE_MESSAGES
3305
3306     /* If platform doesn't have messages category, we don't do any switching to
3307      * the C locale; we just use whatever strerror() returns */
3308
3309     errstr = savepv(Strerror(errnum));
3310
3311 #else   /* Has locale messages */
3312
3313     const bool within_locale_scope = IN_LC(LC_MESSAGES);
3314
3315 #  if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
3316
3317     /* This function is trivial if we have strerror_l() */
3318
3319     if (within_locale_scope) {
3320         errstr = strerror(errnum);
3321     }
3322     else {
3323         errstr = strerror_l(errnum, PL_C_locale_obj);
3324     }
3325
3326     errstr = savepv(errstr);
3327
3328 #  else /* Doesn't have strerror_l(). */
3329
3330 #    ifdef USE_POSIX_2008_LOCALE
3331
3332     locale_t save_locale = NULL;
3333
3334 #    else
3335
3336     char * save_locale = NULL;
3337     bool locale_is_C = FALSE;
3338
3339     /* We have a critical section to prevent another thread from changing the
3340      * locale out from under us (or zapping the buffer returned from
3341      * setlocale() ) */
3342     LOCALE_LOCK;
3343
3344 #    endif
3345
3346     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3347                             "my_strerror called with errnum %d\n", errnum));
3348     if (! within_locale_scope) {
3349         errno = 0;
3350
3351 #  ifdef USE_POSIX_2008_LOCALE /* Use the thread-safe locale functions */
3352
3353         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3354                                     "Not within locale scope, about to call"
3355                                     " uselocale(0x%p)\n", PL_C_locale_obj));
3356         save_locale = uselocale(PL_C_locale_obj);
3357         if (! save_locale) {
3358             DEBUG_L(PerlIO_printf(Perl_debug_log,
3359                                     "uselocale failed, errno=%d\n", errno));
3360         }
3361         else {
3362             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3363                                     "uselocale returned 0x%p\n", save_locale));
3364         }
3365
3366 #    else    /* Not thread-safe build */
3367
3368         save_locale = setlocale(LC_MESSAGES, NULL);
3369         if (! save_locale) {
3370             DEBUG_L(PerlIO_printf(Perl_debug_log,
3371                                   "setlocale failed, errno=%d\n", errno));
3372         }
3373         else {
3374             locale_is_C = isNAME_C_OR_POSIX(save_locale);
3375
3376             /* Switch to the C locale if not already in it */
3377             if (! locale_is_C) {
3378
3379                 /* The setlocale() just below likely will zap 'save_locale', so
3380                  * create a copy.  */
3381                 save_locale = savepv(save_locale);
3382                 setlocale(LC_MESSAGES, "C");
3383             }
3384         }
3385
3386 #    endif
3387
3388     }   /* end of ! within_locale_scope */
3389     else {
3390         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n",
3391                                                __FILE__, __LINE__));
3392     }
3393
3394     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3395              "Any locale change has been done; about to call Strerror\n"));
3396     errstr = savepv(Strerror(errnum));
3397
3398     if (! within_locale_scope) {
3399         errno = 0;
3400
3401 #  ifdef USE_POSIX_2008_LOCALE
3402
3403         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3404                     "%s: %d: not within locale scope, restoring the locale\n",
3405                     __FILE__, __LINE__));
3406         if (save_locale && ! uselocale(save_locale)) {
3407             DEBUG_L(PerlIO_printf(Perl_debug_log,
3408                           "uselocale restore failed, errno=%d\n", errno));
3409         }
3410     }
3411
3412 #    else
3413
3414         if (save_locale && ! locale_is_C) {
3415             if (! setlocale(LC_MESSAGES, save_locale)) {
3416                 DEBUG_L(PerlIO_printf(Perl_debug_log,
3417                       "setlocale restore failed, errno=%d\n", errno));
3418             }
3419             Safefree(save_locale);
3420         }
3421     }
3422
3423     LOCALE_UNLOCK;
3424
3425 #    endif
3426 #  endif /* End of doesn't have strerror_l */
3427 #endif   /* End of does have locale messages */
3428
3429 #ifdef DEBUGGING
3430
3431     if (DEBUG_Lv_TEST) {
3432         PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '");
3433         print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);
3434         PerlIO_printf(Perl_debug_log, "'\n");
3435     }
3436
3437 #endif
3438
3439     SAVEFREEPV(errstr);
3440     return errstr;
3441 }
3442
3443 /*
3444
3445 =for apidoc sync_locale
3446
3447 Changing the program's locale should be avoided by XS code.  Nevertheless,
3448 certain non-Perl libraries called from XS, such as C<Gtk> do so.  When this
3449 happens, Perl needs to be told that the locale has changed.  Use this function
3450 to do so, before returning to Perl.
3451
3452 =cut
3453 */
3454
void
Perl_sync_locale(pTHX)
{
    /* For each of LC_CTYPE, LC_COLLATE and LC_NUMERIC that this build is
     * configured to use, query the category's current setting with
     * setlocale(cat, NULL) and pass it to the corresponding new_*()
     * routine. */

#ifdef USE_LOCALE_CTYPE
    {
        const char * const cur_ctype = setlocale(LC_CTYPE, NULL);

        new_ctype(cur_ctype);
    }
#endif /* USE_LOCALE_CTYPE */

#ifdef USE_LOCALE_COLLATE
    {
        const char * const cur_collate = setlocale(LC_COLLATE, NULL);

        new_collate(cur_collate);
    }
#endif

#ifdef USE_LOCALE_NUMERIC
    {
        const char * cur_numeric;

        /* Switch from "C" to underlying LC_NUMERIC before querying it */
        set_numeric_local();
        cur_numeric = setlocale(LC_NUMERIC, NULL);
        new_numeric(cur_numeric);
    }
#endif /* USE_LOCALE_NUMERIC */

}
3473
3474 #if defined(DEBUGGING) && defined(USE_LOCALE)
3475
3476 STATIC char *
3477 S_setlocale_debug_string(const int category,        /* category number,
3478                                                            like LC_ALL */
3479                             const char* const locale,   /* locale name */
3480
3481                             /* return value from setlocale() when attempting to
3482                              * set 'category' to 'locale' */
3483                             const char* const retval)
3484 {
3485     /* Returns a pointer to a NUL-terminated string in static storage with
3486      * added text about the info passed in.  This is not thread safe and will
3487      * be overwritten by the next call, so this should be used just to
3488      * formulate a string to immediately print or savepv() on. */
3489
3490     /* initialise to a non-null value to keep it out of BSS and so keep
3491      * -DPERL_GLOBAL_STRUCT_PRIVATE happy */
3492     static char ret[128] = "If you can read this, thank your buggy C"
3493                            " library strlcpy(), and change your hints file"
3494                            " to undef it";
3495     my_strlcpy(ret, "setlocale(", sizeof(ret));
3496
3497     switch (category) {
3498         default:
3499             my_snprintf(ret, sizeof(ret), "%s? %d", ret, category);
3500             break;
3501 #   ifdef LC_ALL
3502         case LC_ALL:
3503             my_strlcat(ret, "LC_ALL", sizeof(ret));
3504             break;
3505 #   endif
3506 #   ifdef LC_CTYPE
3507         case LC_CTYPE:
3508             my_strlcat(ret, "LC_CTYPE", sizeof(ret));
3509             break;
3510 #   endif
3511 #   ifdef LC_NUMERIC
3512         case LC_NUMERIC:
3513             my_strlcat(ret, "LC_NUMERIC", sizeof(ret));
3514             break;
3515 #   endif
3516 #   ifdef LC_COLLATE
3517         case LC_COLLATE:
3518             my_strlcat(ret, "LC_COLLATE", sizeof(ret));
3519             break;
3520 #   endif
3521 #   ifdef LC_TIME
3522         case LC_TIME:
3523             my_strlcat(ret, "LC_TIME", sizeof(ret));
3524             break;
3525 #   endif
3526 #   ifdef LC_MONETARY
3527         case LC_MONETARY:
3528             my_strlcat(ret, "LC_MONETARY", sizeof(ret));
3529             break;
3530 #   endif
3531 #   ifdef LC_MESSAGES
3532         case LC_MESSAGES:
3533             my_strlcat(ret, "LC_MESSAGES", sizeof(ret));
3534             break;
3535 #   endif
3536     }
3537
3538     my_strlcat(ret, ", ", sizeof(ret));
3539
3540     if (locale) {
3541         my_strlcat(ret, "\"", sizeof(ret));
3542         my_strlcat(ret, locale, sizeof(ret));
3543         my_strlcat(ret, "\"", sizeof(ret));
3544     }
3545     else {
3546         my_strlcat(ret, "NULL", sizeof(ret));
3547     }
3548
3549     my_strlcat(ret, ") returned ", sizeof(ret));
3550
3551     if (retval) {
3552         my_strlcat(ret, "\"", sizeof(ret));
3553         my_strlcat(ret, retval, sizeof(ret));
3554         my_strlcat(ret, "\"", sizeof(ret));
3555     }
3556     else {
3557         my_strlcat(ret, "NULL", sizeof(ret));
3558     }
3559
3560     assert(strlen(ret) < sizeof(ret));
3561
3562     return ret;
3563 }
3564
3565 #endif
3566
3567
3568 /*
3569  * ex: set ts=8 sts=4 sw=4 et:
3570  */