locale.c

   1 /*    locale.c
   2  *
   3  *    Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
   4  *    2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  *      A Elbereth Gilthoniel,
  13  *      silivren penna míriel
  14  *      o menel aglar elenath!
  15  *      Na-chaered palan-díriel
  16  *      o galadhremmin ennorath,
  17  *      Fanuilos, le linnathon
  18  *      nef aear, si nef aearon!
  19  *
  20  *     [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
  21  */
  22
  23 /* utility functions for handling locale-specific stuff like what
  24  * character represents the decimal point.
  25  *
  26  * All C programs have an underlying locale.  Perl code generally doesn't pay
  27  * any attention to it except within the scope of a 'use locale'.  For most
  28  * categories, it accomplishes this by just using different operations if it is
  29  * in such scope than if not.  However, various libc functions called by Perl
  30  * are affected by the LC_NUMERIC category, so there are macros in perl.h that
  31  * are used to toggle between the current locale and the C locale depending on
  32  * the desired behavior of those functions at the moment.  And, LC_MESSAGES is
  33  * switched to the C locale for outputting the message unless within the scope
  34  * of 'use locale'.
  35  */
  36
  37 #include "EXTERN.h"
  38 #define PERL_IN_LOCALE_C
  39 #include "perl_langinfo.h"
  40 #include "perl.h"
  41
  42 #include "reentr.h"
  43
  44 /* If the environment says to, we can output debugging information during
  45  * initialization.  This is done before option parsing, and before any thread
  46  * creation, so can be a file-level static */
  47 #ifdef DEBUGGING
  48 #  ifdef PERL_GLOBAL_STRUCT
  49   /* no global syms allowed */
  50 #    define debug_initialization 0
  51 #    define DEBUG_INITIALIZATION_set(v)
  52 #  else
  53 static bool debug_initialization = FALSE;
  54 #    define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
  55 #  endif
  56 #endif
  57
  58 /* strlen() of a literal string constant.  XXX We might want this more general,
  59  * but using it in just this file for now */
  60 #define STRLENs(s)  (sizeof("" s "") - 1)
  61
  62 /* Is the C string input 'name' "C" or "POSIX"?  If so, and 'name' is the
  63  * return of setlocale(), then this is extremely likely to be the C or POSIX
  64  * locale.  However, the output of setlocale() is documented to be opaque, but
  65  * the odds are extremely small that it would return these two strings for some
  66  * other locale.  Note that VMS in these two locales includes many non-ASCII
  67  * characters as controls and punctuation (below are hex bytes):
  68  *   cntrl:  84-97 9B-9F
  69  *   punct:  A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
  70  * Oddly, none there are listed as alphas, though some represent alphabetics
  71  * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
  72 #define isNAME_C_OR_POSIX(name)                                              \
  73                              (   (name) != NULL                              \
  74                               && (( *(name) == 'C' && (*(name + 1)) == '\0') \
  75                                    || strEQ((name), "POSIX")))
  76
  77 #ifdef USE_LOCALE
  78
  79 /*
  80  * Standardize the locale name from a string returned by 'setlocale', possibly
  81  * modifying that string.
  82  *
  83  * The typical return value of setlocale() is either
  84  * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
  85  * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
  86  *     (the space-separated values represent the various sublocales,
  87  *      in some unspecified order).  This is not handled by this function.
  88  *
  89  * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
  90  * which is harmful for further use of the string in setlocale().  This
  91  * function removes the trailing new line and everything up through the '='
  92  *
  93  */
  94 STATIC char *
  95 S_stdize_locale(pTHX_ char *locs)
  96 {
  97     const char * const s = strchr(locs, '=');
  98     bool okay = TRUE;
  99
 100     PERL_ARGS_ASSERT_STDIZE_LOCALE;
 101
 102     if (s) {
 103         const char * const t = strchr(s, '.');
 104         okay = FALSE;
 105         if (t) {
 106             const char * const u = strchr(t, '\n');
 107             if (u && (u[1] == 0)) {
 108                 const STRLEN len = u - s;
 109                 Move(s + 1, locs, len, char);
 110                 locs[len] = 0;
 111                 okay = TRUE;
 112             }
 113         }
 114     }
 115
 116     if (!okay)
 117         Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
 118
 119     return locs;
 120 }
 121
 122 /* Two parallel arrays; first the locale categories Perl uses on this system;
 123  * the second array is their names.  These arrays are in mostly arbitrary
 124  * order. */
 125
 126 const int categories[] = {
 127
 128 #    ifdef USE_LOCALE_NUMERIC
 129                              LC_NUMERIC,
 130 #    endif
 131 #    ifdef USE_LOCALE_CTYPE
 132                              LC_CTYPE,
 133 #    endif
 134 #    ifdef USE_LOCALE_COLLATE
 135                              LC_COLLATE,
 136 #    endif
 137 #    ifdef USE_LOCALE_TIME
 138                              LC_TIME,
 139 #    endif
 140 #    ifdef USE_LOCALE_MESSAGES
 141                              LC_MESSAGES,
 142 #    endif
 143 #    ifdef USE_LOCALE_MONETARY
 144                              LC_MONETARY,
 145 #    endif
 146 #    ifdef LC_ALL
 147                              LC_ALL,
 148 #    endif
 149                             -1  /* Placeholder because C doesn't allow a
 150                                    trailing comma, and it would get complicated
 151                                    with all the #ifdef's */
 152 };
 153
 154 /* The top-most real element is LC_ALL */
 155
 156 const char * category_names[] = {
 157
 158 #    ifdef USE_LOCALE_NUMERIC
 159                                  "LC_NUMERIC",
 160 #    endif
 161 #    ifdef USE_LOCALE_CTYPE
 162                                  "LC_CTYPE",
 163 #    endif
 164 #    ifdef USE_LOCALE_COLLATE
 165                                  "LC_COLLATE",
 166 #    endif
 167 #    ifdef USE_LOCALE_TIME
 168                                  "LC_TIME",
 169 #    endif
 170 #    ifdef USE_LOCALE_MESSAGES
 171                                  "LC_MESSAGES",
 172 #    endif
 173 #    ifdef USE_LOCALE_MONETARY
 174                                  "LC_MONETARY",
 175 #    endif
 176 #    ifdef LC_ALL
 177                                  "LC_ALL",
 178 #    endif
 179                                  NULL  /* Placeholder */
 180                             };
 181
 182 #  ifdef LC_ALL
 183
 184     /* On systems with LC_ALL, it is kept in the highest index position.  (-2
 185      * to account for the final unused placeholder element.) */
 186 #    define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
 187
 188 #  else
 189
 190     /* On systems without LC_ALL, we pretend it is there, one beyond the real
 191      * top element, hence in the unused placeholder element. */
 192 #    define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
 193
 194 #  endif
 195
 196 /* Pretending there is an LC_ALL element just above allows us to avoid most
 197  * special cases.  Most loops through these arrays in the code below are
 198  * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'.  They will work
 199  * on either type of system.  But the code must be written to not access the
 200  * element at 'LC_ALL_INDEX' except on platforms that have it.  This can be
 201  * checked for at compile time by using the #define LC_ALL_INDEX which is only
 202  * defined if we do have LC_ALL. */
 203
 204 /* Now create LC_foo_INDEX #defines for just those categories on this system */
 205 #  ifdef USE_LOCALE_NUMERIC
 206 #    define LC_NUMERIC_INDEX            0
 207 #    define _DUMMY_NUMERIC              LC_NUMERIC_INDEX
 208 #  else
 209 #    define _DUMMY_NUMERIC              -1
 210 #  endif
 211 #  ifdef USE_LOCALE_CTYPE
 212 #    define LC_CTYPE_INDEX              _DUMMY_NUMERIC + 1
 213 #    define _DUMMY_CTYPE                LC_CTYPE_INDEX
 214 #  else
 215 #    define _DUMMY_CTYPE                _DUMMY_NUMERIC
 216 #  endif
 217 #  ifdef USE_LOCALE_COLLATE
 218 #    define LC_COLLATE_INDEX            _DUMMY_CTYPE + 1
 219 #    define _DUMMY_COLLATE              LC_COLLATE_INDEX
 220 #  else
 221 #    define _DUMMY_COLLATE              _DUMMY_COLLATE
 222 #  endif
 223 #  ifdef USE_LOCALE_TIME
 224 #    define LC_TIME_INDEX               _DUMMY_COLLATE + 1
 225 #    define _DUMMY_TIME                 LC_TIME_INDEX
 226 #  else
 227 #    define _DUMMY_TIME                 _DUMMY_COLLATE
 228 #  endif
 229 #  ifdef USE_LOCALE_MESSAGES
 230 #    define LC_MESSAGES_INDEX           _DUMMY_TIME + 1
 231 #    define _DUMMY_MESSAGES             LC_MESSAGES_INDEX
 232 #  else
 233 #    define _DUMMY_MESSAGES             _DUMMY_TIME
 234 #  endif
 235 #  ifdef USE_LOCALE_MONETARY
 236 #    define LC_MONETARY_INDEX           _DUMMY_MESSAGES + 1
 237 #    define _DUMMY_MONETARY             LC_MONETARY_INDEX
 238 #  else
 239 #    define _DUMMY_MONETARY             _DUMMY_MESSAGES
 240 #  endif
 241 #  ifdef LC_ALL
 242 #    define LC_ALL_INDEX                _DUMMY_MONETARY + 1
 243 #  endif
 244 #endif /* ifdef USE_LOCALE */
 245
 246 /* Windows requres a customized base-level setlocale() */
 247 #  ifdef WIN32
 248 #    define my_setlocale(cat, locale) win32_setlocale(cat, locale)
 249 #  else
 250 #    define my_setlocale(cat, locale) setlocale(cat, locale)
 251 #  endif
 252
 253 /* Just placeholders for now.  "_c" is intended to be called when the category
 254  * is a constant known at compile time; "_r", not known until run time  */
 255 #  define do_setlocale_c(category, locale) my_setlocale(category, locale)
 256 #  define do_setlocale_r(category, locale) my_setlocale(category, locale)
 257
 258 STATIC void
 259 S_set_numeric_radix(pTHX_ const bool use_locale)
 260 {
 261     /* If 'use_locale' is FALSE, set to use a dot for the radix character.  If
 262      * TRUE, use the radix character derived from the current locale */
 263
 264 #if defined(USE_LOCALE_NUMERIC) && (   defined(HAS_LOCALECONV)              \
 265                                     || defined(HAS_NL_LANGINFO))
 266
 267     /* We only set up the radix SV if we are to use a locale radix ... */
 268     if (use_locale) {
 269         const char * radix = my_nl_langinfo(PERL_RADIXCHAR, FALSE);
 270                                           /* FALSE => already in dest locale */
 271
 272         /* ... and the character being used isn't a dot */
 273         if (strNE(radix, ".")) {
 274             if (PL_numeric_radix_sv) {
 275                 sv_setpv(PL_numeric_radix_sv, radix);
 276             }
 277             else {
 278                 PL_numeric_radix_sv = newSVpv(radix, 0);
 279             }
 280
 281             if ( !  is_utf8_invariant_string(
 282                      (U8 *) SvPVX(PL_numeric_radix_sv), SvCUR(PL_numeric_radix_sv))
 283                 &&  is_utf8_string(
 284                      (U8 *) SvPVX(PL_numeric_radix_sv), SvCUR(PL_numeric_radix_sv))
 285                 && _is_cur_LC_category_utf8(LC_NUMERIC))
 286             {
 287                 SvUTF8_on(PL_numeric_radix_sv);
 288             }
 289             goto done;
 290         }
 291     }
 292
 293     SvREFCNT_dec(PL_numeric_radix_sv);
 294     PL_numeric_radix_sv = NULL;
 295
 296   done: ;
 297
 298 #  ifdef DEBUGGING
 299
 300     if (DEBUG_L_TEST || debug_initialization) {
 301         PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
 302                                           (PL_numeric_radix_sv)
 303                                            ? SvPVX(PL_numeric_radix_sv)
 304                                            : "NULL",
 305                                           (PL_numeric_radix_sv)
 306                                            ? cBOOL(SvUTF8(PL_numeric_radix_sv))
 307                                            : 0);
 308     }
 309
 310 #  endif
 311 #endif /* USE_LOCALE_NUMERIC and can find the radix char */
 312
 313 }
 314
 315
 316 void
 317 Perl_new_numeric(pTHX_ const char *newnum)
 318 {
 319
 320 #ifndef USE_LOCALE_NUMERIC
 321
 322     PERL_UNUSED_ARG(newnum);
 323
 324 #else
 325
 326     /* Called after all libc setlocale() calls affecting LC_NUMERIC, to tell
 327      * core Perl this and that 'newnum' is the name of the new locale.
 328      * It installs this locale as the current underlying default.
 329      *
 330      * The default locale and the C locale can be toggled between by use of the
 331      * set_numeric_underlying() and set_numeric_standard() functions, which
 332      * should probably not be called directly, but only via macros like
 333      * SET_NUMERIC_STANDARD() in perl.h.
 334      *
 335      * The toggling is necessary mainly so that a non-dot radix decimal point
 336      * character can be output, while allowing internal calculations to use a
 337      * dot.
 338      *
 339      * This sets several interpreter-level variables:
 340      * PL_numeric_name  The underlying locale's name: a copy of 'newnum'
 341      * PL_numeric_underlying  A boolean indicating if the toggled state is such
 342      *                  that the current locale is the program's underlying
 343      *                  locale
 344      * PL_numeric_standard An int indicating if the toggled state is such
 345      *                  that the current locale is the C locale.  If non-zero,
 346      *                  it is in C; if > 1, it means it may not be toggled away
 347      *                  from C.
 348      * Note that both of the last two variables can be true at the same time,
 349      * if the underlying locale is C.  (Toggling is a no-op under these
 350      * circumstances.)
 351      *
 352      * Any code changing the locale (outside this file) should use
 353      * POSIX::setlocale, which calls this function.  Therefore this function
 354      * should be called directly only from this file and from
 355      * POSIX::setlocale() */
 356
 357     char *save_newnum;
 358
 359     if (! newnum) {
 360         Safefree(PL_numeric_name);
 361         PL_numeric_name = NULL;
 362         PL_numeric_standard = TRUE;
 363         PL_numeric_underlying = TRUE;
 364         return;
 365     }
 366
 367     save_newnum = stdize_locale(savepv(newnum));
 368
 369     PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
 370     PL_numeric_underlying = TRUE;
 371
 372     if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
 373         Safefree(PL_numeric_name);
 374         PL_numeric_name = save_newnum;
 375     }
 376     else {
 377         Safefree(save_newnum);
 378     }
 379
 380     /* Keep LC_NUMERIC in the C locale.  This is for XS modules, so they don't
 381      * have to worry about the radix being a non-dot.  (Core operations that
 382      * need the underlying locale change to it temporarily). */
 383     set_numeric_standard();
 384
 385 #endif /* USE_LOCALE_NUMERIC */
 386
 387 }
 388
 389 void
 390 Perl_set_numeric_standard(pTHX)
 391 {
 392
 393 #ifdef USE_LOCALE_NUMERIC
 394
 395     /* Toggle the LC_NUMERIC locale to C.  Most code should use the macros like
 396      * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly.  The
 397      * macro avoids calling this routine if toggling isn't necessary according
 398      * to our records (which could be wrong if some XS code has changed the
 399      * locale behind our back) */
 400
 401     do_setlocale_c(LC_NUMERIC, "C");
 402     PL_numeric_standard = TRUE;
 403     PL_numeric_underlying = isNAME_C_OR_POSIX(PL_numeric_name);
 404     set_numeric_radix(0);
 405
 406 #  ifdef DEBUGGING
 407
 408     if (DEBUG_L_TEST || debug_initialization) {
 409         PerlIO_printf(Perl_debug_log,
 410                           "LC_NUMERIC locale now is standard C\n");
 411     }
 412
 413 #  endif
 414 #endif /* USE_LOCALE_NUMERIC */
 415
 416 }
 417
 418 void
 419 Perl_set_numeric_underlying(pTHX)
 420 {
 421
 422 #ifdef USE_LOCALE_NUMERIC
 423
 424     /* Toggle the LC_NUMERIC locale to the current underlying default.  Most
 425      * code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h
 426      * instead of calling this directly.  The macro avoids calling this routine
 427      * if toggling isn't necessary according to our records (which could be
 428      * wrong if some XS code has changed the locale behind our back) */
 429
 430     do_setlocale_c(LC_NUMERIC, PL_numeric_name);
 431     PL_numeric_standard = isNAME_C_OR_POSIX(PL_numeric_name);
 432     PL_numeric_underlying = TRUE;
 433     set_numeric_radix(1);
 434
 435 #  ifdef DEBUGGING
 436
 437     if (DEBUG_L_TEST || debug_initialization) {
 438         PerlIO_printf(Perl_debug_log,
 439                           "LC_NUMERIC locale now is %s\n",
 440                           PL_numeric_name);
 441     }
 442
 443 #  endif
 444 #endif /* USE_LOCALE_NUMERIC */
 445
 446 }
 447
 448 /*
 449  * Set up for a new ctype locale.
 450  */
 451 STATIC void
 452 S_new_ctype(pTHX_ const char *newctype)
 453 {
 454
 455 #ifndef USE_LOCALE_CTYPE
 456
 457     PERL_ARGS_ASSERT_NEW_CTYPE;
 458     PERL_UNUSED_ARG(newctype);
 459     PERL_UNUSED_CONTEXT;
 460
 461 #else
 462
 463     /* Called after all libc setlocale() calls affecting LC_CTYPE, to tell
 464      * core Perl this and that 'newctype' is the name of the new locale.
 465      *
 466      * This function sets up the folding arrays for all 256 bytes, assuming
 467      * that tofold() is tolc() since fold case is not a concept in POSIX,
 468      *
 469      * Any code changing the locale (outside this file) should use
 470      * POSIX::setlocale, which calls this function.  Therefore this function
 471      * should be called directly only from this file and from
 472      * POSIX::setlocale() */
 473
 474     dVAR;
 475     UV i;
 476
 477     PERL_ARGS_ASSERT_NEW_CTYPE;
 478
 479     /* We will replace any bad locale warning with 1) nothing if the new one is
 480      * ok; or 2) a new warning for the bad new locale */
 481     if (PL_warn_locale) {
 482         SvREFCNT_dec_NN(PL_warn_locale);
 483         PL_warn_locale = NULL;
 484     }
 485
 486     PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
 487
 488     /* A UTF-8 locale gets standard rules.  But note that code still has to
 489      * handle this specially because of the three problematic code points */
 490     if (PL_in_utf8_CTYPE_locale) {
 491         Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
 492     }
 493     else {
 494         /* Assume enough space for every character being bad.  4 spaces each
 495          * for the 94 printable characters that are output like "'x' "; and 5
 496          * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
 497          * NUL */
 498         char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ];
 499
 500         /* Don't check for problems if we are suppressing the warnings */
 501         bool check_for_problems = ckWARN_d(WARN_LOCALE)
 502                                || UNLIKELY(DEBUG_L_TEST);
 503         bool multi_byte_locale = FALSE;     /* Assume is a single-byte locale
 504                                                to start */
 505         unsigned int bad_count = 0;         /* Count of bad characters */
 506
 507         for (i = 0; i < 256; i++) {
 508             if (isUPPER_LC((U8) i))
 509                 PL_fold_locale[i] = (U8) toLOWER_LC((U8) i);
 510             else if (isLOWER_LC((U8) i))
 511                 PL_fold_locale[i] = (U8) toUPPER_LC((U8) i);
 512             else
 513                 PL_fold_locale[i] = (U8) i;
 514
 515             /* If checking for locale problems, see if the native ASCII-range
 516              * printables plus \n and \t are in their expected categories in
 517              * the new locale.  If not, this could mean big trouble, upending
 518              * Perl's and most programs' assumptions, like having a
 519              * metacharacter with special meaning become a \w.  Fortunately,
 520              * it's very rare to find locales that aren't supersets of ASCII
 521              * nowadays.  It isn't a problem for most controls to be changed
 522              * into something else; we check only \n and \t, though perhaps \r
 523              * could be an issue as well. */
 524             if (    check_for_problems
 525                 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
 526             {
 527                 if ((    isALPHANUMERIC_A(i) && ! isALPHANUMERIC_LC(i))
 528                      || (isPUNCT_A(i) && ! isPUNCT_LC(i))
 529                      || (isBLANK_A(i) && ! isBLANK_LC(i))
 530                      || (i == '\n' && ! isCNTRL_LC(i)))
 531                 {
 532                     if (bad_count) {    /* Separate multiple entries with a
 533                                            blank */
 534                         bad_chars_list[bad_count++] = ' ';
 535                     }
 536                     bad_chars_list[bad_count++] = '\'';
 537                     if (isPRINT_A(i)) {
 538                         bad_chars_list[bad_count++] = (char) i;
 539                     }
 540                     else {
 541                         bad_chars_list[bad_count++] = '\\';
 542                         if (i == '\n') {
 543                             bad_chars_list[bad_count++] = 'n';
 544                         }
 545                         else {
 546                             assert(i == '\t');
 547                             bad_chars_list[bad_count++] = 't';
 548                         }
 549                     }
 550                     bad_chars_list[bad_count++] = '\'';
 551                     bad_chars_list[bad_count] = '\0';
 552                 }
 553             }
 554         }
 555
 556 #  ifdef MB_CUR_MAX
 557
 558         /* We only handle single-byte locales (outside of UTF-8 ones; so if
 559          * this locale requires more than one byte, there are going to be
 560          * problems. */
 561         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
 562                  "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
 563                  __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
 564
 565         if (check_for_problems && MB_CUR_MAX > 1
 566
 567                /* Some platforms return MB_CUR_MAX > 1 for even the "C"
 568                 * locale.  Just assume that the implementation for them (plus
 569                 * for POSIX) is correct and the > 1 value is spurious.  (Since
 570                 * these are specially handled to never be considered UTF-8
 571                 * locales, as long as this is the only problem, everything
 572                 * should work fine */
 573             && strNE(newctype, "C") && strNE(newctype, "POSIX"))
 574         {
 575             multi_byte_locale = TRUE;
 576         }
 577
 578 #  endif
 579
 580         if (bad_count || multi_byte_locale) {
 581             PL_warn_locale = Perl_newSVpvf(aTHX_
 582                              "Locale '%s' may not work well.%s%s%s\n",
 583                              newctype,
 584                              (multi_byte_locale)
 585                               ? "  Some characters in it are not recognized by"
 586                                 " Perl."
 587                               : "",
 588                              (bad_count)
 589                               ? "\nThe following characters (and maybe others)"
 590                                 " may not have the same meaning as the Perl"
 591                                 " program expects:\n"
 592                               : "",
 593                              (bad_count)
 594                               ? bad_chars_list
 595                               : ""
 596                             );
 597             /* If we are actually in the scope of the locale or are debugging,
 598              * output the message now.  If not in that scope, we save the
 599              * message to be output at the first operation using this locale,
 600              * if that actually happens.  Most programs don't use locales, so
 601              * they are immune to bad ones.  */
 602             if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
 603
 604                 /* We have to save 'newctype' because the setlocale() just
 605                  * below may destroy it.  The next setlocale() further down
 606                  * should restore it properly so that the intermediate change
 607                  * here is transparent to this function's caller */
 608                 const char * const badlocale = savepv(newctype);
 609
 610                 do_setlocale_c(LC_CTYPE, "C");
 611
 612                 /* The '0' below suppresses a bogus gcc compiler warning */
 613                 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
 614
 615                 do_setlocale_c(LC_CTYPE, badlocale);
 616                 Safefree(badlocale);
 617
 618                 if (IN_LC(LC_CTYPE)) {
 619                     SvREFCNT_dec_NN(PL_warn_locale);
 620                     PL_warn_locale = NULL;
 621                 }
 622             }
 623         }
 624     }
 625
 626 #endif /* USE_LOCALE_CTYPE */
 627
 628 }
 629
 630 void
 631 Perl__warn_problematic_locale()
 632 {
 633
 634 #ifdef USE_LOCALE_CTYPE
 635
 636     dTHX;
 637
 638     /* Internal-to-core function that outputs the message in PL_warn_locale,
 639      * and then NULLS it.  Should be called only through the macro
 640      * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */
 641
 642     if (PL_warn_locale) {
 643         Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
 644                              SvPVX(PL_warn_locale),
 645                              0 /* dummy to avoid compiler warning */ );
 646         SvREFCNT_dec_NN(PL_warn_locale);
 647         PL_warn_locale = NULL;
 648     }
 649
 650 #endif
 651
 652 }
 653
 654 STATIC void
 655 S_new_collate(pTHX_ const char *newcoll)
 656 {
 657
 658 #ifndef USE_LOCALE_COLLATE
 659
 660     PERL_UNUSED_ARG(newcoll);
 661     PERL_UNUSED_CONTEXT;
 662
 663 #else
 664
 665     /* Called after all libc setlocale() calls affecting LC_COLLATE, to tell
 666      * core Perl this and that 'newcoll' is the name of the new locale.
 667      *
 668      * The design of locale collation is that every locale change is given an
  669      * index 'PL_collation_ix'.  The first time a string participates in an
 670      * operation that requires collation while locale collation is active, it
 671      * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()).  That
 672      * magic includes the collation index, and the transformation of the string
 673      * by strxfrm(), q.v.  That transformation is used when doing comparisons,
 674      * instead of the string itself.  If a string changes, the magic is
 675      * cleared.  The next time the locale changes, the index is incremented,
 676      * and so we know during a comparison that the transformation is not
 677      * necessarily still valid, and so is recomputed.  Note that if the locale
 678      * changes enough times, the index could wrap (a U32), and it is possible
 679      * that a transformation would improperly be considered valid, leading to
 680      * an unlikely bug */
 681
 682     if (! newcoll) {
 683         if (PL_collation_name) {
 684             ++PL_collation_ix;
 685             Safefree(PL_collation_name);
 686             PL_collation_name = NULL;
 687         }
 688         PL_collation_standard = TRUE;
 689       is_standard_collation:
 690         PL_collxfrm_base = 0;
 691         PL_collxfrm_mult = 2;
 692         PL_in_utf8_COLLATE_locale = FALSE;
 693         PL_strxfrm_NUL_replacement = '\0';
 694         PL_strxfrm_max_cp = 0;
 695         return;
 696     }
 697
 698     /* If this is not the same locale as currently, set the new one up */
 699     if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
 700         ++PL_collation_ix;
 701         Safefree(PL_collation_name);
 702         PL_collation_name = stdize_locale(savepv(newcoll));
 703         PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
 704         if (PL_collation_standard) {
 705             goto is_standard_collation;
 706         }
 707
 708         PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
 709         PL_strxfrm_NUL_replacement = '\0';
 710         PL_strxfrm_max_cp = 0;
 711
 712         /* A locale collation definition includes primary, secondary, tertiary,
 713          * etc. weights for each character.  To sort, the primary weights are
 714          * used, and only if they compare equal, then the secondary weights are
 715          * used, and only if they compare equal, then the tertiary, etc.
 716          *
 717          * strxfrm() works by taking the input string, say ABC, and creating an
 718          * output transformed string consisting of first the primary weights,
 719          * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
 720          * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ ....  Some characters
 721          * may not have weights at every level.  In our example, let's say B
 722          * doesn't have a tertiary weight, and A doesn't have a secondary
 723          * weight.  The constructed string is then going to be
 724          *  A¹B¹C¹ B²C² A³C³ ....
 725          * This has the desired effect that strcmp() will look at the secondary
 726          * or tertiary weights only if the strings compare equal at all higher
 727          * priority weights.  The spaces shown here, like in
 728          *  "A¹B¹C¹ A²B²C² "
 729          * are not just for readability.  In the general case, these must
 730          * actually be bytes, which we will call here 'separator weights'; and
 731          * they must be smaller than any other weight value, but since these
 732          * are C strings, only the terminating one can be a NUL (some
 733          * implementations may include a non-NUL separator weight just before
 734          * the NUL).  Implementations tend to reserve 01 for the separator
 735          * weights.  They are needed so that a shorter string's secondary
 736          * weights won't be misconstrued as primary weights of a longer string,
 737          * etc.  By making them smaller than any other weight, the shorter
 738          * string will sort first.  (Actually, if all secondary weights are
 739          * smaller than all primary ones, there is no need for a separator
 740          * weight between those two levels, etc.)
 741          *
 742          * The length of the transformed string is roughly a linear function of
 743          * the input string.  It's not exactly linear because some characters
 744          * don't have weights at all levels.  When we call strxfrm() we have to
 745          * allocate some memory to hold the transformed string.  The
 746          * calculations below try to find coefficients 'm' and 'b' for this
 747          * locale so that m*x + b equals how much space we need, given the size
 748          * of the input string in 'x'.  If we calculate too small, we increase
 749          * the size as needed, and call strxfrm() again, but it is better to
 750          * get it right the first time to avoid wasted expensive string
 751          * transformations. */
 752
 753         {
 754             /* We use the string below to find how long the transformation of it
 755              * is.  Almost all locales are supersets of ASCII, or at least the
 756              * ASCII letters.  We use all of them, half upper half lower,
 757              * because if we used fewer, we might hit just the ones that are
 758              * outliers in a particular locale.  Most of the strings being
 759              * collated will contain a preponderance of letters, and even if
 760              * they are above-ASCII, they are likely to have the same number of
 761              * weight levels as the ASCII ones.  It turns out that digits tend
 762              * to have fewer levels, and some punctuation has more, but those
 763              * are relatively sparse in text, and khw believes this gives a
 764              * reasonable result, but it could be changed if experience so
 765              * dictates. */
 766             const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
 767             char * x_longer;        /* Transformed 'longer' */
 768             Size_t x_len_longer;    /* Length of 'x_longer' */
 769
 770             char * x_shorter;   /* We also transform a substring of 'longer' */
 771             Size_t x_len_shorter;
 772
 773             /* _mem_collxfrm() is used to get the transformation (though here we
 774              * are interested only in its length).  It is used because it has
 775              * the intelligence to handle all cases, but to work, it needs some
 776              * values of 'm' and 'b' to get it started.  For the purposes of
 777              * this calculation we use a very conservative estimate of 'm' and
 778              * 'b'.  This assumes a weight can be multiple bytes, enough to
 779              * hold any UV on the platform, and there are 5 levels, 4 weight
 780              * bytes, and a trailing NUL.  */
 781             PL_collxfrm_base = 5;
 782             PL_collxfrm_mult = 5 * sizeof(UV);
 783
 784             /* Find out how long the transformation really is */
 785             x_longer = _mem_collxfrm(longer,
 786                                      sizeof(longer) - 1,
 787                                      &x_len_longer,
 788
 789                                      /* We avoid converting to UTF-8 in the
 790                                       * called function by telling it the
 791                                       * string is in UTF-8 if the locale is a
 792                                       * UTF-8 one.  Since the string passed
 793                                       * here is invariant under UTF-8, we can
 794                                       * claim it's UTF-8 even though it isn't.
 795                                       * */
 796                                      PL_in_utf8_COLLATE_locale);
 797             Safefree(x_longer);
 798
 799             /* Find out how long the transformation of a substring of 'longer'
 800              * is.  Together the lengths of these transformations are
 801              * sufficient to calculate 'm' and 'b'.  The substring is all of
 802              * 'longer' except the first character.  This minimizes the chances
 803              * of being swayed by outliers */
 804             x_shorter = _mem_collxfrm(longer + 1,
 805                                       sizeof(longer) - 2,
 806                                       &x_len_shorter,
 807                                       PL_in_utf8_COLLATE_locale);
 808             Safefree(x_shorter);
 809
 810             /* If the results are nonsensical for this simple test, the whole
 811              * locale definition is suspect.  Mark it so that locale collation
 812              * is not active at all for it.  XXX Should we warn? */
 813             if (   x_len_shorter == 0
 814                 || x_len_longer == 0
 815                 || x_len_shorter >= x_len_longer)
 816             {
 817                 PL_collxfrm_mult = 0;
 818                 PL_collxfrm_base = 0;
 819             }
 820             else {
 821                 SSize_t base;       /* Temporary */
 822
 823                 /* We have both:    m * strlen(longer)  + b = x_len_longer
 824                  *                  m * strlen(shorter) + b = x_len_shorter;
 825                  * subtracting yields:
 826                  *          m * (strlen(longer) - strlen(shorter))
 827                  *                             = x_len_longer - x_len_shorter
 828                  * But we have set things up so that 'shorter' is 1 byte smaller
 829                  * than 'longer'.  Hence:
 830                  *          m = x_len_longer - x_len_shorter
 831                  *
 832                  * But if something went wrong, make sure the multiplier is at
 833                  * least 1.
 834                  */
 835                 if (x_len_longer > x_len_shorter) {
 836                     PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
 837                 }
 838                 else {
 839                     PL_collxfrm_mult = 1;
 840                 }
 841
 842                 /*     mx + b = len
 843                  * so:      b = len - mx
 844                  * but in case something has gone wrong, make sure it is
 845                  * non-negative */
 846                 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
 847                 if (base < 0) {
 848                     base = 0;
 849                 }
 850
 851                 /* Add 1 for the trailing NUL */
 852                 PL_collxfrm_base = base + 1;
 853             }
 854
 855 #  ifdef DEBUGGING
 856
 857             if (DEBUG_L_TEST || debug_initialization) {
 858                 PerlIO_printf(Perl_debug_log,
 859                     "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
 860                     "x_len_longer=%zu,"
 861                     " collate multipler=%zu, collate base=%zu\n",
 862                     __FILE__, __LINE__,
 863                     PL_in_utf8_COLLATE_locale,
 864                     x_len_shorter, x_len_longer,
 865                     PL_collxfrm_mult, PL_collxfrm_base);
 866             }
 867 #  endif
 868
 869         }
 870     }
 871
 872 #endif /* USE_LOCALE_COLLATE */
 873
 874 }
 875
 876 #ifdef WIN32
 877
 878 STATIC char *
 879 S_win32_setlocale(pTHX_ int category, const char* locale)
 880 {
 881     /* This, for Windows, emulates POSIX setlocale() behavior.  There is no
 882      * difference between the two unless the input locale is "", which normally
 883      * means on Windows to get the machine default, which is set via the
 884      * computer's "Regional and Language Options" (or its current equivalent).
 885      * In POSIX, it instead means to find the locale from the user's
 886      * environment.  This routine changes the Windows behavior to first look in
 887      * the environment, and, if anything is found, use that instead of going to
 888      * the machine default.  If there is no environment override, the machine
 889      * default is used, by calling the real setlocale() with "".
 890      *
 891      * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
 892      * use the particular category's variable if set; otherwise to use the LANG
 893      * variable.  The return is whatever the last real setlocale() yielded. */
 894
 895     bool override_LC_ALL = FALSE;
 896     char * result;
 897     unsigned int i;
 898
 899     if (locale && strEQ(locale, "")) {
 900         const char * env_locale = NULL;     /* What the environment says */
 901
 902 #  ifdef LC_ALL
 903
 904         env_locale = PerlEnv_getenv("LC_ALL");
 905         if (! env_locale && category == LC_ALL) {
 906             override_LC_ALL = TRUE;
 907         }
 908
 909 #  endif
 910
 911         /* LC_ALL isn't set (and isn't what is being asked for): try the
 912          * variable belonging to the requested category itself */
 913         if (! env_locale && ! override_LC_ALL) {
 914             for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
 915                 if (category == categories[i]) {
 916                     env_locale = PerlEnv_getenv(category_names[i]);
 917                     break;
 918                 }
 919             }
 920         }
 921
 922         /* Lowest priority is LANG; failing even that, "" makes the real
 923          * setlocale() use the machine default.  Never pass NULL along: that
 924          * would turn this request to set into a mere query. */
 925         if (! env_locale) {
 926             env_locale = PerlEnv_getenv("LANG");
 927             if (! env_locale) {
 928                 env_locale = "";
 929             }
 930         }
 931
 932         locale = env_locale;
 933     }
 934
 935     result = setlocale(category, locale);
 936     DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
 937                             setlocale_debug_string(category, locale, result)));
 938
 939     if (! override_LC_ALL)  {
 940         return result;
 941     }
 942
 943     /* Here the input category was LC_ALL, and we have set it to what is in the
 944      * LANG variable or the system default if there is no LANG.  But these have
 945      * lower priority than the other LC_foo variables, so override it for each
 946      * one that is set.  (If they are set to "", it means to use the same thing
 947      * we just set LC_ALL to, so can skip) */
 948
 949     for (i = 0; i < LC_ALL_INDEX; i++) {
 950         result = PerlEnv_getenv(category_names[i]);
 951         if (result && strNE(result, "")) {
 952             setlocale(categories[i], result);
 953             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
 954                 __FILE__, __LINE__,
 955                 setlocale_debug_string(categories[i], result, "not captured")));
 956         }
 957     }
 958
 959     result = setlocale(LC_ALL, NULL);
 960     DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
 961                                __FILE__, __LINE__,
 962                                setlocale_debug_string(LC_ALL, NULL, result)));
 963
 964     return result;
 965 }
 966
 967 #endif
 968
 969 char *
 970 Perl_setlocale(int category, const char * locale)
 971 {
 972     /* This wraps POSIX::setlocale().  A NULL 'locale' only queries. */
 973     /* What is returned is a savepv() copy of the name, or NULL if none */
 974     char * retval;
 975     char * newlocale;   /* Used only in the LC_ALL case below */
 976     dTHX;
 977
 978 #ifdef USE_LOCALE_NUMERIC
 979
 980     /* A NULL locale means only query what the current one is.  We have the
 981      * LC_NUMERIC name saved (we are normally switched into the C locale for
 982      * it), so a copy of the saved name is returned directly.  For LC_ALL,
 983      * switch to the underlying LC_NUMERIC so the query will yield the correct
 984      * results; all other categories don't require special handling */
 985     if (locale == NULL) {
 986         if (category == LC_NUMERIC) {
 987             return savepv(PL_numeric_name);
 988         }
 989
 990 #  ifdef LC_ALL
 991
 992         else if (category == LC_ALL) {
 993             SET_NUMERIC_UNDERLYING();
 994         }
 995
 996 #  endif
 997
 998     }
 999
1000 #endif
1001
1002     /* Save retval since subsequent setlocale() calls may overwrite it. */
1003     retval = savepv(do_setlocale_r(category, locale));
1004
1005     DEBUG_L(PerlIO_printf(Perl_debug_log,
1006         "%s:%d: %s\n", __FILE__, __LINE__,
1007             setlocale_debug_string(category, locale, retval)));
1008     if (! retval) {
1009         /* Should never happen that a query would return an error, but be
1010          * sure and reset to C locale */
1011         if (locale == 0) {  /* 0 here is NULL, i.e., this was a query */
1012             SET_NUMERIC_STANDARD();
1013         }
1014
1015         return NULL;
1016     }
1017
1018     /* If locale == NULL, we are just querying the state, but may have switched
1019      * to NUMERIC_UNDERLYING.  Switch back before returning. */
1020     if (locale == NULL) {
1021         SET_NUMERIC_STANDARD();
1022         return retval;
1023     }
1024
1025     /* Now that we have switched locales, we have to update our records to
1026      * correspond. */
1027
1028     switch (category) {
1029
1030 #ifdef USE_LOCALE_CTYPE
1031
1032         case LC_CTYPE:
1033             new_ctype(retval);
1034             break;
1035
1036 #endif
1037 #ifdef USE_LOCALE_COLLATE
1038
1039         case LC_COLLATE:
1040             new_collate(retval);
1041             break;
1042
1043 #endif
1044 #ifdef USE_LOCALE_NUMERIC
1045
1046         case LC_NUMERIC:
1047             new_numeric(retval);
1048             break;
1049
1050 #endif
1051 #ifdef LC_ALL
1052
1053         case LC_ALL:
1054
1055             /* LC_ALL updates all the things we care about.  The values may not
1056              * be the same as 'retval', as the locale "" may have set things
1057              * individually, so each category is queried for its own name */
1058
1059 #  ifdef USE_LOCALE_CTYPE
1060
1061             newlocale = do_setlocale_c(LC_CTYPE, NULL);
1062             new_ctype(newlocale);
1063
1064 #  endif /* USE_LOCALE_CTYPE */
1065 #  ifdef USE_LOCALE_COLLATE
1066
1067             newlocale = do_setlocale_c(LC_COLLATE, NULL);
1068             new_collate(newlocale);
1069
1070 #  endif
1071 #  ifdef USE_LOCALE_NUMERIC
1072
1073             newlocale = do_setlocale_c(LC_NUMERIC, NULL);
1074             new_numeric(newlocale);
1075
1076 #  endif /* USE_LOCALE_NUMERIC */
1077 #endif /* LC_ALL */
1078             /* (No 'break': a compiled-in LC_ALL case drops into 'default') */
1079         default:
1080             break;
1081     }
1082
1083     return retval;
1084
1085
1086 }
1087
1088 PERL_STATIC_INLINE const char *
1089 S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset)
1090 {
1091     /* Copy the NUL-terminated 'string', NUL included, to '*buf' + 'offset', first
1092      * creating or enlarging '*buf' (size '*buf_size') if need be; returns '*buf' */
1093     const Size_t copy_len = strlen(string) + 1;     /* Counts the NUL */
1094     const Size_t needed = offset + copy_len;
1095
1096     PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
1097
1098     if (*buf_size == 0 || needed > *buf_size) {
1099         if (*buf_size == 0) {   /* Treated as not yet allocated */
1100             Newx(*buf, needed, char);
1101         }
1102         else {                  /* Allocated, but too small */
1103             Renew(*buf, needed, char);
1104         }
1105         *buf_size = needed;
1106     }
1107     Copy(string, *buf + offset, copy_len, char);
1108     return *buf;
1109 }
1110
1111 /*
1112
1113 =head1 Locale-related functions and macros
1114
1115 =for apidoc Perl_langinfo
1116
1117 This is an (almost ª) drop-in replacement for the system C<L<nl_langinfo(3)>>,
1118 taking the same C<item> parameter values, and returning the same information.
1119 But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
1120 of Perl's locale handling from your code, and can be used on systems that lack
1121 a native C<nl_langinfo>.
1122
1123 Expanding on these:
1124
1125 =over
1126
1127 =item *
1128
1129 It delivers the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
1130 without you having to write extra code.  The reason for the extra code would be
1131 because these are from the C<LC_NUMERIC> locale category, which is normally
1132 kept set to the C locale by Perl, no matter what the underlying locale is
1133 supposed to be, and so to get the expected results, you have to temporarily
1134 toggle into the underlying locale, and later toggle back.  (You could use
1135 plain C<nl_langinfo> and C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this
1136 but then you wouldn't get the other advantages of C<Perl_langinfo()>; not
1137 keeping C<LC_NUMERIC> in the C locale would break a lot of CPAN, which is
1138 expecting the radix (decimal point) character to be a dot.)
1139
1140 =item *
1141
1142 Depending on C<item>, it works on systems that don't have C<nl_langinfo>, hence
1143 makes your code more portable.  Of the fifty-some possible items specified by
1144 the POSIX 2008 standard,
1145 L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
1146 only two are completely unimplemented.  It uses various techniques to recover
1147 the other items, including calling C<L<localeconv(3)>>, and C<L<strftime(3)>>,
1148 both of which are specified in C89, so should always be available.  Later
1149 C<strftime()> versions have additional capabilities; C<""> is returned for
1150 those not available on your system.
1151
1152 The details for those items which may differ from what this emulation returns
1153 and what a native C<nl_langinfo()> would return are:
1154
1155 =over
1156
1157 =item C<CODESET>
1158
1159 =item C<ERA>
1160
1161 Unimplemented, so returns C<"">.
1162
1163 =item C<YESEXPR>
1164
1165 =item C<NOEXPR>
1166
1167 Only the values for English are returned.  Earlier POSIX standards also
1168 specified C<YESSTR> and C<NOSTR>, but these have been removed from POSIX 2008,
1169 and aren't supported by C<Perl_langinfo>.
1170
1171 =item C<D_FMT>
1172
1173 Always evaluates to C<%x>, the locale's appropriate date representation.
1174
1175 =item C<T_FMT>
1176
1177 Always evaluates to C<%X>, the locale's appropriate time representation.
1178
1179 =item C<D_T_FMT>
1180
1181 Always evaluates to C<%c>, the locale's appropriate date and time
1182 representation.
1183
1184 =item C<CRNCYSTR>
1185
1186 The return may be incorrect for those rare locales where the currency symbol
1187 replaces the radix character.
1188 Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1189 to work differently.
1190
1191 =item C<ALT_DIGITS>
1192
1193 Currently this gives the same results as Linux does.
1194 Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
1195 to work differently.
1196
1197 =item C<ERA_D_FMT>
1198
1199 =item C<ERA_T_FMT>
1200
1201 =item C<ERA_D_T_FMT>
1202
1203 =item C<T_FMT_AMPM>
1204
1205 These are derived by using C<strftime()>, and not all versions of that function
1206 know about them.  C<""> is returned for these on such systems.
1207
1208 =back
1209
1210 When using C<Perl_langinfo> on systems that don't have a native
1211 C<nl_langinfo()>, you must
1212
1213  #include "perl_langinfo.h"
1214
1215 before the C<perl.h> C<#include>.  You can replace your C<langinfo.h>
1216 C<#include> with this one.  (Doing it this way keeps out the symbols that plain
1217 C<langinfo.h> imports into the namespace for code that doesn't need it.)
1218
1219 You also should not use the bare C<langinfo.h> item names, but should preface
1220 them with C<PERL_>, so use C<PERL_RADIXCHAR> instead of plain C<RADIXCHAR>.
1221 The C<PERL_I<foo>> versions will also work for this function on systems that do
1222 have a native C<nl_langinfo>.
1223
1224 =item *
1225
1226 It is thread-friendly, returning its result in a buffer that won't be
1227 overwritten by another thread, so you don't have to code for that possibility.
1228 The buffer can be overwritten by the next call to C<nl_langinfo> or
1229 C<Perl_langinfo> in the same thread.
1230
1231 =item *
1232
1233 ª It returns S<C<const char *>>, whereas plain C<nl_langinfo()> returns S<C<char
1234 *>>, but you are (only by documentation) forbidden to write into the buffer.
1235 By declaring this C<const>, the compiler enforces this restriction.  The extra
1236 C<const> is why this isn't an unequivocal drop-in replacement for
1237 C<nl_langinfo>.
1238
1239 =back
1240
1241 The original impetus for C<Perl_langinfo()> was so that code that needs to
1242 find out the current currency symbol, floating point radix character, or digit
1243 grouping separator can use, on all systems, the simpler and more
1244 thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
1245 pain to make thread-friendly.  For other fields returned by C<localeconv>, it
1246 is better to use the methods given in L<perlcall> to call
1247 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
1248
1249 =cut
1250
1251 */
1252
1253 const char *
1254 #ifdef HAS_NL_LANGINFO
1255 Perl_langinfo(const nl_item item)
1256 #else   /* 'item' is a plain int when HAS_NL_LANGINFO is not defined */
1257 Perl_langinfo(const int item)
1258 #endif
1259 {
1260     return my_nl_langinfo(item, TRUE);  /* TRUE is the 'toggle' argument */
1261 }
1262
1263 const char *
1264 #ifdef HAS_NL_LANGINFO
1265 S_my_nl_langinfo(const nl_item item, bool toggle)
1266 #else
1267 S_my_nl_langinfo(const int item, bool toggle)
1268 #endif
1269 {
1270     dTHX;
1271
1272 #if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available.  */
1273 #if   ! defined(HAS_POSIX_2008_LOCALE)
1274
1275     /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
1276      * for those items dependent on it.  This must be copied to a buffer before
1277      * switching back, as some systems destroy the buffer when setlocale() is
1278      * called */
1279
1280     LOCALE_LOCK;
1281
1282     if (toggle) {
1283         if (item == PERL_RADIXCHAR || item == PERL_THOUSEP) {
1284             do_setlocale_c(LC_NUMERIC, PL_numeric_name);
1285         }
1286         else {
1287             toggle = FALSE;
1288         }
1289     }
1290
1291     save_to_buffer(nl_langinfo(item), &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1292
1293     if (toggle) {
1294         do_setlocale_c(LC_NUMERIC, "C");
1295     }
1296
1297     LOCALE_UNLOCK;
1298
1299     return PL_langinfo_buf;
1300
1301 #  else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */
1302
1303     bool do_free = FALSE;
1304     locale_t cur = uselocale((locale_t) 0);
1305
1306     if (cur == LC_GLOBAL_LOCALE) {
1307         cur = duplocale(LC_GLOBAL_LOCALE);
1308         do_free = TRUE;
1309     }
1310
1311     if (   toggle
1312         && (item == PERL_RADIXCHAR || item == PERL_THOUSEP))
1313     {
1314         cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
1315         do_free = TRUE;
1316     }
1317
1318     save_to_buffer(nl_langinfo_l(item, cur),
1319                    &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1320     if (do_free) {
1321         freelocale(cur);
1322     }
1323
1324     return PL_langinfo_buf;
1325
1326 #    endif
1327 #else   /* Below, emulate nl_langinfo as best we can */
1328 #  ifdef HAS_LOCALECONV
1329
1330     const struct lconv* lc;
1331
1332 #  endif
1333 #  ifdef HAS_STRFTIME
1334
1335     struct tm tm;
1336     bool return_format = FALSE; /* Return the %format, not the value */
1337     const char * format;
1338
1339 #  endif
1340
1341     /* We copy the results to a per-thread buffer, even if not multi-threaded.
1342      * This is in part to simplify this code, and partly because we need a
1343      * buffer anyway for strftime(), and partly because a call of localeconv()
1344      * could otherwise wipe out the buffer, and the programmer would not be
1345      * expecting this, as this is a nl_langinfo() substitute after all, so s/he
1346      * might be thinking their localeconv() is safe until another localeconv()
1347      * call. */
1348
1349     switch (item) {
1350         Size_t len;
1351         const char * retval;
1352
1353         /* These 2 are unimplemented */
1354         case PERL_CODESET:
1355         case PERL_ERA:          /* For use with strftime() %E modifier */
1356
1357         default:
1358             return "";
1359
1360         /* We use only an English set, since we don't know any more */
1361         case PERL_YESEXPR:   return "^[+1yY]";
1362         case PERL_NOEXPR:    return "^[-0nN]";
1363
1364 #  ifdef HAS_LOCALECONV
1365
1366         case PERL_CRNCYSTR:
1367
1368             LOCALE_LOCK;
1369
1370             lc = localeconv();
1371             if (! lc || ! lc->currency_symbol || strEQ("", lc->currency_symbol))
1372             {
1373                 LOCALE_UNLOCK;
1374                 return "";
1375             }
1376
1377             /* Leave the first spot empty to be filled in below */
1378             save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
1379                            &PL_langinfo_bufsize, 1);
1380             if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
1381             { /*  khw couldn't figure out how the localedef specifications
1382                   would show that the $ should replace the radix; this is
1383                   just a guess as to how it might work.*/
1384                 *PL_langinfo_buf = '.';
1385             }
1386             else if (lc->p_cs_precedes) {
1387                 *PL_langinfo_buf = '-';
1388             }
1389             else {
1390                 *PL_langinfo_buf = '+';
1391             }
1392
1393             LOCALE_UNLOCK;
1394             break;
1395
1396         case PERL_RADIXCHAR:
1397         case PERL_THOUSEP:
1398
1399             LOCALE_LOCK;
1400
1401             if (toggle) {
1402                 do_setlocale_c(LC_NUMERIC, PL_numeric_name);
1403             }
1404
1405             lc = localeconv();
1406             if (! lc) {
1407                 retval = "";
1408             }
1409             else {
1410                 retval = (item == PERL_RADIXCHAR)
1411                          ? lc->decimal_point
1412                          : lc->thousands_sep;
1413                 if (! retval) {
1414                     retval = "";
1415                 }
1416             }
1417
1418             save_to_buffer(retval, &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
1419
1420             if (toggle) {
1421                 do_setlocale_c(LC_NUMERIC, "C");
1422             }
1423
1424             LOCALE_UNLOCK;
1425
1426             break;
1427
1428 #  endif
1429 #  ifdef HAS_STRFTIME
1430
1431         /* These are defined by C89, so we assume that strftime supports them,
1432          * and so are returned unconditionally; they may not be what the locale
1433          * actually says, but should give good enough results for someone using
1434          * them as formats (as opposed to trying to parse them to figure out
1435          * what the locale says).  The other format items are actually tested to
1436          * verify they work on the platform */
1437         case PERL_D_FMT:         return "%x";
1438         case PERL_T_FMT:         return "%X";
1439         case PERL_D_T_FMT:       return "%c";
1440
1441         /* These formats are only available in later strftime()s */
1442         case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT:
1443         case PERL_T_FMT_AMPM:
1444
1445         /* The rest can be gotten from most versions of strftime(). */
1446         case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3:
1447         case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6:
1448         case PERL_ABDAY_7:
1449         case PERL_ALT_DIGITS:
1450         case PERL_AM_STR: case PERL_PM_STR:
1451         case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3:
1452         case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6:
1453         case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9:
1454         case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12:
1455         case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4:
1456         case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7:
1457         case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4:
1458         case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8:
1459         case PERL_MON_9: case PERL_MON_10: case PERL_MON_11: case PERL_MON_12:
1460
1461             LOCALE_LOCK;
1462
1463             init_tm(&tm);   /* Precaution against core dumps */
1464             tm.tm_sec = 30;
1465             tm.tm_min = 30;
1466             tm.tm_hour = 6;
1467             tm.tm_year = 2017 - 1900;
1468             tm.tm_wday = 0;
1469             tm.tm_mon = 0;
1470             switch (item) {
1471                 default:
1472                     LOCALE_UNLOCK;
1473                     Perl_croak(aTHX_ "panic: %s: %d: switch case: %d problem",
1474                                              __FILE__, __LINE__, item);
1475                     NOT_REACHED; /* NOTREACHED */
1476
1477                 case PERL_PM_STR: tm.tm_hour = 18;
1478                 case PERL_AM_STR:
1479                     format = "%p";
1480                     break;
1481
1482                 case PERL_ABDAY_7: tm.tm_wday++;
1483                 case PERL_ABDAY_6: tm.tm_wday++;
1484                 case PERL_ABDAY_5: tm.tm_wday++;
1485                 case PERL_ABDAY_4: tm.tm_wday++;
1486                 case PERL_ABDAY_3: tm.tm_wday++;
1487                 case PERL_ABDAY_2: tm.tm_wday++;
1488                 case PERL_ABDAY_1:
1489                     format = "%a";
1490                     break;
1491
1492                 case PERL_DAY_7: tm.tm_wday++;
1493                 case PERL_DAY_6: tm.tm_wday++;
1494                 case PERL_DAY_5: tm.tm_wday++;
1495                 case PERL_DAY_4: tm.tm_wday++;
1496                 case PERL_DAY_3: tm.tm_wday++;
1497                 case PERL_DAY_2: tm.tm_wday++;
1498                 case PERL_DAY_1:
1499                     format = "%A";
1500                     break;
1501
1502                 case PERL_ABMON_12: tm.tm_mon++;
1503                 case PERL_ABMON_11: tm.tm_mon++;
1504                 case PERL_ABMON_10: tm.tm_mon++;
1505                 case PERL_ABMON_9: tm.tm_mon++;
1506                 case PERL_ABMON_8: tm.tm_mon++;
1507                 case PERL_ABMON_7: tm.tm_mon++;
1508                 case PERL_ABMON_6: tm.tm_mon++;
1509                 case PERL_ABMON_5: tm.tm_mon++;
1510                 case PERL_ABMON_4: tm.tm_mon++;
1511                 case PERL_ABMON_3: tm.tm_mon++;
1512                 case PERL_ABMON_2: tm.tm_mon++;
1513                 case PERL_ABMON_1:
1514                     format = "%b";
1515                     break;
1516
1517                 case PERL_MON_12: tm.tm_mon++;
1518                 case PERL_MON_11: tm.tm_mon++;
1519                 case PERL_MON_10: tm.tm_mon++;
1520                 case PERL_MON_9: tm.tm_mon++;
1521                 case PERL_MON_8: tm.tm_mon++;
1522                 case PERL_MON_7: tm.tm_mon++;
1523                 case PERL_MON_6: tm.tm_mon++;
1524                 case PERL_MON_5: tm.tm_mon++;
1525                 case PERL_MON_4: tm.tm_mon++;
1526                 case PERL_MON_3: tm.tm_mon++;
1527                 case PERL_MON_2: tm.tm_mon++;
1528                 case PERL_MON_1:
1529                     format = "%B";
1530                     break;
1531
1532                 case PERL_T_FMT_AMPM:
1533                     format = "%r";
1534                     return_format = TRUE;
1535                     break;
1536
1537                 case PERL_ERA_D_FMT:
1538                     format = "%Ex";
1539                     return_format = TRUE;
1540                     break;
1541
1542                 case PERL_ERA_T_FMT:
1543                     format = "%EX";
1544                     return_format = TRUE;
1545                     break;
1546
1547                 case PERL_ERA_D_T_FMT:
1548                     format = "%Ec";
1549                     return_format = TRUE;
1550                     break;
1551
1552                 case PERL_ALT_DIGITS:
1553                     tm.tm_wday = 0;
1554                     format = "%Ow";     /* Find the alternate digit for 0 */
1555                     break;
1556             }
1557
1558             /* We can't use my_strftime() because it doesn't look at tm_wday  */
1559             while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
1560                                  format, &tm))
1561             {
1562                 /* A zero return means one of:
1563                  *  a)  there wasn't enough space in PL_langinfo_buf
1564                  *  b)  the format, like a plain %p, returns empty
1565                  *  c)  it was an illegal format, though some implementations of
1566                  *      strftime will just return the illegal format as a plain
1567                  *      character sequence.
1568                  *
1569                  *  To quickly test for case 'b)', try again but precede the
1570                  *  format with a plain character.  If that result is still
1571                  *  empty, the problem is either 'a)' or 'c)' */
1572
1573                 Size_t format_size = strlen(format) + 1;
1574                 Size_t mod_size = format_size + 1;
1575                 char * mod_format;
1576                 char * temp_result;
1577
1578                 Newx(mod_format, mod_size, char);
1579                 Newx(temp_result, PL_langinfo_bufsize, char);
1580                 *mod_format = '\a';
1581                 my_strlcpy(mod_format + 1, format, mod_size);
1582                 len = strftime(temp_result,
1583                                PL_langinfo_bufsize,
1584                                mod_format, &tm);
1585                 Safefree(mod_format);
1586                 Safefree(temp_result);
1587
1588                 /* If 'len' is non-zero, it means that we had a case like %p
1589                  * which means the current locale doesn't use a.m. or p.m., and
1590                  * that is valid */
1591                 if (len == 0) {
1592
1593                     /* Here, still didn't work.  If we get well beyond a
1594                      * reasonable size, bail out to prevent an infinite loop. */
1595
1596                     if (PL_langinfo_bufsize > 100 * format_size) {
1597                         *PL_langinfo_buf = '\0';
1598                     }
1599                     else { /* Double the buffer size to retry;  Add 1 in case
1600                               original was 0, so we aren't stuck at 0. */
1601                         PL_langinfo_bufsize *= 2;
1602                         PL_langinfo_bufsize++;
1603                         Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
1604                         continue;
1605                     }
1606                 }
1607
1608                 break;
1609             }
1610
1611             /* Here, we got a result.
1612              *
1613              * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
1614              * alternate format for wday 0.  If the value is the same as the
1615              * normal 0, there isn't an alternate, so clear the buffer. */
1616             if (   item == PERL_ALT_DIGITS
1617                 && strEQ(PL_langinfo_buf, "0"))
1618             {
1619                 *PL_langinfo_buf = '\0';
1620             }
1621
1622             /* ALT_DIGITS is problematic.  Experiments on it showed that
1623              * strftime() did not always work properly when going from alt-9 to
1624              * alt-10.  Only a few locales have this item defined, and in all
1625              * of them on Linux that khw was able to find, nl_langinfo() merely
1626              * returned the alt-0 character, possibly doubled.  Most Unicode
1627              * digits are in blocks of 10 consecutive code points, so that is
1628              * sufficient information for those scripts, as we can infer alt-1,
1629              * alt-2, ....  But for a Japanese locale, a CJK ideographic 0 is
1630              * returned, and the CJK digits are not in code point order, so you
1631              * can't really infer anything.  The localedef for this locale did
1632              * specify the succeeding digits, so that strftime() works properly
1633              * on them, without needing to infer anything.  But the
1634              * nl_langinfo() return did not give sufficient information for the
1635              * caller to understand what's going on.  So until there is
1636              * evidence that it should work differently, this returns the alt-0
1637              * string for ALT_DIGITS.
1638              *
1639              * wday was chosen because its range is all a single digit.  Things
1640              * like tm_sec have two digits as the minimum: '00' */
1641
1642             LOCALE_UNLOCK;
1643
1644             /* If to return the format, not the value, overwrite the buffer
1645              * with it.  But some strftime()s will keep the original format if
1646              * illegal, so change those to "" */
1647             if (return_format) {
1648                 if (strEQ(PL_langinfo_buf, format)) {
1649                     *PL_langinfo_buf = '\0';
1650                 }
1651                 else {
1652                     save_to_buffer(format, &PL_langinfo_buf,
1653                                     &PL_langinfo_bufsize, 0);
1654                 }
1655             }
1656
1657             break;
1658
1659 #  endif
1660
1661     }
1662
1663     return PL_langinfo_buf;
1664
1665 #endif
1666
1667 }
1668
1669 /*
1670  * Initialize locale awareness.
1671  */
1672 int
1673 Perl_init_i18nl10n(pTHX_ int printwarn)
1674 {
1675     /* printwarn is
1676      *
1677      *    0 if not to output warning when setup locale is bad
1678      *    1 if to output warning based on value of PERL_BADLANG
1679      *    >1 if to output regardless of PERL_BADLANG
1680      *
1681      * returns
1682      *    1 = set ok or not applicable,
1683      *    0 = fallback to a locale of lower priority
1684      *   -1 = fallback to all locales failed, not even to the C locale
1685      *
1686      * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
1687      * set, debugging information is output.
1688      *
1689      * This looks more complicated than it is, mainly due to the #ifdefs.
1690      *
1691      * We try to set LC_ALL to the value determined by the environment.  If
1692      * there is no LC_ALL on this platform, we try the individual categories we
1693      * know about.  If this works, we are done.
1694      *
1695      * But if it doesn't work, we have to do something else.  We search the
1696      * environment variables ourselves instead of relying on the system to do
1697      * it.  We look at, in order, LC_ALL, LANG, a system default locale (if we
1698      * think there is one), and the ultimate fallback "C".  This is all done in
1699      * the same loop as above to avoid duplicating code, but it makes things
1700      * more complex.  The 'trial_locales' array is initialized with just one
1701      * element; it causes the behavior described in the paragraph above this to
1702      * happen.  If that fails, we add elements to 'trial_locales', and do extra
1703      * loop iterations to cause the behavior described in this paragraph.
1704      *
1705      * On Ultrix, the locale MUST come from the environment, so there is
1706      * preliminary code to set it.  I (khw) am not sure that it is necessary,
1707      * and that this couldn't be folded into the loop, but barring any real
1708      * platforms to test on, it's staying as-is
1709      *
1710      * A slight complication is that in embedded Perls, the locale may already
1711      * be set-up, and we don't want to get it from the normal environment
1712      * variables.  This is handled by having a special environment variable
1713      * indicate we're in this situation.  We simply set setlocale's 2nd
1714      * parameter to be a NULL instead of "".  That indicates to setlocale that
1715      * it is not to change anything, but to return the current value,
1716      * effectively initializing perl's db to what the locale already is.
1717      *
1718      * We play the same trick with NULL if a LC_ALL succeeds.  We call
1719      * setlocale() on the individual categores with NULL to get their existing
1720      * values for our db, instead of trying to change them.
1721      * */
1722
1723     int ok = 1;
1724
1725 #ifndef USE_LOCALE
1726
1727     PERL_UNUSED_ARG(printwarn);
1728
1729 #else  /* USE_LOCALE */
1730 #  ifdef __GLIBC__
1731
1732     const char * const language   = savepv(PerlEnv_getenv("LANGUAGE"));
1733
1734 #  endif
1735
1736     /* NULL uses the existing already set up locale */
1737     const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
1738                                         ? NULL
1739                                         : "";
1740     const char* trial_locales[5];   /* 5 = 1 each for "", LC_ALL, LANG, "", C */
1741     unsigned int trial_locales_count;
1742     const char * const lc_all     = savepv(PerlEnv_getenv("LC_ALL"));
1743     const char * const lang       = savepv(PerlEnv_getenv("LANG"));
1744     bool setlocale_failure = FALSE;
1745     unsigned int i;
1746
1747     /* A later getenv() could zap this, so only use here */
1748     const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
1749
1750     const bool locwarn = (printwarn > 1
1751                           || (          printwarn
1752                               && (    ! bad_lang_use_once
1753                                   || (
1754                                          /* disallow with "" or "0" */
1755                                          *bad_lang_use_once
1756                                        && strNE("0", bad_lang_use_once)))));
1757     bool done = FALSE;
1758     char * sl_result[NOMINAL_LC_ALL_INDEX + 1];   /* setlocale() return vals;
1759                                                      not copied so must be
1760                                                      looked at immediately */
1761     char * curlocales[NOMINAL_LC_ALL_INDEX + 1];  /* current locale for given
1762                                                      category; should have been
1763                                                      copied so aren't volatile
1764                                                    */
1765     char * locale_param;
1766
1767 #  ifdef WIN32
1768
1769     /* In some systems you can find out the system default locale
1770      * and use that as the fallback locale. */
1771 #    define SYSTEM_DEFAULT_LOCALE
1772 #  endif
1773 #  ifdef SYSTEM_DEFAULT_LOCALE
1774
1775     const char *system_default_locale = NULL;
1776
1777 #  endif
1778
1779 #  ifndef DEBUGGING
1780 #    define DEBUG_LOCALE_INIT(a,b,c)
1781 #  else
1782
1783     DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
1784
1785 #    define DEBUG_LOCALE_INIT(category, locale, result)                     \
1786         STMT_START {                                                        \
1787                 if (debug_initialization) {                                 \
1788                     PerlIO_printf(Perl_debug_log,                           \
1789                                   "%s:%d: %s\n",                            \
1790                                   __FILE__, __LINE__,                       \
1791                                   setlocale_debug_string(category,          \
1792                                                           locale,           \
1793                                                           result));         \
1794                 }                                                           \
1795         } STMT_END
1796
1797 /* Make sure the parallel arrays are properly set up */
1798 #    ifdef USE_LOCALE_NUMERIC
1799     assert(categories[LC_NUMERIC_INDEX] == LC_NUMERIC);
1800     assert(strEQ(category_names[LC_NUMERIC_INDEX], "LC_NUMERIC"));
1801 #    endif
1802 #    ifdef USE_LOCALE_CTYPE
1803     assert(categories[LC_CTYPE_INDEX] == LC_CTYPE);
1804     assert(strEQ(category_names[LC_CTYPE_INDEX], "LC_CTYPE"));
1805 #    endif
1806 #    ifdef USE_LOCALE_COLLATE
1807     assert(categories[LC_COLLATE_INDEX] == LC_COLLATE);
1808     assert(strEQ(category_names[LC_COLLATE_INDEX], "LC_COLLATE"));
1809 #    endif
1810 #    ifdef USE_LOCALE_TIME
1811     assert(categories[LC_TIME_INDEX] == LC_TIME);
1812     assert(strEQ(category_names[LC_TIME_INDEX], "LC_TIME"));
1813 #    endif
1814 #    ifdef USE_LOCALE_MESSAGES
1815     assert(categories[LC_MESSAGES_INDEX] == LC_MESSAGES);
1816     assert(strEQ(category_names[LC_MESSAGES_INDEX], "LC_MESSAGES"));
1817 #    endif
1818 #    ifdef USE_LOCALE_MONETARY
1819     assert(categories[LC_MONETARY_INDEX] == LC_MONETARY);
1820     assert(strEQ(category_names[LC_MONETARY_INDEX], "LC_MONETARY"));
1821 #    endif
1822 #    ifdef LC_ALL
1823     assert(categories[LC_ALL_INDEX] == LC_ALL);
1824     assert(strEQ(category_names[LC_ALL_INDEX], "LC_ALL"));
1825     assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX);
1826 #    endif
1827 #  endif    /* DEBUGGING */
1828 #  ifndef LOCALE_ENVIRON_REQUIRED
1829
1830     PERL_UNUSED_VAR(done);
1831     PERL_UNUSED_VAR(locale_param);
1832
1833 #  else
1834
1835     /*
1836      * Ultrix setlocale(..., "") fails if there are no environment
1837      * variables from which to get a locale name.
1838      */
1839
1840 #    ifdef LC_ALL
1841
1842     if (lang) {
1843         sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, setlocale_init);
1844         DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result[LC_ALL_INDEX]);
1845         if (sl_result[LC_ALL_INDEX])
1846             done = TRUE;
1847         else
1848             setlocale_failure = TRUE;
1849     }
1850     if (! setlocale_failure) {
1851         for (i = 0; i < LC_ALL_INDEX; i++) {
1852             locale_param = (! done && (lang || PerlEnv_getenv(category_names[i])))
1853                            ? setlocale_init
1854                            : NULL;
1855             sl_result[i] = do_setlocale_r(categories[i], locale_param);
1856             if (! sl_result[i]) {
1857                 setlocale_failure = TRUE;
1858             }
1859             DEBUG_LOCALE_INIT(categories[i], locale_param, sl_result[i]);
1860         }
1861     }
1862
1863 #    endif /* LC_ALL */
1864 #  endif /* LOCALE_ENVIRON_REQUIRED */
1865
1866     /* We try each locale in the list until we get one that works, or exhaust
1867      * the list.  Normally the loop is executed just once.  But if setting the
1868      * locale fails, inside the loop we add fallback trials to the array and so
1869      * will execute the loop multiple times */
1870     trial_locales[0] = setlocale_init;
1871     trial_locales_count = 1;
1872
1873     for (i= 0; i < trial_locales_count; i++) {
1874         const char * trial_locale = trial_locales[i];
1875
1876         if (i > 0) {
1877
1878             /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
1879              * when i==0, but I (khw) don't think that behavior makes much
1880              * sense */
1881             setlocale_failure = FALSE;
1882
1883 #  ifdef SYSTEM_DEFAULT_LOCALE
1884 #    ifdef WIN32
1885
1886             /* On Windows machines, an entry of "" after the 0th means to use
1887              * the system default locale, which we now proceed to get. */
1888             if (strEQ(trial_locale, "")) {
1889                 unsigned int j;
1890
1891                 /* Note that this may change the locale, but we are going to do
1892                  * that anyway just below */
1893                 system_default_locale = do_setlocale_c(LC_ALL, "");
1894                 DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
1895
1896                 /* Skip if invalid or if it's already on the list of locales to
1897                  * try */
1898                 if (! system_default_locale) {
1899                     goto next_iteration;
1900                 }
1901                 for (j = 0; j < trial_locales_count; j++) {
1902                     if (strEQ(system_default_locale, trial_locales[j])) {
1903                         goto next_iteration;
1904                     }
1905                 }
1906
1907                 trial_locale = system_default_locale;
1908             }
1909 #    endif /* WIN32 */
1910 #  endif /* SYSTEM_DEFAULT_LOCALE */
1911         }
1912
1913 #  ifdef LC_ALL
1914
1915         sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, trial_locale);
1916         DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result[LC_ALL_INDEX]);
1917         if (! sl_result[LC_ALL_INDEX]) {
1918             setlocale_failure = TRUE;
1919         }
1920         else {
1921             /* Since LC_ALL succeeded, it should have changed all the other
1922              * categories it can to its value; so we massage things so that the
1923              * setlocales below just return their category's current values.
1924              * This adequately handles the case in NetBSD where LC_COLLATE may
1925              * not be defined for a locale, and setting it individually will
1926              * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
1927              * the POSIX locale. */
1928             trial_locale = NULL;
1929         }
1930
1931 #  endif /* LC_ALL */
1932
1933         if (! setlocale_failure) {
1934             unsigned int j;
1935             for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
1936                 curlocales[j]
1937                         = savepv(do_setlocale_r(categories[j], trial_locale));
1938                 if (! curlocales[j]) {
1939                     setlocale_failure = TRUE;
1940                 }
1941                 DEBUG_LOCALE_INIT(categories[j], trial_locale, curlocales[j]);
1942             }
1943
1944             if (! setlocale_failure) {  /* All succeeded */
1945                 break;  /* Exit trial_locales loop */
1946             }
1947         }
1948
1949         /* Here, something failed; will need to try a fallback. */
1950         ok = 0;
1951
1952         if (i == 0) {
1953             unsigned int j;
1954
1955             if (locwarn) { /* Output failure info only on the first one */
1956
1957 #  ifdef LC_ALL
1958
1959                 PerlIO_printf(Perl_error_log,
1960                 "perl: warning: Setting locale failed.\n");
1961
1962 #  else /* !LC_ALL */
1963
1964                 PerlIO_printf(Perl_error_log,
1965                 "perl: warning: Setting locale failed for the categories:\n\t");
1966
1967                 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
1968                     if (! curlocales[j]) {
1969                         PerlIO_printf(Perl_error_log, category_names[j]);
1970                     }
1971                     else {
1972                         Safefree(curlocales[j]);
1973                     }
1974                 }
1975
1976                 PerlIO_printf(Perl_error_log, "and possibly others\n");
1977
1978 #  endif /* LC_ALL */
1979
1980                 PerlIO_printf(Perl_error_log,
1981                     "perl: warning: Please check that your locale settings:\n");
1982
1983 #  ifdef __GLIBC__
1984
1985                 PerlIO_printf(Perl_error_log,
1986                             "\tLANGUAGE = %c%s%c,\n",
1987                             language ? '"' : '(',
1988                             language ? language : "unset",
1989                             language ? '"' : ')');
1990 #  endif
1991
1992                 PerlIO_printf(Perl_error_log,
1993                             "\tLC_ALL = %c%s%c,\n",
1994                             lc_all ? '"' : '(',
1995                             lc_all ? lc_all : "unset",
1996                             lc_all ? '"' : ')');
1997
1998 #  if defined(USE_ENVIRON_ARRAY)
1999
2000                 {
2001                     char **e;
2002
2003                     /* Look through the environment for any variables of the
2004                      * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
2005                      * already handled above.  These are assumed to be locale
2006                      * settings.  Output them and their values. */
2007                     for (e = environ; *e; e++) {
2008                         const STRLEN prefix_len = sizeof("LC_") - 1;
2009                         STRLEN uppers_len;
2010
2011                         if (     strBEGINs(*e, "LC_")
2012                             && ! strBEGINs(*e, "LC_ALL=")
2013                             && (uppers_len = strspn(*e + prefix_len,
2014                                              "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
2015                             && ((*e)[prefix_len + uppers_len] == '='))
2016                         {
2017                             PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
2018                                 (int) (prefix_len + uppers_len), *e,
2019                                 *e + prefix_len + uppers_len + 1);
2020                         }
2021                     }
2022                 }
2023
2024 #  else
2025
2026                 PerlIO_printf(Perl_error_log,
2027                             "\t(possibly more locale environment variables)\n");
2028
2029 #  endif
2030
2031                 PerlIO_printf(Perl_error_log,
2032                             "\tLANG = %c%s%c\n",
2033                             lang ? '"' : '(',
2034                             lang ? lang : "unset",
2035                             lang ? '"' : ')');
2036
2037                 PerlIO_printf(Perl_error_log,
2038                             "    are supported and installed on your system.\n");
2039             }
2040
2041             /* Calculate what fallback locales to try.  We have avoided this
2042              * until we have to, because failure is quite unlikely.  This will
2043              * usually change the upper bound of the loop we are in.
2044              *
2045              * Since the system's default way of setting the locale has not
2046              * found one that works, We use Perl's defined ordering: LC_ALL,
2047              * LANG, and the C locale.  We don't try the same locale twice, so
2048              * don't add to the list if already there.  (On POSIX systems, the
2049              * LC_ALL element will likely be a repeat of the 0th element "",
2050              * but there's no harm done by doing it explicitly.
2051              *
2052              * Note that this tries the LC_ALL environment variable even on
2053              * systems which have no LC_ALL locale setting.  This may or may
2054              * not have been originally intentional, but there's no real need
2055              * to change the behavior. */
2056             if (lc_all) {
2057                 for (j = 0; j < trial_locales_count; j++) {
2058                     if (strEQ(lc_all, trial_locales[j])) {
2059                         goto done_lc_all;
2060                     }
2061                 }
2062                 trial_locales[trial_locales_count++] = lc_all;
2063             }
2064           done_lc_all:
2065
2066             if (lang) {
2067                 for (j = 0; j < trial_locales_count; j++) {
2068                     if (strEQ(lang, trial_locales[j])) {
2069                         goto done_lang;
2070                     }
2071                 }
2072                 trial_locales[trial_locales_count++] = lang;
2073             }
2074           done_lang:
2075
2076 #  if defined(WIN32) && defined(LC_ALL)
2077
2078             /* For Windows, we also try the system default locale before "C".
2079              * (If there exists a Windows without LC_ALL we skip this because
2080              * it gets too complicated.  For those, the "C" is the next
2081              * fallback possibility).  The "" is the same as the 0th element of
2082              * the array, but the code at the loop above knows to treat it
2083              * differently when not the 0th */
2084             trial_locales[trial_locales_count++] = "";
2085
2086 #  endif
2087
2088             for (j = 0; j < trial_locales_count; j++) {
2089                 if (strEQ("C", trial_locales[j])) {
2090                     goto done_C;
2091                 }
2092             }
2093             trial_locales[trial_locales_count++] = "C";
2094
2095           done_C: ;
2096         }   /* end of first time through the loop */
2097
2098 #  ifdef WIN32
2099
2100       next_iteration: ;
2101
2102 #  endif
2103
2104     }   /* end of looping through the trial locales */
2105
2106     if (ok < 1) {   /* If we tried to fallback */
2107         const char* msg;
2108         if (! setlocale_failure) {  /* fallback succeeded */
2109            msg = "Falling back to";
2110         }
2111         else {  /* fallback failed */
2112             unsigned int j;
2113
2114             /* We dropped off the end of the loop, so have to decrement i to
2115              * get back to the value the last time through */
2116             i--;
2117
2118             ok = -1;
2119             msg = "Failed to fall back to";
2120
2121             /* To continue, we should use whatever values we've got */
2122
2123             for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
2124                 Safefree(curlocales[j]);
2125                 curlocales[j] = savepv(do_setlocale_r(categories[j], NULL));
2126                 DEBUG_LOCALE_INIT(categories[j], NULL, curlocales[j]);
2127             }
2128         }
2129
2130         if (locwarn) {
2131             const char * description;
2132             const char * name = "";
2133             if (strEQ(trial_locales[i], "C")) {
2134                 description = "the standard locale";
2135                 name = "C";
2136             }
2137
2138 #  ifdef SYSTEM_DEFAULT_LOCALE
2139
2140             else if (strEQ(trial_locales[i], "")) {
2141                 description = "the system default locale";
2142                 if (system_default_locale) {
2143                     name = system_default_locale;
2144                 }
2145             }
2146
2147 #  endif /* SYSTEM_DEFAULT_LOCALE */
2148
2149             else {
2150                 description = "a fallback locale";
2151                 name = trial_locales[i];
2152             }
2153             if (name && strNE(name, "")) {
2154                 PerlIO_printf(Perl_error_log,
2155                     "perl: warning: %s %s (\"%s\").\n", msg, description, name);
2156             }
2157             else {
2158                 PerlIO_printf(Perl_error_log,
2159                                    "perl: warning: %s %s.\n", msg, description);
2160             }
2161         }
2162     } /* End of tried to fallback */
2163
2164     /* Done with finding the locales; update our records */
2165
2166 #  ifdef USE_LOCALE_CTYPE
2167
2168     new_ctype(curlocales[LC_CTYPE_INDEX]);
2169
2170 #  endif
2171 #  ifdef USE_LOCALE_COLLATE
2172
2173     new_collate(curlocales[LC_COLLATE_INDEX]);
2174
2175 #  endif
2176 #  ifdef USE_LOCALE_NUMERIC
2177
2178     new_numeric(curlocales[LC_NUMERIC_INDEX]);
2179
2180 #  endif
2181
2182
2183     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2184         Safefree(curlocales[i]);
2185     }
2186
2187 #  if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
2188
2189     /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
2190      * locale is UTF-8.  If PL_utf8locale and PL_unicode (set by -C or by
2191      * $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the
2192      * PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open
2193      * discipline.  */
2194     PL_utf8locale = _is_cur_LC_category_utf8(LC_CTYPE);
2195
2196     /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
2197        This is an alternative to using the -C command line switch
2198        (the -C if present will override this). */
2199     {
2200          const char *p = PerlEnv_getenv("PERL_UNICODE");
2201          PL_unicode = p ? parse_unicode_opts(&p) : 0;
2202          if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
2203              PL_utf8cache = -1;
2204     }
2205
2206 #  endif
2207 #  ifdef __GLIBC__
2208
2209     Safefree(language);
2210
2211 #  endif
2212
2213     Safefree(lc_all);
2214     Safefree(lang);
2215
2216 #endif /* USE_LOCALE */
2217 #ifdef DEBUGGING
2218
2219     /* So won't continue to output stuff */
2220     DEBUG_INITIALIZATION_set(FALSE);
2221
2222 #endif
2223
2224     return ok;
2225 }
2226
2227 #ifdef USE_LOCALE_COLLATE
2228
2229 char *
2230 Perl__mem_collxfrm(pTHX_ const char *input_string,
2231                          STRLEN len,    /* Length of 'input_string' */
2232                          STRLEN *xlen,  /* Set to length of returned string
2233                                            (not including the collation index
2234                                            prefix) */
2235                          bool utf8      /* Is the input in UTF-8? */
2236                    )
2237 {
2238
2239     /* _mem_collxfrm() is a bit like strxfrm() but with two important
2240      * differences. First, it handles embedded NULs. Second, it allocates a bit
2241      * more memory than needed for the transformed data itself.  The real
2242      * transformed data begins at offset COLLXFRM_HDR_LEN.  *xlen is set to
2243      * the length of that, and doesn't include the collation index size.
2244      * Please see sv_collxfrm() to see how this is used. */
2245
2246 #define COLLXFRM_HDR_LEN    sizeof(PL_collation_ix)
2247
2248     char * s = (char *) input_string;
2249     STRLEN s_strlen = strlen(input_string);
2250     char *xbuf = NULL;
2251     STRLEN xAlloc;          /* xalloc is a reserved word in VC */
2252     STRLEN length_in_chars;
2253     bool first_time = TRUE; /* Cleared after first loop iteration */
2254
2255     PERL_ARGS_ASSERT__MEM_COLLXFRM;
2256
2257     /* Must be NUL-terminated */
2258     assert(*(input_string + len) == '\0');
2259
2260     /* If this locale has defective collation, skip */
2261     if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
2262         DEBUG_L(PerlIO_printf(Perl_debug_log,
2263                       "_mem_collxfrm: locale's collation is defective\n"));
2264         goto bad;
2265     }
2266
2267     /* Replace any embedded NULs with the control that sorts before any others.
2268      * This will give as good as possible results on strings that don't
2269      * otherwise contain that character, but otherwise there may be
2270      * less-than-perfect results with that character and NUL.  This is
2271      * unavoidable unless we replace strxfrm with our own implementation. */
2272     if (UNLIKELY(s_strlen < len)) {   /* Only execute if there is an embedded
2273                                          NUL */
2274         char * e = s + len;
2275         char * sans_nuls;
2276         STRLEN sans_nuls_len;
2277         int try_non_controls;
2278         char this_replacement_char[] = "?\0";   /* Room for a two-byte string,
2279                                                    making sure 2nd byte is NUL.
2280                                                  */
2281         STRLEN this_replacement_len;
2282
2283         /* If we don't know what non-NUL control character sorts lowest for
2284          * this locale, find it */
2285         if (PL_strxfrm_NUL_replacement == '\0') {
2286             int j;
2287             char * cur_min_x = NULL;    /* The min_char's xfrm, (except it also
2288                                            includes the collation index
2289                                            prefixed. */
2290
2291             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
2292
2293             /* Unlikely, but it may be that no control will work to replace
2294              * NUL, in which case we instead look for any character.  Controls
2295              * are preferred because collation order is, in general, context
2296              * sensitive, with adjoining characters affecting the order, and
2297              * controls are less likely to have such interactions, allowing the
2298              * NUL-replacement to stand on its own.  (Another way to look at it
2299              * is to imagine what would happen if the NUL were replaced by a
2300              * combining character; it wouldn't work out all that well.) */
2301             for (try_non_controls = 0;
2302                  try_non_controls < 2;
2303                  try_non_controls++)
2304             {
2305                 /* Look through all legal code points (NUL isn't) */
2306                 for (j = 1; j < 256; j++) {
2307                     char * x;       /* j's xfrm plus collation index */
2308                     STRLEN x_len;   /* length of 'x' */
2309                     STRLEN trial_len = 1;
2310                     char cur_source[] = { '\0', '\0' };
2311
2312                     /* Skip non-controls the first time through the loop.  The
2313                      * controls in a UTF-8 locale are the L1 ones */
2314                     if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
2315                                                ? ! isCNTRL_L1(j)
2316                                                : ! isCNTRL_LC(j))
2317                     {
2318                         continue;
2319                     }
2320
2321                     /* Create a 1-char string of the current code point */
2322                     cur_source[0] = (char) j;
2323
2324                     /* Then transform it */
2325                     x = _mem_collxfrm(cur_source, trial_len, &x_len,
2326                                       0 /* The string is not in UTF-8 */);
2327
2328                     /* Ignore any character that didn't successfully transform.
2329                      * */
2330                     if (! x) {
2331                         continue;
2332                     }
2333
2334                     /* If this character's transformation is lower than
2335                      * the current lowest, this one becomes the lowest */
2336                     if (   cur_min_x == NULL
2337                         || strLT(x         + COLLXFRM_HDR_LEN,
2338                                  cur_min_x + COLLXFRM_HDR_LEN))
2339                     {
2340                         PL_strxfrm_NUL_replacement = j;
2341                         cur_min_x = x;
2342                     }
2343                     else {
2344                         Safefree(x);
2345                     }
2346                 } /* end of loop through all 255 characters */
2347
2348                 /* Stop looking if found */
2349                 if (cur_min_x) {
2350                     break;
2351                 }
2352
2353                 /* Unlikely, but possible, if there aren't any controls that
2354                  * work in the locale, repeat the loop, looking for any
2355                  * character that works */
2356                 DEBUG_L(PerlIO_printf(Perl_debug_log,
2357                 "_mem_collxfrm: No control worked.  Trying non-controls\n"));
2358             } /* End of loop to try first the controls, then any char */
2359
2360             if (! cur_min_x) {
2361                 DEBUG_L(PerlIO_printf(Perl_debug_log,
2362                     "_mem_collxfrm: Couldn't find any character to replace"
2363                     " embedded NULs in locale %s with", PL_collation_name));
2364                 goto bad;
2365             }
2366
2367             DEBUG_L(PerlIO_printf(Perl_debug_log,
2368                     "_mem_collxfrm: Replacing embedded NULs in locale %s with "
2369                     "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
2370
2371             Safefree(cur_min_x);
2372         } /* End of determining the character that is to replace NULs */
2373
2374         /* If the replacement is variant under UTF-8, it must match the
2375          * UTF8-ness of the original */
2376         if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
2377             this_replacement_char[0] =
2378                                 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
2379             this_replacement_char[1] =
2380                                 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
2381             this_replacement_len = 2;
2382         }
2383         else {
2384             this_replacement_char[0] = PL_strxfrm_NUL_replacement;
2385             /* this_replacement_char[1] = '\0' was done at initialization */
2386             this_replacement_len = 1;
2387         }
2388
2389         /* The worst case length for the replaced string would be if every
2390          * character in it is NUL.  Multiply that by the length of each
2391          * replacement, and allow for a trailing NUL */
2392         sans_nuls_len = (len * this_replacement_len) + 1;
2393         Newx(sans_nuls, sans_nuls_len, char);
2394         *sans_nuls = '\0';
2395
2396         /* Replace each NUL with the lowest collating control.  Loop until have
2397          * exhausted all the NULs */
2398         while (s + s_strlen < e) {
2399             my_strlcat(sans_nuls, s, sans_nuls_len);
2400
2401             /* Do the actual replacement */
2402             my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
2403
2404             /* Move past the input NUL */
2405             s += s_strlen + 1;
2406             s_strlen = strlen(s);
2407         }
2408
2409         /* And add anything that trails the final NUL */
2410         my_strlcat(sans_nuls, s, sans_nuls_len);
2411
2412         /* Switch so below we transform this modified string */
2413         s = sans_nuls;
2414         len = strlen(s);
2415     } /* End of replacing NULs */
2416
2417     /* Make sure the UTF8ness of the string and locale match */
2418     if (utf8 != PL_in_utf8_COLLATE_locale) {
2419         const char * const t = s;   /* Temporary so we can later find where the
2420                                        input was */
2421
2422         /* Here they don't match.  Change the string's to be what the locale is
2423          * expecting */
2424
2425         if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
2426             s = (char *) bytes_to_utf8((const U8 *) s, &len);
2427             utf8 = TRUE;
2428         }
2429         else {   /* locale is not UTF-8; but input is; downgrade the input */
2430
2431             s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
2432
2433             /* If the downgrade was successful we are done, but if the input
2434              * contains things that require UTF-8 to represent, have to do
2435              * damage control ... */
2436             if (UNLIKELY(utf8)) {
2437
2438                 /* What we do is construct a non-UTF-8 string with
2439                  *  1) the characters representable by a single byte converted
2440                  *     to be so (if necessary);
2441                  *  2) and the rest converted to collate the same as the
2442                  *     highest collating representable character.  That makes
2443                  *     them collate at the end.  This is similar to how we
2444                  *     handle embedded NULs, but we use the highest collating
2445                  *     code point instead of the smallest.  Like the NUL case,
2446                  *     this isn't perfect, but is the best we can reasonably
2447                  *     do.  Every above-255 code point will sort the same as
2448                  *     the highest-sorting 0-255 code point.  If that code
2449                  *     point can combine in a sequence with some other code
2450                  *     points for weight calculations, us changing something to
2451                  *     be it can adversely affect the results.  But in most
2452                  *     cases, it should work reasonably.  And note that this is
2453                  *     really an illegal situation: using code points above 255
2454                  *     on a locale where only 0-255 are valid.  If two strings
2455                  *     sort entirely equal, then the sort order for the
2456                  *     above-255 code points will be in code point order. */
2457
2458                 utf8 = FALSE;
2459
2460                 /* If we haven't calculated the code point with the maximum
2461                  * collating order for this locale, do so now */
2462                 if (! PL_strxfrm_max_cp) {
2463                     int j;
2464
2465                     /* The current transformed string that collates the
2466                      * highest (except it also includes the prefixed collation
2467                      * index). */
2468                     char * cur_max_x = NULL;
2469
2470                     /* Look through all legal code points (NUL isn't) */
2471                     for (j = 1; j < 256; j++) {
2472                         char * x;
2473                         STRLEN x_len;
2474                         char cur_source[] = { '\0', '\0' };
2475
2476                         /* Create a 1-char string of the current code point */
2477                         cur_source[0] = (char) j;
2478
2479                         /* Then transform it */
2480                         x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
2481
2482                         /* If something went wrong (which it shouldn't), just
2483                          * ignore this code point */
2484                         if (! x) {
2485                             continue;
2486                         }
2487
2488                         /* If this character's transformation is higher than
2489                          * the current highest, this one becomes the highest */
2490                         if (   cur_max_x == NULL
2491                             || strGT(x         + COLLXFRM_HDR_LEN,
2492                                      cur_max_x + COLLXFRM_HDR_LEN))
2493                         {
2494                             PL_strxfrm_max_cp = j;
2495                             cur_max_x = x;
2496                         }
2497                         else {
2498                             Safefree(x);
2499                         }
2500                     }
2501
2502                     if (! cur_max_x) {
2503                         DEBUG_L(PerlIO_printf(Perl_debug_log,
2504                             "_mem_collxfrm: Couldn't find any character to"
2505                             " replace above-Latin1 chars in locale %s with",
2506                             PL_collation_name));
2507                         goto bad;
2508                     }
2509
2510                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2511                             "_mem_collxfrm: highest 1-byte collating character"
2512                             " in locale %s is 0x%02X\n",
2513                             PL_collation_name,
2514                             PL_strxfrm_max_cp));
2515
2516                     Safefree(cur_max_x);
2517                 }
2518
2519                 /* Here we know which legal code point collates the highest.
2520                  * We are ready to construct the non-UTF-8 string.  The length
2521                  * will be at least 1 byte smaller than the input string
2522                  * (because we changed at least one 2-byte character into a
2523                  * single byte), but that is eaten up by the trailing NUL */
2524                 Newx(s, len, char);
2525
2526                 {
2527                     STRLEN i;
2528                     STRLEN d= 0;
2529                     char * e = (char *) t + len;
2530
2531                     for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
2532                         U8 cur_char = t[i];
2533                         if (UTF8_IS_INVARIANT(cur_char)) {
2534                             s[d++] = cur_char;
2535                         }
2536                         else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
2537                             s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
2538                         }
2539                         else {  /* Replace illegal cp with highest collating
2540                                    one */
2541                             s[d++] = PL_strxfrm_max_cp;
2542                         }
2543                     }
2544                     s[d++] = '\0';
2545                     Renew(s, d, char);   /* Free up unused space */
2546                 }
2547             }
2548         }
2549
2550         /* Here, we have constructed a modified version of the input.  It could
2551          * be that we already had a modified copy before we did this version.
2552          * If so, that copy is no longer needed */
2553         if (t != input_string) {
2554             Safefree(t);
2555         }
2556     }
2557
2558     length_in_chars = (utf8)
2559                       ? utf8_length((U8 *) s, (U8 *) s + len)
2560                       : len;
2561
2562     /* The first element in the output is the collation id, used by
2563      * sv_collxfrm(); then comes the space for the transformed string.  The
2564      * equation should give us a good estimate as to how much is needed */
2565     xAlloc = COLLXFRM_HDR_LEN
2566            + PL_collxfrm_base
2567            + (PL_collxfrm_mult * length_in_chars);
2568     Newx(xbuf, xAlloc, char);
2569     if (UNLIKELY(! xbuf)) {
2570         DEBUG_L(PerlIO_printf(Perl_debug_log,
2571                       "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
2572         goto bad;
2573     }
2574
2575     /* Store the collation id */
2576     *(U32*)xbuf = PL_collation_ix;
2577
2578     /* Then the transformation of the input.  We loop until successful, or we
2579      * give up */
2580     for (;;) {
2581
2582         *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
2583
2584         /* If the transformed string occupies less space than we told strxfrm()
2585          * was available, it means it successfully transformed the whole
2586          * string. */
2587         if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
2588
2589             /* Some systems include a trailing NUL in the returned length.
2590              * Ignore it, using a loop in case multiple trailing NULs are
2591              * returned. */
2592             while (   (*xlen) > 0
2593                    && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
2594             {
2595                 (*xlen)--;
2596             }
2597
2598             /* If the first try didn't get it, it means our prediction was low.
2599              * Modify the coefficients so that we predict a larger value in any
2600              * future transformations */
2601             if (! first_time) {
2602                 STRLEN needed = *xlen + 1;   /* +1 For trailing NUL */
2603                 STRLEN computed_guess = PL_collxfrm_base
2604                                       + (PL_collxfrm_mult * length_in_chars);
2605
2606                 /* On zero-length input, just keep current slope instead of
2607                  * dividing by 0 */
2608                 const STRLEN new_m = (length_in_chars != 0)
2609                                      ? needed / length_in_chars
2610                                      : PL_collxfrm_mult;
2611
2612                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2613                     "%s: %d: initial size of %zu bytes for a length "
2614                     "%zu string was insufficient, %zu needed\n",
2615                     __FILE__, __LINE__,
2616                     computed_guess, length_in_chars, needed));
2617
2618                 /* If slope increased, use it, but discard this result for
2619                  * length 1 strings, as we can't be sure that it's a real slope
2620                  * change */
2621                 if (length_in_chars > 1 && new_m  > PL_collxfrm_mult) {
2622
2623 #  ifdef DEBUGGING
2624
2625                     STRLEN old_m = PL_collxfrm_mult;
2626                     STRLEN old_b = PL_collxfrm_base;
2627
2628 #  endif
2629
2630                     PL_collxfrm_mult = new_m;
2631                     PL_collxfrm_base = 1;   /* +1 For trailing NUL */
2632                     computed_guess = PL_collxfrm_base
2633                                     + (PL_collxfrm_mult * length_in_chars);
2634                     if (computed_guess < needed) {
2635                         PL_collxfrm_base += needed - computed_guess;
2636                     }
2637
2638                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2639                         "%s: %d: slope is now %zu; was %zu, base "
2640                         "is now %zu; was %zu\n",
2641                         __FILE__, __LINE__,
2642                         PL_collxfrm_mult, old_m,
2643                         PL_collxfrm_base, old_b));
2644                 }
2645                 else {  /* Slope didn't change, but 'b' did */
2646                     const STRLEN new_b = needed
2647                                         - computed_guess
2648                                         + PL_collxfrm_base;
2649                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2650                         "%s: %d: base is now %zu; was %zu\n",
2651                         __FILE__, __LINE__,
2652                         new_b, PL_collxfrm_base));
2653                     PL_collxfrm_base = new_b;
2654                 }
2655             }
2656
2657             break;
2658         }
2659
2660         if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
2661             DEBUG_L(PerlIO_printf(Perl_debug_log,
2662                   "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
2663                   *xlen, PERL_INT_MAX));
2664             goto bad;
2665         }
2666
2667         /* A well-behaved strxfrm() returns exactly how much space it needs
2668          * (usually not including the trailing NUL) when it fails due to not
2669          * enough space being provided.  Assume that this is the case unless
2670          * it's been proven otherwise */
2671         if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
2672             xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
2673         }
2674         else { /* Here, either:
2675                 *  1)  The strxfrm() has previously shown bad behavior; or
2676                 *  2)  It isn't the first time through the loop, which means
2677                 *      that the strxfrm() is now showing bad behavior, because
2678                 *      we gave it what it said was needed in the previous
2679                 *      iteration, and it came back saying it needed still more.
2680                 *      (Many versions of cygwin fit this.  When the buffer size
2681                 *      isn't sufficient, they return the input size instead of
2682                 *      how much is needed.)
2683                 * Increase the buffer size by a fixed percentage and try again.
2684                 * */
2685             xAlloc += (xAlloc / 4) + 1;
2686             PL_strxfrm_is_behaved = FALSE;
2687
2688 #  ifdef DEBUGGING
2689
2690             if (DEBUG_Lv_TEST || debug_initialization) {
2691                 PerlIO_printf(Perl_debug_log,
2692                 "_mem_collxfrm required more space than previously calculated"
2693                 " for locale %s, trying again with new guess=%d+%zu\n",
2694                 PL_collation_name, (int) COLLXFRM_HDR_LEN,
2695                 xAlloc - COLLXFRM_HDR_LEN);
2696             }
2697
2698 #  endif
2699
2700         }
2701
2702         Renew(xbuf, xAlloc, char);
2703         if (UNLIKELY(! xbuf)) {
2704             DEBUG_L(PerlIO_printf(Perl_debug_log,
2705                       "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
2706             goto bad;
2707         }
2708
2709         first_time = FALSE;
2710     }
2711
2712
2713 #  ifdef DEBUGGING
2714
2715     if (DEBUG_Lv_TEST || debug_initialization) {
2716
2717         print_collxfrm_input_and_return(s, s + len, xlen, utf8);
2718         PerlIO_printf(Perl_debug_log, "Its xfrm is:");
2719         PerlIO_printf(Perl_debug_log, "%s\n",
2720                       _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
2721                        *xlen, 1));
2722     }
2723
2724 #  endif
2725
2726     /* Free up unneeded space; retain enough for trailing NUL */
2727     Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
2728
2729     if (s != input_string) {
2730         Safefree(s);
2731     }
2732
2733     return xbuf;
2734
2735   bad:
2736     Safefree(xbuf);
2737     if (s != input_string) {
2738         Safefree(s);
2739     }
2740     *xlen = 0;
2741
2742 #  ifdef DEBUGGING
2743
2744     if (DEBUG_Lv_TEST || debug_initialization) {
2745         print_collxfrm_input_and_return(s, s + len, NULL, utf8);
2746     }
2747
2748 #  endif
2749
2750     return NULL;
2751 }
2752
2753 #  ifdef DEBUGGING
2754
2755 STATIC void
2756 S_print_collxfrm_input_and_return(pTHX_
2757                                   const char * const s,
2758                                   const char * const e,
2759                                   const STRLEN * const xlen,
2760                                   const bool is_utf8)
2761 {
2762     /* Debugging aid: print to Perl_debug_log the collation index, *xlen (or
2763      * "NULL" if 'xlen' is NULL), the collation locale name and bytes [s, e) */
2764     PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
2765
2766     PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
2767                   (UV)PL_collation_ix);
2768     if (! xlen) {
2769         PerlIO_printf(Perl_debug_log, "NULL");
2770     }
2771     else {
2772         PerlIO_printf(Perl_debug_log, "%zu", *xlen);
2773     }
2774     PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
2775                   PL_collation_name);
2776     print_bytes_for_locale(s, e, is_utf8);
2777     PerlIO_printf(Perl_debug_log, "'\n");
2778 }
2779
2780 STATIC void
2781 S_print_bytes_for_locale(pTHX_
2782                     const char * const s,
2783                     const char * const e,
2784                     const bool is_utf8)
2785 {
2786     /* Debugging aid: dump the characters in [s, e) to Perl_debug_log.  Ones
2787      * passing isPRINT() are shown as-is; the rest in hex, set off by blanks */
2788     const char * cur;
2789     bool last_was_literal = TRUE;
2790     bool at_start = TRUE;
2791
2792     PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
2793
2794     for (cur = s; cur < e; cur += (is_utf8) ? UTF8SKIP(cur) : 1) {
2795         const UV cp = (! is_utf8) ? * (U8 *) cur
2796                                   : utf8_to_uvchr_buf((U8 *) cur, e, NULL);
2797         if (! isPRINT(cp)) {
2798             if (! at_start) {   /* Separate hex from whatever came before */
2799                 PerlIO_printf(Perl_debug_log, " ");
2800             }
2801             PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
2802             last_was_literal = FALSE;
2803         }
2804         else {
2805             if (! last_was_literal) {   /* Separate from preceding hex */
2806                 PerlIO_printf(Perl_debug_log, " ");
2807             }
2808             PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
2809             last_was_literal = TRUE;
2810         }
2811         at_start = FALSE;
2812     }
2813 }
2814
2815 #  endif   /* #ifdef DEBUGGING */
2816 #endif /* USE_LOCALE_COLLATE */
2817
2818 #ifdef USE_LOCALE
2819
2820 bool
2821 Perl__is_cur_LC_category_utf8(pTHX_ int category)
2822 {
2823     /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
2824      * otherwise. 'category' may not be LC_ALL.  If the platform doesn't have
2825      * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
2826      * could give the wrong result.  The result will very likely be correct for
2827      * languages that have commonly used non-ASCII characters, but for notably
2828      * English, it comes down to if the locale's name ends in something like
2829      * "UTF-8".  It errs on the side of not being a UTF-8 locale. */
2830
2831     char *save_input_locale = NULL;
2832     STRLEN final_pos;
2833
2834 #  ifdef LC_ALL
2835
2836     assert(category != LC_ALL);
2837
2838 #  endif
2839
2840     /* First dispose of the trivial cases */
2841     save_input_locale = do_setlocale_r(category, NULL);
2842     if (! save_input_locale) {
2843         DEBUG_L(PerlIO_printf(Perl_debug_log,
2844                               "Could not find current locale for category %d\n",
2845                               category));
2846         return FALSE;   /* XXX maybe should croak */
2847     }
2848     save_input_locale = stdize_locale(savepv(save_input_locale));
2849     if (isNAME_C_OR_POSIX(save_input_locale)) {
2850         DEBUG_L(PerlIO_printf(Perl_debug_log,
2851                               "Current locale for category %d is %s\n",
2852                               category, save_input_locale));
2853         Safefree(save_input_locale);
2854         return FALSE;
2855     }
2856
2857 #  if defined(USE_LOCALE_CTYPE)    \
2858     && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET)))
2859
2860     { /* Next try nl_langinfo or MB_CUR_MAX if available */
2861
2862         char *save_ctype_locale = NULL;
2863         bool is_utf8;
2864
2865         if (category != LC_CTYPE) { /* These work only on LC_CTYPE */
2866
2867             /* Get the current LC_CTYPE locale */
2868             save_ctype_locale = do_setlocale_c(LC_CTYPE, NULL);
2869             if (! save_ctype_locale) {
2870                 DEBUG_L(PerlIO_printf(Perl_debug_log,
2871                                "Could not find current locale for LC_CTYPE\n"));
2872                 goto cant_use_nllanginfo;
2873             }
2874             save_ctype_locale = stdize_locale(savepv(save_ctype_locale));
2875
2876             /* If LC_CTYPE and the desired category use the same locale, this
2877              * means that finding the value for LC_CTYPE is the same as finding
2878              * the value for the desired category.  Otherwise, switch LC_CTYPE
2879              * to the desired category's locale */
2880             if (strEQ(save_ctype_locale, save_input_locale)) {
2881                 Safefree(save_ctype_locale);
2882                 save_ctype_locale = NULL;
2883             }
2884             else if (! do_setlocale_c(LC_CTYPE, save_input_locale)) {
2885                 DEBUG_L(PerlIO_printf(Perl_debug_log,
2886                                     "Could not change LC_CTYPE locale to %s\n",
2887                                     save_input_locale));
2888                 Safefree(save_ctype_locale);
2889                 goto cant_use_nllanginfo;
2890             }
2891         }
2892
2893         DEBUG_L(PerlIO_printf(Perl_debug_log, "Current LC_CTYPE locale=%s\n",
2894                                               save_input_locale));
2895
2896         /* Here the current LC_CTYPE is set to the locale of the category whose
2897          * information is desired.  This means that nl_langinfo() and MB_CUR_MAX
2898          * should give the correct results */
2899
2900 #    if defined(HAS_NL_LANGINFO) && defined(CODESET)
2901      /* The task is easiest if has this POSIX 2001 function */
2902
2903         {
2904             const char *codeset = my_nl_langinfo(PERL_CODESET, FALSE);
2905                                           /* FALSE => already in dest locale */
2906
2907             DEBUG_L(PerlIO_printf(Perl_debug_log,
2908                             "\tnllanginfo returned CODESET '%s'\n", codeset));
2909
2910             if (codeset && strNE(codeset, "")) {
2911                 /* If we switched LC_CTYPE, switch back */
2912                 if (save_ctype_locale) {
2913                     do_setlocale_c(LC_CTYPE, save_ctype_locale);
2914                     Safefree(save_ctype_locale);
2915                 }
2916
2917                 is_utf8 = (   (   strlen(codeset) == STRLENs("UTF-8")
2918                                && foldEQ(codeset, STR_WITH_LEN("UTF-8")))
2919                            || (   strlen(codeset) == STRLENs("UTF8")
2920                                && foldEQ(codeset, STR_WITH_LEN("UTF8"))));
2921
2922                 DEBUG_L(PerlIO_printf(Perl_debug_log,
2923                        "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
2924                                                      codeset,         is_utf8));
2925                 Safefree(save_input_locale);
2926                 return is_utf8;
2927             }
2928         }
2929
2930 #    endif
2931 #    ifdef MB_CUR_MAX
2932
2933         /* Here, either we don't have nl_langinfo, or it didn't return a
2934          * codeset.  Try MB_CUR_MAX */
2935
2936         /* Standard UTF-8 needs at least 4 bytes to represent the maximum
2937          * Unicode code point.  Since UTF-8 is the only non-single byte
2938          * encoding we handle, we just say any such encoding is UTF-8, and if
2939          * turns out to be wrong, other things will fail */
2940         is_utf8 = (unsigned) MB_CUR_MAX >= STRLENs(MAX_UNICODE_UTF8);
2941
2942         DEBUG_L(PerlIO_printf(Perl_debug_log,
2943                               "\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n",
2944                                    (int) MB_CUR_MAX,      is_utf8));
2945
2946         Safefree(save_input_locale);
2947
2948 #      ifdef HAS_MBTOWC
2949
2950         /* ... But, most systems that have MB_CUR_MAX will also have mbtowc(),
2951          * since they are both in the C99 standard.  We can feed a known byte
2952          * string to the latter function, and check that it gives the expected
2953          * result */
2954         if (is_utf8) {
2955             wchar_t wc;
2956             int len;
2957
2958             PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
2959             errno = 0;
2960             len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
2961
2962
2963             if (   len != STRLENs(REPLACEMENT_CHARACTER_UTF8)
2964                 || wc != (wchar_t) UNICODE_REPLACEMENT)
2965             {
2966                 is_utf8 = FALSE;
2967                 DEBUG_L(PerlIO_printf(Perl_debug_log, "\replacement=U+%x\n",
2968                                                             (unsigned int)wc));
2969                 DEBUG_L(PerlIO_printf(Perl_debug_log,
2970                         "\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n",
2971                                                len,      errno));
2972             }
2973         }
2974
2975 #      endif
2976
2977         /* If we switched LC_CTYPE, switch back */
2978         if (save_ctype_locale) {
2979             do_setlocale_c(LC_CTYPE, save_ctype_locale);
2980             Safefree(save_ctype_locale);
2981         }
2982
2983         return is_utf8;
2984
2985 #    endif
2986
2987     }
2988
2989   cant_use_nllanginfo:
2990
2991 #  else   /* nl_langinfo should work if available, so don't bother compiling this
2992            fallback code.  The final fallback of looking at the name is
2993            compiled, and will be executed if nl_langinfo fails */
2994
2995     /* nl_langinfo not available or failed somehow.  Next try looking at the
2996      * currency symbol to see if it disambiguates things.  Often that will be
2997      * in the native script, and if the symbol isn't in UTF-8, we know that the
2998      * locale isn't.  If it is non-ASCII UTF-8, we infer that the locale is
2999      * too, as the odds of a non-UTF8 string being valid UTF-8 are quite small
3000      * */
3001
3002 #    ifdef HAS_LOCALECONV
3003 #      ifdef USE_LOCALE_MONETARY
3004
3005     {
3006         char *save_monetary_locale = NULL;
3007         bool only_ascii = FALSE;
3008         bool is_utf8 = FALSE;
3009         struct lconv* lc;
3010
3011         /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of
3012          * the desired category, if it isn't that locale already */
3013
3014         if (category != LC_MONETARY) {
3015
3016             save_monetary_locale = do_setlocale_c(LC_MONETARY, NULL);
3017             if (! save_monetary_locale) {
3018                 DEBUG_L(PerlIO_printf(Perl_debug_log,
3019                             "Could not find current locale for LC_MONETARY\n"));
3020                 goto cant_use_monetary;
3021             }
3022             save_monetary_locale = stdize_locale(savepv(save_monetary_locale));
3023
3024             if (strEQ(save_monetary_locale, save_input_locale)) {
3025                 Safefree(save_monetary_locale);
3026                 save_monetary_locale = NULL;
3027             }
3028             else if (! do_setlocale_c(LC_MONETARY, save_input_locale)) {
3029                 DEBUG_L(PerlIO_printf(Perl_debug_log,
3030                             "Could not change LC_MONETARY locale to %s\n",
3031                                                         save_input_locale));
3032                 Safefree(save_monetary_locale);
3033                 goto cant_use_monetary;
3034             }
3035         }
3036
3037         /* Here the current LC_MONETARY is set to the locale of the category
3038          * whose information is desired. */
3039
3040         lc = localeconv();
3041         if (! lc
3042             || ! lc->currency_symbol
3043             || is_utf8_invariant_string((U8 *) lc->currency_symbol, 0))
3044         {
3045             DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3046             only_ascii = TRUE;
3047         }
3048         else {
3049             is_utf8 = is_utf8_string((U8 *) lc->currency_symbol, 0);
3050         }
3051
3052         /* If we changed it, restore LC_MONETARY to its original locale */
3053         if (save_monetary_locale) {
3054             do_setlocale_c(LC_MONETARY, save_monetary_locale);
3055             Safefree(save_monetary_locale);
3056         }
3057
3058         if (! only_ascii) {
3059
3060             /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
3061              * otherwise assume the locale is UTF-8 if and only if the symbol
3062              * is non-ascii UTF-8. */
3063             DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
3064                                     save_input_locale, is_utf8));
3065             Safefree(save_input_locale);
3066             return is_utf8;
3067         }
3068     }
3069   cant_use_monetary:
3070
3071 #      endif /* USE_LOCALE_MONETARY */
3072 #    endif /* HAS_LOCALECONV */
3073
3074 #    if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
3075
3076 /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not.  Try
3077  * the names of the months and weekdays, timezone, and am/pm indicator */
3078     {
3079         char *save_time_locale = NULL;
3080         int hour = 10;
3081         bool is_dst = FALSE;
3082         int dom = 1;
3083         int month = 0;
3084         int i;
3085         char * formatted_time;
3086
3087
3088         /* Like above for LC_MONETARY, we set LC_TIME to the locale of the
3089          * desired category, if it isn't that locale already */
3090
3091         if (category != LC_TIME) {
3092
3093             save_time_locale = do_setlocale_c(LC_TIME, NULL);
3094             if (! save_time_locale) {
3095                 DEBUG_L(PerlIO_printf(Perl_debug_log,
3096                             "Could not find current locale for LC_TIME\n"));
3097                 goto cant_use_time;
3098             }
3099             save_time_locale = stdize_locale(savepv(save_time_locale));
3100
3101             if (strEQ(save_time_locale, save_input_locale)) {
3102                 Safefree(save_time_locale);
3103                 save_time_locale = NULL;
3104             }
3105             else if (! do_setlocale_c(LC_TIME, save_input_locale)) {
3106                 DEBUG_L(PerlIO_printf(Perl_debug_log,
3107                             "Could not change LC_TIME locale to %s\n",
3108                                                         save_input_locale));
3109                 Safefree(save_time_locale);
3110                 goto cant_use_time;
3111             }
3112         }
3113
3114         /* Here the current LC_TIME is set to the locale of the category
3115          * whose information is desired.  Look at all the days of the week and
3116          * month names, and the timezone and am/pm indicator for UTF-8 variant
3117          * characters.  The first such a one found will tell us if the locale
3118          * is UTF-8 or not */
3119
3120         for (i = 0; i < 7 + 12; i++) {  /* 7 days; 12 months */
3121             formatted_time = my_strftime("%A %B %Z %p",
3122                             0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
3123             if ( ! formatted_time
3124                 || is_utf8_invariant_string((U8 *) formatted_time, 0))
3125             {
3126
3127                 /* Here, we didn't find a non-ASCII.  Try the next time through
3128                  * with the complemented dst and am/pm, and try with the next
3129                  * weekday.  After we have gotten all weekdays, try the next
3130                  * month */
3131                 is_dst = ! is_dst;
3132                 hour = (hour + 12) % 24;
3133                 dom++;
3134                 if (i > 6) {
3135                     month++;
3136                 }
3137                 continue;
3138             }
3139
3140             /* Here, we have a non-ASCII.  Return TRUE if it is valid UTF8;
3141              * false otherwise.  But first, restore LC_TIME to its original
3142              * locale if we changed it */
3143             if (save_time_locale) {
3144                 do_setlocale_c(LC_TIME, save_time_locale);
3145                 Safefree(save_time_locale);
3146             }
3147
3148             DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
3149                                 save_input_locale,
3150                                 is_utf8_string((U8 *) formatted_time, 0)));
3151             Safefree(save_input_locale);
3152             return is_utf8_string((U8 *) formatted_time, 0);
3153         }
3154
3155         /* Falling off the end of the loop indicates all the names were just
3156          * ASCII.  Go on to the next test.  If we changed it, restore LC_TIME
3157          * to its original locale */
3158         if (save_time_locale) {
3159             do_setlocale_c(LC_TIME, save_time_locale);
3160             Safefree(save_time_locale);
3161         }
3162         DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3163     }
3164   cant_use_time:
3165
3166 #    endif
3167
3168 #    if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
3169
3170 /* This code is ifdefd out because it was found to not be necessary in testing
3171  * on our dromedary test machine, which has over 700 locales.  There, this
3172  * added no value to looking at the currency symbol and the time strings.  I
3173  * left it in so as to avoid rewriting it if real-world experience indicates
3174  * that dromedary is an outlier.  Essentially, instead of returning above if we
3175  * haven't found illegal utf8, we continue on and examine all the strerror()
3176  * messages on the platform for utf8ness.  If all are ASCII, we still don't
3177  * know the answer; but otherwise we have a pretty good indication of the
3178  * utf8ness.  The reason this doesn't help much is that the messages may not
3179  * have been translated into the locale.  The currency symbol and time strings
3180  * are much more likely to have been translated.  */
3181     {
3182         int e;
3183         bool is_utf8 = FALSE;
3184         bool non_ascii = FALSE;
3185         char *save_messages_locale = NULL;
3186         const char * errmsg = NULL;
3187
3188         /* Like above, we set LC_MESSAGES to the locale of the desired
3189          * category, if it isn't that locale already */
3190
3191         if (category != LC_MESSAGES) {
3192
3193             save_messages_locale = do_setlocale_c(LC_MESSAGES, NULL);
3194             if (! save_messages_locale) {
3195                 DEBUG_L(PerlIO_printf(Perl_debug_log,
3196                             "Could not find current locale for LC_MESSAGES\n"));
3197                 goto cant_use_messages;
3198             }
3199             save_messages_locale = stdize_locale(savepv(save_messages_locale));
3200
3201             if (strEQ(save_messages_locale, save_input_locale)) {
3202                 Safefree(save_messages_locale);
3203                 save_messages_locale = NULL;
3204             }
3205             else if (! do_setlocale_c(LC_MESSAGES, save_input_locale)) {
3206                 DEBUG_L(PerlIO_printf(Perl_debug_log,
3207                             "Could not change LC_MESSAGES locale to %s\n",
3208                                                         save_input_locale));
3209                 Safefree(save_messages_locale);
3210                 goto cant_use_messages;
3211             }
3212         }
3213
3214         /* Here the current LC_MESSAGES is set to the locale of the category
3215          * whose information is desired.  Look through all the messages.  We
3216          * can't use Strerror() here because it may expand to code that
3217          * segfaults in miniperl */
3218
3219         for (e = 0; e <= sys_nerr; e++) {
3220             errno = 0;
3221             errmsg = sys_errlist[e];
3222             if (errno || !errmsg) {
3223                 break;
3224             }
3225             errmsg = savepv(errmsg);
3226             if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
3227                 non_ascii = TRUE;
3228                 is_utf8 = is_utf8_string((U8 *) errmsg, 0);
3229                 break;
3230             }
3231         }
3232         Safefree(errmsg);
3233
3234         /* And, if we changed it, restore LC_MESSAGES to its original locale */
3235         if (save_messages_locale) {
3236             do_setlocale_c(LC_MESSAGES, save_messages_locale);
3237             Safefree(save_messages_locale);
3238         }
3239
3240         if (non_ascii) {
3241
3242             /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
3243              * any non-ascii means it is one; otherwise we assume it isn't */
3244             DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
3245                                 save_input_locale,
3246                                 is_utf8));
3247             Safefree(save_input_locale);
3248             return is_utf8;
3249         }
3250
3251         DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
3252     }
3253   cant_use_messages:
3254
3255 #    endif
3256 #  endif /* the code that is compiled when no nl_langinfo */
3257
3258 #  ifndef EBCDIC  /* On os390, even if the name ends with "UTF-8', it isn't a
3259                    UTF-8 locale */
3260
3261     /* As a last resort, look at the locale name to see if it matches
3262      * qr/UTF -?  * 8 /ix, or some other common locale names.  This "name", the
3263      * return of setlocale(), is actually defined to be opaque, so we can't
3264      * really rely on the absence of various substrings in the name to indicate
3265      * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
3266      * be a UTF-8 locale.  Similarly for the other common names */
3267
3268     final_pos = strlen(save_input_locale) - 1;
3269     if (final_pos >= 3) {
3270         char *name = save_input_locale;
3271
3272         /* Find next 'U' or 'u' and look from there */
3273         while ((name += strcspn(name, "Uu") + 1)
3274                                             <= save_input_locale + final_pos - 2)
3275         {
3276             if (   isALPHA_FOLD_NE(*name, 't')
3277                 || isALPHA_FOLD_NE(*(name + 1), 'f'))
3278             {
3279                 continue;
3280             }
3281             name += 2;
3282             if (*(name) == '-') {
3283                 if ((name > save_input_locale + final_pos - 1)) {
3284                     break;
3285                 }
3286                 name++;
3287             }
3288             if (*(name) == '8') {
3289                 DEBUG_L(PerlIO_printf(Perl_debug_log,
3290                                       "Locale %s ends with UTF-8 in name\n",
3291                                       save_input_locale));
3292                 Safefree(save_input_locale);
3293                 return TRUE;
3294             }
3295         }
3296         DEBUG_L(PerlIO_printf(Perl_debug_log,
3297                               "Locale %s doesn't end with UTF-8 in name\n",
3298                                 save_input_locale));
3299     }
3300
3301 #  endif
3302 #  ifdef WIN32
3303
3304     /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
3305     if (memENDs(save_input_locale, final_pos, "65001")) {
3306         DEBUG_L(PerlIO_printf(Perl_debug_log,
3307                         "Locale %s ends with 65001 in name, is UTF-8 locale\n",
3308                         save_input_locale));
3309         Safefree(save_input_locale);
3310         return TRUE;
3311     }
3312
3313 #  endif
3314
3315     /* Other common encodings are the ISO 8859 series, which aren't UTF-8.  But
3316      * since we are about to return FALSE anyway, there is no point in doing
3317      * this extra work */
3318
3319 #  if 0
3320     if (instr(save_input_locale, "8859")) {
3321         DEBUG_L(PerlIO_printf(Perl_debug_log,
3322                              "Locale %s has 8859 in name, not UTF-8 locale\n",
3323                              save_input_locale));
3324         Safefree(save_input_locale);
3325         return FALSE;
3326     }
3327 #  endif
3328
3329     DEBUG_L(PerlIO_printf(Perl_debug_log,
3330                           "Assuming locale %s is not a UTF-8 locale\n",
3331                                     save_input_locale));
3332     Safefree(save_input_locale);
3333     return FALSE;
3334 }
3335
3336 #endif
3337
3338
3339 bool
3340 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
3341 {
3342     dVAR;
3343     /* Internal function which returns if we are in the scope of a pragma that
3344      * enables the locale category 'category'.  'compiling' should indicate if
3345      * this is during the compilation phase (TRUE) or not (FALSE). */
3346
3347     const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
3348
3349     SV *categories = cop_hints_fetch_pvs(cop, "locale", 0);
3350     if (! categories || categories == &PL_sv_placeholder) {
3351         return FALSE;
3352     }
3353
3354     /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
3355      * a valid unsigned */
3356     assert(category >= -1);
3357     return cBOOL(SvUV(categories) & (1U << (category + 1)));
3358 }
3359
3360 char *
3361 Perl_my_strerror(pTHX_ const int errnum)
3362 {
3363     /* Returns a mortalized copy of the text of the error message associated
3364      * with 'errnum'.  It uses the current locale's text unless the platform
3365      * doesn't have the LC_MESSAGES category or we are not being called from
3366      * within the scope of 'use locale'.  In the former case, it uses whatever
3367      * strerror returns; in the latter case it uses the text from the C locale.
3368      *
3369      * The function just calls strerror(), but temporarily switches, if needed,
3370      * to the C locale */
3371
3372     char *errstr;
3373     dVAR;
3374
3375 #ifndef USE_LOCALE_MESSAGES
3376
3377     /* If platform doesn't have messages category, we don't do any switching to
3378      * the C locale; we just use whatever strerror() returns */
3379
3380     errstr = savepv(Strerror(errnum));
3381
3382 #else   /* Has locale messages */
3383
3384     const bool within_locale_scope = IN_LC(LC_MESSAGES);
3385
3386 #  if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
3387
3388     /* This function is trivial if we don't have to worry about thread safety
3389      * and have strerror_l(), as it handles the switch of locales so we don't
3390      * have to deal with that.  We don't have to worry about thread safety if
3391      * this is an unthreaded build, or if strerror_r() is also available.  Both
3392      * it and strerror_l() are thread-safe.  Plain strerror() isn't thread
3393      * safe.  But on threaded builds when strerror_r() is available, the
3394      * apparent call to strerror() below is actually a macro that
3395      * behind-the-scenes calls strerror_r().
3396      */
3397
3398 #    if ! defined(USE_ITHREADS) || defined(HAS_STRERROR_R)
3399
3400     if (within_locale_scope) {
3401         errstr = savepv(strerror(errnum));
3402     }
3403     else {
3404         errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
3405     }
3406
3407 #    else
3408
3409     /* Here we have strerror_l(), but not strerror_r() and we are on a
3410      * threaded-build.  We use strerror_l() for everything, constructing a
3411      * locale to pass to it if necessary */
3412
3413     bool do_free = FALSE;
3414     locale_t locale_to_use;
3415
3416     if (within_locale_scope) {
3417         locale_to_use = uselocale((locale_t) 0);
3418         if (locale_to_use == LC_GLOBAL_LOCALE) {
3419             locale_to_use = duplocale(LC_GLOBAL_LOCALE);
3420             do_free = TRUE;
3421         }
3422     }
3423     else {  /* Use C locale if not within 'use locale' scope */
3424         locale_to_use = PL_C_locale_obj;
3425     }
3426
3427     errstr = savepv(strerror_l(errnum, locale_to_use));
3428
3429     if (do_free) {
3430         freelocale(locale_to_use);
3431     }
3432
3433 #    endif
3434 #  else /* Doesn't have strerror_l() */
3435
3436 #    ifdef USE_POSIX_2008_LOCALE
3437
3438     locale_t save_locale = NULL;
3439
3440 #    else
3441
3442     char * save_locale = NULL;
3443     bool locale_is_C = FALSE;
3444
3445     /* We have a critical section to prevent another thread from changing the
3446      * locale out from under us (or zapping the buffer returned from
3447      * setlocale() ) */
3448     LOCALE_LOCK;
3449
3450 #    endif
3451
3452     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3453                             "my_strerror called with errnum %d\n", errnum));
3454     if (! within_locale_scope) {
3455         errno = 0;
3456
3457 #  ifdef USE_POSIX_2008_LOCALE /* Use the thread-safe locale functions */
3458
3459         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3460                                     "Not within locale scope, about to call"
3461                                     " uselocale(0x%p)\n", PL_C_locale_obj));
3462         save_locale = uselocale(PL_C_locale_obj);
3463         if (! save_locale) {
3464             DEBUG_L(PerlIO_printf(Perl_debug_log,
3465                                     "uselocale failed, errno=%d\n", errno));
3466         }
3467         else {
3468             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3469                                     "uselocale returned 0x%p\n", save_locale));
3470         }
3471
3472 #    else    /* Not thread-safe build */
3473
3474         save_locale = do_setlocale_c(LC_MESSAGES, NULL);
3475         if (! save_locale) {
3476             DEBUG_L(PerlIO_printf(Perl_debug_log,
3477                                   "setlocale failed, errno=%d\n", errno));
3478         }
3479         else {
3480             locale_is_C = isNAME_C_OR_POSIX(save_locale);
3481
3482             /* Switch to the C locale if not already in it */
3483             if (! locale_is_C) {
3484
3485                 /* The setlocale() just below likely will zap 'save_locale', so
3486                  * create a copy.  */
3487                 save_locale = savepv(save_locale);
3488                 do_setlocale_c(LC_MESSAGES, "C");
3489             }
3490         }
3491
3492 #    endif
3493
3494     }   /* end of ! within_locale_scope */
3495     else {
3496         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n",
3497                                                __FILE__, __LINE__));
3498     }
3499
3500     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3501              "Any locale change has been done; about to call Strerror\n"));
3502     errstr = savepv(Strerror(errnum));
3503
3504     if (! within_locale_scope) {
3505         errno = 0;
3506
3507 #  ifdef USE_POSIX_2008_LOCALE
3508
3509         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3510                     "%s: %d: not within locale scope, restoring the locale\n",
3511                     __FILE__, __LINE__));
3512         if (save_locale && ! uselocale(save_locale)) {
3513             DEBUG_L(PerlIO_printf(Perl_debug_log,
3514                           "uselocale restore failed, errno=%d\n", errno));
3515         }
3516     }
3517
3518 #    else
3519
3520         if (save_locale && ! locale_is_C) {
3521             if (! do_setlocale_c(LC_MESSAGES, save_locale)) {
3522                 DEBUG_L(PerlIO_printf(Perl_debug_log,
3523                       "setlocale restore failed, errno=%d\n", errno));
3524             }
3525             Safefree(save_locale);
3526         }
3527     }
3528
3529     LOCALE_UNLOCK;
3530
3531 #    endif
3532 #  endif /* End of doesn't have strerror_l */
3533 #endif   /* End of does have locale messages */
3534
3535 #ifdef DEBUGGING
3536
3537     if (DEBUG_Lv_TEST) {
3538         PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '");
3539         print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);
3540         PerlIO_printf(Perl_debug_log, "'\n");
3541     }
3542
3543 #endif
3544
3545     SAVEFREEPV(errstr);
3546     return errstr;
3547 }
3548
3549 /*
3550
3551 =for apidoc sync_locale
3552
3553 Changing the program's locale should be avoided by XS code.  Nevertheless,
3554 certain non-Perl libraries called from XS, such as C<Gtk>, do so.  When this
3555 happens, Perl needs to be told that the locale has changed.  Use this function
3556 to do so, before returning to Perl.
3557
3558 =cut
3559 */
3560
void
Perl_sync_locale(pTHX)
{
    /* For each of the CTYPE, COLLATE and NUMERIC categories that this build
     * is configured to use, query the category's current locale with
     * do_setlocale_c(cat, NULL) and pass the result to the matching new_*()
     * routine. */

#ifdef USE_LOCALE_CTYPE

    {
        char * const ctype_locale = do_setlocale_c(LC_CTYPE, NULL);

        DEBUG_Lv(PerlIO_printf(Perl_debug_log,
            "%s:%d: %s\n", __FILE__, __LINE__,
            setlocale_debug_string(LC_CTYPE, NULL, ctype_locale)));
        new_ctype(ctype_locale);
    }

#endif /* USE_LOCALE_CTYPE */
#ifdef USE_LOCALE_COLLATE

    {
        char * const collate_locale = do_setlocale_c(LC_COLLATE, NULL);

        DEBUG_Lv(PerlIO_printf(Perl_debug_log,
            "%s:%d: %s\n", __FILE__, __LINE__,
            setlocale_debug_string(LC_COLLATE, NULL, collate_locale)));
        new_collate(collate_locale);
    }

#endif /* USE_LOCALE_COLLATE */
#ifdef USE_LOCALE_NUMERIC

    {
        char * const numeric_locale = do_setlocale_c(LC_NUMERIC, NULL);

        DEBUG_Lv(PerlIO_printf(Perl_debug_log,
            "%s:%d: %s\n", __FILE__, __LINE__,
            setlocale_debug_string(LC_NUMERIC, NULL, numeric_locale)));
        new_numeric(numeric_locale);
    }

#endif /* USE_LOCALE_NUMERIC */

}
3595
3596 #if defined(DEBUGGING) && defined(USE_LOCALE)
3597
3598 STATIC char *
3599 S_setlocale_debug_string(const int category,        /* category number,
3600                                                            like LC_ALL */
3601                             const char* const locale,   /* locale name */
3602
3603                             /* return value from setlocale() when attempting to
3604                              * set 'category' to 'locale' */
3605                             const char* const retval)
3606 {
3607     /* Returns a pointer to a NUL-terminated string in static storage with
3608      * added text about the info passed in.  This is not thread safe and will
3609      * be overwritten by the next call, so this should be used just to
3610      * formulate a string to immediately print or savepv() on. */
3611
3612     /* initialise to a non-null value to keep it out of BSS and so keep
3613      * -DPERL_GLOBAL_STRUCT_PRIVATE happy */
3614     static char ret[128] = "If you can read this, thank your buggy C"
3615                            " library strlcpy(), and change your hints file"
3616                            " to undef it";
3617     unsigned int i;
3618
3619 #  ifdef LC_ALL
3620
3621     const unsigned int highest_index = LC_ALL_INDEX;
3622
3623 #  else
3624
3625     const unsigned int highest_index = NOMINAL_LC_ALL_INDEX - 1;
3626
3627 #endif
3628
3629
3630     my_strlcpy(ret, "setlocale(", sizeof(ret));
3631
3632     /* Look for category in our list, and if found, add its name */
3633     for (i = 0; i <= highest_index; i++) {
3634         if (category == categories[i]) {
3635             my_strlcat(ret, category_names[i], sizeof(ret));
3636             goto found_category;
3637         }
3638     }
3639
3640     /* Unknown category to us */
3641     my_snprintf(ret, sizeof(ret), "%s? %d", ret, category);
3642
3643   found_category:
3644
3645     my_strlcat(ret, ", ", sizeof(ret));
3646
3647     if (locale) {
3648         my_strlcat(ret, "\"", sizeof(ret));
3649         my_strlcat(ret, locale, sizeof(ret));
3650         my_strlcat(ret, "\"", sizeof(ret));
3651     }
3652     else {
3653         my_strlcat(ret, "NULL", sizeof(ret));
3654     }
3655
3656     my_strlcat(ret, ") returned ", sizeof(ret));
3657
3658     if (retval) {
3659         my_strlcat(ret, "\"", sizeof(ret));
3660         my_strlcat(ret, retval, sizeof(ret));
3661         my_strlcat(ret, "\"", sizeof(ret));
3662     }
3663     else {
3664         my_strlcat(ret, "NULL", sizeof(ret));
3665     }
3666
3667     assert(strlen(ret) < sizeof(ret));
3668
3669     return ret;
3670 }
3671
3672 #endif
3673
3674
3675 /*
3676  * ex: set ts=8 sts=4 sw=4 et:
3677  */