perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* locale.c
	2	*
	3	* Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
	4	* 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* A Elbereth Gilthoniel,
	13	* silivren penna míriel
	14	* o menel aglar elenath!
	15	* Na-chaered palan-díriel
	16	* o galadhremmin ennorath,
	17	* Fanuilos, le linnathon
	18	* nef aear, si nef aearon!
	19	*
	20	* [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
	21	*/
	22
	23	/* utility functions for handling locale-specific stuff like what
	24	* character represents the decimal point.
	25	*
	26	* All C programs have an underlying locale. Perl code generally doesn't pay
	27	* any attention to it except within the scope of a 'use locale'. For most
	28	* categories, it accomplishes this by just using different operations if it is
	29	* in such scope than if not. However, various libc functions called by Perl
	30	* are affected by the LC_NUMERIC category, so there are macros in perl.h that
	31	* are used to toggle between the current locale and the C locale depending on
	32	* the desired behavior of those functions at the moment. And, LC_MESSAGES is
	33	* switched to the C locale for outputting the message unless within the scope
	34	* of 'use locale'.
	35	*/
	36
	37	#include "EXTERN.h"
	38	#define PERL_IN_LOCALE_C
	39	#include "perl_langinfo.h"
	40	#include "perl.h"
	41
	42	#include "reentr.h"
	43
	/* Debugging output during initialization can be requested through the
	 * environment.  Initialization happens before option parsing and before
	 * any thread creation, so the flag can be a file-level static.  In builds
	 * without DEBUGGING, or with PERL_GLOBAL_STRUCT, the flag is the constant 0
	 * and the setter expands to nothing. */
	#if ! defined(DEBUGGING) || defined(PERL_GLOBAL_STRUCT)
	#  define debug_initialization 0
	#  define DEBUG_INITIALIZATION_set(v)
	#else
	static bool debug_initialization = FALSE;
	#  define DEBUG_INITIALIZATION_set(v) (debug_initialization = (v))
	#endif
	54
	/* Length of a literal string constant, computed at compile time and not
	 * counting the terminating NUL.  The argument is pasted between two empty
	 * literals, so it must itself be a string literal.  This might be worth
	 * making more general, but for now it is used in just this file; a problem
	 * with more generality is the compiler warnings about comparing unlike
	 * signs. */
	#define STRLENs(s) (sizeof("" s "") - sizeof(""))
	59
	/* Is the C string input 'name' "C" or "POSIX"?  If so, and 'name' is the
	 * return of setlocale(), then this is extremely likely to be the C or POSIX
	 * locale.  However, the output of setlocale() is documented to be opaque, but
	 * the odds are extremely small that it would return these two strings for some
	 * other locale.  Note that VMS in these two locales includes many non-ASCII
	 * characters as controls and punctuation (below are hex bytes):
	 *   cntrl:  84-97 9B-9F
	 *   punct:  A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
	 * Oddly, none there are listed as alphas, though some represent alphabetics
	 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html
	 *
	 * A NULL 'name' yields false.  The "C" test looks at the first two bytes of
	 * the string directly ('C' followed by the terminating NUL); 'name' is
	 * evaluated more than once, so it must not have side effects. */
	#define isNAME_C_OR_POSIX(name)                                             \
	    (   (name) != NULL                                                      \
	     && (   (*(name) == 'C' && *((name) + 1) == '\0')                       \
	         || strEQ((name), "POSIX")))
	74
	75	#ifdef USE_LOCALE
	76
	77	/*
	78	* Standardize the locale name from a string returned by 'setlocale', possibly
	79	* modifying that string.
	80	*
	81	* The typical return value of setlocale() is either
	82	* (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
	83	* (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
	84	* (the space-separated values represent the various sublocales,
	85	* in some unspecified order). This is not handled by this function.
	86	*
	87	* In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
	88	* which is harmful for further use of the string in setlocale(). This
	89	* function removes the trailing new line and everything up through the '='
	90	*
	91	*/
	92	STATIC char *
	93	S_stdize_locale(pTHX_ char *locs)
	94	{
	95	const char * const s = strchr(locs, '=');
	96	bool okay = TRUE;
	97
	98	PERL_ARGS_ASSERT_STDIZE_LOCALE;
	99
	100	if (s) {
	101	const char * const t = strchr(s, '.');
	102	okay = FALSE;
	103	if (t) {
	104	const char * const u = strchr(t, '\n');
	105	if (u && (u[1] == 0)) {
	106	const STRLEN len = u - s;
	107	Move(s + 1, locs, len, char);
	108	locs[len] = 0;
	109	okay = TRUE;
	110	}
	111	}
	112	}
	113
	114	if (!okay)
	115	Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
	116
	117	return locs;
	118	}
	119
	/* First of two parallel arrays: the locale categories Perl uses on this
	 * system.  (The second array holds their names, at the same indices.)  The
	 * order is mostly arbitrary, except that LC_ALL, when the platform has it,
	 * is the last real entry. */

	const int categories[] = {

	#  ifdef USE_LOCALE_NUMERIC
	    LC_NUMERIC,
	#  endif
	#  ifdef USE_LOCALE_CTYPE
	    LC_CTYPE,
	#  endif
	#  ifdef USE_LOCALE_COLLATE
	    LC_COLLATE,
	#  endif
	#  ifdef USE_LOCALE_TIME
	    LC_TIME,
	#  endif
	#  ifdef USE_LOCALE_MESSAGES
	    LC_MESSAGES,
	#  endif
	#  ifdef USE_LOCALE_MONETARY
	    LC_MONETARY,
	#  endif
	#  ifdef USE_LOCALE_ADDRESS
	    LC_ADDRESS,
	#  endif
	#  ifdef USE_LOCALE_IDENTIFICATION
	    LC_IDENTIFICATION,
	#  endif
	#  ifdef USE_LOCALE_MEASUREMENT
	    LC_MEASUREMENT,
	#  endif
	#  ifdef USE_LOCALE_PAPER
	    LC_PAPER,
	#  endif
	#  ifdef USE_LOCALE_TELEPHONE
	    LC_TELEPHONE,
	#  endif
	#  ifdef LC_ALL
	    LC_ALL,
	#  endif
	    -1      /* Unused placeholder: guarantees a final element without a
	               trailing comma, whichever of the #ifdef's above are
	               compiled in */
	};
	166
	/* Second of the two parallel arrays: the name of each category, at the same
	 * index as the category's value in 'categories'.  The top-most real element
	 * is LC_ALL.  This is a read-only lookup table, so the pointers themselves
	 * are const as well as the strings they point to. */

	const char * const category_names[] = {

	#  ifdef USE_LOCALE_NUMERIC
	    "LC_NUMERIC",
	#  endif
	#  ifdef USE_LOCALE_CTYPE
	    "LC_CTYPE",
	#  endif
	#  ifdef USE_LOCALE_COLLATE
	    "LC_COLLATE",
	#  endif
	#  ifdef USE_LOCALE_TIME
	    "LC_TIME",
	#  endif
	#  ifdef USE_LOCALE_MESSAGES
	    "LC_MESSAGES",
	#  endif
	#  ifdef USE_LOCALE_MONETARY
	    "LC_MONETARY",
	#  endif
	#  ifdef USE_LOCALE_ADDRESS
	    "LC_ADDRESS",
	#  endif
	#  ifdef USE_LOCALE_IDENTIFICATION
	    "LC_IDENTIFICATION",
	#  endif
	#  ifdef USE_LOCALE_MEASUREMENT
	    "LC_MEASUREMENT",
	#  endif
	#  ifdef USE_LOCALE_PAPER
	    "LC_PAPER",
	#  endif
	#  ifdef USE_LOCALE_TELEPHONE
	    "LC_TELEPHONE",
	#  endif
	#  ifdef LC_ALL
	    "LC_ALL",
	#  endif
	    NULL  /* Placeholder, parallel to the final -1 in 'categories' */
	};
	209
	/* NOMINAL_LC_ALL_INDEX is the index at which LC_ALL lives in the two arrays
	 * above, or would live if the platform had it. */
	#  ifndef LC_ALL

	/* No LC_ALL on this system: pretend it is there, one beyond the real top
	 * element, hence in the unused placeholder element. */
	#    define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)

	#  else

	/* LC_ALL exists and is kept in the highest index position.  (-2 to account
	 * for the final unused placeholder element.) */
	#    define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)

	#  endif

	/* Pretending there is an LC_ALL element just above allows us to avoid most
	 * special cases.  Most loops through these arrays in the code below are
	 * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'.  They will work
	 * on either type of system.  But the code must be written to not access the
	 * element at 'LC_ALL_INDEX' except on platforms that have it.  This can be
	 * checked for at compile time by using the #define LC_ALL_INDEX which is only
	 * defined if we do have LC_ALL. */
	231
	232	STATIC const char *
	233	S_category_name(const int category)
	234	{
	235	unsigned int i;
	236
	237	#ifdef LC_ALL
	238
	239	if (category == LC_ALL) {
	240	return "LC_ALL";
	241	}
	242
	243	#endif
	244
	245	for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
	246	if (category == categories[i]) {
	247	return category_names[i];
	248	}
	249	}
	250
	251	{
	252	const char suffix[] = " (unknown)";
	253	int temp = category;
	254	Size_t length = sizeof(suffix) + 1;
	255	char * unknown;
	256	dTHX;
	257
	258	if (temp < 0) {
	259	length++;
	260	temp = - temp;
	261	}
	262
	263	/* Calculate the number of digits */
	264	while (temp >= 10) {
	265	temp /= 10;
	266	length++;
	267	}
	268
	269	Newx(unknown, length, char);
	270	my_snprintf(unknown, length, "%d%s", category, suffix);
	271	SAVEFREEPV(unknown);
	272	return unknown;
	273	}
	274	}
	275
	/* Now create LC_foo_INDEX #defines for just those categories on this system.
	 *
	 * Each LC_foo_INDEX is the position of LC_foo in the 'categories' array.
	 * The _DUMMY_foo macros carry "the index of the most recent category that
	 * is present" down the chain, starting at -1: a category that is compiled
	 * in gets the previous value + 1, and one that is not just passes the
	 * previous value along unchanged.  The expansions are parenthesized so that
	 * they are safe to use inside larger expressions. */
	#  ifdef USE_LOCALE_NUMERIC
	#    define LC_NUMERIC_INDEX            0
	#    define _DUMMY_NUMERIC              LC_NUMERIC_INDEX
	#  else
	#    define _DUMMY_NUMERIC              (-1)
	#  endif
	#  ifdef USE_LOCALE_CTYPE
	#    define LC_CTYPE_INDEX              (_DUMMY_NUMERIC + 1)
	#    define _DUMMY_CTYPE                LC_CTYPE_INDEX
	#  else
	#    define _DUMMY_CTYPE                _DUMMY_NUMERIC
	#  endif
	#  ifdef USE_LOCALE_COLLATE
	#    define LC_COLLATE_INDEX            (_DUMMY_CTYPE + 1)
	#    define _DUMMY_COLLATE              LC_COLLATE_INDEX
	#  else
	     /* Pass along the previous link in the chain (not this macro itself) */
	#    define _DUMMY_COLLATE              _DUMMY_CTYPE
	#  endif
	#  ifdef USE_LOCALE_TIME
	#    define LC_TIME_INDEX               (_DUMMY_COLLATE + 1)
	#    define _DUMMY_TIME                 LC_TIME_INDEX
	#  else
	#    define _DUMMY_TIME                 _DUMMY_COLLATE
	#  endif
	#  ifdef USE_LOCALE_MESSAGES
	#    define LC_MESSAGES_INDEX           (_DUMMY_TIME + 1)
	#    define _DUMMY_MESSAGES             LC_MESSAGES_INDEX
	#  else
	#    define _DUMMY_MESSAGES             _DUMMY_TIME
	#  endif
	#  ifdef USE_LOCALE_MONETARY
	#    define LC_MONETARY_INDEX           (_DUMMY_MESSAGES + 1)
	#    define _DUMMY_MONETARY             LC_MONETARY_INDEX
	#  else
	#    define _DUMMY_MONETARY             _DUMMY_MESSAGES
	#  endif
	#  ifdef USE_LOCALE_ADDRESS
	#    define LC_ADDRESS_INDEX            (_DUMMY_MONETARY + 1)
	#    define _DUMMY_ADDRESS              LC_ADDRESS_INDEX
	#  else
	#    define _DUMMY_ADDRESS              _DUMMY_MONETARY
	#  endif
	#  ifdef USE_LOCALE_IDENTIFICATION
	#    define LC_IDENTIFICATION_INDEX     (_DUMMY_ADDRESS + 1)
	#    define _DUMMY_IDENTIFICATION       LC_IDENTIFICATION_INDEX
	#  else
	#    define _DUMMY_IDENTIFICATION       _DUMMY_ADDRESS
	#  endif
	#  ifdef USE_LOCALE_MEASUREMENT
	#    define LC_MEASUREMENT_INDEX        (_DUMMY_IDENTIFICATION + 1)
	#    define _DUMMY_MEASUREMENT          LC_MEASUREMENT_INDEX
	#  else
	#    define _DUMMY_MEASUREMENT          _DUMMY_IDENTIFICATION
	#  endif
	#  ifdef USE_LOCALE_PAPER
	#    define LC_PAPER_INDEX              (_DUMMY_MEASUREMENT + 1)
	#    define _DUMMY_PAPER                LC_PAPER_INDEX
	#  else
	#    define _DUMMY_PAPER                _DUMMY_MEASUREMENT
	#  endif
	#  ifdef USE_LOCALE_TELEPHONE
	#    define LC_TELEPHONE_INDEX          (_DUMMY_PAPER + 1)
	#    define _DUMMY_TELEPHONE            LC_TELEPHONE_INDEX
	#  else
	#    define _DUMMY_TELEPHONE            _DUMMY_PAPER
	#  endif
	#  ifdef LC_ALL
	#    define LC_ALL_INDEX                (_DUMMY_TELEPHONE + 1)
	#  endif
	346	#endif /* ifdef USE_LOCALE */
	347
	/* Base-level setlocale(): the plain libc function everywhere except on
	 * Windows, which requires a customized one */
	#  ifndef WIN32
	#    define my_setlocale(category, name) setlocale(category, name)
	#  else
	#    define my_setlocale(category, name) win32_setlocale(category, name)
	#  endif

	/* Just placeholders for now.  The "_c" form is intended to be called when the
	 * category is a constant known at compile time; the "_r" form when it is not
	 * known until run time.  Both currently expand to my_setlocale(). */
	#  define do_setlocale_c(cat, name) my_setlocale(cat, name)
	#  define do_setlocale_r(cat, name) my_setlocale(cat, name)
	359
	360	STATIC void
	361	S_set_numeric_radix(pTHX_ const bool use_locale)
	362	{
	363	/* If 'use_locale' is FALSE, set to use a dot for the radix character. If
	364	* TRUE, use the radix character derived from the current locale */
	365
	366	#if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \
	367	\|\| defined(HAS_NL_LANGINFO))
	368
	369	/* We only set up the radix SV if we are to use a locale radix ... */
	370	if (use_locale) {
	371	const char * radix = my_nl_langinfo(PERL_RADIXCHAR, FALSE);
	372	/* FALSE => already in dest locale */
	373	/* ... and the character being used isn't a dot */
	374	if (strNE(radix, ".")) {
	375	const U8 * first_variant;
	376
	377	if (PL_numeric_radix_sv) {
	378	sv_setpv(PL_numeric_radix_sv, radix);
	379	}
	380	else {
	381	PL_numeric_radix_sv = newSVpv(radix, 0);
	382	}
	383
	384	/* If there is a byte variant under UTF-8, and if the remainder of
	385	* the string starting there is valid UTF-8, and we are in a UTF-8
	386	* locale, then mark the radix as being in UTF-8 */
	387	if ( ! is_utf8_invariant_string_loc(
	388	(U8 *) SvPVX(PL_numeric_radix_sv),
	389	SvCUR(PL_numeric_radix_sv),
	390	&first_variant)
	391	&& is_utf8_string(first_variant,
	392	SvCUR(PL_numeric_radix_sv)
	393	- ((char *) first_variant
	394	- SvPVX(PL_numeric_radix_sv)))
	395	&& _is_cur_LC_category_utf8(LC_NUMERIC))
	396	{
	397	SvUTF8_on(PL_numeric_radix_sv);
	398	}
	399	goto done;
	400	}
	401	}
	402
	403	SvREFCNT_dec(PL_numeric_radix_sv);
	404	PL_numeric_radix_sv = NULL;
	405
	406	done: ;
	407
	408	# ifdef DEBUGGING
	409
	410	if (DEBUG_L_TEST \|\| debug_initialization) {
	411	PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
	412	(PL_numeric_radix_sv)
	413	? SvPVX(PL_numeric_radix_sv)
	414	: "NULL",
	415	(PL_numeric_radix_sv)
	416	? cBOOL(SvUTF8(PL_numeric_radix_sv))
	417	: 0);
	418	}
	419
	420	# endif
	421	#endif /* USE_LOCALE_NUMERIC and can find the radix char */
	422
	423	}
	424
	425
	426	void
	427	Perl_new_numeric(pTHX_ const char *newnum)
	428	{
	429
	430	#ifndef USE_LOCALE_NUMERIC
	431
	432	PERL_UNUSED_ARG(newnum);
	433
	434	#else
	435
	436	/* Called after each libc setlocale() call affecting LC_NUMERIC, to tell
	437	* core Perl this and that 'newnum' is the name of the new locale.
	438	* It installs this locale as the current underlying default.
	439	*
	440	* The default locale and the C locale can be toggled between by use of the
	441	* set_numeric_underlying() and set_numeric_standard() functions, which
	442	* should probably not be called directly, but only via macros like
	443	* SET_NUMERIC_STANDARD() in perl.h.
	444	*
	445	* The toggling is necessary mainly so that a non-dot radix decimal point
	446	* character can be output, while allowing internal calculations to use a
	447	* dot.
	448	*
	449	* This sets several interpreter-level variables:
	450	* PL_numeric_name The underlying locale's name: a copy of 'newnum'
	451	* PL_numeric_underlying A boolean indicating if the toggled state is such
	452	* that the current locale is the program's underlying
	453	* locale
	454	* PL_numeric_standard An int indicating if the toggled state is such
	455	* that the current locale is the C locale or
	456	* indistinguishable from the C locale. If non-zero, it
	457	* is in C; if > 1, it means it may not be toggled away
	458	* from C.
	459	* PL_numeric_underlying_is_standard A bool kept by this function
	460	* indicating that the underlying locale and the standard
	461	* C locale are indistinguishable for the purposes of
	462	* LC_NUMERIC. This happens when both of the above two
	463	* variables are true at the same time. (Toggling is a
	464	* no-op under these circumstances.) This variable is
	465	* used to avoid having to recalculate.
	466	* Any code changing the locale (outside this file) should use
	467	* POSIX::setlocale, which calls this function. Therefore this function
	468	* should be called directly only from this file and from
	469	* POSIX::setlocale() */
	470
	471	char *save_newnum;
	472
	473	if (! newnum) {
	474	Safefree(PL_numeric_name);
	475	PL_numeric_name = NULL;
	476	PL_numeric_standard = TRUE;
	477	PL_numeric_underlying = TRUE;
	478	PL_numeric_underlying_is_standard = TRUE;
	479	return;
	480	}
	481
	482	save_newnum = stdize_locale(savepv(newnum));
	483	PL_numeric_underlying = TRUE;
	484	PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
	485
	486	/* If its name isn't C nor POSIX, it could still be indistinguishable from
	487	* them */
	488	if (! PL_numeric_standard) {
	489	PL_numeric_standard = cBOOL(strEQ(".", my_nl_langinfo(PERL_RADIXCHAR,
	490	FALSE /* Don't toggle locale */ ))
	491	&& strEQ("", my_nl_langinfo(PERL_THOUSEP,
	492	FALSE)));
	493	}
	494
	495	/* Save the new name if it isn't the same as the previous one, if any */
	496	if (! PL_numeric_name \|\| strNE(PL_numeric_name, save_newnum)) {
	497	Safefree(PL_numeric_name);
	498	PL_numeric_name = save_newnum;
	499	}
	500	else {
	501	Safefree(save_newnum);
	502	}
	503
	504	PL_numeric_underlying_is_standard = PL_numeric_standard;
	505
	506	if (DEBUG_L_TEST \|\| debug_initialization) {
	507	PerlIO_printf(Perl_debug_log, "Called new_numeric with %s, PL_numeric_name=%s\n", newnum, PL_numeric_name);
	508	}
	509
	510	/* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't
	511	* have to worry about the radix being a non-dot. (Core operations that
	512	* need the underlying locale change to it temporarily). */
	513	set_numeric_standard();
	514
	515	#endif /* USE_LOCALE_NUMERIC */
	516
	517	}
	518
	void
	Perl_set_numeric_standard(pTHX)
	{

	#ifdef USE_LOCALE_NUMERIC

	    /* Switch the LC_NUMERIC locale to C, and update the bookkeeping to
	     * match.  Most code should not call this directly, but go through the
	     * macros like SET_NUMERIC_STANDARD() in perl.h, which skip the call when
	     * our records say no toggling is needed (the records could be wrong if
	     * some XS code has changed the locale behind our back) */

	    do_setlocale_c(LC_NUMERIC, "C");

	    /* We are in the underlying locale as well exactly when the underlying
	     * locale is indistinguishable from C */
	    PL_numeric_underlying = PL_numeric_underlying_is_standard;
	    PL_numeric_standard = TRUE;

	    /* The radix is a dot from here on */
	    set_numeric_radix(0);

	#  ifdef DEBUGGING

	    if (DEBUG_L_TEST || debug_initialization) {
	        PerlIO_printf(Perl_debug_log, "LC_NUMERIC locale now is standard C\n");
	    }

	#  endif
	#endif /* USE_LOCALE_NUMERIC */

	}
	547
	void
	Perl_set_numeric_underlying(pTHX)
	{

	#ifdef USE_LOCALE_NUMERIC

	    /* Switch the LC_NUMERIC locale to the current underlying default (the
	     * one named by PL_numeric_name), and update the bookkeeping to match.
	     * Most code should not call this directly, but go through the macros
	     * like SET_NUMERIC_UNDERLYING() in perl.h, which skip the call when our
	     * records say no toggling is needed (the records could be wrong if some
	     * XS code has changed the locale behind our back) */

	    do_setlocale_c(LC_NUMERIC, PL_numeric_name);

	    /* We are in the C locale as well exactly when the underlying locale is
	     * indistinguishable from C */
	    PL_numeric_underlying = TRUE;
	    PL_numeric_standard = PL_numeric_underlying_is_standard;

	    /* Use whatever radix the locale now in effect has */
	    set_numeric_radix(1);

	#  ifdef DEBUGGING

	    if (DEBUG_L_TEST || debug_initialization) {
	        PerlIO_printf(Perl_debug_log, "LC_NUMERIC locale now is %s\n",
	                                      PL_numeric_name);
	    }

	#  endif
	#endif /* USE_LOCALE_NUMERIC */

	}
	577
	578	/*
	579	* Set up for a new ctype locale.
	580	*/
	581	STATIC void
	582	S_new_ctype(pTHX_ const char *newctype)
	583	{
	584
	585	#ifndef USE_LOCALE_CTYPE
	586
	587	PERL_ARGS_ASSERT_NEW_CTYPE;
	588	PERL_UNUSED_ARG(newctype);
	589	PERL_UNUSED_CONTEXT;
	590
	591	#else
	592
	593	/* Called after each libc setlocale() call affecting LC_CTYPE, to tell
	594	* core Perl this and that 'newctype' is the name of the new locale.
	595	*
	596	* This function sets up the folding arrays for all 256 bytes, assuming
	597	* that tofold() is tolc() since fold case is not a concept in POSIX,
	598	*
	599	* Any code changing the locale (outside this file) should use
	600	* POSIX::setlocale, which calls this function. Therefore this function
	601	* should be called directly only from this file and from
	602	* POSIX::setlocale() */
	603
	604	dVAR;
	605	UV i;
	606
	607	PERL_ARGS_ASSERT_NEW_CTYPE;
	608
	609	/* We will replace any bad locale warning with 1) nothing if the new one is
	610	* ok; or 2) a new warning for the bad new locale */
	611	if (PL_warn_locale) {
	612	SvREFCNT_dec_NN(PL_warn_locale);
	613	PL_warn_locale = NULL;
	614	}
	615
	616	PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
	617
	618	/* A UTF-8 locale gets standard rules. But note that code still has to
	619	* handle this specially because of the three problematic code points */
	620	if (PL_in_utf8_CTYPE_locale) {
	621	Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
	622	}
	623	else {
	624	/* Assume enough space for every character being bad. 4 spaces each
	625	* for the 94 printable characters that are output like "'x' "; and 5
	626	* spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
	627	* NUL */
	628	char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ];
	629
	630	/* Don't check for problems if we are suppressing the warnings */
	631	bool check_for_problems = ckWARN_d(WARN_LOCALE)
	632	\|\| UNLIKELY(DEBUG_L_TEST);
	633	bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
	634	to start */
	635	unsigned int bad_count = 0; /* Count of bad characters */
	636
	637	for (i = 0; i < 256; i++) {
	638	if (isupper(i))
	639	PL_fold_locale[i] = (U8) tolower(i);
	640	else if (islower(i))
	641	PL_fold_locale[i] = (U8) toupper(i);
	642	else
	643	PL_fold_locale[i] = (U8) i;
	644
	645	/* If checking for locale problems, see if the native ASCII-range
	646	* printables plus \n and \t are in their expected categories in
	647	* the new locale. If not, this could mean big trouble, upending
	648	* Perl's and most programs' assumptions, like having a
	649	* metacharacter with special meaning become a \w. Fortunately,
	650	* it's very rare to find locales that aren't supersets of ASCII
	651	* nowadays. It isn't a problem for most controls to be changed
	652	* into something else; we check only \n and \t, though perhaps \r
	653	* could be an issue as well. */
	654	if ( check_for_problems
	655	&& (isGRAPH_A(i) \|\| isBLANK_A(i) \|\| i == '\n'))
	656	{
	657	if ( cBOOL(isalnum(i)) != cBOOL(isALPHANUMERIC(i))
	658	\|\| cBOOL(isalpha(i)) != cBOOL(isALPHA_A(i))
	659	\|\| cBOOL(isdigit(i)) != cBOOL(isDIGIT_A(i))
	660	\|\| cBOOL(isgraph(i)) != cBOOL(isGRAPH_A(i))
	661	\|\| cBOOL(islower(i)) != cBOOL(isLOWER_A(i))
	662	\|\| cBOOL(isprint(i)) != cBOOL(isPRINT_A(i))
	663	\|\| cBOOL(ispunct(i)) != cBOOL(isPUNCT_A(i))
	664	\|\| cBOOL(isspace(i)) != cBOOL(isSPACE_A(i))
	665	\|\| cBOOL(isupper(i)) != cBOOL(isUPPER_A(i))
	666	\|\| cBOOL(isxdigit(i))!= cBOOL(isXDIGIT_A(i))
	667	\|\| tolower(i) != (int) toLOWER_A(i)
	668	\|\| toupper(i) != (int) toUPPER_A(i)
	669	\|\| (i == '\n' && ! isCNTRL_LC(i)))
	670	{
	671	if (bad_count) { /* Separate multiple entries with a
	672	blank */
	673	bad_chars_list[bad_count++] = ' ';
	674	}
	675	bad_chars_list[bad_count++] = '\'';
	676	if (isPRINT_A(i)) {
	677	bad_chars_list[bad_count++] = (char) i;
	678	}
	679	else {
	680	bad_chars_list[bad_count++] = '\\';
	681	if (i == '\n') {
	682	bad_chars_list[bad_count++] = 'n';
	683	}
	684	else {
	685	assert(i == '\t');
	686	bad_chars_list[bad_count++] = 't';
	687	}
	688	}
	689	bad_chars_list[bad_count++] = '\'';
	690	bad_chars_list[bad_count] = '\0';
	691	}
	692	}
	693	}
	694
	695	# ifdef MB_CUR_MAX
	696
	697	/* We only handle single-byte locales (outside of UTF-8 ones; so if
	698	* this locale requires more than one byte, there are going to be
	699	* problems. */
	700	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	701	"%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
	702	__FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
	703
	704	if (check_for_problems && MB_CUR_MAX > 1
	705
	706	/* Some platforms return MB_CUR_MAX > 1 for even the "C"
	707	* locale. Just assume that the implementation for them (plus
	708	* for POSIX) is correct and the > 1 value is spurious. (Since
	709	* these are specially handled to never be considered UTF-8
	710	* locales, as long as this is the only problem, everything
	711	* should work fine */
	712	&& strNE(newctype, "C") && strNE(newctype, "POSIX"))
	713	{
	714	multi_byte_locale = TRUE;
	715	}
	716
	717	# endif
	718
	719	if (bad_count \|\| multi_byte_locale) {
	720	PL_warn_locale = Perl_newSVpvf(aTHX_
	721	"Locale '%s' may not work well.%s%s%s\n",
	722	newctype,
	723	(multi_byte_locale)
	724	? " Some characters in it are not recognized by"
	725	" Perl."
	726	: "",
	727	(bad_count)
	728	? "\nThe following characters (and maybe others)"
	729	" may not have the same meaning as the Perl"
	730	" program expects:\n"
	731	: "",
	732	(bad_count)
	733	? bad_chars_list
	734	: ""
	735	);
	736	/* If we are actually in the scope of the locale or are debugging,
	737	* output the message now. If not in that scope, we save the
	738	* message to be output at the first operation using this locale,
	739	* if that actually happens. Most programs don't use locales, so
	740	* they are immune to bad ones. */
	741	if (IN_LC(LC_CTYPE) \|\| UNLIKELY(DEBUG_L_TEST)) {
	742
	743	/* We have to save 'newctype' because the setlocale() just
	744	* below may destroy it. The next setlocale() further down
	745	* should restore it properly so that the intermediate change
	746	* here is transparent to this function's caller */
	747	const char * const badlocale = savepv(newctype);
	748
	749	do_setlocale_c(LC_CTYPE, "C");
	750
	751	/* The '0' below suppresses a bogus gcc compiler warning */
	752	Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
	753
	754	do_setlocale_c(LC_CTYPE, badlocale);
	755	Safefree(badlocale);
	756
	757	if (IN_LC(LC_CTYPE)) {
	758	SvREFCNT_dec_NN(PL_warn_locale);
	759	PL_warn_locale = NULL;
	760	}
	761	}
	762	}
	763	}
	764
	765	#endif /* USE_LOCALE_CTYPE */
	766
	767	}
	768
	void
	Perl__warn_problematic_locale()
	{

	#ifdef USE_LOCALE_CTYPE

	    dTHX;

	    /* Internal-to-core function that outputs the message in PL_warn_locale,
	     * and then NULLS it.  Should be called only through the macro
	     * _CHECK_AND_WARN_PROBLEMATIC_LOCALE.  Does nothing if there is no
	     * pending message. */

	    if (PL_warn_locale) {

	        /* The saved text is a finished message, not a format; it can
	         * contain '%' (from the locale name or the list of problematic
	         * characters), so pass it as the argument to "%s" */
	        Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	                             "%s", SvPVX(PL_warn_locale));
	        SvREFCNT_dec_NN(PL_warn_locale);
	        PL_warn_locale = NULL;
	    }

	#endif

	}
	792
	793	STATIC void
	794	S_new_collate(pTHX_ const char *newcoll)
	795	{
	796
	797	#ifndef USE_LOCALE_COLLATE
	798
	799	PERL_UNUSED_ARG(newcoll);
	800	PERL_UNUSED_CONTEXT;
	801
	802	#else
	803
	804	/* Called after each libc setlocale() call affecting LC_COLLATE, to tell
	805	* core Perl this and that 'newcoll' is the name of the new locale.
	806	*
	807	* The design of locale collation is that every locale change is given an
	808	* index 'PL_collation_ix'. The first time a string participates in an
	809	* operation that requires collation while locale collation is active, it
	810	* is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
	811	* magic includes the collation index, and the transformation of the string
	812	* by strxfrm(), q.v. That transformation is used when doing comparisons,
	813	* instead of the string itself. If a string changes, the magic is
	814	* cleared. The next time the locale changes, the index is incremented,
	815	* and so we know during a comparison that the transformation is not
	816	* necessarily still valid, and so is recomputed. Note that if the locale
	817	* changes enough times, the index could wrap (a U32), and it is possible
	818	* that a transformation would improperly be considered valid, leading to
	819	* an unlikely bug */
	820
	821	if (! newcoll) {
	822	if (PL_collation_name) {
	823	++PL_collation_ix;
	824	Safefree(PL_collation_name);
	825	PL_collation_name = NULL;
	826	}
	827	PL_collation_standard = TRUE;
	828	is_standard_collation:
	829	PL_collxfrm_base = 0;
	830	PL_collxfrm_mult = 2;
	831	PL_in_utf8_COLLATE_locale = FALSE;
	832	PL_strxfrm_NUL_replacement = '\0';
	833	PL_strxfrm_max_cp = 0;
	834	return;
	835	}
	836
	837	/* If this is not the same locale as currently, set the new one up */
	838	if (! PL_collation_name \|\| strNE(PL_collation_name, newcoll)) {
	839	++PL_collation_ix;
	840	Safefree(PL_collation_name);
	841	PL_collation_name = stdize_locale(savepv(newcoll));
	842	PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
	843	if (PL_collation_standard) {
	844	goto is_standard_collation;
	845	}
	846
	847	PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
	848	PL_strxfrm_NUL_replacement = '\0';
	849	PL_strxfrm_max_cp = 0;
	850
	851	/* A locale collation definition includes primary, secondary, tertiary,
	852	* etc. weights for each character. To sort, the primary weights are
	853	* used, and only if they compare equal, then the secondary weights are
	854	* used, and only if they compare equal, then the tertiary, etc.
	855	*
	856	* strxfrm() works by taking the input string, say ABC, and creating an
	857	* output transformed string consisting of first the primary weights,
	858	* A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
	859	* tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
	860	* may not have weights at every level. In our example, let's say B
	861	* doesn't have a tertiary weight, and A doesn't have a secondary
	862	* weight. The constructed string is then going to be
	863	* A¹B¹C¹ B²C² A³C³ ....
	864	* This has the desired effect that strcmp() will look at the secondary
	865	* or tertiary weights only if the strings compare equal at all higher
	866	* priority weights. The spaces shown here, like in
	867	* "A¹B¹C¹ A²B²C² "
	868	* are not just for readability. In the general case, these must
	869	* actually be bytes, which we will call here 'separator weights'; and
	870	* they must be smaller than any other weight value, but since these
	871	* are C strings, only the terminating one can be a NUL (some
	872	* implementations may include a non-NUL separator weight just before
	873	* the NUL). Implementations tend to reserve 01 for the separator
	874	* weights. They are needed so that a shorter string's secondary
	875	* weights won't be misconstrued as primary weights of a longer string,
	876	* etc. By making them smaller than any other weight, the shorter
	877	* string will sort first. (Actually, if all secondary weights are
	878	* smaller than all primary ones, there is no need for a separator
	879	* weight between those two levels, etc.)
	880	*
	881	* The length of the transformed string is roughly a linear function of
	882	* the input string. It's not exactly linear because some characters
	883	* don't have weights at all levels. When we call strxfrm() we have to
	884	* allocate some memory to hold the transformed string. The
	885	* calculations below try to find coefficients 'm' and 'b' for this
	886	* locale so that m*x + b equals how much space we need, given the size
	887	* of the input string in 'x'. If we calculate too small, we increase
	888	* the size as needed, and call strxfrm() again, but it is better to
	889	* get it right the first time to avoid wasted expensive string
	890	* transformations. */
	891
	892	{
	893	/* We use the string below to find how long the transformation of it
	894	* is. Almost all locales are supersets of ASCII, or at least the
	895	* ASCII letters. We use all of them, half upper half lower,
	896	* because if we used fewer, we might hit just the ones that are
	897	* outliers in a particular locale. Most of the strings being
	898	* collated will contain a preponderance of letters, and even if
	899	* they are above-ASCII, they are likely to have the same number of
	900	* weight levels as the ASCII ones. It turns out that digits tend
	901	* to have fewer levels, and some punctuation has more, but those
	902	* are relatively sparse in text, and khw believes this gives a
	903	* reasonable result, but it could be changed if experience so
	904	* dictates. */
	905	const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
	906	char * x_longer; /* Transformed 'longer' */
	907	Size_t x_len_longer; /* Length of 'x_longer' */
	908
	909	char * x_shorter; /* We also transform a substring of 'longer' */
	910	Size_t x_len_shorter;
	911
	912	/* _mem_collxfrm() is used to get the transformation (though here we
	913	* are interested only in its length). It is used because it has
	914	* the intelligence to handle all cases, but to work, it needs some
	915	* values of 'm' and 'b' to get it started. For the purposes of
	916	* this calculation we use a very conservative estimate of 'm' and
	917	* 'b'. This assumes a weight can be multiple bytes, enough to
	918	* hold any UV on the platform, and there are 5 levels, 4 weight
	919	* bytes, and a trailing NUL. */
	920	PL_collxfrm_base = 5;
	921	PL_collxfrm_mult = 5 * sizeof(UV);
	922
	923	/* Find out how long the transformation really is */
	924	x_longer = _mem_collxfrm(longer,
	925	sizeof(longer) - 1,
	926	&x_len_longer,
	927
	928	/* We avoid converting to UTF-8 in the
	929	* called function by telling it the
	930	* string is in UTF-8 if the locale is a
	931	* UTF-8 one. Since the string passed
	932	* here is invariant under UTF-8, we can
	933	* claim it's UTF-8 even though it isn't.
	934	* */
	935	PL_in_utf8_COLLATE_locale);
	936	Safefree(x_longer);
	937
	938	/* Find out how long the transformation of a substring of 'longer'
	939	* is. Together the lengths of these transformations are
	940	* sufficient to calculate 'm' and 'b'. The substring is all of
	941	* 'longer' except the first character. This minimizes the chances
	942	* of being swayed by outliers */
	943	x_shorter = _mem_collxfrm(longer + 1,
	944	sizeof(longer) - 2,
	945	&x_len_shorter,
	946	PL_in_utf8_COLLATE_locale);
	947	Safefree(x_shorter);
	948
	949	/* If the results are nonsensical for this simple test, the whole
	950	* locale definition is suspect. Mark it so that locale collation
	951	* is not active at all for it. XXX Should we warn? */
	952	if ( x_len_shorter == 0
	953	\|\| x_len_longer == 0
	954	\|\| x_len_shorter >= x_len_longer)
	955	{
	956	PL_collxfrm_mult = 0;
	957	PL_collxfrm_base = 0;
	958	}
	959	else {
	960	SSize_t base; /* Temporary */
	961
	962	/* We have both: m * strlen(longer) + b = x_len_longer
	963	* m * strlen(shorter) + b = x_len_shorter;
	964	* subtracting yields:
	965	* m * (strlen(longer) - strlen(shorter))
	966	* = x_len_longer - x_len_shorter
	967	* But we have set things up so that 'shorter' is 1 byte smaller
	968	* than 'longer'. Hence:
	969	* m = x_len_longer - x_len_shorter
	970	*
	971	* But if something went wrong, make sure the multiplier is at
	972	* least 1.
	973	*/
	974	if (x_len_longer > x_len_shorter) {
	975	PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
	976	}
	977	else {
	978	PL_collxfrm_mult = 1;
	979	}
	980
	981	/* mx + b = len
	982	* so: b = len - mx
	983	* but in case something has gone wrong, make sure it is
	984	* non-negative */
	985	base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
	986	if (base < 0) {
	987	base = 0;
	988	}
	989
	990	/* Add 1 for the trailing NUL */
	991	PL_collxfrm_base = base + 1;
	992	}
	993
	994	# ifdef DEBUGGING
	995
	996	if (DEBUG_L_TEST \|\| debug_initialization) {
	997	PerlIO_printf(Perl_debug_log,
	998	"%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
	999	"x_len_longer=%zu,"
	1000	" collate multipler=%zu, collate base=%zu\n",
	1001	__FILE__, __LINE__,
	1002	PL_in_utf8_COLLATE_locale,
	1003	x_len_shorter, x_len_longer,
	1004	PL_collxfrm_mult, PL_collxfrm_base);
	1005	}
	1006	# endif
	1007
	1008	}
	1009	}
	1010
	1011	#endif /* USE_LOCALE_COLLATE */
	1012
	1013	}
	1014
	1015	#ifdef WIN32
	1016
	1017	STATIC char *
	1018	S_win32_setlocale(pTHX_ int category, const char* locale)
	1019	{
	1020	/* This, for Windows, emulates POSIX setlocale() behavior. There is no
	1021	* difference between the two unless the input locale is "", which normally
	1022	* means on Windows to get the machine default, which is set via the
	1023	* computer's "Regional and Language Options" (or its current equivalent).
	1024	* In POSIX, it instead means to find the locale from the user's
	1025	* environment. This routine changes the Windows behavior to first look in
	1026	* the environment, and, if anything is found, use that instead of going to
	1027	* the machine default. If there is no environment override, the machine
	1028	* default is used, by calling the real setlocale() with "".
	1029	*
	1030	* The POSIX behavior is to use the LC_ALL variable if set; otherwise to
	1031	* use the particular category's variable if set; otherwise to use the LANG
	1032	* variable. */
	1033
	1034	bool override_LC_ALL = FALSE;
	1035	char * result;
	1036	unsigned int i;
	1037
	1038	if (locale && strEQ(locale, "")) {
	1039
	1040	# ifdef LC_ALL
	1041
	1042	locale = PerlEnv_getenv("LC_ALL");
	1043	if (! locale) {
	1044	if (category == LC_ALL) {
	1045	override_LC_ALL = TRUE;
	1046	}
	1047	else {
	1048
	1049	# endif
	1050
	1051	for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
	1052	if (category == categories[i]) {
	1053	locale = PerlEnv_getenv(category_names[i]);
	1054	goto found_locale;
	1055	}
	1056	}
	1057
	1058	locale = PerlEnv_getenv("LANG");
	1059	if (! locale) {
	1060	locale = "";
	1061	}
	1062
	1063	found_locale: ;
	1064
	1065	# ifdef LC_ALL
	1066
	1067	}
	1068	}
	1069
	1070	# endif
	1071
	1072	}
	1073
	1074	result = setlocale(category, locale);
	1075	DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
	1076	setlocale_debug_string(category, locale, result)));
	1077
	1078	if (! override_LC_ALL) {
	1079	return result;
	1080	}
	1081
	1082	/* Here the input category was LC_ALL, and we have set it to what is in the
	1083	* LANG variable or the system default if there is no LANG. But these have
	1084	* lower priority than the other LC_foo variables, so override it for each
	1085	* one that is set. (If they are set to "", it means to use the same thing
	1086	* we just set LC_ALL to, so can skip) */
	1087
	1088	for (i = 0; i < LC_ALL_INDEX; i++) {
	1089	result = PerlEnv_getenv(category_names[i]);
	1090	if (result && strNE(result, "")) {
	1091	setlocale(categories[i], result);
	1092	DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
	1093	__FILE__, __LINE__,
	1094	setlocale_debug_string(categories[i], result, "not captured")));
	1095	}
	1096	}
	1097
	1098	result = setlocale(LC_ALL, NULL);
	1099	DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
	1100	__FILE__, __LINE__,
	1101	setlocale_debug_string(LC_ALL, NULL, result)));
	1102
	1103	return result;
	1104	}
	1105
	1106	#endif
	1107
	1108	char *
	1109	Perl_setlocale(int category, const char * locale)
	1110	{
	1111	/* This wraps POSIX::setlocale() */
	1112
	1113	char * retval;
	1114	char * newlocale;
	1115	dTHX;
	1116
	1117	#ifdef USE_LOCALE_NUMERIC
	1118
	1119	/* A NULL locale means only query what the current one is. We have the
	1120	* LC_NUMERIC name saved, because we are normally switched into the C
	1121	* locale for it. For an LC_ALL query, switch back to get the correct
	1122	* results. All other categories don't require special handling */
	1123	if (locale == NULL) {
	1124	if (category == LC_NUMERIC) {
	1125	return savepv(PL_numeric_name);
	1126	}
	1127
	1128	# ifdef LC_ALL
	1129
	1130	else if (category == LC_ALL && ! PL_numeric_underlying) {
	1131
	1132	SET_NUMERIC_UNDERLYING();
	1133	}
	1134
	1135	# endif
	1136
	1137	}
	1138
	1139	#endif
	1140
	1141	/* Save retval since subsequent setlocale() calls may overwrite it. */
	1142	retval = savepv(do_setlocale_r(category, locale));
	1143
	1144	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1145	"%s:%d: %s\n", __FILE__, __LINE__,
	1146	setlocale_debug_string(category, locale, retval)));
	1147	if (! retval) {
	1148	/* Should never happen that a query would return an error, but be
	1149	* sure and reset to C locale */
	1150	if (locale == 0) {
	1151	SET_NUMERIC_STANDARD();
	1152	}
	1153
	1154	return NULL;
	1155	}
	1156
	1157	/* If locale == NULL, we are just querying the state, but may have switched
	1158	* to NUMERIC_UNDERLYING. Switch back before returning. */
	1159	if (locale == NULL) {
	1160	SET_NUMERIC_STANDARD();
	1161	return retval;
	1162	}
	1163
	1164	/* Now that have switched locales, we have to update our records to
	1165	* correspond. */
	1166
	1167	switch (category) {
	1168
	1169	#ifdef USE_LOCALE_CTYPE
	1170
	1171	case LC_CTYPE:
	1172	new_ctype(retval);
	1173	break;
	1174
	1175	#endif
	1176	#ifdef USE_LOCALE_COLLATE
	1177
	1178	case LC_COLLATE:
	1179	new_collate(retval);
	1180	break;
	1181
	1182	#endif
	1183	#ifdef USE_LOCALE_NUMERIC
	1184
	1185	case LC_NUMERIC:
	1186	new_numeric(retval);
	1187	break;
	1188
	1189	#endif
	1190	#ifdef LC_ALL
	1191
	1192	case LC_ALL:
	1193
	1194	/* LC_ALL updates all the things we care about. The values may not
	1195	* be the same as 'retval', as the locale "" may have set things
	1196	* individually */
	1197
	1198	# ifdef USE_LOCALE_CTYPE
	1199
	1200	newlocale = do_setlocale_c(LC_CTYPE, NULL);
	1201	new_ctype(newlocale);
	1202
	1203	# endif /* USE_LOCALE_CTYPE */
	1204	# ifdef USE_LOCALE_COLLATE
	1205
	1206	newlocale = do_setlocale_c(LC_COLLATE, NULL);
	1207	new_collate(newlocale);
	1208
	1209	# endif
	1210	# ifdef USE_LOCALE_NUMERIC
	1211
	1212	newlocale = do_setlocale_c(LC_NUMERIC, NULL);
	1213	new_numeric(newlocale);
	1214
	1215	# endif /* USE_LOCALE_NUMERIC */
	1216	#endif /* LC_ALL */
	1217
	1218	default:
	1219	break;
	1220	}
	1221
	1222	return retval;
	1223
	1224
	1225	}
	1226
	1227	PERL_STATIC_INLINE const char *
	1228	S_save_to_buffer(const char * string, char *buf, Size_t buf_size, const Size_t offset)
	1229	{
	1230	/* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size',
	1231	* growing it if necessary */
	1232
	1233	const Size_t string_size = strlen(string) + offset + 1;
	1234
	1235	PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
	1236
	1237	if (*buf_size == 0) {
	1238	Newx(*buf, string_size, char);
	1239	*buf_size = string_size;
	1240	}
	1241	else if (string_size > *buf_size) {
	1242	Renew(*buf, string_size, char);
	1243	*buf_size = string_size;
	1244	}
	1245
	1246	Copy(string, *buf + offset, string_size - offset, char);
	1247	return *buf;
	1248	}
	1249
	1250	/*
	1251
	1252	=head1 Locale-related functions and macros
	1253
	1254	=for apidoc Perl_langinfo
	1255
	1256	This is an (almost ª) drop-in replacement for the system C<L<nl_langinfo(3)>>,
	1257	taking the same C<item> parameter values, and returning the same information.
	1258	But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
	1259	of Perl's locale handling from your code, and can be used on systems that lack
	1260	a native C<nl_langinfo>.
	1261
	1262	Expanding on these:
	1263
	1264	=over
	1265
	1266	=item *
	1267
	1268	It delivers the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
	1269	without you having to write extra code. The reason for the extra code would be
	1270	because these are from the C<LC_NUMERIC> locale category, which is normally
	1271	kept set to the C locale by Perl, no matter what the underlying locale is
	1272	supposed to be, and so to get the expected results, you have to temporarily
	1273	toggle into the underlying locale, and later toggle back. (You could use
	1274	plain C<nl_langinfo> and C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this
	1275	but then you wouldn't get the other advantages of C<Perl_langinfo()>; not
	1276	keeping C<LC_NUMERIC> in the C locale would break a lot of CPAN, which is
	1277	expecting the radix (decimal point) character to be a dot.)
	1278
	1279	=item *
	1280
	1281	Depending on C<item>, it works on systems that don't have C<nl_langinfo>, hence
	1282	makes your code more portable. Of the fifty-some possible items specified by
	1283	the POSIX 2008 standard,
	1284	L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
	1285	only two are completely unimplemented. It uses various techniques to recover
	1286	the other items, including calling C<L<localeconv(3)>>, and C<L<strftime(3)>>,
	1287	both of which are specified in C89, so should always be available. Later
	1288	C<strftime()> versions have additional capabilities; C<""> is returned for
	1289	those not available on your system.
	1290
	1291	The details for those items which may differ from what this emulation returns
	1292	and what a native C<nl_langinfo()> would return are:
	1293
	1294	=over
	1295
	1296	=item C<CODESET>
	1297
	1298	=item C<ERA>
	1299
	1300	Unimplemented, so returns C<"">.
	1301
	1302	=item C<YESEXPR>
	1303
	1304	=item C<YESSTR>
	1305
	1306	=item C<NOEXPR>
	1307
	1308	=item C<NOSTR>
	1309
	1310	Only the values for English are returned. C<YESSTR> and C<NOSTR> have been
	1311	removed from POSIX 2008, and are retained for backwards compatibility. Your
	1312	platform's C<nl_langinfo> may not support them.
	1313
	1314	=item C<D_FMT>
	1315
	1316	Always evaluates to C<%x>, the locale's appropriate date representation.
	1317
	1318	=item C<T_FMT>
	1319
	1320	Always evaluates to C<%X>, the locale's appropriate time representation.
	1321
	1322	=item C<D_T_FMT>
	1323
	1324	Always evaluates to C<%c>, the locale's appropriate date and time
	1325	representation.
	1326
	1327	=item C<CRNCYSTR>
	1328
	1329	The return may be incorrect for those rare locales where the currency symbol
	1330	replaces the radix character.
	1331	Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
	1332	to work differently.
	1333
	1334	=item C<ALT_DIGITS>
	1335
	1336	Currently this gives the same results as Linux does.
	1337	Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
	1338	to work differently.
	1339
	1340	=item C<ERA_D_FMT>
	1341
	1342	=item C<ERA_T_FMT>
	1343
	1344	=item C<ERA_D_T_FMT>
	1345
	1346	=item C<T_FMT_AMPM>
	1347
	1348	These are derived by using C<strftime()>, and not all versions of that function
	1349	know about them. C<""> is returned for these on such systems.
	1350
	1351	=back
	1352
	1353	When using C<Perl_langinfo> on systems that don't have a native
	1354	C<nl_langinfo()>, you must
	1355
	1356	#include "perl_langinfo.h"
	1357
	1358	before the C<perl.h> C<#include>. You can replace your C<langinfo.h>
	1359	C<#include> with this one. (Doing it this way keeps out the symbols that plain
	1360	C<langinfo.h> imports into the namespace for code that doesn't need it.)
	1361
	1362	You also should not use the bare C<langinfo.h> item names, but should preface
	1363	them with C<PERL_>, so use C<PERL_RADIXCHAR> instead of plain C<RADIXCHAR>.
	1364	The C<PERL_I<foo>> versions will also work for this function on systems that do
	1365	have a native C<nl_langinfo>.
	1366
	1367	=item *
	1368
	1369	It is thread-friendly, returning its result in a buffer that won't be
	1370	overwritten by another thread, so you don't have to code for that possibility.
	1371	The buffer can be overwritten by the next call to C<nl_langinfo> or
	1372	C<Perl_langinfo> in the same thread.
	1373
	1374	=item *
	1375
	1376	ª It returns S<C<const char *>>, whereas plain C<nl_langinfo()> returns S<C<char
	1377	*>>, but you are (only by documentation) forbidden to write into the buffer.
	1378	By declaring this C<const>, the compiler enforces this restriction. The extra
	1379	C<const> is why this isn't an unequivocal drop-in replacement for
	1380	C<nl_langinfo>.
	1381
	1382	=back
	1383
	1384	The original impetus for C<Perl_langinfo()> was so that code that needs to
	1385	find out the current currency symbol, floating point radix character, or digit
	1386	grouping separator can use, on all systems, the simpler and more
	1387	thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
	1388	pain to make thread-friendly. For other fields returned by C<localeconv>, it
	1389	is better to use the methods given in L<perlcall> to call
	1390	L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
	1391
	1392	=cut
	1393
	1394	*/
	1395
	1396	const char *
	1397	#ifdef HAS_NL_LANGINFO
	1398	Perl_langinfo(const nl_item item)
	1399	#else
	1400	Perl_langinfo(const int item)
	1401	#endif
	1402	{
	1403	return my_nl_langinfo(item, TRUE);
	1404	}
	1405
	1406	const char *
	1407	#ifdef HAS_NL_LANGINFO
	1408	S_my_nl_langinfo(const nl_item item, bool toggle)
	1409	#else
	1410	S_my_nl_langinfo(const int item, bool toggle)
	1411	#endif
	1412	{
	1413	dTHX;
	1414
	1415	#if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
	1416	#if ! defined(HAS_POSIX_2008_LOCALE)
	1417
	1418	/* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
	1419	* for those items dependent on it. This must be copied to a buffer before
	1420	* switching back, as some systems destroy the buffer when setlocale() is
	1421	* called */
	1422
	1423	LOCALE_LOCK;
	1424
	1425	if (toggle) {
	1426	if ( ! PL_numeric_underlying
	1427	&& (item == PERL_RADIXCHAR \|\| item == PERL_THOUSEP))
	1428	{
	1429	do_setlocale_c(LC_NUMERIC, PL_numeric_name);
	1430	}
	1431	else {
	1432	toggle = FALSE;
	1433	}
	1434	}
	1435
	1436	save_to_buffer(nl_langinfo(item), &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
	1437
	1438	if (toggle) {
	1439	do_setlocale_c(LC_NUMERIC, "C");
	1440	}
	1441
	1442	LOCALE_UNLOCK;
	1443
	1444	# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */
	1445
	1446	bool do_free = FALSE;
	1447	locale_t cur = uselocale((locale_t) 0);
	1448
	1449	if (cur == LC_GLOBAL_LOCALE) {
	1450	cur = duplocale(LC_GLOBAL_LOCALE);
	1451	do_free = TRUE;
	1452	}
	1453
	1454	if (toggle) {
	1455	cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
	1456	do_free = TRUE;
	1457	}
	1458
	1459	save_to_buffer(nl_langinfo_l(item, cur),
	1460	&PL_langinfo_buf, &PL_langinfo_bufsize, 0);
	1461	if (do_free) {
	1462	freelocale(cur);
	1463	}
	1464
	1465	# endif
	1466
	1467	if (strEQ(PL_langinfo_buf, "")) {
	1468	if (item == PERL_YESSTR) {
	1469	return "yes";
	1470	}
	1471	if (item == PERL_NOSTR) {
	1472	return "no";
	1473	}
	1474	}
	1475
	1476	return PL_langinfo_buf;
	1477
	1478	#else /* Below, emulate nl_langinfo as best we can */
	1479
	1480	{
	1481
	1482	# ifdef HAS_LOCALECONV
	1483
	1484	const struct lconv* lc;
	1485
	1486	# endif
	1487	# ifdef HAS_STRFTIME
	1488
	1489	struct tm tm;
	1490	bool return_format = FALSE; /* Return the %format, not the value */
	1491	const char * format;
	1492
	1493	# endif
	1494
	1495	/* We copy the results to a per-thread buffer, even if not
	1496	* multi-threaded. This is in part to simplify this code, and partly
	1497	* because we need a buffer anyway for strftime(), and partly because a
	1498	* call of localeconv() could otherwise wipe out the buffer, and the
	1499	* programmer would not be expecting this, as this is a nl_langinfo()
	1500	* substitute after all, so s/he might be thinking their localeconv()
	1501	* is safe until another localeconv() call. */
	1502
	1503	switch (item) {
	1504	Size_t len;
	1505	const char * retval;
	1506
	1507	/* These 2 are unimplemented */
	1508	case PERL_CODESET:
	1509	case PERL_ERA: /* For use with strftime() %E modifier */
	1510
	1511	default:
	1512	return "";
	1513
	1514	/* We use only an English set, since we don't know any more */
	1515	case PERL_YESEXPR: return "^[+1yY]";
	1516	case PERL_YESSTR: return "yes";
	1517	case PERL_NOEXPR: return "^[-0nN]";
	1518	case PERL_NOSTR: return "no";
	1519
	1520	# ifdef HAS_LOCALECONV
	1521
	1522	case PERL_CRNCYSTR:
	1523
	1524	LOCALE_LOCK;
	1525
	1526	/* We don't bother with localeconv_l() because any system that
	1527	* has it is likely to also have nl_langinfo() */
	1528
	1529	lc = localeconv();
	1530	if ( ! lc
	1531	\|\| ! lc->currency_symbol
	1532	\|\| strEQ("", lc->currency_symbol))
	1533	{
	1534	LOCALE_UNLOCK;
	1535	return "";
	1536	}
	1537
	1538	/* Leave the first spot empty to be filled in below */
	1539	save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
	1540	&PL_langinfo_bufsize, 1);
	1541	if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
	1542	{ /* khw couldn't figure out how the localedef specifications
	1543	would show that the $ should replace the radix; this is
	1544	just a guess as to how it might work.*/
	1545	*PL_langinfo_buf = '.';
	1546	}
	1547	else if (lc->p_cs_precedes) {
	1548	*PL_langinfo_buf = '-';
	1549	}
	1550	else {
	1551	*PL_langinfo_buf = '+';
	1552	}
	1553
	1554	LOCALE_UNLOCK;
	1555	break;
	1556
	1557	case PERL_RADIXCHAR:
	1558	case PERL_THOUSEP:
	1559
	1560	LOCALE_LOCK;
	1561
	1562	if (toggle) {
	1563	if (! PL_numeric_underlying) {
	1564	do_setlocale_c(LC_NUMERIC, PL_numeric_name);
	1565	}
	1566	else {
	1567	toggle = FALSE;
	1568	}
	1569	}
	1570
	1571	lc = localeconv();
	1572	if (! lc) {
	1573	retval = "";
	1574	}
	1575	else {
	1576	retval = (item == PERL_RADIXCHAR)
	1577	? lc->decimal_point
	1578	: lc->thousands_sep;
	1579	if (! retval) {
	1580	retval = "";
	1581	}
	1582	}
	1583
	1584	save_to_buffer(retval, &PL_langinfo_buf,
	1585	&PL_langinfo_bufsize, 0);
	1586
	1587	if (toggle) {
	1588	do_setlocale_c(LC_NUMERIC, "C");
	1589	}
	1590
	1591	LOCALE_UNLOCK;
	1592
	1593	break;
	1594
	1595	# endif
	1596	# ifdef HAS_STRFTIME
	1597
	1598	/* These are defined by C89, so we assume that strftime supports
	1599	* them, and so are returned unconditionally; they may not be what
	1600	* the locale actually says, but should give good enough results
	1601	* for someone using them as formats (as opposed to trying to parse
	1602	* them to figure out what the locale says). The other format
	1603	* items are actually tested to verify they work on the platform */
	1604	case PERL_D_FMT: return "%x";
	1605	case PERL_T_FMT: return "%X";
	1606	case PERL_D_T_FMT: return "%c";
	1607
	1608	/* These formats are only available in later strfmtime's */
	1609	case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT:
	1610	case PERL_T_FMT_AMPM:
	1611
	1612	/* The rest can be gotten from most versions of strftime(). */
	1613	case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3:
	1614	case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6:
	1615	case PERL_ABDAY_7:
	1616	case PERL_ALT_DIGITS:
	1617	case PERL_AM_STR: case PERL_PM_STR:
	1618	case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3:
	1619	case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6:
	1620	case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9:
	1621	case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12:
	1622	case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4:
	1623	case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7:
	1624	case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4:
	1625	case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8:
	1626	case PERL_MON_9: case PERL_MON_10: case PERL_MON_11:
	1627	case PERL_MON_12:
	1628
	1629	LOCALE_LOCK;
	1630
	1631	init_tm(&tm); /* Precaution against core dumps */
	1632	tm.tm_sec = 30;
	1633	tm.tm_min = 30;
	1634	tm.tm_hour = 6;
	1635	tm.tm_year = 2017 - 1900;
	1636	tm.tm_wday = 0;
	1637	tm.tm_mon = 0;
	1638	switch (item) {
	1639	default:
	1640	LOCALE_UNLOCK;
	1641	Perl_croak(aTHX_
	1642	"panic: %s: %d: switch case: %d problem",
	1643	__FILE__, __LINE__, item);
	1644	NOT_REACHED; /* NOTREACHED */
	1645
	1646	case PERL_PM_STR: tm.tm_hour = 18;
	1647	case PERL_AM_STR:
	1648	format = "%p";
	1649	break;
	1650
	1651	case PERL_ABDAY_7: tm.tm_wday++;
	1652	case PERL_ABDAY_6: tm.tm_wday++;
	1653	case PERL_ABDAY_5: tm.tm_wday++;
	1654	case PERL_ABDAY_4: tm.tm_wday++;
	1655	case PERL_ABDAY_3: tm.tm_wday++;
	1656	case PERL_ABDAY_2: tm.tm_wday++;
	1657	case PERL_ABDAY_1:
	1658	format = "%a";
	1659	break;
	1660
	1661	case PERL_DAY_7: tm.tm_wday++;
	1662	case PERL_DAY_6: tm.tm_wday++;
	1663	case PERL_DAY_5: tm.tm_wday++;
	1664	case PERL_DAY_4: tm.tm_wday++;
	1665	case PERL_DAY_3: tm.tm_wday++;
	1666	case PERL_DAY_2: tm.tm_wday++;
	1667	case PERL_DAY_1:
	1668	format = "%A";
	1669	break;
	1670
	1671	case PERL_ABMON_12: tm.tm_mon++;
	1672	case PERL_ABMON_11: tm.tm_mon++;
	1673	case PERL_ABMON_10: tm.tm_mon++;
	1674	case PERL_ABMON_9: tm.tm_mon++;
	1675	case PERL_ABMON_8: tm.tm_mon++;
	1676	case PERL_ABMON_7: tm.tm_mon++;
	1677	case PERL_ABMON_6: tm.tm_mon++;
	1678	case PERL_ABMON_5: tm.tm_mon++;
	1679	case PERL_ABMON_4: tm.tm_mon++;
	1680	case PERL_ABMON_3: tm.tm_mon++;
	1681	case PERL_ABMON_2: tm.tm_mon++;
	1682	case PERL_ABMON_1:
	1683	format = "%b";
	1684	break;
	1685
	1686	case PERL_MON_12: tm.tm_mon++;
	1687	case PERL_MON_11: tm.tm_mon++;
	1688	case PERL_MON_10: tm.tm_mon++;
	1689	case PERL_MON_9: tm.tm_mon++;
	1690	case PERL_MON_8: tm.tm_mon++;
	1691	case PERL_MON_7: tm.tm_mon++;
	1692	case PERL_MON_6: tm.tm_mon++;
	1693	case PERL_MON_5: tm.tm_mon++;
	1694	case PERL_MON_4: tm.tm_mon++;
	1695	case PERL_MON_3: tm.tm_mon++;
	1696	case PERL_MON_2: tm.tm_mon++;
	1697	case PERL_MON_1:
	1698	format = "%B";
	1699	break;
	1700
	1701	case PERL_T_FMT_AMPM:
	1702	format = "%r";
	1703	return_format = TRUE;
	1704	break;
	1705
	1706	case PERL_ERA_D_FMT:
	1707	format = "%Ex";
	1708	return_format = TRUE;
	1709	break;
	1710
	1711	case PERL_ERA_T_FMT:
	1712	format = "%EX";
	1713	return_format = TRUE;
	1714	break;
	1715
	1716	case PERL_ERA_D_T_FMT:
	1717	format = "%Ec";
	1718	return_format = TRUE;
	1719	break;
	1720
	1721	case PERL_ALT_DIGITS:
	1722	tm.tm_wday = 0;
	1723	format = "%Ow"; /* Find the alternate digit for 0 */
	1724	break;
	1725	}
	1726
	1727	/* We can't use my_strftime() because it doesn't look at
	1728	* tm_wday */
	1729	while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
	1730	format, &tm))
	1731	{
	1732	/* A zero return means one of:
	1733	* a) there wasn't enough space in PL_langinfo_buf
	1734	* b) the format, like a plain %p, returns empty
	1735	* c) it was an illegal format, though some
	1736	* implementations of strftime will just return the
	1737	* illegal format as a plain character sequence.
	1738	*
	1739	* To quickly test for case 'b)', try again but precede
	1740	* the format with a plain character. If that result is
	1741	* still empty, the problem is either 'a)' or 'c)' */
	1742
	1743	Size_t format_size = strlen(format) + 1;
	1744	Size_t mod_size = format_size + 1;
	1745	char * mod_format;
	1746	char * temp_result;
	1747
	1748	Newx(mod_format, mod_size, char);
	1749	Newx(temp_result, PL_langinfo_bufsize, char);
	1750	*mod_format = '\a';
	1751	my_strlcpy(mod_format + 1, format, mod_size);
	1752	len = strftime(temp_result,
	1753	PL_langinfo_bufsize,
	1754	mod_format, &tm);
	1755	Safefree(mod_format);
	1756	Safefree(temp_result);
	1757
	1758	/* If 'len' is non-zero, it means that we had a case like
	1759	* %p which means the current locale doesn't use a.m. or
	1760	* p.m., and that is valid */
	1761	if (len == 0) {
	1762
	1763	/* Here, still didn't work. If we get well beyond a
	1764	* reasonable size, bail out to prevent an infinite
	1765	* loop. */
	1766
	1767	if (PL_langinfo_bufsize > 100 * format_size) {
	1768	*PL_langinfo_buf = '\0';
	1769	}
	1770	else {
	1771	/* Double the buffer size to retry; Add 1 in case
	1772	* original was 0, so we aren't stuck at 0. */
	1773	PL_langinfo_bufsize *= 2;
	1774	PL_langinfo_bufsize++;
	1775	Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
	1776	continue;
	1777	}
	1778	}
	1779
	1780	break;
	1781	}
	1782
	1783	/* Here, we got a result.
	1784	*
	1785	* If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
	1786	* alternate format for wday 0. If the value is the same as
	1787	* the normal 0, there isn't an alternate, so clear the buffer.
	1788	* */
	1789	if ( item == PERL_ALT_DIGITS
	1790	&& strEQ(PL_langinfo_buf, "0"))
	1791	{
	1792	*PL_langinfo_buf = '\0';
	1793	}
	1794
	1795	/* ALT_DIGITS is problematic. Experiments on it showed that
	1796	* strftime() did not always work properly when going from
	1797	* alt-9 to alt-10. Only a few locales have this item defined,
	1798	* and in all of them on Linux that khw was able to find,
	1799	* nl_langinfo() merely returned the alt-0 character, possibly
	1800	* doubled. Most Unicode digits are in blocks of 10
	1801	* consecutive code points, so that is sufficient information
	1802	* for those scripts, as we can infer alt-1, alt-2, .... But
	1803	* for a Japanese locale, a CJK ideographic 0 is returned, and
	1804	* the CJK digits are not in code point order, so you can't
	1805	* really infer anything. The localedef for this locale did
	1806	* specify the succeeding digits, so that strftime() works
	1807	* properly on them, without needing to infer anything. But
	1808	* the nl_langinfo() return did not give sufficient information
	1809	* for the caller to understand what's going on. So until
	1810	* there is evidence that it should work differently, this
	1811	* returns the alt-0 string for ALT_DIGITS.
	1812	*
	1813	* wday was chosen because its range is all a single digit.
	1814	* Things like tm_sec have two digits as the minimum: '00' */
	1815
	1816	LOCALE_UNLOCK;
	1817
	1818	/* If to return the format, not the value, overwrite the buffer
	1819	* with it. But some strftime()s will keep the original format
	1820	* if illegal, so change those to "" */
	1821	if (return_format) {
	1822	if (strEQ(PL_langinfo_buf, format)) {
	1823	*PL_langinfo_buf = '\0';
	1824	}
	1825	else {
	1826	save_to_buffer(format, &PL_langinfo_buf,
	1827	&PL_langinfo_bufsize, 0);
	1828	}
	1829	}
	1830
	1831	break;
	1832
	1833	# endif
	1834
	1835	}
	1836	}
	1837
	1838	return PL_langinfo_buf;
	1839
	1840	#endif
	1841
	1842	}
	1843
	1844	/*
	1845	* Initialize locale awareness.
	1846	*/
	1847	int
	1848	Perl_init_i18nl10n(pTHX_ int printwarn)
	1849	{
	1850	/* printwarn is
	1851	*
	1852	* 0 if not to output warning when setup locale is bad
	1853	* 1 if to output warning based on value of PERL_BADLANG
	1854	* >1 if to output regardless of PERL_BADLANG
	1855	*
	1856	* returns
	1857	* 1 = set ok or not applicable,
	1858	* 0 = fallback to a locale of lower priority
	1859	* -1 = fallback to all locales failed, not even to the C locale
	1860	*
	1861	* Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
	1862	* set, debugging information is output.
	1863	*
	1864	* This looks more complicated than it is, mainly due to the #ifdefs.
	1865	*
	1866	* We try to set LC_ALL to the value determined by the environment. If
	1867	* there is no LC_ALL on this platform, we try the individual categories we
	1868	* know about. If this works, we are done.
	1869	*
	1870	* But if it doesn't work, we have to do something else. We search the
	1871	* environment variables ourselves instead of relying on the system to do
	1872	* it. We look at, in order, LC_ALL, LANG, a system default locale (if we
	1873	* think there is one), and the ultimate fallback "C". This is all done in
	1874	* the same loop as above to avoid duplicating code, but it makes things
	1875	* more complex. The 'trial_locales' array is initialized with just one
	1876	* element; it causes the behavior described in the paragraph above this to
	1877	* happen. If that fails, we add elements to 'trial_locales', and do extra
	1878	* loop iterations to cause the behavior described in this paragraph.
	1879	*
	1880	* On Ultrix, the locale MUST come from the environment, so there is
	1881	* preliminary code to set it. I (khw) am not sure that it is necessary,
	1882	* and that this couldn't be folded into the loop, but barring any real
	1883	* platforms to test on, it's staying as-is
	1884	*
	1885	* A slight complication is that in embedded Perls, the locale may already
	1886	* be set-up, and we don't want to get it from the normal environment
	1887	* variables. This is handled by having a special environment variable
	1888	* indicate we're in this situation. We simply set setlocale's 2nd
	1889	* parameter to be a NULL instead of "". That indicates to setlocale that
	1890	* it is not to change anything, but to return the current value,
	1891	* effectively initializing perl's db to what the locale already is.
	1892	*
	1893	* We play the same trick with NULL if a LC_ALL succeeds. We call
	1894	* setlocale() on the individual categores with NULL to get their existing
	1895	* values for our db, instead of trying to change them.
	1896	* */
	1897
	1898	int ok = 1;
	1899
	1900	#ifndef USE_LOCALE
	1901
	1902	PERL_UNUSED_ARG(printwarn);
	1903
	1904	#else /* USE_LOCALE */
	1905	# ifdef __GLIBC__
	1906
	1907	const char * const language = savepv(PerlEnv_getenv("LANGUAGE"));
	1908
	1909	# endif
	1910
	1911	/* NULL uses the existing already set up locale */
	1912	const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
	1913	? NULL
	1914	: "";
	1915	const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */
	1916	unsigned int trial_locales_count;
	1917	const char * const lc_all = savepv(PerlEnv_getenv("LC_ALL"));
	1918	const char * const lang = savepv(PerlEnv_getenv("LANG"));
	1919	bool setlocale_failure = FALSE;
	1920	unsigned int i;
	1921
	1922	/* A later getenv() could zap this, so only use here */
	1923	const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
	1924
	1925	const bool locwarn = (printwarn > 1
	1926	\|\| ( printwarn
	1927	&& ( ! bad_lang_use_once
	1928	\|\| (
	1929	/* disallow with "" or "0" */
	1930	*bad_lang_use_once
	1931	&& strNE("0", bad_lang_use_once)))));
	1932
	1933	/* setlocale() return vals; not copied so must be looked at immediately */
	1934	const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
	1935
	1936	/* current locale for given category; should have been copied so aren't
	1937	* volatile */
	1938	const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
	1939
	1940	# ifdef WIN32
	1941
	1942	/* In some systems you can find out the system default locale
	1943	* and use that as the fallback locale. */
	1944	# define SYSTEM_DEFAULT_LOCALE
	1945	# endif
	1946	# ifdef SYSTEM_DEFAULT_LOCALE
	1947
	1948	const char *system_default_locale = NULL;
	1949
	1950	# endif
	1951
	1952	# ifndef DEBUGGING
	1953	# define DEBUG_LOCALE_INIT(a,b,c)
	1954	# else
	1955
	1956	DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
	1957
	1958	# define DEBUG_LOCALE_INIT(category, locale, result) \
	1959	STMT_START { \
	1960	if (debug_initialization) { \
	1961	PerlIO_printf(Perl_debug_log, \
	1962	"%s:%d: %s\n", \
	1963	__FILE__, __LINE__, \
	1964	setlocale_debug_string(category, \
	1965	locale, \
	1966	result)); \
	1967	} \
	1968	} STMT_END
	1969
	1970	/* Make sure the parallel arrays are properly set up */
	1971	# ifdef USE_LOCALE_NUMERIC
	1972	assert(categories[LC_NUMERIC_INDEX] == LC_NUMERIC);
	1973	assert(strEQ(category_names[LC_NUMERIC_INDEX], "LC_NUMERIC"));
	1974	# endif
	1975	# ifdef USE_LOCALE_CTYPE
	1976	assert(categories[LC_CTYPE_INDEX] == LC_CTYPE);
	1977	assert(strEQ(category_names[LC_CTYPE_INDEX], "LC_CTYPE"));
	1978	# endif
	1979	# ifdef USE_LOCALE_COLLATE
	1980	assert(categories[LC_COLLATE_INDEX] == LC_COLLATE);
	1981	assert(strEQ(category_names[LC_COLLATE_INDEX], "LC_COLLATE"));
	1982	# endif
	1983	# ifdef USE_LOCALE_TIME
	1984	assert(categories[LC_TIME_INDEX] == LC_TIME);
	1985	assert(strEQ(category_names[LC_TIME_INDEX], "LC_TIME"));
	1986	# endif
	1987	# ifdef USE_LOCALE_MESSAGES
	1988	assert(categories[LC_MESSAGES_INDEX] == LC_MESSAGES);
	1989	assert(strEQ(category_names[LC_MESSAGES_INDEX], "LC_MESSAGES"));
	1990	# endif
	1991	# ifdef USE_LOCALE_MONETARY
	1992	assert(categories[LC_MONETARY_INDEX] == LC_MONETARY);
	1993	assert(strEQ(category_names[LC_MONETARY_INDEX], "LC_MONETARY"));
	1994	# endif
	1995	# ifdef USE_LOCALE_ADDRESS
	1996	assert(categories[LC_ADDRESS_INDEX] == LC_ADDRESS);
	1997	assert(strEQ(category_names[LC_ADDRESS_INDEX], "LC_ADDRESS"));
	1998	# endif
	1999	# ifdef USE_LOCALE_IDENTIFICATION
	2000	assert(categories[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION);
	2001	assert(strEQ(category_names[LC_IDENTIFICATION_INDEX], "LC_IDENTIFICATION"));
	2002	# endif
	2003	# ifdef USE_LOCALE_MEASUREMENT
	2004	assert(categories[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT);
	2005	assert(strEQ(category_names[LC_MEASUREMENT_INDEX], "LC_MEASUREMENT"));
	2006	# endif
	2007	# ifdef USE_LOCALE_PAPER
	2008	assert(categories[LC_PAPER_INDEX] == LC_PAPER);
	2009	assert(strEQ(category_names[LC_PAPER_INDEX], "LC_PAPER"));
	2010	# endif
	2011	# ifdef USE_LOCALE_TELEPHONE
	2012	assert(categories[LC_TELEPHONE_INDEX] == LC_TELEPHONE);
	2013	assert(strEQ(category_names[LC_TELEPHONE_INDEX], "LC_TELEPHONE"));
	2014	# endif
	2015	# ifdef LC_ALL
	2016	assert(categories[LC_ALL_INDEX] == LC_ALL);
	2017	assert(strEQ(category_names[LC_ALL_INDEX], "LC_ALL"));
	2018	assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX);
	2019	# endif
	2020	# endif /* DEBUGGING */
	2021	# ifdef LOCALE_ENVIRON_REQUIRED
	2022
	2023	/*
	2024	* Ultrix setlocale(..., "") fails if there are no environment
	2025	* variables from which to get a locale name.
	2026	*/
	2027
	2028	# ifndef LC_ALL
	2029	# error Ultrix without LC_ALL not implemented
	2030	# else
	2031
	2032	{
	2033	bool done = FALSE;
	2034	if (lang) {
	2035	sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, setlocale_init);
	2036	DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result[LC_ALL_INDEX]);
	2037	if (sl_result[LC_ALL_INDEX])
	2038	done = TRUE;
	2039	else
	2040	setlocale_failure = TRUE;
	2041	}
	2042	if (! setlocale_failure) {
	2043	const char * locale_param;
	2044	for (i = 0; i < LC_ALL_INDEX; i++) {
	2045	locale_param = (! done && (lang \|\| PerlEnv_getenv(category_names[i])))
	2046	? setlocale_init
	2047	: NULL;
	2048	sl_result[i] = do_setlocale_r(categories[i], locale_param);
	2049	if (! sl_result[i]) {
	2050	setlocale_failure = TRUE;
	2051	}
	2052	DEBUG_LOCALE_INIT(categories[i], locale_param, sl_result[i]);
	2053	}
	2054	}
	2055	}
	2056
	2057	# endif /* LC_ALL */
	2058	# endif /* LOCALE_ENVIRON_REQUIRED */
	2059
	2060	/* We try each locale in the list until we get one that works, or exhaust
	2061	* the list. Normally the loop is executed just once. But if setting the
	2062	* locale fails, inside the loop we add fallback trials to the array and so
	2063	* will execute the loop multiple times */
	2064	trial_locales[0] = setlocale_init;
	2065	trial_locales_count = 1;
	2066
	2067	for (i= 0; i < trial_locales_count; i++) {
	2068	const char * trial_locale = trial_locales[i];
	2069
	2070	if (i > 0) {
	2071
	2072	/* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
	2073	* when i==0, but I (khw) don't think that behavior makes much
	2074	* sense */
	2075	setlocale_failure = FALSE;
	2076
	2077	# ifdef SYSTEM_DEFAULT_LOCALE
	2078	# ifdef WIN32 /* Note that assumes Win32 has LC_ALL */
	2079
	2080	/* On Windows machines, an entry of "" after the 0th means to use
	2081	* the system default locale, which we now proceed to get. */
	2082	if (strEQ(trial_locale, "")) {
	2083	unsigned int j;
	2084
	2085	/* Note that this may change the locale, but we are going to do
	2086	* that anyway just below */
	2087	system_default_locale = do_setlocale_c(LC_ALL, "");
	2088	DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
	2089
	2090	/* Skip if invalid or if it's already on the list of locales to
	2091	* try */
	2092	if (! system_default_locale) {
	2093	goto next_iteration;
	2094	}
	2095	for (j = 0; j < trial_locales_count; j++) {
	2096	if (strEQ(system_default_locale, trial_locales[j])) {
	2097	goto next_iteration;
	2098	}
	2099	}
	2100
	2101	trial_locale = system_default_locale;
	2102	}
	2103	# else
	2104	# error SYSTEM_DEFAULT_LOCALE only implemented for Win32
	2105	# endif
	2106	# endif /* SYSTEM_DEFAULT_LOCALE */
	2107
	2108	} /* For i > 0 */
	2109
	2110	# ifdef LC_ALL
	2111
	2112	sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, trial_locale);
	2113	DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result[LC_ALL_INDEX]);
	2114	if (! sl_result[LC_ALL_INDEX]) {
	2115	setlocale_failure = TRUE;
	2116	}
	2117	else {
	2118	/* Since LC_ALL succeeded, it should have changed all the other
	2119	* categories it can to its value; so we massage things so that the
	2120	* setlocales below just return their category's current values.
	2121	* This adequately handles the case in NetBSD where LC_COLLATE may
	2122	* not be defined for a locale, and setting it individually will
	2123	* fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
	2124	* the POSIX locale. */
	2125	trial_locale = NULL;
	2126	}
	2127
	2128	# endif /* LC_ALL */
	2129
	2130	if (! setlocale_failure) {
	2131	unsigned int j;
	2132	for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
	2133	curlocales[j]
	2134	= savepv(do_setlocale_r(categories[j], trial_locale));
	2135	if (! curlocales[j]) {
	2136	setlocale_failure = TRUE;
	2137	}
	2138	DEBUG_LOCALE_INIT(categories[j], trial_locale, curlocales[j]);
	2139	}
	2140
	2141	if (! setlocale_failure) { /* All succeeded */
	2142	break; /* Exit trial_locales loop */
	2143	}
	2144	}
	2145
	2146	/* Here, something failed; will need to try a fallback. */
	2147	ok = 0;
	2148
	2149	if (i == 0) {
	2150	unsigned int j;
	2151
	2152	if (locwarn) { /* Output failure info only on the first one */
	2153
	2154	# ifdef LC_ALL
	2155
	2156	PerlIO_printf(Perl_error_log,
	2157	"perl: warning: Setting locale failed.\n");
	2158
	2159	# else /* !LC_ALL */
	2160
	2161	PerlIO_printf(Perl_error_log,
	2162	"perl: warning: Setting locale failed for the categories:\n\t");
	2163
	2164	for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
	2165	if (! curlocales[j]) {
	2166	PerlIO_printf(Perl_error_log, category_names[j]);
	2167	}
	2168	else {
	2169	Safefree(curlocales[j]);
	2170	}
	2171	}
	2172
	2173	# endif /* LC_ALL */
	2174
	2175	PerlIO_printf(Perl_error_log,
	2176	"perl: warning: Please check that your locale settings:\n");
	2177
	2178	# ifdef __GLIBC__
	2179
	2180	PerlIO_printf(Perl_error_log,
	2181	"\tLANGUAGE = %c%s%c,\n",
	2182	language ? '"' : '(',
	2183	language ? language : "unset",
	2184	language ? '"' : ')');
	2185	# endif
	2186
	2187	PerlIO_printf(Perl_error_log,
	2188	"\tLC_ALL = %c%s%c,\n",
	2189	lc_all ? '"' : '(',
	2190	lc_all ? lc_all : "unset",
	2191	lc_all ? '"' : ')');
	2192
	2193	# if defined(USE_ENVIRON_ARRAY)
	2194
	2195	{
	2196	char **e;
	2197
	2198	/* Look through the environment for any variables of the
	2199	* form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
	2200	* already handled above. These are assumed to be locale
	2201	* settings. Output them and their values. */
	2202	for (e = environ; *e; e++) {
	2203	const STRLEN prefix_len = sizeof("LC_") - 1;
	2204	STRLEN uppers_len;
	2205
	2206	if ( strBEGINs(*e, "LC_")
	2207	&& ! strBEGINs(*e, "LC_ALL=")
	2208	&& (uppers_len = strspn(*e + prefix_len,
	2209	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
	2210	&& ((*e)[prefix_len + uppers_len] == '='))
	2211	{
	2212	PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
	2213	(int) (prefix_len + uppers_len), *e,
	2214	*e + prefix_len + uppers_len + 1);
	2215	}
	2216	}
	2217	}
	2218
	2219	# else
	2220
	2221	PerlIO_printf(Perl_error_log,
	2222	"\t(possibly more locale environment variables)\n");
	2223
	2224	# endif
	2225
	2226	PerlIO_printf(Perl_error_log,
	2227	"\tLANG = %c%s%c\n",
	2228	lang ? '"' : '(',
	2229	lang ? lang : "unset",
	2230	lang ? '"' : ')');
	2231
	2232	PerlIO_printf(Perl_error_log,
	2233	" are supported and installed on your system.\n");
	2234	}
	2235
	2236	/* Calculate what fallback locales to try. We have avoided this
	2237	* until we have to, because failure is quite unlikely. This will
	2238	* usually change the upper bound of the loop we are in.
	2239	*
	2240	* Since the system's default way of setting the locale has not
	2241	* found one that works, we use Perl's defined ordering: LC_ALL,
	2242	* LANG, and the C locale. We don't try the same locale twice, so
	2243	* don't add to the list if already there. (On POSIX systems, the
	2244	* LC_ALL element will likely be a repeat of the 0th element "",
	2245	* but there's no harm done by doing it explicitly.)
	2246	*
	2247	* Note that this tries the LC_ALL environment variable even on
	2248	* systems which have no LC_ALL locale setting. This may or may
	2249	* not have been originally intentional, but there's no real need
	2250	* to change the behavior. */
	2251	if (lc_all) {
	2252	for (j = 0; j < trial_locales_count; j++) {
	2253	if (strEQ(lc_all, trial_locales[j])) {
	2254	goto done_lc_all;
	2255	}
	2256	}
	2257	trial_locales[trial_locales_count++] = lc_all;
	2258	}
	2259	done_lc_all:
	2260
	2261	if (lang) {
	2262	for (j = 0; j < trial_locales_count; j++) {
	2263	if (strEQ(lang, trial_locales[j])) {
	2264	goto done_lang;
	2265	}
	2266	}
	2267	trial_locales[trial_locales_count++] = lang;
	2268	}
	2269	done_lang:
	2270
	2271	# if defined(WIN32) && defined(LC_ALL)
	2272
	2273	/* For Windows, we also try the system default locale before "C".
	2274	* (If there exists a Windows without LC_ALL we skip this because
	2275	* it gets too complicated. For those, the "C" is the next
	2276	* fallback possibility). The "" is the same as the 0th element of
	2277	* the array, but the code at the loop above knows to treat it
	2278	* differently when not the 0th */
	2279	trial_locales[trial_locales_count++] = "";
	2280
	2281	# endif
	2282
	2283	for (j = 0; j < trial_locales_count; j++) {
	2284	if (strEQ("C", trial_locales[j])) {
	2285	goto done_C;
	2286	}
	2287	}
	2288	trial_locales[trial_locales_count++] = "C";
	2289
	2290	done_C: ;
	2291	} /* end of first time through the loop */
	2292
	2293	# ifdef WIN32
	2294
	2295	next_iteration: ;
	2296
	2297	# endif
	2298
	2299	} /* end of looping through the trial locales */
	2300
	2301	if (ok < 1) { /* If we tried to fallback */
	2302	const char* msg;
	2303	if (! setlocale_failure) { /* fallback succeeded */
	2304	msg = "Falling back to";
	2305	}
	2306	else { /* fallback failed */
	2307	unsigned int j;
	2308
	2309	/* We dropped off the end of the loop, so have to decrement i to
	2310	* get back to the value the last time through */
	2311	i--;
	2312
	2313	ok = -1;
	2314	msg = "Failed to fall back to";
	2315
	2316	/* To continue, we should use whatever values we've got */
	2317
	2318	for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
	2319	Safefree(curlocales[j]);
	2320	curlocales[j] = savepv(do_setlocale_r(categories[j], NULL));
	2321	DEBUG_LOCALE_INIT(categories[j], NULL, curlocales[j]);
	2322	}
	2323	}
	2324
	2325	if (locwarn) {
	2326	const char * description;
	2327	const char * name = "";
	2328	if (strEQ(trial_locales[i], "C")) {
	2329	description = "the standard locale";
	2330	name = "C";
	2331	}
	2332
	2333	# ifdef SYSTEM_DEFAULT_LOCALE
	2334
	2335	else if (strEQ(trial_locales[i], "")) {
	2336	description = "the system default locale";
	2337	if (system_default_locale) {
	2338	name = system_default_locale;
	2339	}
	2340	}
	2341
	2342	# endif /* SYSTEM_DEFAULT_LOCALE */
	2343
	2344	else {
	2345	description = "a fallback locale";
	2346	name = trial_locales[i];
	2347	}
	2348	if (name && strNE(name, "")) {
	2349	PerlIO_printf(Perl_error_log,
	2350	"perl: warning: %s %s (\"%s\").\n", msg, description, name);
	2351	}
	2352	else {
	2353	PerlIO_printf(Perl_error_log,
	2354	"perl: warning: %s %s.\n", msg, description);
	2355	}
	2356	}
	2357	} /* End of tried to fallback */
	2358
	2359	/* Done with finding the locales; update our records */
	2360
	2361	# ifdef USE_LOCALE_CTYPE
	2362
	2363	new_ctype(curlocales[LC_CTYPE_INDEX]);
	2364
	2365	# endif
	2366	# ifdef USE_LOCALE_COLLATE
	2367
	2368	new_collate(curlocales[LC_COLLATE_INDEX]);
	2369
	2370	# endif
	2371	# ifdef USE_LOCALE_NUMERIC
	2372
	2373	new_numeric(curlocales[LC_NUMERIC_INDEX]);
	2374
	2375	# endif
	2376
	2377
	2378	for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
	2379	Safefree(curlocales[i]);
	2380	}
	2381
	2382	# if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
	2383
	2384	/* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
	2385	* locale is UTF-8. If PL_utf8locale and PL_unicode (set by -C or by
	2386	* $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the
	2387	* PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open
	2388	* discipline. */
	2389	PL_utf8locale = _is_cur_LC_category_utf8(LC_CTYPE);
	2390
	2391	/* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
	2392	This is an alternative to using the -C command line switch
	2393	(the -C if present will override this). */
	2394	{
	2395	const char *p = PerlEnv_getenv("PERL_UNICODE");
	2396	PL_unicode = p ? parse_unicode_opts(&p) : 0;
	2397	if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
	2398	PL_utf8cache = -1;
	2399	}
	2400
	2401	# endif
	2402	# ifdef __GLIBC__
	2403
	2404	Safefree(language);
	2405
	2406	# endif
	2407
	2408	Safefree(lc_all);
	2409	Safefree(lang);
	2410
	2411	#endif /* USE_LOCALE */
	2412	#ifdef DEBUGGING
	2413
	2414	/* So won't continue to output stuff */
	2415	DEBUG_INITIALIZATION_set(FALSE);
	2416
	2417	#endif
	2418
	2419	return ok;
	2420	}
	2421
	2422	#ifdef USE_LOCALE_COLLATE
	2423
	2424	char *
	2425	Perl__mem_collxfrm(pTHX_ const char *input_string,
	2426	STRLEN len, /* Length of 'input_string' */
	2427	STRLEN xlen, / Set to length of returned string
	2428	(not including the collation index
	2429	prefix) */
	2430	bool utf8 /* Is the input in UTF-8? */
	2431	)
	2432	{
	2433
	2434	/* _mem_collxfrm() is a bit like strxfrm() but with two important
	2435	* differences. First, it handles embedded NULs. Second, it allocates a bit
	2436	* more memory than needed for the transformed data itself. The real
	2437	* transformed data begins at offset COLLXFRM_HDR_LEN. *xlen is set to
	2438	* the length of that, and doesn't include the collation index size.
	2439	* Please see sv_collxfrm() to see how this is used. */
	2440
	2441	#define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
	2442
	2443	char * s = (char *) input_string;
	2444	STRLEN s_strlen = strlen(input_string);
	2445	char *xbuf = NULL;
	2446	STRLEN xAlloc; /* xalloc is a reserved word in VC */
	2447	STRLEN length_in_chars;
	2448	bool first_time = TRUE; /* Cleared after first loop iteration */
	2449
	2450	PERL_ARGS_ASSERT__MEM_COLLXFRM;
	2451
	2452	/* Must be NUL-terminated */
	2453	assert(*(input_string + len) == '\0');
	2454
	2455	/* If this locale has defective collation, skip */
	2456	if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
	2457	DEBUG_L(PerlIO_printf(Perl_debug_log,
	2458	"_mem_collxfrm: locale's collation is defective\n"));
	2459	goto bad;
	2460	}
	2461
	2462	/* Replace any embedded NULs with the control that sorts before any others.
	2463	* This will give as good as possible results on strings that don't
	2464	* otherwise contain that character, but otherwise there may be
	2465	* less-than-perfect results with that character and NUL. This is
	2466	* unavoidable unless we replace strxfrm with our own implementation. */
	2467	if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
	2468	NUL */
	2469	char * e = s + len;
	2470	char * sans_nuls;
	2471	STRLEN sans_nuls_len;
	2472	int try_non_controls;
	2473	char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
	2474	making sure 2nd byte is NUL.
	2475	*/
	2476	STRLEN this_replacement_len;
	2477
	2478	/* If we don't know what non-NUL control character sorts lowest for
	2479	* this locale, find it */
	2480	if (PL_strxfrm_NUL_replacement == '\0') {
	2481	int j;
	2482	char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
	2483	includes the collation index
	2484	prefixed. */
	2485
	2486	DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
	2487
	2488	/* Unlikely, but it may be that no control will work to replace
	2489	* NUL, in which case we instead look for any character. Controls
	2490	* are preferred because collation order is, in general, context
	2491	* sensitive, with adjoining characters affecting the order, and
	2492	* controls are less likely to have such interactions, allowing the
	2493	* NUL-replacement to stand on its own. (Another way to look at it
	2494	* is to imagine what would happen if the NUL were replaced by a
	2495	* combining character; it wouldn't work out all that well.) */
	2496	for (try_non_controls = 0;
	2497	try_non_controls < 2;
	2498	try_non_controls++)
	2499	{
	2500	/* Look through all legal code points (NUL isn't) */
	2501	for (j = 1; j < 256; j++) {
	2502	char * x; /* j's xfrm plus collation index */
	2503	STRLEN x_len; /* length of 'x' */
	2504	STRLEN trial_len = 1;
	2505	char cur_source[] = { '\0', '\0' };
	2506
	2507	/* Skip non-controls the first time through the loop. The
	2508	* controls in a UTF-8 locale are the L1 ones */
	2509	if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
	2510	? ! isCNTRL_L1(j)
	2511	: ! isCNTRL_LC(j))
	2512	{
	2513	continue;
	2514	}
	2515
	2516	/* Create a 1-char string of the current code point */
	2517	cur_source[0] = (char) j;
	2518
	2519	/* Then transform it */
	2520	x = _mem_collxfrm(cur_source, trial_len, &x_len,
	2521	0 /* The string is not in UTF-8 */);
	2522
	2523	/* Ignore any character that didn't successfully transform.
	2524	* */
	2525	if (! x) {
	2526	continue;
	2527	}
	2528
	2529	/* If this character's transformation is lower than
	2530	* the current lowest, this one becomes the lowest */
	2531	if ( cur_min_x == NULL
	2532	\|\| strLT(x + COLLXFRM_HDR_LEN,
	2533	cur_min_x + COLLXFRM_HDR_LEN))
	2534	{
	2535	PL_strxfrm_NUL_replacement = j;
	2536	cur_min_x = x;
	2537	}
	2538	else {
	2539	Safefree(x);
	2540	}
	2541	} /* end of loop through all 255 characters */
	2542
	2543	/* Stop looking if found */
	2544	if (cur_min_x) {
	2545	break;
	2546	}
	2547
	2548	/* Unlikely, but possible, if there aren't any controls that
	2549	* work in the locale, repeat the loop, looking for any
	2550	* character that works */
	2551	DEBUG_L(PerlIO_printf(Perl_debug_log,
	2552	"_mem_collxfrm: No control worked. Trying non-controls\n"));
	2553	} /* End of loop to try first the controls, then any char */
	2554
	2555	if (! cur_min_x) {
	2556	DEBUG_L(PerlIO_printf(Perl_debug_log,
	2557	"_mem_collxfrm: Couldn't find any character to replace"
	2558	" embedded NULs in locale %s with", PL_collation_name));
	2559	goto bad;
	2560	}
	2561
	2562	DEBUG_L(PerlIO_printf(Perl_debug_log,
	2563	"_mem_collxfrm: Replacing embedded NULs in locale %s with "
	2564	"0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
	2565
	2566	Safefree(cur_min_x);
	2567	} /* End of determining the character that is to replace NULs */
	2568
	2569	/* If the replacement is variant under UTF-8, it must match the
	2570	* UTF8-ness of the original */
	2571	if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
	2572	this_replacement_char[0] =
	2573	UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
	2574	this_replacement_char[1] =
	2575	UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
	2576	this_replacement_len = 2;
	2577	}
	2578	else {
	2579	this_replacement_char[0] = PL_strxfrm_NUL_replacement;
	2580	/* this_replacement_char[1] = '\0' was done at initialization */
	2581	this_replacement_len = 1;
	2582	}
	2583
	2584	/* The worst case length for the replaced string would be if every
	2585	* character in it is NUL. Multiply that by the length of each
	2586	* replacement, and allow for a trailing NUL */
	2587	sans_nuls_len = (len * this_replacement_len) + 1;
	2588	Newx(sans_nuls, sans_nuls_len, char);
	2589	*sans_nuls = '\0';
	2590
	2591	/* Replace each NUL with the lowest collating control. Loop until have
	2592	* exhausted all the NULs */
	2593	while (s + s_strlen < e) {
	2594	my_strlcat(sans_nuls, s, sans_nuls_len);
	2595
	2596	/* Do the actual replacement */
	2597	my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
	2598
	2599	/* Move past the input NUL */
	2600	s += s_strlen + 1;
	2601	s_strlen = strlen(s);
	2602	}
	2603
	2604	/* And add anything that trails the final NUL */
	2605	my_strlcat(sans_nuls, s, sans_nuls_len);
	2606
	2607	/* Switch so below we transform this modified string */
	2608	s = sans_nuls;
	2609	len = strlen(s);
	2610	} /* End of replacing NULs */
	2611
	2612	/* Make sure the UTF8ness of the string and locale match */
	2613	if (utf8 != PL_in_utf8_COLLATE_locale) {
	2614	const char * const t = s; /* Temporary so we can later find where the
	2615	input was */
	2616
	2617	/* Here they don't match. Change the string's to be what the locale is
	2618	* expecting */
	2619
	2620	if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
	2621	s = (char ) bytes_to_utf8((const U8 ) s, &len);
	2622	utf8 = TRUE;
	2623	}
	2624	else { /* locale is not UTF-8; but input is; downgrade the input */
	2625
	2626	s = (char ) bytes_from_utf8((const U8 ) s, &len, &utf8);
	2627
	2628	/* If the downgrade was successful we are done, but if the input
	2629	* contains things that require UTF-8 to represent, have to do
	2630	* damage control ... */
	2631	if (UNLIKELY(utf8)) {
	2632
	2633	/* What we do is construct a non-UTF-8 string with
	2634	* 1) the characters representable by a single byte converted
	2635	* to be so (if necessary);
	2636	* 2) and the rest converted to collate the same as the
	2637	* highest collating representable character. That makes
	2638	* them collate at the end. This is similar to how we
	2639	* handle embedded NULs, but we use the highest collating
	2640	* code point instead of the smallest. Like the NUL case,
	2641	* this isn't perfect, but is the best we can reasonably
	2642	* do. Every above-255 code point will sort the same as
	2643	* the highest-sorting 0-255 code point. If that code
	2644	* point can combine in a sequence with some other code
	2645	* points for weight calculations, us changing something to
	2646	* be it can adversely affect the results. But in most
	2647	* cases, it should work reasonably. And note that this is
	2648	* really an illegal situation: using code points above 255
	2649	* on a locale where only 0-255 are valid. If two strings
	2650	* sort entirely equal, then the sort order for the
	2651	* above-255 code points will be in code point order. */
	2652
	2653	utf8 = FALSE;
	2654
	2655	/* If we haven't calculated the code point with the maximum
	2656	* collating order for this locale, do so now */
	2657	if (! PL_strxfrm_max_cp) {
	2658	int j;
	2659
	2660	/* The current transformed string that collates the
	2661	* highest (except it also includes the prefixed collation
	2662	* index. */
	2663	char * cur_max_x = NULL;
	2664
	2665	/* Look through all legal code points (NUL isn't) */
	2666	for (j = 1; j < 256; j++) {
	2667	char * x;
	2668	STRLEN x_len;
	2669	char cur_source[] = { '\0', '\0' };
	2670
	2671	/* Create a 1-char string of the current code point */
	2672	cur_source[0] = (char) j;
	2673
	2674	/* Then transform it */
	2675	x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
	2676
	2677	/* If something went wrong (which it shouldn't), just
	2678	* ignore this code point */
	2679	if (! x) {
	2680	continue;
	2681	}
	2682
	2683	/* If this character's transformation is higher than
	2684	* the current highest, this one becomes the highest */
	2685	if ( cur_max_x == NULL
	2686	\|\| strGT(x + COLLXFRM_HDR_LEN,
	2687	cur_max_x + COLLXFRM_HDR_LEN))
	2688	{
	2689	PL_strxfrm_max_cp = j;
	2690	cur_max_x = x;
	2691	}
	2692	else {
	2693	Safefree(x);
	2694	}
	2695	}
	2696
	2697	if (! cur_max_x) {
	2698	DEBUG_L(PerlIO_printf(Perl_debug_log,
	2699	"_mem_collxfrm: Couldn't find any character to"
	2700	" replace above-Latin1 chars in locale %s with",
	2701	PL_collation_name));
	2702	goto bad;
	2703	}
	2704
	2705	DEBUG_L(PerlIO_printf(Perl_debug_log,
	2706	"_mem_collxfrm: highest 1-byte collating character"
	2707	" in locale %s is 0x%02X\n",
	2708	PL_collation_name,
	2709	PL_strxfrm_max_cp));
	2710
	2711	Safefree(cur_max_x);
	2712	}
	2713
	2714	/* Here we know which legal code point collates the highest.
	2715	* We are ready to construct the non-UTF-8 string. The length
	2716	* will be at least 1 byte smaller than the input string
	2717	* (because we changed at least one 2-byte character into a
	2718	* single byte), but that is eaten up by the trailing NUL */
	2719	Newx(s, len, char);
	2720
	2721	{
	2722	STRLEN i;
	2723	STRLEN d= 0;
	2724	char * e = (char *) t + len;
	2725
	2726	for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
	2727	U8 cur_char = t[i];
	2728	if (UTF8_IS_INVARIANT(cur_char)) {
	2729	s[d++] = cur_char;
	2730	}
	2731	else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
	2732	s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
	2733	}
	2734	else { /* Replace illegal cp with highest collating
	2735	one */
	2736	s[d++] = PL_strxfrm_max_cp;
	2737	}
	2738	}
	2739	s[d++] = '\0';
	2740	Renew(s, d, char); /* Free up unused space */
	2741	}
	2742	}
	2743	}
	2744
	2745	/* Here, we have constructed a modified version of the input. It could
	2746	* be that we already had a modified copy before we did this version.
	2747	* If so, that copy is no longer needed */
	2748	if (t != input_string) {
	2749	Safefree(t);
	2750	}
	2751	}
	2752
	2753	length_in_chars = (utf8)
	2754	? utf8_length((U8 ) s, (U8 ) s + len)
	2755	: len;
	2756
	2757	/* The first element in the output is the collation id, used by
	2758	* sv_collxfrm(); then comes the space for the transformed string. The
	2759	* equation should give us a good estimate as to how much is needed */
	2760	xAlloc = COLLXFRM_HDR_LEN
	2761	+ PL_collxfrm_base
	2762	+ (PL_collxfrm_mult * length_in_chars);
	2763	Newx(xbuf, xAlloc, char);
	2764	if (UNLIKELY(! xbuf)) {
	2765	DEBUG_L(PerlIO_printf(Perl_debug_log,
	2766	"_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
	2767	goto bad;
	2768	}
	2769
	2770	/* Store the collation id */
	2771	(U32)xbuf = PL_collation_ix;
	2772
	2773	/* Then the transformation of the input. We loop until successful, or we
	2774	* give up */
	2775	for (;;) {
	2776
	2777	*xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
	2778
	2779	/* If the transformed string occupies less space than we told strxfrm()
	2780	* was available, it means it successfully transformed the whole
	2781	* string. */
	2782	if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
	2783
	2784	/* Some systems include a trailing NUL in the returned length.
	2785	* Ignore it, using a loop in case multiple trailing NULs are
	2786	* returned. */
	2787	while ( (*xlen) > 0
	2788	&& (xbuf + COLLXFRM_HDR_LEN + (xlen) - 1) == '\0')
	2789	{
	2790	(*xlen)--;
	2791	}
	2792
	2793	/* If the first try didn't get it, it means our prediction was low.
	2794	* Modify the coefficients so that we predict a larger value in any
	2795	* future transformations */
	2796	if (! first_time) {
	2797	STRLEN needed = xlen + 1; / +1 For trailing NUL */
	2798	STRLEN computed_guess = PL_collxfrm_base
	2799	+ (PL_collxfrm_mult * length_in_chars);
	2800
	2801	/* On zero-length input, just keep current slope instead of
	2802	* dividing by 0 */
	2803	const STRLEN new_m = (length_in_chars != 0)
	2804	? needed / length_in_chars
	2805	: PL_collxfrm_mult;
	2806
	2807	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	2808	"%s: %d: initial size of %zu bytes for a length "
	2809	"%zu string was insufficient, %zu needed\n",
	2810	__FILE__, __LINE__,
	2811	computed_guess, length_in_chars, needed));
	2812
	2813	/* If slope increased, use it, but discard this result for
	2814	* length 1 strings, as we can't be sure that it's a real slope
	2815	* change */
	2816	if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
	2817
	2818	# ifdef DEBUGGING
	2819
	2820	STRLEN old_m = PL_collxfrm_mult;
	2821	STRLEN old_b = PL_collxfrm_base;
	2822
	2823	# endif
	2824
	2825	PL_collxfrm_mult = new_m;
	2826	PL_collxfrm_base = 1; /* +1 For trailing NUL */
	2827	computed_guess = PL_collxfrm_base
	2828	+ (PL_collxfrm_mult * length_in_chars);
	2829	if (computed_guess < needed) {
	2830	PL_collxfrm_base += needed - computed_guess;
	2831	}
	2832
	2833	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	2834	"%s: %d: slope is now %zu; was %zu, base "
	2835	"is now %zu; was %zu\n",
	2836	__FILE__, __LINE__,
	2837	PL_collxfrm_mult, old_m,
	2838	PL_collxfrm_base, old_b));
	2839	}
	2840	else { /* Slope didn't change, but 'b' did */
	2841	const STRLEN new_b = needed
	2842	- computed_guess
	2843	+ PL_collxfrm_base;
	2844	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	2845	"%s: %d: base is now %zu; was %zu\n",
	2846	__FILE__, __LINE__,
	2847	new_b, PL_collxfrm_base));
	2848	PL_collxfrm_base = new_b;
	2849	}
	2850	}
	2851
	2852	break;
	2853	}
	2854
	2855	if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
	2856	DEBUG_L(PerlIO_printf(Perl_debug_log,
	2857	"_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
	2858	*xlen, PERL_INT_MAX));
	2859	goto bad;
	2860	}
	2861
	2862	/* A well-behaved strxfrm() returns exactly how much space it needs
	2863	* (usually not including the trailing NUL) when it fails due to not
	2864	* enough space being provided. Assume that this is the case unless
	2865	* it's been proven otherwise */
	2866	if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
	2867	xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
	2868	}
	2869	else { /* Here, either:
	2870	* 1) The strxfrm() has previously shown bad behavior; or
	2871	* 2) It isn't the first time through the loop, which means
	2872	* that the strxfrm() is now showing bad behavior, because
	2873	* we gave it what it said was needed in the previous
	2874	* iteration, and it came back saying it needed still more.
	2875	* (Many versions of cygwin fit this. When the buffer size
	2876	* isn't sufficient, they return the input size instead of
	2877	* how much is needed.)
	2878	* Increase the buffer size by a fixed percentage and try again.
	2879	* */
	2880	xAlloc += (xAlloc / 4) + 1;
	2881	PL_strxfrm_is_behaved = FALSE;
	2882
	2883	# ifdef DEBUGGING
	2884
	2885	if (DEBUG_Lv_TEST \|\| debug_initialization) {
	2886	PerlIO_printf(Perl_debug_log,
	2887	"_mem_collxfrm required more space than previously calculated"
	2888	" for locale %s, trying again with new guess=%d+%zu\n",
	2889	PL_collation_name, (int) COLLXFRM_HDR_LEN,
	2890	xAlloc - COLLXFRM_HDR_LEN);
	2891	}
	2892
	2893	# endif
	2894
	2895	}
	2896
	2897	Renew(xbuf, xAlloc, char);
	2898	if (UNLIKELY(! xbuf)) {
	2899	DEBUG_L(PerlIO_printf(Perl_debug_log,
	2900	"_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
	2901	goto bad;
	2902	}
	2903
	2904	first_time = FALSE;
	2905	}
	2906
	2907
	2908	# ifdef DEBUGGING
	2909
	2910	if (DEBUG_Lv_TEST \|\| debug_initialization) {
	2911
	2912	print_collxfrm_input_and_return(s, s + len, xlen, utf8);
	2913	PerlIO_printf(Perl_debug_log, "Its xfrm is:");
	2914	PerlIO_printf(Perl_debug_log, "%s\n",
	2915	_byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
	2916	*xlen, 1));
	2917	}
	2918
	2919	# endif
	2920
	2921	/* Free up unneeded space; retain enough for trailing NUL */
	2922	Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
	2923
	2924	if (s != input_string) {
	2925	Safefree(s);
	2926	}
	2927
	2928	return xbuf;
	2929
	2930	bad:
	2931	Safefree(xbuf);
	2932	if (s != input_string) {
	2933	Safefree(s);
	2934	}
	2935	*xlen = 0;
	2936
	2937	# ifdef DEBUGGING
	2938
	2939	if (DEBUG_Lv_TEST \|\| debug_initialization) {
	2940	print_collxfrm_input_and_return(s, s + len, NULL, utf8);
	2941	}
	2942
	2943	# endif
	2944
	2945	return NULL;
	2946	}
	2947
	2948	# ifdef DEBUGGING
	2949
	2950	STATIC void
	2951	S_print_collxfrm_input_and_return(pTHX_
	2952	const char * const s,
	2953	const char * const e,
	2954	const STRLEN * const xlen,
	2955	const bool is_utf8)
	2956	{
	2957
	2958	PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
	2959
	2960	PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
	2961	(UV)PL_collation_ix);
	2962	if (xlen) {
	2963	PerlIO_printf(Perl_debug_log, "%zu", *xlen);
	2964	}
	2965	else {
	2966	PerlIO_printf(Perl_debug_log, "NULL");
	2967	}
	2968	PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
	2969	PL_collation_name);
	2970	print_bytes_for_locale(s, e, is_utf8);
	2971
	2972	PerlIO_printf(Perl_debug_log, "'\n");
	2973	}
	2974
	2975	STATIC void
	2976	S_print_bytes_for_locale(pTHX_
	2977	const char * const s,
	2978	const char * const e,
	2979	const bool is_utf8)
	2980	{
	2981	const char * t = s;
	2982	bool prev_was_printable = TRUE;
	2983	bool first_time = TRUE;
	2984
	2985	PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
	2986
	2987	while (t < e) {
	2988	UV cp = (is_utf8)
	2989	? utf8_to_uvchr_buf((U8 *) t, e, NULL)
	2990	: * (U8 *) t;
	2991	if (isPRINT(cp)) {
	2992	if (! prev_was_printable) {
	2993	PerlIO_printf(Perl_debug_log, " ");
	2994	}
	2995	PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
	2996	prev_was_printable = TRUE;
	2997	}
	2998	else {
	2999	if (! first_time) {
	3000	PerlIO_printf(Perl_debug_log, " ");
	3001	}
	3002	PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
	3003	prev_was_printable = FALSE;
	3004	}
	3005	t += (is_utf8) ? UTF8SKIP(t) : 1;
	3006	first_time = FALSE;
	3007	}
	3008	}
	3009
	3010	# endif /* #ifdef DEBUGGING */
	3011	#endif /* USE_LOCALE_COLLATE */
	3012
	3013	#ifdef USE_LOCALE
	3014
	3015	bool
	3016	Perl__is_cur_LC_category_utf8(pTHX_ int category)
	3017	{
	3018	/* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
	3019	* otherwise. 'category' may not be LC_ALL. If the platform doesn't have
	3020	* nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
	3021	* could give the wrong result. The result will very likely be correct for
	3022	* languages that have commonly used non-ASCII characters, but for notably
	3023	* English, it comes down to if the locale's name ends in something like
	3024	* "UTF-8". It errs on the side of not being a UTF-8 locale. */
	3025
	3026	const char *save_input_locale = NULL;
	3027	STRLEN final_pos;
	3028
	3029	# ifdef LC_ALL
	3030
	3031	assert(category != LC_ALL);
	3032
	3033	# endif
	3034
	3035	/* First dispose of the trivial cases */
	3036	save_input_locale = do_setlocale_r(category, NULL);
	3037	if (! save_input_locale) {
	3038	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3039	"Could not find current locale for category %d\n",
	3040	category));
	3041	return FALSE; /* XXX maybe should croak */
	3042	}
	3043	save_input_locale = stdize_locale(savepv(save_input_locale));
	3044	if (isNAME_C_OR_POSIX(save_input_locale)) {
	3045	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3046	"Current locale for category %d is %s\n",
	3047	category, save_input_locale));
	3048	Safefree(save_input_locale);
	3049	return FALSE;
	3050	}
	3051
	3052	# if defined(USE_LOCALE_CTYPE) \
	3053	&& (defined(MB_CUR_MAX) \|\| (defined(HAS_NL_LANGINFO) && defined(CODESET)))
	3054
	3055	{ /* Next try nl_langinfo or MB_CUR_MAX if available */
	3056
	3057	char *save_ctype_locale = NULL;
	3058	bool is_utf8;
	3059
	3060	if (category != LC_CTYPE) { /* These work only on LC_CTYPE */
	3061
	3062	/* Get the current LC_CTYPE locale */
	3063	save_ctype_locale = do_setlocale_c(LC_CTYPE, NULL);
	3064	if (! save_ctype_locale) {
	3065	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3066	"Could not find current locale for LC_CTYPE\n"));
	3067	goto cant_use_nllanginfo;
	3068	}
	3069	save_ctype_locale = stdize_locale(savepv(save_ctype_locale));
	3070
	3071	/* If LC_CTYPE and the desired category use the same locale, this
	3072	* means that finding the value for LC_CTYPE is the same as finding
	3073	* the value for the desired category. Otherwise, switch LC_CTYPE
	3074	* to the desired category's locale */
	3075	if (strEQ(save_ctype_locale, save_input_locale)) {
	3076	Safefree(save_ctype_locale);
	3077	save_ctype_locale = NULL;
	3078	}
	3079	else if (! do_setlocale_c(LC_CTYPE, save_input_locale)) {
	3080	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3081	"Could not change LC_CTYPE locale to %s\n",
	3082	save_input_locale));
	3083	Safefree(save_ctype_locale);
	3084	goto cant_use_nllanginfo;
	3085	}
	3086	}
	3087
	3088	DEBUG_L(PerlIO_printf(Perl_debug_log, "Current LC_CTYPE locale=%s\n",
	3089	save_input_locale));
	3090
	3091	/* Here the current LC_CTYPE is set to the locale of the category whose
	3092	* information is desired. This means that nl_langinfo() and MB_CUR_MAX
	3093	* should give the correct results */
	3094
	3095	# if defined(HAS_NL_LANGINFO) && defined(CODESET)
	3096
	3097	{ /* The task is easiest if the platform has this POSIX 2001 function */
	3098	const char *codeset = my_nl_langinfo(PERL_CODESET, FALSE);
	3099	/* FALSE => already in dest locale */
	3100
	3101	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3102	"\tnllanginfo returned CODESET '%s'\n", codeset));
	3103
	3104	if (codeset && strNE(codeset, "")) {
	3105	/* If we switched LC_CTYPE, switch back */
	3106	if (save_ctype_locale) {
	3107	do_setlocale_c(LC_CTYPE, save_ctype_locale);
	3108	Safefree(save_ctype_locale);
	3109	}
	3110
	3111	is_utf8 = ( ( strlen(codeset) == STRLENs("UTF-8")
	3112	&& foldEQ(codeset, STR_WITH_LEN("UTF-8")))
	3113	\|\| ( strlen(codeset) == STRLENs("UTF8")
	3114	&& foldEQ(codeset, STR_WITH_LEN("UTF8"))));
	3115
	3116	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3117	"\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
	3118	codeset, is_utf8));
	3119	Safefree(save_input_locale);
	3120	return is_utf8;
	3121	}
	3122	}
	3123
	3124	# endif
	3125	# ifdef MB_CUR_MAX
	3126
	3127	/* Here, either we don't have nl_langinfo, or it didn't return a
	3128	* codeset. Try MB_CUR_MAX */
	3129
	3130	/* Standard UTF-8 needs at least 4 bytes to represent the maximum
	3131	* Unicode code point. Since UTF-8 is the only non-single byte
	3132	* encoding we handle, we just say any such encoding is UTF-8, and if
	3133	* turns out to be wrong, other things will fail */
	3134	is_utf8 = (unsigned) MB_CUR_MAX >= STRLENs(MAX_UNICODE_UTF8);
	3135
	3136	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3137	"\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n",
	3138	(int) MB_CUR_MAX, is_utf8));
	3139
	3140	Safefree(save_input_locale);
	3141
	3142	# ifdef HAS_MBTOWC
	3143
	3144	/* ... But, most system that have MB_CUR_MAX will also have mbtowc(),
	3145	* since they are both in the C99 standard. We can feed a known byte
	3146	* string to the latter function, and check that it gives the expected
	3147	* result */
	3148	if (is_utf8) {
	3149	wchar_t wc;
	3150	int len;
	3151
	3152	PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
	3153	errno = 0;
	3154	len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
	3155
	3156
	3157	if ( len != STRLENs(REPLACEMENT_CHARACTER_UTF8)
	3158	\|\| wc != (wchar_t) UNICODE_REPLACEMENT)
	3159	{
	3160	is_utf8 = FALSE;
	3161	DEBUG_L(PerlIO_printf(Perl_debug_log, "\replacement=U+%x\n",
	3162	(unsigned int)wc));
	3163	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3164	"\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n",
	3165	len, errno));
	3166	}
	3167	}
	3168
	3169	# endif
	3170
	3171	/* If we switched LC_CTYPE, switch back */
	3172	if (save_ctype_locale) {
	3173	do_setlocale_c(LC_CTYPE, save_ctype_locale);
	3174	Safefree(save_ctype_locale);
	3175	}
	3176
	3177	return is_utf8;
	3178
	3179	# endif
	3180
	3181	}
	3182
	3183	cant_use_nllanginfo:
	3184
	3185	# else /* nl_langinfo should work if available, so don't bother compiling this
	3186	fallback code. The final fallback of looking at the name is
	3187	compiled, and will be executed if nl_langinfo fails */
	3188
	3189	/* nl_langinfo not available or failed somehow. Next try looking at the
	3190	* currency symbol to see if it disambiguates things. Often that will be
	3191	* in the native script, and if the symbol isn't in UTF-8, we know that the
	3192	* locale isn't. If it is non-ASCII UTF-8, we infer that the locale is
	3193	* too, as the odds of a non-UTF8 string being valid UTF-8 are quite small
	3194	* */
	3195
	3196	# ifdef HAS_LOCALECONV
	3197	# ifdef USE_LOCALE_MONETARY
	3198
	3199	{
	3200	char *save_monetary_locale = NULL;
	3201	bool only_ascii = FALSE;
	3202	bool is_utf8 = FALSE;
	3203	struct lconv* lc;
	3204
	3205	/* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of
	3206	* the desired category, if it isn't that locale already */
	3207
	3208	if (category != LC_MONETARY) {
	3209
	3210	save_monetary_locale = do_setlocale_c(LC_MONETARY, NULL);
	3211	if (! save_monetary_locale) {
	3212	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3213	"Could not find current locale for LC_MONETARY\n"));
	3214	goto cant_use_monetary;
	3215	}
	3216	save_monetary_locale = stdize_locale(savepv(save_monetary_locale));
	3217
	3218	if (strEQ(save_monetary_locale, save_input_locale)) {
	3219	Safefree(save_monetary_locale);
	3220	save_monetary_locale = NULL;
	3221	}
	3222	else if (! do_setlocale_c(LC_MONETARY, save_input_locale)) {
	3223	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3224	"Could not change LC_MONETARY locale to %s\n",
	3225	save_input_locale));
	3226	Safefree(save_monetary_locale);
	3227	goto cant_use_monetary;
	3228	}
	3229	}
	3230
	3231	/* Here the current LC_MONETARY is set to the locale of the category
	3232	* whose information is desired. */
	3233
	3234	lc = localeconv();
	3235	if (! lc
	3236	\|\| ! lc->currency_symbol
	3237	\|\| is_utf8_invariant_string((U8 *) lc->currency_symbol, 0))
	3238	{
	3239	DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
	3240	only_ascii = TRUE;
	3241	}
	3242	else {
	3243	is_utf8 = is_utf8_string((U8 *) lc->currency_symbol, 0);
	3244	}
	3245
	3246	/* If we changed it, restore LC_MONETARY to its original locale */
	3247	if (save_monetary_locale) {
	3248	do_setlocale_c(LC_MONETARY, save_monetary_locale);
	3249	Safefree(save_monetary_locale);
	3250	}
	3251
	3252	if (! only_ascii) {
	3253
	3254	/* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
	3255	* otherwise assume the locale is UTF-8 if and only if the symbol
	3256	* is non-ascii UTF-8. */
	3257	DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
	3258	save_input_locale, is_utf8));
	3259	Safefree(save_input_locale);
	3260	return is_utf8;
	3261	}
	3262	}
	3263	cant_use_monetary:
	3264
	3265	# endif /* USE_LOCALE_MONETARY */
	3266	# endif /* HAS_LOCALECONV */
	3267
	3268	# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
	3269
	3270	/* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
	3271	* the names of the months and weekdays, timezone, and am/pm indicator */
	3272	{
	3273	char *save_time_locale = NULL;
	3274	int hour = 10;
	3275	bool is_dst = FALSE;
	3276	int dom = 1;
	3277	int month = 0;
	3278	int i;
	3279	char * formatted_time;
	3280
	3281
	3282	/* Like above for LC_MONETARY, we set LC_TIME to the locale of the
	3283	* desired category, if it isn't that locale already */
	3284
	3285	if (category != LC_TIME) {
	3286
	3287	save_time_locale = do_setlocale_c(LC_TIME, NULL);
	3288	if (! save_time_locale) {
	3289	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3290	"Could not find current locale for LC_TIME\n"));
	3291	goto cant_use_time;
	3292	}
	3293	save_time_locale = stdize_locale(savepv(save_time_locale));
	3294
	3295	if (strEQ(save_time_locale, save_input_locale)) {
	3296	Safefree(save_time_locale);
	3297	save_time_locale = NULL;
	3298	}
	3299	else if (! do_setlocale_c(LC_TIME, save_input_locale)) {
	3300	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3301	"Could not change LC_TIME locale to %s\n",
	3302	save_input_locale));
	3303	Safefree(save_time_locale);
	3304	goto cant_use_time;
	3305	}
	3306	}
	3307
	3308	/* Here the current LC_TIME is set to the locale of the category
	3309	* whose information is desired. Look at all the days of the week and
	3310	* month names, and the timezone and am/pm indicator for UTF-8 variant
	3311	* characters. The first such a one found will tell us if the locale
	3312	* is UTF-8 or not */
	3313
	3314	for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
	3315	formatted_time = my_strftime("%A %B %Z %p",
	3316	0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
	3317	if ( ! formatted_time
	3318	\|\| is_utf8_invariant_string((U8 *) formatted_time, 0))
	3319	{
	3320
	3321	/* Here, we didn't find a non-ASCII. Try the next time through
	3322	* with the complemented dst and am/pm, and try with the next
	3323	* weekday. After we have gotten all weekdays, try the next
	3324	* month */
	3325	is_dst = ! is_dst;
	3326	hour = (hour + 12) % 24;
	3327	dom++;
	3328	if (i > 6) {
	3329	month++;
	3330	}
	3331	continue;
	3332	}
	3333
	3334	/* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
	3335	* false otherwise. But first, restore LC_TIME to its original
	3336	* locale if we changed it */
	3337	if (save_time_locale) {
	3338	do_setlocale_c(LC_TIME, save_time_locale);
	3339	Safefree(save_time_locale);
	3340	}
	3341
	3342	DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
	3343	save_input_locale,
	3344	is_utf8_string((U8 *) formatted_time, 0)));
	3345	Safefree(save_input_locale);
	3346	return is_utf8_string((U8 *) formatted_time, 0);
	3347	}
	3348
	3349	/* Falling off the end of the loop indicates all the names were just
	3350	* ASCII. Go on to the next test. If we changed it, restore LC_TIME
	3351	* to its original locale */
	3352	if (save_time_locale) {
	3353	do_setlocale_c(LC_TIME, save_time_locale);
	3354	Safefree(save_time_locale);
	3355	}
	3356	DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
	3357	}
	3358	cant_use_time:
	3359
	3360	# endif
	3361
	3362	# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
	3363
	3364	/* This code is ifdefd out because it was found to not be necessary in testing
	3365	* on our dromedary test machine, which has over 700 locales. There, this
	3366	* added no value to looking at the currency symbol and the time strings. I
	3367	* left it in so as to avoid rewriting it if real-world experience indicates
	3368	* that dromedary is an outlier. Essentially, instead of returning abpve if we
	3369	* haven't found illegal utf8, we continue on and examine all the strerror()
	3370	* messages on the platform for utf8ness. If all are ASCII, we still don't
	3371	* know the answer; but otherwise we have a pretty good indication of the
	3372	* utf8ness. The reason this doesn't help much is that the messages may not
	3373	* have been translated into the locale. The currency symbol and time strings
	3374	* are much more likely to have been translated. */
	3375	{
	3376	int e;
	3377	bool is_utf8 = FALSE;
	3378	bool non_ascii = FALSE;
	3379	char *save_messages_locale = NULL;
	3380	const char * errmsg = NULL;
	3381
	3382	/* Like above, we set LC_MESSAGES to the locale of the desired
	3383	* category, if it isn't that locale already */
	3384
	3385	if (category != LC_MESSAGES) {
	3386
	3387	save_messages_locale = do_setlocale_c(LC_MESSAGES, NULL);
	3388	if (! save_messages_locale) {
	3389	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3390	"Could not find current locale for LC_MESSAGES\n"));
	3391	goto cant_use_messages;
	3392	}
	3393	save_messages_locale = stdize_locale(savepv(save_messages_locale));
	3394
	3395	if (strEQ(save_messages_locale, save_input_locale)) {
	3396	Safefree(save_messages_locale);
	3397	save_messages_locale = NULL;
	3398	}
	3399	else if (! do_setlocale_c(LC_MESSAGES, save_input_locale)) {
	3400	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3401	"Could not change LC_MESSAGES locale to %s\n",
	3402	save_input_locale));
	3403	Safefree(save_messages_locale);
	3404	goto cant_use_messages;
	3405	}
	3406	}
	3407
	3408	/* Here the current LC_MESSAGES is set to the locale of the category
	3409	* whose information is desired. Look through all the messages. We
	3410	* can't use Strerror() here because it may expand to code that
	3411	* segfaults in miniperl */
	3412
	3413	for (e = 0; e <= sys_nerr; e++) {
	3414	errno = 0;
	3415	errmsg = sys_errlist[e];
	3416	if (errno \|\| !errmsg) {
	3417	break;
	3418	}
	3419	errmsg = savepv(errmsg);
	3420	if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
	3421	non_ascii = TRUE;
	3422	is_utf8 = is_utf8_string((U8 *) errmsg, 0);
	3423	break;
	3424	}
	3425	}
	3426	Safefree(errmsg);
	3427
	3428	/* And, if we changed it, restore LC_MESSAGES to its original locale */
	3429	if (save_messages_locale) {
	3430	do_setlocale_c(LC_MESSAGES, save_messages_locale);
	3431	Safefree(save_messages_locale);
	3432	}
	3433
	3434	if (non_ascii) {
	3435
	3436	/* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
	3437	* any non-ascii means it is one; otherwise we assume it isn't */
	3438	DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
	3439	save_input_locale,
	3440	is_utf8));
	3441	Safefree(save_input_locale);
	3442	return is_utf8;
	3443	}
	3444
	3445	DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
	3446	}
	3447	cant_use_messages:
	3448
	3449	# endif
	3450	# endif /* the code that is compiled when no nl_langinfo */
	3451
	3452	# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
	3453	UTF-8 locale */
	3454
	3455	/* As a last resort, look at the locale name to see if it matches
	3456	* qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
	3457	* return of setlocale(), is actually defined to be opaque, so we can't
	3458	* really rely on the absence of various substrings in the name to indicate
	3459	* its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
	3460	* be a UTF-8 locale. Similarly for the other common names */
	3461
	3462	final_pos = strlen(save_input_locale) - 1;
	3463	if (final_pos >= 3) {
	3464	const char *name = save_input_locale;
	3465
	3466	/* Find next 'U' or 'u' and look from there */
	3467	while ((name += strcspn(name, "Uu") + 1)
	3468	<= save_input_locale + final_pos - 2)
	3469	{
	3470	if ( isALPHA_FOLD_NE(*name, 't')
	3471	\|\| isALPHA_FOLD_NE(*(name + 1), 'f'))
	3472	{
	3473	continue;
	3474	}
	3475	name += 2;
	3476	if (*(name) == '-') {
	3477	if ((name > save_input_locale + final_pos - 1)) {
	3478	break;
	3479	}
	3480	name++;
	3481	}
	3482	if (*(name) == '8') {
	3483	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3484	"Locale %s ends with UTF-8 in name\n",
	3485	save_input_locale));
	3486	Safefree(save_input_locale);
	3487	return TRUE;
	3488	}
	3489	}
	3490	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3491	"Locale %s doesn't end with UTF-8 in name\n",
	3492	save_input_locale));
	3493	}
	3494
	3495	# endif
	3496	# ifdef WIN32
	3497
	3498	/* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
	3499	if (memENDs(save_input_locale, final_pos, "65001")) {
	3500	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3501	"Locale %s ends with 65001 in name, is UTF-8 locale\n",
	3502	save_input_locale));
	3503	Safefree(save_input_locale);
	3504	return TRUE;
	3505	}
	3506
	3507	# endif
	3508
	3509	/* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
	3510	* since we are about to return FALSE anyway, there is no point in doing
	3511	* this extra work */
	3512
	3513	# if 0
	3514	if (instr(save_input_locale, "8859")) {
	3515	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3516	"Locale %s has 8859 in name, not UTF-8 locale\n",
	3517	save_input_locale));
	3518	Safefree(save_input_locale);
	3519	return FALSE;
	3520	}
	3521	# endif
	3522
	3523	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3524	"Assuming locale %s is not a UTF-8 locale\n",
	3525	save_input_locale));
	3526	Safefree(save_input_locale);
	3527	return FALSE;
	3528	}
	3529
	3530	#endif
	3531
	3532
	3533	bool
	3534	Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
	3535	{
	3536	dVAR;
	3537	/* Internal function which returns if we are in the scope of a pragma that
	3538	* enables the locale category 'category'. 'compiling' should indicate if
	3539	* this is during the compilation phase (TRUE) or not (FALSE). */
	3540
	3541	const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
	3542
	3543	SV *categories = cop_hints_fetch_pvs(cop, "locale", 0);
	3544	if (! categories \|\| categories == &PL_sv_placeholder) {
	3545	return FALSE;
	3546	}
	3547
	3548	/* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
	3549	* a valid unsigned */
	3550	assert(category >= -1);
	3551	return cBOOL(SvUV(categories) & (1U << (category + 1)));
	3552	}
	3553
	3554	char *
	3555	Perl_my_strerror(pTHX_ const int errnum)
	3556	{
	3557	/* Returns a mortalized copy of the text of the error message associated
	3558	* with 'errnum'. It uses the current locale's text unless the platform
	3559	* doesn't have the LC_MESSAGES category or we are not being called from
	3560	* within the scope of 'use locale'. In the former case, it uses whatever
	3561	* strerror returns; in the latter case it uses the text from the C locale.
	3562	*
	3563	* The function just calls strerror(), but temporarily switches, if needed,
	3564	* to the C locale */
	3565
	3566	char *errstr;
	3567	dVAR;
	3568
	3569	#ifndef USE_LOCALE_MESSAGES
	3570
	3571	/* If platform doesn't have messages category, we don't do any switching to
	3572	* the C locale; we just use whatever strerror() returns */
	3573
	3574	errstr = savepv(Strerror(errnum));
	3575
	3576	#else /* Has locale messages */
	3577
	3578	const bool within_locale_scope = IN_LC(LC_MESSAGES);
	3579
	3580	# if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
	3581
	3582	/* This function is trivial if we don't have to worry about thread safety
	3583	* and have strerror_l(), as it handles the switch of locales so we don't
	3584	* have to deal with that. We don't have to worry about thread safety if
	3585	* this is an unthreaded build, or if strerror_r() is also available. Both
	3586	* it and strerror_l() are thread-safe. Plain strerror() isn't thread
	3587	* safe. But on threaded builds when strerror_r() is available, the
	3588	* apparent call to strerror() below is actually a macro that
	3589	* behind-the-scenes calls strerror_r().
	3590	*/
	3591
	3592	# if ! defined(USE_ITHREADS) \|\| defined(HAS_STRERROR_R)
	3593
	3594	if (within_locale_scope) {
	3595	errstr = savepv(strerror(errnum));
	3596	}
	3597	else {
	3598	errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
	3599	}
	3600
	3601	# else
	3602
	3603	/* Here we have strerror_l(), but not strerror_r() and we are on a
	3604	* threaded-build. We use strerror_l() for everything, constructing a
	3605	* locale to pass to it if necessary */
	3606
	3607	bool do_free = FALSE;
	3608	locale_t locale_to_use;
	3609
	3610	if (within_locale_scope) {
	3611	locale_to_use = uselocale((locale_t) 0);
	3612	if (locale_to_use == LC_GLOBAL_LOCALE) {
	3613	locale_to_use = duplocale(LC_GLOBAL_LOCALE);
	3614	do_free = TRUE;
	3615	}
	3616	}
	3617	else { /* Use C locale if not within 'use locale' scope */
	3618	locale_to_use = PL_C_locale_obj;
	3619	}
	3620
	3621	errstr = savepv(strerror_l(errnum, locale_to_use));
	3622
	3623	if (do_free) {
	3624	freelocale(locale_to_use);
	3625	}
	3626
	3627	# endif
	3628	# else /* Doesn't have strerror_l() */
	3629
	3630	# ifdef USE_POSIX_2008_LOCALE
	3631
	3632	locale_t save_locale = NULL;
	3633
	3634	# else
	3635
	3636	const char * save_locale = NULL;
	3637	bool locale_is_C = FALSE;
	3638
	3639	/* We have a critical section to prevent another thread from changing the
	3640	* locale out from under us (or zapping the buffer returned from
	3641	* setlocale() ) */
	3642	LOCALE_LOCK;
	3643
	3644	# endif
	3645
	3646	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	3647	"my_strerror called with errnum %d\n", errnum));
	3648	if (! within_locale_scope) {
	3649	errno = 0;
	3650
	3651	# ifdef USE_POSIX_2008_LOCALE /* Use the thread-safe locale functions */
	3652
	3653	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	3654	"Not within locale scope, about to call"
	3655	" uselocale(0x%p)\n", PL_C_locale_obj));
	3656	save_locale = uselocale(PL_C_locale_obj);
	3657	if (! save_locale) {
	3658	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3659	"uselocale failed, errno=%d\n", errno));
	3660	}
	3661	else {
	3662	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	3663	"uselocale returned 0x%p\n", save_locale));
	3664	}
	3665
	3666	# else /* Not thread-safe build */
	3667
	3668	save_locale = do_setlocale_c(LC_MESSAGES, NULL);
	3669	if (! save_locale) {
	3670	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3671	"setlocale failed, errno=%d\n", errno));
	3672	}
	3673	else {
	3674	locale_is_C = isNAME_C_OR_POSIX(save_locale);
	3675
	3676	/* Switch to the C locale if not already in it */
	3677	if (! locale_is_C) {
	3678
	3679	/* The setlocale() just below likely will zap 'save_locale', so
	3680	* create a copy. */
	3681	save_locale = savepv(save_locale);
	3682	do_setlocale_c(LC_MESSAGES, "C");
	3683	}
	3684	}
	3685
	3686	# endif
	3687
	3688	} /* end of ! within_locale_scope */
	3689	else {
	3690	DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n",
	3691	__FILE__, __LINE__));
	3692	}
	3693
	3694	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	3695	"Any locale change has been done; about to call Strerror\n"));
	3696	errstr = savepv(Strerror(errnum));
	3697
	3698	if (! within_locale_scope) {
	3699	errno = 0;
	3700
	3701	# ifdef USE_POSIX_2008_LOCALE
	3702
	3703	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	3704	"%s: %d: not within locale scope, restoring the locale\n",
	3705	__FILE__, __LINE__));
	3706	if (save_locale && ! uselocale(save_locale)) {
	3707	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3708	"uselocale restore failed, errno=%d\n", errno));
	3709	}
	3710	}
	3711
	3712	# else
	3713
	3714	if (save_locale && ! locale_is_C) {
	3715	if (! do_setlocale_c(LC_MESSAGES, save_locale)) {
	3716	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3717	"setlocale restore failed, errno=%d\n", errno));
	3718	}
	3719	Safefree(save_locale);
	3720	}
	3721	}
	3722
	3723	LOCALE_UNLOCK;
	3724
	3725	# endif
	3726	# endif /* End of doesn't have strerror_l */
	3727	#endif /* End of does have locale messages */
	3728
	3729	#ifdef DEBUGGING
	3730
	3731	if (DEBUG_Lv_TEST) {
	3732	PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '");
	3733	print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);
	3734	PerlIO_printf(Perl_debug_log, "'\n");
	3735	}
	3736
	3737	#endif
	3738
	3739	SAVEFREEPV(errstr);
	3740	return errstr;
	3741	}
	3742
	3743	/*
	3744
	3745	=for apidoc sync_locale
	3746
	3747	Changing the program's locale should be avoided by XS code. Nevertheless,
	3748	certain non-Perl libraries called from XS, such as C<Gtk> do so. When this
	3749	happens, Perl needs to be told that the locale has changed. Use this function
	3750	to do so, before returning to Perl.
	3751
	3752	=cut
	3753	*/
	3754
	void
	Perl_sync_locale(pTHX)
	{
	    /* For each of the categories LC_CTYPE, LC_COLLATE, and LC_NUMERIC that
	     * this perl is compiled to handle, queries the locale currently in
	     * effect for it and hands that name to the corresponding new_*()
	     * routine.  The query and its result are also shown in the verbose
	     * locale debugging output. */

	#ifdef USE_LOCALE_CTYPE

	    {
	        char * const ctype_locale = do_setlocale_c(LC_CTYPE, NULL);

	        DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	            "%s:%d: %s\n", __FILE__, __LINE__,
	            setlocale_debug_string(LC_CTYPE, NULL, ctype_locale)));
	        new_ctype(ctype_locale);
	    }

	#endif /* USE_LOCALE_CTYPE */
	#ifdef USE_LOCALE_COLLATE

	    {
	        char * const collate_locale = do_setlocale_c(LC_COLLATE, NULL);

	        DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	            "%s:%d: %s\n", __FILE__, __LINE__,
	            setlocale_debug_string(LC_COLLATE, NULL, collate_locale)));
	        new_collate(collate_locale);
	    }

	#endif
	#ifdef USE_LOCALE_NUMERIC

	    {
	        char * const numeric_locale = do_setlocale_c(LC_NUMERIC, NULL);

	        DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	            "%s:%d: %s\n", __FILE__, __LINE__,
	            setlocale_debug_string(LC_NUMERIC, NULL, numeric_locale)));
	        new_numeric(numeric_locale);
	    }

	#endif /* USE_LOCALE_NUMERIC */

	}
	3789
	3790	#if defined(DEBUGGING) && defined(USE_LOCALE)
	3791
	3792	STATIC char *
	3793	S_setlocale_debug_string(const int category, /* category number,
	3794	like LC_ALL */
	3795	const char* const locale, /* locale name */
	3796
	3797	/* return value from setlocale() when attempting to
	3798	* set 'category' to 'locale' */
	3799	const char* const retval)
	3800	{
	3801	/* Returns a pointer to a NUL-terminated string in static storage with
	3802	* added text about the info passed in. This is not thread safe and will
	3803	* be overwritten by the next call, so this should be used just to
	3804	* formulate a string to immediately print or savepv() on. */
	3805
	3806	/* initialise to a non-null value to keep it out of BSS and so keep
	3807	* -DPERL_GLOBAL_STRUCT_PRIVATE happy */
	3808	static char ret[128] = "If you can read this, thank your buggy C"
	3809	" library strlcpy(), and change your hints file"
	3810	" to undef it";
	3811
	3812	my_strlcpy(ret, "setlocale(", sizeof(ret));
	3813	my_strlcat(ret, category_name(category), sizeof(ret));
	3814	my_strlcat(ret, ", ", sizeof(ret));
	3815
	3816	if (locale) {
	3817	my_strlcat(ret, "\"", sizeof(ret));
	3818	my_strlcat(ret, locale, sizeof(ret));
	3819	my_strlcat(ret, "\"", sizeof(ret));
	3820	}
	3821	else {
	3822	my_strlcat(ret, "NULL", sizeof(ret));
	3823	}
	3824
	3825	my_strlcat(ret, ") returned ", sizeof(ret));
	3826
	3827	if (retval) {
	3828	my_strlcat(ret, "\"", sizeof(ret));
	3829	my_strlcat(ret, retval, sizeof(ret));
	3830	my_strlcat(ret, "\"", sizeof(ret));
	3831	}
	3832	else {
	3833	my_strlcat(ret, "NULL", sizeof(ret));
	3834	}
	3835
	3836	assert(strlen(ret) < sizeof(ret));
	3837
	3838	return ret;
	3839	}
	3840
	3841	#endif
	3842
	3843
	3844	/*
	3845	* ex: set ts=8 sts=4 sw=4 et:
	3846	*/