perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* locale.c
	2	*
	3	* Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
	4	* 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* A Elbereth Gilthoniel,
	13	* silivren penna míriel
	14	* o menel aglar elenath!
	15	* Na-chaered palan-díriel
	16	* o galadhremmin ennorath,
	17	* Fanuilos, le linnathon
	18	* nef aear, si nef aearon!
	19	*
	20	* [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
	21	*/
	22
	23	/* utility functions for handling locale-specific stuff like what
	24	* character represents the decimal point.
	25	*
	26	* All C programs have an underlying locale. Perl generally doesn't pay any
	27	* attention to it except within the scope of a 'use locale'. For most
	28	* categories, it accomplishes this by just using different operations if it is
	29	* in such scope than if not. However, various libc functions called by Perl
	30	* are affected by the LC_NUMERIC category, so there are macros in perl.h that
	31	* are used to toggle between the current locale and the C locale depending on
	32	* the desired behavior of those functions at the moment.
	33	*/
	34
	35	#include "EXTERN.h"
	36	#define PERL_IN_LOCALE_C
	37	#include "perl.h"
	38
	39	#ifdef I_LANGINFO
	40	# include <langinfo.h>
	41	#endif
	42
	43	#include "reentr.h"
	44
	45	#ifdef USE_LOCALE
	46
	47	/*
	48	* Standardize the locale name from a string returned by 'setlocale', possibly
	49	* modifying that string.
	50	*
	51	* The typical return value of setlocale() is either
	52	* (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
	53	* (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
	54	* (the space-separated values represent the various sublocales,
	55	* in some unspecified order). This is not handled by this function.
	56	*
	57	* In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
	58	* which is harmful for further use of the string in setlocale(). This
	59	* function removes the trailing new line and everything up through the '='
	60	*
	61	*/
	62	STATIC char *
	63	S_stdize_locale(pTHX_ char *locs)
	64	{
	65	const char * const s = strchr(locs, '=');
	66	bool okay = TRUE;
	67
	68	PERL_ARGS_ASSERT_STDIZE_LOCALE;
	69
	70	if (s) {
	71	const char * const t = strchr(s, '.');
	72	okay = FALSE;
	73	if (t) {
	74	const char * const u = strchr(t, '\n');
	75	if (u && (u[1] == 0)) {
	76	const STRLEN len = u - s;
	77	Move(s + 1, locs, len, char);
	78	locs[len] = 0;
	79	okay = TRUE;
	80	}
	81	}
	82	}
	83
	84	if (!okay)
	85	Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
	86
	87	return locs;
	88	}
	89
	90	#endif
	91
	92	void
	93	Perl_set_numeric_radix(pTHX)
	94	{
	95	#ifdef USE_LOCALE_NUMERIC
	96	# ifdef HAS_LOCALECONV
	97	const struct lconv* const lc = localeconv();
	98
	99	if (lc && lc->decimal_point) {
	100	if (lc->decimal_point[0] == '.' && lc->decimal_point[1] == 0) {
	101	SvREFCNT_dec(PL_numeric_radix_sv);
	102	PL_numeric_radix_sv = NULL;
	103	}
	104	else {
	105	if (PL_numeric_radix_sv)
	106	sv_setpv(PL_numeric_radix_sv, lc->decimal_point);
	107	else
	108	PL_numeric_radix_sv = newSVpv(lc->decimal_point, 0);
	109	if (! is_invariant_string((U8 *) lc->decimal_point, 0)
	110	&& is_utf8_string((U8 *) lc->decimal_point, 0)
	111	&& _is_cur_LC_category_utf8(LC_NUMERIC))
	112	{
	113	SvUTF8_on(PL_numeric_radix_sv);
	114	}
	115	}
	116	}
	117	else
	118	PL_numeric_radix_sv = NULL;
	119
	120	DEBUG_L(PerlIO_printf(Perl_debug_log, "Locale radix is %s\n",
	121	(PL_numeric_radix_sv)
	122	? lc->decimal_point
	123	: "NULL"));
	124
	125	# endif /* HAS_LOCALECONV */
	126	#endif /* USE_LOCALE_NUMERIC */
	127	}
	128
	129	/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
	130	* return of setlocale(), then this is extremely likely to be the C or POSIX
	131	* locale. However, the output of setlocale() is documented to be opaque, but
	132	* the odds are extremely small that it would return these two strings for some
	133	* other locale. Note that VMS in these two locales includes many non-ASCII
	134	* characters as controls and punctuation (below are hex bytes):
	135	* cntrl: 00-1F 7F 84-97 9B-9F
	136	* punct: 21-2F 3A-40 5B-60 7B-7E A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
	137	* Oddly, none there are listed as alphas, though some represent alphabetics
	138	* http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
	139	#define isNAME_C_OR_POSIX(name) ((name) != NULL \
	140	&& (((name) == 'C' && ((name + 1)) == '\0') \
	141	\|\| strEQ((name), "POSIX")))
	142
	143	void
	144	Perl_new_numeric(pTHX_ const char *newnum)
	145	{
	146	#ifdef USE_LOCALE_NUMERIC
	147
	148	/* Called after all libc setlocale() calls affecting LC_NUMERIC, to tell
	149	* core Perl this and that 'newnum' is the name of the new locale.
	150	* It installs this locale as the current underlying default.
	151	*
	152	* The default locale and the C locale can be toggled between by use of the
	153	* set_numeric_local() and set_numeric_standard() functions, which should
	154	* probably not be called directly, but only via macros like
	155	* SET_NUMERIC_STANDARD() in perl.h.
	156	*
	157	* The toggling is necessary mainly so that a non-dot radix decimal point
	158	* character can be output, while allowing internal calculations to use a
	159	* dot.
	160	*
	161	* This sets several interpreter-level variables:
	162	* PL_numeric_name The underlying locale's name: a copy of 'newnum'
	163	* PL_numeric_local A boolean indicating if the toggled state is such
	164	* that the current locale is the program's underlying
	165	* locale
	166	* PL_numeric_standard An int indicating if the toggled state is such
	167	* that the current locale is the C locale. If non-zero,
	168	* it is in C; if > 1, it means it may not be toggled away
	169	* from C.
	170	* Note that both of the last two variables can be true at the same time,
	171	* if the underlying locale is C. (Toggling is a no-op under these
	172	* circumstances.)
	173	*
	174	* Any code changing the locale (outside this file) should use
	175	* POSIX::setlocale, which calls this function. Therefore this function
	176	* should be called directly only from this file and from
	177	* POSIX::setlocale() */
	178
	179	char *save_newnum;
	180
	181	if (! newnum) {
	182	Safefree(PL_numeric_name);
	183	PL_numeric_name = NULL;
	184	PL_numeric_standard = TRUE;
	185	PL_numeric_local = TRUE;
	186	return;
	187	}
	188
	189	save_newnum = stdize_locale(savepv(newnum));
	190	if (! PL_numeric_name \|\| strNE(PL_numeric_name, save_newnum)) {
	191	Safefree(PL_numeric_name);
	192	PL_numeric_name = save_newnum;
	193	}
	194
	195	PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
	196	PL_numeric_local = TRUE;
	197
	198	/* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't
	199	* have to worry about the radix being a non-dot. (Core operations that
	200	* need the underlying locale change to it temporarily). */
	201	set_numeric_standard();
	202
	203	set_numeric_radix();
	204
	205	#else
	206	PERL_UNUSED_ARG(newnum);
	207	#endif /* USE_LOCALE_NUMERIC */
	208	}
	209
	210	void
	211	Perl_set_numeric_standard(pTHX)
	212	{
	213	#ifdef USE_LOCALE_NUMERIC
	214	/* Toggle the LC_NUMERIC locale to C. Most code should use the macros like
	215	* SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The
	216	* macro avoids calling this routine if toggling isn't necessary according
	217	* to our records (which could be wrong if some XS code has changed the
	218	* locale behind our back) */
	219
	220	setlocale(LC_NUMERIC, "C");
	221	PL_numeric_standard = TRUE;
	222	PL_numeric_local = isNAME_C_OR_POSIX(PL_numeric_name);
	223	set_numeric_radix();
	224	DEBUG_L(PerlIO_printf(Perl_debug_log,
	225	"Underlying LC_NUMERIC locale now is C\n"));
	226
	227	#endif /* USE_LOCALE_NUMERIC */
	228	}
	229
	230	void
	231	Perl_set_numeric_local(pTHX)
	232	{
	233	#ifdef USE_LOCALE_NUMERIC
	234	/* Toggle the LC_NUMERIC locale to the current underlying default. Most
	235	* code should use the macros like SET_NUMERIC_LOCAL() in perl.h instead of
	236	* calling this directly. The macro avoids calling this routine if
	237	* toggling isn't necessary according to our records (which could be wrong
	238	* if some XS code has changed the locale behind our back) */
	239
	240	setlocale(LC_NUMERIC, PL_numeric_name);
	241	PL_numeric_standard = isNAME_C_OR_POSIX(PL_numeric_name);
	242	PL_numeric_local = TRUE;
	243	set_numeric_radix();
	244	DEBUG_L(PerlIO_printf(Perl_debug_log,
	245	"Underlying LC_NUMERIC locale now is %s\n",
	246	PL_numeric_name));
	247
	248	#endif /* USE_LOCALE_NUMERIC */
	249	}
	250
	251	/*
	252	* Set up for a new ctype locale.
	253	*/
	254	void
	255	Perl_new_ctype(pTHX_ const char *newctype)
	256	{
	257	#ifdef USE_LOCALE_CTYPE
	258
	259	/* Called after all libc setlocale() calls affecting LC_CTYPE, to tell
	260	* core Perl this and that 'newctype' is the name of the new locale.
	261	*
	262	* This function sets up the folding arrays for all 256 bytes, assuming
	263	* that tofold() is tolc() since fold case is not a concept in POSIX,
	264	*
	265	* Any code changing the locale (outside this file) should use
	266	* POSIX::setlocale, which calls this function. Therefore this function
	267	* should be called directly only from this file and from
	268	* POSIX::setlocale() */
	269
	270	dVAR;
	271	UV i;
	272
	273	PERL_ARGS_ASSERT_NEW_CTYPE;
	274
	275	/* We will replace any bad locale warning with 1) nothing if the new one is
	276	* ok; or 2) a new warning for the bad new locale */
	277	if (PL_warn_locale) {
	278	SvREFCNT_dec_NN(PL_warn_locale);
	279	PL_warn_locale = NULL;
	280	}
	281
	282	PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
	283
	284	/* A UTF-8 locale gets standard rules. But note that code still has to
	285	* handle this specially because of the three problematic code points */
	286	if (PL_in_utf8_CTYPE_locale) {
	287	Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
	288	}
	289	else {
	290	/* Assume enough space for every character being bad. 4 spaces each
	291	* for the 94 printable characters that are output like "'x' "; and 5
	292	* spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
	293	* NUL */
	294	char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ];
	295
	296	bool check_for_problems = ckWARN_d(WARN_LOCALE); /* No warnings means
	297	no check */
	298	bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
	299	to start */
	300	unsigned int bad_count = 0; /* Count of bad characters */
	301
	302	for (i = 0; i < 256; i++) {
	303	if (isUPPER_LC((U8) i))
	304	PL_fold_locale[i] = (U8) toLOWER_LC((U8) i);
	305	else if (isLOWER_LC((U8) i))
	306	PL_fold_locale[i] = (U8) toUPPER_LC((U8) i);
	307	else
	308	PL_fold_locale[i] = (U8) i;
	309
	310	/* If checking for locale problems, see if the native ASCII-range
	311	* printables plus \n and \t are in their expected categories in
	312	* the new locale. If not, this could mean big trouble, upending
	313	* Perl's and most programs' assumptions, like having a
	314	* metacharacter with special meaning become a \w. Fortunately,
	315	* it's very rare to find locales that aren't supersets of ASCII
	316	* nowadays. It isn't a problem for most controls to be changed
	317	* into something else; we check only \n and \t, though perhaps \r
	318	* could be an issue as well. */
	319	if (check_for_problems
	320	&& (isGRAPH_A(i) \|\| isBLANK_A(i) \|\| i == '\n'))
	321	{
	322	if ((isALPHANUMERIC_A(i) && ! isALPHANUMERIC_LC(i))
	323	\|\| (isPUNCT_A(i) && ! isPUNCT_LC(i))
	324	\|\| (isBLANK_A(i) && ! isBLANK_LC(i))
	325	\|\| (i == '\n' && ! isCNTRL_LC(i)))
	326	{
	327	if (bad_count) { /* Separate multiple entries with a
	328	blank */
	329	bad_chars_list[bad_count++] = ' ';
	330	}
	331	bad_chars_list[bad_count++] = '\'';
	332	if (isPRINT_A(i)) {
	333	bad_chars_list[bad_count++] = (char) i;
	334	}
	335	else {
	336	bad_chars_list[bad_count++] = '\\';
	337	if (i == '\n') {
	338	bad_chars_list[bad_count++] = 'n';
	339	}
	340	else {
	341	assert(i == '\t');
	342	bad_chars_list[bad_count++] = 't';
	343	}
	344	}
	345	bad_chars_list[bad_count++] = '\'';
	346	bad_chars_list[bad_count] = '\0';
	347	}
	348	}
	349	}
	350
	351	#ifdef MB_CUR_MAX
	352	/* We only handle single-byte locales (outside of UTF-8 ones; so if
	353	* this locale requires than one byte, there are going to be
	354	* problems. */
	355	if (check_for_problems && MB_CUR_MAX > 1
	356
	357	/* Some platforms return MB_CUR_MAX > 1 for even the "C"
	358	* locale. Just assume that the implementation for them (plus
	359	* for POSIX) is correct and the > 1 value is spurious. (Since
	360	* these are specially handled to never be considered UTF-8
	361	* locales, as long as this is the only problem, everything
	362	* should work fine */
	363	&& strNE(newctype, "C") && strNE(newctype, "POSIX"))
	364	{
	365	multi_byte_locale = TRUE;
	366	}
	367	#endif
	368
	369	if (bad_count \|\| multi_byte_locale) {
	370	PL_warn_locale = Perl_newSVpvf(aTHX_
	371	"Locale '%s' may not work well.%s%s%s\n",
	372	newctype,
	373	(multi_byte_locale)
	374	? " Some characters in it are not recognized by"
	375	" Perl."
	376	: "",
	377	(bad_count)
	378	? "\nThe following characters (and maybe others)"
	379	" may not have the same meaning as the Perl"
	380	" program expects:\n"
	381	: "",
	382	(bad_count)
	383	? bad_chars_list
	384	: ""
	385	);
	386	/* If we are actually in the scope of the locale, output the
	387	* message now. Otherwise we save it to be output at the first
	388	* operation using this locale, if that actually happens. Most
	389	* programs don't use locales, so they are immune to bad ones */
	390	if (IN_LC(LC_CTYPE)) {
	391
	392	/* We have to save 'newctype' because the setlocale() just
	393	* below may destroy it. The next setlocale() further down
	394	* should restore it properly so that the intermediate change
	395	* here is transparent to this function's caller */
	396	const char * const badlocale = savepv(newctype);
	397
	398	setlocale(LC_CTYPE, "C");
	399
	400	/* The '0' below suppresses a bogus gcc compiler warning */
	401	Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
	402	setlocale(LC_CTYPE, badlocale);
	403	Safefree(badlocale);
	404	SvREFCNT_dec_NN(PL_warn_locale);
	405	PL_warn_locale = NULL;
	406	}
	407	}
	408	}
	409
	410	#endif /* USE_LOCALE_CTYPE */
	411	PERL_ARGS_ASSERT_NEW_CTYPE;
	412	PERL_UNUSED_ARG(newctype);
	413	PERL_UNUSED_CONTEXT;
	414	}
	415
	416	void
	417	Perl_new_collate(pTHX_ const char *newcoll)
	418	{
	419	#ifdef USE_LOCALE_COLLATE
	420
	421	/* Called after all libc setlocale() calls affecting LC_COLLATE, to tell
	422	* core Perl this and that 'newcoll' is the name of the new locale.
	423	*
	424	* Any code changing the locale (outside this file) should use
	425	* POSIX::setlocale, which calls this function. Therefore this function
	426	* should be called directly only from this file and from
	427	* POSIX::setlocale() */
	428
	429	if (! newcoll) {
	430	if (PL_collation_name) {
	431	++PL_collation_ix;
	432	Safefree(PL_collation_name);
	433	PL_collation_name = NULL;
	434	}
	435	PL_collation_standard = TRUE;
	436	PL_collxfrm_base = 0;
	437	PL_collxfrm_mult = 2;
	438	return;
	439	}
	440
	441	if (! PL_collation_name \|\| strNE(PL_collation_name, newcoll)) {
	442	++PL_collation_ix;
	443	Safefree(PL_collation_name);
	444	PL_collation_name = stdize_locale(savepv(newcoll));
	445	PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
	446
	447	{
	448	/* 2: at most so many chars ('a', 'b'). */
	449	/* 50: surely no system expands a char more. */
	450	#define XFRMBUFSIZE (2 * 50)
	451	char xbuf[XFRMBUFSIZE];
	452	const Size_t fa = strxfrm(xbuf, "a", XFRMBUFSIZE);
	453	const Size_t fb = strxfrm(xbuf, "ab", XFRMBUFSIZE);
	454	const SSize_t mult = fb - fa;
	455	if (mult < 1 && !(fa == 0 && fb == 0))
	456	Perl_croak(aTHX_ "panic: strxfrm() gets absurd - a => %"UVuf", ab => %"UVuf,
	457	(UV) fa, (UV) fb);
	458	PL_collxfrm_base = (fa > (Size_t)mult) ? (fa - mult) : 0;
	459	PL_collxfrm_mult = mult;
	460	}
	461	}
	462
	463	#else
	464	PERL_UNUSED_ARG(newcoll);
	465	#endif /* USE_LOCALE_COLLATE */
	466	}
	467
	468	#ifdef WIN32
	469
	470	char *
	471	Perl_my_setlocale(pTHX_ int category, const char* locale)
	472	{
	473	/* This, for Windows, emulates POSIX setlocale() behavior. There is no
	474	* difference unless the input locale is "", which means on Windows to get
	475	* the machine default, which is set via the computer's "Regional and
	476	* Language Options" (or its current equivalent). In POSIX, it instead
	477	* means to find the locale from the user's environment. This routine
	478	* looks in the environment, and, if anything is found, uses that instead
	479	* of going to the machine default. If there is no environment override,
	480	* the machine default is used, as normal, by calling the real setlocale()
	481	* with "". The POSIX behavior is to use the LC_ALL variable if set;
	482	* otherwise to use the particular category's variable if set; otherwise to
	483	* use the LANG variable. */
	484
	485	bool override_LC_ALL = 0;
	486	char * result;
	487
	488	if (locale && strEQ(locale, "")) {
	489	# ifdef LC_ALL
	490	locale = PerlEnv_getenv("LC_ALL");
	491	if (! locale) {
	492	#endif
	493	switch (category) {
	494	# ifdef LC_ALL
	495	case LC_ALL:
	496	override_LC_ALL = TRUE;
	497	break; /* We already know its variable isn't set */
	498	# endif
	499	# ifdef USE_LOCALE_TIME
	500	case LC_TIME:
	501	locale = PerlEnv_getenv("LC_TIME");
	502	break;
	503	# endif
	504	# ifdef USE_LOCALE_CTYPE
	505	case LC_CTYPE:
	506	locale = PerlEnv_getenv("LC_CTYPE");
	507	break;
	508	# endif
	509	# ifdef USE_LOCALE_COLLATE
	510	case LC_COLLATE:
	511	locale = PerlEnv_getenv("LC_COLLATE");
	512	break;
	513	# endif
	514	# ifdef USE_LOCALE_MONETARY
	515	case LC_MONETARY:
	516	locale = PerlEnv_getenv("LC_MONETARY");
	517	break;
	518	# endif
	519	# ifdef USE_LOCALE_NUMERIC
	520	case LC_NUMERIC:
	521	locale = PerlEnv_getenv("LC_NUMERIC");
	522	break;
	523	# endif
	524	# ifdef USE_LOCALE_MESSAGES
	525	case LC_MESSAGES:
	526	locale = PerlEnv_getenv("LC_MESSAGES");
	527	break;
	528	# endif
	529	default:
	530	/* This is a category, like PAPER_SIZE that we don't
	531	* know about; and so can't provide a wrapper. */
	532	break;
	533	}
	534	if (! locale) {
	535	locale = PerlEnv_getenv("LANG");
	536	if (! locale) {
	537	locale = "";
	538	}
	539	}
	540	# ifdef LC_ALL
	541	}
	542	# endif
	543	}
	544
	545	result = setlocale(category, locale);
	546
	547	if (! override_LC_ALL) {
	548	return result;
	549	}
	550
	551	/* Here the input locale was LC_ALL, and we have set it to what is in the
	552	* LANG variable or the system default if there is no LANG. But these have
	553	* lower priority than the other LC_foo variables, so override it for each
	554	* one that is set. (If they are set to "", it means to use the same thing
	555	* we just set LC_ALL to, so can skip) */
	556	# ifdef USE_LOCALE_TIME
	557	result = PerlEnv_getenv("LC_TIME");
	558	if (result && strNE(result, "")) {
	559	setlocale(LC_TIME, result);
	560	}
	561	# endif
	562	# ifdef USE_LOCALE_CTYPE
	563	result = PerlEnv_getenv("LC_CTYPE");
	564	if (result && strNE(result, "")) {
	565	setlocale(LC_CTYPE, result);
	566	}
	567	# endif
	568	# ifdef USE_LOCALE_COLLATE
	569	result = PerlEnv_getenv("LC_COLLATE");
	570	if (result && strNE(result, "")) {
	571	setlocale(LC_COLLATE, result);
	572	}
	573	# endif
	574	# ifdef USE_LOCALE_MONETARY
	575	result = PerlEnv_getenv("LC_MONETARY");
	576	if (result && strNE(result, "")) {
	577	setlocale(LC_MONETARY, result);
	578	}
	579	# endif
	580	# ifdef USE_LOCALE_NUMERIC
	581	result = PerlEnv_getenv("LC_NUMERIC");
	582	if (result && strNE(result, "")) {
	583	setlocale(LC_NUMERIC, result);
	584	}
	585	# endif
	586	# ifdef USE_LOCALE_MESSAGES
	587	result = PerlEnv_getenv("LC_MESSAGES");
	588	if (result && strNE(result, "")) {
	589	setlocale(LC_MESSAGES, result);
	590	}
	591	# endif
	592
	593	return setlocale(LC_ALL, NULL);
	594
	595	}
	596
	597	#endif
	598
	599
	600	/*
	601	* Initialize locale awareness.
	602	*/
	603	int
	604	Perl_init_i18nl10n(pTHX_ int printwarn)
	605	{
	606	/* printwarn is
	607	*
	608	* 0 if not to output warning when setup locale is bad
	609	* 1 if to output warning based on value of PERL_BADLANG
	610	* >1 if to output regardless of PERL_BADLANG
	611	*
	612	* returns
	613	* 1 = set ok or not applicable,
	614	* 0 = fallback to a locale of lower priority
	615	* -1 = fallback to all locales failed, not even to the C locale
	616	*/
	617
	618	int ok = 1;
	619
	620	#if defined(USE_LOCALE)
	621	#ifdef USE_LOCALE_CTYPE
	622	char *curctype = NULL;
	623	#endif /* USE_LOCALE_CTYPE */
	624	#ifdef USE_LOCALE_COLLATE
	625	char *curcoll = NULL;
	626	#endif /* USE_LOCALE_COLLATE */
	627	#ifdef USE_LOCALE_NUMERIC
	628	char *curnum = NULL;
	629	#endif /* USE_LOCALE_NUMERIC */
	630	#ifdef __GLIBC__
	631	char * const language = PerlEnv_getenv("LANGUAGE");
	632	#endif
	633
	634	/* NULL uses the existing already set up locale */
	635	const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
	636	? NULL
	637	: "";
	638	const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */
	639	unsigned int trial_locales_count;
	640	char * const lc_all = PerlEnv_getenv("LC_ALL");
	641	char * const lang = PerlEnv_getenv("LANG");
	642	bool setlocale_failure = FALSE;
	643	unsigned int i;
	644	char *p;
	645	const bool locwarn = (printwarn > 1 \|\|
	646	(printwarn &&
	647	(!(p = PerlEnv_getenv("PERL_BADLANG")) \|\|
	648	grok_atou(p, NULL))));
	649	bool done = FALSE;
	650	#ifdef WIN32
	651	/* In some systems you can find out the system default locale
	652	* and use that as the fallback locale. */
	653	# define SYSTEM_DEFAULT_LOCALE
	654	#endif
	655	#ifdef SYSTEM_DEFAULT_LOCALE
	656	const char *system_default_locale = NULL;
	657	#endif
	658
	659	#ifndef LOCALE_ENVIRON_REQUIRED
	660	PERL_UNUSED_VAR(done);
	661	#else
	662
	663	/*
	664	* Ultrix setlocale(..., "") fails if there are no environment
	665	* variables from which to get a locale name.
	666	*/
	667
	668	# ifdef LC_ALL
	669	if (lang) {
	670	if (my_setlocale(LC_ALL, setlocale_init))
	671	done = TRUE;
	672	else
	673	setlocale_failure = TRUE;
	674	}
	675	if (!setlocale_failure) {
	676	# ifdef USE_LOCALE_CTYPE
	677	Safefree(curctype);
	678	if (! (curctype =
	679	my_setlocale(LC_CTYPE,
	680	(!done && (lang \|\| PerlEnv_getenv("LC_CTYPE")))
	681	? setlocale_init : NULL)))
	682	setlocale_failure = TRUE;
	683	else
	684	curctype = savepv(curctype);
	685	# endif /* USE_LOCALE_CTYPE */
	686	# ifdef USE_LOCALE_COLLATE
	687	Safefree(curcoll);
	688	if (! (curcoll =
	689	my_setlocale(LC_COLLATE,
	690	(!done && (lang \|\| PerlEnv_getenv("LC_COLLATE")))
	691	? setlocale_init : NULL)))
	692	setlocale_failure = TRUE;
	693	else
	694	curcoll = savepv(curcoll);
	695	# endif /* USE_LOCALE_COLLATE */
	696	# ifdef USE_LOCALE_NUMERIC
	697	Safefree(curnum);
	698	if (! (curnum =
	699	my_setlocale(LC_NUMERIC,
	700	(!done && (lang \|\| PerlEnv_getenv("LC_NUMERIC")))
	701	? setlocale_init : NULL)))
	702	setlocale_failure = TRUE;
	703	else
	704	curnum = savepv(curnum);
	705	# endif /* USE_LOCALE_NUMERIC */
	706	# ifdef USE_LOCALE_MESSAGES
	707	if (! my_setlocale(LC_MESSAGES,
	708	(!done && (lang \|\| PerlEnv_getenv("LC_MESSAGES")))
	709	? setlocale_init : NULL))
	710	{
	711	setlocale_failure = TRUE;
	712	}
	713	# endif /* USE_LOCALE_MESSAGES */
	714	# ifdef USE_LOCALE_MONETARY
	715	if (! my_setlocale(LC_MONETARY,
	716	(!done && (lang \|\| PerlEnv_getenv("LC_MONETARY")))
	717	? setlocale_init : NULL))
	718	{
	719	setlocale_failure = TRUE;
	720	}
	721	# endif /* USE_LOCALE_MONETARY */
	722	}
	723
	724	# endif /* LC_ALL */
	725
	726	#endif /* !LOCALE_ENVIRON_REQUIRED */
	727
	728	/* We try each locale in the list until we get one that works, or exhaust
	729	* the list */
	730	trial_locales[0] = setlocale_init;
	731	trial_locales_count = 1;
	732	for (i= 0; i < trial_locales_count; i++) {
	733	const char * trial_locale = trial_locales[i];
	734
	735	if (i > 0) {
	736
	737	/* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
	738	* when i==0, but I (khw) don't think that behavior makes much
	739	* sense */
	740	setlocale_failure = FALSE;
	741
	742	#ifdef SYSTEM_DEFAULT_LOCALE
	743	# ifdef WIN32
	744	/* On Windows machines, an entry of "" after the 0th means to use
	745	* the system default locale, which we now proceed to get. */
	746	if (strEQ(trial_locale, "")) {
	747	unsigned int j;
	748
	749	/* Note that this may change the locale, but we are going to do
	750	* that anyway just below */
	751	system_default_locale = setlocale(LC_ALL, "");
	752
	753	/* Skip if invalid or it's already on the list of locales to
	754	* try */
	755	if (! system_default_locale) {
	756	goto next_iteration;
	757	}
	758	for (j = 0; j < trial_locales_count; j++) {
	759	if (strEQ(system_default_locale, trial_locales[j])) {
	760	goto next_iteration;
	761	}
	762	}
	763
	764	trial_locale = system_default_locale;
	765	}
	766	# endif /* WIN32 */
	767	#endif /* SYSTEM_DEFAULT_LOCALE */
	768	}
	769
	770	#ifdef LC_ALL
	771	if (! my_setlocale(LC_ALL, trial_locale)) {
	772	setlocale_failure = TRUE;
	773	}
	774	else {
	775	/* Since LC_ALL succeeded, it should have changed all the other
	776	* categories it can to its value; so we massage things so that the
	777	* setlocales below just return their category's current values.
	778	* This adequately handles the case in NetBSD where LC_COLLATE may
	779	* not be defined for a locale, and setting it individually will
	780	* fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
	781	* the POSIX locale. */
	782	trial_locale = NULL;
	783	}
	784	#endif /* LC_ALL */
	785
	786	if (!setlocale_failure) {
	787	#ifdef USE_LOCALE_CTYPE
	788	Safefree(curctype);
	789	if (! (curctype = my_setlocale(LC_CTYPE, trial_locale)))
	790	setlocale_failure = TRUE;
	791	else
	792	curctype = savepv(curctype);
	793	#endif /* USE_LOCALE_CTYPE */
	794	#ifdef USE_LOCALE_COLLATE
	795	Safefree(curcoll);
	796	if (! (curcoll = my_setlocale(LC_COLLATE, trial_locale)))
	797	setlocale_failure = TRUE;
	798	else
	799	curcoll = savepv(curcoll);
	800	#endif /* USE_LOCALE_COLLATE */
	801	#ifdef USE_LOCALE_NUMERIC
	802	Safefree(curnum);
	803	if (! (curnum = my_setlocale(LC_NUMERIC, trial_locale)))
	804	setlocale_failure = TRUE;
	805	else
	806	curnum = savepv(curnum);
	807	#endif /* USE_LOCALE_NUMERIC */
	808	#ifdef USE_LOCALE_MESSAGES
	809	if (! (my_setlocale(LC_MESSAGES, trial_locale)))
	810	setlocale_failure = TRUE;
	811	#endif /* USE_LOCALE_MESSAGES */
	812	#ifdef USE_LOCALE_MONETARY
	813	if (! (my_setlocale(LC_MONETARY, trial_locale)))
	814	setlocale_failure = TRUE;
	815	#endif /* USE_LOCALE_MONETARY */
	816
	817	if (! setlocale_failure) { /* Success */
	818	break;
	819	}
	820	}
	821
	822	/* Here, something failed; will need to try a fallback. */
	823	ok = 0;
	824
	825	if (i == 0) {
	826	unsigned int j;
	827
	828	if (locwarn) { /* Output failure info only on the first one */
	829	#ifdef LC_ALL
	830
	831	PerlIO_printf(Perl_error_log,
	832	"perl: warning: Setting locale failed.\n");
	833
	834	#else /* !LC_ALL */
	835
	836	PerlIO_printf(Perl_error_log,
	837	"perl: warning: Setting locale failed for the categories:\n\t");
	838	#ifdef USE_LOCALE_CTYPE
	839	if (! curctype)
	840	PerlIO_printf(Perl_error_log, "LC_CTYPE ");
	841	#endif /* USE_LOCALE_CTYPE */
	842	#ifdef USE_LOCALE_COLLATE
	843	if (! curcoll)
	844	PerlIO_printf(Perl_error_log, "LC_COLLATE ");
	845	#endif /* USE_LOCALE_COLLATE */
	846	#ifdef USE_LOCALE_NUMERIC
	847	if (! curnum)
	848	PerlIO_printf(Perl_error_log, "LC_NUMERIC ");
	849	#endif /* USE_LOCALE_NUMERIC */
	850	PerlIO_printf(Perl_error_log, "and possibly others\n");
	851
	852	#endif /* LC_ALL */
	853
	854	PerlIO_printf(Perl_error_log,
	855	"perl: warning: Please check that your locale settings:\n");
	856
	857	#ifdef __GLIBC__
	858	PerlIO_printf(Perl_error_log,
	859	"\tLANGUAGE = %c%s%c,\n",
	860	language ? '"' : '(',
	861	language ? language : "unset",
	862	language ? '"' : ')');
	863	#endif
	864
	865	PerlIO_printf(Perl_error_log,
	866	"\tLC_ALL = %c%s%c,\n",
	867	lc_all ? '"' : '(',
	868	lc_all ? lc_all : "unset",
	869	lc_all ? '"' : ')');
	870
	871	#if defined(USE_ENVIRON_ARRAY)
	872	{
	873	char **e;
	874	for (e = environ; *e; e++) {
	875	if (strnEQ(*e, "LC_", 3)
	876	&& strnNE(*e, "LC_ALL=", 7)
	877	&& (p = strchr(*e, '=')))
	878	PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
	879	(int)(p - e), e, p + 1);
	880	}
	881	}
	882	#else
	883	PerlIO_printf(Perl_error_log,
	884	"\t(possibly more locale environment variables)\n");
	885	#endif
	886
	887	PerlIO_printf(Perl_error_log,
	888	"\tLANG = %c%s%c\n",
	889	lang ? '"' : '(',
	890	lang ? lang : "unset",
	891	lang ? '"' : ')');
	892
	893	PerlIO_printf(Perl_error_log,
	894	" are supported and installed on your system.\n");
	895	}
	896
	897	/* Calculate what fallback locales to try. We have avoided this
	898	* until we have to, because failure is quite unlikely. This will
	899	* usually change the upper bound of the loop we are in.
	900	*
	901	* Since the system's default way of setting the locale has not
	902	* found one that works, We use Perl's defined ordering: LC_ALL,
	903	* LANG, and the C locale. We don't try the same locale twice, so
	904	* don't add to the list if already there. (On POSIX systems, the
	905	* LC_ALL element will likely be a repeat of the 0th element "",
	906	* but there's no harm done by doing it explicitly */
	907	if (lc_all) {
	908	for (j = 0; j < trial_locales_count; j++) {
	909	if (strEQ(lc_all, trial_locales[j])) {
	910	goto done_lc_all;
	911	}
	912	}
	913	trial_locales[trial_locales_count++] = lc_all;
	914	}
	915	done_lc_all:
	916
	917	if (lang) {
	918	for (j = 0; j < trial_locales_count; j++) {
	919	if (strEQ(lang, trial_locales[j])) {
	920	goto done_lang;
	921	}
	922	}
	923	trial_locales[trial_locales_count++] = lang;
	924	}
	925	done_lang:
	926
	927	#if defined(WIN32) && defined(LC_ALL)
	928	/* For Windows, we also try the system default locale before "C".
	929	* (If there exists a Windows without LC_ALL we skip this because
	930	* it gets too complicated. For those, the "C" is the next
	931	* fallback possibility). The "" is the same as the 0th element of
	932	* the array, but the code at the loop above knows to treat it
	933	* differently when not the 0th */
	934	trial_locales[trial_locales_count++] = "";
	935	#endif
	936
	937	for (j = 0; j < trial_locales_count; j++) {
	938	if (strEQ("C", trial_locales[j])) {
	939	goto done_C;
	940	}
	941	}
	942	trial_locales[trial_locales_count++] = "C";
	943
	944	done_C: ;
	945	} /* end of first time through the loop */
	946
	947	#ifdef WIN32
	948	next_iteration: ;
	949	#endif
	950
	951	} /* end of looping through the trial locales */
	952
	953	if (ok < 1) { /* If we tried to fallback */
	954	const char* msg;
	955	if (! setlocale_failure) { /* fallback succeeded */
	956	msg = "Falling back to";
	957	}
	958	else { /* fallback failed */
	959
	960	/* We dropped off the end of the loop, so have to decrement i to
	961	* get back to the value the last time through */
	962	i--;
	963
	964	ok = -1;
	965	msg = "Failed to fall back to";
	966
	967	/* To continue, we should use whatever values we've got */
	968	#ifdef USE_LOCALE_CTYPE
	969	Safefree(curctype);
	970	curctype = savepv(setlocale(LC_CTYPE, NULL));
	971	#endif /* USE_LOCALE_CTYPE */
	972	#ifdef USE_LOCALE_COLLATE
	973	Safefree(curcoll);
	974	curcoll = savepv(setlocale(LC_COLLATE, NULL));
	975	#endif /* USE_LOCALE_COLLATE */
	976	#ifdef USE_LOCALE_NUMERIC
	977	Safefree(curnum);
	978	curnum = savepv(setlocale(LC_NUMERIC, NULL));
	979	#endif /* USE_LOCALE_NUMERIC */
	980	}
	981
	982	if (locwarn) {
	983	const char * description;
	984	const char * name = "";
	985	if (strEQ(trial_locales[i], "C")) {
	986	description = "the standard locale";
	987	name = "C";
	988	}
	989	#ifdef SYSTEM_DEFAULT_LOCALE
	990	else if (strEQ(trial_locales[i], "")) {
	991	description = "the system default locale";
	992	if (system_default_locale) {
	993	name = system_default_locale;
	994	}
	995	}
	996	#endif /* SYSTEM_DEFAULT_LOCALE */
	997	else {
	998	description = "a fallback locale";
	999	name = trial_locales[i];
	1000	}
	1001	if (name && strNE(name, "")) {
	1002	PerlIO_printf(Perl_error_log,
	1003	"perl: warning: %s %s (\"%s\").\n", msg, description, name);
	1004	}
	1005	else {
	1006	PerlIO_printf(Perl_error_log,
	1007	"perl: warning: %s %s.\n", msg, description);
	1008	}
	1009	}
	1010	} /* End of tried to fallback */
	1011
	1012	#ifdef USE_LOCALE_CTYPE
	1013	new_ctype(curctype);
	1014	#endif /* USE_LOCALE_CTYPE */
	1015
	1016	#ifdef USE_LOCALE_COLLATE
	1017	new_collate(curcoll);
	1018	#endif /* USE_LOCALE_COLLATE */
	1019
	1020	#ifdef USE_LOCALE_NUMERIC
	1021	new_numeric(curnum);
	1022	#endif /* USE_LOCALE_NUMERIC */
	1023
	1024	#if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
	1025	/* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
	1026	* locale is UTF-8. If PL_utf8locale and PL_unicode (set by -C or by
	1027	* $ENV{PERL_UNICODE}) are true, perl.c:S_parse_body() will turn on the
	1028	* PerlIO :utf8 layer on STDIN, STDOUT, STDERR, _and_ the default open
	1029	* discipline. */
	1030	PL_utf8locale = _is_cur_LC_category_utf8(LC_CTYPE);
	1031
	1032	/* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
	1033	This is an alternative to using the -C command line switch
	1034	(the -C if present will override this). */
	1035	{
	1036	const char *p = PerlEnv_getenv("PERL_UNICODE");
	1037	PL_unicode = p ? parse_unicode_opts(&p) : 0;
	1038	if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
	1039	PL_utf8cache = -1;
	1040	}
	1041	#endif
	1042
	1043	#ifdef USE_LOCALE_CTYPE
	1044	Safefree(curctype);
	1045	#endif /* USE_LOCALE_CTYPE */
	1046	#ifdef USE_LOCALE_COLLATE
	1047	Safefree(curcoll);
	1048	#endif /* USE_LOCALE_COLLATE */
	1049	#ifdef USE_LOCALE_NUMERIC
	1050	Safefree(curnum);
	1051	#endif /* USE_LOCALE_NUMERIC */
	1052
	1053	#else /* !USE_LOCALE */
	1054	PERL_UNUSED_ARG(printwarn);
	1055	#endif /* USE_LOCALE */
	1056
	1057	return ok;
	1058	}
	1059
	1060
	1061	#ifdef USE_LOCALE_COLLATE
	1062
	1063	/*
	1064	* mem_collxfrm() is a bit like strxfrm() but with two important
	1065	* differences. First, it handles embedded NULs. Second, it allocates
	1066	* a bit more memory than needed for the transformed data itself.
	1067	* The real transformed data begins at offset sizeof(collationix).
	1068	* Please see sv_collxfrm() to see how this is used.
	1069	*/
	1070
	1071	char *
	1072	Perl_mem_collxfrm(pTHX_ const char s, STRLEN len, STRLEN xlen)
	1073	{
	1074	char *xbuf;
	1075	STRLEN xAlloc, xin, xout; /* xalloc is a reserved word in VC */
	1076
	1077	PERL_ARGS_ASSERT_MEM_COLLXFRM;
	1078
	1079	/* the first sizeof(collationix) bytes are used by sv_collxfrm(). */
	1080	/* the +1 is for the terminating NUL. */
	1081
	1082	xAlloc = sizeof(PL_collation_ix) + PL_collxfrm_base + (PL_collxfrm_mult * len) + 1;
	1083	Newx(xbuf, xAlloc, char);
	1084	if (! xbuf)
	1085	goto bad;
	1086
	1087	(U32)xbuf = PL_collation_ix;
	1088	xout = sizeof(PL_collation_ix);
	1089	for (xin = 0; xin < len; ) {
	1090	Size_t xused;
	1091
	1092	for (;;) {
	1093	xused = strxfrm(xbuf + xout, s + xin, xAlloc - xout);
	1094	if (xused >= PERL_INT_MAX)
	1095	goto bad;
	1096	if ((STRLEN)xused < xAlloc - xout)
	1097	break;
	1098	xAlloc = (2 * xAlloc) + 1;
	1099	Renew(xbuf, xAlloc, char);
	1100	if (! xbuf)
	1101	goto bad;
	1102	}
	1103
	1104	xin += strlen(s + xin) + 1;
	1105	xout += xused;
	1106
	1107	/* Embedded NULs are understood but silently skipped
	1108	* because they make no sense in locale collation. */
	1109	}
	1110
	1111	xbuf[xout] = '\0';
	1112	*xlen = xout - sizeof(PL_collation_ix);
	1113	return xbuf;
	1114
	1115	bad:
	1116	Safefree(xbuf);
	1117	*xlen = 0;
	1118	return NULL;
	1119	}
	1120
	1121	#endif /* USE_LOCALE_COLLATE */
	1122
	1123	#ifdef USE_LOCALE
	1124
	1125	bool
	1126	Perl__is_cur_LC_category_utf8(pTHX_ int category)
	1127	{
	1128	/* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
	1129	* otherwise. 'category' may not be LC_ALL. If the platform doesn't have
	1130	* nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
	1131	* could give the wrong result. The result will very likely be correct for
	1132	* languages that have commonly used non-ASCII characters, but for notably
	1133	* English, it comes down to if the locale's name ends in something like
	1134	* "UTF-8". It errs on the side of not being a UTF-8 locale. */
	1135
	1136	char *save_input_locale = NULL;
	1137	STRLEN final_pos;
	1138
	1139	#ifdef LC_ALL
	1140	assert(category != LC_ALL);
	1141	#endif
	1142
	1143	/* First dispose of the trivial cases */
	1144	save_input_locale = setlocale(category, NULL);
	1145	if (! save_input_locale) {
	1146	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1147	"Could not find current locale for category %d\n",
	1148	category));
	1149	return FALSE; /* XXX maybe should croak */
	1150	}
	1151	save_input_locale = stdize_locale(savepv(save_input_locale));
	1152	if (isNAME_C_OR_POSIX(save_input_locale)) {
	1153	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1154	"Current locale for category %d is %s\n",
	1155	category, save_input_locale));
	1156	Safefree(save_input_locale);
	1157	return FALSE;
	1158	}
	1159
	1160	#if defined(USE_LOCALE_CTYPE) \
	1161	&& (defined(MB_CUR_MAX) \|\| (defined(HAS_NL_LANGINFO) && defined(CODESET)))
	1162
	1163	{ /* Next try nl_langinfo or MB_CUR_MAX if available */
	1164
	1165	char *save_ctype_locale = NULL;
	1166	bool is_utf8;
	1167
	1168	if (category != LC_CTYPE) { /* These work only on LC_CTYPE */
	1169
	1170	/* Get the current LC_CTYPE locale */
	1171	save_ctype_locale = setlocale(LC_CTYPE, NULL);
	1172	if (! save_ctype_locale) {
	1173	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1174	"Could not find current locale for LC_CTYPE\n"));
	1175	goto cant_use_nllanginfo;
	1176	}
	1177	save_ctype_locale = stdize_locale(savepv(save_ctype_locale));
	1178
	1179	/* If LC_CTYPE and the desired category use the same locale, this
	1180	* means that finding the value for LC_CTYPE is the same as finding
	1181	* the value for the desired category. Otherwise, switch LC_CTYPE
	1182	* to the desired category's locale */
	1183	if (strEQ(save_ctype_locale, save_input_locale)) {
	1184	Safefree(save_ctype_locale);
	1185	save_ctype_locale = NULL;
	1186	}
	1187	else if (! setlocale(LC_CTYPE, save_input_locale)) {
	1188	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1189	"Could not change LC_CTYPE locale to %s\n",
	1190	save_input_locale));
	1191	Safefree(save_ctype_locale);
	1192	goto cant_use_nllanginfo;
	1193	}
	1194	}
	1195
	1196	DEBUG_L(PerlIO_printf(Perl_debug_log, "Current LC_CTYPE locale=%s\n",
	1197	save_input_locale));
	1198
	1199	/* Here the current LC_CTYPE is set to the locale of the category whose
	1200	* information is desired. This means that nl_langinfo() and MB_CUR_MAX
	1201	* should give the correct results */
	1202
	1203	# if defined(HAS_NL_LANGINFO) && defined(CODESET)
	1204	{
	1205	char *codeset = nl_langinfo(CODESET);
	1206	if (codeset && strNE(codeset, "")) {
	1207	codeset = savepv(codeset);
	1208
	1209	/* If we switched LC_CTYPE, switch back */
	1210	if (save_ctype_locale) {
	1211	setlocale(LC_CTYPE, save_ctype_locale);
	1212	Safefree(save_ctype_locale);
	1213	}
	1214
	1215	is_utf8 = foldEQ(codeset, STR_WITH_LEN("UTF-8"))
	1216	\|\| foldEQ(codeset, STR_WITH_LEN("UTF8"));
	1217
	1218	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1219	"\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
	1220	codeset, is_utf8));
	1221	Safefree(codeset);
	1222	Safefree(save_input_locale);
	1223	return is_utf8;
	1224	}
	1225	}
	1226
	1227	# endif
	1228	# ifdef MB_CUR_MAX
	1229
	1230	/* Here, either we don't have nl_langinfo, or it didn't return a
	1231	* codeset. Try MB_CUR_MAX */
	1232
	1233	/* Standard UTF-8 needs at least 4 bytes to represent the maximum
	1234	* Unicode code point. Since UTF-8 is the only non-single byte
	1235	* encoding we handle, we just say any such encoding is UTF-8, and if
	1236	* turns out to be wrong, other things will fail */
	1237	is_utf8 = MB_CUR_MAX >= 4;
	1238
	1239	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1240	"\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n",
	1241	(int) MB_CUR_MAX, is_utf8));
	1242
	1243	Safefree(save_input_locale);
	1244
	1245	# ifdef HAS_MBTOWC
	1246
	1247	/* ... But, most system that have MB_CUR_MAX will also have mbtowc(),
	1248	* since they are both in the C99 standard. We can feed a known byte
	1249	* string to the latter function, and check that it gives the expected
	1250	* result */
	1251	if (is_utf8) {
	1252	wchar_t wc;
	1253	PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
	1254	errno = 0;
	1255	if ((size_t)mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8))
	1256	!= strlen(HYPHEN_UTF8)
	1257	\|\| wc != (wchar_t) 0x2010)
	1258	{
	1259	is_utf8 = FALSE;
	1260	DEBUG_L(PerlIO_printf(Perl_debug_log, "\thyphen=U+%x\n", (unsigned int)wc));
	1261	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1262	"\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n",
	1263	mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8)), errno));
	1264	}
	1265	}
	1266	# endif
	1267
	1268	/* If we switched LC_CTYPE, switch back */
	1269	if (save_ctype_locale) {
	1270	setlocale(LC_CTYPE, save_ctype_locale);
	1271	Safefree(save_ctype_locale);
	1272	}
	1273
	1274	return is_utf8;
	1275	# endif
	1276	}
	1277
	1278	cant_use_nllanginfo:
	1279
	1280	#else /* nl_langinfo should work if available, so don't bother compiling this
	1281	fallback code. The final fallback of looking at the name is
	1282	compiled, and will be executed if nl_langinfo fails */
	1283
	1284	/* nl_langinfo not available or failed somehow. Next try looking at the
	1285	* currency symbol to see if it disambiguates things. Often that will be
	1286	* in the native script, and if the symbol isn't in UTF-8, we know that the
	1287	* locale isn't. If it is non-ASCII UTF-8, we infer that the locale is
	1288	* too, as the odds of a non-UTF8 string being valid UTF-8 are quite small
	1289	* */
	1290
	1291	#ifdef HAS_LOCALECONV
	1292	# ifdef USE_LOCALE_MONETARY
	1293	{
	1294	char *save_monetary_locale = NULL;
	1295	bool only_ascii = FALSE;
	1296	bool is_utf8 = FALSE;
	1297	struct lconv* lc;
	1298
	1299	/* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of
	1300	* the desired category, if it isn't that locale already */
	1301
	1302	if (category != LC_MONETARY) {
	1303
	1304	save_monetary_locale = setlocale(LC_MONETARY, NULL);
	1305	if (! save_monetary_locale) {
	1306	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1307	"Could not find current locale for LC_MONETARY\n"));
	1308	goto cant_use_monetary;
	1309	}
	1310	save_monetary_locale = stdize_locale(savepv(save_monetary_locale));
	1311
	1312	if (strEQ(save_monetary_locale, save_input_locale)) {
	1313	Safefree(save_monetary_locale);
	1314	save_monetary_locale = NULL;
	1315	}
	1316	else if (! setlocale(LC_MONETARY, save_input_locale)) {
	1317	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1318	"Could not change LC_MONETARY locale to %s\n",
	1319	save_input_locale));
	1320	Safefree(save_monetary_locale);
	1321	goto cant_use_monetary;
	1322	}
	1323	}
	1324
	1325	/* Here the current LC_MONETARY is set to the locale of the category
	1326	* whose information is desired. */
	1327
	1328	lc = localeconv();
	1329	if (! lc
	1330	\|\| ! lc->currency_symbol
	1331	\|\| is_invariant_string((U8 *) lc->currency_symbol, 0))
	1332	{
	1333	DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
	1334	only_ascii = TRUE;
	1335	}
	1336	else {
	1337	is_utf8 = is_utf8_string((U8 *) lc->currency_symbol, 0);
	1338	}
	1339
	1340	/* If we changed it, restore LC_MONETARY to its original locale */
	1341	if (save_monetary_locale) {
	1342	setlocale(LC_MONETARY, save_monetary_locale);
	1343	Safefree(save_monetary_locale);
	1344	}
	1345
	1346	if (! only_ascii) {
	1347
	1348	/* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
	1349	* otherwise assume the locale is UTF-8 if and only if the symbol
	1350	* is non-ascii UTF-8. */
	1351	DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
	1352	save_input_locale, is_utf8));
	1353	Safefree(save_input_locale);
	1354	return is_utf8;
	1355	}
	1356	}
	1357	cant_use_monetary:
	1358
	1359	# endif /* USE_LOCALE_MONETARY */
	1360	#endif /* HAS_LOCALECONV */
	1361
	1362	#if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
	1363
	1364	/* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
	1365	* the names of the months and weekdays, timezone, and am/pm indicator */
	1366	{
	1367	char *save_time_locale = NULL;
	1368	int hour = 10;
	1369	bool is_dst = FALSE;
	1370	int dom = 1;
	1371	int month = 0;
	1372	int i;
	1373	char * formatted_time;
	1374
	1375
	1376	/* Like above for LC_MONETARY, we set LC_TIME to the locale of the
	1377	* desired category, if it isn't that locale already */
	1378
	1379	if (category != LC_TIME) {
	1380
	1381	save_time_locale = setlocale(LC_TIME, NULL);
	1382	if (! save_time_locale) {
	1383	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1384	"Could not find current locale for LC_TIME\n"));
	1385	goto cant_use_time;
	1386	}
	1387	save_time_locale = stdize_locale(savepv(save_time_locale));
	1388
	1389	if (strEQ(save_time_locale, save_input_locale)) {
	1390	Safefree(save_time_locale);
	1391	save_time_locale = NULL;
	1392	}
	1393	else if (! setlocale(LC_TIME, save_input_locale)) {
	1394	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1395	"Could not change LC_TIME locale to %s\n",
	1396	save_input_locale));
	1397	Safefree(save_time_locale);
	1398	goto cant_use_time;
	1399	}
	1400	}
	1401
	1402	/* Here the current LC_TIME is set to the locale of the category
	1403	* whose information is desired. Look at all the days of the week and
	1404	* month names, and the timezone and am/pm indicator for UTF-8 variant
	1405	* characters. The first such a one found will tell us if the locale
	1406	* is UTF-8 or not */
	1407
	1408	for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
	1409	formatted_time = my_strftime("%A %B %Z %p",
	1410	0, 0, hour, dom, month, 112, 0, 0, is_dst);
	1411	if (! formatted_time \|\| is_invariant_string((U8 *) formatted_time, 0)) {
	1412
	1413	/* Here, we didn't find a non-ASCII. Try the next time through
	1414	* with the complemented dst and am/pm, and try with the next
	1415	* weekday. After we have gotten all weekdays, try the next
	1416	* month */
	1417	is_dst = ! is_dst;
	1418	hour = (hour + 12) % 24;
	1419	dom++;
	1420	if (i > 6) {
	1421	month++;
	1422	}
	1423	continue;
	1424	}
	1425
	1426	/* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
	1427	* false otherwise. But first, restore LC_TIME to its original
	1428	* locale if we changed it */
	1429	if (save_time_locale) {
	1430	setlocale(LC_TIME, save_time_locale);
	1431	Safefree(save_time_locale);
	1432	}
	1433
	1434	DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
	1435	save_input_locale,
	1436	is_utf8_string((U8 *) formatted_time, 0)));
	1437	Safefree(save_input_locale);
	1438	return is_utf8_string((U8 *) formatted_time, 0);
	1439	}
	1440
	1441	/* Falling off the end of the loop indicates all the names were just
	1442	* ASCII. Go on to the next test. If we changed it, restore LC_TIME
	1443	* to its original locale */
	1444	if (save_time_locale) {
	1445	setlocale(LC_TIME, save_time_locale);
	1446	Safefree(save_time_locale);
	1447	}
	1448	DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
	1449	}
	1450	cant_use_time:
	1451
	1452	#endif
	1453
	1454	#if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
	1455
	1456	/* This code is ifdefd out because it was found to not be necessary in testing
	1457	* on our dromedary test machine, which has over 700 locales. There, this
	1458	* added no value to looking at the currency symbol and the time strings. I
	1459	* left it in so as to avoid rewriting it if real-world experience indicates
	1460	* that dromedary is an outlier. Essentially, instead of returning abpve if we
	1461	* haven't found illegal utf8, we continue on and examine all the strerror()
	1462	* messages on the platform for utf8ness. If all are ASCII, we still don't
	1463	* know the answer; but otherwise we have a pretty good indication of the
	1464	* utf8ness. The reason this doesn't help much is that the messages may not
	1465	* have been translated into the locale. The currency symbol and time strings
	1466	* are much more likely to have been translated. */
	1467	{
	1468	int e;
	1469	bool is_utf8 = FALSE;
	1470	bool non_ascii = FALSE;
	1471	char *save_messages_locale = NULL;
	1472	const char * errmsg = NULL;
	1473
	1474	/* Like above, we set LC_MESSAGES to the locale of the desired
	1475	* category, if it isn't that locale already */
	1476
	1477	if (category != LC_MESSAGES) {
	1478
	1479	save_messages_locale = setlocale(LC_MESSAGES, NULL);
	1480	if (! save_messages_locale) {
	1481	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1482	"Could not find current locale for LC_MESSAGES\n"));
	1483	goto cant_use_messages;
	1484	}
	1485	save_messages_locale = stdize_locale(savepv(save_messages_locale));
	1486
	1487	if (strEQ(save_messages_locale, save_input_locale)) {
	1488	Safefree(save_messages_locale);
	1489	save_messages_locale = NULL;
	1490	}
	1491	else if (! setlocale(LC_MESSAGES, save_input_locale)) {
	1492	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1493	"Could not change LC_MESSAGES locale to %s\n",
	1494	save_input_locale));
	1495	Safefree(save_messages_locale);
	1496	goto cant_use_messages;
	1497	}
	1498	}
	1499
	1500	/* Here the current LC_MESSAGES is set to the locale of the category
	1501	* whose information is desired. Look through all the messages. We
	1502	* can't use Strerror() here because it may expand to code that
	1503	* segfaults in miniperl */
	1504
	1505	for (e = 0; e <= sys_nerr; e++) {
	1506	errno = 0;
	1507	errmsg = sys_errlist[e];
	1508	if (errno \|\| !errmsg) {
	1509	break;
	1510	}
	1511	errmsg = savepv(errmsg);
	1512	if (! is_invariant_string((U8 *) errmsg, 0)) {
	1513	non_ascii = TRUE;
	1514	is_utf8 = is_utf8_string((U8 *) errmsg, 0);
	1515	break;
	1516	}
	1517	}
	1518	Safefree(errmsg);
	1519
	1520	/* And, if we changed it, restore LC_MESSAGES to its original locale */
	1521	if (save_messages_locale) {
	1522	setlocale(LC_MESSAGES, save_messages_locale);
	1523	Safefree(save_messages_locale);
	1524	}
	1525
	1526	if (non_ascii) {
	1527
	1528	/* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
	1529	* any non-ascii means it is one; otherwise we assume it isn't */
	1530	DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
	1531	save_input_locale,
	1532	is_utf8));
	1533	Safefree(save_input_locale);
	1534	return is_utf8;
	1535	}
	1536
	1537	DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
	1538	}
	1539	cant_use_messages:
	1540
	1541	#endif
	1542
	1543	#endif /* the code that is compiled when no nl_langinfo */
	1544
	1545	#ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
	1546	UTF-8 locale */
	1547	/* As a last resort, look at the locale name to see if it matches
	1548	* qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
	1549	* return of setlocale(), is actually defined to be opaque, so we can't
	1550	* really rely on the absence of various substrings in the name to indicate
	1551	* its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
	1552	* be a UTF-8 locale. Similarly for the other common names */
	1553
	1554	final_pos = strlen(save_input_locale) - 1;
	1555	if (final_pos >= 3) {
	1556	char *name = save_input_locale;
	1557
	1558	/* Find next 'U' or 'u' and look from there */
	1559	while ((name += strcspn(name, "Uu") + 1)
	1560	<= save_input_locale + final_pos - 2)
	1561	{
	1562	if (!isALPHA_FOLD_NE(*name, 't')
	1563	\|\| isALPHA_FOLD_NE(*(name + 1), 'f'))
	1564	{
	1565	continue;
	1566	}
	1567	name += 2;
	1568	if (*(name) == '-') {
	1569	if ((name > save_input_locale + final_pos - 1)) {
	1570	break;
	1571	}
	1572	name++;
	1573	}
	1574	if (*(name) == '8') {
	1575	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1576	"Locale %s ends with UTF-8 in name\n",
	1577	save_input_locale));
	1578	Safefree(save_input_locale);
	1579	return TRUE;
	1580	}
	1581	}
	1582	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1583	"Locale %s doesn't end with UTF-8 in name\n",
	1584	save_input_locale));
	1585	}
	1586	#endif
	1587
	1588	#ifdef WIN32
	1589	/* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
	1590	if (final_pos >= 4
	1591	&& *(save_input_locale + final_pos - 0) == '1'
	1592	&& *(save_input_locale + final_pos - 1) == '0'
	1593	&& *(save_input_locale + final_pos - 2) == '0'
	1594	&& *(save_input_locale + final_pos - 3) == '5'
	1595	&& *(save_input_locale + final_pos - 4) == '6')
	1596	{
	1597	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1598	"Locale %s ends with 10056 in name, is UTF-8 locale\n",
	1599	save_input_locale));
	1600	Safefree(save_input_locale);
	1601	return TRUE;
	1602	}
	1603	#endif
	1604
	1605	/* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
	1606	* since we are about to return FALSE anyway, there is no point in doing
	1607	* this extra work */
	1608	#if 0
	1609	if (instr(save_input_locale, "8859")) {
	1610	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1611	"Locale %s has 8859 in name, not UTF-8 locale\n",
	1612	save_input_locale));
	1613	Safefree(save_input_locale);
	1614	return FALSE;
	1615	}
	1616	#endif
	1617
	1618	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1619	"Assuming locale %s is not a UTF-8 locale\n",
	1620	save_input_locale));
	1621	Safefree(save_input_locale);
	1622	return FALSE;
	1623	}
	1624
	1625	#endif
	1626
	1627
	1628	bool
	1629	Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
	1630	{
	1631	dVAR;
	1632	/* Internal function which returns if we are in the scope of a pragma that
	1633	* enables the locale category 'category'. 'compiling' should indicate if
	1634	* this is during the compilation phase (TRUE) or not (FALSE). */
	1635
	1636	const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
	1637
	1638	SV *categories = cop_hints_fetch_pvs(cop, "locale", 0);
	1639	if (! categories \|\| categories == &PL_sv_placeholder) {
	1640	return FALSE;
	1641	}
	1642
	1643	/* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
	1644	* a valid unsigned */
	1645	assert(category >= -1);
	1646	return cBOOL(SvUV(categories) & (1U << (category + 1)));
	1647	}
	1648
	1649	char *
	1650	Perl_my_strerror(pTHX_ const int errnum) {
	1651
	1652	/* Uses C locale for the error text unless within scope of 'use locale' for
	1653	* LC_MESSAGES */
	1654
	1655	#ifdef USE_LOCALE_MESSAGES
	1656	if (! IN_LC(LC_MESSAGES)) {
	1657	char * save_locale = setlocale(LC_MESSAGES, NULL);
	1658	if (! isNAME_C_OR_POSIX(save_locale)) {
	1659	char *errstr;
	1660
	1661	/* The next setlocale likely will zap this, so create a copy */
	1662	save_locale = savepv(save_locale);
	1663
	1664	setlocale(LC_MESSAGES, "C");
	1665
	1666	/* This points to the static space in Strerror, with all its
	1667	* limitations */
	1668	errstr = Strerror(errnum);
	1669
	1670	setlocale(LC_MESSAGES, save_locale);
	1671	Safefree(save_locale);
	1672	return errstr;
	1673	}
	1674	}
	1675	#endif
	1676
	1677	return Strerror(errnum);
	1678	}
	1679
	1680	/*
	1681
	1682	=head1 Locale-related functions and macros
	1683
	1684	=for apidoc sync_locale
	1685
	1686	Changing the program's locale should be avoided by XS code. Nevertheless,
	1687	certain non-Perl libraries called from XS, such as C<Gtk> do so. When this
	1688	happens, Perl needs to be told that the locale has changed. Use this function
	1689	to do so, before returning to Perl.
	1690
	1691	=cut
	1692	*/
	1693
	void
	Perl_sync_locale(pTHX)
	{
	    /* Re-reads the program's current locale, one category at a time, and
	     * passes each name to the matching new_*() routine.  Only the
	     * categories selected at build time are handled. */

	#ifdef USE_LOCALE_CTYPE
	    {
	        char * const ctype_name = setlocale(LC_CTYPE, NULL);
	        new_ctype(ctype_name);
	    }
	#endif /* USE_LOCALE_CTYPE */

	#ifdef USE_LOCALE_COLLATE
	    {
	        char * const collate_name = setlocale(LC_COLLATE, NULL);
	        new_collate(collate_name);
	    }
	#endif

	#ifdef USE_LOCALE_NUMERIC
	    {
	        char *numeric_name;

	        /* Switch from "C" to underlying LC_NUMERIC; this has to happen
	         * before the category is queried */
	        set_numeric_local();

	        numeric_name = setlocale(LC_NUMERIC, NULL);
	        new_numeric(numeric_name);
	    }
	#endif /* USE_LOCALE_NUMERIC */

	}
	1712
	1713
	1714
	1715	/*
	1716	* Local variables:
	1717	* c-indentation-style: bsd
	1718	* c-basic-offset: 4
	1719	* indent-tabs-mode: nil
	1720	* End:
	1721	*
	1722	* ex: set ts=8 sts=4 sw=4 et:
	1723	*/