perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* locale.c
	2	*
	3	* Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
	4	* 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* A Elbereth Gilthoniel,
	13	* silivren penna míriel
	14	* o menel aglar elenath!
	15	* Na-chaered palan-díriel
	16	* o galadhremmin ennorath,
	17	* Fanuilos, le linnathon
	18	* nef aear, si nef aearon!
	19	*
	20	* [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
	21	*/
	22
	23	/* utility functions for handling locale-specific stuff like what
	24	* character represents the decimal point.
	25	*
	26	* All C programs have an underlying locale. Perl code generally doesn't pay
	27	* any attention to it except within the scope of a 'use locale'. For most
	28	* categories, it accomplishes this by just using different operations if it is
	29	* in such scope than if not. However, various libc functions called by Perl
	30	* are affected by the LC_NUMERIC category, so there are macros in perl.h that
	31	* are used to toggle between the current locale and the C locale depending on
	32	* the desired behavior of those functions at the moment. And, LC_MESSAGES is
	33	* switched to the C locale for outputting the message unless within the scope
	34	* of 'use locale'.
	35	*/
	36
	37	#include "EXTERN.h"
	38	#define PERL_IN_LOCALE_C
	39	#include "perl_langinfo.h"
	40	#include "perl.h"
	41
	42	#include "reentr.h"
	43
	44	#ifdef I_WCHAR
	45	# include <wchar.h>
	46	#endif
	47
	48	/* If the environment says to, we can output debugging information during
	49	* initialization. This is done before option parsing, and before any thread
	50	* creation, so can be a file-level static */
	51	#if ! defined(DEBUGGING) \|\| defined(PERL_GLOBAL_STRUCT)
	52	# define debug_initialization 0
	53	# define DEBUG_INITIALIZATION_set(v)
	54	#else
	55	static bool debug_initialization = FALSE;
	56	# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
	57	#endif
	58
	59	/* strlen() of a literal string constant. We might want this more general,
	60	* but using it in just this file for now. A problem with more generality is
	61	* the compiler warnings about comparing unlike signs */
	62	#define STRLENs(s) (sizeof("" s "") - 1)
	63
	64	/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
	65	* return of setlocale(), then this is extremely likely to be the C or POSIX
	66	* locale. However, the output of setlocale() is documented to be opaque, but
	67	* the odds are extremely small that it would return these two strings for some
	68	* other locale. Note that VMS in these two locales includes many non-ASCII
	69	* characters as controls and punctuation (below are hex bytes):
	70	* cntrl: 84-97 9B-9F
	71	* punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
	72	* Oddly, none there are listed as alphas, though some represent alphabetics
	73	* http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
	74	#define isNAME_C_OR_POSIX(name) \
	75	( (name) != NULL \
	76	&& (( (name) == 'C' && ((name + 1)) == '\0') \
	77	\|\| strEQ((name), "POSIX")))
	78
	79	#ifdef USE_LOCALE
	80
	81	/* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far
	82	* looked up. This is in the form of a C string: */
	83
	84	#define UTF8NESS_SEP "\v"
	85	#define UTF8NESS_PREFIX "\f"
	86
	87	/* So, the string looks like:
	88	*
	89	* \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0
	90	*
	91	* where the digit 0 after the \a indicates that the locale starting just
	92	* after the preceding \v is not UTF-8, and the digit 1 mean it is. */
	93
	94	STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1);
	95	STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1);
	96
	97	#define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \
	98	UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0"
	99
	100	/* The cache is initialized to C_and_POSIX_utf8ness at start up. These are
	101	* kept there always. The remining portion of the cache is LRU, with the
	102	* oldest looked-up locale at the tail end */
	103
	104	STATIC char *
	105	S_stdize_locale(pTHX_ char *locs)
	106	{
	107	/* Standardize the locale name from a string returned by 'setlocale',
	108	* possibly modifying that string.
	109	*
	110	* The typical return value of setlocale() is either
	111	* (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
	112	* (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
	113	* (the space-separated values represent the various sublocales,
	114	* in some unspecified order). This is not handled by this function.
	115	*
	116	* In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
	117	* which is harmful for further use of the string in setlocale(). This
	118	* function removes the trailing new line and everything up through the '='
	119	* */
	120
	121	const char * const s = strchr(locs, '=');
	122	bool okay = TRUE;
	123
	124	PERL_ARGS_ASSERT_STDIZE_LOCALE;
	125
	126	if (s) {
	127	const char * const t = strchr(s, '.');
	128	okay = FALSE;
	129	if (t) {
	130	const char * const u = strchr(t, '\n');
	131	if (u && (u[1] == 0)) {
	132	const STRLEN len = u - s;
	133	Move(s + 1, locs, len, char);
	134	locs[len] = 0;
	135	okay = TRUE;
	136	}
	137	}
	138	}
	139
	140	if (!okay)
	141	Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
	142
	143	return locs;
	144	}
	145
	146	/* Two parallel arrays; first the locale categories Perl uses on this system;
	147	* the second array is their names. These arrays are in mostly arbitrary
	148	* order. */
	149
	150	const int categories[] = {
	151
	152	# ifdef USE_LOCALE_NUMERIC
	153	LC_NUMERIC,
	154	# endif
	155	# ifdef USE_LOCALE_CTYPE
	156	LC_CTYPE,
	157	# endif
	158	# ifdef USE_LOCALE_COLLATE
	159	LC_COLLATE,
	160	# endif
	161	# ifdef USE_LOCALE_TIME
	162	LC_TIME,
	163	# endif
	164	# ifdef USE_LOCALE_MESSAGES
	165	LC_MESSAGES,
	166	# endif
	167	# ifdef USE_LOCALE_MONETARY
	168	LC_MONETARY,
	169	# endif
	170	# ifdef USE_LOCALE_ADDRESS
	171	LC_ADDRESS,
	172	# endif
	173	# ifdef USE_LOCALE_IDENTIFICATION
	174	LC_IDENTIFICATION,
	175	# endif
	176	# ifdef USE_LOCALE_MEASUREMENT
	177	LC_MEASUREMENT,
	178	# endif
	179	# ifdef USE_LOCALE_PAPER
	180	LC_PAPER,
	181	# endif
	182	# ifdef USE_LOCALE_TELEPHONE
	183	LC_TELEPHONE,
	184	# endif
	185	# ifdef LC_ALL
	186	LC_ALL,
	187	# endif
	188	-1 /* Placeholder because C doesn't allow a
	189	trailing comma, and it would get complicated
	190	with all the #ifdef's */
	191	};
	192
	193	/* The top-most real element is LC_ALL */
	194
	195	const char * category_names[] = {
	196
	197	# ifdef USE_LOCALE_NUMERIC
	198	"LC_NUMERIC",
	199	# endif
	200	# ifdef USE_LOCALE_CTYPE
	201	"LC_CTYPE",
	202	# endif
	203	# ifdef USE_LOCALE_COLLATE
	204	"LC_COLLATE",
	205	# endif
	206	# ifdef USE_LOCALE_TIME
	207	"LC_TIME",
	208	# endif
	209	# ifdef USE_LOCALE_MESSAGES
	210	"LC_MESSAGES",
	211	# endif
	212	# ifdef USE_LOCALE_MONETARY
	213	"LC_MONETARY",
	214	# endif
	215	# ifdef USE_LOCALE_ADDRESS
	216	"LC_ADDRESS",
	217	# endif
	218	# ifdef USE_LOCALE_IDENTIFICATION
	219	"LC_IDENTIFICATION",
	220	# endif
	221	# ifdef USE_LOCALE_MEASUREMENT
	222	"LC_MEASUREMENT",
	223	# endif
	224	# ifdef USE_LOCALE_PAPER
	225	"LC_PAPER",
	226	# endif
	227	# ifdef USE_LOCALE_TELEPHONE
	228	"LC_TELEPHONE",
	229	# endif
	230	# ifdef LC_ALL
	231	"LC_ALL",
	232	# endif
	233	NULL /* Placeholder */
	234	};
	235
	236	# ifdef LC_ALL
	237
	238	/* On systems with LC_ALL, it is kept in the highest index position. (-2
	239	* to account for the final unused placeholder element.) */
	240	# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
	241
	242	# else
	243
	244	/* On systems without LC_ALL, we pretend it is there, one beyond the real
	245	* top element, hence in the unused placeholder element. */
	246	# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
	247
	248	# endif
	249
	250	/* Pretending there is an LC_ALL element just above allows us to avoid most
	251	* special cases. Most loops through these arrays in the code below are
	252	* written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'. They will work
	253	* on either type of system. But the code must be written to not access the
	254	* element at 'LC_ALL_INDEX' except on platforms that have it. This can be
	255	* checked for at compile time by using the #define LC_ALL_INDEX which is only
	256	* defined if we do have LC_ALL. */
	257
	258	STATIC const char *
	259	S_category_name(const int category)
	260	{
	261	unsigned int i;
	262
	263	#ifdef LC_ALL
	264
	265	if (category == LC_ALL) {
	266	return "LC_ALL";
	267	}
	268
	269	#endif
	270
	271	for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
	272	if (category == categories[i]) {
	273	return category_names[i];
	274	}
	275	}
	276
	277	{
	278	const char suffix[] = " (unknown)";
	279	int temp = category;
	280	Size_t length = sizeof(suffix) + 1;
	281	char * unknown;
	282	dTHX;
	283
	284	if (temp < 0) {
	285	length++;
	286	temp = - temp;
	287	}
	288
	289	/* Calculate the number of digits */
	290	while (temp >= 10) {
	291	temp /= 10;
	292	length++;
	293	}
	294
	295	Newx(unknown, length, char);
	296	my_snprintf(unknown, length, "%d%s", category, suffix);
	297	SAVEFREEPV(unknown);
	298	return unknown;
	299	}
	300	}
	301
	302	/* Now create LC_foo_INDEX #defines for just those categories on this system */
	303	# ifdef USE_LOCALE_NUMERIC
	304	# define LC_NUMERIC_INDEX 0
	305	# define _DUMMY_NUMERIC LC_NUMERIC_INDEX
	306	# else
	307	# define _DUMMY_NUMERIC -1
	308	# endif
	309	# ifdef USE_LOCALE_CTYPE
	310	# define LC_CTYPE_INDEX _DUMMY_NUMERIC + 1
	311	# define _DUMMY_CTYPE LC_CTYPE_INDEX
	312	# else
	313	# define _DUMMY_CTYPE _DUMMY_NUMERIC
	314	# endif
	315	# ifdef USE_LOCALE_COLLATE
	316	# define LC_COLLATE_INDEX _DUMMY_CTYPE + 1
	317	# define _DUMMY_COLLATE LC_COLLATE_INDEX
	318	# else
	319	# define _DUMMY_COLLATE _DUMMY_COLLATE
	320	# endif
	321	# ifdef USE_LOCALE_TIME
	322	# define LC_TIME_INDEX _DUMMY_COLLATE + 1
	323	# define _DUMMY_TIME LC_TIME_INDEX
	324	# else
	325	# define _DUMMY_TIME _DUMMY_COLLATE
	326	# endif
	327	# ifdef USE_LOCALE_MESSAGES
	328	# define LC_MESSAGES_INDEX _DUMMY_TIME + 1
	329	# define _DUMMY_MESSAGES LC_MESSAGES_INDEX
	330	# else
	331	# define _DUMMY_MESSAGES _DUMMY_TIME
	332	# endif
	333	# ifdef USE_LOCALE_MONETARY
	334	# define LC_MONETARY_INDEX _DUMMY_MESSAGES + 1
	335	# define _DUMMY_MONETARY LC_MONETARY_INDEX
	336	# else
	337	# define _DUMMY_MONETARY _DUMMY_MESSAGES
	338	# endif
	339	# ifdef USE_LOCALE_ADDRESS
	340	# define LC_ADDRESS_INDEX _DUMMY_MONETARY + 1
	341	# define _DUMMY_ADDRESS LC_ADDRESS_INDEX
	342	# else
	343	# define _DUMMY_ADDRESS _DUMMY_MONETARY
	344	# endif
	345	# ifdef USE_LOCALE_IDENTIFICATION
	346	# define LC_IDENTIFICATION_INDEX _DUMMY_ADDRESS + 1
	347	# define _DUMMY_IDENTIFICATION LC_IDENTIFICATION_INDEX
	348	# else
	349	# define _DUMMY_IDENTIFICATION _DUMMY_ADDRESS
	350	# endif
	351	# ifdef USE_LOCALE_MEASUREMENT
	352	# define LC_MEASUREMENT_INDEX _DUMMY_IDENTIFICATION + 1
	353	# define _DUMMY_MEASUREMENT LC_MEASUREMENT_INDEX
	354	# else
	355	# define _DUMMY_MEASUREMENT _DUMMY_IDENTIFICATION
	356	# endif
	357	# ifdef USE_LOCALE_PAPER
	358	# define LC_PAPER_INDEX _DUMMY_MEASUREMENT + 1
	359	# define _DUMMY_PAPER LC_PAPER_INDEX
	360	# else
	361	# define _DUMMY_PAPER _DUMMY_MEASUREMENT
	362	# endif
	363	# ifdef USE_LOCALE_TELEPHONE
	364	# define LC_TELEPHONE_INDEX _DUMMY_PAPER + 1
	365	# define _DUMMY_TELEPHONE LC_TELEPHONE_INDEX
	366	# else
	367	# define _DUMMY_TELEPHONE _DUMMY_PAPER
	368	# endif
	369	# ifdef LC_ALL
	370	# define LC_ALL_INDEX _DUMMY_TELEPHONE + 1
	371	# endif
	372	#endif /* ifdef USE_LOCALE */
	373
	374	/* Windows requres a customized base-level setlocale() */
	375	# ifdef WIN32
	376	# define my_setlocale(cat, locale) win32_setlocale(cat, locale)
	377	# else
	378	# define my_setlocale(cat, locale) setlocale(cat, locale)
	379	# endif
	380
	381	/* Just placeholders for now. "_c" is intended to be called when the category
	382	* is a constant known at compile time; "_r", not known until run time */
	383	# define do_setlocale_c(category, locale) my_setlocale(category, locale)
	384	# define do_setlocale_r(category, locale) my_setlocale(category, locale)
	385
	386	STATIC void
	387	S_set_numeric_radix(pTHX_ const bool use_locale)
	388	{
	389	/* If 'use_locale' is FALSE, set to use a dot for the radix character. If
	390	* TRUE, use the radix character derived from the current locale */
	391
	392	#if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \
	393	\|\| defined(HAS_NL_LANGINFO))
	394
	395	const char * radix = (use_locale)
	396	? my_nl_langinfo(PERL_RADIXCHAR, FALSE)
	397	/* FALSE => already in dest locale */
	398	: ".";
	399
	400	sv_setpv(PL_numeric_radix_sv, radix);
	401
	402	/* If this is valid UTF-8 that isn't totally ASCII, and we are in
	403	* a UTF-8 locale, then mark the radix as being in UTF-8 */
	404	if (is_utf8_non_invariant_string((U8 *) SvPVX(PL_numeric_radix_sv),
	405	SvCUR(PL_numeric_radix_sv))
	406	&& _is_cur_LC_category_utf8(LC_NUMERIC))
	407	{
	408	SvUTF8_on(PL_numeric_radix_sv);
	409	}
	410
	411	# ifdef DEBUGGING
	412
	413	if (DEBUG_L_TEST \|\| debug_initialization) {
	414	PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
	415	SvPVX(PL_numeric_radix_sv),
	416	cBOOL(SvUTF8(PL_numeric_radix_sv)));
	417	}
	418
	419	# endif
	420	#endif /* USE_LOCALE_NUMERIC and can find the radix char */
	421
	422	}
	423
	424
	425	void
	426	Perl_new_numeric(pTHX_ const char *newnum)
	427	{
	428
	429	#ifndef USE_LOCALE_NUMERIC
	430
	431	PERL_UNUSED_ARG(newnum);
	432
	433	#else
	434
	435	/* Called after each libc setlocale() call affecting LC_NUMERIC, to tell
	436	* core Perl this and that 'newnum' is the name of the new locale.
	437	* It installs this locale as the current underlying default.
	438	*
	439	* The default locale and the C locale can be toggled between by use of the
	440	* set_numeric_underlying() and set_numeric_standard() functions, which
	441	* should probably not be called directly, but only via macros like
	442	* SET_NUMERIC_STANDARD() in perl.h.
	443	*
	444	* The toggling is necessary mainly so that a non-dot radix decimal point
	445	* character can be output, while allowing internal calculations to use a
	446	* dot.
	447	*
	448	* This sets several interpreter-level variables:
	449	* PL_numeric_name The underlying locale's name: a copy of 'newnum'
	450	* PL_numeric_underlying A boolean indicating if the toggled state is such
	451	* that the current locale is the program's underlying
	452	* locale
	453	* PL_numeric_standard An int indicating if the toggled state is such
	454	* that the current locale is the C locale or
	455	* indistinguishable from the C locale. If non-zero, it
	456	* is in C; if > 1, it means it may not be toggled away
	457	* from C.
	458	* PL_numeric_underlying_is_standard A bool kept by this function
	459	* indicating that the underlying locale and the standard
	460	* C locale are indistinguishable for the purposes of
	461	* LC_NUMERIC. This happens when both of the above two
	462	* variables are true at the same time. (Toggling is a
	463	* no-op under these circumstances.) This variable is
	464	* used to avoid having to recalculate.
	465	* Any code changing the locale (outside this file) should use
	466	* POSIX::setlocale, which calls this function. Therefore this function
	467	* should be called directly only from this file and from
	468	* POSIX::setlocale() */
	469
	470	char *save_newnum;
	471
	472	if (! newnum) {
	473	Safefree(PL_numeric_name);
	474	PL_numeric_name = NULL;
	475	PL_numeric_standard = TRUE;
	476	PL_numeric_underlying = TRUE;
	477	PL_numeric_underlying_is_standard = TRUE;
	478	return;
	479	}
	480
	481	save_newnum = stdize_locale(savepv(newnum));
	482	PL_numeric_underlying = TRUE;
	483	PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
	484
	485	/* If its name isn't C nor POSIX, it could still be indistinguishable from
	486	* them */
	487	if (! PL_numeric_standard) {
	488	PL_numeric_standard = cBOOL(strEQ(".", my_nl_langinfo(PERL_RADIXCHAR,
	489	FALSE /* Don't toggle locale */ ))
	490	&& strEQ("", my_nl_langinfo(PERL_THOUSEP,
	491	FALSE)));
	492	}
	493
	494	/* Save the new name if it isn't the same as the previous one, if any */
	495	if (! PL_numeric_name \|\| strNE(PL_numeric_name, save_newnum)) {
	496	Safefree(PL_numeric_name);
	497	PL_numeric_name = save_newnum;
	498	}
	499	else {
	500	Safefree(save_newnum);
	501	}
	502
	503	PL_numeric_underlying_is_standard = PL_numeric_standard;
	504
	505	# ifdef HAS_POSIX_2008_LOCALE
	506
	507	PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
	508	PL_numeric_name,
	509	PL_underlying_numeric_obj);
	510
	511	#endif
	512
	513	if (DEBUG_L_TEST \|\| debug_initialization) {
	514	PerlIO_printf(Perl_debug_log, "Called new_numeric with %s, PL_numeric_name=%s\n", newnum, PL_numeric_name);
	515	}
	516
	517	/* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't
	518	* have to worry about the radix being a non-dot. (Core operations that
	519	* need the underlying locale change to it temporarily). */
	520	set_numeric_standard();
	521
	522	#endif /* USE_LOCALE_NUMERIC */
	523
	524	}
	525
	526	void
	527	Perl_set_numeric_standard(pTHX)
	528	{
	529
	530	#ifdef USE_LOCALE_NUMERIC
	531
	532	/* Toggle the LC_NUMERIC locale to C. Most code should use the macros like
	533	* SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The
	534	* macro avoids calling this routine if toggling isn't necessary according
	535	* to our records (which could be wrong if some XS code has changed the
	536	* locale behind our back) */
	537
	538	do_setlocale_c(LC_NUMERIC, "C");
	539	PL_numeric_standard = TRUE;
	540	PL_numeric_underlying = PL_numeric_underlying_is_standard;
	541	set_numeric_radix(0);
	542
	543	# ifdef DEBUGGING
	544
	545	if (DEBUG_L_TEST \|\| debug_initialization) {
	546	PerlIO_printf(Perl_debug_log,
	547	"LC_NUMERIC locale now is standard C\n");
	548	}
	549
	550	# endif
	551	#endif /* USE_LOCALE_NUMERIC */
	552
	553	}
	554
	555	void
	556	Perl_set_numeric_underlying(pTHX)
	557	{
	558
	559	#ifdef USE_LOCALE_NUMERIC
	560
	561	/* Toggle the LC_NUMERIC locale to the current underlying default. Most
	562	* code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h
	563	* instead of calling this directly. The macro avoids calling this routine
	564	* if toggling isn't necessary according to our records (which could be
	565	* wrong if some XS code has changed the locale behind our back) */
	566
	567	do_setlocale_c(LC_NUMERIC, PL_numeric_name);
	568	PL_numeric_standard = PL_numeric_underlying_is_standard;
	569	PL_numeric_underlying = TRUE;
	570	set_numeric_radix(! PL_numeric_standard);
	571
	572	# ifdef DEBUGGING
	573
	574	if (DEBUG_L_TEST \|\| debug_initialization) {
	575	PerlIO_printf(Perl_debug_log,
	576	"LC_NUMERIC locale now is %s\n",
	577	PL_numeric_name);
	578	}
	579
	580	# endif
	581	#endif /* USE_LOCALE_NUMERIC */
	582
	583	}
	584
	585	/*
	586	* Set up for a new ctype locale.
	587	*/
	588	STATIC void
	589	S_new_ctype(pTHX_ const char *newctype)
	590	{
	591
	592	#ifndef USE_LOCALE_CTYPE
	593
	594	PERL_ARGS_ASSERT_NEW_CTYPE;
	595	PERL_UNUSED_ARG(newctype);
	596	PERL_UNUSED_CONTEXT;
	597
	598	#else
	599
	600	/* Called after each libc setlocale() call affecting LC_CTYPE, to tell
	601	* core Perl this and that 'newctype' is the name of the new locale.
	602	*
	603	* This function sets up the folding arrays for all 256 bytes, assuming
	604	* that tofold() is tolc() since fold case is not a concept in POSIX,
	605	*
	606	* Any code changing the locale (outside this file) should use
	607	* POSIX::setlocale, which calls this function. Therefore this function
	608	* should be called directly only from this file and from
	609	* POSIX::setlocale() */
	610
	611	dVAR;
	612	UV i;
	613
	614	PERL_ARGS_ASSERT_NEW_CTYPE;
	615
	616	/* We will replace any bad locale warning with 1) nothing if the new one is
	617	* ok; or 2) a new warning for the bad new locale */
	618	if (PL_warn_locale) {
	619	SvREFCNT_dec_NN(PL_warn_locale);
	620	PL_warn_locale = NULL;
	621	}
	622
	623	PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
	624
	625	/* A UTF-8 locale gets standard rules. But note that code still has to
	626	* handle this specially because of the three problematic code points */
	627	if (PL_in_utf8_CTYPE_locale) {
	628	Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
	629	}
	630	else {
	631	/* Assume enough space for every character being bad. 4 spaces each
	632	* for the 94 printable characters that are output like "'x' "; and 5
	633	* spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
	634	* NUL */
	635	char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ];
	636
	637	/* Don't check for problems if we are suppressing the warnings */
	638	bool check_for_problems = ckWARN_d(WARN_LOCALE)
	639	\|\| UNLIKELY(DEBUG_L_TEST);
	640	bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
	641	to start */
	642	unsigned int bad_count = 0; /* Count of bad characters */
	643
	644	for (i = 0; i < 256; i++) {
	645	if (isupper(i))
	646	PL_fold_locale[i] = (U8) tolower(i);
	647	else if (islower(i))
	648	PL_fold_locale[i] = (U8) toupper(i);
	649	else
	650	PL_fold_locale[i] = (U8) i;
	651
	652	/* If checking for locale problems, see if the native ASCII-range
	653	* printables plus \n and \t are in their expected categories in
	654	* the new locale. If not, this could mean big trouble, upending
	655	* Perl's and most programs' assumptions, like having a
	656	* metacharacter with special meaning become a \w. Fortunately,
	657	* it's very rare to find locales that aren't supersets of ASCII
	658	* nowadays. It isn't a problem for most controls to be changed
	659	* into something else; we check only \n and \t, though perhaps \r
	660	* could be an issue as well. */
	661	if ( check_for_problems
	662	&& (isGRAPH_A(i) \|\| isBLANK_A(i) \|\| i == '\n'))
	663	{
	664	if ( cBOOL(isalnum(i)) != cBOOL(isALPHANUMERIC_A(i))
	665	\|\| cBOOL(isalpha(i)) != cBOOL(isALPHA_A(i))
	666	\|\| cBOOL(isdigit(i)) != cBOOL(isDIGIT_A(i))
	667	\|\| cBOOL(isgraph(i)) != cBOOL(isGRAPH_A(i))
	668	\|\| cBOOL(islower(i)) != cBOOL(isLOWER_A(i))
	669	\|\| cBOOL(isprint(i)) != cBOOL(isPRINT_A(i))
	670	\|\| cBOOL(ispunct(i)) != cBOOL(isPUNCT_A(i))
	671	\|\| cBOOL(isspace(i)) != cBOOL(isSPACE_A(i))
	672	\|\| cBOOL(isupper(i)) != cBOOL(isUPPER_A(i))
	673	\|\| cBOOL(isxdigit(i))!= cBOOL(isXDIGIT_A(i))
	674	\|\| tolower(i) != (int) toLOWER_A(i)
	675	\|\| toupper(i) != (int) toUPPER_A(i)
	676	\|\| (i == '\n' && ! isCNTRL_LC(i)))
	677	{
	678	if (bad_count) { /* Separate multiple entries with a
	679	blank */
	680	bad_chars_list[bad_count++] = ' ';
	681	}
	682	bad_chars_list[bad_count++] = '\'';
	683	if (isPRINT_A(i)) {
	684	bad_chars_list[bad_count++] = (char) i;
	685	}
	686	else {
	687	bad_chars_list[bad_count++] = '\\';
	688	if (i == '\n') {
	689	bad_chars_list[bad_count++] = 'n';
	690	}
	691	else {
	692	assert(i == '\t');
	693	bad_chars_list[bad_count++] = 't';
	694	}
	695	}
	696	bad_chars_list[bad_count++] = '\'';
	697	bad_chars_list[bad_count] = '\0';
	698	}
	699	}
	700	}
	701
	702	# ifdef MB_CUR_MAX
	703
	704	/* We only handle single-byte locales (outside of UTF-8 ones; so if
	705	* this locale requires more than one byte, there are going to be
	706	* problems. */
	707	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	708	"%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
	709	__FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
	710
	711	if (check_for_problems && MB_CUR_MAX > 1
	712
	713	/* Some platforms return MB_CUR_MAX > 1 for even the "C"
	714	* locale. Just assume that the implementation for them (plus
	715	* for POSIX) is correct and the > 1 value is spurious. (Since
	716	* these are specially handled to never be considered UTF-8
	717	* locales, as long as this is the only problem, everything
	718	* should work fine */
	719	&& strNE(newctype, "C") && strNE(newctype, "POSIX"))
	720	{
	721	multi_byte_locale = TRUE;
	722	}
	723
	724	# endif
	725
	726	if (bad_count \|\| multi_byte_locale) {
	727	PL_warn_locale = Perl_newSVpvf(aTHX_
	728	"Locale '%s' may not work well.%s%s%s\n",
	729	newctype,
	730	(multi_byte_locale)
	731	? " Some characters in it are not recognized by"
	732	" Perl."
	733	: "",
	734	(bad_count)
	735	? "\nThe following characters (and maybe others)"
	736	" may not have the same meaning as the Perl"
	737	" program expects:\n"
	738	: "",
	739	(bad_count)
	740	? bad_chars_list
	741	: ""
	742	);
	743	/* If we are actually in the scope of the locale or are debugging,
	744	* output the message now. If not in that scope, we save the
	745	* message to be output at the first operation using this locale,
	746	* if that actually happens. Most programs don't use locales, so
	747	* they are immune to bad ones. */
	748	if (IN_LC(LC_CTYPE) \|\| UNLIKELY(DEBUG_L_TEST)) {
	749
	750	/* The '0' below suppresses a bogus gcc compiler warning */
	751	Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
	752
	753	if (IN_LC(LC_CTYPE)) {
	754	SvREFCNT_dec_NN(PL_warn_locale);
	755	PL_warn_locale = NULL;
	756	}
	757	}
	758	}
	759	}
	760
	761	#endif /* USE_LOCALE_CTYPE */
	762
	763	}
	764
	765	void
	766	Perl__warn_problematic_locale()
	767	{
	768
	769	#ifdef USE_LOCALE_CTYPE
	770
	771	dTHX;
	772
	773	/* Internal-to-core function that outputs the message in PL_warn_locale,
	774	* and then NULLS it. Should be called only through the macro
	775	* _CHECK_AND_WARN_PROBLEMATIC_LOCALE */
	776
	777	if (PL_warn_locale) {
	778	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	779	SvPVX(PL_warn_locale),
	780	0 /* dummy to avoid compiler warning */ );
	781	SvREFCNT_dec_NN(PL_warn_locale);
	782	PL_warn_locale = NULL;
	783	}
	784
	785	#endif
	786
	787	}
	788
	789	STATIC void
	790	S_new_collate(pTHX_ const char *newcoll)
	791	{
	792
	793	#ifndef USE_LOCALE_COLLATE
	794
	795	PERL_UNUSED_ARG(newcoll);
	796	PERL_UNUSED_CONTEXT;
	797
	798	#else
	799
	800	/* Called after each libc setlocale() call affecting LC_COLLATE, to tell
	801	* core Perl this and that 'newcoll' is the name of the new locale.
	802	*
	803	* The design of locale collation is that every locale change is given an
	804	* index 'PL_collation_ix'. The first time a string particpates in an
	805	* operation that requires collation while locale collation is active, it
	806	* is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
	807	* magic includes the collation index, and the transformation of the string
	808	* by strxfrm(), q.v. That transformation is used when doing comparisons,
	809	* instead of the string itself. If a string changes, the magic is
	810	* cleared. The next time the locale changes, the index is incremented,
	811	* and so we know during a comparison that the transformation is not
	812	* necessarily still valid, and so is recomputed. Note that if the locale
	813	* changes enough times, the index could wrap (a U32), and it is possible
	814	* that a transformation would improperly be considered valid, leading to
	815	* an unlikely bug */
	816
	817	if (! newcoll) {
	818	if (PL_collation_name) {
	819	++PL_collation_ix;
	820	Safefree(PL_collation_name);
	821	PL_collation_name = NULL;
	822	}
	823	PL_collation_standard = TRUE;
	824	is_standard_collation:
	825	PL_collxfrm_base = 0;
	826	PL_collxfrm_mult = 2;
	827	PL_in_utf8_COLLATE_locale = FALSE;
	828	PL_strxfrm_NUL_replacement = '\0';
	829	PL_strxfrm_max_cp = 0;
	830	return;
	831	}
	832
	833	/* If this is not the same locale as currently, set the new one up */
	834	if (! PL_collation_name \|\| strNE(PL_collation_name, newcoll)) {
	835	++PL_collation_ix;
	836	Safefree(PL_collation_name);
	837	PL_collation_name = stdize_locale(savepv(newcoll));
	838	PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
	839	if (PL_collation_standard) {
	840	goto is_standard_collation;
	841	}
	842
	843	PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
	844	PL_strxfrm_NUL_replacement = '\0';
	845	PL_strxfrm_max_cp = 0;
	846
	847	/* A locale collation definition includes primary, secondary, tertiary,
	848	* etc. weights for each character. To sort, the primary weights are
	849	* used, and only if they compare equal, then the secondary weights are
	850	* used, and only if they compare equal, then the tertiary, etc.
	851	*
	852	* strxfrm() works by taking the input string, say ABC, and creating an
	853	* output transformed string consisting of first the primary weights,
	854	* A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
	855	* tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
	856	* may not have weights at every level. In our example, let's say B
	857	* doesn't have a tertiary weight, and A doesn't have a secondary
	858	* weight. The constructed string is then going to be
	859	* A¹B¹C¹ B²C² A³C³ ....
	860	* This has the desired effect that strcmp() will look at the secondary
	861	* or tertiary weights only if the strings compare equal at all higher
	862	* priority weights. The spaces shown here, like in
	863	* "A¹B¹C¹ A²B²C² "
	864	* are not just for readability. In the general case, these must
	865	* actually be bytes, which we will call here 'separator weights'; and
	866	* they must be smaller than any other weight value, but since these
	867	* are C strings, only the terminating one can be a NUL (some
	868	* implementations may include a non-NUL separator weight just before
	869	* the NUL). Implementations tend to reserve 01 for the separator
	870	* weights. They are needed so that a shorter string's secondary
	871	* weights won't be misconstrued as primary weights of a longer string,
	872	* etc. By making them smaller than any other weight, the shorter
	873	* string will sort first. (Actually, if all secondary weights are
	874	* smaller than all primary ones, there is no need for a separator
	875	* weight between those two levels, etc.)
	876	*
	877	* The length of the transformed string is roughly a linear function of
	878	* the input string. It's not exactly linear because some characters
	879	* don't have weights at all levels. When we call strxfrm() we have to
	880	* allocate some memory to hold the transformed string. The
	881	* calculations below try to find coefficients 'm' and 'b' for this
	882	* locale so that m*x + b equals how much space we need, given the size
	883	* of the input string in 'x'. If we calculate too small, we increase
	884	* the size as needed, and call strxfrm() again, but it is better to
	885	* get it right the first time to avoid wasted expensive string
	886	* transformations. */
	887
	888	{
	889	/* We use the string below to find how long the tranformation of it
	890	* is. Almost all locales are supersets of ASCII, or at least the
	891	* ASCII letters. We use all of them, half upper half lower,
	892	* because if we used fewer, we might hit just the ones that are
	893	* outliers in a particular locale. Most of the strings being
	894	* collated will contain a preponderance of letters, and even if
	895	* they are above-ASCII, they are likely to have the same number of
	896	* weight levels as the ASCII ones. It turns out that digits tend
	897	* to have fewer levels, and some punctuation has more, but those
	898	* are relatively sparse in text, and khw believes this gives a
	899	* reasonable result, but it could be changed if experience so
	900	* dictates. */
	901	const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
	902	char * x_longer; /* Transformed 'longer' */
	903	Size_t x_len_longer; /* Length of 'x_longer' */
	904
	905	char * x_shorter; /* We also transform a substring of 'longer' */
	906	Size_t x_len_shorter;
	907
	908	/* _mem_collxfrm() is used get the transformation (though here we
	909	* are interested only in its length). It is used because it has
	910	* the intelligence to handle all cases, but to work, it needs some
	911	* values of 'm' and 'b' to get it started. For the purposes of
	912	* this calculation we use a very conservative estimate of 'm' and
	913	* 'b'. This assumes a weight can be multiple bytes, enough to
	914	* hold any UV on the platform, and there are 5 levels, 4 weight
	915	* bytes, and a trailing NUL. */
	916	PL_collxfrm_base = 5;
	917	PL_collxfrm_mult = 5 * sizeof(UV);
	918
	919	/* Find out how long the transformation really is */
	920	x_longer = _mem_collxfrm(longer,
	921	sizeof(longer) - 1,
	922	&x_len_longer,
	923
	924	/* We avoid converting to UTF-8 in the
	925	* called function by telling it the
	926	* string is in UTF-8 if the locale is a
	927	* UTF-8 one. Since the string passed
	928	* here is invariant under UTF-8, we can
	929	* claim it's UTF-8 even though it isn't.
	930	* */
	931	PL_in_utf8_COLLATE_locale);
	932	Safefree(x_longer);
	933
	934	/* Find out how long the transformation of a substring of 'longer'
	935	* is. Together the lengths of these transformations are
	936	* sufficient to calculate 'm' and 'b'. The substring is all of
	937	* 'longer' except the first character. This minimizes the chances
	938	* of being swayed by outliers */
	939	x_shorter = _mem_collxfrm(longer + 1,
	940	sizeof(longer) - 2,
	941	&x_len_shorter,
	942	PL_in_utf8_COLLATE_locale);
	943	Safefree(x_shorter);
	944
	945	/* If the results are nonsensical for this simple test, the whole
	946	* locale definition is suspect. Mark it so that locale collation
	947	* is not active at all for it. XXX Should we warn? */
	948	if ( x_len_shorter == 0
	949	\|\| x_len_longer == 0
	950	\|\| x_len_shorter >= x_len_longer)
	951	{
	952	PL_collxfrm_mult = 0;
	953	PL_collxfrm_base = 0;
	954	}
	955	else {
	956	SSize_t base; /* Temporary */
	957
	958	/* We have both: m * strlen(longer) + b = x_len_longer
	959	* m * strlen(shorter) + b = x_len_shorter;
	960	* subtracting yields:
	961	* m * (strlen(longer) - strlen(shorter))
	962	* = x_len_longer - x_len_shorter
	963	* But we have set things up so that 'shorter' is 1 byte smaller
	964	* than 'longer'. Hence:
	965	* m = x_len_longer - x_len_shorter
	966	*
	967	* But if something went wrong, make sure the multiplier is at
	968	* least 1.
	969	*/
	970	if (x_len_longer > x_len_shorter) {
	971	PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
	972	}
	973	else {
	974	PL_collxfrm_mult = 1;
	975	}
	976
	977	/* mx + b = len
	978	* so: b = len - mx
	979	* but in case something has gone wrong, make sure it is
	980	* non-negative */
	981	base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
	982	if (base < 0) {
	983	base = 0;
	984	}
	985
	986	/* Add 1 for the trailing NUL */
	987	PL_collxfrm_base = base + 1;
	988	}
	989
	990	# ifdef DEBUGGING
	991
	992	if (DEBUG_L_TEST \|\| debug_initialization) {
	993	PerlIO_printf(Perl_debug_log,
	994	"%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
	995	"x_len_longer=%zu,"
	996	" collate multipler=%zu, collate base=%zu\n",
	997	__FILE__, __LINE__,
	998	PL_in_utf8_COLLATE_locale,
	999	x_len_shorter, x_len_longer,
	1000	PL_collxfrm_mult, PL_collxfrm_base);
	1001	}
	1002	# endif
	1003
	1004	}
	1005	}
	1006
	1007	#endif /* USE_LOCALE_COLLATE */
	1008
	1009	}
	1010
	1011	#ifdef WIN32
	1012
	1013	STATIC char *
	1014	S_win32_setlocale(pTHX_ int category, const char* locale)
	1015	{
	1016	/* This, for Windows, emulates POSIX setlocale() behavior. There is no
	1017	* difference between the two unless the input locale is "", which normally
	1018	* means on Windows to get the machine default, which is set via the
	1019	* computer's "Regional and Language Options" (or its current equivalent).
	1020	* In POSIX, it instead means to find the locale from the user's
	1021	* environment. This routine changes the Windows behavior to first look in
	1022	* the environment, and, if anything is found, use that instead of going to
	1023	* the machine default. If there is no environment override, the machine
	1024	* default is used, by calling the real setlocale() with "".
	1025	*
	1026	* The POSIX behavior is to use the LC_ALL variable if set; otherwise to
	1027	* use the particular category's variable if set; otherwise to use the LANG
	1028	* variable. */
	1029
	1030	bool override_LC_ALL = FALSE;
	1031	char * result;
	1032	unsigned int i;
	1033
	1034	if (locale && strEQ(locale, "")) {
	1035
	1036	# ifdef LC_ALL
	1037
	1038	locale = PerlEnv_getenv("LC_ALL");
	1039	if (! locale) {
	1040	if (category == LC_ALL) {
	1041	override_LC_ALL = TRUE;
	1042	}
	1043	else {
	1044
	1045	# endif
	1046
	1047	for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
	1048	if (category == categories[i]) {
	1049	locale = PerlEnv_getenv(category_names[i]);
	1050	goto found_locale;
	1051	}
	1052	}
	1053
	1054	locale = PerlEnv_getenv("LANG");
	1055	if (! locale) {
	1056	locale = "";
	1057	}
	1058
	1059	found_locale: ;
	1060
	1061	# ifdef LC_ALL
	1062
	1063	}
	1064	}
	1065
	1066	# endif
	1067
	1068	}
	1069
	1070	result = setlocale(category, locale);
	1071	DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
	1072	setlocale_debug_string(category, locale, result)));
	1073
	1074	if (! override_LC_ALL) {
	1075	return result;
	1076	}
	1077
	1078	/* Here the input category was LC_ALL, and we have set it to what is in the
	1079	* LANG variable or the system default if there is no LANG. But these have
	1080	* lower priority than the other LC_foo variables, so override it for each
	1081	* one that is set. (If they are set to "", it means to use the same thing
	1082	* we just set LC_ALL to, so can skip) */
	1083
	1084	for (i = 0; i < LC_ALL_INDEX; i++) {
	1085	result = PerlEnv_getenv(category_names[i]);
	1086	if (result && strNE(result, "")) {
	1087	setlocale(categories[i], result);
	1088	DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
	1089	__FILE__, __LINE__,
	1090	setlocale_debug_string(categories[i], result, "not captured")));
	1091	}
	1092	}
	1093
	1094	result = setlocale(LC_ALL, NULL);
	1095	DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
	1096	__FILE__, __LINE__,
	1097	setlocale_debug_string(LC_ALL, NULL, result)));
	1098
	1099	return result;
	1100	}
	1101
	1102	#endif
	1103
	1104	char *
	1105	Perl_setlocale(int category, const char * locale)
	1106	{
	1107	/* This wraps POSIX::setlocale() */
	1108
	1109	char * retval;
	1110	char * newlocale;
	1111	dTHX;
	1112
	1113	#ifdef USE_LOCALE_NUMERIC
	1114
	1115	/* A NULL locale means only query what the current one is. We have the
	1116	* LC_NUMERIC name saved, because we are normally switched into the C
	1117	* locale for it. For an LC_ALL query, switch back to get the correct
	1118	* results. All other categories don't require special handling */
	1119	if (locale == NULL) {
	1120	if (category == LC_NUMERIC) {
	1121	return savepv(PL_numeric_name);
	1122	}
	1123
	1124	# ifdef LC_ALL
	1125
	1126	else if (category == LC_ALL && ! PL_numeric_underlying) {
	1127
	1128	SET_NUMERIC_UNDERLYING();
	1129	}
	1130
	1131	# endif
	1132
	1133	}
	1134
	1135	#endif
	1136
	1137	/* Save retval since subsequent setlocale() calls may overwrite it. */
	1138	retval = savepv(do_setlocale_r(category, locale));
	1139
	1140	DEBUG_L(PerlIO_printf(Perl_debug_log,
	1141	"%s:%d: %s\n", __FILE__, __LINE__,
	1142	setlocale_debug_string(category, locale, retval)));
	1143	if (! retval) {
	1144	/* Should never happen that a query would return an error, but be
	1145	* sure and reset to C locale */
	1146	if (locale == 0) {
	1147	SET_NUMERIC_STANDARD();
	1148	}
	1149
	1150	return NULL;
	1151	}
	1152
	1153	/* If locale == NULL, we are just querying the state, but may have switched
	1154	* to NUMERIC_UNDERLYING. Switch back before returning. */
	1155	if (locale == NULL) {
	1156	SET_NUMERIC_STANDARD();
	1157	return retval;
	1158	}
	1159
	1160	/* Now that have switched locales, we have to update our records to
	1161	* correspond. */
	1162
	1163	switch (category) {
	1164
	1165	#ifdef USE_LOCALE_CTYPE
	1166
	1167	case LC_CTYPE:
	1168	new_ctype(retval);
	1169	break;
	1170
	1171	#endif
	1172	#ifdef USE_LOCALE_COLLATE
	1173
	1174	case LC_COLLATE:
	1175	new_collate(retval);
	1176	break;
	1177
	1178	#endif
	1179	#ifdef USE_LOCALE_NUMERIC
	1180
	1181	case LC_NUMERIC:
	1182	new_numeric(retval);
	1183	break;
	1184
	1185	#endif
	1186	#ifdef LC_ALL
	1187
	1188	case LC_ALL:
	1189
	1190	/* LC_ALL updates all the things we care about. The values may not
	1191	* be the same as 'retval', as the locale "" may have set things
	1192	* individually */
	1193
	1194	# ifdef USE_LOCALE_CTYPE
	1195
	1196	newlocale = do_setlocale_c(LC_CTYPE, NULL);
	1197	new_ctype(newlocale);
	1198
	1199	# endif /* USE_LOCALE_CTYPE */
	1200	# ifdef USE_LOCALE_COLLATE
	1201
	1202	newlocale = do_setlocale_c(LC_COLLATE, NULL);
	1203	new_collate(newlocale);
	1204
	1205	# endif
	1206	# ifdef USE_LOCALE_NUMERIC
	1207
	1208	newlocale = do_setlocale_c(LC_NUMERIC, NULL);
	1209	new_numeric(newlocale);
	1210
	1211	# endif /* USE_LOCALE_NUMERIC */
	1212	#endif /* LC_ALL */
	1213
	1214	default:
	1215	break;
	1216	}
	1217
	1218	return retval;
	1219
	1220
	1221	}
	1222
	1223	PERL_STATIC_INLINE const char *
	1224	S_save_to_buffer(const char * string, char *buf, Size_t buf_size, const Size_t offset)
	1225	{
	1226	/* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size',
	1227	* growing it if necessary */
	1228
	1229	const Size_t string_size = strlen(string) + offset + 1;
	1230
	1231	PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
	1232
	1233	if (*buf_size == 0) {
	1234	Newx(*buf, string_size, char);
	1235	*buf_size = string_size;
	1236	}
	1237	else if (string_size > *buf_size) {
	1238	Renew(*buf, string_size, char);
	1239	*buf_size = string_size;
	1240	}
	1241
	1242	Copy(string, *buf + offset, string_size - offset, char);
	1243	return *buf;
	1244	}
	1245
	1246	/*
	1247
	1248	=head1 Locale-related functions and macros
	1249
	1250	=for apidoc Perl_langinfo
	1251
	1252	This is an (almost ª) drop-in replacement for the system C<L<nl_langinfo(3)>>,
	1253	taking the same C<item> parameter values, and returning the same information.
	1254	But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
	1255	of Perl's locale handling from your code, and can be used on systems that lack
	1256	a native C<nl_langinfo>.
	1257
	1258	Expanding on these:
	1259
	1260	=over
	1261
	1262	=item *
	1263
	1264	It delivers the correct results for the C<RADIXCHAR> and C<THOUSESEP> items,
	1265	without you having to write extra code. The reason for the extra code would be
	1266	because these are from the C<LC_NUMERIC> locale category, which is normally
	1267	kept set to the C locale by Perl, no matter what the underlying locale is
	1268	supposed to be, and so to get the expected results, you have to temporarily
	1269	toggle into the underlying locale, and later toggle back. (You could use
	1270	plain C<nl_langinfo> and C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this
	1271	but then you wouldn't get the other advantages of C<Perl_langinfo()>; not
	1272	keeping C<LC_NUMERIC> in the C locale would break a lot of CPAN, which is
	1273	expecting the radix (decimal point) character to be a dot.)
	1274
	1275	=item *
	1276
	1277	Depending on C<item>, it works on systems that don't have C<nl_langinfo>, hence
	1278	makes your code more portable. Of the fifty-some possible items specified by
	1279	the POSIX 2008 standard,
	1280	L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
	1281	only two are completely unimplemented. It uses various techniques to recover
	1282	the other items, including calling C<L<localeconv(3)>>, and C<L<strftime(3)>>,
	1283	both of which are specified in C89, so should be always be available. Later
	1284	C<strftime()> versions have additional capabilities; C<""> is returned for
	1285	those not available on your system.
	1286
	1287	The details for those items which may differ from what this emulation returns
	1288	and what a native C<nl_langinfo()> would return are:
	1289
	1290	=over
	1291
	1292	=item C<CODESET>
	1293
	1294	=item C<ERA>
	1295
	1296	Unimplemented, so returns C<"">.
	1297
	1298	=item C<YESEXPR>
	1299
	1300	=item C<YESSTR>
	1301
	1302	=item C<NOEXPR>
	1303
	1304	=item C<NOSTR>
	1305
	1306	Only the values for English are returned. C<YESSTR> and C<NOSTR> have been
	1307	removed from POSIX 2008, and are retained for backwards compatibility. Your
	1308	platform's C<nl_langinfo> may not support them.
	1309
	1310	=item C<D_FMT>
	1311
	1312	Always evaluates to C<%x>, the locale's appropriate date representation.
	1313
	1314	=item C<T_FMT>
	1315
	1316	Always evaluates to C<%X>, the locale's appropriate time representation.
	1317
	1318	=item C<D_T_FMT>
	1319
	1320	Always evaluates to C<%c>, the locale's appropriate date and time
	1321	representation.
	1322
	1323	=item C<CRNCYSTR>
	1324
	1325	The return may be incorrect for those rare locales where the currency symbol
	1326	replaces the radix character.
	1327	Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
	1328	to work differently.
	1329
	1330	=item C<ALT_DIGITS>
	1331
	1332	Currently this gives the same results as Linux does.
	1333	Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
	1334	to work differently.
	1335
	1336	=item C<ERA_D_FMT>
	1337
	1338	=item C<ERA_T_FMT>
	1339
	1340	=item C<ERA_D_T_FMT>
	1341
	1342	=item C<T_FMT_AMPM>
	1343
	1344	These are derived by using C<strftime()>, and not all versions of that function
	1345	know about them. C<""> is returned for these on such systems.
	1346
	1347	=back
	1348
	1349	When using C<Perl_langinfo> on systems that don't have a native
	1350	C<nl_langinfo()>, you must
	1351
	1352	#include "perl_langinfo.h"
	1353
	1354	before the C<perl.h> C<#include>. You can replace your C<langinfo.h>
	1355	C<#include> with this one. (Doing it this way keeps out the symbols that plain
	1356	C<langinfo.h> imports into the namespace for code that doesn't need it.)
	1357
	1358	You also should not use the bare C<langinfo.h> item names, but should preface
	1359	them with C<PERL_>, so use C<PERL_RADIXCHAR> instead of plain C<RADIXCHAR>.
	1360	The C<PERL_I<foo>> versions will also work for this function on systems that do
	1361	have a native C<nl_langinfo>.
	1362
	1363	=item *
	1364
	1365	It is thread-friendly, returning its result in a buffer that won't be
	1366	overwritten by another thread, so you don't have to code for that possibility.
	1367	The buffer can be overwritten by the next call to C<nl_langinfo> or
	1368	C<Perl_langinfo> in the same thread.
	1369
	1370	=item *
	1371
	1372	ª It returns S<C<const char *>>, whereas plain C<nl_langinfo()> returns S<C<char
	1373	*>>, but you are (only by documentation) forbidden to write into the buffer.
	1374	By declaring this C<const>, the compiler enforces this restriction. The extra
	1375	C<const> is why this isn't an unequivocal drop-in replacement for
	1376	C<nl_langinfo>.
	1377
	1378	=back
	1379
	1380	The original impetus for C<Perl_langinfo()> was so that code that needs to
	1381	find out the current currency symbol, floating point radix character, or digit
	1382	grouping separator can use, on all systems, the simpler and more
	1383	thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
	1384	pain to make thread-friendly. For other fields returned by C<localeconv>, it
	1385	is better to use the methods given in L<perlcall> to call
	1386	L<C<POSIX::localeconv()>\|POSIX/localeconv>, which is thread-friendly.
	1387
	1388	=cut
	1389
	1390	*/
	1391
	1392	const char *
	1393	#ifdef HAS_NL_LANGINFO
	1394	Perl_langinfo(const nl_item item)
	1395	#else
	1396	Perl_langinfo(const int item)
	1397	#endif
	1398	{
	1399	return my_nl_langinfo(item, TRUE);
	1400	}
	1401
	1402	const char *
	1403	#ifdef HAS_NL_LANGINFO
	1404	S_my_nl_langinfo(const nl_item item, bool toggle)
	1405	#else
	1406	S_my_nl_langinfo(const int item, bool toggle)
	1407	#endif
	1408	{
	1409	dTHX;
	1410
	1411	/* We only need to toggle into the underlying LC_NUMERIC locale for these
	1412	* two items, and only if not already there */
	1413	if (toggle && (( item != PERL_RADIXCHAR && item != PERL_THOUSEP)
	1414	\|\| PL_numeric_underlying))
	1415	{
	1416	toggle = FALSE;
	1417	}
	1418
	1419	#if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
	1420	#if ! defined(HAS_POSIX_2008_LOCALE)
	1421
	1422	/* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
	1423	* for those items dependent on it. This must be copied to a buffer before
	1424	* switching back, as some systems destroy the buffer when setlocale() is
	1425	* called */
	1426
	1427	{
	1428	DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
	1429
	1430	if (toggle) {
	1431	STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
	1432	}
	1433
	1434	LOCALE_LOCK; /* Prevent interference from another thread executing
	1435	this code section (the only call to nl_langinfo in
	1436	the core) */
	1437
	1438	save_to_buffer(nl_langinfo(item), &PL_langinfo_buf,
	1439	&PL_langinfo_bufsize, 0);
	1440
	1441	LOCALE_UNLOCK;
	1442
	1443	if (toggle) {
	1444	RESTORE_LC_NUMERIC();
	1445	}
	1446	}
	1447
	1448	# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */
	1449
	1450	{
	1451	bool do_free = FALSE;
	1452	locale_t cur = uselocale((locale_t) 0);
	1453
	1454	if (cur == LC_GLOBAL_LOCALE) {
	1455	cur = duplocale(LC_GLOBAL_LOCALE);
	1456	do_free = TRUE;
	1457	}
	1458
	1459	if (toggle) {
	1460	if (PL_underlying_numeric_obj) {
	1461	cur = PL_underlying_numeric_obj;
	1462	}
	1463	else {
	1464	cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
	1465	do_free = TRUE;
	1466	}
	1467	}
	1468
	1469	save_to_buffer(nl_langinfo_l(item, cur),
	1470	&PL_langinfo_buf, &PL_langinfo_bufsize, 0);
	1471	if (do_free) {
	1472	freelocale(cur);
	1473	}
	1474	}
	1475
	1476	# endif
	1477
	1478	if (strEQ(PL_langinfo_buf, "")) {
	1479	if (item == PERL_YESSTR) {
	1480	return "yes";
	1481	}
	1482	if (item == PERL_NOSTR) {
	1483	return "no";
	1484	}
	1485	}
	1486
	1487	return PL_langinfo_buf;
	1488
	1489	#else /* Below, emulate nl_langinfo as best we can */
	1490
	1491	{
	1492
	1493	# ifdef HAS_LOCALECONV
	1494
	1495	const struct lconv* lc;
	1496	DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
	1497
	1498	# endif
	1499	# ifdef HAS_STRFTIME
	1500
	1501	struct tm tm;
	1502	bool return_format = FALSE; /* Return the %format, not the value */
	1503	const char * format;
	1504
	1505	# endif
	1506
	1507	/* We copy the results to a per-thread buffer, even if not
	1508	* multi-threaded. This is in part to simplify this code, and partly
	1509	* because we need a buffer anyway for strftime(), and partly because a
	1510	* call of localeconv() could otherwise wipe out the buffer, and the
	1511	* programmer would not be expecting this, as this is a nl_langinfo()
	1512	* substitute after all, so s/he might be thinking their localeconv()
	1513	* is safe until another localeconv() call. */
	1514
	1515	switch (item) {
	1516	Size_t len;
	1517	const char * retval;
	1518
	1519	/* These 2 are unimplemented */
	1520	case PERL_CODESET:
	1521	case PERL_ERA: /* For use with strftime() %E modifier */
	1522
	1523	default:
	1524	return "";
	1525
	1526	/* We use only an English set, since we don't know any more */
	1527	case PERL_YESEXPR: return "^[+1yY]";
	1528	case PERL_YESSTR: return "yes";
	1529	case PERL_NOEXPR: return "^[-0nN]";
	1530	case PERL_NOSTR: return "no";
	1531
	1532	# ifdef HAS_LOCALECONV
	1533
	1534	case PERL_CRNCYSTR:
	1535
	1536	/* We don't bother with localeconv_l() because any system that
	1537	* has it is likely to also have nl_langinfo() */
	1538
	1539	LOCALE_LOCK; /* Prevent interference with other threads
	1540	using localeconv() */
	1541
	1542	lc = localeconv();
	1543	if ( ! lc
	1544	\|\| ! lc->currency_symbol
	1545	\|\| strEQ("", lc->currency_symbol))
	1546	{
	1547	LOCALE_UNLOCK;
	1548	return "";
	1549	}
	1550
	1551	/* Leave the first spot empty to be filled in below */
	1552	save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
	1553	&PL_langinfo_bufsize, 1);
	1554	if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
	1555	{ /* khw couldn't figure out how the localedef specifications
	1556	would show that the $ should replace the radix; this is
	1557	just a guess as to how it might work.*/
	1558	*PL_langinfo_buf = '.';
	1559	}
	1560	else if (lc->p_cs_precedes) {
	1561	*PL_langinfo_buf = '-';
	1562	}
	1563	else {
	1564	*PL_langinfo_buf = '+';
	1565	}
	1566
	1567	LOCALE_UNLOCK;
	1568	break;
	1569
	1570	case PERL_RADIXCHAR:
	1571	case PERL_THOUSEP:
	1572
	1573	if (toggle) {
	1574	STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
	1575	}
	1576
	1577	LOCALE_LOCK; /* Prevent interference with other threads
	1578	using localeconv() */
	1579
	1580	lc = localeconv();
	1581	if (! lc) {
	1582	retval = "";
	1583	}
	1584	else {
	1585	retval = (item == PERL_RADIXCHAR)
	1586	? lc->decimal_point
	1587	: lc->thousands_sep;
	1588	if (! retval) {
	1589	retval = "";
	1590	}
	1591	}
	1592
	1593	save_to_buffer(retval, &PL_langinfo_buf,
	1594	&PL_langinfo_bufsize, 0);
	1595
	1596	LOCALE_UNLOCK;
	1597
	1598	if (toggle) {
	1599	RESTORE_LC_NUMERIC();
	1600	}
	1601
	1602	break;
	1603
	1604	# endif
	1605	# ifdef HAS_STRFTIME
	1606
	1607	/* These are defined by C89, so we assume that strftime supports
	1608	* them, and so are returned unconditionally; they may not be what
	1609	* the locale actually says, but should give good enough results
	1610	* for someone using them as formats (as opposed to trying to parse
	1611	* them to figure out what the locale says). The other format
	1612	* items are actually tested to verify they work on the platform */
	1613	case PERL_D_FMT: return "%x";
	1614	case PERL_T_FMT: return "%X";
	1615	case PERL_D_T_FMT: return "%c";
	1616
	1617	/* These formats are only available in later strfmtime's */
	1618	case PERL_ERA_D_FMT: case PERL_ERA_T_FMT: case PERL_ERA_D_T_FMT:
	1619	case PERL_T_FMT_AMPM:
	1620
	1621	/* The rest can be gotten from most versions of strftime(). */
	1622	case PERL_ABDAY_1: case PERL_ABDAY_2: case PERL_ABDAY_3:
	1623	case PERL_ABDAY_4: case PERL_ABDAY_5: case PERL_ABDAY_6:
	1624	case PERL_ABDAY_7:
	1625	case PERL_ALT_DIGITS:
	1626	case PERL_AM_STR: case PERL_PM_STR:
	1627	case PERL_ABMON_1: case PERL_ABMON_2: case PERL_ABMON_3:
	1628	case PERL_ABMON_4: case PERL_ABMON_5: case PERL_ABMON_6:
	1629	case PERL_ABMON_7: case PERL_ABMON_8: case PERL_ABMON_9:
	1630	case PERL_ABMON_10: case PERL_ABMON_11: case PERL_ABMON_12:
	1631	case PERL_DAY_1: case PERL_DAY_2: case PERL_DAY_3: case PERL_DAY_4:
	1632	case PERL_DAY_5: case PERL_DAY_6: case PERL_DAY_7:
	1633	case PERL_MON_1: case PERL_MON_2: case PERL_MON_3: case PERL_MON_4:
	1634	case PERL_MON_5: case PERL_MON_6: case PERL_MON_7: case PERL_MON_8:
	1635	case PERL_MON_9: case PERL_MON_10: case PERL_MON_11:
	1636	case PERL_MON_12:
	1637
	1638	LOCALE_LOCK;
	1639
	1640	init_tm(&tm); /* Precaution against core dumps */
	1641	tm.tm_sec = 30;
	1642	tm.tm_min = 30;
	1643	tm.tm_hour = 6;
	1644	tm.tm_year = 2017 - 1900;
	1645	tm.tm_wday = 0;
	1646	tm.tm_mon = 0;
	1647	switch (item) {
	1648	default:
	1649	LOCALE_UNLOCK;
	1650	Perl_croak(aTHX_
	1651	"panic: %s: %d: switch case: %d problem",
	1652	__FILE__, __LINE__, item);
	1653	NOT_REACHED; /* NOTREACHED */
	1654
	1655	case PERL_PM_STR: tm.tm_hour = 18;
	1656	case PERL_AM_STR:
	1657	format = "%p";
	1658	break;
	1659
	1660	case PERL_ABDAY_7: tm.tm_wday++;
	1661	case PERL_ABDAY_6: tm.tm_wday++;
	1662	case PERL_ABDAY_5: tm.tm_wday++;
	1663	case PERL_ABDAY_4: tm.tm_wday++;
	1664	case PERL_ABDAY_3: tm.tm_wday++;
	1665	case PERL_ABDAY_2: tm.tm_wday++;
	1666	case PERL_ABDAY_1:
	1667	format = "%a";
	1668	break;
	1669
	1670	case PERL_DAY_7: tm.tm_wday++;
	1671	case PERL_DAY_6: tm.tm_wday++;
	1672	case PERL_DAY_5: tm.tm_wday++;
	1673	case PERL_DAY_4: tm.tm_wday++;
	1674	case PERL_DAY_3: tm.tm_wday++;
	1675	case PERL_DAY_2: tm.tm_wday++;
	1676	case PERL_DAY_1:
	1677	format = "%A";
	1678	break;
	1679
	1680	case PERL_ABMON_12: tm.tm_mon++;
	1681	case PERL_ABMON_11: tm.tm_mon++;
	1682	case PERL_ABMON_10: tm.tm_mon++;
	1683	case PERL_ABMON_9: tm.tm_mon++;
	1684	case PERL_ABMON_8: tm.tm_mon++;
	1685	case PERL_ABMON_7: tm.tm_mon++;
	1686	case PERL_ABMON_6: tm.tm_mon++;
	1687	case PERL_ABMON_5: tm.tm_mon++;
	1688	case PERL_ABMON_4: tm.tm_mon++;
	1689	case PERL_ABMON_3: tm.tm_mon++;
	1690	case PERL_ABMON_2: tm.tm_mon++;
	1691	case PERL_ABMON_1:
	1692	format = "%b";
	1693	break;
	1694
	1695	case PERL_MON_12: tm.tm_mon++;
	1696	case PERL_MON_11: tm.tm_mon++;
	1697	case PERL_MON_10: tm.tm_mon++;
	1698	case PERL_MON_9: tm.tm_mon++;
	1699	case PERL_MON_8: tm.tm_mon++;
	1700	case PERL_MON_7: tm.tm_mon++;
	1701	case PERL_MON_6: tm.tm_mon++;
	1702	case PERL_MON_5: tm.tm_mon++;
	1703	case PERL_MON_4: tm.tm_mon++;
	1704	case PERL_MON_3: tm.tm_mon++;
	1705	case PERL_MON_2: tm.tm_mon++;
	1706	case PERL_MON_1:
	1707	format = "%B";
	1708	break;
	1709
	1710	case PERL_T_FMT_AMPM:
	1711	format = "%r";
	1712	return_format = TRUE;
	1713	break;
	1714
	1715	case PERL_ERA_D_FMT:
	1716	format = "%Ex";
	1717	return_format = TRUE;
	1718	break;
	1719
	1720	case PERL_ERA_T_FMT:
	1721	format = "%EX";
	1722	return_format = TRUE;
	1723	break;
	1724
	1725	case PERL_ERA_D_T_FMT:
	1726	format = "%Ec";
	1727	return_format = TRUE;
	1728	break;
	1729
	1730	case PERL_ALT_DIGITS:
	1731	tm.tm_wday = 0;
	1732	format = "%Ow"; /* Find the alternate digit for 0 */
	1733	break;
	1734	}
	1735
	1736	/* We can't use my_strftime() because it doesn't look at
	1737	* tm_wday */
	1738	while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
	1739	format, &tm))
	1740	{
	1741	/* A zero return means one of:
	1742	* a) there wasn't enough space in PL_langinfo_buf
	1743	* b) the format, like a plain %p, returns empty
	1744	* c) it was an illegal format, though some
	1745	* implementations of strftime will just return the
	1746	* illegal format as a plain character sequence.
	1747	*
	1748	* To quickly test for case 'b)', try again but precede
	1749	* the format with a plain character. If that result is
	1750	* still empty, the problem is either 'a)' or 'c)' */
	1751
	1752	Size_t format_size = strlen(format) + 1;
	1753	Size_t mod_size = format_size + 1;
	1754	char * mod_format;
	1755	char * temp_result;
	1756
	1757	Newx(mod_format, mod_size, char);
	1758	Newx(temp_result, PL_langinfo_bufsize, char);
	1759	*mod_format = ' ';
	1760	my_strlcpy(mod_format + 1, format, mod_size);
	1761	len = strftime(temp_result,
	1762	PL_langinfo_bufsize,
	1763	mod_format, &tm);
	1764	Safefree(mod_format);
	1765	Safefree(temp_result);
	1766
	1767	/* If 'len' is non-zero, it means that we had a case like
	1768	* %p which means the current locale doesn't use a.m. or
	1769	* p.m., and that is valid */
	1770	if (len == 0) {
	1771
	1772	/* Here, still didn't work. If we get well beyond a
	1773	* reasonable size, bail out to prevent an infinite
	1774	* loop. */
	1775
	1776	if (PL_langinfo_bufsize > 100 * format_size) {
	1777	*PL_langinfo_buf = '\0';
	1778	}
	1779	else {
	1780	/* Double the buffer size to retry; Add 1 in case
	1781	* original was 0, so we aren't stuck at 0. */
	1782	PL_langinfo_bufsize *= 2;
	1783	PL_langinfo_bufsize++;
	1784	Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
	1785	continue;
	1786	}
	1787	}
	1788
	1789	break;
	1790	}
	1791
	1792	/* Here, we got a result.
	1793	*
	1794	* If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
	1795	* alternate format for wday 0. If the value is the same as
	1796	* the normal 0, there isn't an alternate, so clear the buffer.
	1797	* */
	1798	if ( item == PERL_ALT_DIGITS
	1799	&& strEQ(PL_langinfo_buf, "0"))
	1800	{
	1801	*PL_langinfo_buf = '\0';
	1802	}
	1803
	1804	/* ALT_DIGITS is problematic. Experiments on it showed that
	1805	* strftime() did not always work properly when going from
	1806	* alt-9 to alt-10. Only a few locales have this item defined,
	1807	* and in all of them on Linux that khw was able to find,
	1808	* nl_langinfo() merely returned the alt-0 character, possibly
	1809	* doubled. Most Unicode digits are in blocks of 10
	1810	* consecutive code points, so that is sufficient information
	1811	* for those scripts, as we can infer alt-1, alt-2, .... But
	1812	* for a Japanese locale, a CJK ideographic 0 is returned, and
	1813	* the CJK digits are not in code point order, so you can't
	1814	* really infer anything. The localedef for this locale did
	1815	* specify the succeeding digits, so that strftime() works
	1816	* properly on them, without needing to infer anything. But
	1817	* the nl_langinfo() return did not give sufficient information
	1818	* for the caller to understand what's going on. So until
	1819	* there is evidence that it should work differently, this
	1820	* returns the alt-0 string for ALT_DIGITS.
	1821	*
	1822	* wday was chosen because its range is all a single digit.
	1823	* Things like tm_sec have two digits as the minimum: '00' */
	1824
	1825	LOCALE_UNLOCK;
	1826
	1827	/* If to return the format, not the value, overwrite the buffer
	1828	* with it. But some strftime()s will keep the original format
	1829	* if illegal, so change those to "" */
	1830	if (return_format) {
	1831	if (strEQ(PL_langinfo_buf, format)) {
	1832	*PL_langinfo_buf = '\0';
	1833	}
	1834	else {
	1835	save_to_buffer(format, &PL_langinfo_buf,
	1836	&PL_langinfo_bufsize, 0);
	1837	}
	1838	}
	1839
	1840	break;
	1841
	1842	# endif
	1843
	1844	}
	1845	}
	1846
	1847	return PL_langinfo_buf;
	1848
	1849	#endif
	1850
	1851	}
	1852
	1853	/*
	1854	* Initialize locale awareness.
	1855	*/
	1856	int
	1857	Perl_init_i18nl10n(pTHX_ int printwarn)
	1858	{
	1859	/* printwarn is
	1860	*
	1861	* 0 if not to output warning when setup locale is bad
	1862	* 1 if to output warning based on value of PERL_BADLANG
	1863	* >1 if to output regardless of PERL_BADLANG
	1864	*
	1865	* returns
	1866	* 1 = set ok or not applicable,
	1867	* 0 = fallback to a locale of lower priority
	1868	* -1 = fallback to all locales failed, not even to the C locale
	1869	*
	1870	* Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
	1871	* set, debugging information is output.
	1872	*
	1873	* This looks more complicated than it is, mainly due to the #ifdefs.
	1874	*
	1875	* We try to set LC_ALL to the value determined by the environment. If
	1876	* there is no LC_ALL on this platform, we try the individual categories we
	1877	* know about. If this works, we are done.
	1878	*
	1879	* But if it doesn't work, we have to do something else. We search the
	1880	* environment variables ourselves instead of relying on the system to do
	1881	* it. We look at, in order, LC_ALL, LANG, a system default locale (if we
	1882	* think there is one), and the ultimate fallback "C". This is all done in
	1883	* the same loop as above to avoid duplicating code, but it makes things
	1884	* more complex. The 'trial_locales' array is initialized with just one
	1885	* element; it causes the behavior described in the paragraph above this to
	1886	* happen. If that fails, we add elements to 'trial_locales', and do extra
	1887	* loop iterations to cause the behavior described in this paragraph.
	1888	*
	1889	* On Ultrix, the locale MUST come from the environment, so there is
	1890	* preliminary code to set it. I (khw) am not sure that it is necessary,
	1891	* and that this couldn't be folded into the loop, but barring any real
	1892	* platforms to test on, it's staying as-is
	1893	*
	1894	* A slight complication is that in embedded Perls, the locale may already
	1895	* be set-up, and we don't want to get it from the normal environment
	1896	* variables. This is handled by having a special environment variable
	1897	* indicate we're in this situation. We simply set setlocale's 2nd
	1898	* parameter to be a NULL instead of "". That indicates to setlocale that
	1899	* it is not to change anything, but to return the current value,
	1900	* effectively initializing perl's db to what the locale already is.
	1901	*
	1902	* We play the same trick with NULL if a LC_ALL succeeds. We call
	1903	* setlocale() on the individual categores with NULL to get their existing
	1904	* values for our db, instead of trying to change them.
	1905	* */
	1906
	1907	int ok = 1;
	1908
	1909	#ifndef USE_LOCALE
	1910
	1911	PERL_UNUSED_ARG(printwarn);
	1912
	1913	#else /* USE_LOCALE */
	1914	# ifdef __GLIBC__
	1915
	1916	const char * const language = savepv(PerlEnv_getenv("LANGUAGE"));
	1917
	1918	# endif
	1919
	1920	/* NULL uses the existing already set up locale */
	1921	const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
	1922	? NULL
	1923	: "";
	1924	const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */
	1925	unsigned int trial_locales_count;
	1926	const char * const lc_all = savepv(PerlEnv_getenv("LC_ALL"));
	1927	const char * const lang = savepv(PerlEnv_getenv("LANG"));
	1928	bool setlocale_failure = FALSE;
	1929	unsigned int i;
	1930
	1931	/* A later getenv() could zap this, so only use here */
	1932	const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
	1933
	1934	const bool locwarn = (printwarn > 1
	1935	\|\| ( printwarn
	1936	&& ( ! bad_lang_use_once
	1937	\|\| (
	1938	/* disallow with "" or "0" */
	1939	*bad_lang_use_once
	1940	&& strNE("0", bad_lang_use_once)))));
	1941
	1942	/* setlocale() return vals; not copied so must be looked at immediately */
	1943	const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
	1944
	1945	/* current locale for given category; should have been copied so aren't
	1946	* volatile */
	1947	const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
	1948
	1949	# ifdef WIN32
	1950
	1951	/* In some systems you can find out the system default locale
	1952	* and use that as the fallback locale. */
	1953	# define SYSTEM_DEFAULT_LOCALE
	1954	# endif
	1955	# ifdef SYSTEM_DEFAULT_LOCALE
	1956
	1957	const char *system_default_locale = NULL;
	1958
	1959	# endif
	1960
	1961	# ifndef DEBUGGING
	1962	# define DEBUG_LOCALE_INIT(a,b,c)
	1963	# else
	1964
	1965	DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
	1966
	1967	# define DEBUG_LOCALE_INIT(category, locale, result) \
	1968	STMT_START { \
	1969	if (debug_initialization) { \
	1970	PerlIO_printf(Perl_debug_log, \
	1971	"%s:%d: %s\n", \
	1972	__FILE__, __LINE__, \
	1973	setlocale_debug_string(category, \
	1974	locale, \
	1975	result)); \
	1976	} \
	1977	} STMT_END
	1978
	1979	/* Make sure the parallel arrays are properly set up */
	1980	# ifdef USE_LOCALE_NUMERIC
	1981	assert(categories[LC_NUMERIC_INDEX] == LC_NUMERIC);
	1982	assert(strEQ(category_names[LC_NUMERIC_INDEX], "LC_NUMERIC"));
	1983	# endif
	1984	# ifdef USE_LOCALE_CTYPE
	1985	assert(categories[LC_CTYPE_INDEX] == LC_CTYPE);
	1986	assert(strEQ(category_names[LC_CTYPE_INDEX], "LC_CTYPE"));
	1987	# endif
	1988	# ifdef USE_LOCALE_COLLATE
	1989	assert(categories[LC_COLLATE_INDEX] == LC_COLLATE);
	1990	assert(strEQ(category_names[LC_COLLATE_INDEX], "LC_COLLATE"));
	1991	# endif
	1992	# ifdef USE_LOCALE_TIME
	1993	assert(categories[LC_TIME_INDEX] == LC_TIME);
	1994	assert(strEQ(category_names[LC_TIME_INDEX], "LC_TIME"));
	1995	# endif
	1996	# ifdef USE_LOCALE_MESSAGES
	1997	assert(categories[LC_MESSAGES_INDEX] == LC_MESSAGES);
	1998	assert(strEQ(category_names[LC_MESSAGES_INDEX], "LC_MESSAGES"));
	1999	# endif
	2000	# ifdef USE_LOCALE_MONETARY
	2001	assert(categories[LC_MONETARY_INDEX] == LC_MONETARY);
	2002	assert(strEQ(category_names[LC_MONETARY_INDEX], "LC_MONETARY"));
	2003	# endif
	2004	# ifdef USE_LOCALE_ADDRESS
	2005	assert(categories[LC_ADDRESS_INDEX] == LC_ADDRESS);
	2006	assert(strEQ(category_names[LC_ADDRESS_INDEX], "LC_ADDRESS"));
	2007	# endif
	2008	# ifdef USE_LOCALE_IDENTIFICATION
	2009	assert(categories[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION);
	2010	assert(strEQ(category_names[LC_IDENTIFICATION_INDEX], "LC_IDENTIFICATION"));
	2011	# endif
	2012	# ifdef USE_LOCALE_MEASUREMENT
	2013	assert(categories[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT);
	2014	assert(strEQ(category_names[LC_MEASUREMENT_INDEX], "LC_MEASUREMENT"));
	2015	# endif
	2016	# ifdef USE_LOCALE_PAPER
	2017	assert(categories[LC_PAPER_INDEX] == LC_PAPER);
	2018	assert(strEQ(category_names[LC_PAPER_INDEX], "LC_PAPER"));
	2019	# endif
	2020	# ifdef USE_LOCALE_TELEPHONE
	2021	assert(categories[LC_TELEPHONE_INDEX] == LC_TELEPHONE);
	2022	assert(strEQ(category_names[LC_TELEPHONE_INDEX], "LC_TELEPHONE"));
	2023	# endif
	2024	# ifdef LC_ALL
	2025	assert(categories[LC_ALL_INDEX] == LC_ALL);
	2026	assert(strEQ(category_names[LC_ALL_INDEX], "LC_ALL"));
	2027	assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX);
	2028	# endif
	2029	# endif /* DEBUGGING */
	2030
	2031	/* Initialize the cache of the program's UTF-8ness for the always known
	2032	* locales C and POSIX */
	2033	my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness,
	2034	sizeof(PL_locale_utf8ness));
	2035
	2036	PL_numeric_radix_sv = newSVpvs(".");
	2037
	2038	# ifdef LOCALE_ENVIRON_REQUIRED
	2039
	2040	/*
	2041	* Ultrix setlocale(..., "") fails if there are no environment
	2042	* variables from which to get a locale name.
	2043	*/
	2044
	2045	# ifndef LC_ALL
	2046	# error Ultrix without LC_ALL not implemented
	2047	# else
	2048
	2049	{
	2050	bool done = FALSE;
	2051	if (lang) {
	2052	sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, setlocale_init);
	2053	DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result[LC_ALL_INDEX]);
	2054	if (sl_result[LC_ALL_INDEX])
	2055	done = TRUE;
	2056	else
	2057	setlocale_failure = TRUE;
	2058	}
	2059	if (! setlocale_failure) {
	2060	const char * locale_param;
	2061	for (i = 0; i < LC_ALL_INDEX; i++) {
	2062	locale_param = (! done && (lang \|\| PerlEnv_getenv(category_names[i])))
	2063	? setlocale_init
	2064	: NULL;
	2065	sl_result[i] = do_setlocale_r(categories[i], locale_param);
	2066	if (! sl_result[i]) {
	2067	setlocale_failure = TRUE;
	2068	}
	2069	DEBUG_LOCALE_INIT(categories[i], locale_param, sl_result[i]);
	2070	}
	2071	}
	2072	}
	2073
	2074	# endif /* LC_ALL */
	2075	# endif /* LOCALE_ENVIRON_REQUIRED */
	2076
	2077	/* We try each locale in the list until we get one that works, or exhaust
	2078	* the list. Normally the loop is executed just once. But if setting the
	2079	* locale fails, inside the loop we add fallback trials to the array and so
	2080	* will execute the loop multiple times */
	2081	trial_locales[0] = setlocale_init;
	2082	trial_locales_count = 1;
	2083
	2084	for (i= 0; i < trial_locales_count; i++) {
	2085	const char * trial_locale = trial_locales[i];
	2086
	2087	if (i > 0) {
	2088
	2089	/* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
	2090	* when i==0, but I (khw) don't think that behavior makes much
	2091	* sense */
	2092	setlocale_failure = FALSE;
	2093
	2094	# ifdef SYSTEM_DEFAULT_LOCALE
	2095	# ifdef WIN32 /* Note that assumes Win32 has LC_ALL */
	2096
	2097	/* On Windows machines, an entry of "" after the 0th means to use
	2098	* the system default locale, which we now proceed to get. */
	2099	if (strEQ(trial_locale, "")) {
	2100	unsigned int j;
	2101
	2102	/* Note that this may change the locale, but we are going to do
	2103	* that anyway just below */
	2104	system_default_locale = do_setlocale_c(LC_ALL, "");
	2105	DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
	2106
	2107	/* Skip if invalid or if it's already on the list of locales to
	2108	* try */
	2109	if (! system_default_locale) {
	2110	goto next_iteration;
	2111	}
	2112	for (j = 0; j < trial_locales_count; j++) {
	2113	if (strEQ(system_default_locale, trial_locales[j])) {
	2114	goto next_iteration;
	2115	}
	2116	}
	2117
	2118	trial_locale = system_default_locale;
	2119	}
	2120	# else
	2121	# error SYSTEM_DEFAULT_LOCALE only implemented for Win32
	2122	# endif
	2123	# endif /* SYSTEM_DEFAULT_LOCALE */
	2124
	2125	} /* For i > 0 */
	2126
	2127	# ifdef LC_ALL
	2128
	2129	sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, trial_locale);
	2130	DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result[LC_ALL_INDEX]);
	2131	if (! sl_result[LC_ALL_INDEX]) {
	2132	setlocale_failure = TRUE;
	2133	}
	2134	else {
	2135	/* Since LC_ALL succeeded, it should have changed all the other
	2136	* categories it can to its value; so we massage things so that the
	2137	* setlocales below just return their category's current values.
	2138	* This adequately handles the case in NetBSD where LC_COLLATE may
	2139	* not be defined for a locale, and setting it individually will
	2140	* fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
	2141	* the POSIX locale. */
	2142	trial_locale = NULL;
	2143	}
	2144
	2145	# endif /* LC_ALL */
	2146
	2147	if (! setlocale_failure) {
	2148	unsigned int j;
	2149	for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
	2150	curlocales[j]
	2151	= savepv(do_setlocale_r(categories[j], trial_locale));
	2152	if (! curlocales[j]) {
	2153	setlocale_failure = TRUE;
	2154	}
	2155	DEBUG_LOCALE_INIT(categories[j], trial_locale, curlocales[j]);
	2156	}
	2157
	2158	if (! setlocale_failure) { /* All succeeded */
	2159	break; /* Exit trial_locales loop */
	2160	}
	2161	}
	2162
	2163	/* Here, something failed; will need to try a fallback. */
	2164	ok = 0;
	2165
	2166	if (i == 0) {
	2167	unsigned int j;
	2168
	2169	if (locwarn) { /* Output failure info only on the first one */
	2170
	2171	# ifdef LC_ALL
	2172
	2173	PerlIO_printf(Perl_error_log,
	2174	"perl: warning: Setting locale failed.\n");
	2175
	2176	# else /* !LC_ALL */
	2177
	2178	PerlIO_printf(Perl_error_log,
	2179	"perl: warning: Setting locale failed for the categories:\n\t");
	2180
	2181	for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
	2182	if (! curlocales[j]) {
	2183	PerlIO_printf(Perl_error_log, category_names[j]);
	2184	}
	2185	else {
	2186	Safefree(curlocales[j]);
	2187	}
	2188	}
	2189
	2190	# endif /* LC_ALL */
	2191
	2192	PerlIO_printf(Perl_error_log,
	2193	"perl: warning: Please check that your locale settings:\n");
	2194
	2195	# ifdef __GLIBC__
	2196
	2197	PerlIO_printf(Perl_error_log,
	2198	"\tLANGUAGE = %c%s%c,\n",
	2199	language ? '"' : '(',
	2200	language ? language : "unset",
	2201	language ? '"' : ')');
	2202	# endif
	2203
	2204	PerlIO_printf(Perl_error_log,
	2205	"\tLC_ALL = %c%s%c,\n",
	2206	lc_all ? '"' : '(',
	2207	lc_all ? lc_all : "unset",
	2208	lc_all ? '"' : ')');
	2209
	2210	# if defined(USE_ENVIRON_ARRAY)
	2211
	2212	{
	2213	char **e;
	2214
	2215	/* Look through the environment for any variables of the
	2216	* form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
	2217	* already handled above. These are assumed to be locale
	2218	* settings. Output them and their values. */
	2219	for (e = environ; *e; e++) {
	2220	const STRLEN prefix_len = sizeof("LC_") - 1;
	2221	STRLEN uppers_len;
	2222
	2223	if ( strBEGINs(*e, "LC_")
	2224	&& ! strBEGINs(*e, "LC_ALL=")
	2225	&& (uppers_len = strspn(*e + prefix_len,
	2226	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
	2227	&& ((*e)[prefix_len + uppers_len] == '='))
	2228	{
	2229	PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
	2230	(int) (prefix_len + uppers_len), *e,
	2231	*e + prefix_len + uppers_len + 1);
	2232	}
	2233	}
	2234	}
	2235
	2236	# else
	2237
	2238	PerlIO_printf(Perl_error_log,
	2239	"\t(possibly more locale environment variables)\n");
	2240
	2241	# endif
	2242
	2243	PerlIO_printf(Perl_error_log,
	2244	"\tLANG = %c%s%c\n",
	2245	lang ? '"' : '(',
	2246	lang ? lang : "unset",
	2247	lang ? '"' : ')');
	2248
	2249	PerlIO_printf(Perl_error_log,
	2250	" are supported and installed on your system.\n");
	2251	}
	2252
	2253	/* Calculate what fallback locales to try. We have avoided this
	2254	* until we have to, because failure is quite unlikely. This will
	2255	* usually change the upper bound of the loop we are in.
	2256	*
	2257	* Since the system's default way of setting the locale has not
	2258	* found one that works, We use Perl's defined ordering: LC_ALL,
	2259	* LANG, and the C locale. We don't try the same locale twice, so
	2260	* don't add to the list if already there. (On POSIX systems, the
	2261	* LC_ALL element will likely be a repeat of the 0th element "",
	2262	* but there's no harm done by doing it explicitly.
	2263	*
	2264	* Note that this tries the LC_ALL environment variable even on
	2265	* systems which have no LC_ALL locale setting. This may or may
	2266	* not have been originally intentional, but there's no real need
	2267	* to change the behavior. */
	2268	if (lc_all) {
	2269	for (j = 0; j < trial_locales_count; j++) {
	2270	if (strEQ(lc_all, trial_locales[j])) {
	2271	goto done_lc_all;
	2272	}
	2273	}
	2274	trial_locales[trial_locales_count++] = lc_all;
	2275	}
	2276	done_lc_all:
	2277
	2278	if (lang) {
	2279	for (j = 0; j < trial_locales_count; j++) {
	2280	if (strEQ(lang, trial_locales[j])) {
	2281	goto done_lang;
	2282	}
	2283	}
	2284	trial_locales[trial_locales_count++] = lang;
	2285	}
	2286	done_lang:
	2287
	2288	# if defined(WIN32) && defined(LC_ALL)
	2289
	2290	/* For Windows, we also try the system default locale before "C".
	2291	* (If there exists a Windows without LC_ALL we skip this because
	2292	* it gets too complicated. For those, the "C" is the next
	2293	* fallback possibility). The "" is the same as the 0th element of
	2294	* the array, but the code at the loop above knows to treat it
	2295	* differently when not the 0th */
	2296	trial_locales[trial_locales_count++] = "";
	2297
	2298	# endif
	2299
	2300	for (j = 0; j < trial_locales_count; j++) {
	2301	if (strEQ("C", trial_locales[j])) {
	2302	goto done_C;
	2303	}
	2304	}
	2305	trial_locales[trial_locales_count++] = "C";
	2306
	2307	done_C: ;
	2308	} /* end of first time through the loop */
	2309
	2310	# ifdef WIN32
	2311
	2312	next_iteration: ;
	2313
	2314	# endif
	2315
	2316	} /* end of looping through the trial locales */
	2317
	2318	if (ok < 1) { /* If we tried to fallback */
	2319	const char* msg;
	2320	if (! setlocale_failure) { /* fallback succeeded */
	2321	msg = "Falling back to";
	2322	}
	2323	else { /* fallback failed */
	2324	unsigned int j;
	2325
	2326	/* We dropped off the end of the loop, so have to decrement i to
	2327	* get back to the value the last time through */
	2328	i--;
	2329
	2330	ok = -1;
	2331	msg = "Failed to fall back to";
	2332
	2333	/* To continue, we should use whatever values we've got */
	2334
	2335	for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
	2336	Safefree(curlocales[j]);
	2337	curlocales[j] = savepv(do_setlocale_r(categories[j], NULL));
	2338	DEBUG_LOCALE_INIT(categories[j], NULL, curlocales[j]);
	2339	}
	2340	}
	2341
	2342	if (locwarn) {
	2343	const char * description;
	2344	const char * name = "";
	2345	if (strEQ(trial_locales[i], "C")) {
	2346	description = "the standard locale";
	2347	name = "C";
	2348	}
	2349
	2350	# ifdef SYSTEM_DEFAULT_LOCALE
	2351
	2352	else if (strEQ(trial_locales[i], "")) {
	2353	description = "the system default locale";
	2354	if (system_default_locale) {
	2355	name = system_default_locale;
	2356	}
	2357	}
	2358
	2359	# endif /* SYSTEM_DEFAULT_LOCALE */
	2360
	2361	else {
	2362	description = "a fallback locale";
	2363	name = trial_locales[i];
	2364	}
	2365	if (name && strNE(name, "")) {
	2366	PerlIO_printf(Perl_error_log,
	2367	"perl: warning: %s %s (\"%s\").\n", msg, description, name);
	2368	}
	2369	else {
	2370	PerlIO_printf(Perl_error_log,
	2371	"perl: warning: %s %s.\n", msg, description);
	2372	}
	2373	}
	2374	} /* End of tried to fallback */
	2375
	2376	/* Done with finding the locales; update our records */
	2377
	2378	# ifdef USE_LOCALE_CTYPE
	2379
	2380	new_ctype(curlocales[LC_CTYPE_INDEX]);
	2381
	2382	# endif
	2383	# ifdef USE_LOCALE_COLLATE
	2384
	2385	new_collate(curlocales[LC_COLLATE_INDEX]);
	2386
	2387	# endif
	2388	# ifdef USE_LOCALE_NUMERIC
	2389
	2390	new_numeric(curlocales[LC_NUMERIC_INDEX]);
	2391
	2392	# endif
	2393
	2394	for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
	2395
	2396	# if defined(USE_ITHREADS)
	2397
	2398	/* This caches whether each category's locale is UTF-8 or not. This
	2399	* may involve changing the locale. It is ok to do this at
	2400	* initialization time before any threads have started, but not later.
	2401	* Caching means that if the program heeds our dictate not to change
	2402	* locales in threaded applications, this data will remain valid, and
	2403	* it may get queried without changing locales. If the environment is
	2404	* such that all categories have the same locale, this isn't needed, as
	2405	* the code will not change the locale; but this handles the uncommon
	2406	* case where the environment has disparate locales for the categories
	2407	* */
	2408	(void) _is_cur_LC_category_utf8(categories[i]);
	2409
	2410	# endif
	2411
	2412	Safefree(curlocales[i]);
	2413	}
	2414
	2415	# if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
	2416
	2417	/* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
	2418	* locale is UTF-8. The call to new_ctype() just above has already
	2419	* calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
	2420	* both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
	2421	* are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
	2422	* STDIN, STDOUT, STDERR, _and_ the default open discipline. */
	2423	PL_utf8locale = PL_in_utf8_CTYPE_locale;
	2424
	2425	/* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
	2426	This is an alternative to using the -C command line switch
	2427	(the -C if present will override this). */
	2428	{
	2429	const char *p = PerlEnv_getenv("PERL_UNICODE");
	2430	PL_unicode = p ? parse_unicode_opts(&p) : 0;
	2431	if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
	2432	PL_utf8cache = -1;
	2433	}
	2434
	2435	# endif
	2436	# ifdef __GLIBC__
	2437
	2438	Safefree(language);
	2439
	2440	# endif
	2441
	2442	Safefree(lc_all);
	2443	Safefree(lang);
	2444
	2445	#endif /* USE_LOCALE */
	2446	#ifdef DEBUGGING
	2447
	2448	/* So won't continue to output stuff */
	2449	DEBUG_INITIALIZATION_set(FALSE);
	2450
	2451	#endif
	2452
	2453	return ok;
	2454	}
	2455
	2456	#ifdef USE_LOCALE_COLLATE
	2457
	2458	char *
	2459	Perl__mem_collxfrm(pTHX_ const char *input_string,
	2460	STRLEN len, /* Length of 'input_string' */
	2461	STRLEN xlen, / Set to length of returned string
	2462	(not including the collation index
	2463	prefix) */
	2464	bool utf8 /* Is the input in UTF-8? */
	2465	)
	2466	{
	2467
	2468	/* _mem_collxfrm() is a bit like strxfrm() but with two important
	2469	* differences. First, it handles embedded NULs. Second, it allocates a bit
	2470	* more memory than needed for the transformed data itself. The real
	2471	* transformed data begins at offset COLLXFRM_HDR_LEN. *xlen is set to
	2472	* the length of that, and doesn't include the collation index size.
	2473	* Please see sv_collxfrm() to see how this is used. */
	2474
	2475	#define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
	2476
	2477	char * s = (char *) input_string;
	2478	STRLEN s_strlen = strlen(input_string);
	2479	char *xbuf = NULL;
	2480	STRLEN xAlloc; /* xalloc is a reserved word in VC */
	2481	STRLEN length_in_chars;
	2482	bool first_time = TRUE; /* Cleared after first loop iteration */
	2483
	2484	PERL_ARGS_ASSERT__MEM_COLLXFRM;
	2485
	2486	/* Must be NUL-terminated */
	2487	assert(*(input_string + len) == '\0');
	2488
	2489	/* If this locale has defective collation, skip */
	2490	if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
	2491	DEBUG_L(PerlIO_printf(Perl_debug_log,
	2492	"_mem_collxfrm: locale's collation is defective\n"));
	2493	goto bad;
	2494	}
	2495
	2496	/* Replace any embedded NULs with the control that sorts before any others.
	2497	* This will give as good as possible results on strings that don't
	2498	* otherwise contain that character, but otherwise there may be
	2499	* less-than-perfect results with that character and NUL. This is
	2500	* unavoidable unless we replace strxfrm with our own implementation. */
	2501	if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
	2502	NUL */
	2503	char * e = s + len;
	2504	char * sans_nuls;
	2505	STRLEN sans_nuls_len;
	2506	int try_non_controls;
	2507	char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
	2508	making sure 2nd byte is NUL.
	2509	*/
	2510	STRLEN this_replacement_len;
	2511
	2512	/* If we don't know what non-NUL control character sorts lowest for
	2513	* this locale, find it */
	2514	if (PL_strxfrm_NUL_replacement == '\0') {
	2515	int j;
	2516	char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
	2517	includes the collation index
	2518	prefixed. */
	2519
	2520	DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
	2521
	2522	/* Unlikely, but it may be that no control will work to replace
	2523	* NUL, in which case we instead look for any character. Controls
	2524	* are preferred because collation order is, in general, context
	2525	* sensitive, with adjoining characters affecting the order, and
	2526	* controls are less likely to have such interactions, allowing the
	2527	* NUL-replacement to stand on its own. (Another way to look at it
	2528	* is to imagine what would happen if the NUL were replaced by a
	2529	* combining character; it wouldn't work out all that well.) */
	2530	for (try_non_controls = 0;
	2531	try_non_controls < 2;
	2532	try_non_controls++)
	2533	{
	2534	/* Look through all legal code points (NUL isn't) */
	2535	for (j = 1; j < 256; j++) {
	2536	char * x; /* j's xfrm plus collation index */
	2537	STRLEN x_len; /* length of 'x' */
	2538	STRLEN trial_len = 1;
	2539	char cur_source[] = { '\0', '\0' };
	2540
	2541	/* Skip non-controls the first time through the loop. The
	2542	* controls in a UTF-8 locale are the L1 ones */
	2543	if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
	2544	? ! isCNTRL_L1(j)
	2545	: ! isCNTRL_LC(j))
	2546	{
	2547	continue;
	2548	}
	2549
	2550	/* Create a 1-char string of the current code point */
	2551	cur_source[0] = (char) j;
	2552
	2553	/* Then transform it */
	2554	x = _mem_collxfrm(cur_source, trial_len, &x_len,
	2555	0 /* The string is not in UTF-8 */);
	2556
	2557	/* Ignore any character that didn't successfully transform.
	2558	* */
	2559	if (! x) {
	2560	continue;
	2561	}
	2562
	2563	/* If this character's transformation is lower than
	2564	* the current lowest, this one becomes the lowest */
	2565	if ( cur_min_x == NULL
	2566	\|\| strLT(x + COLLXFRM_HDR_LEN,
	2567	cur_min_x + COLLXFRM_HDR_LEN))
	2568	{
	2569	PL_strxfrm_NUL_replacement = j;
	2570	cur_min_x = x;
	2571	}
	2572	else {
	2573	Safefree(x);
	2574	}
	2575	} /* end of loop through all 255 characters */
	2576
	2577	/* Stop looking if found */
	2578	if (cur_min_x) {
	2579	break;
	2580	}
	2581
	2582	/* Unlikely, but possible, if there aren't any controls that
	2583	* work in the locale, repeat the loop, looking for any
	2584	* character that works */
	2585	DEBUG_L(PerlIO_printf(Perl_debug_log,
	2586	"_mem_collxfrm: No control worked. Trying non-controls\n"));
	2587	} /* End of loop to try first the controls, then any char */
	2588
	2589	if (! cur_min_x) {
	2590	DEBUG_L(PerlIO_printf(Perl_debug_log,
	2591	"_mem_collxfrm: Couldn't find any character to replace"
	2592	" embedded NULs in locale %s with", PL_collation_name));
	2593	goto bad;
	2594	}
	2595
	2596	DEBUG_L(PerlIO_printf(Perl_debug_log,
	2597	"_mem_collxfrm: Replacing embedded NULs in locale %s with "
	2598	"0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
	2599
	2600	Safefree(cur_min_x);
	2601	} /* End of determining the character that is to replace NULs */
	2602
	2603	/* If the replacement is variant under UTF-8, it must match the
	2604	* UTF8-ness of the original */
	2605	if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
	2606	this_replacement_char[0] =
	2607	UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
	2608	this_replacement_char[1] =
	2609	UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
	2610	this_replacement_len = 2;
	2611	}
	2612	else {
	2613	this_replacement_char[0] = PL_strxfrm_NUL_replacement;
	2614	/* this_replacement_char[1] = '\0' was done at initialization */
	2615	this_replacement_len = 1;
	2616	}
	2617
	2618	/* The worst case length for the replaced string would be if every
	2619	* character in it is NUL. Multiply that by the length of each
	2620	* replacement, and allow for a trailing NUL */
	2621	sans_nuls_len = (len * this_replacement_len) + 1;
	2622	Newx(sans_nuls, sans_nuls_len, char);
	2623	*sans_nuls = '\0';
	2624
	2625	/* Replace each NUL with the lowest collating control. Loop until have
	2626	* exhausted all the NULs */
	2627	while (s + s_strlen < e) {
	2628	my_strlcat(sans_nuls, s, sans_nuls_len);
	2629
	2630	/* Do the actual replacement */
	2631	my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
	2632
	2633	/* Move past the input NUL */
	2634	s += s_strlen + 1;
	2635	s_strlen = strlen(s);
	2636	}
	2637
	2638	/* And add anything that trails the final NUL */
	2639	my_strlcat(sans_nuls, s, sans_nuls_len);
	2640
	2641	/* Switch so below we transform this modified string */
	2642	s = sans_nuls;
	2643	len = strlen(s);
	2644	} /* End of replacing NULs */
	2645
	2646	/* Make sure the UTF8ness of the string and locale match */
	2647	if (utf8 != PL_in_utf8_COLLATE_locale) {
	2648	const char * const t = s; /* Temporary so we can later find where the
	2649	input was */
	2650
	2651	/* Here they don't match. Change the string's to be what the locale is
	2652	* expecting */
	2653
	2654	if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
	2655	s = (char ) bytes_to_utf8((const U8 ) s, &len);
	2656	utf8 = TRUE;
	2657	}
	2658	else { /* locale is not UTF-8; but input is; downgrade the input */
	2659
	2660	s = (char ) bytes_from_utf8((const U8 ) s, &len, &utf8);
	2661
	2662	/* If the downgrade was successful we are done, but if the input
	2663	* contains things that require UTF-8 to represent, have to do
	2664	* damage control ... */
	2665	if (UNLIKELY(utf8)) {
	2666
	2667	/* What we do is construct a non-UTF-8 string with
	2668	* 1) the characters representable by a single byte converted
	2669	* to be so (if necessary);
	2670	* 2) and the rest converted to collate the same as the
	2671	* highest collating representable character. That makes
	2672	* them collate at the end. This is similar to how we
	2673	* handle embedded NULs, but we use the highest collating
	2674	* code point instead of the smallest. Like the NUL case,
	2675	* this isn't perfect, but is the best we can reasonably
	2676	* do. Every above-255 code point will sort the same as
	2677	* the highest-sorting 0-255 code point. If that code
	2678	* point can combine in a sequence with some other code
	2679	* points for weight calculations, us changing something to
	2680	* be it can adversely affect the results. But in most
	2681	* cases, it should work reasonably. And note that this is
	2682	* really an illegal situation: using code points above 255
	2683	* on a locale where only 0-255 are valid. If two strings
	2684	* sort entirely equal, then the sort order for the
	2685	* above-255 code points will be in code point order. */
	2686
	2687	utf8 = FALSE;
	2688
	2689	/* If we haven't calculated the code point with the maximum
	2690	* collating order for this locale, do so now */
	2691	if (! PL_strxfrm_max_cp) {
	2692	int j;
	2693
	2694	/* The current transformed string that collates the
	2695	* highest (except it also includes the prefixed collation
	2696	* index. */
	2697	char * cur_max_x = NULL;
	2698
	2699	/* Look through all legal code points (NUL isn't) */
	2700	for (j = 1; j < 256; j++) {
	2701	char * x;
	2702	STRLEN x_len;
	2703	char cur_source[] = { '\0', '\0' };
	2704
	2705	/* Create a 1-char string of the current code point */
	2706	cur_source[0] = (char) j;
	2707
	2708	/* Then transform it */
	2709	x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
	2710
	2711	/* If something went wrong (which it shouldn't), just
	2712	* ignore this code point */
	2713	if (! x) {
	2714	continue;
	2715	}
	2716
	2717	/* If this character's transformation is higher than
	2718	* the current highest, this one becomes the highest */
	2719	if ( cur_max_x == NULL
	2720	\|\| strGT(x + COLLXFRM_HDR_LEN,
	2721	cur_max_x + COLLXFRM_HDR_LEN))
	2722	{
	2723	PL_strxfrm_max_cp = j;
	2724	cur_max_x = x;
	2725	}
	2726	else {
	2727	Safefree(x);
	2728	}
	2729	}
	2730
	2731	if (! cur_max_x) {
	2732	DEBUG_L(PerlIO_printf(Perl_debug_log,
	2733	"_mem_collxfrm: Couldn't find any character to"
	2734	" replace above-Latin1 chars in locale %s with",
	2735	PL_collation_name));
	2736	goto bad;
	2737	}
	2738
	2739	DEBUG_L(PerlIO_printf(Perl_debug_log,
	2740	"_mem_collxfrm: highest 1-byte collating character"
	2741	" in locale %s is 0x%02X\n",
	2742	PL_collation_name,
	2743	PL_strxfrm_max_cp));
	2744
	2745	Safefree(cur_max_x);
	2746	}
	2747
	2748	/* Here we know which legal code point collates the highest.
	2749	* We are ready to construct the non-UTF-8 string. The length
	2750	* will be at least 1 byte smaller than the input string
	2751	* (because we changed at least one 2-byte character into a
	2752	* single byte), but that is eaten up by the trailing NUL */
	2753	Newx(s, len, char);
	2754
	2755	{
	2756	STRLEN i;
	2757	STRLEN d= 0;
	2758	char * e = (char *) t + len;
	2759
	2760	for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
	2761	U8 cur_char = t[i];
	2762	if (UTF8_IS_INVARIANT(cur_char)) {
	2763	s[d++] = cur_char;
	2764	}
	2765	else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
	2766	s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
	2767	}
	2768	else { /* Replace illegal cp with highest collating
	2769	one */
	2770	s[d++] = PL_strxfrm_max_cp;
	2771	}
	2772	}
	2773	s[d++] = '\0';
	2774	Renew(s, d, char); /* Free up unused space */
	2775	}
	2776	}
	2777	}
	2778
	2779	/* Here, we have constructed a modified version of the input. It could
	2780	* be that we already had a modified copy before we did this version.
	2781	* If so, that copy is no longer needed */
	2782	if (t != input_string) {
	2783	Safefree(t);
	2784	}
	2785	}
	2786
	2787	length_in_chars = (utf8)
	2788	? utf8_length((U8 ) s, (U8 ) s + len)
	2789	: len;
	2790
	2791	/* The first element in the output is the collation id, used by
	2792	* sv_collxfrm(); then comes the space for the transformed string. The
	2793	* equation should give us a good estimate as to how much is needed */
	2794	xAlloc = COLLXFRM_HDR_LEN
	2795	+ PL_collxfrm_base
	2796	+ (PL_collxfrm_mult * length_in_chars);
	2797	Newx(xbuf, xAlloc, char);
	2798	if (UNLIKELY(! xbuf)) {
	2799	DEBUG_L(PerlIO_printf(Perl_debug_log,
	2800	"_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
	2801	goto bad;
	2802	}
	2803
	2804	/* Store the collation id */
	2805	(U32)xbuf = PL_collation_ix;
	2806
	2807	/* Then the transformation of the input. We loop until successful, or we
	2808	* give up */
	2809	for (;;) {
	2810
	2811	*xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
	2812
	2813	/* If the transformed string occupies less space than we told strxfrm()
	2814	* was available, it means it successfully transformed the whole
	2815	* string. */
	2816	if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
	2817
	2818	/* Some systems include a trailing NUL in the returned length.
	2819	* Ignore it, using a loop in case multiple trailing NULs are
	2820	* returned. */
	2821	while ( (*xlen) > 0
	2822	&& (xbuf + COLLXFRM_HDR_LEN + (xlen) - 1) == '\0')
	2823	{
	2824	(*xlen)--;
	2825	}
	2826
	2827	/* If the first try didn't get it, it means our prediction was low.
	2828	* Modify the coefficients so that we predict a larger value in any
	2829	* future transformations */
	2830	if (! first_time) {
	2831	STRLEN needed = xlen + 1; / +1 For trailing NUL */
	2832	STRLEN computed_guess = PL_collxfrm_base
	2833	+ (PL_collxfrm_mult * length_in_chars);
	2834
	2835	/* On zero-length input, just keep current slope instead of
	2836	* dividing by 0 */
	2837	const STRLEN new_m = (length_in_chars != 0)
	2838	? needed / length_in_chars
	2839	: PL_collxfrm_mult;
	2840
	2841	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	2842	"%s: %d: initial size of %zu bytes for a length "
	2843	"%zu string was insufficient, %zu needed\n",
	2844	__FILE__, __LINE__,
	2845	computed_guess, length_in_chars, needed));
	2846
	2847	/* If slope increased, use it, but discard this result for
	2848	* length 1 strings, as we can't be sure that it's a real slope
	2849	* change */
	2850	if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
	2851
	2852	# ifdef DEBUGGING
	2853
	2854	STRLEN old_m = PL_collxfrm_mult;
	2855	STRLEN old_b = PL_collxfrm_base;
	2856
	2857	# endif
	2858
	2859	PL_collxfrm_mult = new_m;
	2860	PL_collxfrm_base = 1; /* +1 For trailing NUL */
	2861	computed_guess = PL_collxfrm_base
	2862	+ (PL_collxfrm_mult * length_in_chars);
	2863	if (computed_guess < needed) {
	2864	PL_collxfrm_base += needed - computed_guess;
	2865	}
	2866
	2867	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	2868	"%s: %d: slope is now %zu; was %zu, base "
	2869	"is now %zu; was %zu\n",
	2870	__FILE__, __LINE__,
	2871	PL_collxfrm_mult, old_m,
	2872	PL_collxfrm_base, old_b));
	2873	}
	2874	else { /* Slope didn't change, but 'b' did */
	2875	const STRLEN new_b = needed
	2876	- computed_guess
	2877	+ PL_collxfrm_base;
	2878	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	2879	"%s: %d: base is now %zu; was %zu\n",
	2880	__FILE__, __LINE__,
	2881	new_b, PL_collxfrm_base));
	2882	PL_collxfrm_base = new_b;
	2883	}
	2884	}
	2885
	2886	break;
	2887	}
	2888
	2889	if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
	2890	DEBUG_L(PerlIO_printf(Perl_debug_log,
	2891	"_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
	2892	*xlen, PERL_INT_MAX));
	2893	goto bad;
	2894	}
	2895
	2896	/* A well-behaved strxfrm() returns exactly how much space it needs
	2897	* (usually not including the trailing NUL) when it fails due to not
	2898	* enough space being provided. Assume that this is the case unless
	2899	* it's been proven otherwise */
	2900	if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
	2901	xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
	2902	}
	2903	else { /* Here, either:
	2904	* 1) The strxfrm() has previously shown bad behavior; or
	2905	* 2) It isn't the first time through the loop, which means
	2906	* that the strxfrm() is now showing bad behavior, because
	2907	* we gave it what it said was needed in the previous
	2908	* iteration, and it came back saying it needed still more.
	2909	* (Many versions of cygwin fit this. When the buffer size
	2910	* isn't sufficient, they return the input size instead of
	2911	* how much is needed.)
	2912	* Increase the buffer size by a fixed percentage and try again.
	2913	* */
	2914	xAlloc += (xAlloc / 4) + 1;
	2915	PL_strxfrm_is_behaved = FALSE;
	2916
	2917	# ifdef DEBUGGING
	2918
	2919	if (DEBUG_Lv_TEST \|\| debug_initialization) {
	2920	PerlIO_printf(Perl_debug_log,
	2921	"_mem_collxfrm required more space than previously calculated"
	2922	" for locale %s, trying again with new guess=%d+%zu\n",
	2923	PL_collation_name, (int) COLLXFRM_HDR_LEN,
	2924	xAlloc - COLLXFRM_HDR_LEN);
	2925	}
	2926
	2927	# endif
	2928
	2929	}
	2930
	2931	Renew(xbuf, xAlloc, char);
	2932	if (UNLIKELY(! xbuf)) {
	2933	DEBUG_L(PerlIO_printf(Perl_debug_log,
	2934	"_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
	2935	goto bad;
	2936	}
	2937
	2938	first_time = FALSE;
	2939	}
	2940
	2941
	2942	# ifdef DEBUGGING
	2943
	2944	if (DEBUG_Lv_TEST \|\| debug_initialization) {
	2945
	2946	print_collxfrm_input_and_return(s, s + len, xlen, utf8);
	2947	PerlIO_printf(Perl_debug_log, "Its xfrm is:");
	2948	PerlIO_printf(Perl_debug_log, "%s\n",
	2949	_byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
	2950	*xlen, 1));
	2951	}
	2952
	2953	# endif
	2954
	2955	/* Free up unneeded space; retain ehough for trailing NUL */
	2956	Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
	2957
	2958	if (s != input_string) {
	2959	Safefree(s);
	2960	}
	2961
	2962	return xbuf;
	2963
	2964	bad:
	2965	Safefree(xbuf);
	2966	if (s != input_string) {
	2967	Safefree(s);
	2968	}
	2969	*xlen = 0;
	2970
	2971	# ifdef DEBUGGING
	2972
	2973	if (DEBUG_Lv_TEST \|\| debug_initialization) {
	2974	print_collxfrm_input_and_return(s, s + len, NULL, utf8);
	2975	}
	2976
	2977	# endif
	2978
	2979	return NULL;
	2980	}
	2981
	2982	# ifdef DEBUGGING
	2983
	2984	STATIC void
	2985	S_print_collxfrm_input_and_return(pTHX_
	2986	const char * const s,
	2987	const char * const e,
	2988	const STRLEN * const xlen,
	2989	const bool is_utf8)
	2990	{
	2991
	2992	PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
	2993
	2994	PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
	2995	(UV)PL_collation_ix);
	2996	if (xlen) {
	2997	PerlIO_printf(Perl_debug_log, "%zu", *xlen);
	2998	}
	2999	else {
	3000	PerlIO_printf(Perl_debug_log, "NULL");
	3001	}
	3002	PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
	3003	PL_collation_name);
	3004	print_bytes_for_locale(s, e, is_utf8);
	3005
	3006	PerlIO_printf(Perl_debug_log, "'\n");
	3007	}
	3008
	3009	STATIC void
	3010	S_print_bytes_for_locale(pTHX_
	3011	const char * const s,
	3012	const char * const e,
	3013	const bool is_utf8)
	3014	{
	3015	const char * t = s;
	3016	bool prev_was_printable = TRUE;
	3017	bool first_time = TRUE;
	3018
	3019	PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
	3020
	3021	while (t < e) {
	3022	UV cp = (is_utf8)
	3023	? utf8_to_uvchr_buf((U8 *) t, e, NULL)
	3024	: * (U8 *) t;
	3025	if (isPRINT(cp)) {
	3026	if (! prev_was_printable) {
	3027	PerlIO_printf(Perl_debug_log, " ");
	3028	}
	3029	PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
	3030	prev_was_printable = TRUE;
	3031	}
	3032	else {
	3033	if (! first_time) {
	3034	PerlIO_printf(Perl_debug_log, " ");
	3035	}
	3036	PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
	3037	prev_was_printable = FALSE;
	3038	}
	3039	t += (is_utf8) ? UTF8SKIP(t) : 1;
	3040	first_time = FALSE;
	3041	}
	3042	}
	3043
	3044	# endif /* #ifdef DEBUGGING */
	3045	#endif /* USE_LOCALE_COLLATE */
	3046
	3047	#ifdef USE_LOCALE
	3048
	3049	STATIC const char *
	3050	S_switch_category_locale_to_template(pTHX_ const int switch_category, const int template_category, const char * template_locale)
	3051	{
	3052	/* Changes the locale for LC_'switch_category" to that of
	3053	* LC_'template_category', if they aren't already the same. If not NULL,
	3054	* 'template_locale' is the locale that 'template_category' is in.
	3055	*
	3056	* Returns the original locale for 'switch_category' so can be switched
	3057	* back to with the companion function restore_switched_locale(), (NULL if
	3058	* no restoral is necessary.) */
	3059
	3060	char * restore_to_locale = NULL;
	3061
	3062	if (switch_category == template_category) { /* No changes needed */
	3063	return NULL;
	3064	}
	3065
	3066	/* Find the original locale of the category we may need to change, so that
	3067	* it can be restored to later */
	3068	restore_to_locale = stdize_locale(savepv(do_setlocale_r(switch_category,
	3069	NULL)));
	3070	if (! restore_to_locale) {
	3071	Perl_croak(aTHX_
	3072	"panic: %s: %d: Could not find current %s locale, errno=%d\n",
	3073	__FILE__, __LINE__, category_name(switch_category), errno);
	3074	}
	3075
	3076	/* If the locale of the template category wasn't passed in, find it now */
	3077	if (template_locale == NULL) {
	3078	template_locale = do_setlocale_r(template_category, NULL);
	3079	if (! template_locale) {
	3080	Perl_croak(aTHX_
	3081	"panic: %s: %d: Could not find current %s locale, errno=%d\n",
	3082	__FILE__, __LINE__, category_name(template_category), errno);
	3083	}
	3084	}
	3085
	3086	/* It the locales are the same, there's nothing to do */
	3087	if (strEQ(restore_to_locale, template_locale)) {
	3088	Safefree(restore_to_locale);
	3089
	3090	DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n",
	3091	category_name(switch_category), restore_to_locale));
	3092
	3093	return NULL;
	3094	}
	3095
	3096	/* Finally, change the locale to the template one */
	3097	if (! do_setlocale_r(switch_category, template_locale)) {
	3098	Perl_croak(aTHX_
	3099	"panic: %s: %d: Could not change %s locale to %s, errno=%d\n",
	3100	__FILE__, __LINE__, category_name(switch_category),
	3101	template_locale, errno);
	3102	}
	3103
	3104	DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale switched to %s\n",
	3105	category_name(switch_category), template_locale));
	3106
	3107	return restore_to_locale;
	3108	}
	3109
	3110	STATIC void
	3111	S_restore_switched_locale(pTHX_ const int category, const char * const original_locale)
	3112	{
	3113	/* Restores the locale for LC_'category' to 'original_locale', or do
	3114	* nothing if the latter parameter is NULL */
	3115
	3116	if (original_locale == NULL) {
	3117	return;
	3118	}
	3119
	3120	if (! do_setlocale_r(category, original_locale)) {
	3121	Perl_croak(aTHX_
	3122	"panic: %s: %d: setlocale %s restore to %s failed, errno=%d\n",
	3123	__FILE__, __LINE__,
	3124	category_name(category), original_locale, errno);
	3125	}
	3126
	3127	Safefree(original_locale);
	3128	}
	3129
	3130	bool
	3131	Perl__is_cur_LC_category_utf8(pTHX_ int category)
	3132	{
	3133	/* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
	3134	* otherwise. 'category' may not be LC_ALL. If the platform doesn't have
	3135	* nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
	3136	* could give the wrong result. The result will very likely be correct for
	3137	* languages that have commonly used non-ASCII characters, but for notably
	3138	* English, it comes down to if the locale's name ends in something like
	3139	* "UTF-8". It errs on the side of not being a UTF-8 locale. */
	3140
	3141	/* Name of current locale corresponding to the input category */
	3142	const char *save_input_locale = NULL;
	3143
	3144	bool is_utf8 = FALSE; /* The return value */
	3145
	3146	/* The variables below are for the cache of previous lookups using this
	3147	* function. The cache is a C string, described at the definition for
	3148	* 'C_and_POSIX_utf8ness'.
	3149	*
	3150	* The first part of the cache is fixed, for the C and POSIX locales. The
	3151	* varying part starts just after them. */
	3152	char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness);
	3153
	3154	Size_t utf8ness_cache_size; /* Size of the varying portion */
	3155	Size_t input_name_len; /* Length in bytes of save_input_locale */
	3156	Size_t input_name_len_with_overhead; /* plus extra chars used to store
	3157	the name in the cache */
	3158	char * delimited; /* The name plus the delimiters used to store
	3159	it in the cache */
	3160	char * name_pos; /* position of 'delimited' in the cache, or 0
	3161	if not there */
	3162
	3163
	3164	# ifdef LC_ALL
	3165
	3166	assert(category != LC_ALL);
	3167
	3168	# endif
	3169
	3170	/* Get the desired category's locale */
	3171	save_input_locale = stdize_locale(savepv(do_setlocale_r(category, NULL)));
	3172	if (! save_input_locale) {
	3173	Perl_croak(aTHX_
	3174	"panic: %s: %d: Could not find current %s locale, errno=%d\n",
	3175	__FILE__, __LINE__, category_name(category), errno);
	3176	}
	3177
	3178	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3179	"Current locale for %s is %s\n",
	3180	category_name(category), save_input_locale));
	3181
	3182	input_name_len = strlen(save_input_locale);
	3183
	3184	/* In our cache, each name is accompanied by two delimiters and a single
	3185	* utf8ness digit */
	3186	input_name_len_with_overhead = input_name_len + 3;
	3187
	3188	/* Allocate and populate space for a copy of the name surrounded by the
	3189	* delimiters */
	3190	Newx(delimited, input_name_len_with_overhead, char);
	3191	delimited[0] = UTF8NESS_SEP[0];
	3192	Copy(save_input_locale, delimited + 1, input_name_len, char);
	3193	delimited[input_name_len+1] = UTF8NESS_PREFIX[0];
	3194	delimited[input_name_len+2] = '\0';
	3195
	3196	/* And see if that is in the cache */
	3197	name_pos = instr(PL_locale_utf8ness, delimited);
	3198	if (name_pos) {
	3199	is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0';
	3200
	3201	# ifdef DEBUGGING
	3202
	3203	if (DEBUG_Lv_TEST \|\| debug_initialization) {
	3204	PerlIO_printf(Perl_debug_log, "UTF8ness for locale %s=%d, \n",
	3205	save_input_locale, is_utf8);
	3206	}
	3207
	3208	# endif
	3209
	3210	/* And, if not already in that position, move it to the beginning of
	3211	* the non-constant portion of the list, since it is the most recently
	3212	* used. (We don't have to worry about overflow, since just moving
	3213	* existing names around) */
	3214	if (name_pos > utf8ness_cache) {
	3215	Move(utf8ness_cache,
	3216	utf8ness_cache + input_name_len_with_overhead,
	3217	name_pos - utf8ness_cache, char);
	3218	Copy(delimited,
	3219	utf8ness_cache,
	3220	input_name_len_with_overhead - 1, char);
	3221	utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
	3222	}
	3223
	3224	Safefree(delimited);
	3225	Safefree(save_input_locale);
	3226	return is_utf8;
	3227	}
	3228
	3229	/* Here we don't have stored the utf8ness for the input locale. We have to
	3230	* calculate it */
	3231
	3232	# if defined(USE_LOCALE_CTYPE) \
	3233	&& ( (defined(HAS_NL_LANGINFO) && defined(CODESET)) \
	3234	\|\| (defined(HAS_MBTOWC) \|\| defined(HAS_MBRTOWC)))
	3235
	3236	{
	3237	const char *original_ctype_locale
	3238	= switch_category_locale_to_template(LC_CTYPE,
	3239	category,
	3240	save_input_locale);
	3241
	3242	/* Here the current LC_CTYPE is set to the locale of the category whose
	3243	* information is desired. This means that nl_langinfo() and mbtowc()
	3244	* should give the correct results */
	3245
	3246	# ifdef MB_CUR_MAX /* But we can potentially rule out UTF-8ness, avoiding
	3247	calling the functions if we have this */
	3248
	3249	/* Standard UTF-8 needs at least 4 bytes to represent the maximum
	3250	* Unicode code point. */
	3251
	3252	DEBUG_L(PerlIO_printf(Perl_debug_log, "\tMB_CUR_MAX=%d\n",
	3253	(int) MB_CUR_MAX));
	3254	if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) {
	3255	is_utf8 = FALSE;
	3256	restore_switched_locale(LC_CTYPE, original_ctype_locale);
	3257	goto finish_and_return;
	3258	}
	3259
	3260	# endif
	3261	# if defined(HAS_NL_LANGINFO) && defined(CODESET)
	3262
	3263	{ /* The task is easiest if the platform has this POSIX 2001 function.
	3264	Except on some platforms it can wrongly return "", so have to have
	3265	a fallback. And it can return that it's UTF-8, even if there are
	3266	variances from that. For example, Turkish locales may use the
	3267	alternate dotted I rules, and sometimes it appears to be a
	3268	defective locale definition. XXX We should probably check for
	3269	these in the Latin1 range and warn */
	3270	const char *codeset = my_nl_langinfo(PERL_CODESET, FALSE);
	3271	/* FALSE => already in dest locale */
	3272
	3273	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3274	"\tnllanginfo returned CODESET '%s'\n", codeset));
	3275
	3276	if (codeset && strNE(codeset, "")) {
	3277
	3278	/* If the implementation of foldEQ() somehow were
	3279	* to change to not go byte-by-byte, this could
	3280	* read past end of string, as only one length is
	3281	* checked. But currently, a premature NUL will
	3282	* compare false, and it will stop there */
	3283	is_utf8 = cBOOL( foldEQ(codeset, STR_WITH_LEN("UTF-8"))
	3284	\|\| foldEQ(codeset, STR_WITH_LEN("UTF8")));
	3285
	3286	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3287	"\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
	3288	codeset, is_utf8));
	3289	restore_switched_locale(LC_CTYPE, original_ctype_locale);
	3290	goto finish_and_return;
	3291	}
	3292	}
	3293
	3294	# endif
	3295	# if defined(HAS_MBTOWC) \|\| defined(HAS_MBRTOWC)
	3296	/* We can see if this is a UTF-8-like locale if have mbtowc(). It was a
	3297	* late adder to C89, so very likely to have it. However, testing has
	3298	* shown that, like nl_langinfo() above, there are locales that are not
	3299	* strictly UTF-8 that this will return that they are */
	3300
	3301	{
	3302	wchar_t wc;
	3303	int len;
	3304
	3305	# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS)
	3306
	3307	mbstate_t ps;
	3308
	3309	# endif
	3310
	3311	/* mbrtowc() and mbtowc() convert a byte string to a wide
	3312	* character. Feed a byte string to one of them and check that the
	3313	* result is the expected Unicode code point */
	3314
	3315	# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS)
	3316	/* Prefer this function if available, as it's reentrant */
	3317
	3318	memset(&ps, 0, sizeof(ps));;
	3319	PERL_UNUSED_RESULT(mbrtowc(&wc, NULL, 0, &ps)); /* Reset any shift
	3320	state */
	3321	errno = 0;
	3322	len = mbrtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8), &ps);
	3323
	3324	# else
	3325
	3326	PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
	3327	errno = 0;
	3328	len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
	3329
	3330	# endif
	3331
	3332	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3333	"\treturn from mbtowc; len=%d; code_point=%x; errno=%d\n",
	3334	len, (unsigned int) wc, errno));
	3335
	3336	is_utf8 = cBOOL( len == STRLENs(REPLACEMENT_CHARACTER_UTF8)
	3337	&& wc == (wchar_t) UNICODE_REPLACEMENT);
	3338	}
	3339
	3340	restore_switched_locale(LC_CTYPE, original_ctype_locale);
	3341	goto finish_and_return;
	3342	}
	3343
	3344	# endif
	3345	# else
	3346
	3347	/* Here, we must have a C89 compiler that doesn't have mbtowc(). Next
	3348	* try looking at the currency symbol to see if it disambiguates
	3349	* things. Often that will be in the native script, and if the symbol
	3350	* isn't in UTF-8, we know that the locale isn't. If it is non-ASCII
	3351	* UTF-8, we infer that the locale is too, as the odds of a non-UTF8
	3352	* string being valid UTF-8 are quite small */
	3353
	3354	# ifdef USE_LOCALE_MONETARY
	3355
	3356	/* If have LC_MONETARY, we can look at the currency symbol. Often that
	3357	* will be in the native script. We do this one first because there is
	3358	* just one string to examine, so potentially avoids work */
	3359
	3360	{
	3361	const char *original_monetary_locale
	3362	= switch_category_locale_to_template(LC_MONETARY,
	3363	category,
	3364	save_input_locale);
	3365	bool only_ascii = FALSE;
	3366	const U8 * currency_string
	3367	= (const U8 *) my_nl_langinfo(PERL_CRNCYSTR, FALSE);
	3368	/* 2nd param not relevant for this item */
	3369	const U8 * first_variant;
	3370
	3371	assert( *currency_string == '-'
	3372	\|\| *currency_string == '+'
	3373	\|\| *currency_string == '.');
	3374
	3375	currency_string++;
	3376
	3377	if (is_utf8_invariant_string_loc(currency_string, 0, &first_variant))
	3378	{
	3379	DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
	3380	only_ascii = TRUE;
	3381	}
	3382	else {
	3383	is_utf8 = is_strict_utf8_string(first_variant, 0);
	3384	}
	3385
	3386	restore_switched_locale(LC_MONETARY, original_monetary_locale);
	3387
	3388	if (! only_ascii) {
	3389
	3390	/* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
	3391	* otherwise assume the locale is UTF-8 if and only if the symbol
	3392	* is non-ascii UTF-8. */
	3393	DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
	3394	save_input_locale, is_utf8));
	3395	goto finish_and_return;
	3396	}
	3397	}
	3398
	3399	# endif /* USE_LOCALE_MONETARY */
	3400	# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
	3401
	3402	/* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
	3403	* the names of the months and weekdays, timezone, and am/pm indicator */
	3404	{
	3405	const char *original_time_locale
	3406	= switch_category_locale_to_template(LC_TIME,
	3407	category,
	3408	save_input_locale);
	3409	int hour = 10;
	3410	bool is_dst = FALSE;
	3411	int dom = 1;
	3412	int month = 0;
	3413	int i;
	3414	char * formatted_time;
	3415
	3416	/* Here the current LC_TIME is set to the locale of the category
	3417	* whose information is desired. Look at all the days of the week and
	3418	* month names, and the timezone and am/pm indicator for UTF-8 variant
	3419	* characters. The first such a one found will tell us if the locale
	3420	* is UTF-8 or not */
	3421
	3422	for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
	3423	formatted_time = my_strftime("%A %B %Z %p",
	3424	0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
	3425	if ( ! formatted_time
	3426	\|\| is_utf8_invariant_string((U8 *) formatted_time, 0))
	3427	{
	3428
	3429	/* Here, we didn't find a non-ASCII. Try the next time through
	3430	* with the complemented dst and am/pm, and try with the next
	3431	* weekday. After we have gotten all weekdays, try the next
	3432	* month */
	3433	is_dst = ! is_dst;
	3434	hour = (hour + 12) % 24;
	3435	dom++;
	3436	if (i > 6) {
	3437	month++;
	3438	}
	3439	continue;
	3440	}
	3441
	3442	/* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
	3443	* false otherwise. But first, restore LC_TIME to its original
	3444	* locale if we changed it */
	3445	restore_switched_locale(LC_TIME, original_time_locale);
	3446
	3447	DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
	3448	save_input_locale,
	3449	is_utf8_string((U8 *) formatted_time, 0)));
	3450	is_utf8 = is_utf8_string((U8 *) formatted_time, 0);
	3451	goto finish_and_return;
	3452	}
	3453
	3454	/* Falling off the end of the loop indicates all the names were just
	3455	* ASCII. Go on to the next test. If we changed it, restore LC_TIME
	3456	* to its original locale */
	3457	restore_switched_locale(LC_TIME, original_time_locale);
	3458	DEBUG_L(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
	3459	}
	3460
	3461	# endif
	3462
	3463	# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
	3464
	3465	/* This code is ifdefd out because it was found to not be necessary in testing
	3466	* on our dromedary test machine, which has over 700 locales. There, this
	3467	* added no value to looking at the currency symbol and the time strings. I
	3468	* left it in so as to avoid rewriting it if real-world experience indicates
	3469	* that dromedary is an outlier. Essentially, instead of returning abpve if we
	3470	* haven't found illegal utf8, we continue on and examine all the strerror()
	3471	* messages on the platform for utf8ness. If all are ASCII, we still don't
	3472	* know the answer; but otherwise we have a pretty good indication of the
	3473	* utf8ness. The reason this doesn't help much is that the messages may not
	3474	* have been translated into the locale. The currency symbol and time strings
	3475	* are much more likely to have been translated. */
	3476	{
	3477	int e;
	3478	bool non_ascii = FALSE;
	3479	const char *original_messages_locale
	3480	= switch_category_locale_to_template(LC_MESSAGES,
	3481	category,
	3482	save_input_locale);
	3483	const char * errmsg = NULL;
	3484
	3485	/* Here the current LC_MESSAGES is set to the locale of the category
	3486	* whose information is desired. Look through all the messages. We
	3487	* can't use Strerror() here because it may expand to code that
	3488	* segfaults in miniperl */
	3489
	3490	for (e = 0; e <= sys_nerr; e++) {
	3491	errno = 0;
	3492	errmsg = sys_errlist[e];
	3493	if (errno \|\| !errmsg) {
	3494	break;
	3495	}
	3496	errmsg = savepv(errmsg);
	3497	if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
	3498	non_ascii = TRUE;
	3499	is_utf8 = is_utf8_string((U8 *) errmsg, 0);
	3500	break;
	3501	}
	3502	}
	3503	Safefree(errmsg);
	3504
	3505	restore_switched_locale(LC_MESSAGES, original_messages_locale);
	3506
	3507	if (non_ascii) {
	3508
	3509	/* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
	3510	* any non-ascii means it is one; otherwise we assume it isn't */
	3511	DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
	3512	save_input_locale,
	3513	is_utf8));
	3514	goto finish_and_return;
	3515	}
	3516
	3517	DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
	3518	}
	3519
	3520	# endif
	3521	# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
	3522	UTF-8 locale */
	3523
	3524	/* As a last resort, look at the locale name to see if it matches
	3525	* qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
	3526	* return of setlocale(), is actually defined to be opaque, so we can't
	3527	* really rely on the absence of various substrings in the name to indicate
	3528	* its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
	3529	* be a UTF-8 locale. Similarly for the other common names */
	3530
	3531	{
	3532	const Size_t final_pos = strlen(save_input_locale) - 1;
	3533
	3534	if (final_pos >= 3) {
	3535	const char *name = save_input_locale;
	3536
	3537	/* Find next 'U' or 'u' and look from there */
	3538	while ((name += strcspn(name, "Uu") + 1)
	3539	<= save_input_locale + final_pos - 2)
	3540	{
	3541	if ( isALPHA_FOLD_NE(*name, 't')
	3542	\|\| isALPHA_FOLD_NE(*(name + 1), 'f'))
	3543	{
	3544	continue;
	3545	}
	3546	name += 2;
	3547	if (*(name) == '-') {
	3548	if ((name > save_input_locale + final_pos - 1)) {
	3549	break;
	3550	}
	3551	name++;
	3552	}
	3553	if (*(name) == '8') {
	3554	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3555	"Locale %s ends with UTF-8 in name\n",
	3556	save_input_locale));
	3557	is_utf8 = TRUE;
	3558	goto finish_and_return;
	3559	}
	3560	}
	3561	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3562	"Locale %s doesn't end with UTF-8 in name\n",
	3563	save_input_locale));
	3564	}
	3565
	3566	# ifdef WIN32
	3567
	3568	/* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
	3569	if (memENDs(save_input_locale, final_pos, "65001")) {
	3570	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3571	"Locale %s ends with 65001 in name, is UTF-8 locale\n",
	3572	save_input_locale));
	3573	is_utf8 = TRUE;
	3574	goto finish_and_return;
	3575	}
	3576	}
	3577
	3578	# endif
	3579	# endif
	3580
	3581	/* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
	3582	* since we are about to return FALSE anyway, there is no point in doing
	3583	* this extra work */
	3584
	3585	# if 0
	3586	if (instr(save_input_locale, "8859")) {
	3587	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3588	"Locale %s has 8859 in name, not UTF-8 locale\n",
	3589	save_input_locale));
	3590	is_utf8 = FALSE;
	3591	goto finish_and_return;
	3592	}
	3593	# endif
	3594
	3595	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3596	"Assuming locale %s is not a UTF-8 locale\n",
	3597	save_input_locale));
	3598	is_utf8 = FALSE;
	3599
	3600	# endif /* the code that is compiled when no modern LC_CTYPE */
	3601
	3602	finish_and_return:
	3603
	3604	/* Cache this result so we don't have to go through all this next time. */
	3605	utf8ness_cache_size = sizeof(PL_locale_utf8ness)
	3606	- (utf8ness_cache - PL_locale_utf8ness);
	3607
	3608	/* But we can't save it if it is too large for the total space available */
	3609	if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) {
	3610	Size_t utf8ness_cache_len = strlen(utf8ness_cache);
	3611
	3612	/* Here it can fit, but we may need to clear out the oldest cached
	3613	* result(s) to do so. Check */
	3614	if (utf8ness_cache_len + input_name_len_with_overhead
	3615	>= utf8ness_cache_size)
	3616	{
	3617	/* Here we have to clear something out to make room for this.
	3618	* Start looking at the rightmost place where it could fit and find
	3619	* the beginning of the entry that extends past that. */
	3620	char * cutoff = (char *) my_memrchr(utf8ness_cache,
	3621	UTF8NESS_SEP[0],
	3622	utf8ness_cache_size
	3623	- input_name_len_with_overhead);
	3624
	3625	assert(cutoff);
	3626	assert(cutoff >= utf8ness_cache);
	3627
	3628	/* This and all subsequent entries must be removed */
	3629	*cutoff = '\0';
	3630	utf8ness_cache_len = strlen(utf8ness_cache);
	3631	}
	3632
	3633	/* Make space for the new entry */
	3634	Move(utf8ness_cache,
	3635	utf8ness_cache + input_name_len_with_overhead,
	3636	utf8ness_cache_len + 1 /* Incl. trailing NUL */, char);
	3637
	3638	/* And insert it */
	3639	Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char);
	3640	utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
	3641
	3642	if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1]
	3643	& (PERL_UINTMAX_T) ~1) != '0')
	3644	{
	3645	Perl_croak(aTHX_
	3646	"panic: %s: %d: Corrupt utf8ness_cache=%s\nlen=%zu,"
	3647	" inserted_name=%s, its_len=%zu\n",
	3648	__FILE__, __LINE__,
	3649	PL_locale_utf8ness, strlen(PL_locale_utf8ness),
	3650	delimited, input_name_len_with_overhead);
	3651	}
	3652	}
	3653
	3654	# ifdef DEBUGGING
	3655
	3656	if (DEBUG_Lv_TEST \|\| debug_initialization) {
	3657	PerlIO_printf(Perl_debug_log,
	3658	"PL_locale_utf8ness is now %s; returning %d\n",
	3659	PL_locale_utf8ness, is_utf8);
	3660	}
	3661
	3662	# endif
	3663
	3664	Safefree(delimited);
	3665	Safefree(save_input_locale);
	3666	return is_utf8;
	3667	}
	3668
	3669	#endif
	3670
	3671	bool
	3672	Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
	3673	{
	3674	dVAR;
	3675	/* Internal function which returns if we are in the scope of a pragma that
	3676	* enables the locale category 'category'. 'compiling' should indicate if
	3677	* this is during the compilation phase (TRUE) or not (FALSE). */
	3678
	3679	const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
	3680
	3681	SV *categories = cop_hints_fetch_pvs(cop, "locale", 0);
	3682	if (! categories \|\| categories == &PL_sv_placeholder) {
	3683	return FALSE;
	3684	}
	3685
	3686	/* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
	3687	* a valid unsigned */
	3688	assert(category >= -1);
	3689	return cBOOL(SvUV(categories) & (1U << (category + 1)));
	3690	}
	3691
	3692	char *
	3693	Perl_my_strerror(pTHX_ const int errnum)
	3694	{
	3695	/* Returns a mortalized copy of the text of the error message associated
	3696	* with 'errnum'. It uses the current locale's text unless the platform
	3697	* doesn't have the LC_MESSAGES category or we are not being called from
	3698	* within the scope of 'use locale'. In the former case, it uses whatever
	3699	* strerror returns; in the latter case it uses the text from the C locale.
	3700	*
	3701	* The function just calls strerror(), but temporarily switches, if needed,
	3702	* to the C locale */
	3703
	3704	char *errstr;
	3705	dVAR;
	3706
	3707	#ifndef USE_LOCALE_MESSAGES
	3708
	3709	/* If platform doesn't have messages category, we don't do any switching to
	3710	* the C locale; we just use whatever strerror() returns */
	3711
	3712	errstr = savepv(Strerror(errnum));
	3713
	3714	#else /* Has locale messages */
	3715
	3716	const bool within_locale_scope = IN_LC(LC_MESSAGES);
	3717
	3718	# if defined(HAS_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
	3719
	3720	/* This function is trivial if we don't have to worry about thread safety
	3721	* and have strerror_l(), as it handles the switch of locales so we don't
	3722	* have to deal with that. We don't have to worry about thread safety if
	3723	* this is an unthreaded build, or if strerror_r() is also available. Both
	3724	* it and strerror_l() are thread-safe. Plain strerror() isn't thread
	3725	* safe. But on threaded builds when strerror_r() is available, the
	3726	* apparent call to strerror() below is actually a macro that
	3727	* behind-the-scenes calls strerror_r().
	3728	*/
	3729
	3730	# if ! defined(USE_ITHREADS) \|\| defined(HAS_STRERROR_R)
	3731
	3732	if (within_locale_scope) {
	3733	errstr = savepv(strerror(errnum));
	3734	}
	3735	else {
	3736	errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
	3737	}
	3738
	3739	# else
	3740
	3741	/* Here we have strerror_l(), but not strerror_r() and we are on a
	3742	* threaded-build. We use strerror_l() for everything, constructing a
	3743	* locale to pass to it if necessary */
	3744
	3745	bool do_free = FALSE;
	3746	locale_t locale_to_use;
	3747
	3748	if (within_locale_scope) {
	3749	locale_to_use = uselocale((locale_t) 0);
	3750	if (locale_to_use == LC_GLOBAL_LOCALE) {
	3751	locale_to_use = duplocale(LC_GLOBAL_LOCALE);
	3752	do_free = TRUE;
	3753	}
	3754	}
	3755	else { /* Use C locale if not within 'use locale' scope */
	3756	locale_to_use = PL_C_locale_obj;
	3757	}
	3758
	3759	errstr = savepv(strerror_l(errnum, locale_to_use));
	3760
	3761	if (do_free) {
	3762	freelocale(locale_to_use);
	3763	}
	3764
	3765	# endif
	3766	# else /* Doesn't have strerror_l() */
	3767
	3768	# ifdef USE_POSIX_2008_LOCALE
	3769
	3770	locale_t save_locale = NULL;
	3771
	3772	# else
	3773
	3774	const char * save_locale = NULL;
	3775	bool locale_is_C = FALSE;
	3776
	3777	/* We have a critical section to prevent another thread from changing the
	3778	* locale out from under us (or zapping the buffer returned from
	3779	* setlocale() ) */
	3780	LOCALE_LOCK;
	3781
	3782	# endif
	3783
	3784	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	3785	"my_strerror called with errnum %d\n", errnum));
	3786	if (! within_locale_scope) {
	3787	errno = 0;
	3788
	3789	# ifdef USE_POSIX_2008_LOCALE /* Use the thread-safe locale functions */
	3790
	3791	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	3792	"Not within locale scope, about to call"
	3793	" uselocale(0x%p)\n", PL_C_locale_obj));
	3794	save_locale = uselocale(PL_C_locale_obj);
	3795	if (! save_locale) {
	3796	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3797	"uselocale failed, errno=%d\n", errno));
	3798	}
	3799	else {
	3800	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	3801	"uselocale returned 0x%p\n", save_locale));
	3802	}
	3803
	3804	# else /* Not thread-safe build */
	3805
	3806	save_locale = do_setlocale_c(LC_MESSAGES, NULL);
	3807	if (! save_locale) {
	3808	Perl_croak(aTHX_
	3809	"panic: %s: %d: Could not find current LC_MESSAGES locale,"
	3810	" errno=%d\n", __FILE__, __LINE__, errno);
	3811	}
	3812	else {
	3813	locale_is_C = isNAME_C_OR_POSIX(save_locale);
	3814
	3815	/* Switch to the C locale if not already in it */
	3816	if (! locale_is_C) {
	3817
	3818	/* The setlocale() just below likely will zap 'save_locale', so
	3819	* create a copy. */
	3820	save_locale = savepv(save_locale);
	3821	do_setlocale_c(LC_MESSAGES, "C");
	3822	}
	3823	}
	3824
	3825	# endif
	3826
	3827	} /* end of ! within_locale_scope */
	3828	else {
	3829	DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n",
	3830	__FILE__, __LINE__));
	3831	}
	3832
	3833	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	3834	"Any locale change has been done; about to call Strerror\n"));
	3835	errstr = savepv(Strerror(errnum));
	3836
	3837	if (! within_locale_scope) {
	3838	errno = 0;
	3839
	3840	# ifdef USE_POSIX_2008_LOCALE
	3841
	3842	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	3843	"%s: %d: not within locale scope, restoring the locale\n",
	3844	__FILE__, __LINE__));
	3845	if (save_locale && ! uselocale(save_locale)) {
	3846	DEBUG_L(PerlIO_printf(Perl_debug_log,
	3847	"uselocale restore failed, errno=%d\n", errno));
	3848	}
	3849	}
	3850
	3851	# else
	3852
	3853	if (save_locale && ! locale_is_C) {
	3854	if (! do_setlocale_c(LC_MESSAGES, save_locale)) {
	3855	Perl_croak(aTHX_
	3856	"panic: %s: %d: setlocale restore failed, errno=%d\n",
	3857	__FILE__, __LINE__, errno);
	3858	}
	3859	Safefree(save_locale);
	3860	}
	3861	}
	3862
	3863	LOCALE_UNLOCK;
	3864
	3865	# endif
	3866	# endif /* End of doesn't have strerror_l */
	3867	#endif /* End of does have locale messages */
	3868
	3869	#ifdef DEBUGGING
	3870
	3871	if (DEBUG_Lv_TEST) {
	3872	PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '");
	3873	print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);
	3874	PerlIO_printf(Perl_debug_log, "'\n");
	3875	}
	3876
	3877	#endif
	3878
	3879	SAVEFREEPV(errstr);
	3880	return errstr;
	3881	}
	3882
	3883	/*
	3884
	3885	=for apidoc sync_locale
	3886
	3887	Changing the program's locale should be avoided by XS code. Nevertheless,
	3888	certain non-Perl libraries called from XS, such as C<Gtk> do so. When this
	3889	happens, Perl needs to be told that the locale has changed. Use this function
	3890	to do so, before returning to Perl.
	3891
	3892	=cut
	3893	*/
	3894
	3895	void
	3896	Perl_sync_locale(pTHX)
	3897	{
	3898	char * newlocale;
	3899
	3900	#ifdef USE_LOCALE_CTYPE
	3901
	3902	newlocale = do_setlocale_c(LC_CTYPE, NULL);
	3903	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	3904	"%s:%d: %s\n", __FILE__, __LINE__,
	3905	setlocale_debug_string(LC_CTYPE, NULL, newlocale)));
	3906	new_ctype(newlocale);
	3907
	3908	#endif /* USE_LOCALE_CTYPE */
	3909	#ifdef USE_LOCALE_COLLATE
	3910
	3911	newlocale = do_setlocale_c(LC_COLLATE, NULL);
	3912	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	3913	"%s:%d: %s\n", __FILE__, __LINE__,
	3914	setlocale_debug_string(LC_COLLATE, NULL, newlocale)));
	3915	new_collate(newlocale);
	3916
	3917	#endif
	3918	#ifdef USE_LOCALE_NUMERIC
	3919
	3920	newlocale = do_setlocale_c(LC_NUMERIC, NULL);
	3921	DEBUG_Lv(PerlIO_printf(Perl_debug_log,
	3922	"%s:%d: %s\n", __FILE__, __LINE__,
	3923	setlocale_debug_string(LC_NUMERIC, NULL, newlocale)));
	3924	new_numeric(newlocale);
	3925
	3926	#endif /* USE_LOCALE_NUMERIC */
	3927
	3928	}
	3929
	3930	#if defined(DEBUGGING) && defined(USE_LOCALE)
	3931
	3932	STATIC char *
	3933	S_setlocale_debug_string(const int category, /* category number,
	3934	like LC_ALL */
	3935	const char* const locale, /* locale name */
	3936
	3937	/* return value from setlocale() when attempting to
	3938	* set 'category' to 'locale' */
	3939	const char* const retval)
	3940	{
	3941	/* Returns a pointer to a NUL-terminated string in static storage with
	3942	* added text about the info passed in. This is not thread safe and will
	3943	* be overwritten by the next call, so this should be used just to
	3944	* formulate a string to immediately print or savepv() on. */
	3945
	3946	/* initialise to a non-null value to keep it out of BSS and so keep
	3947	* -DPERL_GLOBAL_STRUCT_PRIVATE happy */
	3948	static char ret[128] = "If you can read this, thank your buggy C"
	3949	" library strlcpy(), and change your hints file"
	3950	" to undef it";
	3951
	3952	my_strlcpy(ret, "setlocale(", sizeof(ret));
	3953	my_strlcat(ret, category_name(category), sizeof(ret));
	3954	my_strlcat(ret, ", ", sizeof(ret));
	3955
	3956	if (locale) {
	3957	my_strlcat(ret, "\"", sizeof(ret));
	3958	my_strlcat(ret, locale, sizeof(ret));
	3959	my_strlcat(ret, "\"", sizeof(ret));
	3960	}
	3961	else {
	3962	my_strlcat(ret, "NULL", sizeof(ret));
	3963	}
	3964
	3965	my_strlcat(ret, ") returned ", sizeof(ret));
	3966
	3967	if (retval) {
	3968	my_strlcat(ret, "\"", sizeof(ret));
	3969	my_strlcat(ret, retval, sizeof(ret));
	3970	my_strlcat(ret, "\"", sizeof(ret));
	3971	}
	3972	else {
	3973	my_strlcat(ret, "NULL", sizeof(ret));
	3974	}
	3975
	3976	assert(strlen(ret) < sizeof(ret));
	3977
	3978	return ret;
	3979	}
	3980
	3981	#endif
	3982
	3983
	3984	/*
	3985	* ex: set ts=8 sts=4 sw=4 et:
	3986	*/