Commit
8c6180a91de91a1194f427fc639694f43a903a78 added a warning message
for when Perl determines that the program's underlying locale just
switched into is poorly supported. At the time it was thought that this
would be an extremely rare occurrence. However, a bug in HP-UX -
B.11.00/64 causes this message to be raised for the "C" locale. A
workaround was done that silenced those. However, before it got fixed,
this message would occur gobs of times executing the test suite. It was
raised even if the script is not locale-aware, so that the underlying
locale was completely irrelevant. There is a good prospect that someone
using an older Asian locale as their default would get this message
inappropriately, even if they don't use locales, or switch to a
supported one before using them.
This commit causes the message to be raised only if it actually is
relevant. When not in the scope of 'use locale', the message is stored,
not raised. Upon the first locale-dependent operation within a bad
locale, the saved message is raised, and the storage cleared. I was
able to do this without adding extra branching to the main-line
non-locale execution code. This was done by adding regnodes which get
jumped to by switch statements, and refactoring some existing C tests so
they exclude non-locale right off the bat.
These changes would have been necessary for another locale warning that
I previously agreed to implement, and which is coming a few commits from
now.
I do not know of any way to add tests in the test suite for this. It is
in fact rare for modern locales to have these issues. The way I tested
this was to temporarily change the C code so that all locales are viewed
as defective, and manually note that the warnings came out where
expected, and only where expected.
I chose not to try to output this warning on any POSIX functions called.
I believe that all that are affected are deprecated or scheduled to be
deprecated anyway. And POSIX is closer to the hardware of the machine.
For convenience, I also don't output the message for some zero-length
pattern matches. If something is going to be matched, the message will
likely very soon be raised anyway.
#define PL_utf8_xidstart (vTHX->Iutf8_xidstart)
#define PL_utf8cache (vTHX->Iutf8cache)
#define PL_utf8locale (vTHX->Iutf8locale)
+#define PL_warn_locale (vTHX->Iwarn_locale)
#define PL_warnhook (vTHX->Iwarnhook)
#define PL_watchaddr (vTHX->Iwatchaddr)
#define PL_watchok (vTHX->Iwatchok)
PERLVAR(I, utf8locale, bool) /* utf8 locale detected */
PERLVAR(I, in_utf8_CTYPE_locale, bool)
+PERLVAR(I, warn_locale, SV *)
PERLVARA(I, colors,6, char *) /* values from PERL_RE_COLORS env var */
to start */
unsigned int bad_count = 0; /* Count of bad characters */
+ SvREFCNT_dec(PL_warn_locale); /* We are about to overwrite this */
+
for (i = 0; i < 256; i++) {
if (isUPPER_LC((U8) i))
PL_fold_locale[i] = (U8) toLOWER_LC((U8) i);
#endif
if (bad_count || multi_byte_locale) {
-
- /* We have to save 'newctype' because the setlocale() just below
- * may destroy it. The next setlocale() further down should
- * restore it properly so that the intermediate change here is
- * transparent to this function's caller */
- const char * const badlocale = savepv(newctype);
-
- setlocale(LC_CTYPE, "C");
- Perl_warner(aTHX_ packWARN(WARN_LOCALE),
+ PL_warn_locale = Perl_newSVpvf(aTHX_
"Locale '%s' may not work well.%s%s%s\n",
- badlocale,
+ newctype,
(multi_byte_locale)
? " Some characters in it are not recognized by"
" Perl."
? bad_chars_list
: ""
);
- setlocale(LC_CTYPE, badlocale);
+ /* If we are actually in the scope of the locale, output the
+ * message now. Otherwise we save it to be output at the first
+ * operation using this locale, if that actually happens. Most
+ * programs don't use locales, so they are immune to bad ones */
+ if (IN_LC(LC_CTYPE)) {
+
+ /* We have to save 'newctype' because the setlocale() just
+ * below may destroy it. The next setlocale() further down
+ * should restore it properly so that the intermediate change
+ * here is transparent to this function's caller */
+ const char * const badlocale = savepv(newctype);
+
+ setlocale(LC_CTYPE, "C");
+
+ /* The '0' below suppresses a bogus gcc compiler warning */
+ Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
+ setlocale(LC_CTYPE, badlocale);
+ SvREFCNT_dec_NN(PL_warn_locale);
+ PL_warn_locale = NULL;
+ }
}
}
SvREFCNT_dec(PL_Latin1);
SvREFCNT_dec(PL_NonL1NonFinalFold);
SvREFCNT_dec(PL_HasMultiCharFold);
+ SvREFCNT_dec(PL_warn_locale);
PL_utf8_mark = NULL;
PL_utf8_toupper = NULL;
PL_utf8_totitle = NULL;
PL_AboveLatin1 = NULL;
PL_InBitmap = NULL;
PL_HasMultiCharFold = NULL;
+ PL_warn_locale = NULL;
PL_Latin1 = NULL;
PL_NonL1NonFinalFold = NULL;
PL_UpperLatin1 = NULL;
# define IN_LC(category) \
(IN_LC_COMPILETIME(category) || IN_LC_RUNTIME(category))
+# if defined (PERL_CORE) || defined (PERL_IN_XSUB_RE)
+
+ /* This internal macro should be called from places that operate under
+ * locale rules. It there is a problem with the current locale that
+ * hasn't been raised yet, it will output a warning this time */
+# define _CHECK_AND_WARN_PROBLEMATIC_LOCALE \
+ STMT_START { \
+ if (PL_warn_locale) { \
+ /*GCC_DIAG_IGNORE(-Wformat-security); Didn't work */ \
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
+ SvPVX(PL_warn_locale), \
+ 0 /* dummy to avoid comp warning */ ); \
+ /* GCC_DIAG_RESTORE; */ \
+ SvREFCNT_dec_NN(PL_warn_locale); \
+ PL_warn_locale = NULL; \
+ } \
+ } STMT_END
+
+
+# endif /* PERL_CORE or PERL_IN_XSUB_RE */
+
#else /* No locale usage */
# define IN_LOCALE_RUNTIME 0
# define IN_SOME_LOCALE_FORM_RUNTIME 0
# define IN_LC_COMPILETIME(category) 0
# define IN_LC_RUNTIME(category) 0
# define IN_LC(category) 0
+
+# define _CHECK_AND_WARN_PROBLEMATIC_LOCALE
#endif
#ifdef USE_LOCALE_NUMERIC
XXX Describe change here
+The message
+L<Locale '%s' may not work well.%s|perldiag/"Locale '%s' may not work well.%s">
+is no longer raised unless the problemtatic locale is actually used in
+the Perl program. Previously it was raised if it merely was the
+underlying locale. All Perl programs have an underlying locale at all
+times, but something like a C<S<use locale>> is needed for that locale
+to actually have some effect. This message will not be raised when
+the underlying locale is hidden.
+
=back
=head1 Utility Changes
=item Locale '%s' may not work well.%s
-(W locale) The named locale that Perl is now trying to use is not fully
-compatible with Perl. The second C<%s> gives a reason.
+(W locale) You are using the named locale, which is a non-UTF-8 one, and
+which Perl has determined is not fully compatible with Perl. The second
+C<%s> gives a reason.
By far the most common reason is that the locale has characters in it
that are represented by more than one byte. The only such locales that
Perl can handle are the UTF-8 locales. Most likely the specified locale
is a non-UTF-8 one for an East Asian language such as Chinese or
Japanese. If the locale is a superset of ASCII, the ASCII portion of it
-may work in Perl. Read on for problems when it isn't a superset of
-ASCII.
+may work in Perl.
Some essentially obsolete locales that aren't supersets of ASCII, mainly
those in ISO 646 or other 7-bit locales, such as ASMO 449, can also have
changed by the locale and are also used by the program.
The warning message lists the determinable conflicting characters.
+Note that not all incompatibilities are found.
+
+If this happens to you, there's not much you can do except switch to use a
+different locale or use L<Encode> to translate from the locale into
+UTF-8; if that's impracticable, you have been warned that some things
+may break.
+
+This message is output once each time a bad locale is switched into
+within the scope of C<S<use locale>>, or on the first possibly-affected
+operation if the C<S<use locale>> inherits a bad one. It is not raised
+for any operations from the L<POSIX> module.
+
=item localtime(%f) failed
(W overflow) You called C<localtime> with a number that it could not handle:
if (op_type == OP_LCFIRST) {
/* lower case the first letter: no trickiness for any character */
- *tmpbuf =
#ifdef USE_LOCALE_CTYPE
- (IN_LC_RUNTIME(LC_CTYPE))
- ? toLOWER_LC(*s)
- :
+ if (IN_LC_RUNTIME(LC_CTYPE)) {
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ *tmpbuf = toLOWER_LC(*s);
+ }
+ else
#endif
- (IN_UNI_8_BIT)
- ? toLOWER_LATIN1(*s)
- : toLOWER(*s);
+ {
+ *tmpbuf = (IN_UNI_8_BIT)
+ ? toLOWER_LATIN1(*s)
+ : toLOWER(*s);
+ }
}
- /* is ucfirst() */
#ifdef USE_LOCALE_CTYPE
+ /* is ucfirst() */
else if (IN_LC_RUNTIME(LC_CTYPE)) {
if (IN_UTF8_CTYPE_LOCALE) {
goto do_uni_rules;
}
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
*tmpbuf = (U8) toUPPER_LC(*s); /* This would be a bug if any
locales have upper and title case
different */
if (IN_UTF8_CTYPE_LOCALE) {
goto do_uni_rules;
}
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
for (; s < send; d++, s++)
*d = (U8) toUPPER_LC(*s);
}
* whole thing in a tight loop, for speed, */
#ifdef USE_LOCALE_CTYPE
if (IN_LC_RUNTIME(LC_CTYPE)) {
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
for (; s < send; d++, s++)
*d = toLOWER_LC(*s);
}
if (IN_UTF8_CTYPE_LOCALE) {
goto do_uni_folding;
}
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
for (; s < send; d++, s++)
*d = (U8) toFOLD_LC(*s);
}
U8 flags = FOLD_FLAGS_FULL; \
switch (trie_type) { \
case trie_flu8: \
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
goto do_trie_utf8_fold; \
case trie_utf8_exactfa_fold: \
flags |= FOLD_FLAGS_NOMIX_ASCII; \
} \
break; \
case trie_utf8l: \
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
+ /* FALLTHROUGH */ \
case trie_utf8: \
uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
break; \
/* We know what class it must start with. */
switch (OP(c)) {
case ANYOFL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ /* FALLTHROUGH */
case ANYOF:
if (utf8_target) {
REXEC_FBC_UTF8_CLASS_SCAN(
goto do_exactf_non_utf8;
case EXACTFL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
utf8_fold_flags = FOLDEQ_LOCALE;
goto do_exactf_utf8;
}
case BOUNDL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
break;
case NBOUNDL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
break;
case BOUND:
/* FALLTHROUGH */
case POSIXL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
break;
HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
U32 state = trie->startstate;
+ if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ }
if ( trie->bitmap
&& (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
{
#undef ST
case EXACTL: /* /abc/l */
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ /* FALLTHROUGH */
case EXACT: { /* /abc/ */
char *s = STRING(scan);
ln = STR_LEN(scan);
const char * s;
U32 fold_utf8_flags;
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
folder = foldEQ_locale;
fold_array = PL_fold_locale;
fold_utf8_flags = FOLDEQ_LOCALE;
* have to set the FLAGS fields of these */
case BOUNDL: /* /\b/l */
case NBOUNDL: /* /\B/l */
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ /* FALLTHROUGH */
case BOUND: /* /\b/ */
case BOUNDU: /* /\b/u */
case BOUNDA: /* /\b/a */
break;
case ANYOFL: /* /[abc]/l */
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ /* FALLTHROUGH */
case ANYOF: /* /[abc]/ */
if (NEXTCHR_IS_EOS)
sayNO;
/* FALLTHROUGH */
case POSIXL: /* \w or [:punct:] etc. under /l */
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (NEXTCHR_IS_EOS)
sayNO;
const U8 *fold_array;
UV utf8_fold_flags;
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
folder = foldEQ_locale;
fold_array = PL_fold_locale;
type = REFFL;
goto do_nref_ref_common;
case REFFL: /* /\1/il */
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
folder = foldEQ_locale;
fold_array = PL_fold_locale;
utf8_fold_flags = FOLDEQ_LOCALE;
}
break;
case EXACTL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ /* FALLTHROUGH */
case EXACT:
assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
goto do_exactf;
case EXACTFL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
utf8_flags = FOLDEQ_LOCALE;
goto do_exactf;
break;
}
case ANYOFL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ /* FALLTHROUGH */
case ANYOF:
if (utf8_target) {
while (hardcount < max
/* FALLTHROUGH */
case POSIXL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (! utf8_target) {
while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
*scan)))
}
break;
+ case BOUNDL:
+ case NBOUNDL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ /* FALLTHROUGH */
case BOUND:
case BOUNDA:
- case BOUNDL:
case BOUNDU:
case EOS:
case GPOS:
case KEEPS:
case NBOUND:
case NBOUNDA:
- case NBOUNDL:
case NBOUNDU:
case OPFAIL:
case SBOL:
/* Unicode features (see perlrun/-C) */
PL_unicode = proto_perl->Iunicode;
+ /* Should we warn if uses locale? */
+ PL_warn_locale = proto_perl->Iwarn_locale;
+
/* Pre-5.8 signals control */
PL_signals = proto_perl->Isignals;
PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
- /* Treat a UTF-8 locale as not being in locale at all */
- if (IN_UTF8_CTYPE_LOCALE) {
- flags &= ~FOLD_FLAGS_LOCALE;
+ if (flags & FOLD_FLAGS_LOCALE) {
+ /* Treat a UTF-8 locale as not being in locale at all */
+ if (IN_UTF8_CTYPE_LOCALE) {
+ flags &= ~FOLD_FLAGS_LOCALE;
+ }
+ else {
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ }
}
if (c < 256) {
PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
- if (flags && IN_UTF8_CTYPE_LOCALE) {
- flags = FALSE;
+ if (flags) {
+ /* Treat a UTF-8 locale as not being in locale at all */
+ if (IN_UTF8_CTYPE_LOCALE) {
+ flags = FALSE;
+ }
+ else {
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ }
}
if (UTF8_IS_INVARIANT(*p)) {
PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
- if (flags && IN_UTF8_CTYPE_LOCALE) {
- flags = FALSE;
+ if (flags) {
+ /* Treat a UTF-8 locale as not being in locale at all */
+ if (IN_UTF8_CTYPE_LOCALE) {
+ flags = FALSE;
+ }
+ else {
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ }
}
if (UTF8_IS_INVARIANT(*p)) {
PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
- if (flags && IN_UTF8_CTYPE_LOCALE) {
- flags = FALSE;
+ if (flags) {
+ /* Treat a UTF-8 locale as not being in locale at all */
+ if (IN_UTF8_CTYPE_LOCALE) {
+ flags = FALSE;
+ }
+ else {
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ }
}
if (UTF8_IS_INVARIANT(*p)) {
assert(p != ustrp); /* Otherwise overwrites */
- if (flags & FOLD_FLAGS_LOCALE && IN_UTF8_CTYPE_LOCALE) {
- flags &= ~FOLD_FLAGS_LOCALE;
+ if (flags & FOLD_FLAGS_LOCALE) {
+ /* Treat a UTF-8 locale as not being in locale at all */
+ if (IN_UTF8_CTYPE_LOCALE) {
+ flags &= ~FOLD_FLAGS_LOCALE;
+ }
+ else {
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ }
}
if (UTF8_IS_INVARIANT(*p)) {