Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
}
else {
+ /* Assume enough space for every character being bad. 4 spaces each
+ * for the 94 printable characters that are output like "'x' "; and 5
+ * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
+ * NUL */
+ char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ];
+
+ bool check_for_problems = ckWARN_d(WARN_LOCALE); /* No warnings means
+ no check */
+ bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
+ to start */
+ unsigned int bad_count = 0; /* Count of bad characters */
+
for (i = 0; i < 256; i++) {
if (isUPPER_LC((U8) i))
PL_fold_locale[i] = (U8) toLOWER_LC((U8) i);
PL_fold_locale[i] = (U8) toUPPER_LC((U8) i);
else
PL_fold_locale[i] = (U8) i;
+
+ /* If checking for locale problems, see if the native ASCII-range
+ * printables plus \n and \t are in their expected categories in
+ * the new locale. If not, this could mean big trouble, upending
+ * Perl's and most programs' assumptions, like having a
+ * metacharacter with special meaning become a \w. Fortunately,
+ * it's very rare to find locales that aren't supersets of ASCII
+ * nowadays. It isn't a problem for most controls to be changed
+ * into something else; we check only \n and \t, though perhaps \r
+ * could be an issue as well. */
+ if (check_for_problems
+ && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
+ {
+ if ((isALPHANUMERIC_A(i) && ! isALPHANUMERIC_LC(i))
+ || (isPUNCT_A(i) && ! isPUNCT_LC(i))
+ || (isBLANK_A(i) && ! isBLANK_LC(i))
+ || (i == '\n' && ! isCNTRL_LC(i)))
+ {
+ if (bad_count) { /* Separate multiple entries with a
+ blank */
+ bad_chars_list[bad_count++] = ' ';
+ }
+ bad_chars_list[bad_count++] = '\'';
+ if (isPRINT_A(i)) {
+ bad_chars_list[bad_count++] = (char) i;
+ }
+ else {
+ bad_chars_list[bad_count++] = '\\';
+ if (i == '\n') {
+ bad_chars_list[bad_count++] = 'n';
+ }
+ else {
+ assert(i == '\t');
+ bad_chars_list[bad_count++] = 't';
+ }
+ }
+ bad_chars_list[bad_count++] = '\'';
+ bad_chars_list[bad_count] = '\0';
+ }
+ }
+ }
+
+#ifdef MB_CUR_MAX
+ /* We only handle single-byte locales (outside of UTF-8 ones); so if
+ * this locale requires more than one byte, there are going to be
+ * problems */
+ if (check_for_problems && MB_CUR_MAX > 1) {
+ multi_byte_locale = TRUE;
+ }
+#endif
+
+ if (bad_count || multi_byte_locale) {
+ setlocale(LC_CTYPE, "C");
+ Perl_warner(aTHX_ packWARN(WARN_LOCALE),
+ "Locale '%s' may not work well.%s%s%s\n",
+ newctype,
+ (multi_byte_locale)
+ ? " Some characters in it are not recognized by"
+ " Perl."
+ : "",
+ (bad_count)
+ ? "\nThe following characters (and maybe others)"
+ " may not have the same meaning as the Perl"
+ " program expects:\n"
+ : "",
+ (bad_count)
+ ? bad_chars_list
+ : ""
+ );
+ setlocale(LC_CTYPE, newctype);
}
}
L<Use of literal non-graphic characters in variable names is deprecated|perldiag/"Use of literal non-graphic characters in variable names is deprecated">
+=item *
+
+A new C<locale> warning category has been created, with only one warning
+message currently in it:
+L<Locale '%s' may not work well.%s|perldiag/Locale '%s' may not work well.%s>
+
=back
=head2 Changes to Existing Diagnostics
than it can reliably handle and C<localtime> probably returned the
wrong date.
+=item Locale '%s' may not work well.%s
+
+(W locale) The named locale that Perl is now trying to use is not fully
+compatible with Perl. The second C<%s> gives a reason.
+
+By far the most common reason is that the locale has characters in it
+that are represented by more than one byte. The only such locales that
+Perl can handle are the UTF-8 locales. Most likely the specified locale
+is a non-UTF-8 one for an East Asian language such as Chinese or
+Japanese. If the locale is a superset of ASCII, the ASCII portion of it
+may work in Perl. Read on for problems when it isn't a superset of
+ASCII.
+
+Some essentially obsolete locales that aren't supersets of ASCII, mainly
+those in ISO 646 or other 7-bit locales, such as ASMO 449, can also have
+problems, depending on what portions of the ASCII character set get
+changed by the locale and are also used by the program.
+The warning message lists the determinable conflicting characters.
+
=item Lookbehind longer than %d not implemented in regex m/%s/
(F) There is currently a limit on the length of string which lookbehind can
you may find--possibly to your surprise--that "|" moves from the
C<POSIX::ispunct()> class to C<POSIX::isalpha()>.
Unfortunately, this creates big problems for regular expressions. "|" still
-means alternation even though it matches C<\w>.
+means alternation even though it matches C<\w>. Starting in v5.22, a
+warning will be raised when such a locale is switched into. More
+details are given several paragraphs further down.
Starting in v5.20, Perl supports UTF-8 locales for C<LC_CTYPE>, but
otherwise Perl only supports single-byte locales, such as the ISO 8859
series. This means that wide character locales, for example for Asian
-languages, are not supported. The UTF-8 locale support is actually a
+languages, are not supported. (If the platform has the capability
+for Perl to detect such a locale, starting in Perl v5.22,
+L<Perl will warn, enabled by default|warnings/Category Hierarchy>,
+using the C<locale> warning category, whenever such a locale is switched
+into.) The UTF-8 locale support is actually a
superset of POSIX locales, because it is really full Unicode behavior
as if no locale were in effect at all (except for tainting; see
L</SECURITY>). POSIX locales, even UTF-8 ones,
for example, that C<\N> in regular expressions (every character
but new-line) works on the platform character set.
+Starting in v5.22, Perl will by default warn when switching into a
+locale that redefines any ASCII printable character (plus C<\t> and
+C<\n>) into a different class than expected. This is unlikely to
+happen on modern locales, but can happen with the ISO 646 and other
+7-bit locales that are essentially obsolete. Things may still work,
+depending on what features of Perl are used by the program. For
+example, in the case from above where C<"|"> becomes a C<\w>, if
+there are no regular expressions where this matters, the program may
+still work properly. The warning lists all the characters that
+it can determine could be adversely affected.
+
B<Note:> A broken or malicious C<LC_CTYPE> locale definition may result
in clearly ineligible characters being considered to be alphanumeric by
your application. For strict matching of (mundane) ASCII letters and