} STMT_END
+ /* These two internal macros are called when a warning should be raised,
+ * and will do so if enabled. The first takes a single code point
+ * argument; the 2nd, is a pointer to the first byte of the UTF-8 encoded
+ * string, and an end position which it won't try to read past */
+# define _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(cp) \
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
+ "Wide character (U+%"UVXf") in %s", (UV) cp, OP_DESC(PL_op));
+
+# define _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(s, send) \
+ STMT_START { /* Check if to warn before doing the conversion work */\
+ if (ckWARN(WARN_LOCALE)) { \
+ UV cp = utf8_to_uvchr_buf((U8 *) s, (U8 *) send, NULL); \
+ Perl_warner(aTHX_ packWARN(WARN_LOCALE), \
+ "Wide character (U+%"UVXf") in %s", \
+ (cp == 0) \
+ ? UNICODE_REPLACEMENT \
+ : (UV) cp, \
+ OP_DESC(PL_op)); \
+ } \
+ } STMT_END
+
# endif /* PERL_CORE or PERL_IN_XSUB_RE */
#else /* No locale usage */
# define IN_LC(category) 0
# define _CHECK_AND_WARN_PROBLEMATIC_LOCALE
+# define _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(a)
+# define _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(a,b)
#endif
#ifdef USE_LOCALE_NUMERIC
XXX L<message|perldiag/"message">
+=item *
+
+L<Wide character (U+%X) in %s|perldiag/"Wide character (U+%X) in %s">
+
=back
=head2 Changes to Existing Diagnostics
cheating. In general, you are supposed to explicitly mark the
filehandle with an encoding, see L<open> and L<perlfunc/binmode>.
+=item Wide character (U+%X) in %s
+
+(W locale) While in a single-byte locale (I<i.e.>, a non-UTF-8
+one), a multi-byte character was encountered. Perl considers this
+character to be the specified Unicode code point. Combining non-UTF8
+locales and Unicode is dangerous. Almost certainly some characters
+will have two different representations. For example, in the ISO 8859-7
+(Greek) locale, the code point 0xC3 represents a Capital Gamma. But so
+also does 0x393. This will make string comparisons unreliable.
+
+You likely need to figure out how this multi-byte character got mixed up
+with your single-byte locale (or perhaps you thought you had a UTF-8
+locale, but Perl disagrees).
+
=item Within []-length '%c' not allowed
(F) The count in the (un)pack template may be replaced by C<[TEMPLATE]>
points meaning the same character. Thus in a Greek locale, both U+03A7
and U+00D7 are GREEK CAPITAL LETTER CHI.
+Because of all these problems, starting in v5.22, Perl will raise a
+warning if a multi-byte (hence Unicode) code point is used when a
+single-byte locale is in effect. (Although it doesn't check for this if
+doing so would unreasonably slow execution down.)
+
Vendor locales are notoriously buggy, and it is difficult for Perl to test
its locale-handling code because this interacts with code that Perl has no
control over; therefore the locale-handling code in Perl may be buggy as
TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
}
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
+
if (classnum < _FIRST_NON_SWASH_CC) {
/* Initialize the swash unless done already */
switch (trie_type) { \
case trie_flu8: \
_CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
+ if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
+ } \
goto do_trie_utf8_fold; \
case trie_utf8_exactfa_fold: \
flags |= FOLD_FLAGS_NOMIX_ASCII; \
break; \
case trie_utf8l: \
_CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
+ if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
+ } \
/* FALLTHROUGH */ \
case trie_utf8: \
uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
UTF-8 to express. */
break;
}
- utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
+ utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
+ | FOLDEQ_S2_FOLDS_SANE;
goto do_exactf_utf8;
case EXACTFU:
if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (utf8_target
+ && UTF8_IS_ABOVE_LATIN1(nextchr)
+ && scan->flags == EXACTL)
+ {
+ /* We only output for EXACTL, as we let the folder
+ * output this message for EXACTFLU8 to avoid
+ * duplication */
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
+ reginfo->strend);
+ }
}
if ( trie->bitmap
&& (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
case EXACTL: /* /abc/l */
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ /* Complete checking would involve going through every character
+ * matched by the string to see if any is above latin1. But the
+ * comparision otherwise might very well be a fast assembly
+ * language routine, and I (khw) don't think slowing things down
+ * just to check for this warning is worth it. So this just checks
+ * the first character */
+ if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
+ }
/* FALLTHROUGH */
case EXACT: { /* /abc/ */
char *s = STRING(scan);
if (! utf8_target) {
sayNO;
}
- fold_utf8_flags = FOLDEQ_S1_ALREADY_FOLDED;
+ fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
+ | FOLDEQ_S1_FOLDS_SANE;
goto do_exactf;
case EXACTFU_SS: /* /\x{df}/iu */
}
}
else { /* Here, must be an above Latin-1 code point */
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
goto utf8_posix_above_latin1;
}
break;
case EXACTL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
+ }
/* FALLTHROUGH */
case EXACT:
assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
if (! utf8_target) {
break;
}
- utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
+ utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
+ | FOLDEQ_S2_FOLDS_SANE;
goto do_exactf;
case EXACTFU_SS:
* UTF8_ALLOW_FFFF */
if (c_len == (STRLEN)-1)
Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
+ if (c > 255 && OP(n) == ANYOFL && ! is_ANYOF_SYNTHETIC(n)) {
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
+ }
}
/* If this character is potentially in the bitmap, check it */
#
EXPECT
+########
+# NAME Wide character in non-UTF-8 locale
+eval { require POSIX; POSIX->import("locale_h") };
+if ($@) {
+ print("SKIPPED\n# no POSIX\n"),exit;
+}
+use warnings 'locale';
+use locale;
+setlocale(&POSIX::LC_CTYPE, "C");
+"\x{100}" =~ /\x{100}|\x{101}/il;
+"\x{100}" =~ /\x{100}|\x{101}/l;
+"\x{100}" =~ /\w/l;
+"\x{100}" =~ /\x{100}+/l;
+"\x{100}" =~ /[\x{100}\x{102}]/l;
+no warnings 'locale';
+EXPECT
+Wide character (U+100) in pattern match (m//) at - line 8.
+Wide character (U+100) in pattern match (m//) at - line 8.
+Wide character (U+100) in pattern match (m//) at - line 9.
+Wide character (U+100) in pattern match (m//) at - line 9.
+Wide character (U+100) in pattern match (m//) at - line 9.
+Wide character (U+100) in pattern match (m//) at - line 10.
+Wide character (U+100) in pattern match (m//) at - line 10.
+Wide character (U+100) in pattern match (m//) at - line 11.
+Wide character (U+100) in pattern match (m//) at - line 12.
+Wide character (U+100) in pattern match (m//) at - line 12.
Can't do uc("\x{FB00}") on non-UTF-8 locale; resolved to "\x{FB00}". at - line 13.
Can't do ucfirst("\x{149}") on non-UTF-8 locale; resolved to "\x{149}". at - line 14.
Can't do lcfirst("\x{178}") on non-UTF-8 locale; resolved to "\x{178}". at - line 15.
+########
+# NAME Wide character in non-UTF-8 locale
+eval { require POSIX; POSIX->import("locale_h") };
+if ($@) {
+ print("SKIPPED\n# no POSIX\n"),exit;
+}
+use warnings 'locale';
+use feature 'fc';
+use locale;
+setlocale(&POSIX::LC_CTYPE, "C");
+my $a;
+$a = lc("\x{100}");
+$a = lcfirst("\x{101}");
+$a = fc("\x{102}");
+$a = uc("\x{103}");
+$a = ucfirst("\x{104}");
+no warnings 'locale';
+$a = lc("\x{100}");
+$a = lcfirst("\x{101}");
+$a = fc("\x{102}");
+$a = uc("\x{103}");
+$a = ucfirst("\x{104}");
+EXPECT
+Wide character (U+100) in lc at - line 10.
+Wide character (U+101) in lcfirst at - line 11.
+Wide character (U+102) in fc at - line 12.
+Wide character (U+103) in uc at - line 13.
+Wide character (U+104) in ucfirst at - line 14.
use strict;
use warnings;
+no warnings 'locale'; # Some /l tests use above-latin1 chars to make sure
+ # they work, even though they warn.
use Config;
plan('no_plan');
ok "\x{100}X" =~ /$re/, "S_cl_and ANYOF_UNICODE & ANYOF_INVERTED";
my $loc_re = qq /(?l:^([^X]*)X)/;
utf8::upgrade ($loc_re);
+ no warnings 'locale';
ok "\x{100}X" =~ /$loc_re/, "locale, S_cl_and ANYOF_UNICODE & ANYOF_INVERTED";
}
s += UTF8SKIP(s);
}
- /* Here, no characters crossed, result is ok as-is */
+ /* Here, no characters crossed, result is ok as-is, but we warn. */
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
return result;
}