Raise warning on multi-byte char in single-byte locale

author Karl Williamson <khw@cpan.org>

Mon, 29 Dec 2014 20:15:57 +0000 (13:15 -0700)

committer Karl Williamson <khw@cpan.org>

Mon, 29 Dec 2014 20:52:57 +0000 (13:52 -0700)
author Karl Williamson <khw@cpan.org>
Mon, 29 Dec 2014 20:15:57 +0000 (13:15 -0700)
committer Karl Williamson <khw@cpan.org>
Mon, 29 Dec 2014 20:52:57 +0000 (13:52 -0700)
diff --git a/perl.h b/perl.h

index 35624b5..89a7d43 100644 (file)
--- a/perl.h
+++ b/perl.h
@@ -5798,6 +5798,27 @@ typedef struct am_table_short AMTS;
          }  STMT_END
  
  
+    /* These two internal macros are called when a warning should be raised,
+     * and will do so if enabled.  The first takes a single code point
+     * argument; the 2nd, is a pointer to the first byte of the UTF-8 encoded
+     * string, and an end position which it won't try to read past */
+#   define _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(cp)                         \
+      Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),                           \
+             "Wide character (U+%"UVXf") in %s", (UV) cp, OP_DESC(PL_op));
+
+#  define _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(s, send)                   \
+       STMT_START { /* Check if to warn before doing the conversion work */\
+            if (ckWARN(WARN_LOCALE)) {                                      \
+                UV cp = utf8_to_uvchr_buf((U8 *) s, (U8 *) send, NULL);     \
+                Perl_warner(aTHX_ packWARN(WARN_LOCALE),                    \
+                    "Wide character (U+%"UVXf") in %s",                     \
+                    (cp == 0)                                               \
+                     ? UNICODE_REPLACEMENT                                  \
+                     : (UV) cp,                                             \
+                    OP_DESC(PL_op));                                        \
+            }                                                               \
+        }  STMT_END
+
  #   endif   /* PERL_CORE or PERL_IN_XSUB_RE */
  
  #else   /* No locale usage */
@@ -5816,6 +5837,8 @@ typedef struct am_table_short AMTS;
  #   define IN_LC(category)                  0
  
  #   define _CHECK_AND_WARN_PROBLEMATIC_LOCALE
+#   define _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(a)
+#   define _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(a,b)
  #endif
  
  #ifdef USE_LOCALE_NUMERIC
diff --git a/pod/perldelta.pod b/pod/perldelta.pod

index a494565..6a830b9 100644 (file)
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -209,6 +209,10 @@ XXX L<message|perldiag/"message">
  
  XXX L<message|perldiag/"message">
  
+=item *
+
+L<Wide character (U+%X) in %s|perldiag/"Wide character (U+%X) in %s">
+
  =back
  
  =head2 Changes to Existing Diagnostics
diff --git a/pod/perldiag.pod b/pod/perldiag.pod

index 63df68d..4979da2 100644 (file)
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -6964,6 +6964,20 @@ warning is to add C<no warnings 'utf8';> but that is often closer to
  cheating.  In general, you are supposed to explicitly mark the
  filehandle with an encoding, see L<open> and L<perlfunc/binmode>.
  
+=item Wide character (U+%X) in %s
+
+(W locale) While in a single-byte locale (I<i.e.>, a non-UTF-8
+one), a multi-byte character was encountered.   Perl considers this
+character to be the specified Unicode code point.  Combining non-UTF8
+locales and Unicode is dangerous.  Almost certainly some characters
+will have two different representations.  For example, in the ISO 8859-7
+(Greek) locale, the code point 0xC3 represents a Capital Gamma.  But so
+also does 0x393.  This will make string comparisons unreliable.
+
+You likely need to figure out how this multi-byte character got mixed up
+with your single-byte locale (or perhaps you thought you had a UTF-8
+locale, but Perl disagrees).
+
  =item Within []-length '%c' not allowed
  
  (F) The count in the (un)pack template may be replaced by C<[TEMPLATE]>
diff --git a/pod/perllocale.pod b/pod/perllocale.pod

index 17fddcb..3b2d79d 100644 (file)
--- a/pod/perllocale.pod
+++ b/pod/perllocale.pod
@@ -1519,6 +1519,11 @@ Still another problem is that this approach can lead to two code
  points meaning the same character.  Thus in a Greek locale, both U+03A7
  and U+00D7 are GREEK CAPITAL LETTER CHI.
  
+Because of all these problems, starting in v5.22, Perl will raise a
+warning if a multi-byte (hence Unicode) code point is used when a
+single-byte locale is in effect.  (Although it doesn't check for this if
+doing so would unreasonably slow execution down.)
+
  Vendor locales are notoriously buggy, and it is difficult for Perl to test
  its locale-handling code because this interacts with code that Perl has no
  control over; therefore the locale-handling code in Perl may be buggy as
diff --git a/regexec.c b/regexec.c

index 776cfd5..e659f4b 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -512,6 +512,8 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
                          TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
      }
  
+    _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
+
      if (classnum < _FIRST_NON_SWASH_CC) {
  
          /* Initialize the swash unless done already */
@@ -1457,6 +1459,9 @@ STMT_START {
      switch (trie_type) {                                                            \
      case trie_flu8:                                                                 \
          _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
+        if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
+            _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
+        }                                                                           \
          goto do_trie_utf8_fold;                                                     \
      case trie_utf8_exactfa_fold:                                                    \
          flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
@@ -1495,6 +1500,9 @@ STMT_START {
          break;                                                                      \
      case trie_utf8l:                                                                \
          _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
+        if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
+            _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
+        }                                                                           \
          /* FALLTHROUGH */                                                           \
      case trie_utf8:                                                                 \
          uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
@@ -1819,7 +1827,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                                         UTF-8 to express.  */
                  break;
              }
-            utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
+            utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
+                                             | FOLDEQ_S2_FOLDS_SANE;
              goto do_exactf_utf8;
  
      case EXACTFU:
@@ -4185,6 +4194,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
  
                  if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
                      _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+                    if (utf8_target
+                        && UTF8_IS_ABOVE_LATIN1(nextchr)
+                        && scan->flags == EXACTL)
+                    {
+                        /* We only output for EXACTL, as we let the folder
+                         * output this message for EXACTFLU8 to avoid
+                         * duplication */
+                        _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
+                                                               reginfo->strend);
+                    }
                  }
                  if (   trie->bitmap
                      && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
@@ -4461,6 +4480,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
  
         case EXACTL:             /*  /abc/l       */
              _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+            /* Complete checking would involve going through every character
+             * matched by the string to see if any is above latin1.  But the
+             * comparision otherwise might very well be a fast assembly
+             * language routine, and I (khw) don't think slowing things down
+             * just to check for this warning is worth it.  So this just checks
+             * the first character */
+            if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
+                _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
+            }
              /* FALLTHROUGH */
         case EXACT: {            /*  /abc/        */
             char *s = STRING(scan);
@@ -4560,7 +4589,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
              if (! utf8_target) {
                  sayNO;
              }
-           fold_utf8_flags = FOLDEQ_S1_ALREADY_FOLDED;
+            fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
+                                             | FOLDEQ_S1_FOLDS_SANE;
             goto do_exactf;
  
         case EXACTFU_SS:         /*  /\x{df}/iu   */
@@ -4758,6 +4788,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                  }
              }
              else { /* Here, must be an above Latin-1 code point */
+                _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
                  goto utf8_posix_above_latin1;
              }
  
@@ -7231,6 +7262,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
         break;
      case EXACTL:
          _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+        if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
+            _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
+        }
          /* FALLTHROUGH */
      case EXACT:
          assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
@@ -7318,7 +7352,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
          if (! utf8_target) {
              break;
          }
-       utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
+        utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
+                                    | FOLDEQ_S2_FOLDS_SANE;
          goto do_exactf;
  
      case EXACTFU_SS:
@@ -7733,6 +7768,9 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
                  * UTF8_ALLOW_FFFF */
         if (c_len == (STRLEN)-1)
             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
+        if (c > 255 && OP(n) == ANYOFL && ! is_ANYOF_SYNTHETIC(n)) {
+            _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
+        }
      }
  
      /* If this character is potentially in the bitmap, check it */
diff --git a/t/lib/warnings/regexec b/t/lib/warnings/regexec

index 73696df..0c6a16a 100644 (file)
--- a/t/lib/warnings/regexec
+++ b/t/lib/warnings/regexec
@@ -117,3 +117,29 @@ $_ = 'a' x (2**15+1);
  #
  EXPECT
  
+########
+# NAME Wide character in non-UTF-8 locale
+eval { require POSIX; POSIX->import("locale_h") };
+if ($@) {
+    print("SKIPPED\n# no POSIX\n"),exit;
+}
+use warnings 'locale';
+use locale;
+setlocale(&POSIX::LC_CTYPE, "C");
+"\x{100}" =~ /\x{100}|\x{101}/il;
+"\x{100}" =~ /\x{100}|\x{101}/l;
+"\x{100}" =~ /\w/l;
+"\x{100}" =~ /\x{100}+/l;
+"\x{100}" =~ /[\x{100}\x{102}]/l;
+no warnings 'locale';
+EXPECT
+Wide character (U+100) in pattern match (m//) at - line 8.
+Wide character (U+100) in pattern match (m//) at - line 8.
+Wide character (U+100) in pattern match (m//) at - line 9.
+Wide character (U+100) in pattern match (m//) at - line 9.
+Wide character (U+100) in pattern match (m//) at - line 9.
+Wide character (U+100) in pattern match (m//) at - line 10.
+Wide character (U+100) in pattern match (m//) at - line 10.
+Wide character (U+100) in pattern match (m//) at - line 11.
+Wide character (U+100) in pattern match (m//) at - line 12.
+Wide character (U+100) in pattern match (m//) at - line 12.
diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8

index abce3d1..75f3f25 100644 (file)
--- a/t/lib/warnings/utf8
+++ b/t/lib/warnings/utf8
@@ -612,3 +612,31 @@ Can't do fc("\x{FB05}") on non-UTF-8 locale; resolved to "\x{FB06}". at - line 1
  Can't do uc("\x{FB00}") on non-UTF-8 locale; resolved to "\x{FB00}". at - line 13.
  Can't do ucfirst("\x{149}") on non-UTF-8 locale; resolved to "\x{149}". at - line 14.
  Can't do lcfirst("\x{178}") on non-UTF-8 locale; resolved to "\x{178}". at - line 15.
+########
+# NAME Wide character in non-UTF-8 locale
+eval { require POSIX; POSIX->import("locale_h") };
+if ($@) {
+    print("SKIPPED\n# no POSIX\n"),exit;
+}
+use warnings 'locale';
+use feature 'fc';
+use locale;
+setlocale(&POSIX::LC_CTYPE, "C");
+my $a;
+$a = lc("\x{100}");
+$a = lcfirst("\x{101}");
+$a = fc("\x{102}");
+$a = uc("\x{103}");
+$a = ucfirst("\x{104}");
+no warnings 'locale';
+$a = lc("\x{100}");
+$a = lcfirst("\x{101}");
+$a = fc("\x{102}");
+$a = uc("\x{103}");
+$a = ucfirst("\x{104}");
+EXPECT
+Wide character (U+100) in lc at - line 10.
+Wide character (U+101) in lcfirst at - line 11.
+Wide character (U+102) in fc at - line 12.
+Wide character (U+103) in uc at - line 13.
+Wide character (U+104) in ucfirst at - line 14.
diff --git a/t/re/charset.t b/t/re/charset.t

index 4d0d99c..e061916 100644 (file)
--- a/t/re/charset.t
+++ b/t/re/charset.t
@@ -9,6 +9,8 @@ BEGIN {
  
  use strict;
  use warnings;
+no warnings 'locale';   # Some /l tests use above-latin1 chars to make sure
+                        # they work, even though they warn.
  use Config;
  
  plan('no_plan');
diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t

index c210e2e..19d6fbc 100644 (file)
--- a/t/re/pat_advanced.t
+++ b/t/re/pat_advanced.t
@@ -866,6 +866,7 @@ sub run_tests {
          ok "\x{100}X" =~ /$re/, "S_cl_and ANYOF_UNICODE & ANYOF_INVERTED";
          my $loc_re = qq /(?l:^([^X]*)X)/;
          utf8::upgrade ($loc_re);
+        no warnings 'locale';
          ok "\x{100}X" =~ /$loc_re/, "locale, S_cl_and ANYOF_UNICODE & ANYOF_INVERTED";
      }
  
diff --git a/utf8.c b/utf8.c

index b5470a8..8551e11 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1914,7 +1914,8 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c
             s += UTF8SKIP(s);
         }
  
-       /* Here, no characters crossed, result is ok as-is */
+        /* Here, no characters crossed, result is ok as-is, but we warn. */
+        _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
         return result;
      }
author	Karl Williamson <khw@cpan.org>
	Mon, 29 Dec 2014 20:15:57 +0000 (13:15 -0700)
committer	Karl Williamson <khw@cpan.org>
	Mon, 29 Dec 2014 20:52:57 +0000 (13:52 -0700)
perl.h		patch \| blob \| blame \| history
pod/perldelta.pod		patch \| blob \| blame \| history
pod/perldiag.pod		patch \| blob \| blame \| history
pod/perllocale.pod		patch \| blob \| blame \| history
regexec.c		patch \| blob \| blame \| history
t/lib/warnings/regexec		patch \| blob \| blame \| history
t/lib/warnings/utf8		patch \| blob \| blame \| history
t/re/charset.t		patch \| blob \| blame \| history
t/re/pat_advanced.t		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history