From 0dec74cdf5288e3984f799818ed991a90bc9675b Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Fri, 5 Jan 2018 14:09:40 -0700
Subject: [PATCH 1/1] locale.c: Revamp finding if locale is UTF-8

This changes how this functionality works for the LC_CTYPE locale.  On
systems that have nl_langinfo(), one can get a "definitive" answer from
that alone.  Otherwise (or if nl_langinfo() doesn't return a usable
answer), one can use mbtowc() to check whether the UTF-8 byte sequence
for the Unicode REPLACEMENT CHARACTER really is interpreted as that code
point.  This is also "definitive".

If the maximum byte length of a character in the locale is too short to
represent all of Unicode in UTF-8, we know without further checking that
this isn't a UTF-8 locale, and so can skip the mbtowc() check.

It turns out, from testing, that some locales are labelled UTF-8 by
nl_langinfo() even though they depart from UTF-8 at times; similarly for
mbtowc().  Perl assumes that such a locale doesn't depart from UTF-8,
and uses its own internal rules, which it knows are UTF-8.  A future
commit will warn when this happens.
---
 locale.c | 134 +++++++++++++++++++++++++++++++--------------------------------
 1 file changed, 66 insertions(+), 68 deletions(-)

diff --git a/locale.c b/locale.c
index 583bf12..3391925 100644
--- a/locale.c
+++ b/locale.c
@@ -41,6 +41,10 @@
 
 #include "reentr.h"
 
+#ifdef I_WCHAR
+#  include <wchar.h>
+#endif
+
 /* If the environment says to, we can output debugging information during
  * initialization.  This is done before option parsing, and before any thread
  * creation, so can be a file-level static */
@@ -3068,7 +3072,6 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
 
     const char *save_input_locale = NULL;
     bool is_utf8 = FALSE;                /* The return value */
-    STRLEN final_pos;
 
     /* The variables below are for the cache of previous lookups using this
      * function.  The cache is a C string, described at the definition for
@@ -3157,14 +3160,14 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
     /* Here we don't have stored the utf8ness for the input locale.  We have to
      * calculate it */
 
-#  if defined(USE_LOCALE_CTYPE)                                                \
-    && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET)))
-
-    {   /* Next try nl_langinfo or MB_CUR_MAX if available */
+#  if defined(USE_LOCALE_CTYPE)                                                \
+    && (   (defined(HAS_NL_LANGINFO) && defined(CODESET))                      \
+        || defined(HAS_MBTOWC))
 
-        char *save_ctype_locale = NULL;
+    {
+        const char *save_ctype_locale = NULL;
 
-        if (category != LC_CTYPE) {    /* These work only on LC_CTYPE */
+        if (category != LC_CTYPE) {    /* Get the current LC_CTYPE locale */
 
             save_ctype_locale = do_setlocale_c(LC_CTYPE, NULL);
 
@@ -3195,12 +3198,32 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
                            save_input_locale));
 
     /* Here the current LC_CTYPE is set to the locale of the category whose
-     * information is desired.  This means that nl_langinfo() and MB_CUR_MAX
+     * information is desired.  This means that nl_langinfo() and mbtowc()
      * should give the correct results */
 
+#  ifdef MB_CUR_MAX  /* But we can potentially rule out UTF-8ness, avoiding
+                        calling the functions if we have this */
+
+    /* Standard UTF-8 needs at least 4 bytes to represent the maximum
+     * Unicode code point. */
+
+    DEBUG_L(PerlIO_printf(Perl_debug_log, "\tMB_CUR_MAX=%d\n",
+                                          (int) MB_CUR_MAX));
+    if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) {
+        is_utf8 = FALSE;
+        goto finish_ctype;
+    }
+
+#  endif
 
 #  if defined(HAS_NL_LANGINFO) && defined(CODESET)
 
-    {   /* The task is easiest if the platform has this POSIX 2001 function */
+    {   /* The task is easiest if the platform has this POSIX 2001 function.
+           Except on some platforms it can wrongly return "", so have to have
+           a fallback.  And it can return that it's UTF-8, even if there are
+           variances from that.  For example, Turkish locales may use the
+           alternate dotted I rules, and sometimes it appears to be a
+           defective locale definition.  XXX We should probably check for
+           these in the Latin1 range and warn */
         const char *codeset = my_nl_langinfo(PERL_CODESET, FALSE);
                                        /* FALSE => already in dest locale */
@@ -3208,11 +3231,6 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
                 "\tnllanginfo returned CODESET '%s'\n", codeset));
 
         if (codeset && strNE(codeset, "")) {
-            /* If we switched LC_CTYPE, switch back */
-            if (save_ctype_locale) {
-                do_setlocale_c(LC_CTYPE, save_ctype_locale);
-                Safefree(save_ctype_locale);
-            }
 
             /* If the implementation of foldEQ() somehow were
              * to change to not go byte-by-byte, this could
@@ -3225,56 +3243,38 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
             DEBUG_L(PerlIO_printf(Perl_debug_log,
                    "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
                                                          codeset, is_utf8));
-            goto finish_and_return;
+            goto finish_ctype;
         }
     }
 
 #  endif
 
-#  ifdef MB_CUR_MAX
-
-    /* Here, either we don't have nl_langinfo, or it didn't return a
-     * codeset.  Try MB_CUR_MAX */
-
-    /* Standard UTF-8 needs at least 4 bytes to represent the maximum
-     * Unicode code point.  Since UTF-8 is the only non-single byte
-     * encoding we handle, we just say any such encoding is UTF-8, and if
-     * turns out to be wrong, other things will fail */
-    is_utf8 = (unsigned) MB_CUR_MAX >= STRLENs(MAX_UNICODE_UTF8);
-
-    DEBUG_L(PerlIO_printf(Perl_debug_log,
-                          "\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n",
-                          (int) MB_CUR_MAX, is_utf8));
+#  if defined(HAS_MBTOWC)
+    /* We can see if this is a UTF-8-like locale if have mbtowc().  It was a
+     * late adder to C89, so very likely to have it.  However, testing has
+     * shown that, like nl_langinfo() above, there are locales that are not
+     * strictly UTF-8 that this will return that they are */
 
-    Safefree(save_input_locale);
-
-#  ifdef HAS_MBTOWC
-
-    /* ... But, most system that have MB_CUR_MAX will also have mbtowc(),
-     * since they are both in the C99 standard.  We can feed a known byte
-     * string to the latter function, and check that it gives the expected
-     * result */
-    if (is_utf8) {
+    {
        wchar_t wc;
        int len;
 
+       /* mbtowc() converts a byte string to a wide character.  Feed a byte
+        * string to it and check that the result is the expected Unicode
+        * code point */
+
        PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
        errno = 0;
        len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
+       DEBUG_L(PerlIO_printf(Perl_debug_log,
+               "\treturn from mbtowc; len=%d; code_point=%x; errno=%d\n",
+               len, (unsigned int) wc, errno));
 
-       if (   len != STRLENs(REPLACEMENT_CHARACTER_UTF8)
-           || wc != (wchar_t) UNICODE_REPLACEMENT)
-       {
-           is_utf8 = FALSE;
-           DEBUG_L(PerlIO_printf(Perl_debug_log, "\replacement=U+%x\n",
-                                                 (unsigned int)wc));
-           DEBUG_L(PerlIO_printf(Perl_debug_log,
-                   "\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n",
-                   len, errno));
-       }
+       is_utf8 = cBOOL(   len == STRLENs(REPLACEMENT_CHARACTER_UTF8)
+                       && wc == (wchar_t) UNICODE_REPLACEMENT);
    }
 
-#  endif
+  finish_ctype:
 
    /* If we switched LC_CTYPE, switch back */
    if (save_ctype_locale) {
@@ -3283,21 +3283,17 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
        }
 
        goto finish_and_return;
-
-#  endif
-
    }
 
-#  else   /* nl_langinfo should work if available, so don't bother compiling this
-             fallback code.  The final fallback of looking at the name is
-             compiled, and will be executed if nl_langinfo fails */
+#  endif
+#  else
 
-    /* nl_langinfo not available or failed somehow.  Next try looking at the
-     * currency symbol to see if it disambiguates things.  Often that will be
-     * in the native script, and if the symbol isn't in UTF-8, we know that the
-     * locale isn't.  If it is non-ASCII UTF-8, we infer that the locale is
-     * too, as the odds of a non-UTF8 string being valid UTF-8 are quite small
-     * */
+    /* Here, we must have a C89 compiler that doesn't have mbtowc().  Next
+     * try looking at the currency symbol to see if it disambiguates
+     * things.  Often that will be in the native script, and if the symbol
+     * isn't in UTF-8, we know that the locale isn't.  If it is non-ASCII
+     * UTF-8, we infer that the locale is too, as the odds of a non-UTF8
+     * string being valid UTF-8 are quite small */
 
 #  ifdef HAS_LOCALECONV
 #    ifdef USE_LOCALE_MONETARY
@@ -3543,9 +3539,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
    }
 
 #  endif
-#  endif /* the code that is compiled when no nl_langinfo */
-
-#      ifndef EBCDIC  /* On os390, even if the name ends with "UTF-8', it isn't a
+#    ifndef EBCDIC  /* On os390, even if the name ends with "UTF-8', it isn't a
                          UTF-8 locale */
 
    /* As a last resort, look at the locale name to see if it matches
@@ -3555,7 +3549,9 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
     * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
     * be a UTF-8 locale.  Similarly for the other common names */
 
-    final_pos = strlen(save_input_locale) - 1;
+    {
+        const Size_t final_pos = strlen(save_input_locale) - 1;
+
    if (final_pos >= 3) {
        const char *name = save_input_locale;
 
@@ -3598,6 +3594,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
        is_utf8 = TRUE;
        goto finish_and_return;
    }
+    }
 
 #    endif
 #  endif
@@ -3606,7 +3603,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
     * since we are about to return FALSE anyway, there is no point in doing
     * this extra work */
 
-#      if 0
+#    if 0
    if (instr(save_input_locale, "8859")) {
        DEBUG_L(PerlIO_printf(Perl_debug_log,
                    "Locale %s has 8859 in name, not UTF-8 locale\n",
@@ -3614,13 +3611,15 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
        is_utf8 = FALSE;
        goto finish_and_return;
    }
-#      endif
+#    endif
 
    DEBUG_L(PerlIO_printf(Perl_debug_log,
                "Assuming locale %s is not a UTF-8 locale\n",
                                                  save_input_locale));
    is_utf8 = FALSE;
 
+#  endif /* the code that is compiled when no modern LC_CTYPE */
+
   finish_and_return:
 
    /* Cache this result so we don't have to go through all this next time. */
@@ -3690,7 +3689,6 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
 
 #endif
 
-
 bool
 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
 {
-- 
1.8.3.1
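
For experimenting with the same detection order outside of the Perl core, the
following is a minimal standalone sketch: rule UTF-8 out cheaply via
MB_CUR_MAX, trust nl_langinfo(CODESET) when it returns a non-empty name, and
otherwise probe mbtowc() with the UTF-8 bytes for U+FFFD REPLACEMENT
CHARACTER.  The file name utf8check.c, the helper name locale_is_utf8(), and
the hard-coded threshold of 4 are assumptions of this sketch only; the patch
itself uses Perl's internal STRLENs(MAX_UNICODE_UTF8),
REPLACEMENT_CHARACTER_UTF8, and my_nl_langinfo(), restores the caller's
LC_CTYPE, and handles non-LC_CTYPE categories and EBCDIC platforms, none of
which this toy program attempts.

/* utf8check.c - standalone illustration of the detection order; not part of
 * the patch itself */
#include <langinfo.h>   /* nl_langinfo(), CODESET */
#include <locale.h>     /* setlocale() */
#include <stdio.h>
#include <stdlib.h>     /* MB_CUR_MAX, mbtowc() */
#include <strings.h>    /* strcasecmp() */
#include <wchar.h>      /* wchar_t */

/* UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER; the patch uses Perl's
 * REPLACEMENT_CHARACTER_UTF8 macro instead of a hard-coded string */
static const char rep_utf8[] = "\xEF\xBF\xBD";

/* Returns 1 if the current LC_CTYPE locale looks like UTF-8.  The name and
 * structure are illustrative; Perl's real code is
 * Perl__is_cur_LC_category_utf8() in locale.c */
static int
locale_is_utf8(void)
{
    const char *codeset;
    wchar_t wc = 0;
    int len;

    /* A UTF-8 locale needs MB_CUR_MAX of at least 4 to represent the highest
     * Unicode code point; anything smaller cannot be UTF-8 */
    if ((unsigned) MB_CUR_MAX < 4)
        return 0;

    /* nl_langinfo(CODESET) is usually definitive, but may return "" on some
     * platforms, hence the mbtowc() fallback below */
    codeset = nl_langinfo(CODESET);
    if (codeset && *codeset)
        return strcasecmp(codeset, "UTF-8") == 0
            || strcasecmp(codeset, "utf8")  == 0;

    /* Fallback: in a UTF-8 locale, mbtowc() consumes all three bytes of the
     * U+FFFD sequence and yields exactly that code point */
    mbtowc(NULL, NULL, 0);                  /* reset any shift state */
    len = mbtowc(&wc, rep_utf8, sizeof(rep_utf8) - 1);
    return len == (int) (sizeof(rep_utf8) - 1) && wc == 0xFFFD;
}

int
main(int argc, char **argv)
{
    /* Use the locale named on the command line, or the environment's */
    setlocale(LC_CTYPE, argc > 1 ? argv[1] : "");
    printf("LC_CTYPE looks like a %sUTF-8 locale\n",
           locale_is_utf8() ? "" : "non-");
    return 0;
}

Compiling it with something like "cc utf8check.c -o utf8check" and running
"./utf8check en_US.UTF-8" versus "./utf8check C" should show the difference,
locale availability permitting.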