locale.c: Use safer code practice

[perl5.git] / locale.c
diff --git a/locale.c b/locale.c

index 2b66c8c..1e67e57 100644 (file)
--- a/locale.c
+++ b/locale.c
@@ -93,7 +93,6 @@ void
  Perl_set_numeric_radix(pTHX)
  {
  #ifdef USE_LOCALE_NUMERIC
-    dVAR;
  # ifdef HAS_LOCALECONV
      const struct lconv* const lc = localeconv();
  
@@ -127,6 +126,20 @@ Perl_set_numeric_radix(pTHX)
  #endif /* USE_LOCALE_NUMERIC */
  }
  
+/* Is the C string input 'name' "C" or "POSIX"?  If so, and 'name' is the
+ * return of setlocale(), then this is extremely likely to be the C or POSIX
+ * locale.  The output of setlocale() is documented to be opaque, but the
+ * odds are extremely small that it would return these two strings for some
+ * other locale.  Note that VMS in these two locales includes many non-ASCII
+ * characters as controls and punctuation (below are hex bytes):
+ *   cntrl:  00-1F 7F 84-97 9B-9F
+ *   punct:  21-2F 3A-40 5B-60 7B-7E A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
+ * Oddly, none there are listed as alphas, though some represent alphabetics:
+ * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
+#define isNAME_C_OR_POSIX(name) ((name) != NULL                                 \
+                                  && ((*(name) == 'C' && (*(name + 1)) == '\0') \
+                                       || strEQ((name), "POSIX")))
+
  void
  Perl_new_numeric(pTHX_ const char *newnum)
  {
@@ -164,7 +177,6 @@ Perl_new_numeric(pTHX_ const char *newnum)
       * POSIX::setlocale() */
  
      char *save_newnum;
-    dVAR;
  
      if (! newnum) {
         Safefree(PL_numeric_name);
@@ -180,8 +192,7 @@ Perl_new_numeric(pTHX_ const char *newnum)
         PL_numeric_name = save_newnum;
      }
  
-    PL_numeric_standard = ((*save_newnum == 'C' && save_newnum[1] == '\0')
-                            || strEQ(save_newnum, "POSIX"));
+    PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
      PL_numeric_local = TRUE;
  
      /* Keep LC_NUMERIC in the C locale.  This is for XS modules, so they don't
@@ -191,6 +202,8 @@ Perl_new_numeric(pTHX_ const char *newnum)
  
      set_numeric_radix();
  
+#else
+    PERL_UNUSED_ARG(newnum);
  #endif /* USE_LOCALE_NUMERIC */
  }
  
@@ -198,18 +211,16 @@ void
  Perl_set_numeric_standard(pTHX)
  {
  #ifdef USE_LOCALE_NUMERIC
-    dVAR;
-
-    /* Toggle the LC_NUMERIC locale to C, if not already there.  Probably
-     * should use the macros like SET_NUMERIC_STANDARD() in perl.h instead of
-     * calling this directly. */
-
-    if (_NOT_IN_NUMERIC_STANDARD) {
-       setlocale(LC_NUMERIC, "C");
-       PL_numeric_standard = TRUE;
-       PL_numeric_local = FALSE;
-       set_numeric_radix();
-    }
+    /* Toggle the LC_NUMERIC locale to C.  Most code should use the macros like
+     * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly.  The
+     * macro avoids calling this routine if toggling isn't necessary according
+     * to our records (which could be wrong if some XS code has changed the
+     * locale behind our back) */
+
+    setlocale(LC_NUMERIC, "C");
+    PL_numeric_standard = TRUE;
+    PL_numeric_local = isNAME_C_OR_POSIX(PL_numeric_name);
+    set_numeric_radix();
      DEBUG_L(PerlIO_printf(Perl_debug_log,
                            "Underlying LC_NUMERIC locale now is C\n"));
  
@@ -220,18 +231,16 @@ void
  Perl_set_numeric_local(pTHX)
  {
  #ifdef USE_LOCALE_NUMERIC
-    dVAR;
-
-    /* Toggle the LC_NUMERIC locale to the current underlying default, if not
-     * already there.  Probably should use the macros like SET_NUMERIC_LOCAL()
-     * in perl.h instead of calling this directly. */
-
-    if (_NOT_IN_NUMERIC_LOCAL) {
-       setlocale(LC_NUMERIC, PL_numeric_name);
-       PL_numeric_standard = FALSE;
-       PL_numeric_local = TRUE;
-       set_numeric_radix();
-    }
+    /* Toggle the LC_NUMERIC locale to the current underlying default.  Most
+     * code should use the macros like SET_NUMERIC_LOCAL() in perl.h instead of
+     * calling this directly.  The macro avoids calling this routine if
+     * toggling isn't necessary according to our records (which could be wrong
+     * if some XS code has changed the locale behind our back) */
+
+    setlocale(LC_NUMERIC, PL_numeric_name);
+    PL_numeric_standard = isNAME_C_OR_POSIX(PL_numeric_name);
+    PL_numeric_local = TRUE;
+    set_numeric_radix();
      DEBUG_L(PerlIO_printf(Perl_debug_log,
                            "Underlying LC_NUMERIC locale now is %s\n",
                            PL_numeric_name));
@@ -300,8 +309,6 @@ Perl_new_collate(pTHX_ const char *newcoll)
       * should be called directly only from this file and from
       * POSIX::setlocale() */
  
-    dVAR;
-
      if (! newcoll) {
         if (PL_collation_name) {
             ++PL_collation_ix;
@@ -318,8 +325,7 @@ Perl_new_collate(pTHX_ const char *newcoll)
         ++PL_collation_ix;
         Safefree(PL_collation_name);
         PL_collation_name = stdize_locale(savepv(newcoll));
-       PL_collation_standard = ((*newcoll == 'C' && newcoll[1] == '\0')
-                                || strEQ(newcoll, "POSIX"));
+       PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
  
         {
           /*  2: at most so many chars ('a', 'b'). */
@@ -337,6 +343,8 @@ Perl_new_collate(pTHX_ const char *newcoll)
         }
      }
  
+#else
+    PERL_UNUSED_ARG(newcoll);
  #endif /* USE_LOCALE_COLLATE */
  }
  
@@ -493,8 +501,6 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
      int ok = 1;
  
  #if defined(USE_LOCALE)
-    dVAR;
-
  #ifdef USE_LOCALE_CTYPE
      char *curctype   = NULL;
  #endif /* USE_LOCALE_CTYPE */
@@ -926,6 +932,8 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
      Safefree(curnum);
  #endif /* USE_LOCALE_NUMERIC */
  
+#else  /* !USE_LOCALE */
+    PERL_UNUSED_ARG(printwarn);
  #endif /* USE_LOCALE */
  
      return ok;
@@ -945,7 +953,6 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
  char *
  Perl_mem_collxfrm(pTHX_ const char *s, STRLEN len, STRLEN *xlen)
  {
-    dVAR;
      char *xbuf;
      STRLEN xAlloc, xin, xout; /* xalloc is a reserved word in VC */
  
@@ -1022,9 +1029,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
          return FALSE;   /* XXX maybe should croak */
      }
      save_input_locale = stdize_locale(savepv(save_input_locale));
-    if ((*save_input_locale == 'C' && save_input_locale[1] == '\0')
-        || strEQ(save_input_locale, "POSIX"))
-    {
+    if (isNAME_C_OR_POSIX(save_input_locale)) {
          DEBUG_L(PerlIO_printf(Perl_debug_log,
                                "Current locale for category %d is %s\n",
                                category, save_input_locale));
@@ -1043,12 +1048,13 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
          if (category != LC_CTYPE) { /* These work only on LC_CTYPE */
  
              /* Get the current LC_CTYPE locale */
-            save_ctype_locale = stdize_locale(savepv(setlocale(LC_CTYPE, NULL)));
+            save_ctype_locale = setlocale(LC_CTYPE, NULL);
              if (! save_ctype_locale) {
                  DEBUG_L(PerlIO_printf(Perl_debug_log,
                                 "Could not find current locale for LC_CTYPE\n"));
                  goto cant_use_nllanginfo;
              }
+            save_ctype_locale = stdize_locale(savepv(save_ctype_locale));
  
              /* If LC_CTYPE and the desired category use the same locale, this
               * means that finding the value for LC_CTYPE is the same as finding
@@ -1076,8 +1082,9 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
  
  #   if defined(HAS_NL_LANGINFO) && defined(CODESET)
          {
-            char *codeset = savepv(nl_langinfo(CODESET));
+            char *codeset = nl_langinfo(CODESET);
              if (codeset && strNE(codeset, "")) {
+                codeset = savepv(codeset);
  
                  /* If we switched LC_CTYPE, switch back */
                  if (save_ctype_locale) {
@@ -1095,7 +1102,6 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
                  Safefree(save_input_locale);
                  return is_utf8;
              }
-            Safefree(codeset);
          }
  
  #   endif
@@ -1153,99 +1159,36 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
  
    cant_use_nllanginfo:
  
-#endif /* HAS_NL_LANGINFO etc */
-
-    /* nl_langinfo not available or failed somehow.  Look at the locale name to
-     * see if it matches qr/UTF -? 8 /ix  */
-
-    final_pos = strlen(save_input_locale) - 1;
-    if (final_pos >= 3) {
-        char *name = save_input_locale;
-
-        /* Find next 'U' or 'u' and look from there */
-        while ((name += strcspn(name, "Uu") + 1)
-                                            <= save_input_locale + final_pos - 2)
-        {
-            if (toFOLD(*(name)) != 't'
-                || toFOLD(*(name + 1)) != 'f')
-            {
-                continue;
-            }
-            name += 2;
-            if (*(name) == '-') {
-                if ((name > save_input_locale + final_pos - 1)) {
-                    break;
-                }
-                name++;
-            }
-            if (*(name) == '8') {
-                Safefree(save_input_locale);
-                DEBUG_L(PerlIO_printf(Perl_debug_log,
-                                      "Locale %s ends with UTF-8 in name\n",
-                                      save_input_locale));
-                return TRUE;
-            }
-        }
-        DEBUG_L(PerlIO_printf(Perl_debug_log,
-                              "Locale %s doesn't end with UTF-8 in name\n",
-                                save_input_locale));
-    }
-
-#ifdef WIN32
-    /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
-    if (final_pos >= 4
-        && *(save_input_locale + final_pos - 0) == '1'
-        && *(save_input_locale + final_pos - 1) == '0'
-        && *(save_input_locale + final_pos - 2) == '0'
-        && *(save_input_locale + final_pos - 3) == '5'
-        && *(save_input_locale + final_pos - 4) == '6')
-    {
-        DEBUG_L(PerlIO_printf(Perl_debug_log,
-                        "Locale %s ends with 10056 in name, is UTF-8 locale\n",
-                        save_input_locale));
-        Safefree(save_input_locale);
-        return TRUE;
-    }
-#endif
+#else   /* nl_langinfo should work if available, so don't bother compiling this
+           fallback code.  The final fallback of looking at the name is
+           compiled, and will be executed if nl_langinfo fails */
  
-    /* Other common encodings are the ISO 8859 series, which aren't UTF-8 */
-    if (instr(save_input_locale, "8859")) {
-        DEBUG_L(PerlIO_printf(Perl_debug_log,
-                             "Locale %s has 8859 in name, not UTF-8 locale\n",
-                             save_input_locale));
-        Safefree(save_input_locale);
-        return FALSE;
-    }
+    /* nl_langinfo not available or failed somehow.  Next try looking at the
+     * currency symbol to see if it disambiguates things.  Often that will be
+     * in the native script, and if the symbol isn't in UTF-8, we know that the
+     * locale isn't.  If it is non-ASCII UTF-8, we infer that the locale is
+     * too. */
  
  #ifdef HAS_LOCALECONV
-
  #   ifdef USE_LOCALE_MONETARY
-
-    /* Here, there is nothing in the locale name to indicate whether the locale
-     * is UTF-8 or not.  This "name", the return of setlocale(), is actually
-     * defined to be opaque, so we can't really rely on the absence of various
-     * substrings in the name to indicate its UTF-8ness.  Look at the locale's
-     * currency symbol.  Often that will be in the native script, and if the
-     * symbol isn't in UTF-8, we know that the locale isn't.  If it is
-     * non-ASCII UTF-8, we infer that the locale is too.
-     * To do this, like above for LC_CTYPE, we first set LC_MONETARY to the
-     * locale of the desired category, if it isn't that locale already */
-
      {
          char *save_monetary_locale = NULL;
          bool illegal_utf8 = FALSE;
          bool only_ascii = FALSE;
          const struct lconv* const lc = localeconv();
  
+        /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of
+         * the desired category, if it isn't that locale already */
+
          if (category != LC_MONETARY) {
  
-            save_monetary_locale = stdize_locale(savepv(setlocale(LC_MONETARY,
-                                                                  NULL)));
+            save_monetary_locale = setlocale(LC_MONETARY, NULL);
              if (! save_monetary_locale) {
                  DEBUG_L(PerlIO_printf(Perl_debug_log,
                              "Could not find current locale for LC_MONETARY\n"));
                  goto cant_use_monetary;
              }
+            save_monetary_locale = stdize_locale(savepv(save_monetary_locale));
  
              if (strNE(save_monetary_locale, save_input_locale)) {
                  if (! setlocale(LC_MONETARY, save_input_locale)) {
@@ -1280,6 +1223,10 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
              Safefree(save_monetary_locale);
          }
  
+        if (only_ascii) {
+            goto cant_use_monetary;
+        }
+
          Safefree(save_input_locale);
  
          /* It isn't a UTF-8 locale if the symbol is not legal UTF-8; otherwise
@@ -1287,19 +1234,161 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
           * UTF-8.  (We can't really tell if the locale is UTF-8 or not if the
           * symbol is just a '$', so we err on the side of it not being UTF-8)
           * */
-        DEBUG_L(PerlIO_printf(Perl_debug_log, "\tis_utf8=%d\n", (illegal_utf8)
-                                                               ? FALSE
-                                                               : ! only_ascii));
-        return (illegal_utf8)
-                ? FALSE
-                : ! only_ascii;
-
+        DEBUG_L(PerlIO_printf(Perl_debug_log, "\tis_utf8=%d\n", ! illegal_utf8));
+        return ! illegal_utf8;
      }
    cant_use_monetary:
  
  #   endif /* USE_LOCALE_MONETARY */
  #endif /* HAS_LOCALECONV */
  
+#if 0 && defined(HAS_STRERROR) && defined(USE_LOCALE_MESSAGES)
+
+/* This code is ifdefd out because it was found to not be necessary in testing
+ * on our dromedary test machine, which has over 700 locales.  There, looking
+ * at just the currency symbol gave essentially the same results as doing this
+ * extra work.  Executing this also caused segfaults in miniperl.  I left it in
+ * so as to avoid rewriting it if real-world experience indicates that
+ * dromedary is an outlier.  Essentially, instead of returning above if we
+ * haven't found illegal utf8, we continue on and examine all the strerror()
+ * messages on the platform for utf8ness.  If all are ASCII, we still don't
+ * know the answer; but otherwise we have a pretty good indication of the
+ * utf8ness.  The reason this doesn't necessarily help much is that the
+ * messages may not have been translated into the locale.  The currency symbol
+ * is much more likely to have been translated.  The code below would need to
+ * be altered somewhat to just be a continuation of testing the currency
+ * symbol. */
+        int e;
+        unsigned int failures = 0, non_ascii = 0;
+        char *save_messages_locale = NULL;
+
+        /* Like above for LC_CTYPE, we set LC_MESSAGES to the locale of the
+         * desired category, if it isn't that locale already */
+
+        if (category != LC_MESSAGES) {
+
+            save_messages_locale = stdize_locale(savepv(setlocale(LC_MESSAGES,
+                                                                  NULL)));
+            if (! save_messages_locale) {
+                goto cant_use_messages;
+            }
+
+            if (strEQ(save_messages_locale, save_input_locale)) {
+                Safefree(save_input_locale);
+            }
+            else if (! setlocale(LC_MESSAGES, save_input_locale)) {
+                Safefree(save_messages_locale);
+                goto cant_use_messages;
+            }
+        }
+
+        /* Here the current LC_MESSAGES is set to the locale of the category
+         * whose information is desired.  Look through all the messages */
+
+        for (e = 0;
+#ifdef HAS_SYS_ERRLIST
+             e <= sys_nerr
+#endif
+             ; e++)
+        {
+            const U8* const errmsg = (U8 *) Strerror(e) ;
+            if (!errmsg)
+                break;
+            if (! is_utf8_string(errmsg, 0)) {
+                failures++;
+                break;
+            }
+            else if (! is_ascii_string(errmsg, 0)) {
+                non_ascii++;
+            }
+        }
+
+        /* And, if we changed it, restore LC_MESSAGES to its original locale */
+        if (save_messages_locale) {
+            setlocale(LC_MESSAGES, save_messages_locale);
+            Safefree(save_messages_locale);
+        }
+
+        /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
+         * any non-ascii means it is one; otherwise we assume it isn't */
+        return (failures) ? FALSE : non_ascii;
+
+    }
+  cant_use_messages:
+
+#endif
+
+#endif /* the code that is compiled when no nl_langinfo */
+
+    /* As a last resort, look at the locale name to see if it matches
+     * qr/UTF -? 8 /ix, or some other common locale names.  This "name", the
+     * return of setlocale(), is actually defined to be opaque, so we can't
+     * really rely on the absence of various substrings in the name to indicate
+     * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
+     * be a UTF-8 locale.  Similarly for the other common names */
+
+    final_pos = strlen(save_input_locale) - 1;
+    if (final_pos >= 3) {
+        char *name = save_input_locale;
+
+        /* Find next 'U' or 'u' and look from there */
+        while ((name += strcspn(name, "Uu") + 1)
+                                            <= save_input_locale + final_pos - 2)
+        {
+            if (toFOLD(*(name)) != 't'
+                || toFOLD(*(name + 1)) != 'f')
+            {
+                continue;
+            }
+            name += 2;
+            if (*(name) == '-') {
+                if ((name > save_input_locale + final_pos - 1)) {
+                    break;
+                }
+                name++;
+            }
+            if (*(name) == '8') {
+                Safefree(save_input_locale);
+                DEBUG_L(PerlIO_printf(Perl_debug_log,
+                                      "Locale %s ends with UTF-8 in name\n",
+                                      save_input_locale));
+                return TRUE;
+            }
+        }
+        DEBUG_L(PerlIO_printf(Perl_debug_log,
+                              "Locale %s doesn't end with UTF-8 in name\n",
+                                save_input_locale));
+    }
+
+#ifdef WIN32
+    /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
+    if (final_pos >= 4
+        && *(save_input_locale + final_pos - 0) == '1'
+        && *(save_input_locale + final_pos - 1) == '0'
+        && *(save_input_locale + final_pos - 2) == '0'
+        && *(save_input_locale + final_pos - 3) == '5'
+        && *(save_input_locale + final_pos - 4) == '6')
+    {
+        DEBUG_L(PerlIO_printf(Perl_debug_log,
+                        "Locale %s ends with 10056 in name, is UTF-8 locale\n",
+                        save_input_locale));
+        Safefree(save_input_locale);
+        return TRUE;
+    }
+#endif
+
+    /* Other common encodings are the ISO 8859 series, which aren't UTF-8.  But
+     * since we are about to return FALSE anyway, there is no point in doing
+     * this extra work */
+#if 0
+    if (instr(save_input_locale, "8859")) {
+        DEBUG_L(PerlIO_printf(Perl_debug_log,
+                             "Locale %s has 8859 in name, not UTF-8 locale\n",
+                             save_input_locale));
+        Safefree(save_input_locale);
+        return FALSE;
+    }
+#endif
  
      DEBUG_L(PerlIO_printf(Perl_debug_log,
                            "Assuming locale %s is not a UTF-8 locale\n",
@@ -1341,9 +1430,7 @@ Perl_my_strerror(pTHX_ const int errnum) {
  #ifdef USE_LOCALE_MESSAGES
      if (! IN_LC(LC_MESSAGES)) {
          char * save_locale = setlocale(LC_MESSAGES, NULL);
-        if (! ((*save_locale == 'C' && save_locale[1] == '\0')
-                || strEQ(save_locale, "POSIX")))
-        {
+        if (! isNAME_C_OR_POSIX(save_locale)) {
              char *errstr;
  
              /* The next setlocale likely will zap this, so create a copy */