This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
locale.c: Not so aggressive collation memory use guess
[perl5.git] / locale.c
index 667a1bc..48b9184 100644 (file)
--- a/locale.c
+++ b/locale.c
 
 #include "reentr.h"
 
+/* If the environment says to, we can output debugging information during
+ * initialization.  This is done before option parsing, and before any thread
+ * creation, so can be a file-level static */
+#ifdef DEBUGGING
+static bool debug_initialization = FALSE;
+#endif
+
 #ifdef USE_LOCALE
 
 /*
@@ -119,13 +126,17 @@ Perl_set_numeric_radix(pTHX)
     else
        PL_numeric_radix_sv = NULL;
 
-    DEBUG_L(PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
+#ifdef DEBUGGING
+    if (DEBUG_L_TEST || debug_initialization) {
+        PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
                                           (PL_numeric_radix_sv)
                                            ? SvPVX(PL_numeric_radix_sv)
                                            : "NULL",
                                           (PL_numeric_radix_sv)
                                            ? cBOOL(SvUTF8(PL_numeric_radix_sv))
-                                           : 0));
+                                           : 0);
+    }
+#endif
 
 # endif /* HAS_LOCALECONV */
 #endif /* USE_LOCALE_NUMERIC */
@@ -230,8 +241,12 @@ Perl_set_numeric_standard(pTHX)
     PL_numeric_standard = TRUE;
     PL_numeric_local = isNAME_C_OR_POSIX(PL_numeric_name);
     set_numeric_radix();
-    DEBUG_L(PerlIO_printf(Perl_debug_log,
-                          "Underlying LC_NUMERIC locale now is C\n"));
+#ifdef DEBUGGING
+    if (DEBUG_L_TEST || debug_initialization) {
+        PerlIO_printf(Perl_debug_log,
+                          "Underlying LC_NUMERIC locale now is C\n");
+    }
+#endif
 
 #endif /* USE_LOCALE_NUMERIC */
 }
@@ -250,9 +265,13 @@ Perl_set_numeric_local(pTHX)
     PL_numeric_standard = isNAME_C_OR_POSIX(PL_numeric_name);
     PL_numeric_local = TRUE;
     set_numeric_radix();
-    DEBUG_L(PerlIO_printf(Perl_debug_log,
+#ifdef DEBUGGING
+    if (DEBUG_L_TEST || debug_initialization) {
+        PerlIO_printf(Perl_debug_log,
                           "Underlying LC_NUMERIC locale now is %s\n",
-                          PL_numeric_name));
+                          PL_numeric_name);
+    }
+#endif
 
 #endif /* USE_LOCALE_NUMERIC */
 }
@@ -647,6 +666,19 @@ Perl_new_collate(pTHX_ const char *newcoll)
                 /* Add 1 for the trailing NUL */
                 PL_collxfrm_base = base + 1;
             }
+
+#ifdef DEBUGGING
+            if (DEBUG_L_TEST || debug_initialization) {
+                PerlIO_printf(Perl_debug_log,
+                    "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%"UVuf", "
+                    "x_len_longer=%"UVuf","
+                    " collate multipler=%"UVuf", collate base=%"UVuf"\n",
+                    __FILE__, __LINE__,
+                    PL_in_utf8_COLLATE_locale,
+                    x_len_shorter, x_len_longer,
+                    PL_collxfrm_mult, PL_collxfrm_base);
+            }
+#endif
        }
     }
 
@@ -884,24 +916,6 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
     const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
                                         ? NULL
                                         : "";
-#ifdef DEBUGGING
-    const bool debug = (PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT"))
-                       ? TRUE
-                       : FALSE;
-#   define DEBUG_LOCALE_INIT(category, locale, result)                      \
-       STMT_START {                                                        \
-               if (debug) {                                                \
-                    PerlIO_printf(Perl_debug_log,                           \
-                                  "%s:%d: %s\n",                            \
-                                  __FILE__, __LINE__,                       \
-                                  _setlocale_debug_string(category,         \
-                                                          locale,           \
-                                                          result));         \
-                }                                                           \
-       } STMT_END
-#else
-#   define DEBUG_LOCALE_INIT(a,b,c)
-#endif
     const char* trial_locales[5];   /* 5 = 1 each for "", LC_ALL, LANG, "", C */
     unsigned int trial_locales_count;
     const char * const lc_all     = savepv(PerlEnv_getenv("LC_ALL"));
@@ -932,6 +946,25 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
     const char *system_default_locale = NULL;
 #endif
 
+#ifdef DEBUGGING
+    debug_initialization = (PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT"))
+                           ? TRUE
+                           : FALSE;
+#   define DEBUG_LOCALE_INIT(category, locale, result)                      \
+       STMT_START {                                                        \
+               if (debug_initialization) {                                 \
+                    PerlIO_printf(Perl_debug_log,                           \
+                                  "%s:%d: %s\n",                            \
+                                  __FILE__, __LINE__,                       \
+                                  _setlocale_debug_string(category,         \
+                                                          locale,           \
+                                                          result));         \
+                }                                                           \
+       } STMT_END
+#else
+#   define DEBUG_LOCALE_INIT(a,b,c)
+#endif
+
 #ifndef LOCALE_ENVIRON_REQUIRED
     PERL_UNUSED_VAR(done);
     PERL_UNUSED_VAR(locale_param);
@@ -1370,24 +1403,17 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
     PERL_UNUSED_ARG(printwarn);
 #endif /* USE_LOCALE */
 
+#ifdef DEBUGGING
+    /* So won't continue to output stuff */
+    debug_initialization = FALSE;
+#endif
+
     return ok;
 }
 
-
 #ifdef USE_LOCALE_COLLATE
 
 char *
-Perl_mem_collxfrm(pTHX_ const char *input_string, STRLEN len, STRLEN *xlen)
-{
-    /* This function is retained for compatibility in case someone outside core
-     * is using this (but it is undocumented) */
-
-    PERL_ARGS_ASSERT_MEM_COLLXFRM;
-
-    return _mem_collxfrm(input_string, len, xlen, FALSE);
-}
-
-char *
 Perl__mem_collxfrm(pTHX_ const char *input_string,
                          STRLEN len,    /* Length of 'input_string' */
                          STRLEN *xlen,  /* Set to length of returned string
@@ -1400,14 +1426,16 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
     /* _mem_collxfrm() is a bit like strxfrm() but with two important
      * differences. First, it handles embedded NULs. Second, it allocates a bit
      * more memory than needed for the transformed data itself.  The real
-     * transformed data begins at offset sizeof(collationix).  *xlen is set to
+     * transformed data begins at offset COLLXFRM_HDR_LEN.  *xlen is set to
      * the length of that, and doesn't include the collation index size.
      * Please see sv_collxfrm() to see how this is used. */
 
+#define COLLXFRM_HDR_LEN    sizeof(PL_collation_ix)
+
     char * s = (char *) input_string;
     STRLEN s_strlen = strlen(input_string);
     char *xbuf = NULL;
-    STRLEN xAlloc, xout; /* xalloc is a reserved word in VC */
+    STRLEN xAlloc;          /* xalloc is a reserved word in VC */
     bool first_time = TRUE; /* Cleared after first loop iteration */
 
     PERL_ARGS_ASSERT__MEM_COLLXFRM;
@@ -1445,7 +1473,10 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
          * locale, find it */
         if (*PL_strxfrm_min_char == '\0') {
             int j;
-            char * cur_min_x = NULL;    /* Cur cp's xfrm, (except it also
+#ifdef DEBUGGING
+            U8     cur_min_cp = 1;  /* The code point that sorts lowest, so far */
+#endif
+            char * cur_min_x = NULL;    /* And its xfrm, (except it also
                                            includes the collation index
                                            prefixed. */
 
@@ -1482,7 +1513,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                 /* If something went wrong (which it shouldn't), just
                  * ignore this code point */
                 if (   x_len == 0
-                    || strlen(x + sizeof(PL_collation_ix)) < x_len)
+                    || strlen(x + COLLXFRM_HDR_LEN) < x_len)
                 {
                     continue;
                 }
@@ -1490,11 +1521,14 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                 /* If this character's transformation is lower than
                  * the current lowest, this one becomes the lowest */
                 if (   cur_min_x == NULL
-                    || strLT(x         + sizeof(PL_collation_ix),
-                             cur_min_x + sizeof(PL_collation_ix)))
+                    || strLT(x         + COLLXFRM_HDR_LEN,
+                             cur_min_x + COLLXFRM_HDR_LEN))
                 {
                     strcpy(PL_strxfrm_min_char, cur_source);
                     cur_min_x = x;
+#ifdef DEBUGGING
+                    cur_min_cp = j;
+#endif
                 }
                 else {
                     Safefree(x);
@@ -1510,6 +1544,21 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                 /* cur_min_cp was already initialized to 1 */
             }
 
+            DEBUG_L(PerlIO_printf(Perl_debug_log,
+                    "_mem_collxfrm: lowest collating control in the 0-255 "
+                    "range in locale %s is 0x%02X\n",
+                    PL_collation_name,
+                    cur_min_cp));
+            if (DEBUG_Lv_TEST) {
+                unsigned i;
+                PerlIO_printf(Perl_debug_log, "Its xfrm is");
+                for (i = 0; i < strlen(cur_min_x + COLLXFRM_HDR_LEN); i ++) {
+                    PerlIO_printf(Perl_debug_log, " %02x",
+                                (U8) *(cur_min_x + COLLXFRM_HDR_LEN + i));
+                }
+                PerlIO_printf(Perl_debug_log, "\n");
+            }
+
             Safefree(cur_min_x);
         }
 
@@ -1616,8 +1665,8 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                         /* If this character's transformation is higher than
                          * the current highest, this one becomes the highest */
                         if (   cur_max_x == NULL
-                            || strGT(x         + sizeof(PL_collation_ix),
-                                     cur_max_x + sizeof(PL_collation_ix)))
+                            || strGT(x         + COLLXFRM_HDR_LEN,
+                                     cur_max_x + COLLXFRM_HDR_LEN))
                         {
                             PL_strxfrm_max_cp = j;
                             cur_max_x = x;
@@ -1627,6 +1676,24 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                         }
                     }
 
+                    DEBUG_L(PerlIO_printf(Perl_debug_log,
+                            "_mem_collxfrm: highest 1-byte collating character"
+                            " in locale %s is 0x%02X\n",
+                            PL_collation_name,
+                            PL_strxfrm_max_cp));
+                    if (DEBUG_Lv_TEST) {
+                        unsigned i;
+                        PerlIO_printf(Perl_debug_log, "Its xfrm is ");
+                        for (i = 0;
+                             i < strlen(cur_max_x + COLLXFRM_HDR_LEN);
+                             i++)
+                        {
+                            PerlIO_printf(Perl_debug_log, " %02x",
+                                        (U8) cur_max_x[i + COLLXFRM_HDR_LEN]);
+                        }
+                        PerlIO_printf(Perl_debug_log, "\n");
+                    }
+
                     Safefree(cur_max_x);
                 }
 
@@ -1671,7 +1738,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
     /* The first element in the output is the collation id, used by
      * sv_collxfrm(); then comes the space for the transformed string.  The
      * equation should give us a good estimate as to how much is needed */
-    xAlloc = sizeof(PL_collation_ix)
+    xAlloc = COLLXFRM_HDR_LEN
            + PL_collxfrm_base
            + (PL_collxfrm_mult * ((utf8)
                                  ? utf8_length((U8 *) s, (U8 *) s + len)
@@ -1682,22 +1749,20 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
 
     /* Store the collation id */
     *(U32*)xbuf = PL_collation_ix;
-    xout = sizeof(PL_collation_ix);
 
     /* Then the transformation of the input.  We loop until successful, or we
      * give up */
     for (;;) {
-        STRLEN xused = strxfrm(xbuf + xout, s, xAlloc - xout);
+        *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
 
         /* If the transformed string occupies less space than we told strxfrm()
          * was available, it means it successfully transformed the whole
          * string. */
-        if (xused < xAlloc - xout) {
-            xout += xused;
+        if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
             break;
         }
 
-        if (UNLIKELY(xused >= PERL_INT_MAX))
+        if (UNLIKELY(*xlen >= PERL_INT_MAX))
             goto bad;
 
         /* A well-behaved strxfrm() returns exactly how much space it needs
@@ -1705,7 +1770,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
          * space being provided.  Assume that this is the case unless it's been
          * proven otherwise */
         if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
-            xAlloc = xused + sizeof(PL_collation_ix) + 1;
+            xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
         }
         else { /* Here, either:
                 *  1)  The strxfrm() has previously shown bad behavior; or
@@ -1717,10 +1782,19 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                 *      isn't sufficient, they return the input size instead of
                 *      how much is needed.)
                 * Increase the buffer size by a fixed percentage and try again. */
-            xAlloc = (2 * xAlloc) + 1;
+            xAlloc += (xAlloc / 4) + 1;
             PL_strxfrm_is_behaved = FALSE;
-        }
 
+#ifdef DEBUGGING
+            if (DEBUG_Lv_TEST || debug_initialization) {
+                PerlIO_printf(Perl_debug_log,
+                "_mem_collxfrm required more space than previously calculated"
+                " for locale %s, trying again with new guess=%d+%"UVuf"\n",
+                PL_collation_name, (int) COLLXFRM_HDR_LEN,
+                (UV) xAlloc - COLLXFRM_HDR_LEN);
+            }
+#endif
+        }
 
         Renew(xbuf, xAlloc, char);
         if (UNLIKELY(! xbuf))
@@ -1729,10 +1803,23 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
         first_time = FALSE;
     }
 
-    *xlen = xout - sizeof(PL_collation_ix);
+
+#ifdef DEBUGGING
+    if (DEBUG_Lv_TEST || debug_initialization) {
+        unsigned i;
+        PerlIO_printf(Perl_debug_log,
+            "_mem_collxfrm[%d]: returning %"UVuf" for locale %s '%s'\n",
+            PL_collation_ix, *xlen, PL_collation_name, input_string);
+        PerlIO_printf(Perl_debug_log, "Its xfrm is");
+        for (i = COLLXFRM_HDR_LEN; i < *xlen + COLLXFRM_HDR_LEN; i++) {
+            PerlIO_printf(Perl_debug_log, " %02x", (U8) xbuf[i]);
+        }
+        PerlIO_printf(Perl_debug_log, "\n");
+    }
+#endif
 
     /* Free up unneeded space; retain ehough for trailing NUL */
-    Renew(xbuf, xout + 1, char);
+    Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
 
     if (s != input_string) {
         Safefree(s);
@@ -1746,10 +1833,17 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
         Safefree(s);
     }
     *xlen = 0;
+#ifdef DEBUGGING
+    if (DEBUG_Lv_TEST || debug_initialization) {
+        PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%d] returning NULL\n",
+                                      PL_collation_ix);
+    }
+#endif
     return NULL;
 }
 
 #endif /* USE_LOCALE_COLLATE */
+
 #ifdef USE_LOCALE
 
 bool