This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
locale.c: Not so aggressive collation memory use guess
[perl5.git] / locale.c
index 826092e..48b9184 100644 (file)
--- a/locale.c
+++ b/locale.c
@@ -666,6 +666,19 @@ Perl_new_collate(pTHX_ const char *newcoll)
                 /* Add 1 for the trailing NUL */
                 PL_collxfrm_base = base + 1;
             }
+
+#ifdef DEBUGGING
+            if (DEBUG_L_TEST || debug_initialization) {
+                PerlIO_printf(Perl_debug_log,
+                    "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%"UVuf", "
+                    "x_len_longer=%"UVuf","
+                    " collate multipler=%"UVuf", collate base=%"UVuf"\n",
+                    __FILE__, __LINE__,
+                    PL_in_utf8_COLLATE_locale,
+                    x_len_shorter, x_len_longer,
+                    PL_collxfrm_mult, PL_collxfrm_base);
+            }
+#endif
        }
     }
 
@@ -1413,14 +1426,16 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
     /* _mem_collxfrm() is a bit like strxfrm() but with two important
      * differences. First, it handles embedded NULs. Second, it allocates a bit
      * more memory than needed for the transformed data itself.  The real
-     * transformed data begins at offset sizeof(collationix).  *xlen is set to
+     * transformed data begins at offset COLLXFRM_HDR_LEN.  *xlen is set to
      * the length of that, and doesn't include the collation index size.
      * Please see sv_collxfrm() to see how this is used. */
 
+#define COLLXFRM_HDR_LEN    sizeof(PL_collation_ix)
+
     char * s = (char *) input_string;
     STRLEN s_strlen = strlen(input_string);
     char *xbuf = NULL;
-    STRLEN xAlloc, xout; /* xalloc is a reserved word in VC */
+    STRLEN xAlloc;          /* xalloc is a reserved word in VC */
     bool first_time = TRUE; /* Cleared after first loop iteration */
 
     PERL_ARGS_ASSERT__MEM_COLLXFRM;
@@ -1458,7 +1473,10 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
          * locale, find it */
         if (*PL_strxfrm_min_char == '\0') {
             int j;
-            char * cur_min_x = NULL;    /* Cur cp's xfrm, (except it also
+#ifdef DEBUGGING
+            U8     cur_min_cp = 1;  /* The code point that sorts lowest, so far */
+#endif
+            char * cur_min_x = NULL;    /* And its xfrm, (except it also
                                            includes the collation index
                                            prefixed. */
 
@@ -1495,7 +1513,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                 /* If something went wrong (which it shouldn't), just
                  * ignore this code point */
                 if (   x_len == 0
-                    || strlen(x + sizeof(PL_collation_ix)) < x_len)
+                    || strlen(x + COLLXFRM_HDR_LEN) < x_len)
                 {
                     continue;
                 }
@@ -1503,11 +1521,14 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                 /* If this character's transformation is lower than
                  * the current lowest, this one becomes the lowest */
                 if (   cur_min_x == NULL
-                    || strLT(x         + sizeof(PL_collation_ix),
-                             cur_min_x + sizeof(PL_collation_ix)))
+                    || strLT(x         + COLLXFRM_HDR_LEN,
+                             cur_min_x + COLLXFRM_HDR_LEN))
                 {
                     strcpy(PL_strxfrm_min_char, cur_source);
                     cur_min_x = x;
+#ifdef DEBUGGING
+                    cur_min_cp = j;
+#endif
                 }
                 else {
                     Safefree(x);
@@ -1523,6 +1544,21 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                 /* cur_min_cp was already initialized to 1 */
             }
 
+            DEBUG_L(PerlIO_printf(Perl_debug_log,
+                    "_mem_collxfrm: lowest collating control in the 0-255 "
+                    "range in locale %s is 0x%02X\n",
+                    PL_collation_name,
+                    cur_min_cp));
+            if (DEBUG_Lv_TEST) {
+                unsigned i;
+                PerlIO_printf(Perl_debug_log, "Its xfrm is");
+                for (i = 0; i < strlen(cur_min_x + COLLXFRM_HDR_LEN); i ++) {
+                    PerlIO_printf(Perl_debug_log, " %02x",
+                                (U8) *(cur_min_x + COLLXFRM_HDR_LEN + i));
+                }
+                PerlIO_printf(Perl_debug_log, "\n");
+            }
+
             Safefree(cur_min_x);
         }
 
@@ -1629,8 +1665,8 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                         /* If this character's transformation is higher than
                          * the current highest, this one becomes the highest */
                         if (   cur_max_x == NULL
-                            || strGT(x         + sizeof(PL_collation_ix),
-                                     cur_max_x + sizeof(PL_collation_ix)))
+                            || strGT(x         + COLLXFRM_HDR_LEN,
+                                     cur_max_x + COLLXFRM_HDR_LEN))
                         {
                             PL_strxfrm_max_cp = j;
                             cur_max_x = x;
@@ -1640,6 +1676,24 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                         }
                     }
 
+                    DEBUG_L(PerlIO_printf(Perl_debug_log,
+                            "_mem_collxfrm: highest 1-byte collating character"
+                            " in locale %s is 0x%02X\n",
+                            PL_collation_name,
+                            PL_strxfrm_max_cp));
+                    if (DEBUG_Lv_TEST) {
+                        unsigned i;
+                        PerlIO_printf(Perl_debug_log, "Its xfrm is ");
+                        for (i = 0;
+                             i < strlen(cur_max_x + COLLXFRM_HDR_LEN);
+                             i++)
+                        {
+                            PerlIO_printf(Perl_debug_log, " %02x",
+                                        (U8) cur_max_x[i + COLLXFRM_HDR_LEN]);
+                        }
+                        PerlIO_printf(Perl_debug_log, "\n");
+                    }
+
                     Safefree(cur_max_x);
                 }
 
@@ -1684,7 +1738,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
     /* The first element in the output is the collation id, used by
      * sv_collxfrm(); then comes the space for the transformed string.  The
      * equation should give us a good estimate as to how much is needed */
-    xAlloc = sizeof(PL_collation_ix)
+    xAlloc = COLLXFRM_HDR_LEN
            + PL_collxfrm_base
            + (PL_collxfrm_mult * ((utf8)
                                  ? utf8_length((U8 *) s, (U8 *) s + len)
@@ -1695,22 +1749,20 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
 
     /* Store the collation id */
     *(U32*)xbuf = PL_collation_ix;
-    xout = sizeof(PL_collation_ix);
 
     /* Then the transformation of the input.  We loop until successful, or we
      * give up */
     for (;;) {
-        STRLEN xused = strxfrm(xbuf + xout, s, xAlloc - xout);
+        *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
 
         /* If the transformed string occupies less space than we told strxfrm()
          * was available, it means it successfully transformed the whole
          * string. */
-        if (xused < xAlloc - xout) {
-            xout += xused;
+        if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
             break;
         }
 
-        if (UNLIKELY(xused >= PERL_INT_MAX))
+        if (UNLIKELY(*xlen >= PERL_INT_MAX))
             goto bad;
 
         /* A well-behaved strxfrm() returns exactly how much space it needs
@@ -1718,7 +1770,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
          * space being provided.  Assume that this is the case unless it's been
          * proven otherwise */
         if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
-            xAlloc = xused + sizeof(PL_collation_ix) + 1;
+            xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
         }
         else { /* Here, either:
                 *  1)  The strxfrm() has previously shown bad behavior; or
@@ -1730,10 +1782,19 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                 *      isn't sufficient, they return the input size instead of
                 *      how much is needed.)
                 * Increase the buffer size by a fixed percentage and try again. */
-            xAlloc = (2 * xAlloc) + 1;
+            xAlloc += (xAlloc / 4) + 1;
             PL_strxfrm_is_behaved = FALSE;
-        }
 
+#ifdef DEBUGGING
+            if (DEBUG_Lv_TEST || debug_initialization) {
+                PerlIO_printf(Perl_debug_log,
+                "_mem_collxfrm required more space than previously calculated"
+                " for locale %s, trying again with new guess=%d+%"UVuf"\n",
+                PL_collation_name, (int) COLLXFRM_HDR_LEN,
+                (UV) xAlloc - COLLXFRM_HDR_LEN);
+            }
+#endif
+        }
 
         Renew(xbuf, xAlloc, char);
         if (UNLIKELY(! xbuf))
@@ -1742,10 +1803,23 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
         first_time = FALSE;
     }
 
-    *xlen = xout - sizeof(PL_collation_ix);
+
+#ifdef DEBUGGING
+    if (DEBUG_Lv_TEST || debug_initialization) {
+        unsigned i;
+        PerlIO_printf(Perl_debug_log,
+            "_mem_collxfrm[%d]: returning %"UVuf" for locale %s '%s'\n",
+            PL_collation_ix, *xlen, PL_collation_name, input_string);
+        PerlIO_printf(Perl_debug_log, "Its xfrm is");
+        for (i = COLLXFRM_HDR_LEN; i < *xlen + COLLXFRM_HDR_LEN; i++) {
+            PerlIO_printf(Perl_debug_log, " %02x", (U8) xbuf[i]);
+        }
+        PerlIO_printf(Perl_debug_log, "\n");
+    }
+#endif
 
     /* Free up unneeded space; retain ehough for trailing NUL */
-    Renew(xbuf, xout + 1, char);
+    Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
 
     if (s != input_string) {
         Safefree(s);
@@ -1759,10 +1833,17 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
         Safefree(s);
     }
     *xlen = 0;
+#ifdef DEBUGGING
+    if (DEBUG_Lv_TEST || debug_initialization) {
+        PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%d] returning NULL\n",
+                                      PL_collation_ix);
+    }
+#endif
     return NULL;
 }
 
 #endif /* USE_LOCALE_COLLATE */
+
 #ifdef USE_LOCALE
 
 bool