Use compiled-in C structure for inverted case folds
authorKarl Williamson <khw@cpan.org>
Thu, 29 Mar 2018 22:32:49 +0000 (16:32 -0600)
committerKarl Williamson <khw@cpan.org>
Sat, 31 Mar 2018 21:36:46 +0000 (15:36 -0600)
This commit changes to use the C data structures generated by the
previous commit to compute what characters fold to a given one.  This is
used to find out what things should match under /i.

This now avoids the expensive start up cost of switching to perl
utf8_heavy.pl, loading a file from disk, and constructing a hash from
it.

12 files changed:
embed.fnc
embed.h
embedvar.h
intrpvar.h
perl.c
perlapi.h
perlvars.h
proto.h
regcomp.c
regexec.c
sv.c
utf8.c

index 4a9992f..095b528 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1892,6 +1892,9 @@ ApM       |U8*    |uvoffuni_to_utf8_flags_msgs|NN U8 *d|UV uv|const UV flags|NULLOK HV**
 Ap     |U8*    |uvuni_to_utf8_flags    |NN U8 *d|UV uv|UV flags
 Apd    |char*  |pv_uni_display |NN SV *dsv|NN const U8 *spv|STRLEN len|STRLEN pvlim|UV flags
 ApdR   |char*  |sv_uni_display |NN SV *dsv|NN SV *ssv|STRLEN pvlim|UV flags
+EXpR   |Size_t |_inverse_folds |const UV cp                        \
+                               |NN int * first_folds_to            \
+                               |NN const int ** remaining_folds_to
 : Used by Data::Alias
 EXp    |void   |vivify_defelem |NN SV* sv
 : Used in pp.c
diff --git a/embed.h b/embed.h
index 97832ba..a6189a1 100644 (file)
--- a/embed.h
+++ b/embed.h
 #endif
 #if defined(PERL_CORE) || defined(PERL_EXT)
 #define _byte_dump_string(a,b,c)       Perl__byte_dump_string(aTHX_ a,b,c)
+#define _inverse_folds(a,b,c)  Perl__inverse_folds(aTHX_ a,b,c)
 #define append_utf8_from_native_byte   S_append_utf8_from_native_byte
 #define av_reify(a)            Perl_av_reify(aTHX_ a)
 #define current_re_engine()    Perl_current_re_engine(aTHX)
index 70d6406..e038ae7 100644 (file)
 #define PL_unitcheckav_save    (vTHX->Iunitcheckav_save)
 #define PL_unlockhook          (vTHX->Iunlockhook)
 #define PL_unsafe              (vTHX->Iunsafe)
-#define PL_utf8_foldclosures   (vTHX->Iutf8_foldclosures)
 #define PL_utf8_mark           (vTHX->Iutf8_mark)
 #define PL_utf8cache           (vTHX->Iutf8cache)
 #define PL_utf8locale          (vTHX->Iutf8locale)
 #define PL_Gutf8_charname_continue     (my_vars->Gutf8_charname_continue)
 #define PL_utf8_foldable       (my_vars->Gutf8_foldable)
 #define PL_Gutf8_foldable      (my_vars->Gutf8_foldable)
+#define PL_utf8_foldclosures   (my_vars->Gutf8_foldclosures)
+#define PL_Gutf8_foldclosures  (my_vars->Gutf8_foldclosures)
 #define PL_utf8_idcont         (my_vars->Gutf8_idcont)
 #define PL_Gutf8_idcont                (my_vars->Gutf8_idcont)
 #define PL_utf8_idstart                (my_vars->Gutf8_idstart)
index 1dfc603..f7b6ee3 100644 (file)
@@ -753,10 +753,6 @@ PERLVAR(I, registered_mros, HV *)
 /* Compile-time block start/end hooks */
 PERLVAR(I, blockhooks, AV *)
 
-/* Everything that folds to a given character, for case insensitivity regex
- * matching */
-PERLVARI(I, utf8_foldclosures, HV *, NULL)
-
 PERLVAR(I, custom_ops, HV *)           /* custom op registrations */
 
 PERLVAR(I, Xpv,                XPV *)          /* (unused) held temporary value */
diff --git a/perl.c b/perl.c
index 2a3df3d..6fb2cc6 100644 (file)
--- a/perl.c
+++ b/perl.c
@@ -336,6 +336,7 @@ perl_construct(pTHXx)
                                             NonL1_Perl_Non_Final_Folds_invlist);
     PL_utf8_charname_begin = _new_invlist_C_array(_Perl_Charname_Begin_invlist);
     PL_utf8_charname_continue = _new_invlist_C_array(_Perl_Charname_Continue_invlist);
+    PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
 
 
 #if defined(LOCAL_PATCH_COUNT)
@@ -1206,13 +1207,11 @@ perl_destruct(pTHXx)
 
     /* clear character classes  */
     SvREFCNT_dec(PL_utf8_mark);
-    SvREFCNT_dec(PL_utf8_foldclosures);
     SvREFCNT_dec(PL_InBitmap);
 #ifdef USE_LOCALE_CTYPE
     SvREFCNT_dec(PL_warn_locale);
 #endif
     PL_utf8_mark       = NULL;
-    PL_utf8_foldclosures = NULL;
     PL_InBitmap          = NULL;
 #ifdef USE_LOCALE_CTYPE
     PL_warn_locale       = NULL;
index 6b90055..e41d61f 100644 (file)
--- a/perlapi.h
+++ b/perlapi.h
@@ -211,6 +211,8 @@ END_EXTERN_C
 #define PL_utf8_charname_continue      (*Perl_Gutf8_charname_continue_ptr(NULL))
 #undef  PL_utf8_foldable
 #define PL_utf8_foldable       (*Perl_Gutf8_foldable_ptr(NULL))
+#undef  PL_utf8_foldclosures
+#define PL_utf8_foldclosures   (*Perl_Gutf8_foldclosures_ptr(NULL))
 #undef  PL_utf8_idcont
 #define PL_utf8_idcont         (*Perl_Gutf8_idcont_ptr(NULL))
 #undef  PL_utf8_idstart
index af48fa8..b6cc9ca 100644 (file)
@@ -302,3 +302,7 @@ PERLVAR(G, utf8_tofold,     SV *)
 PERLVAR(G, utf8_tosimplefold,  SV *)
 PERLVAR(G, utf8_charname_begin, SV *)
 PERLVAR(G, utf8_charname_continue, SV *)
+
+/* Everything that folds to a given character, for case insensitivity regex
+ * matching */
+PERLVAR(G, utf8_foldclosures, SV *)
diff --git a/proto.h b/proto.h
index 82ab3eb..ebead79 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -62,6 +62,11 @@ PERL_CALLCONV char * Perl__byte_dump_string(pTHX_ const U8 * const start, const
 PERL_CALLCONV void     Perl__force_out_malformed_utf8_message(pTHX_ const U8 *const p, const U8 * const e, const U32 flags, const bool die_here);
 #define PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE     \
        assert(p); assert(e)
+PERL_CALLCONV Size_t   Perl__inverse_folds(pTHX_ const UV cp, int * first_folds_to, const int ** remaining_folds_to)
+                       __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT__INVERSE_FOLDS        \
+       assert(first_folds_to); assert(remaining_folds_to)
+
 PERL_CALLCONV bool     Perl__is_in_locale_category(pTHX_ const bool compiling, const int category);
 PERL_CALLCONV bool     Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
                        __attribute__warn_unused_result__;
index 42167e4..9319e60 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -10178,7 +10178,6 @@ Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
 void
 Perl__load_PL_utf8_foldclosures (pTHX)
 {
-    PL_utf8_foldclosures = _swash_inversion_hash();
 }
 #endif
 
@@ -10291,9 +10290,9 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
         STRLEN foldlen = UTF8SKIP(s);
         const U8* e = s + bytelen;
-        SV** listp;
+        IV fc;
 
-        uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
+        fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
 
         /* The only code points that aren't folded in a UTF EXACTFish
          * node are are the problematic ones in EXACTFL nodes */
@@ -10305,14 +10304,21 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
             U8 *d = folded;
             int i;
 
+            fc = -1;
             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
                 if (isASCII(*s)) {
                     *(d++) = (U8) toFOLD(*s);
+                    if (fc < 0) {       /* Save the first fold */
+                        fc = *(d-1);
+                    }
                     s++;
                 }
                 else {
                     STRLEN len;
-                    toFOLD_utf8_safe(s, e, d, &len);
+                    UV fold = toFOLD_utf8_safe(s, e, d, &len);
+                    if (fc < 0) {       /* Save the first fold */
+                        fc = fold;
+                    }
                     d += len;
                     s += UTF8SKIP(s);
                 }
@@ -10328,8 +10334,7 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
         /* When we reach here 's' points to the fold of the first
          * character(s) of the node; and 'e' points to far enough along
          * the folded string to be just past any possible multi-char
-         * fold. 'foldlen' is the length in bytes of the first
-         * character in 's'
+         * fold.
          *
          * Unlike the non-UTF-8 case, the macro for determining if a
          * string is a multi-char fold requires all the characters to
@@ -10342,33 +10347,29 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
         }
         else {  /* Single char fold */
+            unsigned int k;
+            int first_folds_to;
+            const int * remaining_folds_to_list;
+            Size_t folds_to_count;
 
-            /* It matches all the things that fold to it, which are
-             * found in PL_utf8_foldclosures (including itself) */
-            invlist = add_cp_to_invlist(invlist, uc);
-            if (! PL_utf8_foldclosures)
-                _load_PL_utf8_foldclosures();
-            if ((listp = hv_fetch(PL_utf8_foldclosures,
-                                (char *) s, foldlen, FALSE)))
-            {
-                AV* list = (AV*) *listp;
-                IV k;
-                for (k = 0; k <= av_tindex_skip_len_mg(list); k++) {
-                    SV** c_p = av_fetch(list, k, FALSE);
-                    UV c;
-                    assert(c_p);
+            /* It matches itself */
+            invlist = add_cp_to_invlist(invlist, fc);
 
-                    c = SvUV(*c_p);
+            /* ... plus all the things that fold to it, which are found in
+             * PL_utf8_foldclosures */
+            folds_to_count = _inverse_folds(fc, &first_folds_to,
+                                                &remaining_folds_to_list);
+            for (k = 0; k < folds_to_count; k++) {
+                UV c = (k == 0) ? first_folds_to : remaining_folds_to_list[k-1];
 
                     /* /aa doesn't allow folds between ASCII and non- */
                     if ((OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
-                        && isASCII(c) != isASCII(uc))
+                        && isASCII(c) != isASCII(fc))
                     {
                         continue;
                     }
 
                     invlist = add_cp_to_invlist(invlist, c);
-                }
             }
         }
     }
@@ -17859,27 +17860,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
                                   &fold_intersection);
 
-            /* The folds for all the Latin1 characters are hard-coded into this
-             * program, but we have to go out to disk to get the others. */
-            if (invlist_highest(cp_foldable_list) >= 256) {
-
-                /* This is a hash that for a particular fold gives all
-                 * characters that are involved in it */
-                if (! PL_utf8_foldclosures) {
-                    _load_PL_utf8_foldclosures();
-                }
-            }
-
             /* Now look at the foldable characters in this class individually */
             invlist_iterinit(fold_intersection);
             while (invlist_iternext(fold_intersection, &start, &end)) {
                 UV j;
+                UV folded;
 
                 /* Look at every character in the range */
                 for (j = start; j <= end; j++) {
                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
                     STRLEN foldlen;
-                    SV** listp;
+                    unsigned int k;
+                    Size_t folds_to_count;
+                    int first_folds_to;
+                    const int * remaining_folds_to_list;
 
                     if (j < 256) {
 
@@ -17914,29 +17908,24 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                      * rules hard-coded for it.  First, get its fold.  This is
                      * the simple fold, as the multi-character folds have been
                      * handled earlier and separated out */
-                    _to_uni_fold_flags(j, foldbuf, &foldlen,
+                    folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
                                                         (ASCII_FOLD_RESTRICTED)
                                                         ? FOLD_FLAGS_NOMIX_ASCII
                                                         : 0);
 
-                    /* Single character fold of above Latin1.  Add everything in
-                    * its fold closure to the list that this node should match.
-                    * The fold closures data structure is a hash with the keys
-                    * being the UTF-8 of every character that is folded to, like
-                    * 'k', and the values each an array of all code points that
-                    * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
-                    * Multi-character folds are not included */
-                    if ((listp = hv_fetch(PL_utf8_foldclosures,
-                                        (char *) foldbuf, foldlen, FALSE)))
-                    {
-                        AV* list = (AV*) *listp;
-                        IV k;
-                        for (k = 0; k <= av_tindex_skip_len_mg(list); k++) {
-                            SV** c_p = av_fetch(list, k, FALSE);
-                            UV c;
-                            assert(c_p);
+                    /* Single character fold of above Latin1.  Add everything
+                     * in its fold closure to the list that this node should
+                     * match. */
+                    folds_to_count = _inverse_folds(folded, &first_folds_to,
+                                                    &remaining_folds_to_list);
+                    for (k = 0; k <= folds_to_count; k++) {
+                        UV c = (k == 0)     /* First time through use itself */
+                                ? folded
+                                : (k == 1)  /* 2nd time use, the first fold */
+                                   ? first_folds_to
 
-                            c = SvUV(*c_p);
+                                     /* Then the remaining ones */
+                                   : remaining_folds_to_list[k-2];
 
                             /* /aa doesn't allow folds between ASCII and non- */
                             if ((ASCII_FOLD_RESTRICTED
@@ -17965,7 +17954,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                            has_upper_latin1_only_utf8_matches,
                                            c);
                             }
-                        }
                     }
                 }
             }
index 9f0abd7..c1b89e5 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -4510,47 +4510,25 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
             if (c1 > 255) {
-                /* Load the folds hash, if not already done */
-                SV** listp;
-                if (! PL_utf8_foldclosures) {
-                    _load_PL_utf8_foldclosures();
+                const int * remaining_folds_to_list;
+                int first_folds_to;
+
+                /* Look up what code points (besides c1) fold to c1;  e.g.,
+                 * [ 'K', KELVIN_SIGN ] both fold to 'k'. */
+                Size_t folds_to_count = _inverse_folds(c1,
+                                                     &first_folds_to,
+                                                     &remaining_folds_to_list);
+                if (folds_to_count == 0) {
+                    c2 = c1;    /* there is only a single character that could
+                                   match */
                 }
-
-                /* The fold closures data structure is a hash with the keys
-                 * being the UTF-8 of every character that is folded to, like
-                 * 'k', and the values each an array of all code points that
-                 * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
-                 * Multi-character folds are not included */
-                if ((! (listp = hv_fetch(PL_utf8_foldclosures,
-                                        (char *) pat,
-                                        UTF8SKIP(pat),
-                                        FALSE))))
-                {
-                    /* Not found in the hash, therefore there are no folds
-                    * containing it, so there is only a single character that
-                    * could match */
-                    c2 = c1;
-                }
-                else {  /* Does participate in folds */
-                    AV* list = (AV*) *listp;
-                    if (av_tindex_skip_len_mg(list) != 1) {
-
-                        /* If there aren't exactly two folds to this, it is
-                         * outside the scope of this function */
+                else if (folds_to_count != 1) {
+                    /* If there aren't exactly two folds to this (itself and
+                     * another), it is outside the scope of this function */
                         use_chrtest_void = TRUE;
                     }
-                    else {  /* There are two.  Get them */
-                        SV** c_p = av_fetch(list, 0, FALSE);
-                        if (c_p == NULL) {
-                            Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
-                        }
-                        c1 = SvUV(*c_p);
-
-                        c_p = av_fetch(list, 1, FALSE);
-                        if (c_p == NULL) {
-                            Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
-                        }
-                        c2 = SvUV(*c_p);
+                    else {  /* There are two.  We already have one, get the other */
+                        c2 = first_folds_to;
 
                         /* Folds that cross the 255/256 boundary are forbidden
                          * if EXACTFL (and isnt a UTF8 locale), or EXACTFAA and
@@ -4575,7 +4553,6 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
                             }
                         }
                     }
-                }
             }
             else /* Here, c1 is <= 255 */
                 if (utf8_target
diff --git a/sv.c b/sv.c
index d38ccdc..07865bb 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -15688,7 +15688,6 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
 
     PL_registered_mros  = hv_dup_inc(proto_perl->Iregistered_mros, param);
     PL_blockhooks      = av_dup_inc(proto_perl->Iblockhooks, param);
-    PL_utf8_foldclosures = hv_dup_inc(proto_perl->Iutf8_foldclosures, param);
 
     /* Call the ->CLONE method, if it exists, for each of the stashes
        identified by sv_dup() above.
diff --git a/utf8.c b/utf8.c
index 77115f3..7c5262e 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -3637,6 +3637,66 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
 
 }
 
+Size_t
+Perl__inverse_folds(pTHX_ const UV cp, int * first_folds_to, const int ** remaining_folds_to)
+{
+    /* Returns the count of the number of code points that fold to the input
+     * 'cp' (besides itself).
+     *
+     * If the return is 0, there is nothing else that folds to it, and
+     * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL.
+     *
+     * If the return is 1, '*first_folds_to' is set to the single code point,
+     * and '*remaining_folds_to' is set to NULL.
+     *
+     * Otherwise, '*first_folds_to' is set to a code point, and
+     * '*remaining_fold_to' is set to an array that contains the others.  The
+     * length of this array is the returned count minus 1.
+     *
+     * The reason for this convolution is to avoid having to deal with
+     * allocating and freeing memory.  The lists are already constructed, so
+     * the return can point to them, but single code points aren't, so would
+     * need to be constructed if we didn't employ something like this API */
+
+    SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
+    int base = _Perl_IVCF_invmap[index];
+
+    PERL_ARGS_ASSERT__INVERSE_FOLDS;
+
+    if (base == 0) {            /* No fold */
+        *first_folds_to = 0;
+        *remaining_folds_to = NULL;
+        return 0;
+    }
+
+#ifndef HAS_IVCF_AUX_TABLES     /* This Unicode version only has 1-1 folds */
+
+    assert(base > 0);
+
+#else
+
+    if (UNLIKELY(base < 0)) {   /* Folds to more than one character */
+
+        /* The data structure is set up so that the absolute value of 'base' is
+         * an index into a table of pointers to arrays, with the array
+         * corresponding to the index being the list of code points that fold
+         * to 'cp', and the parallel array containing the length of the list
+         * array */
+        *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
+        *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; /* +1 excludes
+                                                                 *first_folds_to
+                                                                */
+        return IVCF_AUX_TABLE_lengths[-base];
+    }
+
+#endif
+
+    /* Only the single code point.  This works like 'fc(G) = G - A + a' */
+    *first_folds_to = base + cp - invlist_array(PL_utf8_foldclosures)[index];
+    *remaining_folds_to = NULL;
+    return 1;
+}
+
 STATIC UV
 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
                                        U8* const ustrp, STRLEN *lenp)