This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Add code to compute edit distance (Damerau–Levenshtein)
authorKarl Williamson <khw@cpan.org>
Tue, 9 Feb 2016 17:40:38 +0000 (10:40 -0700)
committerKarl Williamson <khw@cpan.org>
Wed, 10 Feb 2016 06:30:53 +0000 (23:30 -0700)
This will be used in a future commit.

This code is taken from CPAN Text::Levenshtein::Damerau::XS with the
author's knowledge.  There have been white-space changes to make it
conform better to perl's core coding standards, and declaration changes
to make it more portable, such as using UV instead of 'unsigned int',
and PERL_STATIC_INLINE instead of a less portable form, but the logic is
unchanged.  One variable was changed to signed from unsigned to avoid a
warning message from some compilers.

The author and I will decide later about keeping the cpan module and
this code in sync.  It changes very rarely.

embed.fnc
embed.h
proto.h
regcomp.c

index 97ecfa3..020f432 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -2244,6 +2244,11 @@ Es       |I32    |make_trie      |NN RExC_state_t *pRExC_state \
 Es     |regnode *|construct_ahocorasick_from_trie|NN RExC_state_t *pRExC_state \
                                 |NN regnode *source|U32 depth
 EnPs   |const char *|cntrl_to_mnemonic|const U8 c
+EnPs   |int    |edit_distance  |NN const UV *src                   \
+                               |NN const UV *tgt                   \
+                               |const STRLEN x                     \
+                               |const STRLEN y                     \
+                               |const SSize_t maxDistance
 #  ifdef DEBUGGING
 Es        |void        |regdump_intflags|NULLOK const char *lead| const U32 flags
 Es     |void   |regdump_extflags|NULLOK const char *lead| const U32 flags
diff --git a/embed.h b/embed.h
index 24b61bf..82b7ced 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define compute_EXACTish       S_compute_EXACTish
 #define construct_ahocorasick_from_trie(a,b,c) S_construct_ahocorasick_from_trie(aTHX_ a,b,c)
 #define could_it_be_a_POSIX_class      S_could_it_be_a_POSIX_class
+#define edit_distance          S_edit_distance
 #define get_ANYOF_cp_list_for_ssc(a,b) S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b)
 #define get_invlist_iter_addr  S_get_invlist_iter_addr
 #define grok_bslash_N(a,b,c,d,e,f)     S_grok_bslash_N(aTHX_ a,b,c,d,e,f)
diff --git a/proto.h b/proto.h
index 512a264..6bb11c8 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -4737,6 +4737,11 @@ STATIC regnode * S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_sta
 STATIC bool    S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state);
 #define PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS     \
        assert(pRExC_state)
+STATIC int     S_edit_distance(const UV *src, const UV *tgt, const STRLEN x, const STRLEN y, const SSize_t maxDistance)
+                       __attribute__pure__;
+#define PERL_ARGS_ASSERT_EDIT_DISTANCE \
+       assert(src); assert(tgt)
+
 STATIC SV*     S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, const regnode_charclass* const node);
 #define PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC     \
        assert(pRExC_state); assert(node)
index d6a3b5b..d6e4545 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -997,6 +997,151 @@ DEBUG_OPTIMISE_MORE_r(if(data){                                      \
     PerlIO_printf(Perl_debug_log,"\n");                              \
 });
 
+/* =========================================================
+ * BEGIN edit_distance stuff.
+ *
+ * This calculates how many single character changes of any type are needed to
+ * transform a string into another one.  It is taken from version 3.1 of
+ *
+ * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
+ */
+
+/* Our unsorted dictionary linked list.   */
+/* Note we use UVs, not chars. */
+
+struct dictionary{
+  UV key;
+  UV value;
+  struct dictionary* next;
+};
+typedef struct dictionary item;
+
+
+PERL_STATIC_INLINE item*
+push(UV key,item* curr)
+{
+    item* head;
+    Newxz(head, 1, item);
+    head->key = key;
+    head->value = 0;
+    head->next = curr;
+    return head;
+}
+
+
+PERL_STATIC_INLINE item*
+find(item* head, UV key)
+{
+    item* iterator = head;
+    while (iterator){
+        if (iterator->key == key){
+            return iterator;
+        }
+        iterator = iterator->next;
+    }
+
+    return NULL;
+}
+
+PERL_STATIC_INLINE item*
+uniquePush(item* head,UV key)
+{
+    item* iterator = head;
+
+    while (iterator){
+        if (iterator->key == key) {
+            return head;
+        }
+        iterator = iterator->next;
+    }
+
+    return push(key,head);
+}
+
+PERL_STATIC_INLINE void
+dict_free(item* head)
+{
+    item* iterator = head;
+
+    while (iterator) {
+        item* temp = iterator;
+        iterator = iterator->next;
+        Safefree(temp);
+    }
+
+    head = NULL;
+}
+
+/* End of Dictionary Stuff */
+
+/* All calculations/work are done here */
+STATIC int
+S_edit_distance(const UV* src,
+                const UV* tgt,
+                const STRLEN x,             /* length of src[] */
+                const STRLEN y,             /* length of tgt[] */
+                const SSize_t maxDistance
+)
+{
+    item *head = NULL;
+    UV swapCount,swapScore,targetCharCount,i,j;
+    UV *scores;
+    UV score_ceil = x + y;
+
+    PERL_ARGS_ASSERT_EDIT_DISTANCE;
+
+    /* intialize matrix start values */
+    Newxz(scores, ( (x + 2) * (y + 2)), UV);
+    scores[0] = score_ceil;
+    scores[1 * (y + 2) + 0] = score_ceil;
+    scores[0 * (y + 2) + 1] = score_ceil;
+    scores[1 * (y + 2) + 1] = 0;
+    head = uniquePush(uniquePush(head,src[0]),tgt[0]);
+
+    /* work loops    */
+    /* i = src index */
+    /* j = tgt index */
+    for (i=1;i<=x;i++) {
+        if (i < x)
+            head = uniquePush(head,src[i]);
+        scores[(i+1) * (y + 2) + 1] = i;
+        scores[(i+1) * (y + 2) + 0] = score_ceil;
+        swapCount = 0;
+
+        for (j=1;j<=y;j++) {
+            if (i == 1) {
+                if(j < y)
+                head = uniquePush(head,tgt[j]);
+                scores[1 * (y + 2) + (j + 1)] = j;
+                scores[0 * (y + 2) + (j + 1)] = score_ceil;
+            }
+
+            targetCharCount = find(head,tgt[j-1])->value;
+            swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
+
+            if (src[i-1] != tgt[j-1]){
+                scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
+            }
+            else {
+                swapCount = j;
+                scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
+            }
+        }
+
+        find(head,src[i-1])->value = i;
+    }
+
+    {
+        IV score = scores[(x+1) * (y + 2) + (y + 1)];
+        dict_free(head);
+        Safefree(scores);
+        return (maxDistance != 0 && maxDistance < score)?(-1):score;
+    }
+}
+
+/* END of edit_distance() stuff
+ * ========================================================= */
+
 /* is c a control character for which we have a mnemonic? */
 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)