This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Extract out functionality into a function
author Karl Williamson <khw@cpan.org>
Sun, 7 Sep 2014 03:30:14 +0000 (21:30 -0600)
committer Karl Williamson <khw@cpan.org>
Sun, 7 Sep 2014 03:44:48 +0000 (21:44 -0600)
This is in preparation for it being called from a 2nd place.  The code
was merely moved and outdented, and comments moved within the function
and added to.

embed.fnc
embed.h
proto.h
regcomp.c

diff --git a/embed.fnc b/embed.fnc
index d1c73d4..d25c78e 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -2098,6 +2098,9 @@ Es        |void    |set_ANYOF_arg |NN RExC_state_t* const pRExC_state \
                                |NULLOK SV* const only_utf8_locale_list    \
                                |NULLOK SV* const swash                    \
                                |const bool has_user_defined_property
+Es     |AV*     |add_multi_match|NULLOK AV* multi_char_matches             \
+                               |NN SV* multi_fold                          \
+                               |const STRLEN cp_count
 Es     |regnode*|regclass      |NN RExC_state_t *pRExC_state \
                                |NN I32 *flagp|U32 depth|const bool stop_at_1 \
                                |bool allow_multi_fold                        \
diff --git a/embed.h b/embed.h
index 607cca8..17d1fd5 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define add_above_Latin1_folds(a,b,c)  S_add_above_Latin1_folds(aTHX_ a,b,c)
 #define add_cp_to_invlist(a,b) S_add_cp_to_invlist(aTHX_ a,b)
 #define add_data               S_add_data
+#define add_multi_match(a,b,c) S_add_multi_match(aTHX_ a,b,c)
 #define alloc_maybe_populate_EXACT(a,b,c,d,e,f)        S_alloc_maybe_populate_EXACT(aTHX_ a,b,c,d,e,f)
 #define compute_EXACTish       S_compute_EXACTish
 #define construct_ahocorasick_from_trie(a,b,c) S_construct_ahocorasick_from_trie(aTHX_ a,b,c)
diff --git a/proto.h b/proto.h
index 3ff1650..e733e79 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -6729,6 +6729,11 @@ STATIC U32       S_add_data(RExC_state_t* const pRExC_state, const char* const s, cons
 #define PERL_ARGS_ASSERT_ADD_DATA      \
        assert(pRExC_state); assert(s)
 
+STATIC AV*     S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_fold, const STRLEN cp_count)
+                       __attribute__nonnull__(pTHX_2);
+#define PERL_ARGS_ASSERT_ADD_MULTI_MATCH       \
+       assert(multi_fold)
+
 PERL_STATIC_INLINE void        S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 *flagp, STRLEN len, UV code_point, bool downgradable)
                        __attribute__nonnull__(pTHX_1)
                        __attribute__nonnull__(pTHX_2)
diff --git a/regcomp.c b/regcomp.c
index 810cfef..f531026 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -13328,6 +13328,49 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl
     }
 }
 
+STATIC AV *
+S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_fold, const STRLEN cp_count)
+{
+    /* Appends string scalar <multi_fold> to the sub-array at index <cp_count>
+     * of <multi_char_matches>, creating either array if it doesn't yet exist,
+     * and returns <multi_char_matches> (new if NULL was passed in).
+     * <cp_count> is the number of code points <multi_fold> is to match.  Used
+     * when constructing a bracketed character class and we find something
+     * that needs to match more than a single character.
+     *
+     * <multi_char_matches> is actually an array of arrays.  There will be one
+     * or two top-level elements: [2], and/or [3].  The [2] element is an
+     * array, each element thereof is a character which folds to TWO
+     * characters; [3] is for folds to THREE characters.  (Unicode guarantees a
+     * maximum of 3 characters in any fold.)  The caller later rewrites the
+     * character class so that the longest folds come first, making the test
+     * for the ligature 'ffi' precede that for 'ff'.  This is done even for a
+     * non-greedy quantifier, out of programmer laziness; Tom Christiansen has
+     * agreed that this is ok. */
+
+    AV* this_array;
+    AV** this_array_ptr;
+
+    PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
+
+    if (! multi_char_matches) {
+        multi_char_matches = newAV();
+    }
+
+    if (av_exists(multi_char_matches, cp_count)) {
+        this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
+        this_array = *this_array_ptr;
+    }
+    else {
+        this_array = newAV();
+        av_store(multi_char_matches, cp_count,
+                 (SV*) this_array);
+    }
+    av_push(this_array, multi_fold);
+
+    return multi_char_matches;
+}
+
 /* The names of properties whose definitions are not known at compile time are
  * stored in this SV, after a constant heading.  So if the length has been
  * changed since initialization, then there is a run-time definition. */
@@ -14196,44 +14239,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                      * again.  Otherwise add this character to the list of
                      * multi-char folds. */
                     if (! RExC_in_multi_char_class) {
-                        AV** this_array_ptr;
-                        AV* this_array;
                         STRLEN cp_count = utf8_length(foldbuf,
                                                       foldbuf + foldlen);
                         SV* multi_fold = sv_2mortal(newSVpvs(""));
 
                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
 
+                        multi_char_matches
+                                        = add_multi_match(multi_char_matches,
+                                                          multi_fold,
+                                                          cp_count);
 
-                        if (! multi_char_matches) {
-                            multi_char_matches = newAV();
-                        }
-
-                        /* <multi_char_matches> is actually an array of arrays.
-                         * There will be one or two top-level elements: [2],
-                         * and/or [3].  The [2] element is an array, each
-                         * element thereof is a character which folds to TWO
-                         * characters; [3] is for folds to THREE characters.
-                         * (Unicode guarantees a maximum of 3 characters in any
-                         * fold.)  When we rewrite the character class below,
-                         * we will do so such that the longest folds are
-                         * written first, so that it prefers the longest
-                         * matching strings first.  This is done even if it
-                         * turns out that any quantifier is non-greedy, out of
-                         * programmer laziness.  Tom Christiansen has agreed
-                         * that this is ok.  This makes the test for the
-                         * ligature 'ffi' come before the test for 'ff' */
-                        if (av_exists(multi_char_matches, cp_count)) {
-                            this_array_ptr = (AV**) av_fetch(multi_char_matches,
-                                                             cp_count, FALSE);
-                            this_array = *this_array_ptr;
-                        }
-                        else {
-                            this_array = newAV();
-                            av_store(multi_char_matches, cp_count,
-                                     (SV*) this_array);
-                        }
-                        av_push(this_array, multi_fold);
                     }
 
                     /* This element should not be processed further in this