This is a live mirror of the Perl 5 development currently hosted at
regexec.c: More cleaning of FBC macro/code interface
author Karl Williamson <>
Mon, 23 Jun 2014 01:41:25 +0000 (19:41 -0600)
committer Karl Williamson <>
Fri, 27 Jun 2014 00:09:19 +0000 (18:09 -0600)
The definition of \w is now compiled into the Perl core.  This allows
the complicated swash_fetch function call to be replaced by
isWORDCHAR_utf8, which takes a single parameter, so the interface can be
simplified [1].

This macro will execute faster on Latin1-range inputs, as it doesn't do
a swash_fetch on them, but slower on other code points due to function
call overhead, and some error checking now in place that wasn't
done previously.  This overhead could be removed by using inline
functions, and perhaps a different interface for known non-malformed
input (though I'm actually not sure the input is known to be well-formed
in this case).

These macros still depend on and modify outside variables.  That could
be cleaned up by adding additional parameters to them, but I'm not going
to do it now.  I don't like these kinds of code-generating macros, and
have been tempted to rewrite these as inline functions, but it's not a
trivial task to do.

[1] I hadn't realized it before, but the interface could instead have been
cleaned up by introducing a macro that takes a single parameter, making it
uniform with the existing macros, looking like
 #define FBC_BOUND_SWASH_FETCH(s)  \
    cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], s, utf8_target))
But it seems better to me to use isWORDCHAR_utf8 as it is faster for
Western European languages, and can be made nearly the same speed as the
alternative if experience tells us that this is a slow spot that should
be sped up.


index 0dbef02..06d90d3 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1583,7 +1583,7 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \
            }                                                                  \
        );                                                                     \
-#define FBC_UTF8(TEST_UV, TEST2_UTF8, IF_SUCCESS, IF_FAIL)                     \
+#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
        if (s == reginfo->strbeg) {                                            \
            tmp = '\n';                                                        \
        }                                                                      \
@@ -1595,7 +1595,7 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \
        tmp = TEST_UV(tmp);                                                    \
        LOAD_UTF8_CHARCLASS_ALNUM();                                           \
        REXEC_FBC_UTF8_SCAN(                                                   \
-           if (tmp == ! (TEST2_UTF8)) {                                       \
+           if (tmp == ! (TEST_UTF8((U8 *) s))) {                              \
                tmp = !tmp;                                                    \
                IF_SUCCESS;                                                    \
            }                                                                  \
@@ -1608,21 +1608,19 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \
  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
  * NBOUND.  This is accomplished by passing it in either the if or else clause,
  * with the other one being empty */
-/* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
- * be passed in completely with the variable name being tested, which isn't
- * such a clean interface, but this is easier to read than it was before.  We
+/* Common to the BOUND and NBOUND cases.  We
  * are looking for the boundary (or non-boundary between a word and non-word
  * character.  The utf8 and non-utf8 cases have the same logic, but the details
  * must be different.  Find the "wordness" of the character just prior to this
@@ -1842,45 +1840,30 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
     case BOUNDL:
-                  isWORDCHAR_LC_uvchr,
-                  isWORDCHAR_LC_utf8((U8*)s));
     case NBOUNDL:
-                   isWORDCHAR_LC_uvchr,
-                   isWORDCHAR_LC_utf8((U8*)s));
     case BOUND:
-                  isWORDCHAR_uni,
-                  cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
+        FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
     case BOUNDA:
-                    isWORDCHAR_A,
-                    isWORDCHAR_A((U8*)s));
     case NBOUND:
-                   isWORDCHAR_uni,
-                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
     case NBOUNDA:
-                     isWORDCHAR_A,
-                     isWORDCHAR_A((U8*)s));
     case BOUNDU:
-                  isWORDCHAR_uni,
-                  cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
+        FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
     case NBOUNDU:
-                   isWORDCHAR_uni,
-                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
+        FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
     case LNBREAK:
         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),