This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec.c: Use new macros instead of swashes
authorKarl Williamson <public@khwilliamson.com>
Sun, 2 Sep 2012 20:46:38 +0000 (14:46 -0600)
committerKarl Williamson <public@khwilliamson.com>
Fri, 14 Sep 2012 03:14:02 +0000 (21:14 -0600)
A previous commit has caused macros to be generated that will match
Unicode code points of interest to the \X algorithm.  This patch uses
them.  This speeds up modern Korean processing by 15%.

Together with recent previous commits, the throughput of modern Korean
under \X has more than doubled, and is now comparable to other
languages (which have increased themselved by 35%)

embed.fnc
embed.h
embedvar.h
intrpvar.h
proto.h
regen/unicode_constants.pl
regexec.c
sv.c
unicode_constants.h
utf8.c

index ab2cdec..756f7c1 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -661,16 +661,9 @@ ApR        |bool   |is_utf8_punct  |NN const U8 *p
 ApR    |bool   |is_utf8_xdigit |NN const U8 *p
 ApR    |bool   |is_utf8_mark   |NN const U8 *p
 EXpR   |bool   |is_utf8_X_extend       |NN const U8 *p
-EXpR   |bool   |is_utf8_X_prepend      |NN const U8 *p
 EXpR   |bool   |is_utf8_X_regular_begin|NN const U8 *p
-EXpR   |bool   |is_utf8_X_special_begin|NN const U8 *p
-EXpR   |bool   |is_utf8_X_L            |NN const U8 *p
-EXpR   |bool   |is_utf8_X_RI           |NN const U8 *p
 :not currently used EXpR       |bool   |is_utf8_X_LV           |NN const U8 *p
 EXpR   |bool   |is_utf8_X_LVT          |NN const U8 *p
-EXpR   |bool   |is_utf8_X_LV_LVT_V     |NN const U8 *p
-EXpR   |bool   |is_utf8_X_T            |NN const U8 *p
-EXpR   |bool   |is_utf8_X_V            |NN const U8 *p
 : Used in perly.y
 p      |OP*    |jmaybe         |NN OP *o
 : Used in pp.c 
diff --git a/embed.h b/embed.h
index 45291f0..33d732d 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define _is_utf8__perl_idstart(a)      Perl__is_utf8__perl_idstart(aTHX_ a)
 #define av_reify(a)            Perl_av_reify(aTHX_ a)
 #define current_re_engine()    Perl_current_re_engine(aTHX)
-#define is_utf8_X_L(a)         Perl_is_utf8_X_L(aTHX_ a)
 #define is_utf8_X_LVT(a)       Perl_is_utf8_X_LVT(aTHX_ a)
-#define is_utf8_X_LV_LVT_V(a)  Perl_is_utf8_X_LV_LVT_V(aTHX_ a)
-#define is_utf8_X_RI(a)                Perl_is_utf8_X_RI(aTHX_ a)
-#define is_utf8_X_T(a)         Perl_is_utf8_X_T(aTHX_ a)
-#define is_utf8_X_V(a)         Perl_is_utf8_X_V(aTHX_ a)
 #define is_utf8_X_extend(a)    Perl_is_utf8_X_extend(aTHX_ a)
-#define is_utf8_X_prepend(a)   Perl_is_utf8_X_prepend(aTHX_ a)
 #define is_utf8_X_regular_begin(a)     Perl_is_utf8_X_regular_begin(aTHX_ a)
-#define is_utf8_X_special_begin(a)     Perl_is_utf8_X_special_begin(aTHX_ a)
 #define op_clear(a)            Perl_op_clear(aTHX_ a)
 #define qerror(a)              Perl_qerror(aTHX_ a)
 #define reg_named_buff(a,b,c,d)        Perl_reg_named_buff(aTHX_ a,b,c,d)
index 877e811..d3eeaf0 100644 (file)
 #define PL_unitcheckav_save    (vTHX->Iunitcheckav_save)
 #define PL_unlockhook          (vTHX->Iunlockhook)
 #define PL_unsafe              (vTHX->Iunsafe)
-#define PL_utf8_X_L            (vTHX->Iutf8_X_L)
 #define PL_utf8_X_LVT          (vTHX->Iutf8_X_LVT)
-#define PL_utf8_X_LV_LVT_V     (vTHX->Iutf8_X_LV_LVT_V)
-#define PL_utf8_X_RI           (vTHX->Iutf8_X_RI)
-#define PL_utf8_X_T            (vTHX->Iutf8_X_T)
-#define PL_utf8_X_V            (vTHX->Iutf8_X_V)
 #define PL_utf8_X_extend       (vTHX->Iutf8_X_extend)
-#define PL_utf8_X_prepend      (vTHX->Iutf8_X_prepend)
 #define PL_utf8_X_regular_begin        (vTHX->Iutf8_X_regular_begin)
-#define PL_utf8_X_special_begin        (vTHX->Iutf8_X_special_begin)
 #define PL_utf8_alnum          (vTHX->Iutf8_alnum)
 #define PL_utf8_alpha          (vTHX->Iutf8_alpha)
 #define PL_utf8_blank          (vTHX->Iutf8_blank)
index 94b7425..641cac6 100644 (file)
@@ -628,14 +628,7 @@ PERLVAR(I, utf8_xdigit,    SV *)
 PERLVAR(I, utf8_mark,  SV *)
 PERLVAR(I, utf8_X_regular_begin, SV *)
 PERLVAR(I, utf8_X_extend, SV *)
-PERLVAR(I, utf8_X_prepend, SV *)
-PERLVAR(I, utf8_X_special_begin, SV *)
-PERLVAR(I, utf8_X_L,   SV *)
 PERLVAR(I, utf8_X_LVT, SV *)
-PERLVAR(I, utf8_X_RI,  SV *)
-PERLVAR(I, utf8_X_T,   SV *)
-PERLVAR(I, utf8_X_V,   SV *)
-PERLVAR(I, utf8_X_LV_LVT_V, SV *)
 PERLVAR(I, utf8_toupper, SV *)
 PERLVAR(I, utf8_totitle, SV *)
 PERLVAR(I, utf8_tolower, SV *)
diff --git a/proto.h b/proto.h
index f97fe1f..187b0ae 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -1764,66 +1764,24 @@ PERL_CALLCONV bool      Perl_is_uni_xdigit_lc(pTHX_ UV c)
                        __attribute__warn_unused_result__
                        __attribute__pure__;
 
-PERL_CALLCONV bool     Perl_is_utf8_X_L(pTHX_ const U8 *p)
-                       __attribute__warn_unused_result__
-                       __attribute__nonnull__(pTHX_1);
-#define PERL_ARGS_ASSERT_IS_UTF8_X_L   \
-       assert(p)
-
 PERL_CALLCONV bool     Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
                        __attribute__warn_unused_result__
                        __attribute__nonnull__(pTHX_1);
 #define PERL_ARGS_ASSERT_IS_UTF8_X_LVT \
        assert(p)
 
-PERL_CALLCONV bool     Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
-                       __attribute__warn_unused_result__
-                       __attribute__nonnull__(pTHX_1);
-#define PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V    \
-       assert(p)
-
-PERL_CALLCONV bool     Perl_is_utf8_X_RI(pTHX_ const U8 *p)
-                       __attribute__warn_unused_result__
-                       __attribute__nonnull__(pTHX_1);
-#define PERL_ARGS_ASSERT_IS_UTF8_X_RI  \
-       assert(p)
-
-PERL_CALLCONV bool     Perl_is_utf8_X_T(pTHX_ const U8 *p)
-                       __attribute__warn_unused_result__
-                       __attribute__nonnull__(pTHX_1);
-#define PERL_ARGS_ASSERT_IS_UTF8_X_T   \
-       assert(p)
-
-PERL_CALLCONV bool     Perl_is_utf8_X_V(pTHX_ const U8 *p)
-                       __attribute__warn_unused_result__
-                       __attribute__nonnull__(pTHX_1);
-#define PERL_ARGS_ASSERT_IS_UTF8_X_V   \
-       assert(p)
-
 PERL_CALLCONV bool     Perl_is_utf8_X_extend(pTHX_ const U8 *p)
                        __attribute__warn_unused_result__
                        __attribute__nonnull__(pTHX_1);
 #define PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND      \
        assert(p)
 
-PERL_CALLCONV bool     Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
-                       __attribute__warn_unused_result__
-                       __attribute__nonnull__(pTHX_1);
-#define PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND     \
-       assert(p)
-
 PERL_CALLCONV bool     Perl_is_utf8_X_regular_begin(pTHX_ const U8 *p)
                        __attribute__warn_unused_result__
                        __attribute__nonnull__(pTHX_1);
 #define PERL_ARGS_ASSERT_IS_UTF8_X_REGULAR_BEGIN       \
        assert(p)
 
-PERL_CALLCONV bool     Perl_is_utf8_X_special_begin(pTHX_ const U8 *p)
-                       __attribute__warn_unused_result__
-                       __attribute__nonnull__(pTHX_1);
-#define PERL_ARGS_ASSERT_IS_UTF8_X_SPECIAL_BEGIN       \
-       assert(p)
-
 PERL_CALLCONV bool     Perl_is_utf8_alnum(pTHX_ const U8 *p)
                        __attribute__warn_unused_result__
                        __attribute__nonnull__(pTHX_1);
index 73ec4ae..56e5349 100644 (file)
@@ -127,9 +127,6 @@ __DATA__
 03C5 first
 03C5 tail
 
-1100
-1160
-11A8
 2010 string
 
 007F native
index f51d50d..4e9b80c 100644 (file)
--- a/regexec.c
+++ b/regexec.c
         /* No asserts are done for some of these, in case called on a   */  \
         /* Unicode version in which they map to nothing */                  \
        LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                          \
-       LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin);                      \
        LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
-       LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \
-       LOAD_UTF8_CHARCLASS(X_L, HANGUL_CHOSEONG_KIYEOK_UTF8);              \
-       LOAD_UTF8_CHARCLASS(X_LV_LVT_V, HANGUL_JUNGSEONG_FILLER_UTF8);      \
-       LOAD_UTF8_CHARCLASS_NO_CHECK(X_RI);    /* empty in many releases */ \
-       LOAD_UTF8_CHARCLASS(X_T, HANGUL_JONGSEONG_KIYEOK_UTF8);             \
-       LOAD_UTF8_CHARCLASS(X_V, HANGUL_JUNGSEONG_FILLER_UTF8)
 
 #define PLACEHOLDER    /* Something for the preprocessor to grab onto */
 
@@ -4058,6 +4051,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                    locinput += 2;
                }
                else {
+                    STRLEN len;
+
                    /* In case have to backtrack to beginning, then match '.' */
                    char *starting = locinput;
 
@@ -4066,16 +4061,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 
                    LOAD_UTF8_CHARCLASS_GCB();
 
-                    /* Match (prepend)*, but don't bother trying if empty (as
-                     * being set to _undef indicates) */
-                    if (PL_utf8_X_prepend != &PL_sv_undef) {
-                        while (locinput < PL_regeol
-                               && swash_fetch(PL_utf8_X_prepend,
-                                              (U8*)locinput, utf8_target))
-                        {
-                            previous_prepend = locinput;
-                            locinput += UTF8SKIP(locinput);
-                        }
+                    /* Match (prepend)*   */
+                    while (locinput < PL_regeol
+                           && (len = is_GCB_Prepend_utf8(locinput)))
+                    {
+                        previous_prepend = locinput;
+                        locinput += len;
                     }
 
                    /* As noted above, if we matched a prepend character, but
@@ -4085,8 +4076,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                        && (locinput >=  PL_regeol
                            || (! swash_fetch(PL_utf8_X_regular_begin,
                                             (U8*)locinput, utf8_target)
-                                && ! swash_fetch(PL_utf8_X_special_begin,
-                                            (U8*)locinput, utf8_target)))
+                                && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
                         )
                    {
                        locinput = previous_prepend;
@@ -4101,9 +4091,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                                     (U8*)locinput, utf8_target)) {
                         locinput += UTF8SKIP(locinput);
                     }
-                    else if (! swash_fetch(PL_utf8_X_special_begin,
-                                       (U8*)locinput, utf8_target))
-                       {
+                    else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
 
                        /* Here did not match the required 'Begin' in the
                         * second term.  So just match the very first
@@ -4115,26 +4103,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                         /* Here is a special begin.  It can be composed of
                          * several individual characters.  One possibility is
                          * RI+ */
-                        if (swash_fetch(PL_utf8_X_RI,
-                                        (U8*)locinput, utf8_target))
-                        {
-                            locinput += UTF8SKIP(locinput);
+                        if ((len = is_GCB_RI_utf8(locinput))) {
+                            locinput += len;
                             while (locinput < PL_regeol
-                                    && swash_fetch(PL_utf8_X_RI,
-                                                    (U8*)locinput, utf8_target))
+                                   && (len = is_GCB_RI_utf8(locinput)))
                             {
-                                locinput += UTF8SKIP(locinput);
+                                locinput += len;
                             }
-                        } else /* Another possibility is T+ */
-                               if (swash_fetch(PL_utf8_X_T,
-                                               (U8*)locinput, utf8_target))
-                        {
-                            locinput += UTF8SKIP(locinput);
+                        } else if ((len = is_GCB_T_utf8(locinput))) {
+                            /* Another possibility is T+ */
+                            locinput += len;
                             while (locinput < PL_regeol
-                                    && swash_fetch(PL_utf8_X_T,
-                                                    (U8*)locinput, utf8_target))
+                                && (len = is_GCB_T_utf8(locinput)))
                             {
-                                locinput += UTF8SKIP(locinput);
+                                locinput += len;
                             }
                         } else {
 
@@ -4145,10 +4127,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 
                             /* Match L*           */
                             while (locinput < PL_regeol
-                                    && swash_fetch(PL_utf8_X_L,
-                                                    (U8*)locinput, utf8_target))
+                                   && (len = is_GCB_L_utf8(locinput)))
                             {
-                                locinput += UTF8SKIP(locinput);
+                                locinput += len;
                             }
 
                             /* Here, have exhausted L*.  If the next character
@@ -4158,8 +4139,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                              * Are done. */
 
                             if (locinput < PL_regeol
-                                && swash_fetch(PL_utf8_X_LV_LVT_V,
-                                                (U8*)locinput, utf8_target))
+                                && is_GCB_LV_LVT_V_utf8(locinput))
                             {
 
                                 /* Otherwise keep going.  Must be LV, LVT or V.
@@ -4172,22 +4152,18 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                                      * V*     */
                                     locinput += UTF8SKIP(locinput);
                                     while (locinput < PL_regeol
-                                            && swash_fetch(PL_utf8_X_V,
-                                                           (U8*)locinput,
-                                                           utf8_target))
+                                           && (len = is_GCB_V_utf8(locinput)))
                                     {
-                                        locinput += UTF8SKIP(locinput);
+                                        locinput += len;
                                     }
                                 }
 
                                 /* And any of LV, LVT, or V can be followed
-                                    * by T*            */
+                                 * by T*            */
                                 while (locinput < PL_regeol
-                                        && swash_fetch(PL_utf8_X_T,
-                                                        (U8*)locinput,
-                                                        utf8_target))
+                                       && (len = is_GCB_T_utf8(locinput)))
                                 {
-                                    locinput += UTF8SKIP(locinput);
+                                    locinput += len;
                                 }
                             }
                         }
diff --git a/sv.c b/sv.c
index 497417c..a757ad2 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -13376,15 +13376,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
     PL_utf8_mark       = sv_dup_inc(proto_perl->Iutf8_mark, param);
     PL_utf8_X_regular_begin    = sv_dup_inc(proto_perl->Iutf8_X_regular_begin, param);
     PL_utf8_X_extend   = sv_dup_inc(proto_perl->Iutf8_X_extend, param);
-    PL_utf8_X_prepend  = sv_dup_inc(proto_perl->Iutf8_X_prepend, param);
-    PL_utf8_X_special_begin    = sv_dup_inc(proto_perl->Iutf8_X_special_begin, param);
-    PL_utf8_X_L        = sv_dup_inc(proto_perl->Iutf8_X_L, param);
-    /*not currently used: PL_utf8_X_LV = sv_dup_inc(proto_perl->Iutf8_X_LV, param);*/
     PL_utf8_X_LVT      = sv_dup_inc(proto_perl->Iutf8_X_LVT, param);
-    PL_utf8_X_RI       = sv_dup_inc(proto_perl->Iutf8_X_RI, param);
-    PL_utf8_X_T        = sv_dup_inc(proto_perl->Iutf8_X_T, param);
-    PL_utf8_X_V        = sv_dup_inc(proto_perl->Iutf8_X_V, param);
-    PL_utf8_X_LV_LVT_V = sv_dup_inc(proto_perl->Iutf8_X_LV_LVT_V, param);
     PL_utf8_toupper    = sv_dup_inc(proto_perl->Iutf8_toupper, param);
     PL_utf8_totitle    = sv_dup_inc(proto_perl->Iutf8_totitle, param);
     PL_utf8_tolower    = sv_dup_inc(proto_perl->Iutf8_tolower, param);
index f915d59..82081d9 100644 (file)
@@ -30,9 +30,6 @@
 #define GREEK_SMALL_LETTER_UPSILON_UTF8_FIRST_BYTE 0xCF    /* U+03C5 */
 #define GREEK_SMALL_LETTER_UPSILON_UTF8_TAIL "\x85"    /* U+03C5 */
 
-#define HANGUL_CHOSEONG_KIYEOK_UTF8 "\xE1\x84\x80"    /* U+1100 */
-#define HANGUL_JUNGSEONG_FILLER_UTF8 "\xE1\x85\xA0"    /* U+1160 */
-#define HANGUL_JONGSEONG_KIYEOK_UTF8 "\xE1\x86\xA8"    /* U+11A8 */
 #define HYPHEN_UTF8 "\xE2\x80\x90"    /* U+2010 */
 
 #define DELETE_NATIVE 0x007F    /* U+007F */
diff --git a/utf8.c b/utf8.c
index 2172d31..88ca041 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -2229,76 +2229,6 @@ Perl_is_utf8_X_extend(pTHX_ const U8 *p)
     return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
 }
 
-bool
-Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
-{
-    /* If no code points in the Unicode version being worked on match
-     * GCB=Prepend, this will set PL_utf8_X_prepend to &PL_sv_undef during its
-     * first call.  Otherwise, it will set it to a swash created for it.
-     * swash_fetch() hence can't be used without checking first if it is valid
-     * to do so. */
-
-    dVAR;
-    bool initialized = cBOOL(PL_utf8_X_prepend);
-    bool ret;
-
-    PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
-
-    if (PL_utf8_X_prepend == &PL_sv_undef) {
-        return FALSE;
-    }
-
-    if ((ret = is_utf8_common(p, &PL_utf8_X_prepend, "_X_GCB_Prepend"))
-        || initialized)
-    {
-        return ret;
-    }
-
-    /* Here the code point being checked was not a prepend, and we hadn't
-     * initialized PL_utf8_X_prepend, so we don't know if it is just this
-     * particular input code point that didn't match, or if the table is
-     * completely empty. The is_utf8_common() call did the initialization, so
-     * we can inspect the swash's inversion list to find out.  If there are no
-     * elements in its inversion list, it's empty, and nothing will ever match,
-     * so set things up so we can skip the check in future calls. */
-    if (_invlist_len(_get_swash_invlist(PL_utf8_X_prepend)) == 0) {
-        SvREFCNT_dec(PL_utf8_X_prepend);
-        PL_utf8_X_prepend = &PL_sv_undef;
-    }
-
-    return FALSE;
-}
-
-bool
-Perl_is_utf8_X_special_begin(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_X_SPECIAL_BEGIN;
-
-    return is_utf8_common(p, &PL_utf8_X_special_begin, "_X_Special_Begin");
-}
-
-bool
-Perl_is_utf8_X_L(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_X_L;
-
-    return is_utf8_common(p, &PL_utf8_X_L, "_X_GCB_L");
-}
-
-bool
-Perl_is_utf8_X_RI(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_X_RI;
-
-    return is_utf8_common(p, &PL_utf8_X_RI, "_X_RI");
-}
-
 /* These constants are for finding GCB=LV and GCB=LVT.  These are for the
  * pre-composed Hangul syllables, which are all in a contiguous block and
  * arranged there in such a way so as to facilitate alorithmic determination of
@@ -2367,35 +2297,6 @@ Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
             && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
 }
 
-bool
-Perl_is_utf8_X_T(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_X_T;
-
-    return is_utf8_common(p, &PL_utf8_X_T, "_X_GCB_T");
-}
-
-bool
-Perl_is_utf8_X_V(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_X_V;
-
-    return is_utf8_common(p, &PL_utf8_X_V, "_X_GCB_V");
-}
-
-bool
-Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
-
-    return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
-}
 
 bool
 Perl__is_utf8_quotemeta(pTHX_ const U8 *p)
@@ -2408,7 +2309,6 @@ Perl__is_utf8_quotemeta(pTHX_ const U8 *p)
 
     return is_utf8_common(p, &PL_utf8_quotemeta, "_Perl_Quotemeta");
 }
-
 /*
 =for apidoc to_utf8_case