regexec.c: Use SPACE macros instead of swash
authorKarl Williamson <public@khwilliamson.com>
Mon, 19 Nov 2012 21:24:29 +0000 (14:24 -0700)
committerKarl Williamson <public@khwilliamson.com>
Tue, 20 Nov 2012 00:13:02 +0000 (17:13 -0700)
This will avoid loading a swash when an above Latin1 code point is
tested to see if it matches \s.  The SPACE macro is quite small, and
unlikely to grow over time, as Unicode has mostly finished adding white
space equivalents to the Standard.

The CCC_TRY_U macro in regexec.c could not be used for this, and I just
expanded out what it would generate, modified to use the macro instead
of a swash.

regcharclass.h
regen/regcharclass.pl
regexec.c

index 3bdaffa..64e4453 100644 (file)
 ( ( 0xFF21 <= cp && cp <= 0xFF26 ) || ( 0xFF41 <= cp && cp <= 0xFF46 ) ) ) )
 
 /*
+       XPERLSPACE: \p{XPerlSpace}
+
+       \p{XPerlSpace}
+*/
+/*** GENERATED CODE ***/
+#define is_XPERLSPACE(s,is_utf8)                                            \
+( ( ( 0x09 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x0D ) || 0x20 == ((U8*)s)[0] ) ? 1\
+: ( is_utf8 ) ?                                                             \
+    ( ( 0xC2 == ((U8*)s)[0] ) ?                                             \
+       ( ( 0x85 == ((U8*)s)[1] || 0xA0 == ((U8*)s)[1] ) ? 2 : 0 )          \
+    : ( 0xE1 == ((U8*)s)[0] ) ?                                             \
+       ( ( 0x9A == ((U8*)s)[1] ) ?                                         \
+           ( ( 0x80 == ((U8*)s)[2] ) ? 3 : 0 )                             \
+       : ( ( 0xA0 == ((U8*)s)[1] ) && ( 0x8E == ((U8*)s)[2] ) ) ? 3 : 0 )  \
+    : ( 0xE2 == ((U8*)s)[0] ) ?                                             \
+       ( ( 0x80 == ((U8*)s)[1] ) ?                                         \
+           ( ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x8A ) || ( ((U8*)s)[2] & 0xFE ) == 0xA8 || 0xAF == ((U8*)s)[2] ) ? 3 : 0 )\
+       : ( ( 0x81 == ((U8*)s)[1] ) && ( 0x9F == ((U8*)s)[2] ) ) ? 3 : 0 )  \
+    : ( ( ( 0xE3 == ((U8*)s)[0] ) && ( 0x80 == ((U8*)s)[1] ) ) && ( 0x80 == ((U8*)s)[2] ) ) ? 3 : 0 )\
+: ( 0x85 == ((U8*)s)[0] || 0xA0 == ((U8*)s)[0] ) )
+
+/*** GENERATED CODE ***/
+#define is_XPERLSPACE_utf8(s)                                               \
+( ( ( 0x09 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x0D ) || 0x20 == ((U8*)s)[0] ) ? 1\
+: ( 0xC2 == ((U8*)s)[0] ) ?                                                 \
+    ( ( 0x85 == ((U8*)s)[1] || 0xA0 == ((U8*)s)[1] ) ? 2 : 0 )              \
+: ( 0xE1 == ((U8*)s)[0] ) ?                                                 \
+    ( ( 0x9A == ((U8*)s)[1] ) ?                                             \
+       ( ( 0x80 == ((U8*)s)[2] ) ? 3 : 0 )                                 \
+    : ( ( 0xA0 == ((U8*)s)[1] ) && ( 0x8E == ((U8*)s)[2] ) ) ? 3 : 0 )      \
+: ( 0xE2 == ((U8*)s)[0] ) ?                                                 \
+    ( ( 0x80 == ((U8*)s)[1] ) ?                                             \
+       ( ( ( ((U8*)s)[2] <= 0x8A ) || ( ((U8*)s)[2] & 0xFE ) == 0xA8 || 0xAF == ((U8*)s)[2] ) ? 3 : 0 )\
+    : ( ( 0x81 == ((U8*)s)[1] ) && ( 0x9F == ((U8*)s)[2] ) ) ? 3 : 0 )      \
+: ( ( ( 0xE3 == ((U8*)s)[0] ) && ( 0x80 == ((U8*)s)[1] ) ) && ( 0x80 == ((U8*)s)[2] ) ) ? 3 : 0 )
+
+/*** GENERATED CODE ***/
+#define is_XPERLSPACE_high(s)                                               \
+( ( 0xE1 == ((U8*)s)[0] ) ?                                                 \
+    ( ( 0x9A == ((U8*)s)[1] ) ?                                             \
+       ( ( 0x80 == ((U8*)s)[2] ) ? 3 : 0 )                                 \
+    : ( ( 0xA0 == ((U8*)s)[1] ) && ( 0x8E == ((U8*)s)[2] ) ) ? 3 : 0 )      \
+: ( 0xE2 == ((U8*)s)[0] ) ?                                                 \
+    ( ( 0x80 == ((U8*)s)[1] ) ?                                             \
+       ( ( ( ((U8*)s)[2] <= 0x8A ) || ( ((U8*)s)[2] & 0xFE ) == 0xA8 || 0xAF == ((U8*)s)[2] ) ? 3 : 0 )\
+    : ( ( 0x81 == ((U8*)s)[1] ) && ( 0x9F == ((U8*)s)[2] ) ) ? 3 : 0 )      \
+: ( ( ( 0xE3 == ((U8*)s)[0] ) && ( 0x80 == ((U8*)s)[1] ) ) && ( 0x80 == ((U8*)s)[2] ) ) ? 3 : 0 )
+
+/*** GENERATED CODE ***/
+#define is_XPERLSPACE_cp_high(cp)                                           \
+( 0x1680 == cp || ( 0x1680 < cp &&                                          \
+( 0x180E == cp || ( 0x180E < cp &&                                          \
+( ( 0x2000 <= cp && cp <= 0x200A ) || ( 0x200A < cp &&                      \
+( 0x2028 == cp || ( 0x2028 < cp &&                                          \
+( 0x2029 == cp || ( 0x2029 < cp &&                                          \
+( 0x202F == cp || ( 0x202F < cp &&                                          \
+( 0x205F == cp || 0x3000 == cp ) ) ) ) ) ) ) ) ) ) ) ) )
+
+/*
        REPLACEMENT: Unicode REPLACEMENT CHARACTER
 
        0xFFFD
index 46425e4..0bab570 100755 (executable)
@@ -1400,6 +1400,10 @@ XDIGIT: Hexadecimal digits
 => UTF8 high cp_high :fast
 \p{XDigit}
 
+XPERLSPACE: \p{XPerlSpace}
+=> generic UTF8 high cp_high :fast
+\p{XPerlSpace}
+
 REPLACEMENT: Unicode REPLACEMENT CHARACTER
 => UTF8 :safe
 0xFFFD
index 69bda15..d0560ce 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -164,7 +164,6 @@ static const char* const non_utf8_target_but_utf8_required
 
 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
-#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 
 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
         /* No asserts are done for some of these, in case called on a   */  \
@@ -1713,16 +1712,14 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            );
            break;
        case SPACEU:
-           REXEC_FBC_CSCAN_PRELOAD(
-               LOAD_UTF8_CHARCLASS_SPACE(),
-               *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
+           REXEC_FBC_CSCAN(
+               is_XPERLSPACE_utf8(s),
                 isSPACE_L1((U8) *s)
            );
            break;
        case SPACE:
-           REXEC_FBC_CSCAN_PRELOAD(
-               LOAD_UTF8_CHARCLASS_SPACE(),
-               *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
+           REXEC_FBC_CSCAN(
+               is_XPERLSPACE_utf8(s),
                 isSPACE((U8) *s)
            );
            break;
@@ -1738,16 +1735,14 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            );
            break;
        case NSPACEU:
-           REXEC_FBC_CSCAN_PRELOAD(
-               LOAD_UTF8_CHARCLASS_SPACE(),
-               !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
+           REXEC_FBC_CSCAN(
+               ! is_XPERLSPACE_utf8(s),
                 ! isSPACE_L1((U8) *s)
            );
            break;
        case NSPACE:
-           REXEC_FBC_CSCAN_PRELOAD(
-               LOAD_UTF8_CHARCLASS_SPACE(),
-               !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
+           REXEC_FBC_CSCAN(
+               ! is_XPERLSPACE_utf8(s),
                 ! isSPACE((U8) *s)
            );
            break;
@@ -4331,11 +4326,73 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                  ALNUMA, NALNUMA, isWORDCHAR_A,
                  alnum, "a");
 
-        CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
-                 SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
-                 SPACEU, NSPACEU, isSPACE_L1,
-                 SPACEA, NSPACEA, isSPACE_A,
-                 space, " ");
+        case SPACEL:
+            PL_reg_flags |= RF_tainted;
+            if (NEXTCHR_IS_EOS) {
+                sayNO;
+            }
+            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
+                if (! isSPACE_LC_utf8((U8 *) locinput)) {
+                    sayNO;
+                }
+            }
+            else if (! isSPACE_LC((U8) nextchr)) {
+                    sayNO;
+            }
+            goto increment_locinput;
+
+        case NSPACEL:
+            PL_reg_flags |= RF_tainted;
+            if (NEXTCHR_IS_EOS) {
+                sayNO;
+            }
+            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
+                if (isSPACE_LC_utf8((U8 *) locinput)) {
+                    sayNO;
+                }
+            }
+            else if (isSPACE_LC(nextchr)) {
+                    sayNO;
+            }
+            goto increment_locinput;
+
+        case SPACE:
+            if (utf8_target) {
+                goto utf8_space;
+            }
+            /* FALL THROUGH */
+        case SPACEA:
+            if (NEXTCHR_IS_EOS || ! isSPACE_A(nextchr)) {
+                sayNO;
+            }
+            /* Matched a utf8-invariant, so don't have to worry about utf8 */
+            locinput++;
+            break;
+
+        case NSPACE:
+            if (utf8_target) {
+                goto utf8_nspace;
+            }
+            /* FALL THROUGH */
+        case NSPACEA:
+            if (NEXTCHR_IS_EOS || isSPACE_A(nextchr)) {
+                sayNO;
+            }
+            goto increment_locinput;
+
+        case SPACEU:
+          utf8_space:
+            if (NEXTCHR_IS_EOS || ! is_XPERLSPACE(locinput, utf8_target)) {
+                sayNO;
+            }
+            goto increment_locinput;
+
+        case NSPACEU:
+          utf8_nspace:
+            if (NEXTCHR_IS_EOS || is_XPERLSPACE(locinput, utf8_target)) {
+                sayNO;
+            }
+            goto increment_locinput;
 
         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
                DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
@@ -6902,10 +6959,8 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 
     utf8_space:
 
-           LOAD_UTF8_CHARCLASS_SPACE();
-           while (hardcount < max && scan < loceol &&
-                  (*scan == ' ' ||
-                    swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+           while (hardcount < max && scan < loceol
+                   && is_XPERLSPACE_utf8((U8*)scan))
             {
                scan += UTF8SKIP(scan);
                hardcount++;
@@ -6955,10 +7010,8 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 
     utf8_Nspace:
 
-           LOAD_UTF8_CHARCLASS_SPACE();
-           while (hardcount < max && scan < loceol &&
-                  ! (*scan == ' ' ||
-                      swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+           while (hardcount < max && scan < loceol
+                   && ! is_XPERLSPACE_utf8((U8*)scan))
             {
                scan += UTF8SKIP(scan);
                hardcount++;