S_regmatch(): combine CURLY_B_min/_known states
authorDavid Mitchell <davem@iabyn.com>
Wed, 15 Aug 2018 17:02:53 +0000 (18:02 +0100)
committerDavid Mitchell <davem@iabyn.com>
Sun, 26 Aug 2018 19:57:34 +0000 (20:57 +0100)
There are currently two similar backtracking states for simple
non-greedy pattern repeats:

    CURLY_B_min
    CURLY_B_min_known

the latter is a variant of the former for when the character which must
follow the repeat is known, e.g.  /(...)*?X.../, which allows quick
skipping to the next viable position.

The code for the two cases:

    case CURLY_B_min_fail:
    case CURLY_B_min_known_fail:

share a lot of similarities. This commit merges the two states into a
single CURLY_B_min state, with an associated single CURLY_B_min_fail
fail state.

That one code block can handle both types, with a single

    if (ST.c1 == CHRTEST_VOID) ...

test to choose between the two variant parts of the code.

This makes the code smaller and more maintainable, at the cost of one
extra test per backtrack.

regcomp.sym
regexec.c
regnodes.h

index 3680395..6c20e28 100644 (file)
@@ -255,7 +255,7 @@ WHILEM          A_pre,A_min,A_max,B_min,B_max:FAIL
 BRANCH          next:FAIL
 CURLYM          A,B:FAIL
 IFMATCH         A:FAIL
-CURLY           B_min_known,B_min,B_max:FAIL
+CURLY           B_min,B_max:FAIL
 COMMIT          next:FAIL
 MARKPOINT       next:FAIL
 SKIP            next:FAIL
index c927abc..cb6ec5d 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -8462,24 +8462,41 @@ NULL
            }
            NOT_REACHED; /* NOTREACHED */
 
-       case CURLY_B_min_known_fail:
-           /* failed to find B in a non-greedy match where c1,c2 valid */
+       case CURLY_B_min_fail:
+           /* failed to find B in a non-greedy match.
+             * Handles both cases where c1,c2 valid or not */
 
            REGCP_UNWIND(ST.cp);
             if (ST.paren) {
                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
             }
-           /* Couldn't or didn't -- move forward. */
-           ST.oldloc = locinput;
-           if (utf8_target)
-               locinput += UTF8SKIP(locinput);
-           else
-               locinput++;
-           ST.count++;
-         curly_try_B_min_known:
-            /* find the next place where 'B' could work, then call B */
-           {
+
+            if (ST.c1 == CHRTEST_VOID) {
+                /* failed -- move forward one */
+                char *li = locinput;
+                if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
+                    sayNO;
+                }
+                locinput = li;
+                ST.count++;
+               if (!(   ST.count <= ST.max
+                        /* count overflow ? */
+                     || (ST.max == REG_INFTY && ST.count > 0))
+                )
+                    sayNO;
+            }
+            else {
                int n;
+                /* Couldn't or didn't -- move forward. */
+                ST.oldloc = locinput;
+                if (utf8_target)
+                    locinput += UTF8SKIP(locinput);
+                else
+                    locinput++;
+                ST.count++;
+
+              curly_try_B_min_known:
+                /* find the next place where 'B' could work, then call B */
                if (utf8_target) {
                    n = (ST.oldloc == locinput) ? 0 : 1;
                    if (ST.c1 == ST.c2) {
@@ -8558,43 +8575,16 @@ NULL
                        sayNO;
                     assert(n == REG_INFTY || locinput == li);
                }
-               CURLY_SETPAREN(ST.paren, ST.count);
-                if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
-                   goto fake_end;
-               PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
            }
-           NOT_REACHED; /* NOTREACHED */
 
-       case CURLY_B_min_fail:
-           /* failed to find B in a non-greedy match where c1,c2 invalid */
-
-           REGCP_UNWIND(ST.cp);
-            if (ST.paren) {
-                UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
-            }
-           /* failed -- move forward one */
-            {
-                char *li = locinput;
-                if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
-                    sayNO;
-                }
-                locinput = li;
-            }
-            {
-               ST.count++;
-               if (ST.count <= ST.max || (ST.max == REG_INFTY &&
-                       ST.count > 0)) /* count overflow ? */
-               {
-                 curly_try_B_min:
-                   CURLY_SETPAREN(ST.paren, ST.count);
-                    if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
-                        goto fake_end;
-                   PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
-               }
-           }
-            sayNO;
+          curly_try_B_min:
+            CURLY_SETPAREN(ST.paren, ST.count);
+            if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
+                goto fake_end;
+            PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
            NOT_REACHED; /* NOTREACHED */
 
+
           curly_try_B_max:
            /* a successful greedy match: now try to match B */
             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
index 69f3e38..eeb5ce9 100644 (file)
@@ -7,7 +7,7 @@
 /* Regops and State definitions */
 
 #define REGNODE_MAX            97
-#define REGMATCH_STATE_MAX     139
+#define REGMATCH_STATE_MAX     137
 
 #define        END                     0       /* 0000 End of program. */
 #define        SUCCEED                 1       /* 0x01 Return from a subroutine, basically. */
 #define        CURLYM_B_fail           (REGNODE_MAX + 24)      /* state for CURLYM */
 #define        IFMATCH_A               (REGNODE_MAX + 25)      /* state for IFMATCH */
 #define        IFMATCH_A_fail          (REGNODE_MAX + 26)      /* state for IFMATCH */
-#define        CURLY_B_min_known       (REGNODE_MAX + 27)      /* state for CURLY */
-#define        CURLY_B_min_known_fail  (REGNODE_MAX + 28)      /* state for CURLY */
-#define        CURLY_B_min             (REGNODE_MAX + 29)      /* state for CURLY */
-#define        CURLY_B_min_fail        (REGNODE_MAX + 30)      /* state for CURLY */
-#define        CURLY_B_max             (REGNODE_MAX + 31)      /* state for CURLY */
-#define        CURLY_B_max_fail        (REGNODE_MAX + 32)      /* state for CURLY */
-#define        COMMIT_next             (REGNODE_MAX + 33)      /* state for COMMIT */
-#define        COMMIT_next_fail        (REGNODE_MAX + 34)      /* state for COMMIT */
-#define        MARKPOINT_next          (REGNODE_MAX + 35)      /* state for MARKPOINT */
-#define        MARKPOINT_next_fail     (REGNODE_MAX + 36)      /* state for MARKPOINT */
-#define        SKIP_next               (REGNODE_MAX + 37)      /* state for SKIP */
-#define        SKIP_next_fail          (REGNODE_MAX + 38)      /* state for SKIP */
-#define        CUTGROUP_next           (REGNODE_MAX + 39)      /* state for CUTGROUP */
-#define        CUTGROUP_next_fail      (REGNODE_MAX + 40)      /* state for CUTGROUP */
-#define        KEEPS_next              (REGNODE_MAX + 41)      /* state for KEEPS */
-#define        KEEPS_next_fail         (REGNODE_MAX + 42)      /* state for KEEPS */
+#define        CURLY_B_min             (REGNODE_MAX + 27)      /* state for CURLY */
+#define        CURLY_B_min_fail        (REGNODE_MAX + 28)      /* state for CURLY */
+#define        CURLY_B_max             (REGNODE_MAX + 29)      /* state for CURLY */
+#define        CURLY_B_max_fail        (REGNODE_MAX + 30)      /* state for CURLY */
+#define        COMMIT_next             (REGNODE_MAX + 31)      /* state for COMMIT */
+#define        COMMIT_next_fail        (REGNODE_MAX + 32)      /* state for COMMIT */
+#define        MARKPOINT_next          (REGNODE_MAX + 33)      /* state for MARKPOINT */
+#define        MARKPOINT_next_fail     (REGNODE_MAX + 34)      /* state for MARKPOINT */
+#define        SKIP_next               (REGNODE_MAX + 35)      /* state for SKIP */
+#define        SKIP_next_fail          (REGNODE_MAX + 36)      /* state for SKIP */
+#define        CUTGROUP_next           (REGNODE_MAX + 37)      /* state for CUTGROUP */
+#define        CUTGROUP_next_fail      (REGNODE_MAX + 38)      /* state for CUTGROUP */
+#define        KEEPS_next              (REGNODE_MAX + 39)      /* state for KEEPS */
+#define        KEEPS_next_fail         (REGNODE_MAX + 40)      /* state for KEEPS */
 
 /* PL_regkind[] What type of regop or state is this. */
 
@@ -284,8 +282,6 @@ EXTCONST U8 PL_regkind[] = {
        CURLYM,         /* CURLYM_B_fail          */
        IFMATCH,        /* IFMATCH_A              */
        IFMATCH,        /* IFMATCH_A_fail         */
-       CURLY,          /* CURLY_B_min_known      */
-       CURLY,          /* CURLY_B_min_known_fail */
        CURLY,          /* CURLY_B_min            */
        CURLY,          /* CURLY_B_min_fail       */
        CURLY,          /* CURLY_B_max            */
@@ -645,22 +641,20 @@ EXTCONST char * const PL_reg_name[] = {
        "CURLYM_B_fail",                /* REGNODE_MAX +0x18 */
        "IFMATCH_A",                    /* REGNODE_MAX +0x19 */
        "IFMATCH_A_fail",               /* REGNODE_MAX +0x1a */
-       "CURLY_B_min_known",            /* REGNODE_MAX +0x1b */
-       "CURLY_B_min_known_fail",       /* REGNODE_MAX +0x1c */
-       "CURLY_B_min",                  /* REGNODE_MAX +0x1d */
-       "CURLY_B_min_fail",             /* REGNODE_MAX +0x1e */
-       "CURLY_B_max",                  /* REGNODE_MAX +0x1f */
-       "CURLY_B_max_fail",             /* REGNODE_MAX +0x20 */
-       "COMMIT_next",                  /* REGNODE_MAX +0x21 */
-       "COMMIT_next_fail",             /* REGNODE_MAX +0x22 */
-       "MARKPOINT_next",               /* REGNODE_MAX +0x23 */
-       "MARKPOINT_next_fail",          /* REGNODE_MAX +0x24 */
-       "SKIP_next",                    /* REGNODE_MAX +0x25 */
-       "SKIP_next_fail",               /* REGNODE_MAX +0x26 */
-       "CUTGROUP_next",                /* REGNODE_MAX +0x27 */
-       "CUTGROUP_next_fail",           /* REGNODE_MAX +0x28 */
-       "KEEPS_next",                   /* REGNODE_MAX +0x29 */
-       "KEEPS_next_fail",              /* REGNODE_MAX +0x2a */
+       "CURLY_B_min",                  /* REGNODE_MAX +0x1b */
+       "CURLY_B_min_fail",             /* REGNODE_MAX +0x1c */
+       "CURLY_B_max",                  /* REGNODE_MAX +0x1d */
+       "CURLY_B_max_fail",             /* REGNODE_MAX +0x1e */
+       "COMMIT_next",                  /* REGNODE_MAX +0x1f */
+       "COMMIT_next_fail",             /* REGNODE_MAX +0x20 */
+       "MARKPOINT_next",               /* REGNODE_MAX +0x21 */
+       "MARKPOINT_next_fail",          /* REGNODE_MAX +0x22 */
+       "SKIP_next",                    /* REGNODE_MAX +0x23 */
+       "SKIP_next_fail",               /* REGNODE_MAX +0x24 */
+       "CUTGROUP_next",                /* REGNODE_MAX +0x25 */
+       "CUTGROUP_next_fail",           /* REGNODE_MAX +0x26 */
+       "KEEPS_next",                   /* REGNODE_MAX +0x27 */
+       "KEEPS_next_fail",              /* REGNODE_MAX +0x28 */
 };
 #endif /* DOINIT */