This is a live mirror of the Perl 5 development repository, currently hosted at https://github.com/perl/perl5
regcomp.c: Clarify comment
[perl5.git] / regcomp.c
index d983428..7335481 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -12624,92 +12624,103 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
         FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
     }
 
-    if (! ISMULT2(RExC_parse)) {
-        *flagp = flags;
-        return(ret);
-    }
-
-    /* Here we know the input is a legal quantifier, including {m,n} */
-
-    op = *RExC_parse;
-
 #ifdef RE_TRACK_PATTERN_OFFSETS
     parse_start = RExC_parse;
 #endif
 
-    if (op != '{') {
+    op = *RExC_parse;
+    switch (op) {
+
+      case '*':
         nextchar(pRExC_state);
+        min = 0;
+        break;
 
 
-        if (op == '*') {
-            min = 0;
-        }
-        else if (op == '+') {
-            min = 1;
-        }
-        else if (op == '?') {
-            min = 0; max = 1;
-        }
-    }
-    else {  /* is '{' */
-        const char* endptr;
+      case '+':
+        nextchar(pRExC_state);
+        min = 1;
+        break;
 
 
-        maxpos = NULL;
-        next = RExC_parse + 1;
-        while (isDIGIT(*next) || *next == ',') {
-            if (*next == ',') {
-                if (maxpos)
-                    break;
-                else
-                    maxpos = next;
+      case '?':
+        nextchar(pRExC_state);
+        min = 0; max = 1;
+        break;
+
+      case '{':  /* A '{' may or may not indicate a quantifier; call regcurly()
+                    to determine which */
+        if (regcurly(RExC_parse)) {
+            const char* endptr;
+
+            /* Here is a quantifier, parse for min and max values */
+            maxpos = NULL;
+            next = RExC_parse + 1;
+            while (isDIGIT(*next) || *next == ',') {
+                if (*next == ',') {
+                    if (maxpos)
+                        break;
+                    else
+                        maxpos = next;
+                }
+                next++;
             }
-            next++;
-        }
 
 
-        assert(*next == '}');
+            assert(*next == '}');
 
 
-        if (!maxpos)
-            maxpos = next;
-        RExC_parse++;
-        if (isDIGIT(*RExC_parse)) {
-            endptr = RExC_end;
-            if (!grok_atoUV(RExC_parse, &uv, &endptr))
-                vFAIL("Invalid quantifier in {,}");
-            if (uv >= REG_INFTY)
-                vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
-            min = (I32)uv;
-        } else {
-            min = 0;
-        }
-        if (*maxpos == ',')
-            maxpos++;
-        else
-            maxpos = RExC_parse;
-        if (isDIGIT(*maxpos)) {
-            endptr = RExC_end;
-            if (!grok_atoUV(maxpos, &uv, &endptr))
-                vFAIL("Invalid quantifier in {,}");
-            if (uv >= REG_INFTY)
-                vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
-            max = (I32)uv;
-        } else {
-            max = REG_INFTY;            /* meaning "infinity" */
-        }
-        RExC_parse = next;
-        nextchar(pRExC_state);
-        if (max < min) {    /* If can't match, warn and optimize to fail
-                               unconditionally */
-            reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
-            ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
-            NEXT_OFF(REGNODE_p(orig_emit)) =
-                                regarglen[OPFAIL] + NODE_STEP_REGNODE;
-            return ret;
-        }
-        else if (min == max && *RExC_parse == '?')
-        {
-            ckWARN2reg(RExC_parse + 1,
-                       "Useless use of greediness modifier '%c'",
-                       *RExC_parse);
-        }
+            if (!maxpos)
+                maxpos = next;
+            RExC_parse++;
+            if (isDIGIT(*RExC_parse)) {
+                endptr = RExC_end;
+                if (!grok_atoUV(RExC_parse, &uv, &endptr))
+                    vFAIL("Invalid quantifier in {,}");
+                if (uv >= REG_INFTY)
+                    vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
+                min = (I32)uv;
+            } else {
+                min = 0;
+            }
+            if (*maxpos == ',')
+                maxpos++;
+            else
+                maxpos = RExC_parse;
+            if (isDIGIT(*maxpos)) {
+                endptr = RExC_end;
+                if (!grok_atoUV(maxpos, &uv, &endptr))
+                    vFAIL("Invalid quantifier in {,}");
+                if (uv >= REG_INFTY)
+                    vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
+                max = (I32)uv;
+            } else {
+                max = REG_INFTY;            /* meaning "infinity" */
+            }
+
+            RExC_parse = next;
+            nextchar(pRExC_state);
+            if (max < min) {    /* If can't match, warn and optimize to fail
+                                   unconditionally */
+                reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
+                ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
+                NEXT_OFF(REGNODE_p(orig_emit)) =
+                                    regarglen[OPFAIL] + NODE_STEP_REGNODE;
+                return ret;
+            }
+            else if (min == max && *RExC_parse == '?')
+            {
+                ckWARN2reg(RExC_parse + 1,
+                           "Useless use of greediness modifier '%c'",
+                           *RExC_parse);
+            }
+
+            break;
+        } /* End of is regcurly() */
+
+        /* Here was a '{', but what followed it didn't form a quantifier. */
+        /* FALLTHROUGH */
+
+      default:
+        *flagp = flags;
+        return(ret);
+        NOT_REACHED; /*NOTREACHED*/
     }
 
     /* Here we have a quantifier, and have calculated 'min' and 'max'.
@@ -12755,43 +12766,33 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
     }
 
+    /* 'SIMPLE' operands don't require full generality */
     if ((flags&SIMPLE)) {
         if (max == REG_INFTY) {
-            if (min == 1) {
-                reginsert(pRExC_state, PLUS, ret, depth+1);
-                MARK_NAUGHTY(3);
-                goto done_main_op;
-            }
-            else if (min == 0) {
-
-                /* Going from 0..inf is currently forbidden in wildcard
-                 * subpatterns.  The only reason is to make it harder to
-                 * write patterns that take a long long time to halt, and
-                 * because the use of this construct isn't necessary in
-                 * matching Unicode property values */
-                if (RExC_pm_flags & PMf_WILDCARD) {
-                    RExC_parse++;
-                    /* diag_listed_as: Use of %s is not allowed in Unicode
-                       property wildcard subpatterns in regex; marked by
-                       <-- HERE in m/%s/ */
-                    vFAIL("Use of quantifier '*' is not allowed in"
-                          " Unicode property wildcard subpatterns");
-                    /* Note, don't need to worry about {0,}, as a '}' isn't
-                     * legal at all in wildcards, so wouldn't get this far
-                     * */
+            if (min == 0) {
+                if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) {
+                    goto min0_maxINF_wildcard_forbidden;
                 }
 
                 reginsert(pRExC_state, STAR, ret, depth+1);
                 MARK_NAUGHTY(4);
                 goto done_main_op;
             }
+            else if (min == 1) {
+                reginsert(pRExC_state, PLUS, ret, depth+1);
+                MARK_NAUGHTY(3);
+                goto done_main_op;
+            }
         }
+
+        /* Here, SIMPLE, but not the '*' and '+' special cases */
+
         MARK_NAUGHTY_EXP(2, 2);
         reginsert(pRExC_state, CURLY, ret, depth+1);
         Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
         Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
     }
-    else {
+    else {  /* not SIMPLE */
         const regnode_offset w = reg_node(pRExC_state, WHILEM);
 
         FLAGS(REGNODE_p(w)) = 0;
@@ -12821,6 +12822,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
         MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
     }
 
+    /* Finish up the CURLY/CURLYX case */
     FLAGS(REGNODE_p(ret)) = 0;
 
     ARG1_SET(REGNODE_p(ret), (U16)min);
@@ -12828,6 +12830,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
   done_main_op:
 
+    /* Process any greediness modifiers */
     if (*RExC_parse == '?') {
         nextchar(pRExC_state);
         reginsert(pRExC_state, MINMOD, ret, depth+1);
@@ -12849,12 +12852,32 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
         }
     }
 
+    /* Forbid extra quantifiers */
     if (ISMULT2(RExC_parse)) {
         RExC_parse++;
         vFAIL("Nested quantifiers");
     }
 
     return(ret);
+
+  min0_maxINF_wildcard_forbidden:
+
+    /* Here we are in a wildcard match, and the minimum match length is 0, and
+     * the max could be infinity.  This is currently forbidden.  The only
+     * reason is to make it harder to write patterns that take a long long time
+     * to halt, and because the use of this construct isn't necessary in
+     * matching Unicode property values */
+    RExC_parse++;
+    /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
+       subpatterns in regex; marked by <-- HERE in m/%s/
+     */
+    vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard"
+          " subpatterns");
+
+    /* Note, don't need to worry about the input being '{0,}', as a '}' isn't
+     * legal at all in wildcards, so can't get this far */
+
+    NOT_REACHED; /*NOTREACHED*/
 }
 
 STATIC bool
@@ -13586,7 +13609,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 /* SBOL is shared with /^/ so we set the flags so we can tell
                  * /\A/ from /^/ in split. */
                 FLAGS(REGNODE_p(ret)) = 1;
-                *flagp |= SIMPLE;   /* Wrong, but too late to fix for 5.32 */
             }
            goto finish_meta_pat;
        case 'G':
@@ -13623,7 +13645,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             }
             else {
                 ret = reg_node(pRExC_state, SEOL);
-                *flagp |= SIMPLE;   /* Wrong, but too late to fix for 5.32 */
             }
            RExC_seen_zerolen++;                /* Do not optimize RE away */
            goto finish_meta_pat;
@@ -13634,7 +13655,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             }
             else {
                 ret = reg_node(pRExC_state, EOS);
-                *flagp |= SIMPLE;   /* Wrong, but too late to fix for 5.32 */
             }
            RExC_seen_zerolen++;                /* Do not optimize RE away */
            goto finish_meta_pat;
@@ -14985,12 +15005,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  *
                  * The solution used here for peeking ahead is to look at that
                  * next character.  If it isn't ASCII punctuation, then it will
-                 * be something that continues in an EXACTish node if there
-                 * were space.  We append the fold of it to s, having reserved
-                 * enough room in s0 for the purpose.  If we can't reasonably
-                 * peek ahead, we instead assume the worst case: that it is
-                 * something that would form the completion of a multi-char
-                 * fold.
+                 * be something that would continue on in an EXACTish node if
+                 * there were space.  We append the fold of it to s, having
+                 * reserved enough room in s0 for the purpose.  If we can't
+                 * reasonably peek ahead, we instead assume the worst case:
+                 * that it is something that would form the completion of a
+                 * multi-char fold.
                  *
                  * If we can't split between s and ender, we work backwards
                  * character-by-character down to s0.  At each current point