X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/dd2cafb96f113a74d005d5cbdb410971a0588fb2..3b58492077941aecb1bb81af35f727992854262c:/regcomp.c diff --git a/regcomp.c b/regcomp.c index d983428..7335481 100644 --- a/regcomp.c +++ b/regcomp.c @@ -12624,92 +12624,103 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags); } - if (! ISMULT2(RExC_parse)) { - *flagp = flags; - return(ret); - } - - /* Here we know the input is a legal quantifier, including {m,n} */ - - op = *RExC_parse; - #ifdef RE_TRACK_PATTERN_OFFSETS parse_start = RExC_parse; #endif - if (op != '{') { + op = *RExC_parse; + switch (op) { + + case '*': nextchar(pRExC_state); + min = 0; + break; - if (op == '*') { - min = 0; - } - else if (op == '+') { - min = 1; - } - else if (op == '?') { - min = 0; max = 1; - } - } - else { /* is '{' */ - const char* endptr; + case '+': + nextchar(pRExC_state); + min = 1; + break; - maxpos = NULL; - next = RExC_parse + 1; - while (isDIGIT(*next) || *next == ',') { - if (*next == ',') { - if (maxpos) - break; - else - maxpos = next; + case '?': + nextchar(pRExC_state); + min = 0; max = 1; + break; + + case '{': /* A '{' may or may not indicate a quantifier; call regcurly() + to determine which */ + if (regcurly(RExC_parse)) { + const char* endptr; + + /* Here is a quantifier, parse for min and max values */ + maxpos = NULL; + next = RExC_parse + 1; + while (isDIGIT(*next) || *next == ',') { + if (*next == ',') { + if (maxpos) + break; + else + maxpos = next; + } + next++; } - next++; - } - assert(*next == '}'); + assert(*next == '}'); - if (!maxpos) - maxpos = next; - RExC_parse++; - if (isDIGIT(*RExC_parse)) { - endptr = RExC_end; - if (!grok_atoUV(RExC_parse, &uv, &endptr)) - vFAIL("Invalid quantifier in {,}"); - if (uv >= REG_INFTY) - vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); - min = (I32)uv; - } else { - min = 0; - } - if (*maxpos == ',') - maxpos++; - else - maxpos = RExC_parse; - if (isDIGIT(*maxpos)) { - endptr = RExC_end; - if (!grok_atoUV(maxpos, &uv, &endptr)) - vFAIL("Invalid quantifier in {,}"); - if (uv >= REG_INFTY) - vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); - max = (I32)uv; - } else { - max = REG_INFTY; /* meaning "infinity" */ - } - RExC_parse = next; - nextchar(pRExC_state); - if (max < min) { /* If can't match, warn and optimize to fail - unconditionally */ - reginsert(pRExC_state, OPFAIL, orig_emit, depth+1); - ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); - NEXT_OFF(REGNODE_p(orig_emit)) = - regarglen[OPFAIL] + NODE_STEP_REGNODE; - return ret; - } - else if (min == max && *RExC_parse == '?') - { - ckWARN2reg(RExC_parse + 1, - "Useless use of greediness modifier '%c'", - *RExC_parse); - } + if (!maxpos) + maxpos = next; + RExC_parse++; + if (isDIGIT(*RExC_parse)) { + endptr = RExC_end; + if (!grok_atoUV(RExC_parse, &uv, &endptr)) + vFAIL("Invalid quantifier in {,}"); + if (uv >= REG_INFTY) + vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); + min = (I32)uv; + } else { + min = 0; + } + if (*maxpos == ',') + maxpos++; + else + maxpos = RExC_parse; + if (isDIGIT(*maxpos)) { + endptr = RExC_end; + if (!grok_atoUV(maxpos, &uv, &endptr)) + vFAIL("Invalid quantifier in {,}"); + if (uv >= REG_INFTY) + vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); + max = (I32)uv; + } else { + max = REG_INFTY; /* meaning "infinity" */ + } + + RExC_parse = next; + nextchar(pRExC_state); + if (max < min) { /* If can't match, warn and optimize to fail + unconditionally */ + reginsert(pRExC_state, OPFAIL, orig_emit, depth+1); + ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); + NEXT_OFF(REGNODE_p(orig_emit)) = + regarglen[OPFAIL] + NODE_STEP_REGNODE; + return ret; + } + else if (min == max && *RExC_parse == '?') + { + ckWARN2reg(RExC_parse + 1, + "Useless use of greediness modifier '%c'", + *RExC_parse); + } + + break; + } /* End of is regcurly() */ + + /* Here was a '{', but what followed it didn't form a quantifier. */ + /* FALLTHROUGH */ + + default: + *flagp = flags; + return(ret); + NOT_REACHED; /*NOTREACHED*/ } /* Here we have a quantifier, and have calculated 'min' and 'max'. @@ -12755,43 +12766,33 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } + /* 'SIMPLE' operands don't require full generality */ if ((flags&SIMPLE)) { if (max == REG_INFTY) { - if (min == 1) { - reginsert(pRExC_state, PLUS, ret, depth+1); - MARK_NAUGHTY(3); - goto done_main_op; - } - else if (min == 0) { - - /* Going from 0..inf is currently forbidden in wildcard - * subpatterns. The only reason is to make it harder to - * write patterns that take a long long time to halt, and - * because the use of this construct isn't necessary in - * matching Unicode property values */ - if (RExC_pm_flags & PMf_WILDCARD) { - RExC_parse++; - /* diag_listed_as: Use of %s is not allowed in Unicode - property wildcard subpatterns in regex; marked by - <-- HERE in m/%s/ */ - vFAIL("Use of quantifier '*' is not allowed in" - " Unicode property wildcard subpatterns"); - /* Note, don't need to worry about {0,}, as a '}' isn't - * legal at all in wildcards, so wouldn't get this far - * */ + if (min == 0) { + if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) { + goto min0_maxINF_wildcard_forbidden; } reginsert(pRExC_state, STAR, ret, depth+1); MARK_NAUGHTY(4); goto done_main_op; } + else if (min == 1) { + reginsert(pRExC_state, PLUS, ret, depth+1); + MARK_NAUGHTY(3); + goto done_main_op; + } } + + /* Here, SIMPLE, but not the '*' and '+' special cases */ + MARK_NAUGHTY_EXP(2, 2); reginsert(pRExC_state, CURLY, ret, depth+1); Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */ Set_Node_Cur_Length(REGNODE_p(ret), parse_start); } - else { + else { /* not SIMPLE */ const regnode_offset w = reg_node(pRExC_state, WHILEM); FLAGS(REGNODE_p(w)) = 0; @@ -12821,6 +12822,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) MARK_NAUGHTY_EXP(1, 4); /* compound interest */ } + /* Finish up the CURLY/CURLYX case */ FLAGS(REGNODE_p(ret)) = 0; ARG1_SET(REGNODE_p(ret), (U16)min); @@ -12828,6 +12830,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) done_main_op: + /* Process any greediness modifiers */ if (*RExC_parse == '?') { nextchar(pRExC_state); reginsert(pRExC_state, MINMOD, ret, depth+1); @@ -12849,12 +12852,32 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } } + /* Forbid extra quantifiers */ if (ISMULT2(RExC_parse)) { RExC_parse++; vFAIL("Nested quantifiers"); } return(ret); + + min0_maxINF_wildcard_forbidden: + + /* Here we are in a wildcard match, and the minimum match length is 0, and + * the max could be infinity. This is currently forbidden. The only + * reason is to make it harder to write patterns that take a long long time + * to halt, and because the use of this construct isn't necessary in + * matching Unicode property values */ + RExC_parse++; + /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard + subpatterns in regex; marked by <-- HERE in m/%s/ + */ + vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard" + " subpatterns"); + + /* Note, don't need to worry about the input being '{0,}', as a '}' isn't + * legal at all in wildcards, so can't get this far */ + + NOT_REACHED; /*NOTREACHED*/ } STATIC bool @@ -13586,7 +13609,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* SBOL is shared with /^/ so we set the flags so we can tell * /\A/ from /^/ in split. */ FLAGS(REGNODE_p(ret)) = 1; - *flagp |= SIMPLE; /* Wrong, but too late to fix for 5.32 */ } goto finish_meta_pat; case 'G': @@ -13623,7 +13645,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } else { ret = reg_node(pRExC_state, SEOL); - *flagp |= SIMPLE; /* Wrong, but too late to fix for 5.32 */ } RExC_seen_zerolen++; /* Do not optimize RE away */ goto finish_meta_pat; @@ -13634,7 +13655,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } else { ret = reg_node(pRExC_state, EOS); - *flagp |= SIMPLE; /* Wrong, but too late to fix for 5.32 */ } RExC_seen_zerolen++; /* Do not optimize RE away */ goto finish_meta_pat; @@ -14985,12 +15005,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * * The solution used here for peeking ahead is to look at that * next character. If it isn't ASCII punctuation, then it will - * be something that continues in an EXACTish node if there - * were space. We append the fold of it to s, having reserved - * enough room in s0 for the purpose. If we can't reasonably - * peek ahead, we instead assume the worst case: that it is - * something that would form the completion of a multi-char - * fold. + * be something that would continue on in an EXACTish node if + * there were space. We append the fold of it to s, having + * reserved enough room in s0 for the purpose. If we can't + * reasonably peek ahead, we instead assume the worst case: + * that it is something that would form the completion of a + * multi-char fold. * * If we can't split between s and ender, we work backwards * character-by-character down to s0. At each current point