*/
#define HASWIDTH 0x01 /* Known to not match null strings, could match
non-null ones. */
-
-/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
- * character. (There needs to be a case: in the switch statement in regexec.c
- * for any node marked SIMPLE.) Note that this is not the same thing as
- * REGNODE_SIMPLE */
-#define SIMPLE 0x02
+#define SIMPLE 0x02 /* Exactly one character wide */
+ /* (or LNBREAK as a special case) */
#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
#define TRYAGAIN 0x10 /* Weeded out a declaration. */
#define RESTART_PARSE 0x20 /* Need to redo the parse */
* might result in a minlen of 1 and not of 4,
* but this doesn't make us mismatch, just try a bit
* harder than we should.
- * */
+ *
+ * However we must assume this GOSUB is infinite, to
+ * avoid wrongly applying other optimizations in the
+ * enclosing scope - see GH 18096, for example.
+ */
+ is_inf = is_inf_internal = 1;
scan= regnext(scan);
continue;
}
}
if (flags & SCF_DO_SUBSTR)
data->pos_min++;
+ /* This will bypass the formal 'min += minnext * mincount'
+ * calculation in the do_curly path, so assumes min width
+ * of the PLUS payload is exactly one. */
min++;
/* FALLTHROUGH */
case STAR:
} else if (flags & RXapif_ONE) {
ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
av = MUTABLE_AV(SvRV(ret));
- length = av_tindex(av);
+ length = av_count(av);
SvREFCNT_dec_NN(ret);
- return newSViv(length + 1);
+ return newSViv(length);
} else {
Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
(int)flags);
FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
}
- op = *RExC_parse;
-
- if (op == '{' && regcurly(RExC_parse)) {
- maxpos = NULL;
#ifdef RE_TRACK_PATTERN_OFFSETS
- parse_start = RExC_parse; /* MJD */
+ parse_start = RExC_parse;
#endif
- next = RExC_parse + 1;
- while (isDIGIT(*next) || *next == ',') {
- if (*next == ',') {
- if (maxpos)
- break;
- else
- maxpos = next;
- }
- next++;
- }
- if (*next == '}') { /* got one */
+
+ op = *RExC_parse;
+ switch (op) {
+
+ case '*':
+ nextchar(pRExC_state);
+ min = 0;
+ break;
+
+ case '+':
+ nextchar(pRExC_state);
+ min = 1;
+ break;
+
+ case '?':
+ nextchar(pRExC_state);
+ min = 0; max = 1;
+ break;
+
+ case '{': /* A '{' may or may not indicate a quantifier; call regcurly()
+ to determine which */
+ if (regcurly(RExC_parse)) {
const char* endptr;
- if (!maxpos)
- maxpos = next;
- RExC_parse++;
+
+ /* Here is a quantifier, parse for min and max values */
+ maxpos = NULL;
+ next = RExC_parse + 1;
+ while (isDIGIT(*next) || *next == ',') {
+ if (*next == ',') {
+ if (maxpos)
+ break;
+ else
+ maxpos = next;
+ }
+ next++;
+ }
+
+ assert(*next == '}');
+
+ if (!maxpos)
+ maxpos = next;
+ RExC_parse++;
if (isDIGIT(*RExC_parse)) {
endptr = RExC_end;
if (!grok_atoUV(RExC_parse, &uv, &endptr))
} else {
min = 0;
}
- if (*maxpos == ',')
- maxpos++;
- else
- maxpos = RExC_parse;
+ if (*maxpos == ',')
+ maxpos++;
+ else
+ maxpos = RExC_parse;
if (isDIGIT(*maxpos)) {
endptr = RExC_end;
if (!grok_atoUV(maxpos, &uv, &endptr))
vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
max = (I32)uv;
} else {
- max = REG_INFTY; /* meaning "infinity" */
+ max = REG_INFTY; /* meaning "infinity" */
}
- RExC_parse = next;
- nextchar(pRExC_state);
+
+ RExC_parse = next;
+ nextchar(pRExC_state);
if (max < min) { /* If can't match, warn and optimize to fail
unconditionally */
reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
*RExC_parse);
}
- do_curly:
- if ((flags&SIMPLE)) {
- if (min == 0 && max == REG_INFTY) {
-
- /* Going from 0..inf is currently forbidden in wildcard
- * subpatterns. The only reason is to make it harder to
- * write patterns that take a long long time to halt, and
- * because the use of this construct isn't necessary in
- * matching Unicode property values */
- if (RExC_pm_flags & PMf_WILDCARD) {
- RExC_parse++;
- /* diag_listed_as: Use of %s is not allowed in Unicode
- property wildcard subpatterns in regex; marked by
- <-- HERE in m/%s/ */
- vFAIL("Use of quantifier '*' is not allowed in"
- " Unicode property wildcard subpatterns");
- /* Note, don't need to worry about {0,}, as a '}' isn't
- * legal at all in wildcards, so wouldn't get this far
- * */
- }
- reginsert(pRExC_state, STAR, ret, depth+1);
- MARK_NAUGHTY(4);
- RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
- goto nest_check;
- }
- if (min == 1 && max == REG_INFTY) {
- reginsert(pRExC_state, PLUS, ret, depth+1);
- MARK_NAUGHTY(3);
- RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
- goto nest_check;
- }
- MARK_NAUGHTY_EXP(2, 2);
- reginsert(pRExC_state, CURLY, ret, depth+1);
- Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
- Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
- }
- else {
- const regnode_offset w = reg_node(pRExC_state, WHILEM);
+ break;
+ } /* End of is regcurly() */
- FLAGS(REGNODE_p(w)) = 0;
- if (! REGTAIL(pRExC_state, ret, w)) {
- REQUIRE_BRANCHJ(flagp, 0);
- }
- if (RExC_use_BRANCHJ) {
- reginsert(pRExC_state, LONGJMP, ret, depth+1);
- reginsert(pRExC_state, NOTHING, ret, depth+1);
- NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */
- }
- reginsert(pRExC_state, CURLYX, ret, depth+1);
- /* MJD hk */
- Set_Node_Offset(REGNODE_p(ret), parse_start+1);
- Set_Node_Length(REGNODE_p(ret),
- op == '{' ? (RExC_parse - parse_start) : 1);
+ /* Here was a '{', but what followed it didn't form a quantifier. */
+ /* FALLTHROUGH */
- if (RExC_use_BRANCHJ)
- NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
- LONGJMP. */
- if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
- NOTHING)))
- {
- REQUIRE_BRANCHJ(flagp, 0);
- }
- RExC_whilem_seen++;
- MARK_NAUGHTY_EXP(1, 4); /* compound interest */
- }
- FLAGS(REGNODE_p(ret)) = 0;
-
- if (min > 0)
- *flagp = 0;
- if (max > 0)
- *flagp |= HASWIDTH;
- ARG1_SET(REGNODE_p(ret), (U16)min);
- ARG2_SET(REGNODE_p(ret), (U16)max);
- if (max == REG_INFTY)
- RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
-
- goto nest_check;
- }
+ default:
+ *flagp = flags;
+ return(ret);
+ NOT_REACHED; /*NOTREACHED*/
}
- if (!ISMULT1(op)) {
- *flagp = flags;
- return(ret);
- }
+ /* Here we have a quantifier, and have calculated 'min' and 'max'.
+ *
+ * Check and possibly adjust a zero width operand */
+ if (! (flags & (HASWIDTH|POSTPONED))) {
+ if (max > REG_INFTY/3) {
+ if (origparse[0] == '\\' && origparse[1] == 'K') {
+ vFAIL2utf8f(
+ "%" UTF8f " is forbidden - matches null string"
+ " many times",
+ UTF8fARG(UTF, (RExC_parse >= origparse
+ ? RExC_parse - origparse
+ : 0),
+ origparse));
+ } else {
+ ckWARN2reg(RExC_parse,
+ "%" UTF8f " matches null string many times",
+ UTF8fARG(UTF, (RExC_parse >= origparse
+ ? RExC_parse - origparse
+ : 0),
+ origparse));
+ }
+ }
-#if 0 /* Now runtime fix should be reliable. */
+ /* There's no point in trying to match something 0 length more than
+ * once except for extra side effects, which we don't have here since
+ * not POSTPONED */
+ if (max > 1) {
+ max = 1;
+ if (min > max) {
+ min = max;
+ }
+ }
+ }
- /* if this is reinstated, don't forget to put this back into perldiag:
+ /* If this is a code block pass it up */
+ *flagp |= (flags & POSTPONED);
- =item Regexp *+ operand could be empty at {#} in regex m/%s/
+ if (max > 0) {
+ *flagp |= (flags & HASWIDTH);
+ if (max == REG_INFTY)
+ RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
+ }
- (F) The part of the regexp subject to either the * or + quantifier
- could match an empty string. The {#} shows in the regular
- expression about where the problem was discovered.
+ /* 'SIMPLE' operands don't require full generality */
+ if ((flags&SIMPLE)) {
+ if (max == REG_INFTY) {
+ if (min == 0) {
+ if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) {
+ goto min0_maxINF_wildcard_forbidden;
+ }
- */
+ reginsert(pRExC_state, STAR, ret, depth+1);
+ MARK_NAUGHTY(4);
+ goto done_main_op;
+ }
+ else if (min == 1) {
+ reginsert(pRExC_state, PLUS, ret, depth+1);
+ MARK_NAUGHTY(3);
+ goto done_main_op;
+ }
+ }
- if (!(flags&HASWIDTH) && op != '?')
- vFAIL("Regexp *+ operand could be empty");
-#endif
+ /* Here, SIMPLE, but not the '*' and '+' special cases */
-#ifdef RE_TRACK_PATTERN_OFFSETS
- parse_start = RExC_parse;
-#endif
- nextchar(pRExC_state);
+ MARK_NAUGHTY_EXP(2, 2);
+ reginsert(pRExC_state, CURLY, ret, depth+1);
+ Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
+ Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
+ }
+ else { /* not SIMPLE */
+ const regnode_offset w = reg_node(pRExC_state, WHILEM);
- *flagp = HASWIDTH;
-
- if (op == '*') {
- min = 0;
- goto do_curly;
- }
- else if (op == '+') {
- min = 1;
- goto do_curly;
- }
- else if (op == '?') {
- min = 0; max = 1;
- goto do_curly;
- }
- nest_check:
- if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
- if (origparse[0] == '\\' && origparse[1] == 'K') {
- vFAIL2utf8f(
- "%" UTF8f " is forbidden - matches null string many times",
- UTF8fARG(UTF, (RExC_parse >= origparse
- ? RExC_parse - origparse
- : 0),
- origparse));
- /* NOT-REACHED */
- } else {
- ckWARN2reg(RExC_parse,
- "%" UTF8f " matches null string many times",
- UTF8fARG(UTF, (RExC_parse >= origparse
- ? RExC_parse - origparse
- : 0),
- origparse));
+ FLAGS(REGNODE_p(w)) = 0;
+ if (! REGTAIL(pRExC_state, ret, w)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
+ if (RExC_use_BRANCHJ) {
+ reginsert(pRExC_state, LONGJMP, ret, depth+1);
+ reginsert(pRExC_state, NOTHING, ret, depth+1);
+ NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */
+ }
+ reginsert(pRExC_state, CURLYX, ret, depth+1);
+ /* MJD hk */
+ Set_Node_Offset(REGNODE_p(ret), parse_start+1);
+ Set_Node_Length(REGNODE_p(ret),
+ op == '{' ? (RExC_parse - parse_start) : 1);
+
+ if (RExC_use_BRANCHJ)
+ NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
+ LONGJMP. */
+ if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
+ NOTHING)))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
}
+ RExC_whilem_seen++;
+ MARK_NAUGHTY_EXP(1, 4); /* compound interest */
}
+ /* Finish up the CURLY/CURLYX case */
+ FLAGS(REGNODE_p(ret)) = 0;
+
+ ARG1_SET(REGNODE_p(ret), (U16)min);
+ ARG2_SET(REGNODE_p(ret), (U16)max);
+
+ done_main_op:
+
+ /* Process any greediness modifiers */
if (*RExC_parse == '?') {
- nextchar(pRExC_state);
- reginsert(pRExC_state, MINMOD, ret, depth+1);
+ nextchar(pRExC_state);
+ reginsert(pRExC_state, MINMOD, ret, depth+1);
if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
REQUIRE_BRANCHJ(flagp, 0);
}
}
}
+ /* Forbid extra quantifiers */
if (ISMULT2(RExC_parse)) {
- RExC_parse++;
- vFAIL("Nested quantifiers");
+ RExC_parse++;
+ vFAIL("Nested quantifiers");
}
return(ret);
+
+ min0_maxINF_wildcard_forbidden:
+
+ /* Here we are in a wildcard match, and the minimum match length is 0, and
+ * the max could be infinity. This is currently forbidden. The only
+ * reason is to make it harder to write patterns that take a long long time
+ * to halt, and because the use of this construct isn't necessary in
+ * matching Unicode property values */
+ RExC_parse++;
+ /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
+ subpatterns in regex; marked by <-- HERE in m/%s/
+ */
+ vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard"
+ " subpatterns");
+
+ /* Note, don't need to worry about the input being '{0,}', as a '}' isn't
+ * legal at all in wildcards, so can't get this far */
+
+ NOT_REACHED; /*NOTREACHED*/
}
STATIC bool
/* SBOL is shared with /^/ so we set the flags so we can tell
* /\A/ from /^/ in split. */
FLAGS(REGNODE_p(ret)) = 1;
- *flagp |= SIMPLE; /* Wrong, but too late to fix for 5.32 */
}
goto finish_meta_pat;
case 'G':
}
ret = reg_node(pRExC_state, GPOS);
RExC_seen |= REG_GPOS_SEEN;
- *flagp |= SIMPLE;
goto finish_meta_pat;
case 'K':
if (!RExC_in_lookbehind && !RExC_in_lookahead) {
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, KEEPS);
- *flagp |= SIMPLE;
/* XXX:dmq : disabling in-place substitution seems to
* be necessary here to avoid cases of memory corruption, as
* with: C<$_="x" x 80; s/x\K/y/> -- rgs
}
else {
ret = reg_node(pRExC_state, SEOL);
- *flagp |= SIMPLE; /* Wrong, but too late to fix for 5.32 */
}
RExC_seen_zerolen++; /* Do not optimize RE away */
goto finish_meta_pat;
}
else {
ret = reg_node(pRExC_state, EOS);
- *flagp |= SIMPLE; /* Wrong, but too late to fix for 5.32 */
}
RExC_seen_zerolen++; /* Do not optimize RE away */
goto finish_meta_pat;
ret = reg_node(pRExC_state, op);
FLAGS(REGNODE_p(ret)) = flags;
- *flagp |= SIMPLE;
-
goto finish_meta_pat;
}
*
* The solution used here for peeking ahead is to look at that
* next character. If it isn't ASCII punctuation, then it will
- * be something that continues in an EXACTish node if there
- * were space. We append the fold of it to s, having reserved
- * enough room in s0 for the purpose. If we can't reasonably
- * peek ahead, we instead assume the worst case: that it is
- * something that would form the completion of a multi-char
- * fold.
+ * be something that would continue on in an EXACTish node if
+ * there were space. We append the fold of it to s, having
+ * reserved enough room in s0 for the purpose. If we can't
+ * reasonably peek ahead, we instead assume the worst case:
+ * that it is something that would form the completion of a
+ * multi-char fold.
*
* If we can't split between s and ender, we work backwards
* character-by-character down to s0. At each current point
FAIL2("panic: loc_correspondence[%d] is 0",
(int) (s - s_start));
}
+ Safefree(locfold_buf);
+ Safefree(loc_correspondence);
}
else {
upper_fill = s - s0;
if ( posix_warnings
&& RExC_warn_text
- && av_top_index(RExC_warn_text) > -1)
+ && av_count(RExC_warn_text) > 0)
{
*posix_warnings = RExC_warn_text;
}
*
* There is a line below that uses the same white space criteria but is outside
* this macro. Both here and there must use the same definition */
-#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
+#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p) \
STMT_START { \
if (do_skip) { \
- while (isBLANK_A(UCHARAT(p))) \
+ while (p < stop_p && isBLANK_A(UCHARAT(p))) \
{ \
p++; \
} \
initial_listsv_len = SvCUR(listsv);
SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
- SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
assert(RExC_parse <= RExC_end);
invert = TRUE;
allow_mutiple_chars = FALSE;
MARK_NAUGHTY(1);
- SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
}
/* Check that they didn't say [:posix:] instead of [[:posix:]] */
output_posix_warnings(pRExC_state, posix_warnings);
}
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
+
if (RExC_parse >= stop_ptr) {
break;
}
- SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
-
if (UCHARAT(RExC_parse) == ']') {
break;
}
}
/* For each multi-character string ... */
- while (av_tindex(strings) >= 0) {
+ while (av_count(strings) > 0) {
/* ... Each entry is itself an array of code
* points. */
AV * this_string = (AV *) av_shift( strings);
- STRLEN cp_count = av_tindex(this_string) + 1;
+ STRLEN cp_count = av_count(this_string);
SV * final = newSV(cp_count * 4);
SvPVCLEAR(final);
/* Create another string of sequences of \x{...} */
- while (av_tindex(this_string) >= 0) {
+ while (av_count(this_string) > 0) {
SV * character = av_shift(this_string);
UV cp = SvUV(character);
}
} /* end of namedclass \blah */
- SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
/* If 'range' is set, 'value' is the ending of a range--check its
* validity. (If value isn't a single code point in the case of a
char* next_char_ptr = RExC_parse + 1;
/* Get the next real char after the '-' */
- SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
/* If the '-' is at the end of the class (just before the ']',
* it is a literal minus; otherwise it is a range */
#define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
/*
- re_dup_guts - duplicate a regexp.
+=for apidoc_section REGEXP Functions
+=for apidoc re_dup_guts
+Duplicate a regexp.
+
+This routine is expected to clone a given regexp structure. It is only
+compiled under USE_ITHREADS.
- This routine is expected to clone a given regexp structure. It is only
- compiled under USE_ITHREADS.
+After all of the core data stored in struct regexp is duplicated
+the regexp_engine.dupe method is used to copy any private data
+stored in the *pprivate pointer. This allows extensions to handle
+any duplication they need to do.
- After all of the core data stored in struct regexp is duplicated
- the regexp_engine.dupe method is used to copy any private data
- stored in the *pprivate pointer. This allows extensions to handle
- any duplication they need to do.
+=cut
See pregfree() and regfree_internal() if you change anything here.
*/