* character folded sequences. Since a single character can fold into
* such a sequence, the minimum match length for this node is less than
* the number of characters in the node. This routine returns in
- * *min_subtract how many characters to subtract from the the actual
+ * *min_subtract how many characters to subtract from the actual
* length of the string to get a real minimum match length; it is 0 if
* there are no multi-char foldeds. This delta is used by the caller to
* adjust the min length of the match, and the delta between min and max,
/* EXACTF nodes need to know that the minimum length
* changed so that a sharp s in the string can match this
* ss in the pattern, but they remain EXACTF nodes, as they
- * won't match this unless the target string is is UTF-8,
+ * won't match this unless the target string is in UTF-8,
* which we don't know until runtime. EXACTFL nodes can't
* transform into EXACTFU nodes */
if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
} while (f);
}
+/* Follow the next-chain of the current node and optimize away
+ all the NOTHINGs from it.
+ */
+STATIC void
+S_rck_elide_nothing(pTHX_ regnode *node)
+{
+ dVAR;
+
+ PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
+
+ if (OP(node) != CURLYX) {
+ const int max = (reg_off_by_arg[OP(node)]
+ ? I32_MAX
+ /* I32 may be smaller than U16 on CRAYs! */
+ : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
+ int off = (reg_off_by_arg[OP(node)] ? ARG(node) : NEXT_OFF(node));
+ int noff;
+ regnode *n = node;
+
+ /* Skip NOTHING and LONGJMP. */
+ while (
+ (n = regnext(n))
+ && (
+ (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
+ || ((OP(n) == LONGJMP) && (noff = ARG(n)))
+ )
+ && off + noff < max
+ ) {
+ off += noff;
+ }
+ if (reg_off_by_arg[OP(node)])
+ ARG(node) = off;
+ else
+ NEXT_OFF(node) = off;
+ }
+ return;
+}
+
/* the return from this sub is the minimum length that could possibly match */
STATIC SSize_t
S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
I32 stopparen,
U32 recursed_depth,
regnode_ssc *and_withp,
- U32 flags, U32 depth)
+ U32 flags, U32 depth, bool was_mutate_ok)
/* scanp: Start here (read-write). */
/* deltap: Write maxlen-minlen here. */
/* last: Stop before this one. */
node length to get a real minimum (because
the folded version may be shorter) */
bool unfolded_multi_char = FALSE;
- bool mutate_ok = (frame && frame->in_gosub) ? 0 : 1;
+ /* avoid mutating ops if we are anywhere within the recursed or
+ * enframed handling for a GOSUB: the outermost level will handle it.
+ */
+ bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
/* Peephole optimizer: */
DEBUG_STUDYDATA("Peep", data, depth, is_inf);
DEBUG_PEEP("Peep", scan, depth, flags);
}
/* Follow the next-chain of the current node and optimize
- away all the NOTHINGs from it. */
- if (OP(scan) != CURLYX) {
- const int max = (reg_off_by_arg[OP(scan)]
- ? I32_MAX
- /* I32 may be smaller than U16 on CRAYs! */
- : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
- int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
- int noff;
- regnode *n = scan;
-
- /* Skip NOTHING and LONGJMP. */
- while ( (n = regnext(n))
- && ( (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
- || ((OP(n) == LONGJMP) && (noff = ARG(n))))
- && off + noff < max)
- off += noff;
- if (reg_off_by_arg[OP(scan)])
- ARG(scan) = off;
- else
- NEXT_OFF(scan) = off;
- }
+ away all the NOTHINGs from it.
+ */
+ rck_elide_nothing(scan);
/* The principal pseudo-switch. Cannot be a switch, since we look into
* several different things. */
/* DEFINEP study_chunk() recursion */
(void)study_chunk(pRExC_state, &scan, &minlen,
&deltanext, next, &data_fake, stopparen,
- recursed_depth, NULL, f, depth+1);
+ recursed_depth, NULL, f, depth+1, mutate_ok);
scan = next;
} else
/* recurse study_chunk() for each BRANCH in an alternation */
minnext = study_chunk(pRExC_state, &scan, minlenp,
&deltanext, next, &data_fake, stopparen,
- recursed_depth, NULL, f, depth+1);
+ recursed_depth, NULL, f, depth+1,
+ mutate_ok);
if (min1 > minnext)
min1 = minnext;
(mincount == 0
? (f & ~SCF_DO_SUBSTR)
: f)
- ,depth+1);
+ , depth+1, mutate_ok);
if (flags & SCF_DO_STCLASS)
data->start_class = oclass;
RExC_precomp)));
}
+ if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
+ || min >= SSize_t_MAX - minnext * mincount )
+ {
+ FAIL("Regexp out of space");
+ }
+
min += minnext * mincount;
is_inf_internal |= deltanext == OPTIMIZE_INFTY
|| (maxcount == REG_INFTY && minnext + deltanext > 0);
/* recurse study_chunk() on optimised CURLYX => CURLYM */
study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
NULL, stopparen, recursed_depth, NULL, 0,
- depth+1);
+ depth+1, mutate_ok);
}
else
oscan->flags = 0;
if (data && (fl & SF_HAS_EVAL))
data->flags |= SF_HAS_EVAL;
optimize_curly_tail:
- if (OP(oscan) != CURLYX) {
- while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
- && NEXT_OFF(next))
- NEXT_OFF(oscan) += NEXT_OFF(next);
- }
+ rck_elide_nothing(oscan);
continue;
default:
/* recurse study_chunk() for lookahead body */
minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
last, &data_fake, stopparen,
- recursed_depth, NULL, f, depth+1);
+ recursed_depth, NULL, f, depth+1,
+ mutate_ok);
if (scan->flags) {
if ( deltanext < 0
|| deltanext > (I32) U8_MAX
*minnextp = study_chunk(pRExC_state, &nscan, minnextp,
&deltanext, last, &data_fake,
stopparen, recursed_depth, NULL,
- f, depth+1);
+ f, depth+1, mutate_ok);
if (scan->flags) {
assert(0); /* This code has never been tested since this
is normally not compiled */
/* optimise study_chunk() for TRIE */
minnext = study_chunk(pRExC_state, &scan, minlenp,
&deltanext, (regnode *)nextbranch, &data_fake,
- stopparen, recursed_depth, NULL, f, depth+1);
+ stopparen, recursed_depth, NULL, f, depth+1,
+ mutate_ok);
}
if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
nextbranch= regnext((regnode*)nextbranch);
/* we make the assumption here that each op in the list of
* op_siblings maps to one SV pushed onto the stack,
* except for code blocks, with have both an OP_NULL and
- * and OP_CONST.
+ * an OP_CONST.
* This allows us to match up the list of SVs against the
* list of OPs to find the next code block.
*
PERL_ARGS_ASSERT_SET_REGEX_PV;
/* make sure PL_bitcount bounds not exceeded */
- assert(sizeof(STD_PAT_MODS) <= 8);
+ STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
SvPOK_on(Rx);
* length of the pattern. Patches welcome to improve that guess. That amount
* of space is malloc'd and then immediately freed, and then clawed back node
* by node. This design is to minimze, to the extent possible, memory churn
- * when doing the the reallocs.
+ * when doing the reallocs.
*
* A separate parentheses counting pass may be needed in some cases.
* (Previously the sizing pass did this.) Patches welcome to reduce the number
/* We have that number in RExC_npar */
RExC_total_parens = RExC_npar;
+
+ /* XXX For backporting, use long jumps if there is any possibility of
+ * overflow */
+ if (RExC_size > U16_MAX && ! RExC_use_BRANCHJ) {
+ RExC_use_BRANCHJ = TRUE;
+ flags |= RESTART_PARSE;
+ }
}
else if (! MUST_RESTART(flags)) {
ReREFCNT_dec(Rx);
&data, -1, 0, NULL,
SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
| (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
- 0);
+ 0, TRUE);
CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
? SCF_TRIE_DOING_RESTUDY
: 0),
- 0);
+ 0, TRUE);
CHECK_RESTUDY_GOTO_butfirst(NOOP);
* one of them */
while (i_a < len_a && i_b < len_b) {
UV cp; /* The element to potentially add to the union's array */
- bool cp_in_set; /* is it in the the input list's set or not */
+ bool cp_in_set; /* is it in the input list's set or not */
/* We need to take one or the other of the two inputs for the union.
* Since we are merging two sorted lists, we take the smaller of the
fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
/* The only code points that aren't folded in a UTF EXACTFish
- * node are are the problematic ones in EXACTFL nodes */
+ * node are the problematic ones in EXACTFL nodes */
if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
/* We need to check for the possibility that this EXACTFL
* node begins with a multi-char fold. Therefore we fold
* have to map that back to the original */
if (need_to_fold_loc) {
upper_fill = loc_correspondence[s - s_start];
- Safefree(locfold_buf);
- Safefree(loc_correspondence);
-
if (upper_fill == 0) {
FAIL2("panic: loc_correspondence[%d] is 0",
(int) (s - s_start));
}
goto reparse;
}
- else if (need_to_fold_loc) {
- Safefree(locfold_buf);
- Safefree(loc_correspondence);
- }
/* Here the node consists entirely of non-final multi-char
* folds. (Likely it is all 'f's or all 's's.) There's no
* whole thing */
len = old_s - s0;
}
+
+ if (need_to_fold_loc) {
+ Safefree(locfold_buf);
+ Safefree(loc_correspondence);
+ }
} /* End of verifying node ends with an appropriate char */
/* We need to start the next node at the character that didn't fit
/* If more than a single node returned, the nested
* parens evaluated to more than just a (?[...]),
* which isn't legal */
- || node != 1) {
+ || RExC_emit != orig_emit
+ + NODE_STEP_REGNODE
+ + regarglen[REGEX_SET])
+ {
vFAIL("Expecting interpolated extended charclass");
}
resultant_invlist = (SV *) ARGp(REGNODE_p(node));
* printable should have each end point be a portable value
* for it (preferably like 'A', but we don't warn if it is
* a (portable) Unicode name or code point), and the range
- * must be be all digits or all letters of the same case.
+ * must be all digits or all letters of the same case.
* Otherwise, the range is non-portable and unclear as to
* what it contains */
if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
* points) in the ASCII range, so we can't use it here to
* artificially restrict the fold domain, so we check if
* the class does or does not match some EXACTFish node.
- * Further, if we aren't under /i, and and the folded-to
+ * Further, if we aren't under /i, and the folded-to
* character is part of a multi-character fold, we can't do
* this optimization, as the sequence around it could be
* that multi-character fold, and we don't here know the
av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
}
+ /* (Note that if any of this changes, the size calculations in
+ * S_optimize_regclass() might need to be updated.) */
+
if (only_utf8_locale_list) {
av_store(av, ONLY_LOCALE_MATCHES_INDEX,
SvREFCNT_inc_NN(only_utf8_locale_list));
PERL_UNUSED_ARG(depth);
#endif
- /* Find last node. */
+ /* The final node in the chain is the first one with a nonzero next pointer
+ * */
scan = (regnode_offset) p;
for (;;) {
regnode * const temp = regnext(REGNODE_p(scan));
scan = REGNODE_OFFSET(temp);
}
+ /* Populate this node's next pointer */
assert(val >= scan);
if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
assert((UV) (val - scan) <= U32_MAX);
: (OP(o) == ANYOFH || OP(o) == ANYOFR)
? 0xFF
: lowest;
- Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
- if (lowest != highest) {
- Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
+#ifndef EBCDIC
+ if (OP(o) != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
+#endif
+ {
+ Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
+ if (lowest != highest) {
+ Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
+ }
+ Perl_sv_catpvf(aTHX_ sv, ")");
}
- Perl_sv_catpvf(aTHX_ sv, ")");
}
SvREFCNT_dec(unresolved);
int inverted_bias, as_is_bias;
- /* We will apply our bias to whichever of the the results doesn't have
+ /* We will apply our bias to whichever of the results doesn't have
* the '^' */
if (invert) {
invert = FALSE;
SV * empty; /* For matching zero length names */
SV * must_sv; /* Contains the substring, if any, that must be
in a name for the subpattern to match */
- char * must; /* The PV of 'must' */
+ const char * must; /* The PV of 'must' */
STRLEN must_len; /* And its length */
SV * syllable_name = NULL; /* For Hangul syllables */
const char hangul_prefix[] = "HANGUL SYLLABLE ";