regnode *next_regnode; /* next node to process when last is reached */
U32 prev_recursed_depth;
I32 stopparen; /* what stopparen do we use */
+ bool in_gosub; /* this or an outer frame is for GOSUB */
struct scan_frame *this_prev_frame; /* this previous frame */
struct scan_frame *prev_frame; /* previous frame */
? OPTIMIZE_INFTY
: (l
? data->last_start_max
+ /* temporary underflow guard for 5.32 */
+ : data->pos_delta < 0 ? OPTIMIZE_INFTY
: (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
? OPTIMIZE_INFTY
: data->pos_min + data->pos_delta));
}
- if (data->flags & SF_BEFORE_EOL)
- data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
- else
- data->substrs[i].flags &= ~SF_BEFORE_EOL;
+ data->substrs[i].flags &= ~SF_BEFORE_EOL;
+ data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
data->substrs[i].minlenp = minlenp;
data->substrs[i].lookbehind = 0;
}
* character folded sequences. Since a single character can fold into
* such a sequence, the minimum match length for this node is less than
* the number of characters in the node. This routine returns in
- * *min_subtract how many characters to subtract from the the actual
+ * *min_subtract how many characters to subtract from the actual
* length of the string to get a real minimum match length; it is 0 if
* there are no multi-char foldeds. This delta is used by the caller to
* adjust the min length of the match, and the delta between min and max,
s++;
}
}
- else {
+ else if (OP(scan) != EXACTFAA_NO_TRIE) {
/* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
* folds that are all Latin1. As explained in the comments
/* EXACTF nodes need to know that the minimum length
* changed so that a sharp s in the string can match this
* ss in the pattern, but they remain EXACTF nodes, as they
- * won't match this unless the target string is is UTF-8,
+ * won't match this unless the target string is in UTF-8,
* which we don't know until runtime. EXACTFL nodes can't
* transform into EXACTFU nodes */
if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
} while (f);
}
+/* Follow the next-chain of the current node and optimize away
+ all the NOTHINGs from it.
+
+ 'node' is modified in place; the nodes that get skipped over are not
+ themselves changed.  Nothing at all is done when 'node' is a CURLYX.
+
+ The forward offset of 'node' is read from ARG(node) when
+ reg_off_by_arg[OP(node)] is set, and from NEXT_OFF(node) otherwise.
+ Starting from that offset, each following node reached via regnext()
+ that is either of kind NOTHING with a non-zero NEXT_OFF, or a LONGJMP
+ with a non-zero ARG, has its own offset added on, so that 'node' ends
+ up pointing past the run of such nodes.  The accumulated offset is
+ then written back to the same field it was read from.
+
+ Accumulation stops early once adding the next offset would reach
+ 'max', which is I32_MAX for ARG offsets and otherwise the smaller of
+ I32_MAX and U16_MAX.
+ */
+STATIC void
+S_rck_elide_nothing(pTHX_ regnode *node)
+{
+ dVAR;
+
+ PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
+
+ if (OP(node) != CURLYX) {
+ const int max = (reg_off_by_arg[OP(node)]
+ ? I32_MAX
+ /* I32 may be smaller than U16 on CRAYs! */
+ : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
+ int off = (reg_off_by_arg[OP(node)] ? ARG(node) : NEXT_OFF(node));
+ int noff;
+ regnode *n = node;
+
+ /* Skip NOTHING and LONGJMP.  'noff' is assigned inside the loop
+ * condition: short-circuit evaluation means 'off + noff' is only
+ * evaluated after one of those two assignments has produced a
+ * non-zero value.  A zero offset on a NOTHING or LONGJMP node, or a
+ * false return from regnext(), therefore ends the walk as well. */
+ while (
+ (n = regnext(n))
+ && (
+ (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
+ || ((OP(n) == LONGJMP) && (noff = ARG(n)))
+ )
+ && off + noff < max
+ ) {
+ off += noff;
+ }
+ if (reg_off_by_arg[OP(node)])
+ ARG(node) = off;
+ else
+ NEXT_OFF(node) = off;
+ }
+ return;
+}
+
/* the return from this sub is the minimum length that could possibly match */
STATIC SSize_t
S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
I32 stopparen,
U32 recursed_depth,
regnode_ssc *and_withp,
- U32 flags, U32 depth)
+ U32 flags, U32 depth, bool was_mutate_ok)
/* scanp: Start here (read-write). */
/* deltap: Write maxlen-minlen here. */
/* last: Stop before this one. */
node length to get a real minimum (because
the folded version may be shorter) */
bool unfolded_multi_char = FALSE;
+ /* avoid mutating ops if we are anywhere within the recursed or
+ * enframed handling for a GOSUB: the outermost level will handle it.
+ */
+ bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
/* Peephole optimizer: */
DEBUG_STUDYDATA("Peep", data, depth, is_inf);
DEBUG_PEEP("Peep", scan, depth, flags);
* parsing code, as each (?:..) is handled by a different invocation of
* reg() -- Yves
*/
- if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT
- && OP(scan) != LEXACT_REQ8)
+ if (PL_regkind[OP(scan)] == EXACT
+ && OP(scan) != LEXACT
+ && OP(scan) != LEXACT_REQ8
+ && mutate_ok
+ ) {
join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
0, NULL, depth + 1);
+ }
/* Follow the next-chain of the current node and optimize
- away all the NOTHINGs from it. */
- if (OP(scan) != CURLYX) {
- const int max = (reg_off_by_arg[OP(scan)]
- ? I32_MAX
- /* I32 may be smaller than U16 on CRAYs! */
- : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
- int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
- int noff;
- regnode *n = scan;
-
- /* Skip NOTHING and LONGJMP. */
- while ( (n = regnext(n))
- && ( (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
- || ((OP(n) == LONGJMP) && (noff = ARG(n))))
- && off + noff < max)
- off += noff;
- if (reg_off_by_arg[OP(scan)])
- ARG(scan) = off;
- else
- NEXT_OFF(scan) = off;
- }
+ away all the NOTHINGs from it.
+ */
+ rck_elide_nothing(scan);
/* The principal pseudo-switch. Cannot be a switch, since we look into
* several different things. */
/* DEFINEP study_chunk() recursion */
(void)study_chunk(pRExC_state, &scan, &minlen,
&deltanext, next, &data_fake, stopparen,
- recursed_depth, NULL, f, depth+1);
+ recursed_depth, NULL, f, depth+1, mutate_ok);
scan = next;
} else
/* recurse study_chunk() for each BRANCH in an alternation */
minnext = study_chunk(pRExC_state, &scan, minlenp,
&deltanext, next, &data_fake, stopparen,
- recursed_depth, NULL, f, depth+1);
+ recursed_depth, NULL, f, depth+1,
+ mutate_ok);
if (min1 > minnext)
min1 = minnext;
}
}
- if (PERL_ENABLE_TRIE_OPTIMISATION &&
- OP( startbranch ) == BRANCH )
- {
+ if (PERL_ENABLE_TRIE_OPTIMISATION
+ && OP(startbranch) == BRANCH
+ && mutate_ok
+ ) {
/* demq.
Assuming this was/is a branch we are dealing with: 'scan'
newframe->stopparen = stopparen;
newframe->prev_recursed_depth = recursed_depth;
newframe->this_prev_frame= frame;
+ newframe->in_gosub = (
+ (frame && frame->in_gosub) || OP(scan) == GOSUB
+ );
DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
DEBUG_PEEP("fnew", scan, depth, flags);
offset, later match for variable offset. */
if (data->last_end == -1) { /* Update the start info. */
data->last_start_min = data->pos_min;
- data->last_start_max = is_inf
- ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
+ data->last_start_max =
+ is_inf ? OPTIMIZE_INFTY
+ : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
+ ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
}
sv_catpvn(data->last_found, STRING(scan), bytelen);
if (UTF)
&& isALPHA_A(*s)
&& ( OP(scan) == EXACTFAA
|| ( OP(scan) == EXACTFU
- && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s))))
- {
+ && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
+ && mutate_ok
+ ) {
U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
OP(scan) = ANYOFM;
/* This temporary node can now be turned into EXACTFU, and
* must, as regexec.c doesn't handle it */
- if (OP(next) == EXACTFU_S_EDGE) {
+ if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
OP(next) = EXACTFU;
}
&& isALPHA_A(* STRING(next))
&& ( OP(next) == EXACTFAA
|| ( OP(next) == EXACTFU
- && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))))
- {
+ && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
+ && mutate_ok
+ ) {
/* These differ in just one bit */
U8 mask = ~ ('A' ^ 'a');
(mincount == 0
? (f & ~SCF_DO_SUBSTR)
: f)
- ,depth+1);
+ , depth+1, mutate_ok);
if (flags & SCF_DO_STCLASS)
data->start_class = oclass;
RExC_precomp)));
}
+ if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
+ || min >= SSize_t_MAX - minnext * mincount )
+ {
+ FAIL("Regexp out of space");
+ }
+
min += minnext * mincount;
is_inf_internal |= deltanext == OPTIMIZE_INFTY
|| (maxcount == REG_INFTY && minnext + deltanext > 0);
if ( OP(oscan) == CURLYX && data
&& data->flags & SF_IN_PAR
&& !(data->flags & SF_HAS_EVAL)
- && !deltanext && minnext == 1 ) {
+ && !deltanext && minnext == 1
+ && mutate_ok
+ ) {
/* Try to optimize to CURLYN. */
regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
regnode * const nxt1 = nxt;
&& !(data->flags & SF_HAS_EVAL)
&& !deltanext /* atom is fixed width */
&& minnext != 0 /* CURLYM can't handle zero width */
-
/* Nor characters whose fold at run-time may be
* multi-character */
&& ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
+ && mutate_ok
) {
/* XXXX How to optimize if data == 0? */
/* Optimize to a simpler form. */
/* recurse study_chunk() on optimised CURLYX => CURLYM */
study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
NULL, stopparen, recursed_depth, NULL, 0,
- depth+1);
+ depth+1, mutate_ok);
}
else
oscan->flags = 0;
if (data && (fl & SF_HAS_EVAL))
data->flags |= SF_HAS_EVAL;
optimize_curly_tail:
- if (OP(oscan) != CURLYX) {
- while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
- && NEXT_OFF(next))
- NEXT_OFF(oscan) += NEXT_OFF(next);
- }
+ rck_elide_nothing(oscan);
continue;
default:
(regnode_charclass *) scan);
break;
- case NANYOFM:
+ case NANYOFM: /* NANYOFM already contains the inversion of the
+ input ANYOF data, so, unlike things like
+ NPOSIXA, don't change 'invert' to TRUE */
+ /* FALLTHROUGH */
case ANYOFM:
{
SV* cp_list = get_ANYOFM_contents(scan);
/* recurse study_chunk() for lookahead body */
minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
last, &data_fake, stopparen,
- recursed_depth, NULL, f, depth+1);
+ recursed_depth, NULL, f, depth+1,
+ mutate_ok);
if (scan->flags) {
if ( deltanext < 0
|| deltanext > (I32) U8_MAX
*minnextp = study_chunk(pRExC_state, &nscan, minnextp,
&deltanext, last, &data_fake,
stopparen, recursed_depth, NULL,
- f, depth+1);
+ f, depth+1, mutate_ok);
if (scan->flags) {
assert(0); /* This code has never been tested since this
is normally not compiled */
/* optimise study_chunk() for TRIE */
minnext = study_chunk(pRExC_state, &scan, minlenp,
&deltanext, (regnode *)nextbranch, &data_fake,
- stopparen, recursed_depth, NULL, f, depth+1);
+ stopparen, recursed_depth, NULL, f, depth+1,
+ mutate_ok);
}
if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
nextbranch= regnext((regnode*)nextbranch);
/* we make the assumption here that each op in the list of
* op_siblings maps to one SV pushed onto the stack,
* except for code blocks, with have both an OP_NULL and
- * and OP_CONST.
+ * an OP_CONST.
* This allows us to match up the list of SVs against the
* list of OPs to find the next code block.
*
pRExC_state->code_blocks->count -= n;
n = 0;
}
- else {
+ else {
/* ... or failing that, try "" overload */
while (SvAMAGIC(msv)
&& (sv = AMG_CALLunary(msv, string_amg))
PERL_ARGS_ASSERT_SET_REGEX_PV;
/* make sure PL_bitcount bounds not exceeded */
- assert(sizeof(STD_PAT_MODS) <= 8);
+ STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
SvPOK_on(Rx);
* length of the pattern. Patches welcome to improve that guess. That amount
* of space is malloc'd and then immediately freed, and then clawed back node
* by node. This design is to minimze, to the extent possible, memory churn
- * when doing the the reallocs.
+ * when doing the reallocs.
*
* A separate parentheses counting pass may be needed in some cases.
* (Previously the sizing pass did this.) Patches welcome to reduce the number
/* We have that number in RExC_npar */
RExC_total_parens = RExC_npar;
+
+ /* XXX For backporting, use long jumps if there is any possibility of
+ * overflow */
+ if (RExC_size > U16_MAX && ! RExC_use_BRANCHJ) {
+ RExC_use_BRANCHJ = TRUE;
+ flags |= RESTART_PARSE;
+ }
}
else if (! MUST_RESTART(flags)) {
ReREFCNT_dec(Rx);
&data, -1, 0, NULL,
SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
| (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
- 0);
+ 0, TRUE);
CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
? SCF_TRIE_DOING_RESTUDY
: 0),
- 0);
+ 0, TRUE);
CHECK_RESTUDY_GOTO_butfirst(NOOP);
* one of them */
while (i_a < len_a && i_b < len_b) {
UV cp; /* The element to potentially add to the union's array */
- bool cp_in_set; /* is it in the the input list's set or not */
+ bool cp_in_set; /* is it in the input list's set or not */
/* We need to take one or the other of the two inputs for the union.
* Since we are merging two sorted lists, we take the smaller of the
fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
/* The only code points that aren't folded in a UTF EXACTFish
- * node are are the problematic ones in EXACTFL nodes */
+ * node are the problematic ones in EXACTFL nodes */
if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
/* We need to check for the possibility that this EXACTFL
* node begins with a multi-char fold. Therefore we fold
bool is_logical = 0;
const char * const seqstart = RExC_parse;
const char * endptr;
+ const char non_existent_group_msg[]
+ = "Reference to nonexistent group";
+ const char impossible_group[] = "Invalid reference to group";
+
if (has_intervening_patws) {
RExC_parse++;
vFAIL("In '(?...)', the '(' and '?' must be adjacent");
) {
num = (I32)unum;
RExC_parse = (char*)endptr;
- } else
- num = I32_MAX;
+ }
+ else { /* Overflow, or something like that. Position
+ beyond all digits for the message */
+ while (RExC_parse < RExC_end && isDIGIT(*RExC_parse)) {
+ RExC_parse++;
+ }
+ vFAIL(impossible_group);
+ }
if (is_neg) {
- /* Some limit for num? */
+ /* -num is always representable on 1 and 2's complement
+ * machines */
num = -num;
}
}
vFAIL("Expecting close bracket");
gen_recurse_regop:
- if ( paren == '-' ) {
+ if (paren == '-' || paren == '+') {
+
+ /* Don't overflow */
+ if (UNLIKELY(I32_MAX - RExC_npar < num)) {
+ RExC_parse++;
+ vFAIL(impossible_group);
+ }
+
/*
Diagram of capture buffer numbering.
Top line is the normal capture buffer numbers
Bottom line is the negative indexing as from
the X (the (?-2))
- + 1 2 3 4 5 X 6 7
+ 1 2 3 4 5 X Y 6 7
+ /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
/(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
- - 5 4 3 2 1 X x x
+ - 5 4 3 2 1 X Y x x
+ Resolve to absolute group. Recall that RExC_npar is +1 of
+ the actual parenthesis group number. For lookahead, we
+ have to compensate for that. Using the above example, when
+ we get to Y in the parse, num is 2 and RExC_npar is 6. We
+ want 7 for +2, and 4 for -2.
*/
- num = RExC_npar + num;
- if (num < 1) {
+ if ( paren == '+' ) {
+ num--;
+ }
- /* It might be a forward reference; we can't fail until
- * we know, by completing the parse to get all the
- * groups, and then reparsing */
- if (ALL_PARENS_COUNTED) {
- RExC_parse++;
- vFAIL("Reference to nonexistent group");
- }
- else {
- REQUIRE_PARENS_PASS;
- }
+ num += RExC_npar;
+
+ if (paren == '-' && num < 1) {
+ RExC_parse++;
+ vFAIL(non_existent_group_msg);
}
- } else if ( paren == '+' ) {
- num = RExC_npar + num - 1;
}
- /* We keep track how many GOSUB items we have produced.
- To start off the ARG2L() of the GOSUB holds its "id",
- which is used later in conjunction with RExC_recurse
- to calculate the offset we need to jump for the GOSUB,
- which it will store in the final representation.
- We have to defer the actual calculation until much later
- as the regop may move.
- */
- ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
if (num >= RExC_npar) {
/* It might be a forward reference; we can't fail until we
if (ALL_PARENS_COUNTED) {
if (num >= RExC_total_parens) {
RExC_parse++;
- vFAIL("Reference to nonexistent group");
+ vFAIL(non_existent_group_msg);
}
}
else {
REQUIRE_PARENS_PASS;
}
}
+
+ /* We keep track how many GOSUB items we have produced.
+ To start off the ARG2L() of the GOSUB holds its "id",
+ which is used later in conjunction with RExC_recurse
+ to calculate the offset we need to jump for the GOSUB,
+ which it will store in the final representation.
+ We have to defer the actual calculation until much later
+ as the regop may move.
+ */
+ ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
RExC_recurse_count++;
DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
"%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
/* SBOL is shared with /^/ so we set the flags so we can tell
* /\A/ from /^/ in split. */
FLAGS(REGNODE_p(ret)) = 1;
+ *flagp |= SIMPLE; /* Wrong, but too late to fix for 5.32 */
}
- *flagp |= SIMPLE;
goto finish_meta_pat;
case 'G':
if (RExC_pm_flags & PMf_WILDCARD) {
}
else {
ret = reg_node(pRExC_state, SEOL);
+ *flagp |= SIMPLE; /* Wrong, but too late to fix for 5.32 */
}
- *flagp |= SIMPLE;
RExC_seen_zerolen++; /* Do not optimize RE away */
goto finish_meta_pat;
case 'z':
}
else {
ret = reg_node(pRExC_state, EOS);
+ *flagp |= SIMPLE; /* Wrong, but too late to fix for 5.32 */
}
- *flagp |= SIMPLE;
RExC_seen_zerolen++; /* Do not optimize RE away */
goto finish_meta_pat;
case 'C':
* have to map that back to the original */
if (need_to_fold_loc) {
upper_fill = loc_correspondence[s - s_start];
- Safefree(locfold_buf);
- Safefree(loc_correspondence);
-
if (upper_fill == 0) {
FAIL2("panic: loc_correspondence[%d] is 0",
(int) (s - s_start));
}
goto reparse;
}
- else if (need_to_fold_loc) {
- Safefree(locfold_buf);
- Safefree(loc_correspondence);
- }
/* Here the node consists entirely of non-final multi-char
* folds. (Likely it is all 'f's or all 's's.) There's no
* whole thing */
len = old_s - s0;
}
+
+ if (need_to_fold_loc) {
+ Safefree(locfold_buf);
+ Safefree(loc_correspondence);
+ }
} /* End of verifying node ends with an appropriate char */
/* We need to start the next node at the character that didn't fit
/* If more than a single node returned, the nested
* parens evaluated to more than just a (?[...]),
* which isn't legal */
- || node != 1) {
+ || RExC_emit != orig_emit
+ + NODE_STEP_REGNODE
+ + regarglen[REGEX_SET])
+ {
vFAIL("Expecting interpolated extended charclass");
}
resultant_invlist = (SV *) ARGp(REGNODE_p(node));
goto regclass_failed;
}
+ assert(current);
+
/* regclass() will return with parsing just the \ sequence,
* leaving the parse pointer at the next thing to parse */
RExC_parse--;
goto regclass_failed;
}
- if (! current) {
- break;
- }
+ assert(current);
/* function call leaves parse pointing to the ']', except if we
* faked it */
if (RExC_sets_depth) { /* If within a recursive call, return in a special
regnode */
RExC_parse++;
- node = regpnode(pRExC_state, REGEX_SET, (void *) final);
+ node = regpnode(pRExC_state, REGEX_SET, final);
}
else {
PERL_UNUSED_ARG(depth);
#endif
+ assert(! (ret_invlist && allow_mutiple_chars));
/* If wants an inversion list returned, we can't optimize to something
* else. */
assert(prop_definition || strings);
if (strings) {
- if (! RExC_in_multi_char_class) {
+ if (ret_invlist) {
+ if (! prop_definition) {
+ RExC_parse = e + 1;
+ vFAIL("Unicode string properties are not implemented in (?[...])");
+ }
+ else {
+ ckWARNreg(e + 1,
+ "Using just the single character results"
+ " returned by \\p{} in (?[...])");
+ }
+ }
+ else if (! RExC_in_multi_char_class) {
if (invert ^ (value == 'P')) {
RExC_parse = e + 1;
vFAIL("Inverting a character class which contains"
* printable should have each end point be a portable value
* for it (preferably like 'A', but we don't warn if it is
* a (portable) Unicode name or code point), and the range
- * must be be all digits or all letters of the same case.
+ * must be all digits or all letters of the same case.
* Otherwise, the range is non-portable and unclear as to
* what it contains */
if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
if (ret_invlist) {
*ret_invlist = cp_list;
- return RExC_emit;
+ return (cp_list) ? RExC_emit : 0;
}
if (anyof_flags & ANYOF_LOCALE_FLAGS) {
* points) in the ASCII range, so we can't use it here to
* artificially restrict the fold domain, so we check if
* the class does or does not match some EXACTFish node.
- * Further, if we aren't under /i, and and the folded-to
+ * Further, if we aren't under /i, and the folded-to
* character is part of a multi-character fold, we can't do
* this optimization, as the sequence around it could be
* that multi-character fold, and we don't here know the
av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
}
+ /* (Note that if any of this changes, the size calculations in
+ * S_optimize_regclass() might need to be updated.) */
+
if (only_utf8_locale_list) {
av_store(av, ONLY_LOCALE_MATCHES_INDEX,
SvREFCNT_inc_NN(only_utf8_locale_list));
}
/*
-- regpnode - emit a temporary node with a void* argument
+- regpnode - emit a temporary node with a SV* argument
*/
STATIC regnode_offset /* Location. */
-S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, void * arg)
+S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
{
const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regpnode");
regnode_offset ptr = ret;
PERL_UNUSED_ARG(depth);
#endif
- /* Find last node. */
+ /* The final node in the chain is the first one with a nonzero next pointer
+ * */
scan = (regnode_offset) p;
for (;;) {
regnode * const temp = regnext(REGNODE_p(scan));
scan = REGNODE_OFFSET(temp);
}
+ /* Populate this node's next pointer */
assert(val >= scan);
if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
assert((UV) (val - scan) <= U32_MAX);
: (OP(o) == ANYOFH || OP(o) == ANYOFR)
? 0xFF
: lowest;
- Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
- if (lowest != highest) {
- Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
+#ifndef EBCDIC
+ if (OP(o) != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
+#endif
+ {
+ Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
+ if (lowest != highest) {
+ Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
+ }
+ Perl_sv_catpvf(aTHX_ sv, ")");
}
- Perl_sv_catpvf(aTHX_ sv, ")");
}
SvREFCNT_dec(unresolved);
Perl_re_intuit_string(pTHX_ REGEXP * const r)
{ /* Assume that RE_INTUIT is set */
/* Returns an SV containing a string that must appear in the target for it
- * to match */
+ * to match, or NULL if nothing is known that must match.
+ *
+ * CAUTION: the SV can be freed during execution of the regex engine */
struct regexp *const prog = ReANY(r);
DECLARE_AND_GET_RE_DEBUG_FLAGS;
int inverted_bias, as_is_bias;
- /* We will apply our bias to whichever of the the results doesn't have
+ /* We will apply our bias to whichever of the results doesn't have
* the '^' */
if (invert) {
invert = FALSE;
# define CUR_CONTEXT aTHX
# define ORIGINAL_CONTEXT save_aTHX
# else
-# define DECLARATION_FOR_GLOBAL_CONTEXT
+# define DECLARATION_FOR_GLOBAL_CONTEXT dNOOP
# define SWITCH_TO_GLOBAL_CONTEXT NOOP
# define RESTORE_CONTEXT NOOP
# define CUR_CONTEXT NULL
where we are now */
bool found_matches = FALSE; /* Did any name match so far? */
SV * empty; /* For matching zero length names */
- SV * must; /* What substring, if any, must be in a name
- for the subpattern to match */
+ SV * must_sv; /* Contains the substring, if any, that must be
+ in a name for the subpattern to match */
+ const char * must; /* The PV of 'must_sv' */
+ STRLEN must_len; /* And its length */
SV * syllable_name = NULL; /* For Hangul syllables */
const char hangul_prefix[] = "HANGUL SYLLABLE ";
const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
/* Compile the subpattern consisting of the name being looked for */
subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
- must = re_intuit_string(subpattern_re);
+
+ must_sv = re_intuit_string(subpattern_re);
+ if (must_sv) {
+ /* regexec.c can free the re_intuit_string() return. GH #17734 */
+ must_sv = sv_2mortal(newSVsv(must_sv));
+ must = SvPV(must_sv, must_len);
+ }
+ else {
+ must = "";
+ must_len = 0;
+ }
+
+ /* (Note: 'must' could contain a NUL. And yet we use strspn() below on it.
+ * This works because the NUL causes the function to return early, thus
+ * showing that there are characters in it other than the acceptable ones,
+ * which is our desired result.) */
+
prog = ReANY(subpattern_re);
/* If only nothing is matched, skip to where empty names are looked for */
/* And match against the string of all names /gc. Don't even try if it
* must match a character not found in any name. */
- if ( ! must
- || SvCUR(must) == 0
- || strspn(SvPVX(must), "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()")
- == SvCUR(must))
+ if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
{
while (execute_wildcard(subpattern_re,
cur_pos,
* one of the characters in that isn't in any Hangul syllable. */
if ( prog->minlen <= (SSize_t) syl_max_len
&& prog->maxlen > 0
- && ( ! must
- || SvCUR(must) == 0
- || strspn(SvPVX(must), "\n ABCDEGHIJKLMNOPRSTUWY") == SvCUR(must)))
+ && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
{
/* These constants, names, values, and algorithm are adapted from the
* Unicode standard, version 5.1, section 3.12, and should never
* series */
if ( prog->minlen <= (SSize_t) SvCUR(algo_name)
&& prog->maxlen > 0
- && ( ! must
- || SvCUR(must) == 0
- || strspn(SvPVX(must), legal) == SvCUR(must)))
+ && (strspn(must, legal) == must_len))
{
for (j = low; j <= high; j++) { /* For each code point in the series */