*
* All of these macros depend on the above RExC_ accessor macros, which
* in turns depend on a variable pRExC_state being in scope where they
- * are used. This is the based regexp parser context variable which is
+ * are used. This is the standard regexp parser context variable which is
* passed into every non-trivial parse function in this file.
*
* Note that the UTF macro is itself a wrapper around RExC_utf8, so all
static void
S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
- U32 depth, int is_inf)
+ U32 depth, int is_inf,
+ SSize_t min, SSize_t stopmin, SSize_t delta)
{
DECLARE_AND_GET_RE_DEBUG_FLAGS;
DEBUG_OPTIMISE_MORE_r({
if (!data)
return;
- Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
+ Perl_re_indentf(aTHX_ "%s: M/S/D: %" IVdf "/%" IVdf "/%" IVdf " Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
depth,
where,
+ min, stopmin, delta,
(IV)data->pos_min,
(IV)data->pos_delta,
(UV)data->flags
}
-# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
- S_debug_studydata(aTHX_ where, data, depth, is_inf)
+# define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) \
+ S_debug_studydata(aTHX_ where, data, depth, is_inf, min, stopmin, delta)
# define DEBUG_PEEP(str, scan, depth, flags) \
S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
#else
-# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
+# define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) NOOP
# define DEBUG_PEEP(str, scan, depth, flags) NOOP
#endif
}
data->last_end = -1;
data->flags &= ~SF_BEFORE_EOL;
- DEBUG_STUDYDATA("commit", data, 0, is_inf);
+ DEBUG_STUDYDATA("commit", data, 0, is_inf, -1, -1, -1);
}
/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
S_study_chunk(pTHX_
RExC_state_t *pRExC_state,
regnode **scanp, /* Start here (read-write). */
- SSize_t *minlenp,
+ SSize_t *minlenp, /* used for the minlen of substrings? */
SSize_t *deltap, /* Write maxlen-minlen here. */
regnode *last, /* Stop before this one. */
scan_data_t *data, /* string data about the pattern */
a higher caller is holding a ptr to them. */
)
{
- SSize_t final_minlen;
- /* There must be at least this number of characters to match */
- SSize_t min = 0;
- I32 pars = 0, code;
- regnode *scan = *scanp, *next;
- SSize_t delta = 0;
+ /* vars about the regnodes we are working with */
+ regnode *scan = *scanp; /* the current opcode we are inspecting */
+ regnode *next = NULL; /* the next opcode beyond scan, tmp var */
+ regnode *first_non_open = scan; /* FIXME: should this init to NULL?
+ the first non open regop, if the init
+ val IS an OPEN then we will skip past
+ it just after the var decls section */
+ I32 code = 0; /* temp var used to hold the optype of a regop */
+
+ /* vars about the min and max length of the pattern */
+ SSize_t min = 0; /* min length of this part of the pattern */
+ SSize_t stopmin = OPTIMIZE_INFTY; /* min length accounting for ACCEPT
+ this is adjusted down if we find
+ an ACCEPT */
+ SSize_t delta = 0; /* difference between min and max length
+ (not accounting for stopmin) */
+
+ /* vars about capture buffers in the pattern */
+ I32 pars = 0; /* count of OPEN opcodes */
+ I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; /* is this op an OPEN? */
+
+ /* vars about whether this pattern contains something that can match
+ * infinitely long strings, eg, X* or X+ */
int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
int is_inf_internal = 0; /* The studied chunk is infinite */
- I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
- scan_data_t data_fake;
- SV *re_trie_maxbuff = NULL;
- regnode *first_non_open = scan;
- SSize_t stopmin = OPTIMIZE_INFTY;
- scan_frame *frame = NULL;
+
+ /* scan_data_t (struct) is used to hold information about the substrings
+ * and start class we have extracted from the pattern */
+ scan_data_t data_fake; /* temp var used for recursing in some cases */
+
+ SV *re_trie_maxbuff = NULL; /* temp var used to hold whether we can do
+ trie optimizations */
+
+ scan_frame *frame = NULL; /* used as part of fake recursion */
+
DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_STUDY_CHUNK;
first_non_open=regnext(first_non_open);
}
-
fake_study_recurse:
DEBUG_r(
RExC_study_chunk_recursed_count++;
*/
bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
/* Peephole optimizer: */
- DEBUG_STUDYDATA("Peep", data, depth, is_inf);
+ DEBUG_STUDYDATA("Peep", data, depth, is_inf, min, stopmin, delta);
DEBUG_PEEP("Peep", scan, depth, flags);
}
if (flags & SCF_DO_STCLASS)
ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
+ DEBUG_STUDYDATA("end BRANCH", data, depth, is_inf, min, stopmin, delta);
}
if (code == IFTHEN && num < 2) /* Empty ELSE branch */
min1 = 0;
flags |= SCF_DO_STCLASS_OR;
}
}
+ DEBUG_STUDYDATA("pre TRIE", data, depth, is_inf, min, stopmin, delta);
if (PERL_ENABLE_TRIE_OPTIMISATION
&& OP(startbranch) == BRANCH
} /* end if ( prev) */
} /* TRIE_MAXBUF is non zero */
} /* do trie */
-
+ DEBUG_STUDYDATA("after TRIE", data, depth, is_inf, min, stopmin, delta);
}
else if ( code == BRANCHJ ) { /* single branch is optimized. */
scan = NEXTOPER(NEXTOPER(scan));
RExC_study_chunk_recursed_bytes, U8);
}
/* we havent recursed into this paren yet, so recurse into it */
- DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
+ DEBUG_STUDYDATA("gosub-set", data, depth, is_inf, min, stopmin, delta);
PAREN_SET(recursed_depth, paren);
my_recursed_depth= recursed_depth + 1;
} else {
- DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
+ DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf, min, stopmin, delta);
/* some form of infinite recursion, assume infinite length
* */
if (flags & SCF_DO_SUBSTR) {
(frame && frame->in_gosub) || OP(scan) == GOSUB
);
- DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
+ DEBUG_STUDYDATA("frame-new", data, depth, is_inf, min, stopmin, delta);
DEBUG_PEEP("fnew", scan, depth, flags);
frame = newframe;
ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
}
flags &= ~SCF_DO_STCLASS;
+ DEBUG_STUDYDATA("end EXACT", data, depth, is_inf, min, stopmin, delta);
}
else if (PL_regkind[OP(scan)] == EXACT) {
/* But OP != EXACT!, so is EXACTFish */
flags &= ~SCF_DO_STCLASS;
SvREFCNT_dec(EXACTF_invlist);
}
+ DEBUG_STUDYDATA("end EXACTish", data, depth, is_inf, min, stopmin, delta);
}
else if (REGNODE_VARIES(OP(scan))) {
SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
delta += (minnext + deltanext) * maxcount
- minnext * mincount;
}
+
+ if (data && data->flags & SCF_SEEN_ACCEPT) {
+ if (flags & SCF_DO_SUBSTR) {
+ scan_commit(pRExC_state, data, minlenp, is_inf);
+ flags &= ~SCF_DO_SUBSTR;
+ }
+ if (stopmin > min)
+ stopmin = min;
+ DEBUG_STUDYDATA("after-whilem accept", data, depth, is_inf, min, stopmin, delta);
+ }
/* Try powerful optimization CURLYX => CURLYN. */
if ( OP(oscan) == CURLYX && data
&& data->flags & SF_IN_PAR
last, &data_fake, stopparen,
recursed_depth, NULL, f, depth+1,
mutate_ok);
+
if (scan->flags) {
if ( deltanext < 0
|| deltanext > (I32) U8_MAX
|= SSC_MATCHES_EMPTY_STRING;
}
}
+ DEBUG_STUDYDATA("end LOOKAROUND", data, depth, is_inf, min, stopmin, delta);
}
#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
else {
if (OP(scan)==ACCEPT) {
/* m{(*ACCEPT)x} does not have to start with 'x' */
flags &= ~SCF_DO_STCLASS;
- if (data) {
+ if (data)
data->flags |= SCF_SEEN_ACCEPT;
- if (stopmin > min)
- stopmin = min;
- }
+ if (stopmin > min)
+ stopmin = min;
}
}
else if (OP(scan) == COMMIT) {
if (flags & SCF_DO_STCLASS)
ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
}
+ DEBUG_STUDYDATA("after JUMPTRIE", data, depth, is_inf, min, stopmin, delta);
}
if (flags & SCF_DO_SUBSTR) {
data->pos_min += min1;
}
}
scan= tail;
+ DEBUG_STUDYDATA("after TRIE study", data, depth, is_inf, min, stopmin, delta);
continue;
}
#else
/* we need to unwind recursion. */
depth = depth - 1;
- DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
+ DEBUG_STUDYDATA("frame-end", data, depth, is_inf, min, stopmin, delta);
DEBUG_PEEP("fend", scan, depth, flags);
/* restore previous context */
}
assert(!frame);
- DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
+ DEBUG_STUDYDATA("pre-fin", data, depth, is_inf, min, stopmin, delta);
+
+ if (min > stopmin) {
+ /* stopmin might be shorter than min if we saw an (*ACCEPT). If
+ this is the case then it means this pattern is variable length
+ and we need to ensure that the delta accounts for it. delta
+ represents the difference between min length and max length for
+ this part of the pattern. */
+ delta += min - stopmin;
+ min = stopmin;
+ }
*scanp = scan;
*deltap = is_inf_internal ? OPTIMIZE_INFTY : delta;
if (flags & SCF_TRIE_RESTUDY)
data->flags |= SCF_TRIE_RESTUDY;
- DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
-
- final_minlen = min < stopmin
- ? min : stopmin;
if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
- if (final_minlen > OPTIMIZE_INFTY - delta)
+ if (min > OPTIMIZE_INFTY - delta)
RExC_maxlen = OPTIMIZE_INFTY;
- else if (RExC_maxlen < final_minlen + delta)
- RExC_maxlen = final_minlen + delta;
+ else if (RExC_maxlen < min + delta)
+ RExC_maxlen = min + delta;
}
- return final_minlen;
+ DEBUG_STUDYDATA("post-fin", data, depth, is_inf, min, stopmin, delta);
+ return min;
}
/* add a data member to the struct reg_data attached to this regex, it should
return ret;
} else {
if (retarray)
- ret = newSVsv(&PL_sv_undef);
+ ret = newSV_type(SVt_NULL);
}
if (retarray)
av_push(retarray, ret);
return;
default:
fail_modifiers:
- RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
+ RExC_parse_inc_if_char();
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
NOT_REACHED; /*NOTREACHED*/
}
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse_inc();
}
vFAIL("Sequence (?... not terminated");
goto unterminated_verb_pattern;
}
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse_inc();
while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse_inc();
}
if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
unterminated_verb_pattern:
} /* End of switch */
if ( ! op ) {
- RExC_parse += UTF ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) : 1;
+ RExC_parse_inc_safe();
if (has_upper || verb_len == 0) {
vFAIL2utf8f( "Unknown verb pattern '%" UTF8f "'",
UTF8fARG(UTF, verb_len, start_verb));
RExC_parse_inc_by(1); /* past the '?' */
paren = *RExC_parse; /* might be a trailing NUL, if not
well-formed */
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse_inc();
if (RExC_parse > RExC_end) {
paren = '\0';
}
return handle_named_backref(pRExC_state, flagp,
segment_parse_start, ')');
}
- RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
+ RExC_parse_inc_if_char();
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL3("Sequence (%.*s...) not recognized",
(int) (RExC_parse - seqstart), seqstart);
case '?': /* (??...) */
is_logical = 1;
if (*RExC_parse != '{') {
- RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
+ RExC_parse_inc_if_char();
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL2utf8f(
"Sequence (%" UTF8f "...) not recognized",
"DEFINE"))
{
ret = reganode(pRExC_state, DEFINEP, 0);
- RExC_parse += DEFINE_len;
+ RExC_parse_inc_by(DEFINE_len);
is_define = 1;
goto insert_if_check_paren;
}
insert_if_check_paren:
if (UCHARAT(RExC_parse) != ')') {
- RExC_parse += UTF
- ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
- : 1;
+ RExC_parse_inc_safe();
vFAIL("Switch condition not recognized");
}
nextchar(pRExC_state);
#endif
return ret;
}
- RExC_parse += UTF
- ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
- : 1;
+ RExC_parse_inc_safe();
vFAIL("Unknown switch condition (?(...))");
}
case '[': /* (?[ ... ]) */
* converted a name to the \N{U+...} form. This include changing a
* name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
- RExC_parse += 2; /* Skip past the 'U+' */
+ RExC_parse_inc_by(2); /* Skip past the 'U+' */
/* Code points are separated by dots. The '}' terminates the whole
* thing. */
vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
- RExC_parse += len;
+ RExC_parse_inc_by(len);
if (cp > MAX_LEGAL_CP) {
vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
* \N{U+100.} )
* */
if (*RExC_parse != '.' || RExC_parse + 1 >= e) {
- RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
- ? UTF8SKIP(RExC_parse)
- : 1;
+ /* point to after 1st invalid */
+ RExC_parse_incf(RExC_orig_utf8);
/*Guard against malformed utf8*/
RExC_parse_set(MIN(e, RExC_parse));
goto bad_NU;
RExC_end - RExC_parse);
char * e = endbrace;
- RExC_parse += 2;
+ RExC_parse_inc_by(2);
if (! endbrace) {
vFAIL2("Missing right brace on \\%c{}", name);
&& UCHARAT(RExC_parse + 1) == '{'
&& UNLIKELY(! regcurly(RExC_parse + 1, RExC_end, NULL)))
{
- RExC_parse += 2;
+ RExC_parse_inc_by(2);
vFAIL("Unescaped left brace in regex is illegal here");
}
nextchar(pRExC_state);
/* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
vFAIL2("Sequence %.2s... not terminated", atom_parse_start);
} else {
- RExC_parse += 2;
+ RExC_parse_inc_by(2);
if (ch == '{') {
while (isBLANK(*RExC_parse)) {
RExC_parse_inc_by(1);
* compile time values are valid in all runtime cases */
REQUIRE_UNI_RULES(flagp, 0);
- ckWARNexperimental(RExC_parse,
- WARN_EXPERIMENTAL__REGEX_SETS,
- "The regex_sets feature is experimental");
-
/* Everything in this construct is a metacharacter. Operands begin with
* either a '\' (for an escape sequence), or a '[' for a bracketed
* character class. Any other character should be an operator, or
* so that everything gets evaluated down to a single operand, which is the
* result */
- sv_2mortal((SV *)(stack = newAV()));
- sv_2mortal((SV *)(fence_stack = newAV()));
+ stack = (AV*)newSV_type_mortal(SVt_PVAV);
+ fence_stack = (AV*)newSV_type_mortal(SVt_PVAV);
while (RExC_parse < RExC_end) {
I32 top_index; /* Index of top-most element in 'stack' */
break;
default:
- RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse_inc();
if (RExC_parse >= RExC_end) {
break;
}
} /* End of switch on next parse token */
- RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse_inc();
} /* End of loop parsing through the construct */
vFAIL("Syntax error in (?[...])");
value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
&numlen, UTF8_ALLOW_DEFAULT);
- RExC_parse += numlen;
+ RExC_parse_inc_by(numlen);
}
else {
value = UCHARAT(RExC_parse);
value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
&numlen, UTF8_ALLOW_DEFAULT);
- RExC_parse += numlen;
+ RExC_parse_inc_by(numlen);
}
else {
value = UCHARAT(RExC_parse);
} /* The \p isn't immediately followed by a '{' */
else if (! isALPHA(*RExC_parse)) {
- RExC_parse += (UTF)
- ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
- : 1;
+ RExC_parse_inc_safe();
vFAIL2("Character following \\%c must be '{' or a "
"single-character Unicode property name",
(U8) value);
{
/* going to die anyway; point to exact spot of
* failure */
- RExC_parse += (UTF)
- ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
- : 1;
+ RExC_parse_inc_safe();
vFAIL(message);
}
| PERL_SCAN_NOTIFY_ILLDIGIT;
numlen = (strict) ? 4 : 3;
value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
- RExC_parse += numlen;
+ RExC_parse_inc_by(numlen);
if (numlen != 3) {
if (strict) {
- RExC_parse += (UTF)
- ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
- : 1;
+ RExC_parse_inc_safe();
vFAIL("Need exactly 3 octal digits");
}
else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
|| UTF8_IS_INVARIANT(*RExC_parse)
|| UTF8_IS_START(*RExC_parse));
- RExC_parse += (UTF)
- ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
- : 1;
+ RExC_parse_inc_safe();
skip_to_be_ignored_text(pRExC_state, &RExC_parse,
FALSE /* Don't force /x */ );