#define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
#define RExC_unlexed_names (pRExC_state->unlexed_names)
+
+/***********************************************************************/
+/* UTILITY MACROS FOR ADVANCING OR SETTING THE PARSE "CURSOR" RExC_parse
+ *
+ * All of these macros depend on the above RExC_ accessor macros, which
+ * in turns depend on a variable pRExC_state being in scope where they
+ * are used. This is the standard regexp parser context variable which is
+ * passed into every non-trivial parse function in this file.
+ *
+ * Note that the UTF macro is itself a wrapper around RExC_utf8, so all
+ * of the macros which do not take an argument will operate on the
+ * pRExC_state structure *only*.
+ *
+ * Please do NOT modify RExC_parse without using these macros. In the
+ * future these macros will be extended for enhanced debugging and trace
+ * output during the parse process.
+ */
+
+/* RExC_parse_incf(flag)
+ *
+ * Increment RExC_parse to point at the next codepoint, while doing
+ * the right thing depending on whether we are parsing UTF-8 strings
+ * or not. The 'flag' argument determines if content is UTF-8 or not,
+ * intended for cases where this is NOT governed by the UTF macro.
+ *
+ * Use RExC_parse_inc() if UTF-8ness is controlled by the UTF macro.
+ *
+ * WARNING: Does NOT take into account RExC_end; it is the callers
+ * responsibility to make sure there are enough octets left in
+ * RExC_parse to ensure that when processing UTF-8 we would not read
+ * past the end of the string.
+ */
+#define RExC_parse_incf(flag) STMT_START { \
+ RExC_parse += (flag) ? UTF8SKIP(RExC_parse) : 1; \
+} STMT_END
+
+/* RExC_parse_inc_safef(flag)
+ *
+ * Safely increment RExC_parse to point at the next codepoint,
+ * doing the right thing depending on whether we are parsing
+ * UTF-8 strings or not and NOT reading past the end of the buffer.
+ * The 'flag' argument determines if content is UTF-8 or not,
+ * intended for cases where this is NOT governed by the UTF macro.
+ *
+ * Use RExC_parse_safe() if UTF-8ness is controlled by the UTF macro.
+ *
+ * NOTE: Will NOT read past RExC_end when content is UTF-8.
+ */
+#define RExC_parse_inc_safef(flag) STMT_START { \
+ RExC_parse += (flag) ? UTF8_SAFE_SKIP(RExC_parse,RExC_end) : 1; \
+} STMT_END
+
+/* RExC_parse_inc()
+ *
+ * Increment RExC_parse to point at the next codepoint,
+ * doing the right thing depending on whether we are parsing
+ * UTF-8 strings or not.
+ *
+ * WARNING: Does NOT take into account RExC_end, it is the callers
+ * responsibility to make sure there are enough octets left in
+ * RExC_parse to ensure that when processing UTF-8 we would not read
+ * past the end of the string.
+ *
+ * NOTE: whether we are parsing UTF-8 or not is determined by the
+ * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this
+ * macro operates on the pRExC_state structure only.
+ */
+#define RExC_parse_inc() RExC_parse_incf(UTF)
+
+/* RExC_parse_inc_safe()
+ *
+ * Safely increment RExC_parse to point at the next codepoint,
+ * doing the right thing depending on whether we are parsing
+ * UTF-8 strings or not and NOT reading past the end of the buffer.
+ *
+ * NOTE: whether we are parsing UTF-8 or not is determined by the
+ * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this
+ * macro operates on the pRExC_state structure only.
+ */
+#define RExC_parse_inc_safe() RExC_parse_inc_safef(UTF)
+
+/* RExC_parse_inc_utf8()
+ *
+ * Increment RExC_parse to point at the next utf8 codepoint,
+ * assumes content is UTF-8.
+ *
+ * WARNING: Does NOT take into account RExC_end; it is the callers
+ * responsibility to make sure there are enough octets left in RExC_parse
+ * to ensure that when processing UTF-8 we would not read past the end
+ * of the string.
+ */
+#define RExC_parse_inc_utf8() STMT_START { \
+ RExC_parse += UTF8SKIP(RExC_parse); \
+} STMT_END
+
+/* RExC_parse_inc_if_char()
+ *
+ * Increment RExC_parse to point at the next codepoint, if and only
+ * if the current parse point is NOT a NULL, while doing the right thing
+ * depending on whether we are parsing UTF-8 strings or not.
+ *
+ * WARNING: Does NOT take into account RExC_end, it is the callers
+ * responsibility to make sure there are enough octets left in RExC_parse
+ * to ensure that when processing UTF-8 we would not read past the end
+ * of the string.
+ *
+ * NOTE: whether we are parsing UTF-8 or not is determined by the
+ * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this
+ * macro operates on the pRExC_state structure only.
+ */
+#define RExC_parse_inc_if_char() STMT_START { \
+ RExC_parse += SKIP_IF_CHAR(RExC_parse,RExC_end); \
+} STMT_END
+
+/* RExC_parse_inc_by(n_octets)
+ *
+ * Increment the parse cursor by the number of octets specified by
+ * the 'n_octets' argument.
+ *
+ * NOTE: Does NOT check ANY constraints. It is the callers responsibility
+ * that this will not move past the end of the string, or leave the
+ * pointer in the middle of a UTF-8 sequence.
+ *
+ * Typically used to advanced past previously analyzed content.
+ */
+#define RExC_parse_inc_by(n_octets) STMT_START { \
+ RExC_parse += (n_octets); \
+} STMT_END
+
+/* RExC_parse_set(to_ptr)
+ *
+ * Sets the RExC_parse pointer to the pointer specified by the 'to'
+ * argument. No validation whatsoever is performed on the to pointer.
+ */
+#define RExC_parse_set(to_ptr) STMT_START { \
+ RExC_parse = (to_ptr); \
+} STMT_END
+
+/**********************************************************************/
+
/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
* a flag to disable back-off on the fixed/floating substrings - if it's
* a high complexity pattern we assume the benefit of avoiding a full match
#define TRIE_STCLASS
#endif
+/* About the term "restudy" and the var "restudied" and the defines
+ * "SCF_TRIE_RESTUDY" and "SCF_TRIE_DOING_RESTUDY": All of these relate to
+ * doing multiple study_chunk() calls over the same set of opcodes for* the
+ * purpose of enhanced TRIE optimizations.
+ *
+ * Specifically, when TRIE_STUDY_OPT is defined, and it is defined in normal
+ * builds, (see above), during compilation SCF_TRIE_RESTUDY may be enabled
+ * which then causes the Perl_re_op_compile() to then call the optimizer
+ * S_study_chunk() a second time to perform additional optimizations,
+ * including the aho_corasick startclass optimization.
+ * This additional pass will only happen once, which is managed by the
+ * 'restudied' variable in Perl_re_op_compile().
+ *
+ * When this second pass is under way the flags passed into study_chunk() will
+ * include SCF_TRIE_DOING_RESTUDY and this flag is and must be cascaded down
+ * to any recursive calls to S_study_chunk().
+ *
+ * IMPORTANT: Any logic in study_chunk() that emits warnings should check that
+ * the SCF_TRIE_DOING_RESTUDY flag is NOT set in 'flags', or the warning may
+ * be produced twice.
+ *
+ * See commit 07be1b83a6b2d24b492356181ddf70e1c7917ae3 and
+ * 688e03912e3bff2d2419c457d8b0e1bab3eb7112 for more details.
+ */
#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
I32 flags; /* common SF_* and SCF_* flags */
I32 whilem_c;
SSize_t *last_closep;
+ regnode **last_close_opp; /* pointer to pointer to last CLOSE regop
+ seen. DO NOT DEREFERENCE the regnode
+ pointer - the op may have been optimized
+ away */
regnode_ssc *start_class;
} scan_data_t;
{ NULL, 0, 0, 0, 0, 0 },
{ NULL, 0, 0, 0, 0, 0 },
},
- 0, 0, NULL, NULL
+ 0, 0, NULL, NULL, NULL
};
/* study flags */
#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
#define SCF_WHILEM_VISITED_POS 0x2000
-#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
+#define SCF_TRIE_RESTUDY 0x4000 /* Need to do restudy in study_chunk()?
+ Search for "restudy" in this file
+ to find a detailed explanation.*/
#define SCF_SEEN_ACCEPT 0x8000
-#define SCF_TRIE_DOING_RESTUDY 0x10000
+#define SCF_TRIE_DOING_RESTUDY 0x10000 /* Are we in restudy right now?
+ Search for "restudy" in this file
+ to find a detailed explanation. */
#define SCF_IN_DEFINE 0x20000
-
#define UTF cBOOL(RExC_utf8)
/* The enums for all these are ordered so things work out correctly */
} \
} STMT_END
+#define ckWARNexperimental_with_arg(loc, class, m, arg) \
+ STMT_START { \
+ if (! RExC_warned_ ## class) { /* warn once per compilation */ \
+ RExC_warned_ ## class = 1; \
+ _WARN_HELPER(loc, packWARN(class), \
+ Perl_ck_warner_d(aTHX_ packWARN(class), \
+ m REPORT_LOCATION, \
+ arg, REPORT_LOCATION_ARGS(loc)));\
+ } \
+ } STMT_END
+
/* Convert between a pointer to a node and its offset from the beginning of the
* program */
#define REGNODE_p(offset) (RExC_emit_start + (offset))
static void
S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
- U32 depth, int is_inf)
+ U32 depth, int is_inf,
+ SSize_t min, SSize_t stopmin, SSize_t delta)
{
DECLARE_AND_GET_RE_DEBUG_FLAGS;
DEBUG_OPTIMISE_MORE_r({
if (!data)
return;
- Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
+ Perl_re_indentf(aTHX_ "%s: M/S/D: %" IVdf "/%" IVdf "/%" IVdf " Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
depth,
where,
+ min, stopmin, delta,
(IV)data->pos_min,
(IV)data->pos_delta,
(UV)data->flags
}
-# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
- S_debug_studydata(aTHX_ where, data, depth, is_inf)
+# define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) \
+ S_debug_studydata(aTHX_ where, data, depth, is_inf, min, stopmin, delta)
# define DEBUG_PEEP(str, scan, depth, flags) \
S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
#else
-# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
+# define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) NOOP
# define DEBUG_PEEP(str, scan, depth, flags) NOOP
#endif
}
data->last_end = -1;
data->flags &= ~SF_BEFORE_EOL;
- DEBUG_STUDYDATA("commit", data, 0, is_inf);
+ DEBUG_STUDYDATA("commit", data, 0, is_inf, -1, -1, -1);
}
/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
/* the return from this sub is the minimum length that could possibly match */
STATIC SSize_t
-S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
- SSize_t *minlenp, SSize_t *deltap,
- regnode *last,
- scan_data_t *data,
- I32 stopparen,
- U32 recursed_depth,
- regnode_ssc *and_withp,
- U32 flags, U32 depth, bool was_mutate_ok)
- /* scanp: Start here (read-write). */
- /* deltap: Write maxlen-minlen here. */
- /* last: Stop before this one. */
- /* data: string data about the pattern */
- /* stopparen: treat close N as END */
- /* recursed: which subroutines have we recursed into */
- /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
+S_study_chunk(pTHX_
+ RExC_state_t *pRExC_state,
+ regnode **scanp, /* Start here (read-write). */
+ SSize_t *minlenp, /* used for the minlen of substrings? */
+ SSize_t *deltap, /* Write maxlen-minlen here. */
+ regnode *last, /* Stop before this one. */
+ scan_data_t *data, /* string data about the pattern */
+ I32 stopparen, /* treat CLOSE-N as END, see GOSUB */
+ U32 recursed_depth, /* how deep have we recursed via GOSUB */
+ regnode_ssc *and_withp, /* Valid if flags & SCF_DO_STCLASS_OR */
+ U32 flags, /* flags controlling this call, see SCF_ flags */
+ U32 depth, /* how deep have we recursed period */
+ bool was_mutate_ok /* TRUE if in-place optimizations are allowed.
+ FALSE only if the caller (recursively) was
+ prohibited from modifying the regops, because
+ a higher caller is holding a ptr to them. */
+)
{
- SSize_t final_minlen;
- /* There must be at least this number of characters to match */
- SSize_t min = 0;
- I32 pars = 0, code;
- regnode *scan = *scanp, *next;
- SSize_t delta = 0;
+ /* vars about the regnodes we are working with */
+ regnode *scan = *scanp; /* the current opcode we are inspecting */
+ regnode *next = NULL; /* the next opcode beyond scan, tmp var */
+ regnode *first_non_open = scan; /* FIXME: should this init to NULL?
+ the first non open regop, if the init
+ val IS an OPEN then we will skip past
+ it just after the var decls section */
+ I32 code = 0; /* temp var used to hold the optype of a regop */
+
+ /* vars about the min and max length of the pattern */
+ SSize_t min = 0; /* min length of this part of the pattern */
+ SSize_t stopmin = OPTIMIZE_INFTY; /* min length accounting for ACCEPT
+ this is adjusted down if we find
+ an ACCEPT */
+ SSize_t delta = 0; /* difference between min and max length
+ (not accounting for stopmin) */
+
+ /* vars about capture buffers in the pattern */
+ I32 pars = 0; /* count of OPEN opcodes */
+ I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; /* is this op an OPEN? */
+
+ /* vars about whether this pattern contains something that can match
+ * infinitely long strings, eg, X* or X+ */
int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
int is_inf_internal = 0; /* The studied chunk is infinite */
- I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
- scan_data_t data_fake;
- SV *re_trie_maxbuff = NULL;
- regnode *first_non_open = scan;
- SSize_t stopmin = OPTIMIZE_INFTY;
- scan_frame *frame = NULL;
+
+ /* scan_data_t (struct) is used to hold information about the substrings
+ * and start class we have extracted from the string */
+ scan_data_t data_fake; /* temp var used for recursing in some cases */
+
+ SV *re_trie_maxbuff = NULL; /* temp var used to hold whether we can do
+ trie optimizations */
+
+ scan_frame *frame = NULL; /* used as part of fake recursion */
+
DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_STUDY_CHUNK;
first_non_open=regnext(first_non_open);
}
-
fake_study_recurse:
DEBUG_r(
RExC_study_chunk_recursed_count++;
*/
bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
/* Peephole optimizer: */
- DEBUG_STUDYDATA("Peep", data, depth, is_inf);
+ DEBUG_STUDYDATA("Peep", data, depth, is_inf, min, stopmin, delta);
DEBUG_PEEP("Peep", scan, depth, flags);
SSize_t minlen = 0;
SSize_t deltanext = 0;
SSize_t fake_last_close = 0;
- I32 f = SCF_IN_DEFINE;
+ regnode *fake_last_close_op = NULL;
+ U32 f = SCF_IN_DEFINE | (flags & SCF_TRIE_DOING_RESTUDY);
StructCopy(&zero_scan_data, &data_fake, scan_data_t);
scan = regnext(scan);
DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
data_fake.last_closep= &fake_last_close;
+ data_fake.last_close_opp= &fake_last_close_op;
minlen = *minlenp;
next = regnext(scan);
scan = NEXTOPER(NEXTOPER(scan));
ssc_init_zero(pRExC_state, &accum);
while (OP(scan) == code) {
- SSize_t deltanext, minnext, fake;
- I32 f = 0;
+ SSize_t deltanext, minnext, fake_last_close = 0;
+ regnode *fake_last_close_op = NULL;
+ U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
regnode_ssc this_class;
DEBUG_PEEP("Branch", scan, depth, flags);
if (data) {
data_fake.whilem_c = data->whilem_c;
data_fake.last_closep = data->last_closep;
+ data_fake.last_close_opp = data->last_close_opp;
+ }
+ else {
+ data_fake.last_closep = &fake_last_close;
+ data_fake.last_close_opp = &fake_last_close_op;
}
- else
- data_fake.last_closep = &fake;
data_fake.pos_delta = delta;
next = regnext(scan);
if (flags & SCF_DO_STCLASS) {
ssc_init(pRExC_state, &this_class);
data_fake.start_class = &this_class;
- f = SCF_DO_STCLASS_AND;
+ f |= SCF_DO_STCLASS_AND;
}
if (flags & SCF_WHILEM_VISITED_POS)
f |= SCF_WHILEM_VISITED_POS;
}
if (flags & SCF_DO_STCLASS)
ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
+ DEBUG_STUDYDATA("end BRANCH", data, depth, is_inf, min, stopmin, delta);
}
if (code == IFTHEN && num < 2) /* Empty ELSE branch */
min1 = 0;
flags |= SCF_DO_STCLASS_OR;
}
}
+ DEBUG_STUDYDATA("pre TRIE", data, depth, is_inf, min, stopmin, delta);
if (PERL_ENABLE_TRIE_OPTIMISATION
&& OP(startbranch) == BRANCH
} /* end if ( prev) */
} /* TRIE_MAXBUF is non zero */
} /* do trie */
-
+ DEBUG_STUDYDATA("after TRIE", data, depth, is_inf, min, stopmin, delta);
}
else if ( code == BRANCHJ ) { /* single branch is optimized. */
scan = NEXTOPER(NEXTOPER(scan));
RExC_study_chunk_recursed_bytes, U8);
}
/* we havent recursed into this paren yet, so recurse into it */
- DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
+ DEBUG_STUDYDATA("gosub-set", data, depth, is_inf, min, stopmin, delta);
PAREN_SET(recursed_depth, paren);
my_recursed_depth= recursed_depth + 1;
} else {
- DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
+ DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf, min, stopmin, delta);
/* some form of infinite recursion, assume infinite length
* */
if (flags & SCF_DO_SUBSTR) {
(frame && frame->in_gosub) || OP(scan) == GOSUB
);
- DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
+ DEBUG_STUDYDATA("frame-new", data, depth, is_inf, min, stopmin, delta);
DEBUG_PEEP("fnew", scan, depth, flags);
frame = newframe;
ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
}
flags &= ~SCF_DO_STCLASS;
+ DEBUG_STUDYDATA("end EXACT", data, depth, is_inf, min, stopmin, delta);
}
else if (PL_regkind[OP(scan)] == EXACT) {
/* But OP != EXACT!, so is EXACTFish */
OP(scan) = ANYOFM;
ARG_SET(scan, *s & mask);
FLAGS(scan) = mask;
- /* we're not EXACTFish any more, so restudy */
+ /* We're not EXACTFish any more, so restudy.
+ * Search for "restudy" in this file to find
+ * a comment with details. */
continue;
}
flags &= ~SCF_DO_STCLASS;
SvREFCNT_dec(EXACTF_invlist);
}
+ DEBUG_STUDYDATA("end EXACTish", data, depth, is_inf, min, stopmin, delta);
}
else if (REGNODE_VARIES(OP(scan))) {
SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
- I32 fl = 0, f = flags;
+ I32 fl = 0;
+ U32 f = flags;
regnode * const oscan = scan;
regnode_ssc this_class;
regnode_ssc *oclass = NULL;
delta += (minnext + deltanext) * maxcount
- minnext * mincount;
}
+
+ if (data && data->flags & SCF_SEEN_ACCEPT) {
+ if (flags & SCF_DO_SUBSTR) {
+ scan_commit(pRExC_state, data, minlenp, is_inf);
+ flags &= ~SCF_DO_SUBSTR;
+ }
+ if (stopmin > min)
+ stopmin = min;
+ DEBUG_STUDYDATA("after-whilem accept", data, depth, is_inf, min, stopmin, delta);
+ }
/* Try powerful optimization CURLYX => CURLYN. */
if ( OP(oscan) == CURLYX && data
&& data->flags & SF_IN_PAR
In this case we can't do fixed string optimisation.
*/
- SSize_t deltanext, minnext, fake = 0;
+ bool is_positive = OP(scan) == IFMATCH ? 1 : 0;
+ SSize_t deltanext, minnext;
+ SSize_t fake_last_close = 0;
+ regnode *fake_last_close_op = NULL;
+ regnode *cur_last_close_op;
regnode *nscan;
regnode_ssc intrnl;
- int f = 0;
+ U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
StructCopy(&zero_scan_data, &data_fake, scan_data_t);
if (data) {
data_fake.whilem_c = data->whilem_c;
data_fake.last_closep = data->last_closep;
+ data_fake.last_close_opp = data->last_close_opp;
}
- else
- data_fake.last_closep = &fake;
+ else {
+ data_fake.last_closep = &fake_last_close;
+ data_fake.last_close_opp = &fake_last_close_op;
+ }
+
+ /* remember the last_close_op we saw so we can see if
+ * we are dealing with variable length lookbehind that
+ * contains capturing buffers, which are considered
+ * experimental */
+ cur_last_close_op= *(data_fake.last_close_opp);
+
data_fake.pos_delta = delta;
if ( flags & SCF_DO_STCLASS && !scan->flags
&& OP(scan) == IFMATCH ) { /* Lookahead */
last, &data_fake, stopparen,
recursed_depth, NULL, f, depth+1,
mutate_ok);
+
if (scan->flags) {
if ( deltanext < 0
|| deltanext > (I32) U8_MAX
* one. (This leaves it at 0 for non-variable length
* matches to avoid breakage for those not using this
* extension) */
- if (deltanext) {
+ if (deltanext) {
scan->next_off = deltanext;
- ckWARNexperimental(RExC_parse,
- WARN_EXPERIMENTAL__VLB,
- "Variable length lookbehind is experimental");
+ if (
+ /* See a CLOSE op inside this lookbehind? */
+ cur_last_close_op != *(data_fake.last_close_opp)
+ /* and not doing restudy. see: restudied */
+ && !(flags & SCF_TRIE_DOING_RESTUDY)
+ ) {
+ /* this is positive variable length lookbehind with
+ * capture buffers inside of it */
+ ckWARNexperimental_with_arg(RExC_parse,
+ WARN_EXPERIMENTAL__VLB,
+ "Variable length %s lookbehind with capturing is experimental",
+ is_positive ? "positive" : "negative");
+ }
}
scan->flags = (U8)minnext + deltanext;
}
|= SSC_MATCHES_EMPTY_STRING;
}
}
+ DEBUG_STUDYDATA("end LOOKAROUND", data, depth, is_inf, min, stopmin, delta);
}
#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
else {
length of the pattern, something we won't know about
until after the recurse.
*/
- SSize_t deltanext, fake = 0;
+ SSize_t deltanext, fake_last_close = 0;
+ regnode *last_close_op = NULL;
regnode *nscan;
regnode_ssc intrnl;
- int f = 0;
+ U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
/* We use SAVEFREEPV so that when the full compile
is finished perl will clean up the allocated
minlens when it's all done. This way we don't
data_fake.last_found=newSVsv(data->last_found);
}
}
- else
- data_fake.last_closep = &fake;
+ else {
+ data_fake.last_closep = &fake_last_close;
+ data_fake.last_close_opp = &fake_last_close_opp;
+ }
data_fake.flags = 0;
data_fake.substrs[0].flags = 0;
data_fake.substrs[1].flags = 0;
if ( next && (OP(next) != WHILEM) && next < last)
is_par = 0; /* Disable optimization */
}
- if (data)
+ if (data) {
*(data->last_closep) = ARG(scan);
+ *(data->last_close_opp) = scan;
+ }
}
else if (OP(scan) == EVAL) {
if (data)
if (OP(scan)==ACCEPT) {
/* m{(*ACCEPT)x} does not have to start with 'x' */
flags &= ~SCF_DO_STCLASS;
- if (data) {
+ if (data)
data->flags |= SCF_SEEN_ACCEPT;
- if (stopmin > min)
- stopmin = min;
- }
+ if (stopmin > min)
+ stopmin = min;
}
}
else if (OP(scan) == COMMIT) {
for ( word=1 ; word <= trie->wordcount ; word++)
{
- SSize_t deltanext=0, minnext=0, f = 0, fake;
+ SSize_t deltanext = 0, minnext = 0;
+ U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
+ SSize_t fake_last_close = 0;
+ regnode *fake_last_close_op = NULL;
regnode_ssc this_class;
StructCopy(&zero_scan_data, &data_fake, scan_data_t);
if (data) {
data_fake.whilem_c = data->whilem_c;
data_fake.last_closep = data->last_closep;
+ data_fake.last_close_opp = data->last_close_opp;
+ }
+ else {
+ data_fake.last_closep = &fake_last_close;
+ data_fake.last_close_opp = &fake_last_close_op;
}
- else
- data_fake.last_closep = &fake;
data_fake.pos_delta = delta;
if (flags & SCF_DO_STCLASS) {
ssc_init(pRExC_state, &this_class);
data_fake.start_class = &this_class;
- f = SCF_DO_STCLASS_AND;
+ f |= SCF_DO_STCLASS_AND;
}
if (flags & SCF_WHILEM_VISITED_POS)
f |= SCF_WHILEM_VISITED_POS;
if (flags & SCF_DO_STCLASS)
ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
}
+ DEBUG_STUDYDATA("after JUMPTRIE", data, depth, is_inf, min, stopmin, delta);
}
if (flags & SCF_DO_SUBSTR) {
data->pos_min += min1;
}
}
scan= tail;
+ DEBUG_STUDYDATA("after TRIE study", data, depth, is_inf, min, stopmin, delta);
continue;
}
#else
/* we need to unwind recursion. */
depth = depth - 1;
- DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
+ DEBUG_STUDYDATA("frame-end", data, depth, is_inf, min, stopmin, delta);
DEBUG_PEEP("fend", scan, depth, flags);
/* restore previous context */
}
assert(!frame);
- DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
+ DEBUG_STUDYDATA("pre-fin", data, depth, is_inf, min, stopmin, delta);
+
+ if (min > stopmin) {
+ /* stopmin might be shorter than min if we saw an (*ACCEPT). If
+ this is the case then it means this pattern is variable length
+ and we need to ensure that the delta accounts for it. delta
+ represents the difference between min length and max length for
+ this part of the pattern. */
+ delta += min - stopmin;
+ min = stopmin;
+ }
*scanp = scan;
*deltap = is_inf_internal ? OPTIMIZE_INFTY : delta;
if (flags & SCF_TRIE_RESTUDY)
data->flags |= SCF_TRIE_RESTUDY;
- DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
-
- final_minlen = min < stopmin
- ? min : stopmin;
if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
- if (final_minlen > OPTIMIZE_INFTY - delta)
+ if (min > OPTIMIZE_INFTY - delta)
RExC_maxlen = OPTIMIZE_INFTY;
- else if (RExC_maxlen < final_minlen + delta)
- RExC_maxlen = final_minlen + delta;
+ else if (RExC_maxlen < min + delta)
+ RExC_maxlen = min + delta;
}
- return final_minlen;
+ DEBUG_STUDYDATA("post-fin", data, depth, is_inf, min, stopmin, delta);
+ return min;
}
/* add a data member to the struct reg_data attached to this regex, it should
#ifdef TRIE_STUDY_OPT
+/* search for "restudy" in this file for a detailed explanation */
#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
STMT_START { \
if ( \
RExC_state_t RExC_state;
RExC_state_t * const pRExC_state = &RExC_state;
#ifdef TRIE_STUDY_OPT
+ /* search for "restudy" in this file for a detailed explanation */
int restudied = 0;
RExC_state_t copyRExC_state;
#endif
RExC_rx->intflags = 0;
RExC_flags = rx_flags; /* don't let top level (?i) bleed */
- RExC_parse = exp;
+ RExC_parse_set(exp);
/* This NUL is guaranteed because the pattern comes from an SV*, and the sv
* code makes sure the final byte is an uncounted NUL. But should this
* ever not be the case, lots of things could read beyond the end of the
* buffer: loops like
- * while(isFOO(*RExC_parse)) RExC_parse++;
+ * while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
* strchr(RExC_parse, "foo");
* etc. So it is worth noting. */
assert(*RExC_end == '\0');
#ifdef TRIE_STUDY_OPT
+ /* search for "restudy" in this file for a detailed explanation */
if (!restudied) {
StructCopy(&zero_scan_data, &data, scan_data_t);
copyRExC_state = RExC_state;
data in the pattern. If there is then we can use it for optimisations */
if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
*/
- SSize_t fake;
+ SSize_t fake_deltap;
STRLEN longest_length[2];
regnode_ssc ch_class; /* pointed to by data */
int stclass_flag;
SSize_t last_close = 0; /* pointed to by data */
regnode *first= scan;
regnode *first_next= regnext(first);
+ regnode *last_close_op= NULL;
int i;
/*
else if (PL_regkind[OP(first)] == TRIE &&
((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
{
- /* this can happen only on restudy */
+ /* this can happen only on restudy
+ * Search for "restudy" in this file to find
+ * a comment with details. */
RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
}
#endif
/* Scan is after the zeroth branch, first is atomic matcher. */
#ifdef TRIE_STUDY_OPT
+ /* search for "restudy" in this file for a detailed explanation */
DEBUG_PARSE_r(
if (!restudied)
Perl_re_printf( aTHX_ "first at %" IVdf "\n",
} else /* XXXX Check for BOUND? */
stclass_flag = 0;
data.last_closep = &last_close;
+ data.last_close_opp = &last_close_op;
DEBUG_RExC_seen();
/*
* MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
* (NO top level branches)
*/
- minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
+ minlen = study_chunk(pRExC_state, &first, &minlen, &fake_deltap,
scan + RExC_size, /* Up to end */
&data, -1, 0, NULL,
SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
| (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
0, TRUE);
+ /* search for "restudy" in this file for a detailed explanation
+ * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
}
else {
/* Several toplevels. Best we can is to set minlen. */
- SSize_t fake;
+ SSize_t fake_deltap;
regnode_ssc ch_class;
SSize_t last_close = 0;
+ regnode *last_close_op = NULL;
DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
ssc_init(pRExC_state, &ch_class);
data.start_class = &ch_class;
data.last_closep = &last_close;
+ data.last_close_opp = &last_close_op;
DEBUG_RExC_seen();
/*
* (patterns WITH top level branches)
*/
minlen = study_chunk(pRExC_state,
- &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
+ &scan, &minlen, &fake_deltap, scan + RExC_size, &data, -1, 0, NULL,
SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
? SCF_TRIE_DOING_RESTUDY
: 0),
0, TRUE);
+ /* search for "restudy" in this file for a detailed explanation
+ * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
CHECK_RESTUDY_GOTO_butfirst(NOOP);
return ret;
} else {
if (retarray)
- ret = newSVsv(&PL_sv_undef);
+ ret = newSV_type(SVt_NULL);
}
if (retarray)
av_push(retarray, ret);
* using do...while */
if (UTF)
do {
- RExC_parse += UTF8SKIP(RExC_parse);
+ RExC_parse_inc_utf8();
} while ( RExC_parse < RExC_end
&& isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
else
do {
- RExC_parse++;
+ RExC_parse_inc_by(1);
} while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
} else {
- RExC_parse++; /* so the <- from the vFAIL is after the offending
+ RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending
character */
vFAIL("Group name must start with a non-digit word character");
}
/* '^' as an initial flag sets certain defaults */
if (UCHARAT(RExC_parse) == '^') {
- RExC_parse++;
+ RExC_parse_inc_by(1);
has_use_defaults = TRUE;
STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
if ((RExC_pm_flags & PMf_WILDCARD)) {
if (flagsp == & negflags) {
if (*RExC_parse == 'm') {
- RExC_parse++;
+ RExC_parse_inc_by(1);
/* diag_listed_as: Use of %s is not allowed in Unicode
property wildcard subpatterns in regex; marked by <--
HERE in m/%s/ */
has_charset_modifier = DEPENDS_PAT_MOD;
break;
excess_modifier:
- RExC_parse++;
+ RExC_parse_inc_by(1);
if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
}
}
NOT_REACHED; /*NOTREACHED*/
neg_modifier:
- RExC_parse++;
+ RExC_parse_inc_by(1);
vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
*(RExC_parse - 1));
NOT_REACHED; /*NOTREACHED*/
if ( (RExC_pm_flags & PMf_WILDCARD)
&& cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
{
- RExC_parse++;
+ RExC_parse_inc_by(1);
/* diag_listed_as: Use of %s is not allowed in Unicode
property wildcard subpatterns in regex; marked by <--
HERE in m/%s/ */
return;
default:
fail_modifiers:
- RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
+ RExC_parse_inc_if_char();
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
NOT_REACHED; /*NOTREACHED*/
}
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse_inc();
}
vFAIL("Sequence (?... not terminated");
modifier_illegal_in_wildcard:
- RExC_parse++;
+ RExC_parse_inc_by(1);
/* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
subpatterns in regex; marked by <-- HERE in m/%s/ */
vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
if (RExC_parse != name_start && ch == '}') {
while (isBLANK(*RExC_parse)) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
}
}
if (RExC_parse == name_start || *RExC_parse != ch) {
return ret;
}
+/* reg_la_NOTHING()
+ *
+ * Maybe parse a parenthezised lookaround construct that is equivalent to a
+ * NOTHING regop when the construct is empty.
+ *
+ * Calls skip_to_be_ignored_text() before checking if the construct is empty.
+ *
+ * Checks for unterminated constructs and throws a "not terminated" error
+ * with the appropriate type if necessary
+ *
+ * Assuming it does not throw an exception increments RExC_seen_zerolen.
+ *
+ * If the construct is empty generates a NOTHING op and returns its
+ * regnode_offset, which the caller would then return to its caller.
+ *
+ * If the construct is not empty increments RExC_in_lookaround, and turns
+ * on any flags provided in RExC_seen, and then returns 0 to signify
+ * that parsing should continue.
+ *
+ * PS: I would have called this reg_parse_lookaround_NOTHING() but then
+ * any use of it would have had to be broken onto multiple lines, hence
+ * the abbreviation.
+ */
+STATIC regnode_offset
+S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags,
+ const char *type)
+{
+
+ PERL_ARGS_ASSERT_REG_LA_NOTHING;
+
+ /* false below so we do not force /x */
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
+
+ if (RExC_parse >= RExC_end)
+ vFAIL2("Sequence (%s... not terminated", type);
+
+ /* Always increment as NOTHING regops are zerolen */
+ RExC_seen_zerolen++;
+
+ if (*RExC_parse == ')') {
+ regnode_offset ret= reg_node(pRExC_state, NOTHING);
+ nextchar(pRExC_state);
+ return ret;
+ }
+
+ RExC_seen |= flags;
+ RExC_in_lookaround++;
+ return 0; /* keep parsing! */
+}
+
+/* reg_la_OPFAIL()
+ *
+ * Maybe parse a parenthezised lookaround construct that is equivalent to a
+ * OPFAIL regop when the construct is empty.
+ *
+ * Calls skip_to_be_ignored_text() before checking if the construct is empty.
+ *
+ * Checks for unterminated constructs and throws a "not terminated" error
+ * if necessary.
+ *
+ * If the construct is empty generates an OPFAIL op and returns its
+ * regnode_offset which the caller should then return to its caller.
+ *
+ * If the construct is not empty increments RExC_in_lookaround, and also
+ * increments RExC_seen_zerolen, and turns on the flags provided in
+ * RExC_seen, and then returns 0 to signify that parsing should continue.
+ *
+ * PS: I would have called this reg_parse_lookaround_OPFAIL() but then
+ * any use of it would have had to be broken onto multiple lines, hence
+ * the abbreviation.
+ */
+
+STATIC regnode_offset
+S_reg_la_OPFAIL(pTHX_ RExC_state_t *pRExC_state, U32 flags,
+ const char *type)
+{
+
+ PERL_ARGS_ASSERT_REG_LA_OPFAIL;
+
+ /* FALSE so we don't force to /x below */;
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
+
+ if (RExC_parse >= RExC_end)
+ vFAIL2("Sequence (%s... not terminated", type);
+
+ if (*RExC_parse == ')') {
+ regnode_offset ret= reganode(pRExC_state, OPFAIL, 0);
+ nextchar(pRExC_state);
+ return ret; /* return produced regop */
+ }
+
+ /* only increment zerolen *after* we check if we produce an OPFAIL
+ * as an OPFAIL does not match a zero length construct, as it
+ * does not match ever. */
+ RExC_seen_zerolen++;
+ RExC_seen |= flags;
+ RExC_in_lookaround++;
+ return 0; /* keep parsing! */
+}
+
/* Below are the main parsing routines.
*
* S_reg() parses a whole pattern or subpattern. It itself handles things
/* Having this true makes it feasible to have a lot fewer tests for the
* parse pointer being in scope. For example, we can write
- * while(isFOO(*RExC_parse)) RExC_parse++;
+ * while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
* instead of
- * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
+ * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse_inc_by(1);
*/
assert(*RExC_end == '\0');
char *start_arg = NULL;
unsigned char op = 0;
int arg_required = 0;
- int internal_argval = -1; /* if >-1 we are not allowed an argument*/
+ int internal_argval = -1; /* if > -1 no argument allowed */
bool has_upper = FALSE;
+ U32 seen_flag_set = 0; /* RExC_seen flags we must set */
if (has_intervening_patws) {
- RExC_parse++; /* past the '*' */
+ RExC_parse_inc_by(1); /* past the '*' */
/* For strict backwards compatibility, don't change the message
* now that we also have lowercase operands */
if (isUPPER(*RExC_parse)) {
has_upper = TRUE;
}
- RExC_parse++;
+ RExC_parse_inc_by(1);
}
else {
- RExC_parse += UTF8SKIP(RExC_parse);
+ RExC_parse_inc_utf8();
}
}
verb_len = RExC_parse - start_verb;
goto unterminated_verb_pattern;
}
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse_inc();
while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse_inc();
}
if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
unterminated_verb_pattern:
goto no_colon;
}
- RExC_parse = start_arg;
+ RExC_parse_set(start_arg);
if (RExC_in_script_run) {
break;
lookbehind_alpha_assertions:
- RExC_seen |= REG_LOOKBEHIND_SEEN;
+ seen_flag_set = REG_LOOKBEHIND_SEEN;
/*FALLTHROUGH*/
alpha_assertions:
- RExC_in_lookaround++;
- RExC_seen_zerolen++;
-
- if (! start_arg) {
+ if ( !start_arg ) {
goto no_colon;
}
- /* An empty negative lookahead assertion simply is failure */
- if (paren == 'A' && RExC_parse == start_arg) {
- ret=reganode(pRExC_state, OPFAIL, 0);
- nextchar(pRExC_state);
- return ret;
+ if ( RExC_parse == start_arg ) {
+ if ( paren == 'A' || paren == 'B' ) {
+ /* An empty negative lookaround assertion is failure.
+ * See also: S_reg_la_OPFAIL() */
+
+ /* Note: OPFAIL is *not* zerolen. */
+ ret = reganode(pRExC_state, OPFAIL, 0);
+ nextchar(pRExC_state);
+ return ret;
+ }
+ else
+ if ( paren == 'a' || paren == 'b' ) {
+ /* An empty positive lookaround assertion is success.
+ * See also: S_reg_la_NOTHING() */
+
+ /* Note: NOTHING is zerolen, so increment here */
+ RExC_seen_zerolen++;
+ ret = reg_node(pRExC_state, NOTHING);
+ nextchar(pRExC_state);
+ return ret;
+ }
}
- RExC_parse = start_arg;
+ RExC_seen_zerolen++;
+ RExC_in_lookaround++;
+ RExC_seen |= seen_flag_set;
+
+ RExC_parse_set(start_arg);
goto parse_rest;
no_colon:
- vFAIL2utf8f(
- "'(*%" UTF8f "' requires a terminating ':'",
- UTF8fARG(UTF, verb_len, start_verb));
+ vFAIL2utf8f( "'(*%" UTF8f "' requires a terminating ':'",
+ UTF8fARG(UTF, verb_len, start_verb));
NOT_REACHED; /*NOTREACHED*/
} /* End of switch */
if ( ! op ) {
- RExC_parse += UTF
- ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
- : 1;
+ RExC_parse_inc_safe();
if (has_upper || verb_len == 0) {
- vFAIL2utf8f(
- "Unknown verb pattern '%" UTF8f "'",
- UTF8fARG(UTF, verb_len, start_verb));
+ vFAIL2utf8f( "Unknown verb pattern '%" UTF8f "'",
+ UTF8fARG(UTF, verb_len, start_verb));
}
else {
- vFAIL2utf8f(
- "Unknown '(*...)' construct '%" UTF8f "'",
- UTF8fARG(UTF, verb_len, start_verb));
+ vFAIL2utf8f( "Unknown '(*...)' construct '%" UTF8f "'",
+ UTF8fARG(UTF, verb_len, start_verb));
}
}
if ( RExC_parse == start_arg ) {
start_arg = NULL;
}
if ( arg_required && !start_arg ) {
- vFAIL3("Verb pattern '%.*s' has a mandatory argument",
+ vFAIL3( "Verb pattern '%.*s' has a mandatory argument",
(int) verb_len, start_verb);
}
if (internal_argval == -1) {
}
RExC_seen |= REG_VERBARG_SEEN;
if (start_arg) {
- SV *sv = newSVpvn( start_arg,
- RExC_parse - start_arg);
+ SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
ARG(REGNODE_p(ret)) = add_data( pRExC_state,
STR_WITH_LEN("S"));
RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
const char impossible_group[] = "Invalid reference to group";
if (has_intervening_patws) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
vFAIL("In '(?...)', the '(' and '?' must be adjacent");
}
- RExC_parse++; /* past the '?' */
+ RExC_parse_inc_by(1); /* past the '?' */
paren = *RExC_parse; /* might be a trailing NUL, if not
well-formed */
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse_inc();
if (RExC_parse > RExC_end) {
paren = '\0';
}
case 'P': /* (?P...) variants for those used to PCRE/Python */
paren = *RExC_parse;
if ( paren == '<') { /* (?P<...>) named capture */
- RExC_parse++;
+ RExC_parse_inc_by(1);
if (RExC_parse >= RExC_end) {
vFAIL("Sequence (?P<... not terminated");
}
goto named_capture;
}
else if (paren == '>') { /* (?P>name) named recursion */
- RExC_parse++;
+ RExC_parse_inc_by(1);
if (RExC_parse >= RExC_end) {
vFAIL("Sequence (?P>... not terminated");
}
goto named_recursion;
}
else if (paren == '=') { /* (?P=...) named backref */
- RExC_parse++;
+ RExC_parse_inc_by(1);
return handle_named_backref(pRExC_state, flagp,
segment_parse_start, ')');
}
- RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
+ RExC_parse_inc_if_char();
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL3("Sequence (%.*s...) not recognized",
(int) (RExC_parse - seqstart), seqstart);
NOT_REACHED; /*NOTREACHED*/
case '<': /* (?<...) */
/* If you want to support (?<*...), first reconcile with GH #17363 */
- if (*RExC_parse == '!')
- paren = ',';
- else if (*RExC_parse != '=')
+ if (*RExC_parse == '!') {
+ paren = ','; /* negative lookbehind (?<! ... ) */
+ RExC_parse_inc_by(1);
+ if ((ret= reg_la_OPFAIL(pRExC_state,REG_LB_SEEN,"?<!")))
+ return ret;
+ break;
+ }
+ else
+ if (*RExC_parse == '=') {
+ /* paren = '<' - negative lookahead (?<= ... ) */
+ RExC_parse_inc_by(1);
+ if ((ret= reg_la_NOTHING(pRExC_state,REG_LB_SEEN,"?<=")))
+ return ret;
+ break;
+ }
+ else
named_capture:
{ /* (?<...>) */
char *name_start;
paren = 1;
goto capturing_parens;
}
-
- RExC_seen |= REG_LOOKBEHIND_SEEN;
- RExC_in_lookaround++;
- RExC_parse++;
- if (RExC_parse >= RExC_end) {
- vFAIL("Sequence (?... not terminated");
- }
- RExC_seen_zerolen++;
- break;
+ NOT_REACHED; /*NOTREACHED*/
case '=': /* (?=...) */
- RExC_seen_zerolen++;
- RExC_in_lookaround++;
+ if ((ret= reg_la_NOTHING(pRExC_state, 0, "?=")))
+ return ret;
break;
case '!': /* (?!...) */
- RExC_seen_zerolen++;
- /* check if we're really just a "FAIL" assertion */
- skip_to_be_ignored_text(pRExC_state, &RExC_parse,
- FALSE /* Don't force to /x */ );
- if (*RExC_parse == ')') {
- ret=reganode(pRExC_state, OPFAIL, 0);
- nextchar(pRExC_state);
+ if ((ret= reg_la_OPFAIL(pRExC_state, 0, "?!")))
return ret;
- }
- RExC_in_lookaround++;
break;
case '|': /* (?|...) */
/* branch reset, behave like a (?:...) except that
/* NOTREACHED */
case '+':
if (! inRANGE(RExC_parse[0], '1', '9')) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
vFAIL("Illegal pattern");
}
goto parse_recursion;
/* FALLTHROUGH */
case '1': case '2': case '3': case '4': /* (?1) */
case '5': case '6': case '7': case '8': case '9':
- RExC_parse = (char *) seqstart + 1; /* Point to the digit */
+ RExC_parse_set((char *) seqstart + 1); /* Point to the digit */
parse_recursion:
{
bool is_neg = FALSE;
UV unum;
segment_parse_start = RExC_parse - 1;
if (*RExC_parse == '-') {
- RExC_parse++;
+ RExC_parse_inc_by(1);
is_neg = TRUE;
}
endptr = RExC_end;
&& unum <= I32_MAX
) {
num = (I32)unum;
- RExC_parse = (char*)endptr;
+ RExC_parse_set((char*)endptr);
}
else { /* Overflow, or something like that. Position
beyond all digits for the message */
while (RExC_parse < RExC_end && isDIGIT(*RExC_parse)) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
}
vFAIL(impossible_group);
}
/* Don't overflow */
if (UNLIKELY(I32_MAX - RExC_npar < num)) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
vFAIL(impossible_group);
}
num += RExC_npar;
if (paren == '-' && num < 1) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
vFAIL(non_existent_group_msg);
}
}
* then reparsing */
if (ALL_PARENS_COUNTED) {
if (num >= RExC_total_parens) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
vFAIL(non_existent_group_msg);
}
}
case '?': /* (??...) */
is_logical = 1;
if (*RExC_parse != '{') {
- RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
+ RExC_parse_inc_if_char();
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL2utf8f(
"Sequence (%" UTF8f "...) not recognized",
}
*flagp |= POSTPONED;
paren = '{';
- RExC_parse++;
+ RExC_parse_inc_by(1);
/* FALLTHROUGH */
case '{': /* (?{...}) */
{
}
/* this is a pre-compiled code block (?{...}) */
cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
- RExC_parse = RExC_start + cb->end;
+ RExC_parse_set(RExC_start + cb->end);
o = cb->block;
if (cb->src_regex) {
n = add_data(pRExC_state, STR_WITH_LEN("rl"));
|| RExC_parse[0] == '\'' ) /* (?('NAME')...) */
{
char ch = RExC_parse[0] == '<' ? '>' : '\'';
- char *name_start= RExC_parse++;
+ char *name_start= RExC_parse;
+ RExC_parse_inc_by(1);
U32 num = 0;
SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
if ( RExC_parse == name_start
vFAIL2("Sequence (?(%c... not terminated",
(ch == '>' ? '<' : ch));
}
- RExC_parse++;
+ RExC_parse_inc_by(1);
if (sv_dat) {
num = add_data( pRExC_state, STR_WITH_LEN("S"));
RExC_rxi->data->data[num]=(void*)sv_dat;
"DEFINE"))
{
ret = reganode(pRExC_state, DEFINEP, 0);
- RExC_parse += DEFINE_len;
+ RExC_parse_inc_by(DEFINE_len);
is_define = 1;
goto insert_if_check_paren;
}
else if (RExC_parse[0] == 'R') {
- RExC_parse++;
+ RExC_parse_inc_by(1);
/* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
* parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
* parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
parno = 0;
if (RExC_parse[0] == '0') {
parno = 1;
- RExC_parse++;
+ RExC_parse_inc_by(1);
}
else if (inRANGE(RExC_parse[0], '1', '9')) {
UV uv;
&& uv <= I32_MAX
) {
parno = (I32)uv + 1;
- RExC_parse = (char*)endptr;
+ RExC_parse_set((char*)endptr);
}
/* else "Switch condition not recognized" below */
} else if (RExC_parse[0] == '&') {
SV *sv_dat;
- RExC_parse++;
+ RExC_parse_inc_by(1);
sv_dat = reg_scan_name(pRExC_state,
REG_RSN_RETURN_DATA);
if (sv_dat)
&& uv <= I32_MAX
) {
parno = (I32)uv;
- RExC_parse = (char*)endptr;
+ RExC_parse_set((char*)endptr);
}
else {
vFAIL("panic: grok_atoUV returned FALSE");
insert_if_check_paren:
if (UCHARAT(RExC_parse) != ')') {
- RExC_parse += UTF
- ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
- : 1;
+ RExC_parse_inc_safe();
vFAIL("Switch condition not recognized");
}
nextchar(pRExC_state);
#endif
return ret;
}
- RExC_parse += UTF
- ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
- : 1;
+ RExC_parse_inc_safe();
vFAIL("Unknown switch condition (?(...))");
}
case '[': /* (?[ ... ]) */
case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
/* FALLTHROUGH */
default: /* e.g., (?i) */
- RExC_parse = (char *) seqstart + 1;
+ RExC_parse_set((char *) seqstart + 1);
parse_flags:
parse_lparen_question_flags(pRExC_state);
if (UCHARAT(RExC_parse) != ':') {
set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
}
if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
- RExC_parse = reg_parse_start;
+ RExC_parse_set(reg_parse_start);
vFAIL("Unmatched (");
}
nextchar(pRExC_state);
}
else if (!paren && RExC_parse < RExC_end) {
if (*RExC_parse == ')') {
- RExC_parse++;
+ RExC_parse_inc_by(1);
vFAIL("Unmatched )");
}
else
}
else if (*start == '0') { /* grok_atoUV() fails for only two reasons:
leading zeros or overflow */
- RExC_parse = (char * ) end;
+ RExC_parse_set((char * ) end);
/* Perhaps too generic a msg for what is only failure from having
* leading zeros, but this is how it's always behaved. */
/* Here, found a quantifier, but was too large; either it overflowed or was
* too big a legal number */
- RExC_parse = (char * ) end;
+ RExC_parse_set((char * ) end);
vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
NOT_REACHED; /*NOTREACHED*/
max = get_quantifier_value(pRExC_state, max_start, max_end);
}
- RExC_parse = (char *) regcurly_return[RBRACE];
+ RExC_parse_set((char *) regcurly_return[RBRACE]);
nextchar(pRExC_state);
if (max < min) { /* If can't match, warn and optimize to fail
/* Forbid extra quantifiers */
if (isQUANTIFIER(RExC_parse, RExC_end)) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
vFAIL("Nested quantifiers");
}
* reason is to make it harder to write patterns that take a long long time
* to halt, and because the use of this construct isn't necessary in
* matching Unicode property values */
- RExC_parse++;
+ RExC_parse_inc_by(1);
/* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
subpatterns in regex; marked by <-- HERE in m/%s/
*/
* [^\n]. The latter is assumed when the {...} following the \N is a legal
* quantifier, or if there is no '{' at all */
if (*p != '{' || regcurly(p, RExC_end, NULL)) {
- RExC_parse = p;
+ RExC_parse_set(p);
if (cp_count) {
*cp_count = -1;
}
vFAIL("Missing braces on \\N{}");
}
- RExC_parse++; /* Skip past the '{' */
+ RExC_parse_inc_by(1); /* Skip past the '{' */
endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
if (! endbrace) { /* no trailing brace */
/* \N{_} is what toke.c returns to us to indicate a name that evaluates to
* nothing at all (not allowed under strict) */
if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
- RExC_parse = endbrace;
+ RExC_parse_set(endbrace);
if (strict) {
- RExC_parse++; /* Position after the "}" */
+ RExC_parse_inc_by(1); /* Position after the "}" */
vFAIL("Zero length \\N{}");
}
}
while (isBLANK(*RExC_parse)) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
}
e = endbrace;
UTF,
&error_msg);
if (error_msg) {
- RExC_parse = endbrace;
+ RExC_parse_set(endbrace);
vFAIL(error_msg);
}
/* Here, exactly one code point. If that isn't what is wanted,
* fail */
if (! code_point_p) {
- RExC_parse = p;
+ RExC_parse_set(p);
return FALSE;
}
/* Have parsed this entire single code point \N{...}. *cp_count
* has already been set to 1, so don't do it again. */
- RExC_parse = endbrace;
+ RExC_parse_set(endbrace);
nextchar(pRExC_state);
return TRUE;
} /* End of is a single code point */
* case). */
if (! node_p) {
if (! cp_count) {
- RExC_parse = p;
+ RExC_parse_set(p);
}
return FALSE;
}
* converted a name to the \N{U+...} form. This include changing a
* name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
- RExC_parse += 2; /* Skip past the 'U+' */
+ RExC_parse_inc_by(2); /* Skip past the 'U+' */
/* Code points are separated by dots. The '}' terminates the whole
* thing. */
UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
if (len == 0) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
bad_NU:
vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
- RExC_parse += len;
+ RExC_parse_inc_by(len);
if (cp > MAX_LEGAL_CP) {
vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
/* Here, is a single code point; fail if doesn't want that */
if (! code_point_p) {
- RExC_parse = p;
+ RExC_parse_set(p);
return FALSE;
}
/* A single code point is easy to handle; just return it */
*code_point_p = UNI_TO_NATIVE(cp);
- RExC_parse = endbrace;
+ RExC_parse_set(endbrace);
nextchar(pRExC_state);
return TRUE;
}
* \N{U+100.} )
* */
if (*RExC_parse != '.' || RExC_parse + 1 >= e) {
- RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
- ? UTF8SKIP(RExC_parse)
- : 1;
- RExC_parse = MIN(e, RExC_parse);/* Guard against malformed utf8
- */
+ /*point to after 1st invalid */
+ RExC_parse_incf(RExC_orig_utf8);
+ /*Guard against malformed utf8*/
+ RExC_parse_set(MIN(e, RExC_parse));
goto bad_NU;
}
/* Move to after the dot (or ending brace the final time through.)
* */
- RExC_parse++;
+ RExC_parse_inc_by(1);
count++;
} while (RExC_parse < e);
save_start = RExC_start;
orig_end = RExC_end;
- RExC_parse = RExC_start = SvPVX(substitute_parse);
+ RExC_start = SvPVX(substitute_parse);
+ RExC_parse_set(RExC_start);
RExC_end = RExC_parse + SvCUR(substitute_parse);
TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
/* Restore the saved values */
RESTORE_WARNINGS;
RExC_start = save_start;
- RExC_parse = endbrace;
+ RExC_parse_set(endbrace);
RExC_end = orig_end;
SET_recode_x_to_native(0);
(UV) *flagp);
}
if (*RExC_parse != ']') {
- RExC_parse = cc_parse_start;
+ RExC_parse_set(cc_parse_start);
vFAIL("Unmatched [");
}
nextchar(pRExC_state);
case '?':
case '+':
case '*':
- RExC_parse++;
+ RExC_parse_inc_by(1);
vFAIL("Quantifier follows nothing");
break;
case '\\':
required, as the default for this switch is to jump to the
literal text handling code.
*/
- RExC_parse++;
+ RExC_parse_inc_by(1);
switch ((U8)*RExC_parse) {
/* Special Escapes */
case 'A':
goto finish_meta_pat;
case 'G':
if (RExC_pm_flags & PMf_WILDCARD) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
/* diag_listed_as: Use of %s is not allowed in Unicode property
wildcard subpatterns in regex; marked by <-- HERE in m/%s/
*/
RExC_end - RExC_parse);
char * e = endbrace;
- RExC_parse += 2;
+ RExC_parse_inc_by(2);
if (! endbrace) {
vFAIL2("Missing right brace on \\%c{}", name);
}
while (isBLANK(*RExC_parse)) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
}
while (RExC_parse < e && isBLANK(*(e - 1))) {
}
if (e == RExC_parse) {
- RExC_parse = endbrace + 1; /* After the '}' */
+ RExC_parse_set(endbrace + 1); /* After the '}' */
vFAIL2("Empty \\%c{}", name);
}
break;
default:
bad_bound_type:
- RExC_parse = e;
+ RExC_parse_set(e);
vFAIL2utf8f(
"'%" UTF8f "' is an unknown bound type",
UTF8fARG(UTF, length, e - length));
NOT_REACHED; /*NOTREACHED*/
}
- RExC_parse = endbrace;
+ RExC_parse_set(endbrace);
REQUIRE_UNI_RULES(flagp, 0);
if (op == BOUND) {
&& UCHARAT(RExC_parse + 1) == '{'
&& UNLIKELY(! regcurly(RExC_parse + 1, RExC_end, NULL)))
{
- RExC_parse += 2;
+ RExC_parse_inc_by(2);
vFAIL("Unescaped left brace in regex is illegal here");
}
nextchar(pRExC_state);
RETURN_FAIL_ON_RESTART_FLAGP(flagp);
/* Here, evaluates to a single code point. Go get that */
- RExC_parse = atom_parse_start;
+ RExC_parse_set(atom_parse_start);
goto defchar;
case 'k': /* Handle \k<NAME> and \k'NAME' and \k{NAME} */
&& ch != '\''
&& ch != '{'))
{
- RExC_parse++;
+ RExC_parse_inc_by(1);
/* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
vFAIL2("Sequence %.2s... not terminated", atom_parse_start);
} else {
- RExC_parse += 2;
+ RExC_parse_inc_by(2);
if (ch == '{') {
while (isBLANK(*RExC_parse)) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
}
}
ret = handle_named_backref(pRExC_state,
s++;
} while isDIGIT(*s);
- RExC_parse = s;
+ RExC_parse_set(s);
vFAIL("Unterminated \\g{...} pattern");
}
goto parse_named_seq;
}
- RExC_parse = s;
+ RExC_parse_set(s);
num = S_backref_value(RExC_parse, RExC_end);
if (num == 0)
vFAIL("Reference to invalid group 0");
* to be an octal character escape, e.g. \35 or \777.
* The above logic should make it obvious why using
* octal escapes in patterns is problematic. - Yves */
- RExC_parse = atom_parse_start;
+ RExC_parse_set(atom_parse_start);
goto defchar;
}
}
* We've already figured out what value the digits represent.
* Now, move the parse to beyond them. */
if (endbrace) {
- RExC_parse = endbrace + 1;
+ RExC_parse_set(endbrace + 1);
}
else while (isDIGIT(*RExC_parse)) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
}
if (num >= (I32)RExC_npar) {
default:
/* Do not generate "unrecognized" warnings here, we fall
back into the quick-grab loop below */
- RExC_parse = atom_parse_start;
+ RExC_parse_set(atom_parse_start);
goto defchar;
} /* end of switch on a \foo sequence */
break;
assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
/*
if (RExC_flags & RXf_PMf_EXTENDED) {
- RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
+ RExC_parse_set( reg_skipcomment( pRExC_state, RExC_parse ) );
if (RExC_parse < RExC_end)
goto tryagain;
}
p++;
break;
case 'N': /* Handle a single-code point named character. */
- RExC_parse = p + 1;
+ RExC_parse_set( p + 1 );
if (! grok_bslash_N(pRExC_state,
NULL, /* Fail if evaluates to
anything other than a
/* Here, it wasn't a single code point. Go close
* up this EXACTish node. The switch() prior to
* this switch handles the other cases */
- RExC_parse = p = oldp;
+ p = oldp;
+ RExC_parse_set(p);
goto loopdone;
}
p = RExC_parse;
- RExC_parse = atom_parse_start;
+ RExC_parse_set(atom_parse_start);
/* The \N{} means the pattern, if previously /d,
* becomes /u. That means it can't be an EXACTF node,
FALSE, /* No illegal cp's */
UTF))
{
- RExC_parse = p; /* going to die anyway; point to
+ RExC_parse_set(p); /* going to die anyway; point to
exact spot of failure */
vFAIL(message);
}
FALSE, /* No illegal cp's */
UTF))
{
- RExC_parse = p; /* going to die anyway; point
+ RExC_parse_set(p); /* going to die anyway; point
to exact spot of failure */
vFAIL(message);
}
{
/* going to die anyway; point to exact spot of
* failure */
- RExC_parse = p + ((UTF)
+ char *new_p= p + ((UTF)
? UTF8_SAFE_SKIP(p, RExC_end)
: 1);
+ RExC_parse_set(new_p);
vFAIL(message);
}
&& isALPHA_A(*(p - 1))
&& *(p - 2) == '\\'))
{
- RExC_parse = p + 1;
+ RExC_parse_set(p + 1);
vFAIL("Unescaped left brace in regex is "
"illegal here");
}
*flagp |= HASWIDTH | maybe_SIMPLE;
}
- RExC_parse = p;
+ RExC_parse_set(p);
{
/* len is STRLEN which is unsigned, need to copy to signed */
&& OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse, RExC_end, NULL))
{
if (RExC_strict) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
vFAIL("Unescaped left brace in regex is illegal here");
}
ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
if (*temp_ptr == ']') {
temp_ptr++;
if (! found_problem && ! check_only) {
- RExC_parse = (char *) temp_ptr;
+ RExC_parse_set((char *) temp_ptr);
vFAIL3("POSIX syntax [%c %c] is reserved for future "
"extensions", open_char, open_char);
}
const char * const complement_string = (complement)
? "^"
: "";
- RExC_parse = (char *) p;
+ RExC_parse_set((char *) p);
vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
complement_string,
UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
* compile time values are valid in all runtime cases */
REQUIRE_UNI_RULES(flagp, 0);
- ckWARNexperimental(RExC_parse,
- WARN_EXPERIMENTAL__REGEX_SETS,
- "The regex_sets feature is experimental");
-
/* Everything in this construct is a metacharacter. Operands begin with
* either a '\' (for an escape sequence), or a '[' for a bracketed
* character class. Any other character should be an operator, or
* so that everything gets evaluated down to a single operand, which is the
* result */
- sv_2mortal((SV *)(stack = newAV()));
- sv_2mortal((SV *)(fence_stack = newAV()));
+ stack = (AV*)newSV_type_mortal(SVt_PVAV);
+ fence_stack = (AV*)newSV_type_mortal(SVt_PVAV);
while (RExC_parse < RExC_end) {
I32 top_index; /* Index of top-most element in 'stack' */
if ( RExC_parse < RExC_end - 2
&& UCHARAT(RExC_parse + 1) == '?'
- && UCHARAT(RExC_parse + 2) == '^')
+ && strchr("^" STD_PAT_MODS, *(RExC_parse + 2)))
{
const regnode_offset orig_emit = RExC_emit;
SV * resultant_invlist;
- /* If is a '(?^', could be an embedded '(?^flags:(?[...])'.
+ /* Here it could be an embedded '(?flags:(?[...])'.
* This happens when we have some thing like
*
* my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
* an error: we need to get a single inversion list back
* from the recursion */
- RExC_parse++;
+ RExC_parse_inc_by(1);
RExC_sets_depth++;
node = reg(pRExC_state, 2, flagp, depth+1);
FALSE))
|| ! IS_OPERATOR(*stacked_ptr))))
{
- RExC_parse++;
+ RExC_parse_inc_by(1);
vFAIL("Unexpected '(' with no preceding operator");
}
}
* to fool regclass() into thinking it is part of a
* '[[:posix:]]'. */
if (! is_posix_class) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
}
/* regclass() can only return RESTART_PARSE and NEED_UTF8 if
if (UCHARAT(RExC_parse - 1) == ']') {
break;
}
- RExC_parse++;
+ RExC_parse_inc_by(1);
vFAIL("Unexpected ')'");
}
/* If nothing after the fence, is missing an operand */
if (top_index - fence < 0) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
goto bad_syntax;
}
/* If at least two things on the stack, treat this as an
goto handle_operand;
}
- RExC_parse++;
+ RExC_parse_inc_by(1);
goto bad_syntax;
case '&':
}
unexpected_binary:
- RExC_parse++;
+ RExC_parse_inc_by(1);
vFAIL2("Unexpected binary operator '%c' with no "
"preceding operand", curchar);
}
break;
default:
- RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse_inc();
if (RExC_parse >= RExC_end) {
break;
}
} /* End of switch on next parse token */
- RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse_inc();
} /* End of loop parsing through the construct */
vFAIL("Syntax error in (?[...])");
if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
if (RExC_parse < RExC_end) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
}
vFAIL("Unexpected ']' with no following ')' in (?[...");
if (RExC_sets_depth) { /* If within a recursive call, return in a special
regnode */
- RExC_parse++;
+ RExC_parse_inc_by(1);
node = regpnode(pRExC_state, REGEX_SET, final);
}
else {
/* About to generate an ANYOF (or similar) node from the inversion list
* we have calculated */
save_parse = RExC_parse;
- RExC_parse = SvPV(result_string, len);
+ RExC_parse_set(SvPV(result_string, len));
save_end = RExC_end;
RExC_end = RExC_parse + len;
TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
);
RESTORE_WARNINGS;
- RExC_parse = save_parse + 1;
+ RExC_parse_set(save_parse + 1);
RExC_end = save_end;
SvREFCNT_dec_NN(final);
SvREFCNT_dec_NN(result_string);
assert(RExC_parse <= RExC_end);
if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
- RExC_parse++;
+ RExC_parse_inc_by(1);
invert = TRUE;
allow_mutiple_chars = FALSE;
MARK_NAUGHTY(1);
value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
&numlen, UTF8_ALLOW_DEFAULT);
- RExC_parse += numlen;
+ RExC_parse_inc_by(numlen);
+ }
+ else {
+ value = UCHARAT(RExC_parse);
+ RExC_parse_inc_by(1);
}
- else
- value = UCHARAT(RExC_parse++);
if (value == '[') {
char * posix_class_end;
av_undef(posix_warnings);
}
- RExC_parse = posix_class_end;
+ RExC_parse_set(posix_class_end);
}
else if (namedclass == OOB_NAMEDCLASS) {
not_posix_region_end = posix_class_end;
value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
&numlen, UTF8_ALLOW_DEFAULT);
- RExC_parse += numlen;
+ RExC_parse_inc_by(numlen);
+ }
+ else {
+ value = UCHARAT(RExC_parse);
+ RExC_parse_inc_by(1);
}
- else
- value = UCHARAT(RExC_parse++);
/* Some compilers cannot handle switching on 64-bit integer
* values, therefore value cannot be an UV. Yes, this will
char *e;
if (RExC_pm_flags & PMf_WILDCARD) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
/* diag_listed_as: Use of %s is not allowed in Unicode
property wildcard subpatterns in regex; marked by <--
HERE in m/%s/ */
const U8 c = (U8)value;
e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
if (!e) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
vFAIL2("Missing right brace on \\%c{}", c);
}
- RExC_parse++;
+ RExC_parse_inc_by(1);
/* White space is allowed adjacent to the braces and after
* any '^', even when not under /x */
while (isSPACE(*RExC_parse)) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
}
if (UCHARAT(RExC_parse) == '^') {
* that bit) */
value ^= 'P' ^ 'p';
- RExC_parse++;
+ RExC_parse_inc_by(1);
while (isSPACE(*RExC_parse)) {
- RExC_parse++;
+ RExC_parse_inc_by(1);
}
}
} /* The \p isn't immediately followed by a '{' */
else if (! isALPHA(*RExC_parse)) {
- RExC_parse += (UTF)
- ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
- : 1;
+ RExC_parse_inc_safe();
vFAIL2("Character following \\%c must be '{' or a "
"single-character Unicode property name",
(U8) value);
);
if (SvCUR(msg)) { /* Assumes any error causes a msg */
assert(prop_definition == NULL);
- RExC_parse = e + 1;
+ RExC_parse_set(e + 1);
if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole
thing so, or else the display is
mojibake */
if (strings) {
if (ret_invlist) {
if (! prop_definition) {
- RExC_parse = e + 1;
+ RExC_parse_set(e + 1);
vFAIL("Unicode string properties are not implemented in (?[...])");
}
else {
}
else if (! RExC_in_multi_char_class) {
if (invert ^ (value == 'P')) {
- RExC_parse = e + 1;
+ RExC_parse_set(e + 1);
vFAIL("Inverting a character class which contains"
" a multi-character sequence is illegal");
}
}
}
- RExC_parse = e + 1;
+ RExC_parse_set(e + 1);
namedclass = ANYOF_UNIPROP; /* no official name, but it's
named */
}
{
/* going to die anyway; point to exact spot of
* failure */
- RExC_parse += (UTF)
- ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
- : 1;
+ RExC_parse_inc_safe();
vFAIL(message);
}
value = grok_c_char;
- RExC_parse++;
+ RExC_parse_inc_by(1);
if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
warn_non_literal_string(RExC_parse, packed_warn, message);
}
| PERL_SCAN_NOTIFY_ILLDIGIT;
numlen = (strict) ? 4 : 3;
value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
- RExC_parse += numlen;
+ RExC_parse_inc_by(numlen);
if (numlen != 3) {
if (strict) {
- RExC_parse += (UTF)
- ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
- : 1;
+ RExC_parse_inc_safe();
vFAIL("Need exactly 3 octal digits");
}
else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
/* If the '-' is at the end of the class (just before the ']',
* it is a literal minus; otherwise it is a range */
if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
- RExC_parse = next_char_ptr;
+ RExC_parse_set(next_char_ptr);
/* a bad range like \w-, [:word:]- ? */
if (namedclass > OOB_NAMEDCLASS) {
* reported. See the comments at the definition of
* REPORT_LOCATION_ARGS for details */
RExC_copy_start_in_input = (char *) orig_parse;
- RExC_start = RExC_parse = SvPV(substitute_parse, len);
+ RExC_start = SvPV(substitute_parse, len);
+ RExC_parse_set( RExC_start );
RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
RExC_end = RExC_parse + len;
RExC_in_multi_char_class = 1;
*flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
/* And restore so can parse the rest of the pattern */
- RExC_parse = save_parse;
+ RExC_parse_set(save_parse);
RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
RExC_end = save_end;
RExC_in_multi_char_class = 0;
those two cases, the parse position is advanced beyond all such comments and
white space.
- This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
+ This is the UTF, (?#...), and /x friendly way of saying RExC_parse_inc_by(1).
*/
STATIC void
|| UTF8_IS_INVARIANT(*RExC_parse)
|| UTF8_IS_START(*RExC_parse));
- RExC_parse += (UTF)
- ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
- : 1;
+ RExC_parse_inc_safe();
skip_to_be_ignored_text(pRExC_state, &RExC_parse,
FALSE /* Don't force /x */ );
DEBUG_COMPILE_r(
{
- if (prog->maxlen > 0) {
+ if (prog->maxlen > 0 && (prog->check_utf8 || prog->check_substr)) {
const char * const s = SvPV_nolen_const(RX_UTF8(r)
? prog->check_utf8 : prog->check_substr);