/* Valid for non-utf8 strings: avoids the reginclass
* call if there are no complications: i.e., if everything matchable is
* straight forward in the bitmap */
-#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
+#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
: ANYOF_BITMAP_TEST(p,*(c)))
/*
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
goto fail;
}
- if (prog->check_offset_min == prog->check_offset_max &&
- !(prog->extflags & RXf_CANY_SEEN)) {
+ if (prog->check_offset_min == prog->check_offset_max
+ && !(prog->extflags & RXf_CANY_SEEN)
+ && ! multiline) /* /m can cause \n's to match that aren't
+ accounted for in the string max length.
+ See [perl #115242] */
+ {
/* Substring at constant offset from beg-of-str... */
I32 slen;
switch (OP(c)) {
case ANYOF:
if (utf8_target) {
- STRLEN inclasslen = strend - s;
REXEC_FBC_UTF8_CLASS_SCAN(
- reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
+ reginclass(prog, c, (U8*)s, utf8_target));
}
else {
REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
const bool utf8_target = PL_reg_match_utf8;
- UV c1, c2;
+ UV c1 = CHRTEST_NOT_A_CP_1;
+ UV c2 = CHRTEST_NOT_A_CP_2;
bool use_chrtest_void = FALSE;
/* Used when we have both utf8 input and utf8 output, to avoid converting
c2 = PL_fold_latin1[c1];
break;
- default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
+ default:
+ Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
+ assert(0); /* NOTREACHED */
}
}
}
st->u.keeper.val = rex->offs[0].start;
rex->offs[0].start = locinput - PL_bostr;
PUSH_STATE_GOTO(KEEPS_next, next, locinput);
- /*NOT-REACHED*/
+ assert(0); /*NOTREACHED*/
case KEEPS_next_fail:
/* rollback the start point change */
rex->offs[0].start = st->u.keeper.val;
sayNO_SILENT;
- /*NOT-REACHED*/
+ assert(0); /*NOTREACHED*/
case EOL: /* /..$/ */
goto seol;
if (NEXTCHR_IS_EOS)
sayNO;
if (utf8_target) {
- STRLEN inclasslen = PL_regeol - locinput;
- if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
+ if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
sayNO;
- locinput += inclasslen;
+ locinput += UTF8SKIP(locinput);
break;
}
else {
/*
- regrepeat - repeatedly match something simple, report how many
*
+ * What 'simple' means is a node which can be the operand of a quantifier like
+ * '+', or {1,3}
+ *
* startposp - pointer a pointer to the start position. This is updated
* to point to the byte following the highest successful
* match.
case ANYOF:
if (utf8_target) {
STRLEN inclasslen;
- inclasslen = loceol - scan;
while (hardcount < max
- && ((inclasslen = loceol - scan) > 0)
- && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
+ && scan + (inclasslen = UTF8SKIP(scan)) <= loceol
+ && reginclass(prog, p, (U8*)scan, utf8_target))
{
scan += inclasslen;
hardcount++;
}
break;
case LNBREAK:
- Perl_croak(aTHX_ "panic: regrepeat() should not be called with non-simple: LNBREAK");
- assert(0); /* NOTREACHED */
+ if (utf8_target) {
+ while (hardcount < max && scan < loceol &&
+ (c=is_LNBREAK_utf8_safe(scan, loceol))) {
+ scan += c;
+ hardcount++;
+ }
+ } else {
+ /* LNBREAK can match one or two latin chars, which is ok, but we
+ * have to use hardcount in this situation, and throw away the
+ * adjustment to <loceol> done before the switch statement */
+ loceol = PL_regeol;
+ while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
+ scan+=c;
+ hardcount++;
+ }
+ }
+ break;
case HORIZWS:
if (utf8_target) {
while (hardcount < max && scan < loceol &&
}
break;
- default: /* Called on something of 0 width. */
- break; /* So match right here or not at all. */
+ case BOUND:
+ case BOUNDA:
+ case BOUNDL:
+ case BOUNDU:
+ case EOS:
+ case GPOS:
+ case KEEPS:
+ case NBOUND:
+ case NBOUNDA:
+ case NBOUNDL:
+ case NBOUNDU:
+ case OPFAIL:
+ case SBOL:
+ case SEOL:
+ /* These are all 0 width, so match right here or not at all. */
+ break;
+
+ default:
+ Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
+ assert(0); /* NOTREACHED */
+
}
if (hardcount)
n is the ANYOF regnode
p is the target string
- lenp is pointer to the maximum number of bytes of how far to go in p
- (This is assumed wthout checking to always be at least the current
- character's size)
utf8_target tells whether p is in UTF-8.
- Returns true if matched; false otherwise. If lenp is not NULL, on return
- from a successful match, the value it points to will be updated to how many
- bytes in p were matched. If there was no match, the value is undefined,
- possibly changed from the input.
+ Returns true if matched; false otherwise.
Note that this can be a synthetic start class, a combination of various
nodes, so things you think might be mutually exclusive, such as locale,
*/
STATIC bool
-S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
+S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, register const bool utf8_target)
{
dVAR;
const char flags = ANYOF_FLAGS(n);
bool match = FALSE;
UV c = *p;
- STRLEN c_len = 0;
- STRLEN maxlen;
PERL_ARGS_ASSERT_REGINCLASS;
- /* If c is not already the code point, get it */
- if (utf8_target && !UTF8_IS_INVARIANT(c)) {
+ /* If c is not already the code point, get it. Note that
+ * UTF8_IS_INVARIANT() works even if not in UTF-8 */
+ if (! UTF8_IS_INVARIANT(c) && utf8_target) {
+ STRLEN c_len = 0;
c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
(UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
| UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
if (c_len == (STRLEN)-1)
Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
}
- else {
- c_len = 1;
- }
-
- /* Use passed in max length, or one character if none passed in or less
- * than one character. And assume will match just one character. This is
- * overwritten later if matched more. */
- if (lenp) {
- maxlen = (*lenp > c_len) ? *lenp : c_len;
- *lenp = c_len;
-
- }
- else {
- maxlen = c_len;
- }
/* If this character is potentially in the bitmap, check it */
if (c < 256) {