/* Valid for non-utf8 strings: avoids the reginclass
* call if there are no complications: i.e., if everything matchable is
* straight forward in the bitmap */
-#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
+#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
: ANYOF_BITMAP_TEST(p,*(c)))
/*
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
goto fail;
}
- if (prog->check_offset_min == prog->check_offset_max &&
- !(prog->extflags & RXf_CANY_SEEN)) {
+ if (prog->check_offset_min == prog->check_offset_max
+ && !(prog->extflags & RXf_CANY_SEEN)
+ && ! multiline) /* /m can cause \n's to match that aren't
+ accounted for in the string max length.
+ See [perl #115242] */
+ {
/* Substring at constant offset from beg-of-str... */
I32 slen;
switch (OP(c)) {
case ANYOF:
if (utf8_target) {
- STRLEN inclasslen = strend - s;
REXEC_FBC_UTF8_CLASS_SCAN(
- reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
+ reginclass(prog, c, (U8*)s, utf8_target));
}
else {
REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
const bool utf8_target = PL_reg_match_utf8;
- UV c1, c2;
+ UV c1 = CHRTEST_NOT_A_CP_1;
+ UV c2 = CHRTEST_NOT_A_CP_2;
bool use_chrtest_void = FALSE;
/* Used when we have both utf8 input and utf8 output, to avoid converting
c2 = PL_fold_latin1[c1];
break;
- default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
+ default:
+ Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
+ assert(0); /* NOTREACHED */
}
}
}
reenter_switch:
SET_nextchr;
+ assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
switch (state_num) {
case BOL: /* /^../ */
st->u.keeper.val = rex->offs[0].start;
rex->offs[0].start = locinput - PL_bostr;
PUSH_STATE_GOTO(KEEPS_next, next, locinput);
- /*NOT-REACHED*/
+ assert(0); /*NOTREACHED*/
case KEEPS_next_fail:
/* rollback the start point change */
rex->offs[0].start = st->u.keeper.val;
sayNO_SILENT;
- /*NOT-REACHED*/
+ assert(0); /*NOTREACHED*/
case EOL: /* /..$/ */
goto seol;
}
/* Neither the target nor the pattern are utf8 */
- if (UCHARAT(s) != nextchr &&
- UCHARAT(s) != fold_array[nextchr])
+ if (UCHARAT(s) != nextchr
+ && !NEXTCHR_IS_EOS
+ && UCHARAT(s) != fold_array[nextchr])
{
sayNO;
}
if (NEXTCHR_IS_EOS)
sayNO;
if (utf8_target) {
- STRLEN inclasslen = PL_regeol - locinput;
- if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
+ if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
sayNO;
- locinput += inclasslen;
+ locinput += UTF8SKIP(locinput);
break;
}
else {
/* This call case insensitively compares the entire buffer
* at s, with the current input starting at locinput, but
* not going off the end given by PL_regeol, and returns in
- * limit upon success, how much of the current input was
+ * <limit> upon success, how much of the current input was
* matched */
if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
locinput, &limit, 0, utf8_target, utf8_fold_flags))
/* this is a point to jump to in order to increment
* locinput by one character */
increment_locinput:
+ assert(!NEXTCHR_IS_EOS);
if (utf8_target) {
locinput += PL_utf8skip[nextchr];
/* locinput is allowed to go 1 char off the end, but not 2+ */
/*
- regrepeat - repeatedly match something simple, report how many
*
+ * What 'simple' means is a node which can be the operand of a quantifier like
+ * '+', or {1,3}
+ *
* startposp - pointer a pointer to the start position. This is updated
* to point to the byte following the highest successful
* match.
* p - the regnode to be repeatedly matched against.
- * max - maximum number of characters to match.
+ * max - maximum number of things to match.
* depth - (for debugging) backtracking depth.
*/
STATIC I32
S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
{
dVAR;
- char *scan;
+ char *scan; /* Pointer to current position in target string */
I32 c;
- char *loceol = PL_regeol;
- I32 hardcount = 0;
+ char *loceol = PL_regeol; /* local version */
+ I32 hardcount = 0; /* How many matches so far */
bool utf8_target = PL_reg_match_utf8;
UV utf8_flags;
#ifndef DEBUGGING
scan = *startposp;
if (max == REG_INFTY)
max = I32_MAX;
- else if (max < loceol - scan)
+ else if (! utf8_target && scan + max < loceol)
loceol = scan + max;
+
+ /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
+ * to the maximum of how far we should go in it (leaving it set to the real
+ * end, if the maximum permissible would take us beyond that). This allows
+ * us to make the loop exit condition that we haven't gone past <loceol> to
+ * also mean that we haven't exceeded the max permissible count, saving a
+ * test each time through the loop. But it assumes that the OP matches a
+ * single byte, which is true for most of the OPs below when applied to a
+ * non-UTF-8 target. Those relatively few OPs that don't have this
+ * characteristic will have to compensate.
+ *
+ * There is no adjustment for UTF-8 targets, as the number of bytes per
+ * character varies. OPs will have to test both that the count is less
+ * than the max permissible (using <hardcount> to keep track), and that we
+ * are still within the bounds of the string (using <loceol>). A few OPs
+ * match a single byte no matter what the encoding. They can omit the max
+ * test if, for the UTF-8 case, they do the adjustment that was skipped
+ * above.
+ *
+ * Thus, the code above sets things up for the common case; and exceptional
+ * cases need extra work; the common case is to make sure <scan> doesn't
+ * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
+ * count doesn't exceed the maximum permissible */
+
switch (OP(p)) {
case REG_ANY:
if (utf8_target) {
- loceol = PL_regeol;
while (scan < loceol && hardcount < max && *scan != '\n') {
scan += UTF8SKIP(scan);
hardcount++;
break;
case SANY:
if (utf8_target) {
- loceol = PL_regeol;
while (scan < loceol && hardcount < max) {
scan += UTF8SKIP(scan);
hardcount++;
else
scan = loceol;
break;
- case CANY:
- scan = loceol;
+ case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
+ if (utf8_target && scan + max < loceol) {
+
+ /* <loceol> hadn't been adjusted in the UTF-8 case */
+ scan += max;
+ }
+ else {
+ scan = loceol;
+ }
break;
case EXACT:
assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
* can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
* true iff it doesn't matter if the argument is in UTF-8 or not */
if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
+ if (utf8_target && scan + max < loceol) {
+ /* We didn't adjust <loceol> because the target is UTF-8, but ok to do so,
+ * since here, to match at all, 1 char == 1 byte */
+ loceol = scan + max;
+ }
while (scan < loceol && UCHARAT(scan) == c) {
scan++;
}
else if (UTF_PATTERN) {
if (utf8_target) {
STRLEN scan_char_len;
- loceol = PL_regeol;
- /* When both target and pattern are UTF-8, we have to do s
+ /* When both target and pattern are UTF-8, we have to do
* string EQ */
while (hardcount < max
&& scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
* then look for those in sequence in the utf8 string */
U8 high = UTF8_TWO_BYTE_HI(c);
U8 low = UTF8_TWO_BYTE_LO(c);
- loceol = PL_regeol;
while (hardcount < max
&& scan + 1 < loceol
if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
if (c1 == CHRTEST_VOID) {
- /* Use full Unicode fold matching */
- char *tmpeol = loceol;
- STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
- while (hardcount < max
- && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
- STRING(p), NULL, pat_len, cBOOL(UTF_PATTERN), utf8_flags))
- {
- scan = tmpeol;
- tmpeol = loceol;
- hardcount++;
- }
+ /* Use full Unicode fold matching */
+ char *tmpeol = PL_regeol;
+ STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
+ while (hardcount < max
+ && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
+ STRING(p), NULL, pat_len,
+ cBOOL(UTF_PATTERN), utf8_flags))
+ {
+ scan = tmpeol;
+ tmpeol = PL_regeol;
+ hardcount++;
+ }
}
else if (utf8_target) {
if (c1 == c2) {
- while (hardcount < max
+ while (scan < loceol
+ && hardcount < max
&& memEQ(scan, c1_utf8, UTF8SKIP(scan)))
{
scan += UTF8SKIP(scan);
}
}
else {
- while (hardcount < max
+ while (scan < loceol
+ && hardcount < max
&& (memEQ(scan, c1_utf8, UTF8SKIP(scan))
|| memEQ(scan, c2_utf8, UTF8SKIP(scan))))
{
case ANYOF:
if (utf8_target) {
STRLEN inclasslen;
- loceol = PL_regeol;
- inclasslen = loceol - scan;
while (hardcount < max
- && ((inclasslen = loceol - scan) > 0)
- && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
+ && scan + (inclasslen = UTF8SKIP(scan)) <= loceol
+ && reginclass(prog, p, (U8*)scan, utf8_target))
{
scan += inclasslen;
hardcount++;
case ALNUMU:
if (utf8_target) {
utf8_wordchar:
- loceol = PL_regeol;
LOAD_UTF8_CHARCLASS_ALNUM();
while (hardcount < max && scan < loceol &&
swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
}
break;
case ALNUMA:
+ if (utf8_target && scan + max < loceol) {
+
+ /* We didn't adjust <loceol> because the target is UTF-8, but ok to do so,
+ * since here, to match, 1 char == 1 byte */
+ loceol = scan + max;
+ }
while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
scan++;
}
case ALNUML:
PL_reg_flags |= RF_tainted;
if (utf8_target) {
- loceol = PL_regeol;
while (hardcount < max && scan < loceol &&
isALNUM_LC_utf8((U8*)scan)) {
scan += UTF8SKIP(scan);
utf8_Nwordchar:
- loceol = PL_regeol;
LOAD_UTF8_CHARCLASS_ALNUM();
while (hardcount < max && scan < loceol &&
! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
break;
case POSIXA:
- while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
+ if (utf8_target && scan + max < loceol) {
+
+ /* We didn't adjust <loceol> because the target is UTF-8, but ok to do so,
+ * since here, to match, 1 char == 1 byte */
+ loceol = scan + max;
+ }
+ while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
scan++;
}
break;
case NPOSIXA:
if (utf8_target) {
- while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
+ while (scan < loceol && hardcount < max
+ && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
+ {
scan += UTF8SKIP(scan);
+ hardcount++;
}
}
else {
break;
case NALNUMA:
if (utf8_target) {
- while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
+ while (scan < loceol && hardcount < max
+ && ! isWORDCHAR_A((U8) *scan))
+ {
scan += UTF8SKIP(scan);
+ hardcount++;
}
}
else {
case NALNUML:
PL_reg_flags |= RF_tainted;
if (utf8_target) {
- loceol = PL_regeol;
while (hardcount < max && scan < loceol &&
!isALNUM_LC_utf8((U8*)scan)) {
scan += UTF8SKIP(scan);
utf8_space:
- loceol = PL_regeol;
LOAD_UTF8_CHARCLASS_SPACE();
while (hardcount < max && scan < loceol &&
(*scan == ' ' ||
}
break;
case SPACEA:
+ if (utf8_target && scan + max < loceol) {
+
+ /* We didn't adjust <loceol> because the target is UTF-8, but ok to do so,
+ * since here, to match, 1 char == 1 byte */
+ loceol = scan + max;
+ }
while (scan < loceol && isSPACE_A((U8) *scan)) {
scan++;
}
case SPACEL:
PL_reg_flags |= RF_tainted;
if (utf8_target) {
- loceol = PL_regeol;
while (hardcount < max && scan < loceol &&
isSPACE_LC_utf8((U8*)scan)) {
scan += UTF8SKIP(scan);
utf8_Nspace:
- loceol = PL_regeol;
LOAD_UTF8_CHARCLASS_SPACE();
while (hardcount < max && scan < loceol &&
! (*scan == ' ' ||
break;
case NSPACEA:
if (utf8_target) {
- while (scan < loceol && ! isSPACE_A((U8) *scan)) {
+ while (hardcount < max && scan < loceol
+ && ! isSPACE_A((U8) *scan))
+ {
scan += UTF8SKIP(scan);
+ hardcount++;
}
}
else {
case NSPACEL:
PL_reg_flags |= RF_tainted;
if (utf8_target) {
- loceol = PL_regeol;
while (hardcount < max && scan < loceol &&
!isSPACE_LC_utf8((U8*)scan)) {
scan += UTF8SKIP(scan);
break;
case DIGIT:
if (utf8_target) {
- loceol = PL_regeol;
LOAD_UTF8_CHARCLASS_DIGIT();
while (hardcount < max && scan < loceol &&
swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
}
break;
case DIGITA:
+ if (utf8_target && scan + max < loceol) {
+
+ /* We didn't adjust <loceol> because the target is UTF-8, but ok to do so,
+ * since here, to match, 1 char == 1 byte */
+ loceol = scan + max;
+ }
while (scan < loceol && isDIGIT_A((U8) *scan)) {
scan++;
}
case DIGITL:
PL_reg_flags |= RF_tainted;
if (utf8_target) {
- loceol = PL_regeol;
while (hardcount < max && scan < loceol &&
isDIGIT_LC_utf8((U8*)scan)) {
scan += UTF8SKIP(scan);
break;
case NDIGIT:
if (utf8_target) {
- loceol = PL_regeol;
LOAD_UTF8_CHARCLASS_DIGIT();
while (hardcount < max && scan < loceol &&
!swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
break;
case NDIGITA:
if (utf8_target) {
- while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
+ while (hardcount < max && scan < loceol
+ && ! isDIGIT_A((U8) *scan)) {
scan += UTF8SKIP(scan);
+ hardcount++;
}
}
else {
case NDIGITL:
PL_reg_flags |= RF_tainted;
if (utf8_target) {
- loceol = PL_regeol;
while (hardcount < max && scan < loceol &&
!isDIGIT_LC_utf8((U8*)scan)) {
scan += UTF8SKIP(scan);
break;
case LNBREAK:
if (utf8_target) {
- loceol = PL_regeol;
while (hardcount < max && scan < loceol &&
(c=is_LNBREAK_utf8_safe(scan, loceol))) {
scan += c;
hardcount++;
}
} else {
- /*
- LNBREAK can match two latin chars, which is ok,
- because we have a null terminated string, but we
- have to use hardcount in this situation
- */
+ /* LNBREAK can match one or two latin chars, which is ok, but we
+ * have to use hardcount in this situation, and throw away the
+ * adjustment to <loceol> done before the switch statement */
+ loceol = PL_regeol;
while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
scan+=c;
hardcount++;
}
- }
+ }
break;
case HORIZWS:
if (utf8_target) {
- loceol = PL_regeol;
while (hardcount < max && scan < loceol &&
(c=is_HORIZWS_utf8_safe(scan, loceol)))
{
break;
case NHORIZWS:
if (utf8_target) {
- loceol = PL_regeol;
while (hardcount < max && scan < loceol &&
!is_HORIZWS_utf8_safe(scan, loceol))
{
break;
case VERTWS:
if (utf8_target) {
- loceol = PL_regeol;
while (hardcount < max && scan < loceol &&
(c=is_VERTWS_utf8_safe(scan, loceol)))
{
break;
case NVERTWS:
if (utf8_target) {
- loceol = PL_regeol;
while (hardcount < max && scan < loceol &&
!is_VERTWS_utf8_safe(scan, loceol))
{
}
break;
- default: /* Called on something of 0 width. */
- break; /* So match right here or not at all. */
+ case BOUND:
+ case BOUNDA:
+ case BOUNDL:
+ case BOUNDU:
+ case EOS:
+ case GPOS:
+ case KEEPS:
+ case NBOUND:
+ case NBOUNDA:
+ case NBOUNDL:
+ case NBOUNDU:
+ case OPFAIL:
+ case SBOL:
+ case SEOL:
+ /* These are all 0 width, so match right here or not at all. */
+ break;
+
+ default:
+ Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
+ assert(0); /* NOTREACHED */
+
}
if (hardcount)
n is the ANYOF regnode
p is the target string
- lenp is pointer to the maximum number of bytes of how far to go in p
- (This is assumed wthout checking to always be at least the current
- character's size)
utf8_target tells whether p is in UTF-8.
- Returns true if matched; false otherwise. If lenp is not NULL, on return
- from a successful match, the value it points to will be updated to how many
- bytes in p were matched. If there was no match, the value is undefined,
- possibly changed from the input.
+ Returns true if matched; false otherwise.
Note that this can be a synthetic start class, a combination of various
nodes, so things you think might be mutually exclusive, such as locale,
*/
STATIC bool
-S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
+S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, register const bool utf8_target)
{
dVAR;
const char flags = ANYOF_FLAGS(n);
bool match = FALSE;
UV c = *p;
- STRLEN c_len = 0;
- STRLEN maxlen;
PERL_ARGS_ASSERT_REGINCLASS;
- /* If c is not already the code point, get it */
- if (utf8_target && !UTF8_IS_INVARIANT(c)) {
+ /* If c is not already the code point, get it. Note that
+ * UTF8_IS_INVARIANT() works even if not in UTF-8 */
+ if (! UTF8_IS_INVARIANT(c) && utf8_target) {
+ STRLEN c_len = 0;
c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
(UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
| UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
if (c_len == (STRLEN)-1)
Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
}
- else {
- c_len = 1;
- }
-
- /* Use passed in max length, or one character if none passed in or less
- * than one character. And assume will match just one character. This is
- * overwritten later if matched more. */
- if (lenp) {
- maxlen = (*lenp > c_len) ? *lenp : c_len;
- *lenp = c_len;
-
- }
- else {
- maxlen = c_len;
- }
/* If this character is potentially in the bitmap, check it */
if (c < 256) {