X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/895cc420d0398ff184560679b40f5f2c0af72366..bd8446cfdeebd0d46fb2f59b7a8ad1ef248ac0d3:/regexec.c

diff --git a/regexec.c b/regexec.c
index 6ace8b6..989affa 100644
--- a/regexec.c
+++ b/regexec.c
@@ -121,6 +121,18 @@
 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 
+
+#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
+#define NEXTCHR_IS_EOS (nextchr < 0)
+
+#define SET_nextchr \
+    nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
+
+#define SET_locinput(p) \
+    locinput = (p);  \
+    SET_nextchr
+
+
 /* these are unrolled below in the CCC_TRY_XXX defined */
 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
     if (!CAT2(PL_utf8_,class)) { \
@@ -165,7 +177,7 @@
  * fails, or advance to the next character */
 
 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
-    if (locinput >= PL_regeol) {                                              \
+    if (NEXTCHR_IS_EOS) {                                              \
 	sayNO;                                                                \
     }                                                                         \
     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
@@ -219,14 +231,14 @@
 	_CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 		       CLASS, STR)                                            \
     case NAMEA:                                                               \
-	if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
+	if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) {                      \
 	    sayNO;                                                            \
 	}                                                                     \
 	/* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 	locinput++;                                        \
 	break;                                                                \
     case NNAMEA:                                                              \
-	if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
+	if (NEXTCHR_IS_EOS || FUNCA(nextchr)) {                        \
 	    sayNO;                                                            \
 	}                                                                     \
         goto increment_locinput;                                              \
@@ -597,7 +609,21 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 	goto fail;
     }
                 
-    strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
+    /* XXX we need to pass strbeg as a separate arg: the following is
+     * guesswork and can be wrong... */
+    if (sv && SvPOK(sv)) {
+        char * p   = SvPVX(sv);
+        STRLEN cur = SvCUR(sv); 
+        if (p <= strpos && strpos < p + cur) {
+            strbeg = p;
+            assert(p <= strend && strend <= p + cur);
+        }
+        else
+            strbeg = strend - cur;
+    }
+    else 
+        strbeg = strpos;
+
     PL_regeol = strend;
     if (utf8_target) {
 	if (!prog->check_utf8 && prog->check_substr)
@@ -1238,7 +1264,7 @@ STMT_START {                                              \
 
 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
 STMT_START {                                          \
-    while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
+    while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) {     \
 	CoDe                                          \
 	s += uskip;                                   \
     }                                                 \
@@ -1763,32 +1789,32 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 	    break;
 	case LNBREAK:
 	    REXEC_FBC_CSCAN(
-		is_LNBREAK_utf8(s),
-		is_LNBREAK_latin1(s)
+		is_LNBREAK_utf8_safe(s, strend),
+		is_LNBREAK_latin1_safe(s, strend)
 	    );
 	    break;
 	case VERTWS:
 	    REXEC_FBC_CSCAN(
-		is_VERTWS_utf8(s),
-		is_VERTWS_latin1(s)
+		is_VERTWS_utf8_safe(s, strend),
+		is_VERTWS_latin1_safe(s, strend)
 	    );
 	    break;
 	case NVERTWS:
 	    REXEC_FBC_CSCAN(
-		!is_VERTWS_utf8(s),
-		!is_VERTWS_latin1(s)
+		!is_VERTWS_utf8_safe(s, strend),
+		!is_VERTWS_latin1_safe(s, strend)
 	    );
 	    break;
 	case HORIZWS:
 	    REXEC_FBC_CSCAN(
-		is_HORIZWS_utf8(s),
-		is_HORIZWS_latin1(s)
+		is_HORIZWS_utf8_safe(s, strend),
+		is_HORIZWS_latin1_safe(s, strend)
 	    );
 	    break;
 	case NHORIZWS:
 	    REXEC_FBC_CSCAN(
-		!is_HORIZWS_utf8(s),
-		!is_HORIZWS_latin1(s)
+		!is_HORIZWS_utf8_safe(s, strend),
+		!is_HORIZWS_latin1_safe(s, strend)
 	    );	    
 	    break;
 	case POSIXA:
@@ -1923,16 +1949,24 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                             
                         }
                         points[pointpos++ % maxlen]= uc;
-			REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
+                        if (foldlen || uc < (U8*)strend) {
+                            REXEC_TRIE_READ_CHAR(trie_type, trie,
+                                             widecharmap, uc,
 					     uscan, len, uvc, charid, foldlen,
 					     foldbuf, uniflags);
-                        DEBUG_TRIE_EXECUTE_r({
-                            dump_exec_pos( (char *)uc, c, strend, real_start, 
-                                s,   utf8_target );
-                            PerlIO_printf(Perl_debug_log,
-                                " Charid:%3u CP:%4"UVxf" ",
-                                 charid, uvc);
-                        });
+                            DEBUG_TRIE_EXECUTE_r({
+                                dump_exec_pos( (char *)uc, c, strend,
+                                            real_start, s, utf8_target);
+                                PerlIO_printf(Perl_debug_log,
+                                    " Charid:%3u CP:%4"UVxf" ",
+                                     charid, uvc);
+                            });
+                        }
+                        else {
+                            len = 0;
+                            charid = 0;
+                        }
+
 
                         do {
 #ifdef DEBUGGING
@@ -2380,7 +2414,11 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre
 		while (s <= last1) {
 		    if (regtry(&reginfo, &s))
 			goto got_it;
-		    s += UTF8SKIP(s);
+                    if (s >= last1) {
+                        s++; /* to break out of outer loop */
+                        break;
+                    }
+                    s += UTF8SKIP(s);
 		}
 	    }
 	    else {
@@ -2682,7 +2720,6 @@ phooey:
         Safefree(prog->offs);
         prog->offs = swap;
     }
-
     return 0;
 }
 
@@ -3288,7 +3325,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	st = PL_regmatch_state = S_push_slab(aTHX);
 
     /* Note that nextchr is a byte even in UTF */
-    nextchr = UCHARAT(locinput);
+    SET_nextchr;
     scan = prog;
     while (scan != NULL) {
 
@@ -3313,8 +3350,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
       reenter_switch:
 
-        nextchr = UCHARAT(locinput);
-        assert(nextchr >= 0);
+        SET_nextchr;
 
 	switch (state_num) {
 	case BOL: /*  /^../  */
@@ -3327,7 +3363,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
 	case MBOL: /*  /^../m  */
 	    if (locinput == PL_bostr ||
-		((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
+		(!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
 	    {
 		break;
 	    }
@@ -3359,36 +3395,36 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		goto seol;
 
 	case MEOL: /* /..$/m  */
-	    if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
+	    if (!NEXTCHR_IS_EOS && nextchr != '\n')
 		sayNO;
 	    break;
 
 	case SEOL: /* /..$/s  */
 	  seol:
-	    if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
+	    if (!NEXTCHR_IS_EOS && nextchr != '\n')
 		sayNO;
 	    if (PL_regeol - locinput > 1)
 		sayNO;
 	    break;
 
 	case EOS: /*  \z  */
-	    if (PL_regeol != locinput)
+	    if (!NEXTCHR_IS_EOS)
 		sayNO;
 	    break;
 
 	case SANY: /*  /./s  */
-	    if (!nextchr && locinput >= PL_regeol)
+	    if (NEXTCHR_IS_EOS)
 		sayNO;
             goto increment_locinput;
 
 	case CANY: /*  \C  */
-	    if (!nextchr && locinput >= PL_regeol)
+	    if (NEXTCHR_IS_EOS)
 		sayNO;
 	    locinput++;
 	    break;
 
 	case REG_ANY: /*  /./  */
-	    if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
+	    if ((NEXTCHR_IS_EOS) || nextchr == '\n')
 		sayNO;
             goto increment_locinput;
 
@@ -3399,7 +3435,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             /* In this case the charclass data is available inline so
                we can fail fast without a lot of extra overhead. 
              */
-            if(!ANYOF_BITMAP_TEST(scan, nextchr)) {
+            if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
                 DEBUG_EXECUTE_r(
                     PerlIO_printf(Perl_debug_log,
                               "%*s  %sfailed to match trie start class...%s\n",
@@ -3464,7 +3500,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
                 U32 state = trie->startstate;
 
-                if (trie->bitmap && !TRIE_BITMAP_TEST(trie, nextchr) ) {
+                if (   trie->bitmap
+                    && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
+                {
         	    if (trie->states[ state ].wordnum) {
         	         DEBUG_EXECUTE_r(
                             PerlIO_printf(Perl_debug_log,
@@ -3537,7 +3575,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		    });
 
 		    /* read a char and goto next state */
-		    if ( base ) {
+		    if ( base && (foldlen || uc < (U8*)PL_regeol)) {
 			I32 offset;
 			REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
 					     uscan, len, uvc, charid, foldlen,
@@ -3808,6 +3846,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	    locinput += ln;
 	    break;
 	    }
+
 	case EXACTFL: {          /*  /abc/il      */
 	    re_fold_t folder;
 	    const U8 * fold_array;
@@ -3899,12 +3938,17 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		}
 		if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
 		    ln = isALNUM_uni(ln);
-		    LOAD_UTF8_CHARCLASS_ALNUM();
-		    n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
+                    if (NEXTCHR_IS_EOS)
+                        n = 0;
+                    else {
+                        LOAD_UTF8_CHARCLASS_ALNUM();
+                        n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
+                                                                utf8_target);
+                    }
 		}
 		else {
 		    ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
-		    n = isALNUM_LC_utf8((U8*)locinput);
+		    n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
 		}
 	    }
 	    else {
@@ -3925,20 +3969,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		switch (FLAGS(scan)) {
 		    case REGEX_UNICODE_CHARSET:
 			ln = isWORDCHAR_L1(ln);
-			n = isWORDCHAR_L1(nextchr);
+			n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
 			break;
 		    case REGEX_LOCALE_CHARSET:
 			ln = isALNUM_LC(ln);
-			n = isALNUM_LC(nextchr);
+			n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
 			break;
 		    case REGEX_DEPENDS_CHARSET:
 			ln = isALNUM(ln);
-			n = isALNUM(nextchr);
+			n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
 			break;
 		    case REGEX_ASCII_RESTRICTED_CHARSET:
 		    case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
 			ln = isWORDCHAR_A(ln);
-			n = isWORDCHAR_A(nextchr);
+			n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
 			break;
 		    default:
 			Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
@@ -3953,19 +3997,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
 	case ANYOFV: /*  /[abx{df}]/i  */
 	case ANYOF:  /*  /[abc]/       */
+            if (NEXTCHR_IS_EOS)
+                sayNO;
 	    if (utf8_target || state_num == ANYOFV) {
 	        STRLEN inclasslen = PL_regeol - locinput;
-		if (locinput >= PL_regeol)
-		    sayNO;
-
 	        if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
 		    sayNO;
 		locinput += inclasslen;
 		break;
 	    }
 	    else {
-		if (!nextchr && locinput >= PL_regeol)
-		    sayNO;
 		if (!REGINCLASS(rex, scan, (U8*)locinput))
 		    sayNO;
 		locinput++;
@@ -3993,7 +4034,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		digit, "0");
 
         case POSIXA: /* /[[:ascii:]]/ etc */
-            if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
+            if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
                 sayNO;
             }
             /* Matched a utf8-invariant, so don't have to worry about utf8 */
@@ -4001,7 +4042,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             break;
 
         case NPOSIXA: /*  /[^[:ascii:]]/  etc */
-            if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) {
+            if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
                 sayNO;
             }
             goto increment_locinput;
@@ -4040,7 +4081,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	       Prepend, that one will be a suitable Begin.
 	    */
 
-	    if (locinput >= PL_regeol)
+	    if (NEXTCHR_IS_EOS)
 		sayNO;
 	    if  (! utf8_target) {
 
@@ -4056,7 +4097,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
 		/* Utf8: See if is ( CR LF ); already know that locinput <
 		 * PL_regeol, so locinput+1 is in bounds */
-		if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
+		if ( nextchr == '\r' && locinput+1 < PL_regeol
+                        && UCHARAT(locinput + 1) == '\n')
+                {
 		    locinput += 2;
 		}
 		else {
@@ -4311,7 +4354,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	    }
 
 	    /* Not utf8:  Inline the first character, for speed. */
-	    if (UCHARAT(s) != nextchr &&
+	    if (!NEXTCHR_IS_EOS &&
+                UCHARAT(s) != nextchr &&
 		(type == REF ||
 		 UCHARAT(s) != fold_array[nextchr]))
 		sayNO;
@@ -5325,7 +5369,8 @@ NULL
 		    (int)(REPORT_CODE_OFF+(depth*2)),
 		    "", (IV)ST.count)
 		);
-	    if (ST.c1 != CHRTEST_VOID
+	    if (       !NEXTCHR_IS_EOS
+                    && ST.c1 != CHRTEST_VOID
 		    && nextchr != ST.c1
 		    && nextchr != ST.c2)
 	    {
@@ -5379,8 +5424,7 @@ NULL
 	    if (ST.count == ARG1(ST.me) /* min */)
 		sayNO;
 	    ST.count--;
-	    locinput = HOPc(locinput, -ST.alen);
-            nextchr = UCHARAT(locinput);
+	    SET_locinput(HOPc(locinput, -ST.alen));
 	    goto curlym_do_B; /* try to match B */
 
 #undef ST
@@ -5511,8 +5555,7 @@ NULL
 		minmod = 0;
 		if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
 		    sayNO;
-                locinput = li;
-                nextchr = UCHARAT(locinput);
+                SET_locinput(li);
 		ST.count = ST.min;
 		REGCP_SET(ST.cp);
 		if (ST.c1 == CHRTEST_VOID)
@@ -5549,8 +5592,7 @@ NULL
 		ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
 		if (ST.count < ST.min)
 		    sayNO;
-                locinput = li;
-                nextchr = UCHARAT(locinput);
+                SET_locinput(li);
 		if ((ST.count > ST.min)
 		    && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
 		{
@@ -5690,12 +5732,15 @@ NULL
             }
 	    {
 		UV c = 0;
-		if (ST.c1 != CHRTEST_VOID)
+		if (ST.c1 != CHRTEST_VOID && locinput < PL_regeol)
 		    c = utf8_target ? utf8n_to_uvchr((U8*)locinput,
 					   UTF8_MAXBYTES, 0, uniflags)
 				: (UV) UCHARAT(locinput);
 		/* If it could work, try it. */
-		if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
+		if (ST.c1 == CHRTEST_VOID
+                    || (locinput < PL_regeol &&
+                        (c == (UV)ST.c1 || c == (UV)ST.c2)))
+                {
 		    CURLY_SETPAREN(ST.paren, ST.count);
 		    PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
 		    assert(0); /* NOTREACHED */
@@ -5949,7 +5994,7 @@ NULL
 #undef ST
 
         case LNBREAK: /* \R */
-            if ((n=is_LNBREAK(locinput,utf8_target))) {
+            if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
                 locinput += n;
             } else
                 sayNO;
@@ -5957,7 +6002,7 @@ NULL
 
 #define CASE_CLASS(nAmE)                              \
         case nAmE:                                    \
-	    if (locinput >= PL_regeol)                \
+	    if (NEXTCHR_IS_EOS)                       \
 		sayNO;                                \
             if ((n=is_##nAmE(locinput,utf8_target))) {    \
                 locinput += n;                        \
@@ -5965,7 +6010,7 @@ NULL
                 sayNO;                                \
             break;                                    \
         case N##nAmE:                                 \
-	    if (locinput >= PL_regeol)                \
+	    if (NEXTCHR_IS_EOS)                       \
 		sayNO;                                \
             if ((n=is_##nAmE(locinput,utf8_target))) {    \
                 sayNO;                                \
@@ -5988,6 +6033,7 @@ NULL
         increment_locinput:
             if (utf8_target) {
                 locinput += PL_utf8skip[nextchr];
+                /* locinput is allowed to go 1 char off the end, but not 2+ */
                 if (locinput > PL_regeol)
                     sayNO;
             }
@@ -6654,7 +6700,8 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case LNBREAK:
         if (utf8_target) {
 	    loceol = PL_regeol;
-	    while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
+	    while (hardcount < max && scan < loceol &&
+                    (c=is_LNBREAK_utf8_safe(scan, loceol))) {
 		scan += c;
 		hardcount++;
 	    }
@@ -6664,7 +6711,7 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	      because we have a null terminated string, but we
 	      have to use hardcount in this situation
 	    */
-	    while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
+	    while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
 		scan+=c;
 		hardcount++;
 	    }
@@ -6673,24 +6720,28 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case HORIZWS:
         if (utf8_target) {
 	    loceol = PL_regeol;
-	    while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
+	    while (hardcount < max && scan < loceol &&
+                    (c=is_HORIZWS_utf8_safe(scan, loceol)))
+            {
 		scan += c;
 		hardcount++;
 	    }
 	} else {
-	    while (scan < loceol && is_HORIZWS_latin1(scan)) 
+	    while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol)) 
 		scan++;		
 	}	
 	break;
     case NHORIZWS:
         if (utf8_target) {
 	    loceol = PL_regeol;
-	    while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
+	    while (hardcount < max && scan < loceol &&
+                        !is_HORIZWS_utf8_safe(scan, loceol))
+            {
 		scan += UTF8SKIP(scan);
 		hardcount++;
 	    }
 	} else {
-	    while (scan < loceol && !is_HORIZWS_latin1(scan))
+	    while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
 		scan++;
 
 	}	
@@ -6698,12 +6749,14 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case VERTWS:
         if (utf8_target) {
 	    loceol = PL_regeol;
-	    while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
+	    while (hardcount < max && scan < loceol &&
+                            (c=is_VERTWS_utf8_safe(scan, loceol)))
+            {
 		scan += c;
 		hardcount++;
 	    }
 	} else {
-	    while (scan < loceol && is_VERTWS_latin1(scan)) 
+	    while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol)) 
 		scan++;
 
 	}	
@@ -6711,12 +6764,14 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case NVERTWS:
         if (utf8_target) {
 	    loceol = PL_regeol;
-	    while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
+	    while (hardcount < max && scan < loceol &&
+                                !is_VERTWS_utf8_safe(scan, loceol))
+            {
 		scan += UTF8SKIP(scan);
 		hardcount++;
 	    }
 	} else {
-	    while (scan < loceol && !is_VERTWS_latin1(scan)) 
+	    while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol)) 
 		scan++;
           
 	}