X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/52bcf2657b8c6ee52d2eeb841fc2607db893f58f..11675f52de33c98d9506667c90b3109e45b149b3:/regexec.c

diff --git a/regexec.c b/regexec.c
index cd0a94f..97ea458 100644
--- a/regexec.c
+++ b/regexec.c
@@ -92,7 +92,7 @@ static const char utf8_locale_required[] =
 #ifdef DEBUGGING
 /* At least one required character in the target string is expressible only in
  * UTF-8. */
-static const char* const non_utf8_target_but_utf8_required
+static const char non_utf8_target_but_utf8_required[]
                 = "Can't match, because target string needs to be in UTF-8\n";
 #endif
 
@@ -1173,8 +1173,8 @@ Perl_re_intuit_start(pTHX_
 
     /* now look for the 'other' substring if defined */
 
-    if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
-                    : prog->substrs->data[other_ix].substr)
+    if (prog->substrs->data[other_ix].utf8_substr
+        || prog->substrs->data[other_ix].substr)
     {
 	/* Take into account the "other" substring. */
         char *last, *last1;
@@ -1184,6 +1184,11 @@ Perl_re_intuit_start(pTHX_
 
       do_other_substr:
         other = &prog->substrs->data[other_ix];
+        if (!utf8_target && !other->substr) {
+            if (!to_byte_substr(prog)) {
+                NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
+            }
+        }
 
         /* if "other" is anchored:
          * we've previously found a floating substr starting at check_at.
@@ -1720,7 +1725,7 @@ STMT_START {
         } else {                                                                    \
             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen,    \
                                                                             flags); \
-            len = UTF8SKIP(uc);                                                     \
+            len = UTF8_SAFE_SKIP(uc, uc_end);                                       \
             skiplen = UVCHR_SKIP( uvc );                                            \
             foldlen -= skiplen;                                                     \
             uscan = foldbuf + skiplen;                                              \
@@ -1782,7 +1787,9 @@ STMT_START {
     STMT_START {                                            \
         while (s < strend) {                                \
             CODE                                            \
-            s += ((UTF8) ? UTF8SKIP(s) : 1);                \
+            s += ((UTF8)                                    \
+                  ? UTF8_SAFE_SKIP(s, reginfo->strend)      \
+                  : 1);                                     \
         }                                                   \
     } STMT_END
 
@@ -1796,7 +1803,7 @@ STMT_START {
 #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)                  \
     if (COND) {                                                \
         FBC_CHECK_AND_TRY                                      \
-        s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
+        s += ((UTF8) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1);\
         previous_occurrence_end = s;                           \
     }                                                          \
     else {                                                     \
@@ -1815,12 +1822,13 @@ STMT_START {
  * of the one we're looking for.  Knowing that, we can see right away if the
  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
  * don't accept the 2nd and succeeding adjacent occurrences */
-#define FBC_CHECK_AND_TRY                                      \
-        if (   (   doevery                                     \
-                || s != previous_occurrence_end)               \
-            && (reginfo->intuit || regtry(reginfo, &s)))       \
-        {                                                      \
-            goto got_it;                                       \
+#define FBC_CHECK_AND_TRY                                           \
+        if (   (   doevery                                          \
+                || s != previous_occurrence_end)                    \
+            && (   reginfo->intuit                                  \
+                || (s <= reginfo->strend && regtry(reginfo, &s))))  \
+        {                                                           \
+            goto got_it;                                            \
         }
 
 
@@ -1839,6 +1847,28 @@ STMT_START {
         previous_occurrence_end = s;                        \
     }
 
+/* This differs from the above macros in that it is passed a single byte that
+ * is known to begin the next occurrence of the thing being looked for in 's'.
+ * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
+ * at that position. */
+#define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND)      \
+    while (s < strend) {                                    \
+        s = (char *) memchr(s, byte, strend -s);            \
+        if (s == NULL) {                                    \
+            s = (char *) strend;                            \
+            break;                                          \
+        }                                                   \
+                                                            \
+        if (COND) {                                         \
+            FBC_CHECK_AND_TRY                               \
+            s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
+            previous_occurrence_end = s;                    \
+        }                                                   \
+        else {                                              \
+            s += UTF8SKIP(s);                               \
+        }                                                   \
+    }
+
 /* The three macros below are slightly different versions of the same logic.
  *
  * The first is for /a and /aa when the target string is UTF-8.  This can only
@@ -1945,9 +1975,12 @@ STMT_START {
     }
 
 /* This is the macro to use when we want to see if something that looks like it
- * could match, actually does, and if so exits the loop */
-#define REXEC_FBC_TRYIT                            \
-    if ((reginfo->intuit || regtry(reginfo, &s)))  \
+ * could match, actually does, and if so exits the loop.  It needs to be used
+ * only for bounds checking macros, as it allows for matching beyond the end of
+ * string (which should be zero length without having to look at the string
+ * contents) */
+#define REXEC_FBC_TRYIT                                                     \
+    if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s)))   \
         goto got_it
 
 /* The only difference between the BOUND and NBOUND cases is that
@@ -2129,21 +2162,47 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         break;
 
     case ANYOFM:    /* ARG() is the base byte; FLAGS() the mask byte */
-        /* UTF-8ness doesn't matter, so use 0 */
+        /* UTF-8ness doesn't matter because only matches UTF-8 invariants, so
+         * use 0 */
         REXEC_FBC_FIND_NEXT_SCAN(0,
          (char *) find_next_masked((U8 *) s, (U8 *) strend,
                                    (U8) ARG(c), FLAGS(c)));
         break;
 
-    case NANYOFM:
-        REXEC_FBC_FIND_NEXT_SCAN(0,
+    case NANYOFM:   /* UTF-8ness does matter because can match UTF-8 variants.
+                     */
+        REXEC_FBC_FIND_NEXT_SCAN(utf8_target,
          (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
                                    (U8) ARG(c), FLAGS(c)));
         break;
 
     case ANYOFH:
-        if (utf8_target) REXEC_FBC_CLASS_SCAN(TRUE,
+        if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
+            REXEC_FBC_CLASS_SCAN(TRUE,
+                  (   (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
+                   && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
+        }
+        break;
+
+    case ANYOFHb:
+        if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
+
+            /* We know what the first byte of any matched string should be */
+            U8 first_byte = FLAGS(c);
+
+            REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
+        }
+        break;
+
+    case ANYOFHr:
+        if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
+            REXEC_FBC_CLASS_SCAN(TRUE,
+                  (   inRANGE((U8) NATIVE_UTF8_TO_I8(*s),
+                              LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
+                              HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
+                   && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
+        }
         break;
 
     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
@@ -2355,7 +2414,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
             {
                 goto got_it;
             }
-            s += (utf8_target) ? UTF8SKIP(s) : 1;
+            s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
         }
         break;
     }
@@ -2439,7 +2498,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     }
 
                     /* Didn't match.  Try at the next position (if there is one) */
-                    s += (utf8_target) ? UTF8SKIP(s) : 1;
+                    s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
                     if (UNLIKELY(s >= reginfo->strend)) {
                         break;
                     }
@@ -2463,7 +2522,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                             goto got_it;
                         }
                         before = after;
-                        s += UTF8SKIP(s);
+                        s += UTF8_SAFE_SKIP(s, reginfo->strend);
                     }
                 }
                 else {  /* Not utf8.  Everything is a GCB except between CR and
@@ -2481,7 +2540,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 
                 /* And, since this is a bound, it can match after the final
                  * character in the string */
-                if ((reginfo->intuit || regtry(reginfo, &s))) {
+                if (   reginfo->intuit
+                    || (s <= reginfo->strend && regtry(reginfo, &s)))
+                {
                     goto got_it;
                 }
                 break;
@@ -2491,7 +2552,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     if (reginfo->intuit || regtry(reginfo, &s)) {
                         goto got_it;
                     }
-                    s += (utf8_target) ? UTF8SKIP(s) : 1;
+                    s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
                     if (UNLIKELY(s >= reginfo->strend)) {
                         break;
                     }
@@ -2515,7 +2576,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                             goto got_it;
                         }
                         before = after;
-                        s += UTF8SKIP(s);
+                        s += UTF8_SAFE_SKIP(s, reginfo->strend);
                     }
                 }
                 else {  /* Not utf8. */
@@ -2537,7 +2598,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     }
                 }
 
-                if (reginfo->intuit || regtry(reginfo, &s)) {
+                if (   reginfo->intuit
+                    || (s <= reginfo->strend && regtry(reginfo, &s)))
+                {
                     goto got_it;
                 }
 
@@ -2548,7 +2611,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     if (reginfo->intuit || regtry(reginfo, &s)) {
                         goto got_it;
                     }
-                    s += (utf8_target) ? UTF8SKIP(s) : 1;
+                    s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
                     if (UNLIKELY(s >= reginfo->strend)) {
                         break;
                     }
@@ -2573,7 +2636,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                             goto got_it;
                         }
                         before = after;
-                        s += UTF8SKIP(s);
+                        s += UTF8_SAFE_SKIP(s, reginfo->strend);
                     }
                 }
                 else {  /* Not utf8. */
@@ -2598,7 +2661,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                 /* Here are at the final position in the target string.  The SB
                  * value is always true here, so matches, depending on other
                  * constraints */
-                if (reginfo->intuit || regtry(reginfo, &s)) {
+                if (   reginfo->intuit
+                    || (s <= reginfo->strend && regtry(reginfo, &s)))
+                {
                     goto got_it;
                 }
 
@@ -2609,7 +2674,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     if (reginfo->intuit || regtry(reginfo, &s)) {
                         goto got_it;
                     }
-                    s += (utf8_target) ? UTF8SKIP(s) : 1;
+                    s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
                     if (UNLIKELY(s >= reginfo->strend)) {
                         break;
                     }
@@ -2643,7 +2708,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                         }
                         previous = before;
                         before = after;
-                        s += UTF8SKIP(s);
+                        s += UTF8_SAFE_SKIP(s, reginfo->strend);
                     }
                 }
                 else {  /* Not utf8. */
@@ -2668,7 +2733,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     }
                 }
 
-                if (reginfo->intuit || regtry(reginfo, &s)) {
+                if (   reginfo->intuit
+                    || (s <= reginfo->strend && regtry(reginfo, &s)))
+                {
                     goto got_it;
                 }
         }
@@ -2985,7 +3052,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                         LEAVE;
                         goto got_it;
                     }
-                    s = HOPc(s,1);
+                    if (s < reginfo->strend) {
+                        s = HOPc(s,1);
+                    }
                     DEBUG_TRIE_EXECUTE_r({
                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
                     });
@@ -3305,7 +3374,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
             RXp_MATCH_UTF8_set(prog, utf8_target);
             prog->offs[0].start = s - strbeg;
             prog->offs[0].end = utf8_target
-                ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
+                ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
                 : s - strbeg + prog->minlenret;
             if ( !(flags & REXEC_NOT_FIRST) )
                 S_reg_set_capture_string(aTHX_ rx,
@@ -3504,7 +3573,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
 		if (*s == ch) {
 		    DEBUG_EXECUTE_r( did_match = 1 );
 		    if (regtry(reginfo, &s)) goto got_it;
-		    s += UTF8SKIP(s);
+		    s += UTF8_SAFE_SKIP(s, strend);
 		    while (s < strend && *s == ch)
 			s += UTF8SKIP(s);
 		}
@@ -5465,18 +5534,20 @@ S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos,
 
 /* push a new state then goto it */
 
-#define PUSH_STATE_GOTO(state, node, input, eol) \
+#define PUSH_STATE_GOTO(state, node, input, eol, sr0)       \
     pushinput = input; \
     pusheol = eol; \
+    pushsr0 = sr0; \
     scan = node; \
     st->resume_state = state; \
     goto push_state;
 
 /* push a new state with success backtracking, then goto it */
 
-#define PUSH_YES_STATE_GOTO(state, node, input, eol) \
+#define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0)   \
     pushinput = input; \
     pusheol = eol;     \
+    pushsr0 = sr0; \
     scan = node; \
     st->resume_state = state; \
     goto push_yes_state;
@@ -5660,6 +5731,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
     char *loceol = reginfo->strend;
     char *pushinput; /* where to continue after a PUSH */
     char *pusheol;   /* where to stop matching (loceol) after a PUSH */
+    U8   *pushsr0;   /* save starting pos of script run */
     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
 
     bool result = 0;	    /* return value of S_regmatch */
@@ -5796,7 +5868,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	    /* update the startpoint */
 	    st->u.keeper.val = rex->offs[0].start;
 	    rex->offs[0].start = locinput - reginfo->strbeg;
-	    PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol);
+	    PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
+                            script_run_begin);
 	    NOT_REACHED; /* NOTREACHED */
 
 	case KEEPS_next_fail:
@@ -6172,7 +6245,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	    });
 
 	    if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
-		PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol);
+		PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
+                                script_run_begin);
 		NOT_REACHED; /* NOTREACHED */
 	    }
 	    /* only one choice left - just continue */
@@ -6742,6 +6816,33 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
         case ANYOFH:
             if (   ! utf8_target
                 ||   NEXTCHR_IS_EOS
+                ||   ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8((U8) *locinput)
+	        || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
+                                                                   utf8_target))
+            {
+                sayNO;
+            }
+            goto increment_locinput;
+            break;
+
+        case ANYOFHb:
+            if (   ! utf8_target
+                ||   NEXTCHR_IS_EOS
+                ||   ANYOF_FLAGS(scan) != (U8) *locinput
+	        || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
+                                                                  utf8_target))
+            {
+                sayNO;
+            }
+            goto increment_locinput;
+            break;
+
+        case ANYOFHr:
+            if (   ! utf8_target
+                ||   NEXTCHR_IS_EOS
+                || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
+                             LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
+                             HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
 	        || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
                                                                    utf8_target))
             {
@@ -6921,7 +7022,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                         }
                         break;
                 }
-                locinput += UTF8SKIP(locinput);
+                locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
             }
             break;
 
@@ -6971,7 +7072,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	    }
 	    break;
             
-	case NREFFL:  /*  /\g{name}/il  */
+	case REFFLN:  /*  /\g{name}/il  */
 	{   /* The capture buffer cases.  The ones beginning with N for the
 	       named buffers just convert to the equivalent numbered and
 	       pretend they were called as the corresponding numbered buffer
@@ -6991,28 +7092,28 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	    utf8_fold_flags = FOLDEQ_LOCALE;
 	    goto do_nref;
 
-	case NREFFA:  /*  /\g{name}/iaa  */
+	case REFFAN:  /*  /\g{name}/iaa  */
 	    folder = foldEQ_latin1;
 	    fold_array = PL_fold_latin1;
 	    type = REFFA;
 	    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
 	    goto do_nref;
 
-	case NREFFU:  /*  /\g{name}/iu  */
+	case REFFUN:  /*  /\g{name}/iu  */
 	    folder = foldEQ_latin1;
 	    fold_array = PL_fold_latin1;
 	    type = REFFU;
 	    utf8_fold_flags = 0;
 	    goto do_nref;
 
-	case NREFF:  /*  /\g{name}/i  */
+	case REFFN:  /*  /\g{name}/i  */
 	    folder = foldEQ;
 	    fold_array = PL_fold;
 	    type = REFF;
 	    utf8_fold_flags = 0;
 	    goto do_nref;
 
-	case NREF:  /*  /\g{name}/   */
+	case REFN:  /*  /\g{name}/   */
 	    type = REF;
 	    folder = NULL;
 	    fold_array = NULL;
@@ -7408,7 +7509,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 PL_curpm = PL_reg_curpm;
 
 		if (logical != 2) {
-                    PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol);
+                    PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
+                                    script_run_begin);
 		    /* NOTREACHED */
                 }
 	    }
@@ -7508,7 +7610,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		ST.prev_eval = cur_eval;
 		cur_eval = st;
 		/* now continue from first node in postoned RE */
-		PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput, loceol);
+		PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
+                                    loceol, script_run_begin);
 		NOT_REACHED; /* NOTREACHED */
 	}
 
@@ -7664,7 +7767,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	    sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
 	    break;
 
-	case NGROUPP:  /*  (?(<name>))  */
+	case GROUPPN:  /*  (?(<name>))  */
 	    /* reg_check_named_buff_matched returns 0 for no match */
 	    sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
 	    break;
@@ -7808,7 +7911,8 @@ NULL
 	    ST.count = -1;	/* this will be updated by WHILEM */
 	    ST.lastloc = NULL;  /* this will be updated by WHILEM */
 
-	    PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol);
+	    PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol,
+                                script_run_begin);
 	    NOT_REACHED; /* NOTREACHED */
 	}
 
@@ -7856,7 +7960,8 @@ NULL
 		cur_curlyx->u.curlyx.lastloc = locinput;
 		REGCP_SET(ST.lastcp);
 
-		PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol);
+		PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
+                                script_run_begin);
 		NOT_REACHED; /* NOTREACHED */
 	    }
 
@@ -7964,7 +8069,7 @@ NULL
 		ST.save_curlyx = cur_curlyx;
 		cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
 		PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
-                                    locinput, loceol);
+                                    locinput, loceol, script_run_begin);
 		NOT_REACHED; /* NOTREACHED */
 	    }
 
@@ -7975,7 +8080,8 @@ NULL
                             maxopenparen);
 		cur_curlyx->u.curlyx.lastloc = locinput;
 		REGCP_SET(ST.lastcp);
-		PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol);
+		PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
+                                script_run_begin);
 		NOT_REACHED; /* NOTREACHED */
 	    }
 	    goto do_whilem_B_max;
@@ -8027,7 +8133,7 @@ NULL
 	    ST.save_curlyx = cur_curlyx;
 	    cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
 	    PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
-                                locinput, loceol);
+                                locinput, loceol, script_run_begin);
 	    NOT_REACHED; /* NOTREACHED */
 
 	case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
@@ -8058,7 +8164,7 @@ NULL
 	    REGCP_SET(ST.lastcp);
 	    PUSH_STATE_GOTO(WHILEM_A_min,
 		/*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
-                locinput, loceol);
+                locinput, loceol, script_run_begin);
 	    NOT_REACHED; /* NOTREACHED */
 
 #undef  ST
@@ -8080,9 +8186,11 @@ NULL
 
 	    /* Now go into the branch */
 	    if (has_cutgroup) {
-	        PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol);
+	        PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
+                                    script_run_begin);
 	    } else {
-	        PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol);
+	        PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
+                                script_run_begin);
 	    }
 	    NOT_REACHED; /* NOTREACHED */
 
@@ -8090,7 +8198,8 @@ NULL
             sv_yes_mark = st->u.mark.mark_name = scan->flags
                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
                 : NULL;
-            PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol);
+            PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
+                            script_run_begin);
             NOT_REACHED; /* NOTREACHED */
 
         case CUTGROUP_next_fail:
@@ -8167,7 +8276,8 @@ NULL
 		goto curlym_do_B;
 
 	  curlym_do_A: /* execute the A in /A{m,n}B/  */
-	    PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol); /* match A */
+	    PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
+                                script_run_begin);
 	    NOT_REACHED; /* NOTREACHED */
 
 	case CURLYM_A: /* we've just matched an A */
@@ -8242,8 +8352,10 @@ NULL
                             * having to worry about one being shorter than the
                             * other, since the first byte of each gives the
                             * length of the character) */
-                    if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
-                        && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
+                    if (   memNE(locinput, ST.c1_utf8, UTF8_SAFE_SKIP(locinput,
+                                                              reginfo->strend))
+                        && memNE(locinput, ST.c2_utf8, UTF8_SAFE_SKIP(locinput,
+                                                             reginfo->strend)))
                     {
                         /* simulate B failing */
                         DEBUG_OPTIMISE_r(
@@ -8289,7 +8401,8 @@ NULL
 	        }
 	    }
 	    
-	    PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol); /* match B */
+	    PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol,   /* match B */
+                            script_run_begin);
 	    NOT_REACHED; /* NOTREACHED */
 
 	case CURLYM_B_fail: /* just failed to match a B */
@@ -8505,20 +8618,26 @@ NULL
 		    n = (ST.oldloc == locinput) ? 0 : 1;
 		    if (ST.c1 == ST.c2) {
 			/* set n to utf8_distance(oldloc, locinput) */
-			while (locinput <= ST.maxpos
-                              && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
+			while (    locinput <= ST.maxpos
+                               &&  locinput < loceol
+                               &&  memNE(locinput, ST.c1_utf8,
+                                    UTF8_SAFE_SKIP(locinput, reginfo->strend)))
                         {
-			    locinput += UTF8SKIP(locinput);
+			    locinput += UTF8_SAFE_SKIP(locinput,
+                                                       reginfo->strend);
 			    n++;
 			}
 		    }
 		    else {
 			/* set n to utf8_distance(oldloc, locinput) */
-			while (locinput <= ST.maxpos
-                              && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
-                              && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
+			while (   locinput <= ST.maxpos
+                               && locinput < loceol
+                               && memNE(locinput, ST.c1_utf8,
+                                     UTF8_SAFE_SKIP(locinput, reginfo->strend))
+                               && memNE(locinput, ST.c2_utf8,
+                                    UTF8_SAFE_SKIP(locinput, reginfo->strend)))
                         {
-			    locinput += UTF8SKIP(locinput);
+			    locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
 			    n++;
 			}
 		    }
@@ -8583,7 +8702,8 @@ NULL
 
           curly_try_B_min:
             CURLY_SETPAREN(ST.paren, ST.count);
-            PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol);
+            PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
+                            script_run_begin);
 	    NOT_REACHED; /* NOTREACHED */
 
 
@@ -8596,21 +8716,22 @@ NULL
                 if (ST.c1 != CHRTEST_VOID && could_match) {
                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
                     {
-                        could_match = memEQ(locinput,
-                                            ST.c1_utf8,
-                                            UTF8SKIP(locinput))
-                                    || memEQ(locinput,
-                                             ST.c2_utf8,
-                                             UTF8SKIP(locinput));
+                        could_match =  memEQ(locinput, ST.c1_utf8,
+                                             UTF8_SAFE_SKIP(locinput,
+                                                            reginfo->strend))
+                                    || memEQ(locinput, ST.c2_utf8,
+                                             UTF8_SAFE_SKIP(locinput,
+                                                            reginfo->strend));
                     }
                     else {
-                        could_match = UCHARAT(locinput) == ST.c1
-                                      || UCHARAT(locinput) == ST.c2;
+                        could_match =   UCHARAT(locinput) == ST.c1
+                                     || UCHARAT(locinput) == ST.c2;
                     }
                 }
                 if (ST.c1 == CHRTEST_VOID || could_match) {
 		    CURLY_SETPAREN(ST.paren, ST.count);
-		    PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol);
+		    PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
+                                    script_run_begin);
 		    NOT_REACHED; /* NOTREACHED */
 		}
 	    }
@@ -8665,8 +8786,9 @@ NULL
 
                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
 
-                PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B,
-                                    locinput, loceol); /* match B */
+                PUSH_YES_STATE_GOTO(EVAL_postponed_AB,          /* match B */
+                                    st->u.eval.prev_eval->u.eval.B,
+                                    locinput, loceol, script_run_begin);
 	    }
 
 	    if (locinput < reginfo->till) {
@@ -8717,8 +8839,8 @@ NULL
                 PERL_UINT_FAST8_T back_count = scan->flags;
 		char * s;
 
-                /* Lookbehind ends here */
-		ST.end = locinput;
+                /* Lookbehind can look beyond the current position */
+		ST.end = loceol;
 
                 /* ... and starts at the first place in the input that is in
                  * the range of the possible start positions */
@@ -8752,7 +8874,8 @@ NULL
 	    logical = 0; /* XXX: reset state of logical once it has been saved into ST */
 	    
 	    /* execute body of (?...A) */
-	    PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start, ST.end);
+	    PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start,
+                                ST.end, script_run_begin);
 	    NOT_REACHED; /* NOTREACHED */
 
         {
@@ -8786,6 +8909,7 @@ NULL
                 /* restore old position except for (?>...) */
 		locinput = st->locinput;
                 loceol = st->loceol;
+                script_run_begin = st->sr0;
 	    }
 	    scan = ST.me + ARG(ST.me);
 	    if (scan == ST.me)
@@ -8809,7 +8933,8 @@ NULL
 	case PRUNE:   /*  (*PRUNE)   */
             if (scan->flags)
 	        sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
-	    PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol);
+	    PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
+                            script_run_begin);
 	    NOT_REACHED; /* NOTREACHED */
 
 	case COMMIT_next_fail:
@@ -8839,7 +8964,8 @@ NULL
                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
             mark_state = st;
             ST.mark_loc = locinput;
-            PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol);
+            PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
+                                script_run_begin);
             NOT_REACHED; /* NOTREACHED */
 
         case MARKPOINT_next:
@@ -8872,7 +8998,8 @@ NULL
                 /* (*SKIP) : if we fail we cut here*/
                 ST.mark_name = NULL;
                 ST.mark_loc = locinput;
-                PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol);
+                PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
+                                script_run_begin);
             } else {
                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, 
                    otherwise do nothing.  Meaning we need to scan 
@@ -8885,7 +9012,8 @@ NULL
                                 find ) ) 
                     {
                         ST.mark_name = find;
-                        PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol);
+                        PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
+                                         script_run_begin);
                     }
                     cur = cur->u.mark.prev_mark;
                 }
@@ -8982,6 +9110,7 @@ NULL
 	    depth++;
 	    st->locinput = locinput;
 	    st->loceol = loceol;
+            st->sr0 = script_run_begin;
 	    newst = st+1; 
 	    if (newst >  SLAB_LAST(PL_regmatch_slab))
 		newst = S_push_slab(aTHX);
@@ -8989,6 +9118,7 @@ NULL
 
 	    locinput = pushinput;
             loceol = pusheol;
+            script_run_begin = pushsr0;
 	    st = newst;
 	    continue;
             /* NOTREACHED */
@@ -9044,6 +9174,7 @@ NULL
         if (no_final) {
             locinput= st->locinput;
             loceol= st->loceol;
+            script_run_begin = st->sr0;
         }
 	state_num = st->resume_state + no_final;
 	goto reenter_switch;
@@ -9095,6 +9226,7 @@ NULL
 	PL_regmatch_state = st;
 	locinput= st->locinput;
 	loceol= st->loceol;
+        script_run_begin = st->sr0;
 
 	DEBUG_STATE_pp("pop");
 	depth--;
@@ -9377,19 +9509,22 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
                 if (c1 == c2) {
                     while (scan < this_eol
                            && hardcount < max
-                           && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
+                           && memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
+                                                                  loceol)))
                     {
-                        scan += UTF8SKIP(scan);
+                        scan += UTF8SKIP(c1_utf8);
                         hardcount++;
                     }
                 }
                 else {
                     while (scan < this_eol
                            && hardcount < max
-                           && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
-                               || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
+                           && (   memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
+                                                                     loceol))
+                               || memEQ(scan, c2_utf8, UTF8_SAFE_SKIP(scan,
+                                                                     loceol))))
                     {
-                        scan += UTF8SKIP(scan);
+                        scan += UTF8_SAFE_SKIP(scan, loceol);
                         hardcount++;
                     }
                 }
@@ -9480,13 +9615,47 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
         break;
 
     case ANYOFH:
-        if (utf8_target) while (   hardcount < max
-                                && scan < this_eol
-                                && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
-                                                                  TRUE))
-        {
-            scan += UTF8SKIP(scan);
-            hardcount++;
+        if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
+            while (  hardcount < max
+                   && scan < this_eol
+                   && NATIVE_UTF8_TO_I8((U8) *scan) >= ANYOF_FLAGS(p)
+                   && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
+            {
+                scan += UTF8SKIP(scan);
+                hardcount++;
+            }
+        }
+        break;
+
+    case ANYOFHb:
+        if (utf8_target) {  /* ANYOFHb only can match UTF-8 targets */
+
+            /* we know the first byte must be the FLAGS field */
+            while (   hardcount < max
+                   && scan < this_eol
+                   && (U8) *scan == ANYOF_FLAGS(p)
+                   && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
+                                                              TRUE))
+            {
+                scan += UTF8SKIP(scan);
+                hardcount++;
+            }
+        }
+        break;
+
+    case ANYOFHr:
+        if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
+            while (  hardcount < max
+                   && scan < this_eol
+                   && inRANGE((U8) NATIVE_UTF8_TO_I8(*scan),
+                              LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
+                              HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
+                   && NATIVE_UTF8_TO_I8((U8) *scan) >= ANYOF_FLAGS(p)
+                   && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
+            {
+                scan += UTF8SKIP(scan);
+                hardcount++;
+            }
         }
         break;
 
@@ -9733,7 +9902,9 @@ STATIC bool
 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
 {
     dVAR;
-    const char flags = ANYOF_FLAGS(n);
+    const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHr))
+                        ? 0
+                        : ANYOF_FLAGS(n);
     bool match = FALSE;
     UV c = *p;
 
@@ -9760,7 +9931,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
     }
 
     /* If this character is potentially in the bitmap, check it */
-    if (c < NUM_ANYOF_CODE_POINTS && OP(n) != ANYOFH) {
+    if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
 	if (ANYOF_BITMAP_TEST(n, c))
 	    match = TRUE;
 	else if ((flags
@@ -10062,6 +10233,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
 
     eval_state->rex = rex;
+    eval_state->sv  = reginfo->sv;
 
     if (reginfo->sv) {
         /* Make $_ available to executed code. */
@@ -10069,6 +10241,8 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
             SAVE_DEFSV;
             DEFSV_set(reginfo->sv);
         }
+        /* will be dec'd by S_cleanup_regmatch_info_aux */
+        SvREFCNT_inc_NN(reginfo->sv);
 
         if (!(mg = mg_find_mglob(reginfo->sv))) {
             /* prepare for quick setting of pos */
@@ -10160,6 +10334,7 @@ S_cleanup_regmatch_info_aux(pTHX_ void *arg)
         }
 
         PL_curpm = eval_state->curpm;
+        SvREFCNT_dec(eval_state->sv);
     }
 
     PL_regmatch_state = aux->old_regmatch_state;
@@ -10230,6 +10405,7 @@ S_to_byte_substr(pTHX_ regexp *prog)
 	    && !prog->substrs->data[i].substr) {
 	    SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
 	    if (! sv_utf8_downgrade(sv, TRUE)) {
+                SvREFCNT_dec_NN(sv);
                 return FALSE;
             }
             if (SvVALID(prog->substrs->data[i].utf8_substr)) {