This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Remove deprecated flag from sv_nosharing.
[perl5.git] / regexec.c
index bca2c4c..97ea458 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -92,7 +92,7 @@ static const char utf8_locale_required[] =
 #ifdef DEBUGGING
 /* At least one required character in the target string is expressible only in
  * UTF-8. */
-static const char* const non_utf8_target_but_utf8_required
+static const char non_utf8_target_but_utf8_required[]
                 = "Can't match, because target string needs to be in UTF-8\n";
 #endif
 
@@ -1173,8 +1173,8 @@ Perl_re_intuit_start(pTHX_
 
     /* now look for the 'other' substring if defined */
 
-    if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
-                    : prog->substrs->data[other_ix].substr)
+    if (prog->substrs->data[other_ix].utf8_substr
+        || prog->substrs->data[other_ix].substr)
     {
        /* Take into account the "other" substring. */
         char *last, *last1;
@@ -1184,6 +1184,11 @@ Perl_re_intuit_start(pTHX_
 
       do_other_substr:
         other = &prog->substrs->data[other_ix];
+        if (!utf8_target && !other->substr) {
+            if (!to_byte_substr(prog)) {
+                NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
+            }
+        }
 
         /* if "other" is anchored:
          * we've previously found a floating substr starting at check_at.
@@ -1720,7 +1725,7 @@ STMT_START {
         } else {                                                                    \
             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen,    \
                                                                             flags); \
-            len = UTF8SKIP(uc);                                                     \
+            len = UTF8_SAFE_SKIP(uc, uc_end);                                       \
             skiplen = UVCHR_SKIP( uvc );                                            \
             foldlen -= skiplen;                                                     \
             uscan = foldbuf + skiplen;                                              \
@@ -1782,7 +1787,9 @@ STMT_START {
     STMT_START {                                            \
         while (s < strend) {                                \
             CODE                                            \
-            s += ((UTF8) ? UTF8SKIP(s) : 1);                \
+            s += ((UTF8)                                    \
+                  ? UTF8_SAFE_SKIP(s, reginfo->strend)      \
+                  : 1);                                     \
         }                                                   \
     } STMT_END
 
@@ -1796,7 +1803,7 @@ STMT_START {
 #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)                  \
     if (COND) {                                                \
         FBC_CHECK_AND_TRY                                      \
-        s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
+        s += ((UTF8) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1);\
         previous_occurrence_end = s;                           \
     }                                                          \
     else {                                                     \
@@ -1815,12 +1822,13 @@ STMT_START {
  * of the one we're looking for.  Knowing that, we can see right away if the
  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
  * don't accept the 2nd and succeeding adjacent occurrences */
-#define FBC_CHECK_AND_TRY                                      \
-        if (   (   doevery                                     \
-                || s != previous_occurrence_end)               \
-            && (reginfo->intuit || regtry(reginfo, &s)))       \
-        {                                                      \
-            goto got_it;                                       \
+#define FBC_CHECK_AND_TRY                                           \
+        if (   (   doevery                                          \
+                || s != previous_occurrence_end)                    \
+            && (   reginfo->intuit                                  \
+                || (s <= reginfo->strend && regtry(reginfo, &s))))  \
+        {                                                           \
+            goto got_it;                                            \
         }
 
 
@@ -1839,6 +1847,28 @@ STMT_START {
         previous_occurrence_end = s;                        \
     }
 
+/* This differs from the above macros in that it is passed a single byte that
+ * is known to begin the next occurrence of the thing being looked for in 's'.
+ * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
+ * at that position.  Every advance of 's' is clamped to reginfo->strend. */
+#define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND)      \
+    while (s < strend) {                                    \
+        s = (char *) memchr(s, byte, strend -s);            \
+        if (s == NULL) {                                    \
+            s = (char *) strend;                            \
+            break;                                          \
+        }                                                   \
+                                                            \
+        if (COND) {                                         \
+            FBC_CHECK_AND_TRY                               \
+            s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
+            previous_occurrence_end = s;                    \
+        }                                                   \
+        else {                                              \
+            s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
+        }                                                   \
+    }
+
 /* The three macros below are slightly different versions of the same logic.
  *
  * The first is for /a and /aa when the target string is UTF-8.  This can only
@@ -1945,9 +1975,12 @@ STMT_START {
     }
 
 /* This is the macro to use when we want to see if something that looks like it
- * could match, actually does, and if so exits the loop */
-#define REXEC_FBC_TRYIT                            \
-    if ((reginfo->intuit || regtry(reginfo, &s)))  \
+ * could match, actually does, and if so exits the loop.  It needs to be used
+ * only for bounds checking macros, as it allows for matching beyond the end of
+ * string (which should be zero length without having to look at the string
+ * contents) */
+#define REXEC_FBC_TRYIT                                                     \
+    if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s)))   \
         goto got_it
 
 /* The only difference between the BOUND and NBOUND cases is that
@@ -2129,21 +2162,47 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         break;
 
     case ANYOFM:    /* ARG() is the base byte; FLAGS() the mask byte */
-        /* UTF-8ness doesn't matter, so use 0 */
+        /* UTF-8ness doesn't matter because this node only matches UTF-8
+         * invariants, so use 0 */
         REXEC_FBC_FIND_NEXT_SCAN(0,
          (char *) find_next_masked((U8 *) s, (U8 *) strend,
                                    (U8) ARG(c), FLAGS(c)));
         break;
 
-    case NANYOFM:
-        REXEC_FBC_FIND_NEXT_SCAN(0,
+    case NANYOFM:   /* UTF-8ness does matter because can match UTF-8 variants.
+                     */
+        REXEC_FBC_FIND_NEXT_SCAN(utf8_target,
          (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
                                    (U8) ARG(c), FLAGS(c)));
         break;
 
     case ANYOFH:
-        if (utf8_target) REXEC_FBC_CLASS_SCAN(TRUE,
+        if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
+            REXEC_FBC_CLASS_SCAN(TRUE,
+                  (   (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
+                   && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
+        }
+        break;
+
+    case ANYOFHb:
+        if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
+
+            /* We know what the first byte of any matched string should be */
+            U8 first_byte = FLAGS(c);
+
+            REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
+        }
+        break;
+
+    case ANYOFHr:
+        if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
+            REXEC_FBC_CLASS_SCAN(TRUE,
+                  (   inRANGE((U8) NATIVE_UTF8_TO_I8(*s),
+                              LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
+                              HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
+                   && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
+        }
         break;
 
     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
@@ -2355,7 +2414,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
             {
                 goto got_it;
             }
-            s += (utf8_target) ? UTF8SKIP(s) : 1;
+            s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
         }
         break;
     }
@@ -2439,7 +2498,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     }
 
                     /* Didn't match.  Try at the next position (if there is one) */
-                    s += (utf8_target) ? UTF8SKIP(s) : 1;
+                    s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
                     if (UNLIKELY(s >= reginfo->strend)) {
                         break;
                     }
@@ -2463,7 +2522,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                             goto got_it;
                         }
                         before = after;
-                        s += UTF8SKIP(s);
+                        s += UTF8_SAFE_SKIP(s, reginfo->strend);
                     }
                 }
                 else {  /* Not utf8.  Everything is a GCB except between CR and
@@ -2481,7 +2540,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 
                 /* And, since this is a bound, it can match after the final
                  * character in the string */
-                if ((reginfo->intuit || regtry(reginfo, &s))) {
+                if (   reginfo->intuit
+                    || (s <= reginfo->strend && regtry(reginfo, &s)))
+                {
                     goto got_it;
                 }
                 break;
@@ -2491,7 +2552,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     if (reginfo->intuit || regtry(reginfo, &s)) {
                         goto got_it;
                     }
-                    s += (utf8_target) ? UTF8SKIP(s) : 1;
+                    s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
                     if (UNLIKELY(s >= reginfo->strend)) {
                         break;
                     }
@@ -2515,7 +2576,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                             goto got_it;
                         }
                         before = after;
-                        s += UTF8SKIP(s);
+                        s += UTF8_SAFE_SKIP(s, reginfo->strend);
                     }
                 }
                 else {  /* Not utf8. */
@@ -2537,7 +2598,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     }
                 }
 
-                if (reginfo->intuit || regtry(reginfo, &s)) {
+                if (   reginfo->intuit
+                    || (s <= reginfo->strend && regtry(reginfo, &s)))
+                {
                     goto got_it;
                 }
 
@@ -2548,7 +2611,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     if (reginfo->intuit || regtry(reginfo, &s)) {
                         goto got_it;
                     }
-                    s += (utf8_target) ? UTF8SKIP(s) : 1;
+                    s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
                     if (UNLIKELY(s >= reginfo->strend)) {
                         break;
                     }
@@ -2573,7 +2636,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                             goto got_it;
                         }
                         before = after;
-                        s += UTF8SKIP(s);
+                        s += UTF8_SAFE_SKIP(s, reginfo->strend);
                     }
                 }
                 else {  /* Not utf8. */
@@ -2598,7 +2661,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                 /* Here are at the final position in the target string.  The SB
                  * value is always true here, so matches, depending on other
                  * constraints */
-                if (reginfo->intuit || regtry(reginfo, &s)) {
+                if (   reginfo->intuit
+                    || (s <= reginfo->strend && regtry(reginfo, &s)))
+                {
                     goto got_it;
                 }
 
@@ -2609,7 +2674,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     if (reginfo->intuit || regtry(reginfo, &s)) {
                         goto got_it;
                     }
-                    s += (utf8_target) ? UTF8SKIP(s) : 1;
+                    s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
                     if (UNLIKELY(s >= reginfo->strend)) {
                         break;
                     }
@@ -2643,7 +2708,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                         }
                         previous = before;
                         before = after;
-                        s += UTF8SKIP(s);
+                        s += UTF8_SAFE_SKIP(s, reginfo->strend);
                     }
                 }
                 else {  /* Not utf8. */
@@ -2668,7 +2733,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     }
                 }
 
-                if (reginfo->intuit || regtry(reginfo, &s)) {
+                if (   reginfo->intuit
+                    || (s <= reginfo->strend && regtry(reginfo, &s)))
+                {
                     goto got_it;
                 }
         }
@@ -2985,7 +3052,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                         LEAVE;
                         goto got_it;
                     }
-                    s = HOPc(s,1);
+                    if (s < reginfo->strend) {
+                        s = HOPc(s,1);
+                    }
                     DEBUG_TRIE_EXECUTE_r({
                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
                     });
@@ -3305,7 +3374,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
             RXp_MATCH_UTF8_set(prog, utf8_target);
             prog->offs[0].start = s - strbeg;
             prog->offs[0].end = utf8_target
-                ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
+                ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
                 : s - strbeg + prog->minlenret;
             if ( !(flags & REXEC_NOT_FIRST) )
                 S_reg_set_capture_string(aTHX_ rx,
@@ -3504,7 +3573,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
                if (*s == ch) {
                    DEBUG_EXECUTE_r( did_match = 1 );
                    if (regtry(reginfo, &s)) goto got_it;
-                   s += UTF8SKIP(s);
+                   s += UTF8_SAFE_SKIP(s, strend);
                    while (s < strend && *s == ch)
                        s += UTF8SKIP(s);
                }
@@ -5465,16 +5534,20 @@ S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos,
 
 /* push a new state then goto it */
 
-#define PUSH_STATE_GOTO(state, node, input) \
+#define PUSH_STATE_GOTO(state, node, input, eol, sr0)       \
     pushinput = input; \
+    pusheol = eol; \
+    pushsr0 = sr0; \
     scan = node; \
     st->resume_state = state; \
     goto push_state;
 
 /* push a new state with success backtracking, then goto it */
 
-#define PUSH_YES_STATE_GOTO(state, node, input) \
+#define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0)   \
     pushinput = input; \
+    pusheol = eol;     \
+    pushsr0 = sr0; \
     scan = node; \
     st->resume_state = state; \
     goto push_yes_state;
@@ -5595,8 +5668,8 @@ The topmost backtrack state, pointed to by st, is usually free. If you
 want to claim it, populate any ST.foo fields in it with values you wish to
 save, then do one of
 
-       PUSH_STATE_GOTO(resume_state, node, newinput);
-       PUSH_YES_STATE_GOTO(resume_state, node, newinput);
+       PUSH_STATE_GOTO(resume_state, node, newinput, new_eol, new_sr0);
+       PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol, new_sr0);
 
 which sets that backtrack state's resume value to 'resume_state', pushes a
 new free entry to the top of the backtrack stack, then goes to 'node'.
@@ -5624,6 +5697,16 @@ allocated, and is never freed until interpreter destruction. When the slab
 is full, a new one is allocated and chained to the end. At exit from
 regmatch(), slabs allocated since entry are freed.
 
+In order to work with variable length lookbehinds, an upper limit is placed on
+lookbehinds which is set to where the match position is at the end of where the
+lookbehind would get to.  Nothing in the lookbehind should match above that,
+except we should be able to look beyond it for things like \b, which need the
+next character in the string to be able to determine if this is a boundary or
+not.  We also can't match the end of string/line unless we are also at the end
+of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
+that match a width, we have to add a condition that they are within the legal
+bounds of our window into the string.
+
 */
 
 /* returns -1 on failure, $+[0] on success */
@@ -5647,6 +5730,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
     char *locinput = startpos;
     char *loceol = reginfo->strend;
     char *pushinput; /* where to continue after a PUSH */
+    char *pusheol;   /* where to stop matching (loceol) after a PUSH */
+    U8   *pushsr0;   /* save starting pos of script run */
     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
 
     bool result = 0;       /* return value of S_regmatch */
@@ -5783,7 +5868,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
            /* update the startpoint */
            st->u.keeper.val = rex->offs[0].start;
            rex->offs[0].start = locinput - reginfo->strbeg;
-           PUSH_STATE_GOTO(KEEPS_next, next, locinput);
+           PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
+                            script_run_begin);
            NOT_REACHED; /* NOTREACHED */
 
        case KEEPS_next_fail:
@@ -6114,6 +6200,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
                    while (chars) {
                        if (utf8_target) {
+                            /* XXX This assumes the length is well-formed, as
+                             * does the UTF8SKIP below */
                            uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
                                                    uniflags);
                            uc += len;
@@ -6157,7 +6245,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
            });
 
            if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
-               PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
+               PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
+                                script_run_begin);
                NOT_REACHED; /* NOTREACHED */
            }
            /* only one choice left - just continue */
@@ -6727,6 +6816,33 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
         case ANYOFH:
             if (   ! utf8_target
                 ||   NEXTCHR_IS_EOS
+                ||   ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8((U8) *locinput)
+               || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
+                                                                   utf8_target))
+            {
+                sayNO;
+            }
+            goto increment_locinput;
+            break;
+
+        case ANYOFHb:
+            if (   ! utf8_target
+                ||   NEXTCHR_IS_EOS
+                ||   ANYOF_FLAGS(scan) != (U8) *locinput
+               || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
+                                                                  utf8_target))
+            {
+                sayNO;
+            }
+            goto increment_locinput;
+            break;
+
+        case ANYOFHr:
+            if (   ! utf8_target
+                ||   NEXTCHR_IS_EOS
+                || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
+                             LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
+                             HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
                || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
                                                                    utf8_target))
             {
@@ -6906,7 +7022,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                         }
                         break;
                 }
-                locinput += UTF8SKIP(locinput);
+                locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
             }
             break;
 
@@ -6956,7 +7072,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
            }
            break;
             
-       case NREFFL:  /*  /\g{name}/il  */
+       case REFFLN:  /*  /\g{name}/il  */
        {   /* The capture buffer cases.  The ones beginning with N for the
               named buffers just convert to the equivalent numbered and
               pretend they were called as the corresponding numbered buffer
@@ -6976,28 +7092,28 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
            utf8_fold_flags = FOLDEQ_LOCALE;
            goto do_nref;
 
-       case NREFFA:  /*  /\g{name}/iaa  */
+       case REFFAN:  /*  /\g{name}/iaa  */
            folder = foldEQ_latin1;
            fold_array = PL_fold_latin1;
            type = REFFA;
            utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
            goto do_nref;
 
-       case NREFFU:  /*  /\g{name}/iu  */
+       case REFFUN:  /*  /\g{name}/iu  */
            folder = foldEQ_latin1;
            fold_array = PL_fold_latin1;
            type = REFFU;
            utf8_fold_flags = 0;
            goto do_nref;
 
-       case NREFF:  /*  /\g{name}/i  */
+       case REFFN:  /*  /\g{name}/i  */
            folder = foldEQ;
            fold_array = PL_fold;
            type = REFF;
            utf8_fold_flags = 0;
            goto do_nref;
 
-       case NREF:  /*  /\g{name}/   */
+       case REFN:  /*  /\g{name}/   */
            type = REF;
            folder = NULL;
            fold_array = NULL;
@@ -7393,7 +7509,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 PL_curpm = PL_reg_curpm;
 
                if (logical != 2) {
-                    PUSH_STATE_GOTO(EVAL_B, next, locinput);
+                    PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
+                                    script_run_begin);
                    /* NOTREACHED */
                 }
            }
@@ -7493,7 +7610,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                ST.prev_eval = cur_eval;
                cur_eval = st;
                /* now continue from first node in postoned RE */
-               PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput);
+               PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
+                                    loceol, script_run_begin);
                NOT_REACHED; /* NOTREACHED */
        }
 
@@ -7649,7 +7767,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
            sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
            break;
 
-       case NGROUPP:  /*  (?(<name>))  */
+       case GROUPPN:  /*  (?(<name>))  */
            /* reg_check_named_buff_matched returns 0 for no match */
            sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
            break;
@@ -7793,7 +7911,8 @@ NULL
            ST.count = -1;      /* this will be updated by WHILEM */
            ST.lastloc = NULL;  /* this will be updated by WHILEM */
 
-           PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
+           PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol,
+                                script_run_begin);
            NOT_REACHED; /* NOTREACHED */
        }
 
@@ -7841,7 +7960,8 @@ NULL
                cur_curlyx->u.curlyx.lastloc = locinput;
                REGCP_SET(ST.lastcp);
 
-               PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
+               PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
+                                script_run_begin);
                NOT_REACHED; /* NOTREACHED */
            }
 
@@ -7949,7 +8069,7 @@ NULL
                ST.save_curlyx = cur_curlyx;
                cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
                PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
-                                    locinput);
+                                    locinput, loceol, script_run_begin);
                NOT_REACHED; /* NOTREACHED */
            }
 
@@ -7960,7 +8080,8 @@ NULL
                             maxopenparen);
                cur_curlyx->u.curlyx.lastloc = locinput;
                REGCP_SET(ST.lastcp);
-               PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
+               PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
+                                script_run_begin);
                NOT_REACHED; /* NOTREACHED */
            }
            goto do_whilem_B_max;
@@ -8012,7 +8133,7 @@ NULL
            ST.save_curlyx = cur_curlyx;
            cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
            PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
-                                locinput);
+                                locinput, loceol, script_run_begin);
            NOT_REACHED; /* NOTREACHED */
 
        case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
@@ -8043,7 +8164,7 @@ NULL
            REGCP_SET(ST.lastcp);
            PUSH_STATE_GOTO(WHILEM_A_min,
                /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
-                locinput);
+                locinput, loceol, script_run_begin);
            NOT_REACHED; /* NOTREACHED */
 
 #undef  ST
@@ -8065,9 +8186,11 @@ NULL
 
            /* Now go into the branch */
            if (has_cutgroup) {
-               PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
+               PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
+                                    script_run_begin);
            } else {
-               PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
+               PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
+                                script_run_begin);
            }
            NOT_REACHED; /* NOTREACHED */
 
@@ -8075,7 +8198,8 @@ NULL
             sv_yes_mark = st->u.mark.mark_name = scan->flags
                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
                 : NULL;
-            PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
+            PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
+                            script_run_begin);
             NOT_REACHED; /* NOTREACHED */
 
         case CUTGROUP_next_fail:
@@ -8152,7 +8276,8 @@ NULL
                goto curlym_do_B;
 
          curlym_do_A: /* execute the A in /A{m,n}B/  */
-           PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
+           PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
+                                script_run_begin);
            NOT_REACHED; /* NOTREACHED */
 
        case CURLYM_A: /* we've just matched an A */
@@ -8222,8 +8347,15 @@ NULL
                );
            if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
-                    if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
-                        && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
+
+                           /* (We can use memEQ and memNE in this file without
+                            * having to worry about one being shorter than the
+                            * other, since the first byte of each gives the
+                            * length of the character) */
+                    if (   memNE(locinput, ST.c1_utf8, UTF8_SAFE_SKIP(locinput,
+                                                              reginfo->strend))
+                        && memNE(locinput, ST.c2_utf8, UTF8_SAFE_SKIP(locinput,
+                                                             reginfo->strend)))
                     {
                         /* simulate B failing */
                         DEBUG_OPTIMISE_r(
@@ -8269,7 +8401,8 @@ NULL
                }
            }
            
-           PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
+           PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol,   /* match B */
+                            script_run_begin);
            NOT_REACHED; /* NOTREACHED */
 
        case CURLYM_B_fail: /* just failed to match a B */
@@ -8485,20 +8618,26 @@ NULL
                    n = (ST.oldloc == locinput) ? 0 : 1;
                    if (ST.c1 == ST.c2) {
                        /* set n to utf8_distance(oldloc, locinput) */
-                       while (locinput <= ST.maxpos
-                              && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
+                       while (    locinput <= ST.maxpos
+                               &&  locinput < loceol
+                               &&  memNE(locinput, ST.c1_utf8,
+                                    UTF8_SAFE_SKIP(locinput, reginfo->strend)))
                         {
-                           locinput += UTF8SKIP(locinput);
+                           locinput += UTF8_SAFE_SKIP(locinput,
+                                                       reginfo->strend);
                            n++;
                        }
                    }
                    else {
                        /* set n to utf8_distance(oldloc, locinput) */
-                       while (locinput <= ST.maxpos
-                              && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
-                              && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
+                       while (   locinput <= ST.maxpos
+                               && locinput < loceol
+                               && memNE(locinput, ST.c1_utf8,
+                                     UTF8_SAFE_SKIP(locinput, reginfo->strend))
+                               && memNE(locinput, ST.c2_utf8,
+                                    UTF8_SAFE_SKIP(locinput, reginfo->strend)))
                         {
-                           locinput += UTF8SKIP(locinput);
+                           locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
                            n++;
                        }
                    }
@@ -8563,7 +8702,8 @@ NULL
 
           curly_try_B_min:
             CURLY_SETPAREN(ST.paren, ST.count);
-            PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
+            PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
+                            script_run_begin);
            NOT_REACHED; /* NOTREACHED */
 
 
@@ -8576,21 +8716,22 @@ NULL
                 if (ST.c1 != CHRTEST_VOID && could_match) {
                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
                     {
-                        could_match = memEQ(locinput,
-                                            ST.c1_utf8,
-                                            UTF8SKIP(locinput))
-                                    || memEQ(locinput,
-                                             ST.c2_utf8,
-                                             UTF8SKIP(locinput));
+                        could_match =  memEQ(locinput, ST.c1_utf8,
+                                             UTF8_SAFE_SKIP(locinput,
+                                                            reginfo->strend))
+                                    || memEQ(locinput, ST.c2_utf8,
+                                             UTF8_SAFE_SKIP(locinput,
+                                                            reginfo->strend));
                     }
                     else {
-                        could_match = UCHARAT(locinput) == ST.c1
-                                      || UCHARAT(locinput) == ST.c2;
+                        could_match =   UCHARAT(locinput) == ST.c1
+                                     || UCHARAT(locinput) == ST.c2;
                     }
                 }
                 if (ST.c1 == CHRTEST_VOID || could_match) {
                    CURLY_SETPAREN(ST.paren, ST.count);
-                   PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
+                   PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
+                                    script_run_begin);
                    NOT_REACHED; /* NOTREACHED */
                }
            }
@@ -8645,8 +8786,9 @@ NULL
 
                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
 
-                PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B,
-                                    locinput); /* match B */
+                PUSH_YES_STATE_GOTO(EVAL_postponed_AB,          /* match B */
+                                    st->u.eval.prev_eval->u.eval.B,
+                                    locinput, loceol, script_run_begin);
            }
 
            if (locinput < reginfo->till) {
@@ -8670,12 +8812,11 @@ NULL
 #undef  ST
 #define ST st->u.ifmatch
 
-        {
-            char *newstart;
-
        case SUSPEND:   /* (?>A) */
            ST.wanted = 1;
-           newstart = locinput;
+           ST.start = locinput;
+           ST.end = loceol;
+            ST.count = 1;
            goto do_ifmatch;    
 
        case UNLESSM:   /* -ve lookaround: (?!A), or with 'flags', (?<!A) */
@@ -8685,25 +8826,47 @@ NULL
        case IFMATCH:   /* +ve lookaround: (?=A), or with 'flags', (?<=A) */
            ST.wanted = 1;
          ifmatch_trivial_fail_test:
-           if (scan->flags) {
-               char * const s = HOPBACKc(locinput, scan->flags);
-               if (!s) {
-                   /* trivial fail */
-                   if (logical) {
-                       logical = 0;
-                       sw = 1 - cBOOL(ST.wanted);
-                   }
-                   else if (ST.wanted)
-                       sayNO;
-                   next = scan + ARG(scan);
-                   if (next == scan)
-                       next = NULL;
-                   break;
-               }
-               newstart = s;
+            ST.count = scan->next_off + 1; /* next_off repurposed to be
+                                              lookbehind count, requires
+                                              non-zero flags */
+           if (! scan->flags) {    /* 'flags' zero means lookahead */
+
+                /* Lookahead starts here and ends at the normal place */
+               ST.start = locinput;
+               ST.end = loceol;
+            }
+           else {
+                PERL_UINT_FAST8_T back_count = scan->flags;
+               char * s;
+
+                /* Lookbehind can look beyond the current position */
+               ST.end = loceol;
+
+                /* ... and starts at the first place in the input that is in
+                 * the range of the possible start positions */
+                for (; ST.count > 0; ST.count--, back_count--) {
+                    s = HOPBACKc(locinput, back_count);
+                    if (s) {
+                        ST.start = s;
+                        goto do_ifmatch;
+                    }
+                }
+
+                /* If the lookbehind doesn't start in the actual string, it
+                 * is a trivial match failure */
+                if (logical) {
+                    logical = 0;
+                    sw = 1 - cBOOL(ST.wanted);
+                }
+                else if (ST.wanted)
+                    sayNO;
+
+                /* Here, we didn't want a match, so it's actually success */
+                next = scan + ARG(scan);
+                if (next == scan)
+                    next = NULL;
+                break;
            }
-           else
-               newstart = locinput;
 
          do_ifmatch:
            ST.me = scan;
@@ -8711,29 +8874,48 @@ NULL
            logical = 0; /* XXX: reset state of logical once it has been saved into ST */
            
            /* execute body of (?...A) */
-           PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
+           PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start,
+                                ST.end, script_run_begin);
            NOT_REACHED; /* NOTREACHED */
-        }
+
+        {
+            bool matched;
 
        case IFMATCH_A_fail: /* body of (?...A) failed */
-           ST.wanted = !ST.wanted;
-           /* FALLTHROUGH */
+           if (! ST.logical && ST.count > 1) {
+
+                /* It isn't a real failure until we've tried all starting
+                 * positions.  Move to the next starting position and retry */
+                ST.count--;
+                ST.start = HOPc(ST.start, 1);
+                scan = ST.me;
+                logical = ST.logical;
+                goto do_ifmatch;
+            }
+
+            /* Here, all starting positions have been tried. */
+           matched = FALSE;
+           goto ifmatch_done;
 
        case IFMATCH_A: /* body of (?...A) succeeded */
-           if (ST.logical) {
-               sw = cBOOL(ST.wanted);
-           }
-           else if (!ST.wanted)
-               sayNO;
+           matched = TRUE;
+          ifmatch_done:
+            sw = matched == ST.wanted;
+           if (! ST.logical && !sw) {
+                sayNO;
+            }
 
            if (OP(ST.me) != SUSPEND) {
                 /* restore old position except for (?>...) */
                locinput = st->locinput;
+                loceol = st->loceol;
+                script_run_begin = st->sr0;
            }
            scan = ST.me + ARG(ST.me);
            if (scan == ST.me)
                scan = NULL;
            continue; /* execute B */
+        }
 
 #undef ST
 
@@ -8751,7 +8933,8 @@ NULL
        case PRUNE:   /*  (*PRUNE)   */
             if (scan->flags)
                sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
-           PUSH_STATE_GOTO(COMMIT_next, next, locinput);
+           PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
+                            script_run_begin);
            NOT_REACHED; /* NOTREACHED */
 
        case COMMIT_next_fail:
@@ -8781,7 +8964,8 @@ NULL
                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
             mark_state = st;
             ST.mark_loc = locinput;
-            PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
+            PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
+                                script_run_begin);
             NOT_REACHED; /* NOTREACHED */
 
         case MARKPOINT_next:
@@ -8814,7 +8998,8 @@ NULL
                 /* (*SKIP) : if we fail we cut here*/
                 ST.mark_name = NULL;
                 ST.mark_loc = locinput;
-                PUSH_STATE_GOTO(SKIP_next,next, locinput);
+                PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
+                                script_run_begin);
             } else {
                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, 
                    otherwise do nothing.  Meaning we need to scan 
@@ -8827,7 +9012,8 @@ NULL
                                 find ) ) 
                     {
                         ST.mark_name = find;
-                        PUSH_STATE_GOTO( SKIP_next, next, locinput);
+                        PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
+                                         script_run_begin);
                     }
                     cur = cur->u.mark.prev_mark;
                 }
@@ -8923,12 +9109,16 @@ NULL
             );
            depth++;
            st->locinput = locinput;
+           st->loceol = loceol;
+            st->sr0 = script_run_begin;
            newst = st+1; 
            if (newst >  SLAB_LAST(PL_regmatch_slab))
                newst = S_push_slab(aTHX);
            PL_regmatch_state = newst;
 
            locinput = pushinput;
+            loceol = pusheol;
+            script_run_begin = pushsr0;
            st = newst;
            continue;
             /* NOTREACHED */
@@ -8981,8 +9171,11 @@ NULL
        yes_state = st->u.yes.prev_yes_state;
        PL_regmatch_state = st;
         
-        if (no_final)
+        if (no_final) {
             locinput= st->locinput;
+            loceol= st->loceol;
+            script_run_begin = st->sr0;
+        }
        state_num = st->resume_state + no_final;
        goto reenter_switch;
     }
@@ -9032,6 +9225,8 @@ NULL
        }
        PL_regmatch_state = st;
        locinput= st->locinput;
+       loceol= st->loceol;
+        script_run_begin = st->sr0;
 
        DEBUG_STATE_pp("pop");
        depth--;
@@ -9314,19 +9509,22 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
                 if (c1 == c2) {
                     while (scan < this_eol
                            && hardcount < max
-                           && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
+                           && memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
+                                                                  loceol)))
                     {
-                        scan += UTF8SKIP(scan);
+                        scan += UTF8SKIP(c1_utf8);
                         hardcount++;
                     }
                 }
                 else {
                     while (scan < this_eol
                            && hardcount < max
-                           && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
-                               || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
+                           && (   memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
+                                                                     loceol))
+                               || memEQ(scan, c2_utf8, UTF8_SAFE_SKIP(scan,
+                                                                     loceol))))
                     {
-                        scan += UTF8SKIP(scan);
+                        scan += UTF8_SAFE_SKIP(scan, loceol);
                         hardcount++;
                     }
                 }
@@ -9417,13 +9615,47 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
         break;
 
     case ANYOFH:
-        if (utf8_target) while (   hardcount < max
-                                && scan < this_eol
-                                && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
-                                                                  TRUE))
-        {
-            scan += UTF8SKIP(scan);
-            hardcount++;
+        if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
+            while (  hardcount < max
+                   && scan < this_eol
+                   && NATIVE_UTF8_TO_I8((U8) *scan) >= ANYOF_FLAGS(p)
+                   && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
+            {
+                scan += UTF8SKIP(scan);
+                hardcount++;
+            }
+        }
+        break;
+
+    case ANYOFHb:
+        if (utf8_target) {  /* ANYOFHb only can match UTF-8 targets */
+
+            /* we know the first byte must be the FLAGS field */
+            while (   hardcount < max
+                   && scan < this_eol
+                   && (U8) *scan == ANYOF_FLAGS(p)
+                   && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
+                                                              TRUE))
+            {
+                scan += UTF8SKIP(scan);
+                hardcount++;
+            }
+        }
+        break;
+
+    case ANYOFHr:
+        if (utf8_target) {  /* ANYOFHr only can match UTF-8 targets */
+            while (  hardcount < max
+                   && scan < this_eol
+                   && inRANGE((U8) NATIVE_UTF8_TO_I8(*scan),
+                              LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
+                              HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
+                   && NATIVE_UTF8_TO_I8((U8) *scan) >= ANYOF_FLAGS(p)
+                   && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
+            {
+                scan += UTF8SKIP(scan);
+                hardcount++;
+            }
         }
         break;
 
@@ -9670,7 +9902,9 @@ STATIC bool
 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
 {
     dVAR;
-    const char flags = ANYOF_FLAGS(n);
+    const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHr))
+                        ? 0
+                        : ANYOF_FLAGS(n);
     bool match = FALSE;
     UV c = *p;
 
@@ -9697,7 +9931,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
     }
 
     /* If this character is potentially in the bitmap, check it */
-    if (c < NUM_ANYOF_CODE_POINTS && OP(n) != ANYOFH) {
+    if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
        if (ANYOF_BITMAP_TEST(n, c))
            match = TRUE;
        else if ((flags
@@ -9999,6 +10233,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
 
     eval_state->rex = rex;
+    eval_state->sv  = reginfo->sv;
 
     if (reginfo->sv) {
         /* Make $_ available to executed code. */
@@ -10006,6 +10241,8 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
             SAVE_DEFSV;
             DEFSV_set(reginfo->sv);
         }
+        /* will be dec'd by S_cleanup_regmatch_info_aux */
+        SvREFCNT_inc_NN(reginfo->sv);
 
         if (!(mg = mg_find_mglob(reginfo->sv))) {
             /* prepare for quick setting of pos */
@@ -10097,6 +10334,7 @@ S_cleanup_regmatch_info_aux(pTHX_ void *arg)
         }
 
         PL_curpm = eval_state->curpm;
+        SvREFCNT_dec(eval_state->sv);
     }
 
     PL_regmatch_state = aux->old_regmatch_state;
@@ -10167,6 +10405,7 @@ S_to_byte_substr(pTHX_ regexp *prog)
            && !prog->substrs->data[i].substr) {
            SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
            if (! sv_utf8_downgrade(sv, TRUE)) {
+                SvREFCNT_dec_NN(sv);
                 return FALSE;
             }
             if (SvVALID(prog->substrs->data[i].utf8_substr)) {