X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/24e16d7b405f10168aae144d4a2c37d9c6443b9e..b41db090e21841b21ad0a9529cf6f76acc688dea:/regexec.c

diff --git a/regexec.c b/regexec.c
index 340a49e..0892554 100644
--- a/regexec.c
+++ b/regexec.c
@@ -96,6 +96,12 @@ static const char* const non_utf8_target_but_utf8_required
                 = "Can't match, because target string needs to be in UTF-8\n";
 #endif
 
+/* Returns a boolean as to whether the input unsigned number is a power of 2
+ * (2**0, 2**1, etc).  In other words if it has just a single bit set.
+ * If not, subtracting 1 would leave the uppermost bit set, so the & would
+ * yield non-zero */
+#define isPOWER_OF_2(n) ((n & (n-1)) == 0)
+
 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
     goto target;                                                         \
@@ -119,7 +125,6 @@ static const char* const non_utf8_target_but_utf8_required
  */
 
 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
-#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 
 #define HOPc(pos,off) \
 	(char *)(reginfo->is_utf8_target \
@@ -127,13 +132,16 @@ static const char* const non_utf8_target_but_utf8_required
                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 	    : (U8*)(pos + off))
 
-#define HOPBACKc(pos, off) \
-	(char*)(reginfo->is_utf8_target \
-	    ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(reginfo->strbeg)) \
-	    : (pos - off >= reginfo->strbeg)	\
-		? (U8*)pos - off		\
+/* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
+#define HOPBACK3(pos, off, lim) \
+	(reginfo->is_utf8_target                          \
+	    ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
+	    : (pos - off >= lim)	                         \
+		? (U8*)pos - off		                 \
 		: NULL)
 
+#define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
+
 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 
@@ -150,6 +158,7 @@ static const char* const non_utf8_target_but_utf8_required
 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
     : (U8*)((pos + off) > lim ? lim : (pos + off)))
+#define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
 
 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
@@ -445,6 +454,8 @@ S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 
 #define regcpblow(cp) LEAVE_SCOPE(cp)	/* Ignores regcppush()ed data. */
 
+#ifndef PERL_IN_XSUB_RE
+
 bool
 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 {
@@ -486,6 +497,8 @@ Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
     return FALSE;
 }
 
+#endif
+
 STATIC bool
 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 {
@@ -539,6 +552,116 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
     return FALSE; /* Things like CNTRL are always below 256 */
 }
 
+STATIC char *
+S_find_next_ascii(char * s, const char * send, const bool utf8_target)
+{
+    /* Returns the position of the first ASCII byte in the sequence between 's'
+     * and 'send-1' inclusive; returns 'send' if none found */
+
+    PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
+
+#ifndef EBCDIC
+
+    if ((STRLEN) (send - s) >= PERL_WORDSIZE
+
+                            /* This term is wordsize if subword; 0 if not */
+                          + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
+
+                            /* 'offset' */
+                          - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
+    {
+
+        /* Process per-byte until reach word boundary.  XXX This loop could be
+         * eliminated if we knew that this platform had fast unaligned reads */
+        while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
+            if (isASCII(*s)) {
+                return s;
+            }
+            s++;    /* khw didn't bother creating a separate loop for
+                       utf8_target */
+        }
+
+        /* Here, we know we have at least one full word to process.  Process
+         * per-word as long as we have at least a full word left */
+        do {
+            if ((* (PERL_UINTMAX_T *) s) & ~ PERL_VARIANTS_WORD_MASK)  {
+                break;
+            }
+            s += PERL_WORDSIZE;
+        } while (s + PERL_WORDSIZE <= send);
+    }
+
+#endif
+
+    /* Process per-character */
+    if (utf8_target) {
+        while (s < send) {
+            if (isASCII(*s)) {
+                return s;
+            }
+            s += UTF8SKIP(s);
+        }
+    }
+    else {
+        while (s < send) {
+            if (isASCII(*s)) {
+                return s;
+            }
+            s++;
+        }
+    }
+
+    return s;
+}
+
+STATIC char *
+S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
+{
+    /* Returns the position of the first non-ASCII byte in the sequence between
+     * 's' and 'send-1' inclusive; returns 'send' if none found */
+
+#ifdef EBCDIC
+
+    PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
+
+    if (utf8_target) {
+        while (s < send) {
+            if ( ! isASCII(*s)) {
+                return s;
+            }
+            s += UTF8SKIP(s);
+        }
+    }
+    else {
+        while (s < send) {
+            if ( ! isASCII(*s)) {
+                return s;
+            }
+            s++;
+        }
+    }
+
+    return s;
+
+#else
+
+    const U8 * next_non_ascii = NULL;
+
+    PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
+    PERL_UNUSED_ARG(utf8_target);
+
+    /* On ASCII platforms invariants and ASCII are identical, so if the string
+     * is entirely invariants, there is no non-ASCII character */
+    return (is_utf8_invariant_string_loc((U8 *) s,
+                                         (STRLEN) (send - s),
+                                         &next_non_ascii))
+            ? (char *) send
+            : (char *) next_non_ascii;
+
+#endif
+
+}
+
 /*
  * pregexec and friends
  */
@@ -700,7 +823,7 @@ Perl_re_intuit_start(pTHX_
 	goto fail;
     }
 
-    RX_MATCH_UTF8_set(rx,utf8_target);
+    RXp_MATCH_UTF8_set(prog, utf8_target);
     reginfo->is_utf8_target = cBOOL(utf8_target);
     reginfo->info_aux = NULL;
     reginfo->strbeg = strbeg;
@@ -834,7 +957,7 @@ Perl_re_intuit_start(pTHX_
 #ifdef DEBUGGING	/* 7/99: reports of failure (with the older version) */
     if (end_shift < 0)
 	Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
-		   (IV)end_shift, RX_PRECOMP(prog));
+		   (IV)end_shift, RX_PRECOMP(rx));
 #endif
 
   restart:
@@ -880,7 +1003,9 @@ Perl_re_intuit_start(pTHX_
                 (IV)prog->check_end_shift);
         });
         
-        end_point = HOP3(strend, -end_shift, strbeg);
+        end_point = HOPBACK3(strend, end_shift, rx_origin);
+        if (!end_point)
+            goto fail_finish;
         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
         if (!start_point)
             goto fail_finish;
@@ -898,19 +1023,28 @@ Perl_re_intuit_start(pTHX_
             && prog->intflags & PREGf_ANCH
             && prog->check_offset_max != SSize_t_MAX)
         {
-            SSize_t len = SvCUR(check) - !!SvTAIL(check);
+            SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
             const char * const anchor =
                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
+            SSize_t targ_len = (char*)end_point - anchor;
+
+            if (check_len > targ_len) {
+                DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
+			      "Anchored string too short...\n"));
+                goto fail_finish;
+            }
 
             /* do a bytes rather than chars comparison. It's conservative;
              * so it skips doing the HOP if the result can't possibly end
              * up earlier than the old value of end_point.
              */
-            if ((char*)end_point - anchor > prog->check_offset_max) {
+            assert(anchor + check_len <= (char *)end_point);
+            if (prog->check_offset_max + check_len < targ_len) {
                 end_point = HOP3lim((U8*)anchor,
                                 prog->check_offset_max,
-                                end_point -len)
-                            + len;
+                                end_point - check_len
+                            )
+                            + check_len;
             }
         }
 
@@ -1288,10 +1422,10 @@ Perl_re_intuit_start(pTHX_
          */
 
 	if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
-            endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
+            endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
         else if (prog->float_substr || prog->float_utf8) {
 	    rx_max_float = HOP3c(check_at, -start_shift, strbeg);
-	    endpos= HOP3c(rx_max_float, cl_l, strend);
+	    endpos = HOP3clim(rx_max_float, cl_l, strend);
         }
         else 
             endpos= strend;
@@ -1500,8 +1634,9 @@ STMT_START {
             uscan += len;                                                           \
             len=0;                                                                  \
         } else {                                                                    \
-            uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags);   \
             len = UTF8SKIP(uc);                                                     \
+            uvc = _toFOLD_utf8_flags( (const U8*) uc, uc + len, foldbuf, &foldlen,  \
+                                                                            flags); \
             skiplen = UVCHR_SKIP( uvc );                                            \
             foldlen -= skiplen;                                                     \
             uscan = foldbuf + skiplen;                                              \
@@ -1969,10 +2104,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
          * trying that it will fail; so don't start a match past the
          * required minimum number from the far end */
         e = HOP3c(strend, -((SSize_t)ln), s);
-
-        if (reginfo->intuit && e < s) {
-            e = s;			/* Due to minlen logic of intuit() */
-        }
+        if (e < s)
+            break;
 
         c1 = *pat_string;
         c2 = fold_array[c1];
@@ -2016,10 +2149,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
          */
         e = HOP3c(strend, -((SSize_t)lnc), s);
 
-        if (reginfo->intuit && e < s) {
-            e = s;			/* Due to minlen logic of intuit() */
-        }
-
         /* XXX Note that we could recalculate e to stop the loop earlier,
          * as the worst case expansion above will rarely be met, and as we
          * go along we would usually find that e moves further to the left.
@@ -2360,6 +2489,22 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         );
         break;
 
+    case ASCII:
+        s = find_next_ascii(s, strend, utf8_target);
+        if (s < strend && (reginfo->intuit || regtry(reginfo, &s))) {
+            goto got_it;
+        }
+
+        break;
+
+    case NASCII:
+        s = find_next_non_ascii(s, strend, utf8_target);
+        if (s < strend && (reginfo->intuit || regtry(reginfo, &s))) {
+            goto got_it;
+        }
+
+        break;
+
     /* The argument to all the POSIX node types is the class number to pass to
      * _generic_isCC() to build a mask for searching in PL_charclass[] */
 
@@ -2393,12 +2538,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         }
 
         to_complement = 1;
-        /* FALLTHROUGH */
+        goto posixa;
 
     case POSIXA:
-      posixa:
         /* Don't need to worry about utf8, as it can match only a single
-         * byte invariant character. */
+         * byte invariant character.  But we do anyway for performance reasons,
+         * as otherwise we would have to examine all the continuation
+         * characters */
+        if (utf8_target) {
+            REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c)));
+            break;
+        }
+
+      posixa:
         REXEC_FBC_CLASS_SCAN(
                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
         break;
@@ -2429,7 +2581,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     if ((UTF8_IS_INVARIANT(*s)
                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
                                                                 classnum)))
-                        || (UTF8_IS_DOWNGRADEABLE_START(*s)
+                        || (   UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, strend)
                             && to_complement ^ cBOOL(
                                 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
                                                                       *(s + 1)),
@@ -2767,7 +2919,7 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx,
             }
             else {
                 /* create new COW SV to share string */
-                RX_MATCH_COPY_FREE(rx);
+                RXp_MATCH_COPY_FREE(prog);
                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
             }
             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
@@ -2830,7 +2982,7 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx,
             assert(min >= 0 && min <= max && min <= strend - strbeg);
             sublen = max - min;
 
-            if (RX_MATCH_COPIED(rx)) {
+            if (RXp_MATCH_COPIED(prog)) {
                 if (sublen > prog->sublen)
                     prog->subbeg =
                             (char*)saferealloc(prog->subbeg, sublen+1);
@@ -2841,7 +2993,7 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx,
             prog->subbeg[sublen] = '\0';
             prog->suboffset = min;
             prog->sublen = sublen;
-            RX_MATCH_COPIED_on(rx);
+            RXp_MATCH_COPIED_on(prog);
         }
         prog->subcoffset = prog->suboffset;
         if (prog->suboffset && utf8_target) {
@@ -2868,7 +3020,7 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx,
         }
     }
     else {
-        RX_MATCH_COPY_FREE(rx);
+        RXp_MATCH_COPY_FREE(prog);
         prog->subbeg = strbeg;
         prog->suboffset = 0;
         prog->subcoffset = 0;
@@ -3025,7 +3177,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
             /* match via INTUIT shouldn't have any captures.
              * Let @-, @+, $^N know */
             prog->lastparen = prog->lastcloseparen = 0;
-            RX_MATCH_UTF8_set(rx, utf8_target);
+            RXp_MATCH_UTF8_set(prog, utf8_target);
             prog->offs[0].start = s - strbeg;
             prog->offs[0].end = utf8_target
                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
@@ -3052,8 +3204,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
 	Perl_croak(aTHX_ "corrupted regexp program");
     }
 
-    RX_MATCH_TAINTED_off(rx);
-    RX_MATCH_UTF8_set(rx, utf8_target);
+    RXp_MATCH_TAINTED_off(prog);
+    RXp_MATCH_UTF8_set(prog, utf8_target);
 
     reginfo->prog = rx;	 /* Yes, sorry that this is confusing.  */
     reginfo->intuit = 0;
@@ -3375,7 +3527,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
             regprop(prog, prop, c, reginfo, NULL);
 	    {
 		RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
-		    s,strend-s,60);
+		    s,strend-s,PL_dump_re_max_len);
                 Perl_re_printf( aTHX_
 		    "Matching stclass %.*s against %s (%d bytes)\n",
 		    (int)SvCUR(prop), SvPVX_const(prop),
@@ -3895,10 +4047,10 @@ S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
             reginitcolors();    
     {
         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), 
-            RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);   
+            RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
         
         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
-            start, end - start, 60); 
+            start, end - start, PL_dump_re_max_len);
         
         Perl_re_printf( aTHX_
             "%s%s REx%s %s against %s\n", 
@@ -3954,11 +4106,11 @@ S_dump_exec_pos(pTHX_ const char *locinput,
 	const int is_uni = utf8_target ? 1 : 0;
 
 	RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
-	    (locinput - pref_len),pref0_len, 60, 4, 5);
+	    (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
 	
 	RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
 		    (locinput - pref_len + pref0_len),
-		    pref_len - pref0_len, 60, 2, 3);
+		    pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
 	
 	RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
 		    locinput, loc_regeol - locinput, 10, 0, 1);
@@ -4133,10 +4285,11 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
                     }
                     else {
                         STRLEN len;
-                        _to_utf8_fold_flags(s,
-                                            d,
-                                            &len,
-                                            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
+                        _toFOLD_utf8_flags(s,
+                                           pat_end,
+                                           d,
+                                           &len,
+                                           FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
                         d += len;
                         s += UTF8SKIP(s);
                     }
@@ -4181,7 +4334,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
                 }
                 else {  /* Does participate in folds */
                     AV* list = (AV*) *listp;
-                    if (av_tindex_nomg(list) != 1) {
+                    if (av_tindex_skip_len_mg(list) != 1) {
 
                         /* If there aren't exactly two folds to this, it is
                          * outside the scope of this function */
@@ -4340,7 +4493,7 @@ S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strb
                 /* Do not break within emoji flag sequences. That is, do not
                  * break between regional indicator (RI) symbols if there is an
                  * odd number of RI characters before the break point.
-                 *  GB12     ^ (RI RI)* RI Ã RI
+                 *  GB12   sot (RI RI)* RI Ã RI
                  *  GB13 [^RI] (RI RI)* RI Ã RI */
 
                 while (backup_one_GCB(strbeg,
@@ -4632,7 +4785,7 @@ S_isLB(pTHX_ LB_enum before,
                  * only if there are an even number of regional indicators
                  * preceding the position of the break.
                  *
-                 *  sot (RI RI)* RI Ã RI
+                 *    sot (RI RI)* RI Ã RI
                  *  [^RI] (RI RI)* RI Ã RI */
 
                 while (backup_one_LB(strbeg,
@@ -5162,7 +5315,7 @@ S_isWB(pTHX_ WB_enum previous,
                  * odd number of RI characters before the potential break
                  * point.
                  *
-                 * WB15     ^ (RI RI)* RI Ã RI
+                 * WB15   sot (RI RI)* RI Ã RI
                  * WB16 [^RI] (RI RI)* RI Ã RI */
 
                 while (backup_one_WB(&previous,
@@ -5343,6 +5496,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
     regnode *next;
     U32 n = 0;	/* general value; init to avoid compiler warning */
     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
+    SSize_t endref = 0; /* offset of end of backref when ln is start */
     char *locinput = startpos;
     char *pushinput; /* where to continue after a PUSH */
     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
@@ -5369,7 +5523,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
     SV *sv_yes_mark = NULL; /* last mark name we have seen 
                                during a successful match */
     U32 lastopen = 0;       /* last open we saw */
-    bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;   
+    bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
     SV* const oreplsv = GvSVn(PL_replgv);
     /* these three flags are set by various ops to signal information to
      * the very next op. They have a useful lifetime of exactly one loop
@@ -5390,12 +5544,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
     U8 gimme = G_SCALAR;
     CV *caller_cv = NULL;	/* who called us */
     CV *last_pushed_cv = NULL;	/* most recently called (?{}) CV */
-    CHECKPOINT runops_cp;	/* savestack position before executing EVAL */
     U32 maxopenparen = 0;       /* max '(' index seen so far */
     int to_complement;  /* Invert the result? */
     _char_class_number classnum;
     bool is_utf8_pat = reginfo->is_utf8_pat;
     bool match = FALSE;
+    I32 orig_savestack_ix = PL_savestack_ix;
+    U8 * script_run_begin = NULL;
 
 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
@@ -5429,8 +5584,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
     }));
 
     while (scan != NULL) {
-
-
 	next = scan + NEXT_OFF(scan);
 	if (next == scan)
 	    next = NULL;
@@ -5592,6 +5745,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
                     if (utf8_target
+                        && nextchr >= 0 /* guard against negative EOS value in nextchr */
                         && UTF8_IS_ABOVE_LATIN1(nextchr)
                         && scan->flags == EXACTL)
                     {
@@ -5735,6 +5889,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
         {
             U8 *uc;
             if ( ST.jump ) {
+                /* undo any captures done in the tail part of a branch,
+                 * e.g.
+                 *    /(?:X(.)(.)|Y(.)).../
+                 * where the trie just matches X then calls out to do the
+                 * rest of the branch */
                 REGCP_UNWIND(ST.cp);
                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
 	    }
@@ -6349,6 +6508,22 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	    }
 	    break;
 
+        case ASCII:
+            if (NEXTCHR_IS_EOS || ! isASCII(UCHARAT(locinput))) {
+                sayNO;
+            }
+
+            locinput++;     /* ASCII is always single byte */
+            break;
+
+        case NASCII:
+            if (NEXTCHR_IS_EOS || isASCII(UCHARAT(locinput))) {
+                sayNO;
+            }
+
+            goto increment_locinput;
+            break;
+
         /* The argument (FLAGS) to all the POSIX node types is the class number
          * */
 
@@ -6373,8 +6548,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 break;
             }
 
-            if (! UTF8_IS_DOWNGRADEABLE_START(nextchr)) { /* An above Latin-1 code point */
-                _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
+            if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
+                /* An above Latin-1 code point, or malformed */
+                _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
+                                                       reginfo->strend);
                 goto utf8_posix_above_latin1;
             }
 
@@ -6458,7 +6635,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 }
                 locinput++;
             }
-            else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
+            else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
                 if (! (to_complement
                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
                                                                *(locinput + 1)),
@@ -6671,10 +6848,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
 	  do_nref_ref_common:
 	    ln = rex->offs[n].start;
+	    endref = rex->offs[n].end;
 	    reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
-	    if (rex->lastparen < n || ln == -1)
+	    if (rex->lastparen < n || ln == -1 || endref == -1)
 		sayNO;			/* Do not match unless seen CLOSEn. */
-	    if (ln == rex->offs[n].end)
+	    if (ln == endref)
 		break;
 
 	    s = reginfo->strbeg + ln;
@@ -6688,7 +6866,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                     * not going off the end given by reginfo->strend, and
                     * returns in <limit> upon success, how much of the
                     * current input was matched */
-		if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
+		if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
 				    locinput, &limit, 0, utf8_target, utf8_fold_flags))
 		{
 		    sayNO;
@@ -6703,7 +6881,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		(type == REF ||
 		 UCHARAT(s) != fold_array[nextchr]))
 		sayNO;
-	    ln = rex->offs[n].end - ln;
+	    ln = endref - ln;
 	    if (locinput + ln > reginfo->strend)
 		sayNO;
 	    if (ln > 1 && (type == REF
@@ -6782,7 +6960,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             goto eval_recurse_doit;
             /* NOTREACHED */
 
-        case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */        
+        case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
             if (cur_eval && cur_eval->locinput==locinput) {
 		if ( ++nochange_depth > max_nochange_depth )
                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
@@ -6801,7 +6979,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
 		/* save *all* paren positions */
                 regcppush(rex, 0, maxopenparen);
-		REGCP_SET(runops_cp);
+                REGCP_SET(ST.lastcp);
 
 		if (!caller_cv)
 		    caller_cv = find_runcv(NULL);
@@ -6826,30 +7004,67 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		    nop = (OP*)rexi->data->data[n];
 		}
 
-		/* normally if we're about to execute code from the same
-		 * CV that we used previously, we just use the existing
-		 * CX stack entry. However, its possible that in the
-		 * meantime we may have backtracked, popped from the save
-		 * stack, and undone the SAVECOMPPAD(s) associated with
-		 * PUSH_MULTICALL; in which case PL_comppad no longer
-		 * points to newcv's pad. */
+                /* Some notes about MULTICALL and the context and save stacks.
+                 *
+                 * In something like
+                 *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
+                 * since codeblocks don't introduce a new scope (so that
+                 * local() etc accumulate), at the end of a successful
+                 * match there will be a SAVEt_CLEARSV on the savestack
+                 * for each of $x, $y, $z. If the three code blocks above
+                 * happen to have come from different CVs (e.g. via
+                 * embedded qr//s), then we must ensure that during any
+                 * savestack unwinding, PL_comppad always points to the
+                 * right pad at each moment. We achieve this by
+                 * interleaving SAVEt_COMPPAD's on the savestack whenever
+                 * there is a change of pad.
+                 * In theory whenever we call a code block, we should
+                 * push a CXt_SUB context, then pop it on return from
+                 * that code block. This causes a bit of an issue in that
+                 * normally popping a context also clears the savestack
+                 * back to cx->blk_oldsaveix, but here we specifically
+                 * don't want to clear the save stack on exit from the
+                 * code block.
+                 * Also for efficiency we don't want to keep pushing and
+                 * popping the single SUB context as we backtrack etc.
+                 * So instead, we push a single context the first time
+                 * we need, it, then hang onto it until the end of this
+                 * function. Whenever we encounter a new code block, we
+                 * update the CV etc if that's changed. During the times
+                 * in this function where we're not executing a code
+                 * block, having the SUB context still there is a bit
+                 * naughty - but we hope that no-one notices.
+                 * When the SUB context is initially pushed, we fake up
+                 * cx->blk_oldsaveix to be as if we'd pushed this context
+                 * on first entry to S_regmatch rather than at some random
+                 * point during the regexe execution. That way if we
+                 * croak, popping the context stack will ensure that
+                 * *everything* SAVEd by this function is undone and then
+                 * the context popped, rather than e.g., popping the
+                 * context (and restoring the original PL_comppad) then
+                 * popping more of the savestack and restoring a bad
+                 * PL_comppad.
+                 */
+
+                /* If this is the first EVAL, push a MULTICALL. On
+                 * subsequent calls, if we're executing a different CV, or
+                 * if PL_comppad has got messed up from backtracking
+                 * through SAVECOMPPADs, then refresh the context.
+                 */
 		if (newcv != last_pushed_cv || PL_comppad != last_pad)
 		{
                     U8 flags = (CXp_SUB_RE |
                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
+                    SAVECOMPPAD();
 		    if (last_pushed_cv) {
-                        /* PUSH/POP_MULTICALL save and restore the
-                         * caller's PL_comppad; if we call multiple subs
-                         * using the same CX block, we have to save and
-                         * unwind the varying PL_comppad's ourselves,
-                         * especially restoring the right PL_comppad on
-                         * backtrack - so save it on the save stack */
-                        SAVECOMPPAD();
 			CHANGE_MULTICALL_FLAGS(newcv, flags);
 		    }
 		    else {
 			PUSH_MULTICALL_FLAGS(newcv, flags);
 		    }
+                    /* see notes above */
+                    CX_CUR()->blk_oldsaveix = orig_savestack_ix;
+
 		    last_pushed_cv = newcv;
 		}
 		else {
@@ -6931,7 +7146,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		if (logical == 0)        /*   (?{})/   */
 		    sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
 		else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
-		    sw = cBOOL(SvTRUE(ret));
+		    sw = cBOOL(SvTRUE_NN(ret));
 		    logical = 0;
 		}
 		else {                   /*  /(??{})  */
@@ -6966,12 +7181,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		 * in the regexp code uses the pad ! */
 		PL_op = oop;
 		PL_curcop = ocurcop;
-                regcp_restore(rex, runops_cp, &maxopenparen);
+                regcp_restore(rex, ST.lastcp, &maxopenparen);
                 PL_curpm_under = PL_curpm;
                 PL_curpm = PL_reg_curpm;
 
-		if (logical != 2)
-		    break;
+		if (logical != 2) {
+                    PUSH_STATE_GOTO(EVAL_B, next, locinput);
+		    /* NOTREACHED */
+                }
 	    }
 
 		/* only /(??{})/  from now on */
@@ -7069,11 +7286,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		ST.prev_eval = cur_eval;
 		cur_eval = st;
 		/* now continue from first node in postoned RE */
-		PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
+		PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput);
 		NOT_REACHED; /* NOTREACHED */
 	}
 
-	case EVAL_AB: /* cleanup after a successful (??{A})B */
+	case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
             /* note: this is called twice; first after popping B, then A */
             DEBUG_STACK_r({
                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
@@ -7119,7 +7336,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	    sayYES;
 
 
-	case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
+	case EVAL_B_fail: /* unsuccessful B in (?{...})B */
+	    REGCP_UNWIND(ST.lastcp);
+            sayNO;
+
+	case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
 	    /* note: this is called twice; first after popping B, then A */
             DEBUG_STACK_r({
                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
@@ -7165,6 +7386,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             lastopen = n;
 	    break;
 
+        case SROPEN: /*  (*SCRIPT_RUN:  */
+            script_run_begin = (U8 *) locinput;
+            break;
+
 /* XXX really need to log other places start/end are set too */
 #define CLOSE_CAPTURE                                                      \
     rex->offs[n].start = rex->offs[n].start_tmp;                           \
@@ -7190,6 +7415,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
 	    break;
 
+        case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
+
+            if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
+            {
+                sayNO;
+            }
+
+            break;
+
+
         case ACCEPT:  /*  (*ACCEPT)  */
             if (scan->flags)
                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
@@ -7519,9 +7754,6 @@ NULL
 	    if (cur_curlyx->u.curlyx.minmod) {
 		ST.save_curlyx = cur_curlyx;
 		cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
-                ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
-                            maxopenparen);
-		REGCP_SET(ST.lastcp);
 		PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
                                     locinput);
 		NOT_REACHED; /* NOTREACHED */
@@ -7554,11 +7786,11 @@ NULL
 	    CACHEsayNO;
 	    NOT_REACHED; /* NOTREACHED */
 
-	case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
-	    /* FALLTHROUGH */
 	case WHILEM_A_pre_fail: /* just failed to match even minimal A */
 	    REGCP_UNWIND(ST.lastcp);
             regcppop(rex, &maxopenparen);
+	    /* FALLTHROUGH */
+	case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
 	    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
 	    cur_curlyx->u.curlyx.count--;
 	    CACHEsayNO;
@@ -7591,8 +7823,6 @@ NULL
 
 	case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
 	    cur_curlyx = ST.save_curlyx;
-	    REGCP_UNWIND(ST.lastcp);
-            regcppop(rex, &maxopenparen);
 
 	    if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
 		/* Maximum greed exceeded */
@@ -7614,9 +7844,6 @@ NULL
 	    );
 	    /* Try grabbing another A and see if it helps. */
 	    cur_curlyx->u.curlyx.lastloc = locinput;
-            ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
-                            maxopenparen);
-	    REGCP_SET(ST.lastcp);
 	    PUSH_STATE_GOTO(WHILEM_A_min,
 		/*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
                 locinput);
@@ -8080,16 +8307,46 @@ NULL
 		}
 		else {  /* Not utf8_target */
 		    if (ST.c1 == ST.c2) {
-			while (locinput <= ST.maxpos &&
-			       UCHARAT(locinput) != ST.c1)
-			    locinput++;
-		    }
-		    else {
-			while (locinput <= ST.maxpos
-			       && UCHARAT(locinput) != ST.c1
-			       && UCHARAT(locinput) != ST.c2)
-			    locinput++;
+                        locinput = (char *) memchr(locinput,
+                                                   ST.c1,
+                                                   ST.maxpos + 1 - locinput);
+                        if (! locinput) {
+                            locinput = ST.maxpos + 1;
+                        }
 		    }
+                    else {
+                        U8 c1_c2_bits_differing = ST.c1 ^ ST.c2;
+
+                        if (! isPOWER_OF_2(c1_c2_bits_differing)) {
+                            while (   locinput <= ST.maxpos
+                                   && UCHARAT(locinput) != ST.c1
+                                   && UCHARAT(locinput) != ST.c2)
+                            {
+                                locinput++;
+                            }
+                        }
+                        else {
+                            /* If c1 and c2 only differ by a single bit, we can
+                             * avoid a conditional each time through the loop,
+                             * at the expense of a little preliminary setup and
+                             * an extra mask each iteration.  By masking out
+                             * that bit, we match exactly two characters, c1
+                             * and c2, and so we don't have to test for both.
+                             * On both ASCII and EBCDIC platforms, most of the
+                             * ASCII-range and Latin1-range folded equivalents
+                             * differ only in a single bit, so this is actually
+                             * the most common case. (e.g. 'A' 0x41 vs 'a'
+                             * 0x61). */
+                            U8 c1_masked = ST.c1 &~ c1_c2_bits_differing;
+                            U8 c1_c2_mask = ~ c1_c2_bits_differing;
+                            while (   locinput <= ST.maxpos
+                                   && (UCHARAT(locinput) & c1_c2_mask)
+                                                                != c1_masked)
+                            {
+                                locinput++;
+                            }
+                        }
+                    }
 		    n = locinput - ST.oldloc;
 		}
 		if (locinput > ST.maxpos)
@@ -8221,7 +8478,7 @@ NULL
 
                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
 
-                PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
+                PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B,
                                     locinput); /* match B */
 	    }
 
@@ -8449,7 +8706,8 @@ NULL
             assert(!NEXTCHR_IS_EOS);
             if (utf8_target) {
                 locinput += PL_utf8skip[nextchr];
-                /* locinput is allowed to go 1 char off the end, but not 2+ */
+                /* locinput is allowed to go 1 char off the end (signifying
+                 * EOS), but not 2+ */
                 if (locinput > reginfo->strend)
                     sayNO;
             }
@@ -8477,16 +8735,17 @@ NULL
 	    DEBUG_STACK_r({
 	        regmatch_state *cur = st;
 	        regmatch_state *curyes = yes_state;
-	        int curd = depth;
+	        U32 i;
 	        regmatch_slab *slab = PL_regmatch_slab;
-                for (;curd > -1 && (depth-curd < 3);cur--,curd--) {
+                for (i = 0; i < 3 && i <= depth; cur--,i++) {
                     if (cur < SLAB_FIRST(slab)) {
                 	slab = slab->prev;
                 	cur = SLAB_LAST(slab);
                     }
-                    Perl_re_exec_indentf( aTHX_ "#%-3d %-10s %s\n",
+                    Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
                         depth,
-                        curd, PL_reg_name[cur->resume_state],
+                        i ? "    " : "push",
+                        depth - i, PL_reg_name[cur->resume_state],
                         (curyes == cur) ? "yes" : ""
                     );
                     if (curyes == cur)
@@ -8638,9 +8897,12 @@ NULL
 
     if (last_pushed_cv) {
 	dSP;
+        /* see "Some notes about MULTICALL" above */
 	POP_MULTICALL;
         PERL_UNUSED_VAR(SP);
     }
+    else
+        LEAVE_SCOPE(orig_savestack_ix);
 
     assert(!result ||  locinput - reginfo->strbeg >= 0);
     return result ?  locinput - reginfo->strbeg : -1;
@@ -8876,10 +9138,27 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
                 }
             }
             else {
-                while (scan < loceol &&
-                    (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
-                {
-                    scan++;
+                /* See comments in regmatch() CURLY_B_min_known_fail.  We avoid
+                 * a conditional each time through the loop if the characters
+                 * differ only in a single bit, as is the usual situation */
+                U8 c1_c2_bits_differing = c1 ^ c2;
+
+                if (isPOWER_OF_2(c1_c2_bits_differing)) {
+                    U8 c1_masked = c1 & ~ c1_c2_bits_differing;
+                    U8 c1_c2_mask = ~ c1_c2_bits_differing;
+
+                    while (   scan < loceol
+                           && (UCHARAT(scan) & c1_c2_mask) == c1_masked)
+                    {
+                        scan++;
+                    }
+                }
+                else {
+                    while (    scan < loceol
+                           && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
+                    {
+                        scan++;
+                    }
                 }
             }
 	}
@@ -8914,6 +9193,33 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
 	}
 	break;
 
+    case ASCII:
+        if (utf8_target && loceol - scan > max) {
+
+            /* We didn't adjust <loceol> at the beginning of this routine
+             * because is UTF-8, but it is actually ok to do so, since here, to
+             * match, 1 char == 1 byte. */
+            loceol = scan + max;
+        }
+
+        scan = find_next_non_ascii(scan, loceol, utf8_target);
+	break;
+
+    case NASCII:
+	if (utf8_target) {
+	    while (     hardcount < max
+                   &&   scan < loceol
+		   && ! isASCII_utf8_safe(scan, loceol))
+	    {
+		scan += UTF8SKIP(scan);
+		hardcount++;
+	    }
+	}
+        else {
+            scan = find_next_ascii(scan, loceol, utf8_target);
+	}
+	break;
+
     /* The argument (FLAGS) to all the POSIX node types is the class number */
 
     case NPOSIXL:
@@ -9408,7 +9714,10 @@ S_reghop3(U8 *s, SSize_t off, const U8* lim)
     if (off >= 0) {
 	while (off-- && s < lim) {
 	    /* XXX could check well-formedness here */
-	    s += UTF8SKIP(s);
+	    U8 *new_s = s + UTF8SKIP(s);
+            if (new_s > lim) /* lim may be in the middle of a long character */
+                return s;
+            s = new_s;
 	}
     }
     else {
@@ -9700,6 +10009,434 @@ S_to_byte_substr(pTHX_ regexp *prog)
     return TRUE;
 }
 
+#ifndef PERL_IN_XSUB_RE
+
+bool
+Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
+{
+    /* Temporary helper function for toke.c.  Verify that the code point 'cp'
+     * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
+     * the larger string bounded by 'strbeg' and 'strend'.
+     *
+     * 'cp' needs to be assigned (if not a future version of the Unicode
+     * Standard could make it something that combines with adjacent characters,
+     * so code using it would then break), and there has to be a GCB break
+     * before and after the character. */
+
+    GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
+    const U8 * prev_cp_start;
+
+    PERL_ARGS_ASSERT__IS_GRAPHEME;
+
+    /* Unassigned code points are forbidden */
+    if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
+                                    _invlist_search(PL_Assigned_invlist, cp))))
+    {
+        return FALSE;
+    }
+
+    cp_gcb_val = getGCB_VAL_CP(cp);
+
+    /* Find the GCB value of the previous code point in the input */
+    prev_cp_start = utf8_hop_back(s, -1, strbeg);
+    if (UNLIKELY(prev_cp_start == s)) {
+        prev_cp_gcb_val = GCB_EDGE;
+    }
+    else {
+        prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
+    }
+
+    /* And check that is a grapheme boundary */
+    if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
+                TRUE /* is UTF-8 encoded */ ))
+    {
+        return FALSE;
+    }
+
+    /* Similarly verify there is a break between the current character and the
+     * following one */
+    s += UTF8SKIP(s);
+    if (s >= strend) {
+        next_cp_gcb_val = GCB_EDGE;
+    }
+    else {
+        next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
+    }
+
+    return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
+}
+
+#endif
+
+bool
+Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
+{
+    /* Checks that every character in the sequence from 's' to 'send' is one of
+     * three scripts: Common, Inherited, and possibly one other.  Additionally
+     * all decimal digits must come from the same consecutive sequence of 10.
+     * 'utf8_target' is TRUE iff the sequence is encoded in UTF-8.
+     *
+     * Basically, it looks at each character in the sequence to see if the
+     * above conditions are met; if not it fails.  It uses an inversion map to
+     * find the enum corresponding to the script of each character.  But this
+     * is complicated by the fact that a few code points can be in any of
+     * several scripts.  The data has been constructed so that there are
+     * additional enum values (all negative) for these situations.  The
+     * absolute value of those is an index into another table which contains
+     * pointers to auxiliary tables for each such situation.  Each aux array
+     * lists all the scripts for the given situation.  There is another,
+     * parallel, table that gives the number of entries in each aux table.
+     * These are all defined in charclass_invlists.h */
+
+    /* XXX Here are the additional things UTS 39 says could be done:
+     * Mark Chinese strings as âmixed scriptâ if they contain both simplified
+     * (S) and traditional (T) Chinese characters, using the Unihan data in the
+     * Unicode Character Database [UCD].  The criterion can only be applied if
+     * the language of the string is known to be Chinese. So, for example, the
+     * string âåçã ãã®çµå©å¼ â is Japanese, and should not be marked as
+     * mixed script because of a mixture of S and T characters.  Testing for
+     * whether a character is S or T needs to be based not on whether the
+     * character has a S or T variant , but whether the character is an S or T
+     * variant. khw notes that the sample contains a Hiragana character, and it
+     * is unclear if absence of any foreign script marks the script as
+     * "Chinese"
+     *
+     * Forbid sequences of the same nonspacing mark
+     *
+     * Check to see that all the characters are in the sets of exemplar
+     * characters for at least one language in the Unicode Common Locale Data
+     * Repository [CLDR]. */
+
+
+    /* Things that match /\d/u */
+    SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
+    UV * decimals_array = invlist_array(decimals_invlist);
+
+    /* What code point is the digit '0' of the script run? */
+    UV zero_of_run = 0;
+    SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
+    SCX_enum script_of_char = SCX_INVALID;
+
+    /* If the script remains not fully determined from iteration to iteration,
+     * this is the current intersection of the possiblities.  */
+    SCX_enum * intersection = NULL;
+    PERL_UINT_FAST8_T intersection_len = 0;
+
+    bool retval = TRUE;
+
+    assert(send > s);
+
+    PERL_ARGS_ASSERT_ISSCRIPT_RUN;
+
+    /* Look at each character in the sequence */
+    while (s < send) {
+        UV cp;
+
+        /* The code allows all scripts to use the ASCII digits.  This is
+         * because they are used in commerce even in scripts that have their
+         * own set.  Hence any ASCII ones found are ok, unless a digit from
+         * another set has already been encountered.  (The other digit ranges
+         * in Common are not similarly blessed */
+        if (UNLIKELY(isDIGIT(*s))) {
+            if (zero_of_run > 0) {
+                if (zero_of_run != '0') {
+                    retval = FALSE;
+                    break;
+                }
+            }
+            else {
+                zero_of_run = '0';
+            }
+            s++;
+            continue;
+        }
+
+        /* Here, isn't an ASCII digit.  Find the code point of the character */
+        if (utf8_target && ! UTF8_IS_INVARIANT(*s)) {
+            Size_t len;
+            cp = valid_utf8_to_uvchr((U8 *) s, &len);
+            s += len;
+        }
+        else {
+            cp = *(s++);
+        }
+
+        /* If is within the range [+0 .. +9] of the script's zero, it also is a
+         * digit in that script.  We can skip the rest of this code for this
+         * character. */
+        if (UNLIKELY(   zero_of_run > 0
+                     && cp >= zero_of_run
+                     && cp - zero_of_run <= 9))
+        {
+            continue;
+        }
+
+        /* Find the character's script.  The correct values are hard-coded here
+         * for small-enough code points. */
+        if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
+                               unlikely to change */
+            if (       cp > 255
+                || (   isALPHA_L1(cp)
+                    && LIKELY(cp != MICRO_SIGN_NATIVE)))
+            {
+                script_of_char = SCX_Latin;
+            }
+            else {
+                script_of_char = SCX_Common;
+            }
+        }
+        else {
+            script_of_char = _Perl_SCX_invmap[
+                                       _invlist_search(PL_SCX_invlist, cp)];
+        }
+
+        /* We arbitrarily accept a single unassigned character, but not in
+         * combination with anything else, and not a run of them. */
+        if (   UNLIKELY(script_of_run == SCX_Unknown)
+            || UNLIKELY(   script_of_run != SCX_INVALID
+                        && script_of_char == SCX_Unknown))
+        {
+            retval = FALSE;
+            break;
+        }
+
+        if (UNLIKELY(script_of_char == SCX_Unknown)) {
+                script_of_run = SCX_Unknown;
+                continue;
+        }
+
+        /* We accept 'inherited' script characters currently even at the
+         * beginning.  (We know that no characters in Inherited are digits, or
+         * we'd have to check for that) */
+        if (UNLIKELY(script_of_char == SCX_Inherited)) {
+            continue;
+        }
+
+        /* If unknown, the run's script is set to the char's */
+        if (UNLIKELY(script_of_run == SCX_INVALID)) {
+            script_of_run = script_of_char;
+        }
+
+        /* All decimal digits must be from the same sequence of 10.  Above, we
+         * handled any ASCII digits without descending to here.  We also
+         * handled the case where we already knew what digit sequence is the
+         * one to use, and the character is in that sequence.  Now that we know
+         * the script, we can use script_zeros[] to directly find which
+         * sequence the script uses, except in a few cases it returns 0 */
+        if (UNLIKELY(zero_of_run == 0) && script_of_char >= 0) {
+            zero_of_run = script_zeros[script_of_char];
+        }
+
+        /* Now we can see if the script of the character is the same as that of
+         * the run */
+        if (LIKELY(script_of_char == script_of_run)) {
+            /* By far the most common case */
+            goto scripts_match;
+        }
+
+        /* Here, the scripts of the run and the current character don't match
+         * exactly.  The run could so far have been entirely characters from
+         * Common.  It's now time to change its script to that of this
+         * non-Common character */
+        if (script_of_run == SCX_Common) {
+
+            /* But Common contains several sets of digits.  Only the '0' set
+             * can be part of another script. */
+            if (zero_of_run > 0 && zero_of_run != '0') {
+                retval = FALSE;
+                break;
+            }
+
+            script_of_run = script_of_char;
+            goto scripts_match;
+        }
+
+        /* Here, the script of the run isn't Common.  But characters in Common
+         * match any script */
+        if (script_of_char == SCX_Common) {
+            goto scripts_match;
+        }
+
+#ifndef HAS_SCX_AUX_TABLES
+
+        /* Too early a Unicode version to have a code point belonging to more
+         * than one script, so, if the scripts don't exactly match, fail */
+        retval = FALSE;
+        break;
+
+#else
+
+        /* Here there is no exact match between the character's script and the
+         * run's.  And we've handled the special cases of scripts Unknown,
+         * Inherited, and Common.
+         *
+         * Negative script numbers signify that the value may be any of several
+         * scripts, and we need to look at auxiliary information to make our
+         * deterimination.  But if both are non-negative, we can fail now */
+        if (LIKELY(script_of_char >= 0)) {
+            const SCX_enum * search_in;
+            PERL_UINT_FAST8_T search_in_len;
+            PERL_UINT_FAST8_T i;
+
+            if (LIKELY(script_of_run >= 0)) {
+                retval = FALSE;
+                break;
+            }
+
+            /* Use the previously constructed set of possible scripts, if any.
+             * */
+            if (intersection) {
+                search_in = intersection;
+                search_in_len = intersection_len;
+            }
+            else {
+                search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
+                search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
+            }
+
+            for (i = 0; i < search_in_len; i++) {
+                if (search_in[i] == script_of_char) {
+                    script_of_run = script_of_char;
+                    goto scripts_match;
+                }
+            }
+
+            retval = FALSE;
+            break;
+        }
+        else if (LIKELY(script_of_run >= 0)) {
+            /* script of character could be one of several, but run is a single
+             * script */
+            const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
+            const PERL_UINT_FAST8_T search_in_len
+                                     = SCX_AUX_TABLE_lengths[-script_of_char];
+            PERL_UINT_FAST8_T i;
+
+            for (i = 0; i < search_in_len; i++) {
+                if (search_in[i] == script_of_run) {
+                    script_of_char = script_of_run;
+                    goto scripts_match;
+                }
+            }
+
+            retval = FALSE;
+            break;
+        }
+        else {
+            /* Both run and char could be in one of several scripts.  If the
+             * intersection is empty, then this character isn't in this script
+             * run.  Otherwise, we need to calculate the intersection to use
+             * for future iterations of the loop, unless we are already at the
+             * final character */
+            const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
+            const PERL_UINT_FAST8_T char_len
+                                      = SCX_AUX_TABLE_lengths[-script_of_char];
+            const SCX_enum * search_run;
+            PERL_UINT_FAST8_T run_len;
+
+            SCX_enum * new_overlap = NULL;
+            PERL_UINT_FAST8_T i, j;
+
+            if (intersection) {
+                search_run = intersection;
+                run_len = intersection_len;
+            }
+            else {
+                search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
+                run_len = SCX_AUX_TABLE_lengths[-script_of_run];
+            }
+
+            intersection_len = 0;
+
+            for (i = 0; i < run_len; i++) {
+                for (j = 0; j < char_len; j++) {
+                    if (search_run[i] == search_char[j]) {
+
+                        /* Here, the script at i,j matches.  That means this
+                         * character is in the run.  But continue on to find
+                         * the complete intersection, for the next loop
+                         * iteration, and for the digit check after it.
+                         *
+                         * On the first found common script, we malloc space
+                         * for the intersection list for the worst case of the
+                         * intersection, which is the minimum of the number of
+                         * scripts remaining in each set. */
+                        if (intersection_len == 0) {
+                            Newx(new_overlap,
+                                 MIN(run_len - i, char_len - j),
+                                 SCX_enum);
+                        }
+                        new_overlap[intersection_len++] = search_run[i];
+                    }
+                }
+            }
+
+            /* Here we've looked through everything.  If they have no scripts
+             * in common, not a run */
+            if (intersection_len == 0) {
+                retval = FALSE;
+                break;
+            }
+
+            /* If there is only a single script in common, set to that.
+             * Otherwise, use the intersection going forward */
+            Safefree(intersection);
+            if (intersection_len == 1) {
+                script_of_run = script_of_char = new_overlap[0];
+                Safefree(new_overlap);
+            }
+            else {
+                intersection = new_overlap;
+            }
+        }
+
+#endif
+
+  scripts_match:
+
+        /* Here, the script of the character is compatible with that of the
+         * run.  Either they match exactly, or one or both can be any of
+         * several scripts, and the intersection is not empty.  If the
+         * character is not a decimal digit, we are done with it.  Otherwise,
+         * it could still fail if it is from a different set of 10 than seen
+         * already (or we may not have seen any, and we need to set the
+         * sequence).  If we have determined a single script and that script
+         * only has one set of digits (almost all scripts are like that), then
+         * this isn't a problem, as any digit must come from the same sequence.
+         * The only scripts that have multiple sequences have been constructed
+         * to be 0 in 'script_zeros[]'.
+         *
+         * Here we check if it is a digit. */
+        if (    cp >= FIRST_NON_ASCII_DECIMAL_DIGIT
+            && (   (          zero_of_run == 0
+                    || (  (   script_of_char >= 0
+                           && script_zeros[script_of_char] == 0)
+                        ||    intersection))))
+        {
+            SSize_t range_zero_index;
+            range_zero_index = _invlist_search(decimals_invlist, cp);
+            if (   LIKELY(range_zero_index >= 0)
+                && ELEMENT_RANGE_MATCHES_INVLIST(range_zero_index))
+            {
+                UV range_zero = decimals_array[range_zero_index];
+                if (zero_of_run) {
+                    if (zero_of_run != range_zero) {
+                        retval = FALSE;
+                        break;
+                    }
+                }
+                else {
+                    zero_of_run = range_zero;
+                }
+            }
+        }
+    } /* end of looping through CLOSESR text */
+
+    Safefree(intersection);
+    return retval;
+}
+
+
 /*
  * ex: set ts=8 sts=4 sw=4 et:
  */