Allow trie use for /iaa matching

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index f5adebc..e22dcd8 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -123,7 +123,7 @@ typedef struct RExC_state_t {
      char       *start;                 /* Start of input for compile */
      char       *end;                   /* End of input for compile */
      char       *parse;                 /* Input-scan pointer. */
-    I32                whilem_seen;            /* number of WHILEM in this expr */
+    SSize_t    whilem_seen;            /* number of WHILEM in this expr */
      regnode    *emit_start;            /* Start of emitted-code area */
      regnode    *emit_bound;            /* First regnode outside of the allocated space */
      regnode    *emit;                  /* Code-emit pointer; if = &emit_dummy,
@@ -132,7 +132,7 @@ typedef struct RExC_state_t {
      I32                naughty;                /* How bad is this pattern? */
      I32                sawback;                /* Did we see \1, ...? */
      U32                seen;
-    I32                size;                   /* Code size. */
+    SSize_t    size;                   /* Code size. */
      I32                npar;                   /* Capture buffer count, (OPEN). */
      I32                cpar;                   /* Capture buffer count, (CLOSE). */
      I32                nestroot;               /* root parens we are in - used by accept */
@@ -301,7 +301,7 @@ typedef struct RExC_state_t {
    
    - max_offset
      Only used for floating strings. This is the rightmost point that
-    the string can appear at. If set to I32 max it indicates that the
+    the string can appear at. If set to SSize_t_MAX it indicates that the
      string can occur infinitely far to the right.
    
    - minlenp
@@ -345,28 +345,46 @@ typedef struct RExC_state_t {
  typedef struct scan_data_t {
      /*I32 len_min;      unused */
      /*I32 len_delta;    unused */
-    I32 pos_min;
-    I32 pos_delta;
+    SSize_t pos_min;
+    SSize_t pos_delta;
      SV *last_found;
-    I32 last_end;          /* min value, <0 unless valid. */
-    I32 last_start_min;
-    I32 last_start_max;
+    SSize_t last_end;      /* min value, <0 unless valid. */
+    SSize_t last_start_min;
+    SSize_t last_start_max;
      SV **longest;          /* Either &l_fixed, or &l_float. */
      SV *longest_fixed;      /* longest fixed string found in pattern */
-    I32 offset_fixed;       /* offset where it starts */
-    I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
+    SSize_t offset_fixed;   /* offset where it starts */
+    SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
      I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
      SV *longest_float;      /* longest floating string found in pattern */
-    I32 offset_float_min;   /* earliest point in string it can appear */
-    I32 offset_float_max;   /* latest point in string it can appear */
-    I32 *minlen_float;      /* pointer to the minlen relevant to the string */
-    I32 lookbehind_float;   /* is the position of the string modified by LB */
+    SSize_t offset_float_min; /* earliest point in string it can appear */
+    SSize_t offset_float_max; /* latest point in string it can appear */
+    SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
+    SSize_t lookbehind_float; /* is the pos of the string modified by LB */
      I32 flags;
      I32 whilem_c;
-    I32 *last_closep;
+    SSize_t *last_closep;
      struct regnode_charclass_class *start_class;
  } scan_data_t;
  
+/* The below is perhaps overboard, but this allows us to save a test at the
+ * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
+ * and 'a' differ by a single bit; the same with the upper and lower case of
+ * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
+ * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
+ * then inverts it to form a mask, with just a single 0, in the bit position
+ * where the upper- and lowercase differ.  XXX There are about 40 other
+ * instances in the Perl core where this micro-optimization could be used.
+ * Should decide if maintenance cost is worse, before changing those
+ *
+ * Returns a boolean as to whether or not 'v' is either a lowercase or
+ * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
+ * compile-time constant, the generated code is better than some optimizing
+ * compilers figure out, amounting to a mask and test.  The results are
+ * meaningless if 'c' is not one of [A-Za-z] */
+#define isARG2_lower_or_UPPER_ARG1(c, v) \
+                              (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
+
  /*
   * Forward declarations for pregcomp()'s friends.
   */
@@ -405,6 +423,7 @@ static const scan_data_t zero_scan_data =
  
  #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
  #define SCF_SEEN_ACCEPT         0x8000 
+#define SCF_TRIE_DOING_RESTUDY 0x10000
  
  #define UTF cBOOL(RExC_utf8)
  
@@ -573,9 +592,9 @@ static const scan_data_t zero_scan_data =
             (int)offset, RExC_precomp, RExC_precomp + offset);          \
  } STMT_END
  
-#define        ckWARN2regdep(loc,m, a1) STMT_START {                           \
+#define        ckWARN2reg_d(loc,m, a1) STMT_START {                            \
      const IV offset = loc - RExC_precomp;                              \
-    Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),    \
+    Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                      \
             m REPORT_LOCATION,                                          \
             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
  } STMT_END
@@ -729,7 +748,8 @@ DEBUG_OPTIMISE_MORE_r(if(data){                                      \
     floating substrings if needed. */
  
  STATIC void
-S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
+S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
+                    SSize_t *minlenp, int is_inf)
  {
      const STRLEN l = CHR_SVLEN(data->last_found);
      const STRLEN old_l = CHR_SVLEN(*data->longest);
@@ -753,9 +773,12 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min
             data->offset_float_min = l ? data->last_start_min : data->pos_min;
             data->offset_float_max = (l
                                       ? data->last_start_max
-                                     : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta));
-           if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
-               data->offset_float_max = I32_MAX;
+                                     : (data->pos_delta == SSize_t_MAX
+                                        ? SSize_t_MAX
+                                        : data->pos_min + data->pos_delta));
+           if (is_inf
+                || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
+               data->offset_float_max = SSize_t_MAX;
             if (data->flags & SF_BEFORE_EOL)
                 data->flags
                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
@@ -847,7 +870,7 @@ S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
  }
  
  /* These two functions currently do the exact same thing */
-#define cl_init_zero           S_cl_init
+#define cl_init_zero           cl_init
  
  /* 'AND' a given class with another one.  Can create false positives.  'cl'
   * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
@@ -1018,8 +1041,8 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
                      255, which means that the union with cl should just be
                      what cl has in it, so can ignore this flag
              ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
-                    is 127-255 to match them, but then invert that, so the
-                    union with cl should just be what cl has in it, so can
+                    is (ASCII) 127-255 to match them, but then invert that, so
+                    the union with cl should just be what cl has in it, so can
                      ignore this flag
           */
      } else {    /* 'or_with' is not inverted */
@@ -1420,7 +1443,7 @@ and would end up looking like:
     8: EXACT <baz>(10)
    10: END(0)
  
-    d = uvuni_to_utf8_flags(d, uv, 0);
+    d = uvchr_to_utf8_flags(d, uv, 0);
  
  is the recommended Unicode-aware way of saying
  
@@ -1432,7 +1455,7 @@ is the recommended Unicode-aware way of saying
         if (UTF) {                                                         \
              SV *zlopp = newSV(7); /* XXX: optimize me */                   \
             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
-            unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
+            unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
             SvPOK_on(zlopp);                                               \
             SvUTF8_on(zlopp);                                              \
@@ -1443,31 +1466,28 @@ is the recommended Unicode-aware way of saying
         }                                                                  \
          } STMT_END
  
-#define TRIE_READ_CHAR STMT_START {                                                     \
-    wordlen++;                                                                          \
-    if ( UTF ) {                                                                        \
-        /* if it is UTF then it is either already folded, or does not need folding */   \
-        uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
-    }                                                                                   \
-    else if (folder == PL_fold_latin1) {                                                \
-        /* if we use this folder we have to obey unicode rules on latin-1 data */       \
-        if ( foldlen > 0 ) {                                                            \
-           uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
-           foldlen -= len;                                                              \
-           scan += len;                                                                 \
-           len = 0;                                                                     \
-        } else {                                                                        \
-            len = 1;                                                                    \
-            uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL);       \
-            skiplen = UNISKIP(uvc);                                                     \
-            foldlen -= skiplen;                                                         \
-            scan = foldbuf + skiplen;                                                   \
-        }                                                                               \
-    } else {                                                                            \
-        /* raw data, will be folded later if needed */                                  \
-        uvc = (U32)*uc;                                                                 \
-        len = 1;                                                                        \
-    }                                                                                   \
+/* This gets the next character from the input, folding it if not already
+ * folded. */
+#define TRIE_READ_CHAR STMT_START {                                           \
+    wordlen++;                                                                \
+    if ( UTF ) {                                                              \
+        /* if it is UTF then it is either already folded, or does not need    \
+         * folding */                                                         \
+        uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);   \
+    }                                                                         \
+    else if (folder == PL_fold_latin1) {                                      \
+        /* This folder implies Unicode rules, which in the range expressible  \
+         *  by not UTF is the lower case, with the two exceptions, one of     \
+         *  which should have been taken care of before calling this */       \
+        assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
+        uvc = toLOWER_L1(*uc);                                                \
+        if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
+        len = 1;                                                              \
+    } else {                                                                  \
+        /* raw data, will be folded later if needed */                        \
+        uvc = (U32)*uc;                                                       \
+        len = 1;                                                              \
+    }                                                                         \
  } STMT_END
  
  
@@ -1587,7 +1607,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
          case EXACT: break;
         case EXACTFA:
          case EXACTFU_SS:
-        case EXACTFU_TRICKYFOLD:
         case EXACTFU: folder = PL_fold_latin1; break;
         case EXACTF:  folder = PL_fold; break;
         case EXACTFL: folder = PL_fold_locale; break;
@@ -1659,11 +1678,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
          const U8 *uc = (U8*)STRING( noper );
          const U8 *e  = uc + STR_LEN( noper );
          STRLEN foldlen = 0;
-        U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
-        STRLEN skiplen = 0;
-        const U8 *scan = (U8*)NULL;
          U32 wordlen      = 0;         /* required init */
-        STRLEN chars = 0;
+        STRLEN minbytes = 0;
+        STRLEN maxbytes = 0;
          bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
  
          if (OP(noper) == NOTHING) {
@@ -1684,13 +1701,61 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                                            regardless of encoding */
              if (OP( noper ) == EXACTFU_SS) {
                  /* false positives are ok, so just set this */
-                TRIE_BITMAP_SET(trie,0xDF);
+                TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
              }
          }
          for ( ; uc < e ; uc += len ) {
              TRIE_CHARCOUNT(trie)++;
              TRIE_READ_CHAR;
-            chars++;
+
+            /* Acummulate to the current values, the range in the number of
+             * bytes that this character could match.  The max is presumed to
+             * be the same as the folded input (which TRIE_READ_CHAR returns),
+             * except that when this is not in UTF-8, it could be matched
+             * against a string which is UTF-8, and the variant characters
+             * could be 2 bytes instead of the 1 here.  Likewise, for the
+             * minimum number of bytes when not folded.  When folding, the min
+             * is assumed to be 1 byte could fold to match the single character
+             * here, or in the case of a multi-char fold, 1 byte can fold to
+             * the whole sequence.  'foldlen' is used to denote whether we are
+             * in such a sequence, skipping the min setting if so.  XXX TODO
+             * Use the exact list of what folds to each character, from
+             * PL_utf8_foldclosures */
+            if (UTF) {
+                maxbytes += UTF8SKIP(uc);
+                if (! folder) {
+                    /* A non-UTF-8 string could be 1 byte to match our 2 */
+                    minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
+                                ? 1
+                                : UTF8SKIP(uc);
+                }
+                else {
+                    if (foldlen) {
+                        foldlen -= UTF8SKIP(uc);
+                    }
+                    else {
+                        foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
+                        minbytes++;
+                    }
+                }
+            }
+            else {
+                maxbytes += (UNI_IS_INVARIANT(*uc))
+                             ? 1
+                             : 2;
+                if (! folder) {
+                    minbytes++;
+                }
+                else {
+                    if (foldlen) {
+                        foldlen--;
+                    }
+                    else {
+                        foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
+                        minbytes++;
+                    }
+                }
+            }
              if ( uvc < 256 ) {
                  if ( folder ) {
                      U8 folded= folder[ (U8) uvc ];
@@ -1714,7 +1779,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                     if ( !UTF ) {
                         /* store first byte of utf8 representation of
                            variant codepoints */
-                       if (! UNI_IS_INVARIANT(uvc)) {
+                       if (! NATIVE_IS_INVARIANT(uvc)) {
                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
                         }
                     }
@@ -1737,25 +1802,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
              }
          }
          if( cur == first ) {
-            trie->minlen = chars;
-            trie->maxlen = chars;
-        } else if (chars < trie->minlen) {
-            trie->minlen = chars;
-        } else if (chars > trie->maxlen) {
-            trie->maxlen = chars;
-        }
-        if (OP( noper ) == EXACTFU_SS) {
-            /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
-           if (trie->minlen > 1)
-                trie->minlen= 1;
-        }
-       if (OP( noper ) == EXACTFU_TRICKYFOLD) {
-           /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}" 
-            *                - We assume that any such sequence might match a 2 byte string */
-            if (trie->minlen > 2 )
-                trie->minlen= 2;
+            trie->minlen = minbytes;
+            trie->maxlen = maxbytes;
+        } else if (minbytes < trie->minlen) {
+            trie->minlen = minbytes;
+        } else if (maxbytes > trie->maxlen) {
+            trie->maxlen = maxbytes;
          }
-
      } /* end first pass */
      DEBUG_TRIE_COMPILE_r(
          PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
@@ -1822,11 +1875,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
              const U8 *e      = uc + STR_LEN( noper );
             U32 state        = 1;         /* required init */
             U16 charid       = 0;         /* sanity init */
-           U8 *scan         = (U8*)NULL; /* sanity init */
-           STRLEN foldlen   = 0;         /* required init */
              U32 wordlen      = 0;         /* required init */
-           U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
-            STRLEN skiplen   = 0;
  
              if (OP(noper) == NOTHING) {
                  regnode *noper_next= regnext(noper);
@@ -2032,12 +2081,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
  
              U16 charid       = 0;         /* sanity init */
              U32 accept_state = 0;         /* sanity init */
-            U8 *scan         = (U8*)NULL; /* sanity init */
  
-            STRLEN foldlen   = 0;         /* required init */
              U32 wordlen      = 0;         /* required init */
-            STRLEN skiplen   = 0;
-            U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
  
              if (OP(noper) == NOTHING) {
                  regnode *noper_next= regnext(noper);
@@ -2657,8 +2702,8 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
   * that is "sss".
   *
   * It turns out that there are problems with all multi-character folds, and not
- * just these three.  Now the code is general, for all such cases, but the
- * three still have some special handling.  The approach taken is:
+ * just these three.  Now the code is general, for all such cases.  The
+ * approach taken is:
   * 1)   This routine examines each EXACTFish node that could contain multi-
   *      character fold sequences.  It returns in *min_subtract how much to
   *      subtract from the the actual length of the string to get a real minimum
@@ -2666,10 +2711,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
   *      used by the caller to adjust the min length of the match, and the delta
   *      between min and max, so that the optimizer doesn't reject these
   *      possibilities based on size constraints.
- * 2)   Certain of these sequences require special handling by the trie code,
- *      so, if found, this code changes the joined node type to special ops:
- *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
- * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
+ * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
   *      is used for an EXACTFU node that contains at least one "ss" sequence in
   *      it.  For non-UTF-8 patterns and strings, this is the only case where
   *      there is a possible fold length change.  That means that a regular
@@ -2688,7 +2730,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
   *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
   *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
   *      described in the next item.
- * 4)   A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
+ * 3)   A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
   *      pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
   *      UTF-8 pattern.)  An assumption that the optimizer part of regexec.c
   *      (probably unwittingly, in Perl_regexec_flags()) makes is that a
@@ -2719,7 +2761,14 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
   *      but in a non-UTF8 pattern, folding it to that above-Latin1 string would
   *      require the pattern to be forced into UTF-8, the overhead of which we
   *      want to avoid.)
- */
+ *
+ *      Similarly, the code that generates tries doesn't currently handle
+ *      not-already-folded multi-char folds, and it looks like a pain to change
+ *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
+ *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
+ *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
+ *      using /iaa matching will be doing so almost entirely with ASCII
+ *      strings, so this should rarely be encountered in practice */
  
  #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
      if (PL_regkind[OP(scan)] == EXACT) \
@@ -2840,39 +2889,17 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                  }
  
                  /* Nodes with 'ss' require special handling, except for EXACTFL
-                 * and EXACTFA for which there is no multi-char fold to this */
+                 * and EXACTFA-ish for which there is no multi-char fold to
+                 * this */
                  if (len == 2 && *s == 's' && *(s+1) == 's'
-                    && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
+                    && OP(scan) != EXACTFL
+                    && OP(scan) != EXACTFA
+                    && OP(scan) != EXACTFA_NO_TRIE)
                  {
                      count = 2;
                      OP(scan) = EXACTFU_SS;
                      s += 2;
                  }
-                else if (len == 6   /* len is the same in both ASCII and EBCDIC
-                                       for these */
-                         && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
-                                      COMBINING_DIAERESIS_UTF8
-                                      COMBINING_ACUTE_ACCENT_UTF8,
-                                   6)
-                             || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
-                                         COMBINING_DIAERESIS_UTF8
-                                         COMBINING_ACUTE_ACCENT_UTF8,
-                                     6)))
-                {
-                    count = 3;
-
-                    /* These two folds require special handling by trie's, so
-                     * change the node type to indicate this.  If EXACTFA and
-                     * EXACTFL were ever to be handled by trie's, this would
-                     * have to be changed.  If this node has already been
-                     * changed to EXACTFU_SS in this loop, leave it as is.  (I
-                     * (khw) think it doesn't matter in regexec.c for UTF
-                     * patterns, but no need to change it */
-                    if (OP(scan) == EXACTFU) {
-                        OP(scan) = EXACTFU_TRICKYFOLD;
-                    }
-                    s += 6;
-                }
                  else { /* Here is a generic multi-char fold. */
                      const U8* multi_end  = s + len;
  
@@ -2885,7 +2912,10 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                       * test for them.  The code that generates the
                       * is_MULTI_foo() macros croaks should one actually get put
                       * into Unicode .) */
-                    if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
+                    if (OP(scan) != EXACTFL
+                        && OP(scan) != EXACTFA
+                        && OP(scan) != EXACTFA_NO_TRIE)
+                    {
                          count = utf8_length(s, multi_end);
                          s = multi_end;
                      }
@@ -2914,9 +2944,12 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
              /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
               * fold to the ASCII range (and there are no existing ones in the
               * upper latin1 range).  But, as outlined in the comments preceding
-             * this function, we need to flag any occurrences of the sharp s */
+             * this function, we need to flag any occurrences of the sharp s.
+             * This character forbids trie formation (because of added
+             * complexity) */
             while (s < s_end) {
                  if (*s == LATIN_SMALL_LETTER_SHARP_S) {
+                    OP(scan) = EXACTFA_NO_TRIE;
                      *has_exactf_sharp_s = TRUE;
                      break;
                  }
@@ -2937,16 +2970,6 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
               * have to find at least two characters for a multi-fold */
             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
  
-            /* The below is perhaps overboard, but this allows us to save a
-             * test each time through the loop at the expense of a mask.  This
-             * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
-             * by a single bit.  On ASCII they are 32 apart; on EBCDIC, they
-             * are 64.  This uses an exclusive 'or' to find that bit and then
-             * inverts it to form a mask, with just a single 0, in the bit
-             * position where 'S' and 's' differ. */
-            const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
-            const U8 s_masked = 's' & S_or_s_mask;
-
             while (s < upper) {
                  int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
                  if (! len) {    /* Not a multi-char fold. */
@@ -2959,8 +2982,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                  }
  
                  if (len == 2
-                    && ((*s & S_or_s_mask) == s_masked)
-                    && ((*(s+1) & S_or_s_mask) == s_masked))
+                    && isARG2_lower_or_UPPER_ARG1('s', *s)
+                    && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
                  {
  
                      /* EXACTF nodes need to know that the minimum length
@@ -3019,9 +3042,9 @@ typedef struct scan_frame {
  
  #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
  
-STATIC I32
+STATIC SSize_t
  S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
-                        I32 *minlenp, I32 *deltap,
+                        SSize_t *minlenp, SSize_t *deltap,
                         regnode *last,
                         scan_data_t *data,
                         I32 stopparen,
@@ -3037,17 +3060,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
  {
      dVAR;
-    I32 min = 0;    /* There must be at least this number of characters to match */
+    /* There must be at least this number of characters to match */
+    SSize_t min = 0;
      I32 pars = 0, code;
      regnode *scan = *scanp, *next;
-    I32 delta = 0;
+    SSize_t delta = 0;
      int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
      int is_inf_internal = 0;           /* The studied chunk is infinite */
      I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
      scan_data_t data_fake;
      SV *re_trie_maxbuff = NULL;
      regnode *first_non_open = scan;
-    I32 stopmin = I32_MAX;
+    SSize_t stopmin = SSize_t_MAX;
      scan_frame *frame = NULL;
      GET_RE_DEBUG_FLAGS_DECL;
  
@@ -3115,7 +3139,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 /* NOTE - There is similar code to this block below for handling
                    TRIE nodes on a re-study.  If you change stuff here check there
                    too. */
-               I32 max1 = 0, min1 = I32_MAX, num = 0;
+               SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
                 struct regnode_charclass_class accum;
                 regnode * const startbranch=scan;
  
@@ -3125,7 +3149,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     cl_init_zero(pRExC_state, &accum);
  
                 while (OP(scan) == code) {
-                   I32 deltanext, minnext, f = 0, fake;
+                   SSize_t deltanext, minnext, fake;
+                   I32 f = 0;
                     struct regnode_charclass_class this_class;
  
                     num++;
@@ -3156,9 +3181,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                           stopparen, recursed, NULL, f,depth+1);
                     if (min1 > minnext)
                         min1 = minnext;
-                   if (deltanext == I32_MAX) {
+                   if (deltanext == SSize_t_MAX) {
                         is_inf = is_inf_internal = 1;
-                       max1 = I32_MAX;
+                       max1 = SSize_t_MAX;
                     } else if (max1 < minnext + deltanext)
                         max1 = minnext + deltanext;
                     scan = next;
@@ -3183,16 +3208,17 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     min1 = 0;
                 if (flags & SCF_DO_SUBSTR) {
                     data->pos_min += min1;
-                   if (data->pos_delta >= I32_MAX - (max1 - min1))
-                       data->pos_delta = I32_MAX;
+                   if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
+                       data->pos_delta = SSize_t_MAX;
                     else
                         data->pos_delta += max1 - min1;
                     if (max1 != min1 || is_inf)
                         data->longest = &(data->longest_float);
                 }
                 min += min1;
-               if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0)
-                   delta = I32_MAX;
+               if (delta == SSize_t_MAX
+                || SSize_t_MAX - delta - (max1 - min1) < 0)
+                   delta = SSize_t_MAX;
                 else
                     delta += max1 - min1;
                 if (flags & SCF_DO_STCLASS_OR) {
@@ -3343,14 +3369,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                  EXACT           | EXACT
                                  EXACTFU         | EXACTFU
                                  EXACTFU_SS      | EXACTFU
-                                EXACTFU_TRICKYFOLD | EXACTFU
-                                EXACTFA         | 0
+                                EXACTFA         | EXACTFA
  
  
                          */
  #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
                         ( EXACT == (X) )   ? EXACT :        \
-                       ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
+                       ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
+                       ( EXACTFA == (X) ) ? EXACTFA :        \
                         0 )
  
                          /* dont use tail as the end marker for this traverse */
@@ -3580,7 +3606,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
             }
         }
         else if (OP(scan) == EXACT) {
-           I32 l = STR_LEN(scan);
+           SSize_t l = STR_LEN(scan);
             UV uc;
             if (UTF) {
                 const U8 * const s = (U8*)STRING(scan);
@@ -3596,7 +3622,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 if (data->last_end == -1) { /* Update the start info. */
                     data->last_start_min = data->pos_min;
                     data->last_start_max = is_inf
-                       ? I32_MAX : data->pos_min + data->pos_delta;
+                       ? SSize_t_MAX : data->pos_min + data->pos_delta;
                 }
                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
                 if (UTF)
@@ -3669,7 +3695,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
             flags &= ~SCF_DO_STCLASS;
         }
         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
-           I32 l = STR_LEN(scan);
+           SSize_t l = STR_LEN(scan);
             UV uc = *((U8*)STRING(scan));
  
             /* Search for fixed substrings supports EXACT only. */
@@ -3730,7 +3756,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                          /* All other (EXACTFL handled above) folds except under
                           * /iaa that include s, S, and sharp_s also may include
                           * the others */
-                       if (OP(scan) != EXACTFA) {
+                       if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE)
+                        {
                             if (uc == 's' || uc == 'S') {
                                 ANYOF_BITMAP_SET(data->start_class,
                                                  LATIN_SMALL_LETTER_SHARP_S);
@@ -3767,7 +3794,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
  
                             /* All folds except under /iaa that include s, S,
                              * and sharp_s also may include the others */
-                           if (OP(scan) != EXACTFA) {
+                           if (OP(scan) != EXACTFA
+                                && OP(scan) != EXACTFA_NO_TRIE)
+                            {
                                 if (uc == 's' || uc == 'S') {
                                     ANYOF_BITMAP_SET(data->start_class,
                                                    LATIN_SMALL_LETTER_SHARP_S);
@@ -3786,8 +3815,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
             flags &= ~SCF_DO_STCLASS;
         }
         else if (REGNODE_VARIES(OP(scan))) {
-           I32 mincount, maxcount, minnext, deltanext, fl = 0;
-           I32 f = flags, pos_before = 0;
+           SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
+           I32 fl = 0, f = flags;
             regnode * const oscan = scan;
             struct regnode_charclass_class this_class;
             struct regnode_charclass_class *oclass = NULL;
@@ -3909,8 +3938,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 }
                 if (!scan)              /* It was not CURLYX, but CURLY. */
                     scan = next;
-               if ( /* ? quantifier ok, except for (?{ ... }) */
-                   (next_is_eval || !(mincount == 0 && maxcount == 1))
+               if (!(flags & SCF_TRIE_DOING_RESTUDY)
+                   /* ? quantifier ok, except for (?{ ... }) */
+                   && (next_is_eval || !(mincount == 0 && maxcount == 1))
                     && (minnext == 0) && (deltanext == 0)
                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
@@ -3923,11 +3953,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 }
  
                 min += minnext * mincount;
-               is_inf_internal |= deltanext == I32_MAX
+               is_inf_internal |= deltanext == SSize_t_MAX
                                      || (maxcount == REG_INFTY && minnext + deltanext > 0);
                 is_inf |= is_inf_internal;
                 if (is_inf)
-                   delta = I32_MAX;
+                   delta = SSize_t_MAX;
                 else
                     delta += (minnext + deltanext) * maxcount - minnext * mincount;
  
@@ -4057,10 +4087,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
  
                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
  #if defined(SPARC64_GCC_WORKAROUND)
-                       I32 b = 0;
+                       SSize_t b = 0;
                         STRLEN l = 0;
                         const char *s = NULL;
-                       I32 old = 0;
+                       SSize_t old = 0;
  
                         if (pos_before >= data->last_start_min)
                             b = pos_before;
@@ -4072,11 +4102,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         old = b - data->last_start_min;
  
  #else
-                       I32 b = pos_before >= data->last_start_min
+                       SSize_t b = pos_before >= data->last_start_min
                             ? pos_before : data->last_start_min;
                         STRLEN l;
                         const char * const s = SvPV_const(data->last_found, l);
-                       I32 old = b - data->last_start_min;
+                       SSize_t old = b - data->last_start_min;
  #endif
  
                         if (UTF)
@@ -4108,20 +4138,26 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         } else {
                             /* start offset must point into the last copy */
                             data->last_start_min += minnext * (mincount - 1);
-                           data->last_start_max += is_inf ? I32_MAX
+                           data->last_start_max += is_inf ? SSize_t_MAX
                                 : (maxcount - 1) * (minnext + data->pos_delta);
                         }
                     }
                     /* It is counted once already... */
                     data->pos_min += minnext * (mincount - counted);
  #if 0
-PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n",
-    counted, deltanext, I32_MAX, minnext, maxcount, mincount);
-if (deltanext != I32_MAX)
-PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta);
+PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
+                              " SSize_t_MAX=%"UVdf" minnext=%"UVdf
+                              " maxcount=%"UVdf" mincount=%"UVdf"\n",
+    (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
+    (UV)mincount);
+if (deltanext != SSize_t_MAX)
+PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
+    (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
+          - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
  #endif
-                   if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta)
-                       data->pos_delta = I32_MAX;
+                   if (deltanext == SSize_t_MAX ||
+                       -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
+                       data->pos_delta = SSize_t_MAX;
                     else
                         data->pos_delta += - counted * deltanext +
                         (minnext + deltanext) * maxcount - minnext * mincount;
@@ -4141,7 +4177,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                             data->last_start_min =
                                 data->pos_min - CHR_SVLEN(last_str);
                             data->last_start_max = is_inf
-                               ? I32_MAX
+                               ? SSize_t_MAX
                                 : data->pos_min + data->pos_delta
                                 - CHR_SVLEN(last_str);
                         }
@@ -4253,8 +4289,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
                              for (value = 0; value < loop_max; value++) {
-                                if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
-                                    ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
+                                if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+                                    ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
                                  }
                              }
                         }
@@ -4269,8 +4305,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                          * in case it isn't a true locale-node.  This will
                          * create false positives if it truly is locale */
                          for (value = 0; value < loop_max; value++) {
-                            if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
-                                ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
+                            if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+                                ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
                              }
                          }
                          }
@@ -4287,8 +4323,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
                              for (value = 0; value < loop_max; value++) {
-                                if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
-                                    ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
+                                if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+                                    ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
                                  }
                              }
                         }
@@ -4303,8 +4339,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                          * case it isn't a true locale-node.  This will create
                          * false positives if it truly is locale */
                          for (value = 0; value < loop_max; value++) {
-                            if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
-                                ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
+                            if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+                                ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
                              }
                          }
                          if (PL_regkind[OP(scan)] == NPOSIXD) {
@@ -4363,7 +4399,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                     In this case we can't do fixed string optimisation.
                  */
  
-                I32 deltanext, minnext, fake = 0;
+                SSize_t deltanext, minnext, fake = 0;
                  regnode *nscan;
                  struct regnode_charclass_class intrnl;
                  int f = 0;
@@ -4432,7 +4468,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                     length of the pattern, something we won't know about
                     until after the recurse.
                  */
-                I32 deltanext, fake = 0;
+                SSize_t deltanext;
+                I32 fake = 0;
                  regnode *nscan;
                  struct regnode_charclass_class intrnl;
                  int f = 0;
@@ -4442,8 +4479,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                      have to worry about freeing them when we know
                      they wont be used, which would be a pain.
                   */
-                I32 *minnextp;
-                Newx( minnextp, 1, I32 );
+                SSize_t *minnextp;
+                Newx( minnextp, 1, SSize_t );
                  SAVEFREEPV(minnextp);
  
                  if (data) {
@@ -4572,7 +4609,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
             {
                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
                     RExC_rx->extflags |= RXf_ANCH_GPOS;
-               if (RExC_rx->gofs < (U32)min)
+               if (RExC_rx->gofs < (STRLEN)min)
                     RExC_rx->gofs = min;
              } else {
                  RExC_rx->extflags |= RXf_GPOS_FLOAT;
@@ -4588,7 +4625,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
              regnode *trie_node= scan;
              regnode *tail= regnext(scan);
              reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
-            I32 max1 = 0, min1 = I32_MAX;
+            SSize_t max1 = 0, min1 = SSize_t_MAX;
              struct regnode_charclass_class accum;
  
              if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
@@ -4605,7 +4642,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                  
                  for ( word=1 ; word <= trie->wordcount ; word++) 
                  {
-                    I32 deltanext=0, minnext=0, f = 0, fake;
+                    SSize_t deltanext=0, minnext=0, f = 0, fake;
                      struct regnode_charclass_class this_class;
                      
                      data_fake.flags = 0;
@@ -4639,12 +4676,12 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
                          nextbranch= regnext((regnode*)nextbranch);
                      
-                    if (min1 > (I32)(minnext + trie->minlen))
+                    if (min1 > (SSize_t)(minnext + trie->minlen))
                          min1 = minnext + trie->minlen;
-                    if (deltanext == I32_MAX) {
+                    if (deltanext == SSize_t_MAX) {
                          is_inf = is_inf_internal = 1;
-                        max1 = I32_MAX;
-                    } else if (max1 < (I32)(minnext + deltanext + trie->maxlen))
+                        max1 = SSize_t_MAX;
+                    } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
                          max1 = minnext + deltanext + trie->maxlen;
                      
                      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
@@ -4738,9 +4775,9 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
      DEBUG_STUDYDATA("pre-fin:",data,depth);
  
      *scanp = scan;
-    *deltap = is_inf_internal ? I32_MAX : delta;
+    *deltap = is_inf_internal ? SSize_t_MAX : delta;
      if (flags & SCF_DO_SUBSTR && is_inf)
-       data->pos_delta = I32_MAX - data->pos_min;
+       data->pos_delta = SSize_t_MAX - data->pos_min;
      if (is_par > (I32)U8_MAX)
         is_par = 0;
      if (is_par && pars==1 && data) {
@@ -4926,12 +4963,11 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
      Newx(dst, *plen_p * 2 + 1, U8);
  
      while (s < *plen_p) {
-        const UV uv = NATIVE_TO_ASCII(src[s]);
-        if (UNI_IS_INVARIANT(uv))
-            dst[d]   = (U8)UTF_TO_NATIVE(uv);
+        if (NATIVE_IS_INVARIANT(src[s]))
+            dst[d]   = src[s];
          else {
-            dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
-            dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
+            dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
+            dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
          }
          if (n < num_code_blocks) {
              if (!do_end && pRExC_state->code_blocks[n].start == s) {
@@ -5408,13 +5444,15 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
  
  
  STATIC bool
-S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
+S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
+                     SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
  {
      /* This is the common code for setting up the floating and fixed length
       * string data extracted from Perl_re_op_compile() below.  Returns a boolean
       * as to whether succeeded or not */
  
-    I32 t,ml;
+    I32 t;
+    SSize_t ml;
  
      if (! (longest_length
             || (eol /* Can't have SEOL and MULTI */
@@ -5439,7 +5477,7 @@ S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, S
          follow this item. We calculate it ahead of time as once the
          lookbehind offset is added in we lose the ability to correctly
          calculate it.*/
-    ml = minlen ? *(minlen) : (I32)longest_length;
+    ml = minlen ? *(minlen) : (SSize_t)longest_length;
      *rx_end_shift = ml - offset
          - longest_length + (SvTAIL(sv_longest) != 0)
          + lookbehind;
@@ -5508,7 +5546,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
      char *exp;
      regnode *scan;
      I32 flags;
-    I32 minlen = 0;
+    SSize_t minlen = 0;
      U32 rx_flags;
      SV *pat;
      SV *code_blocksv = NULL;
@@ -5519,6 +5557,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
      I32 sawlookahead = 0;
      I32 sawplus = 0;
      I32 sawopen = 0;
+    I32 sawminmod = 0;
+
      regex_charset initial_charset = get_regex_charset(orig_rx_flags);
      bool recompile = 0;
      bool runtime_code = 0;
@@ -6027,7 +6067,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
      }
  
  reStudy:
-    r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
+    r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
      Zero(r->substrs, 1, struct reg_substr_data);
  
  #ifdef TRIE_STUDY_OPT
@@ -6063,11 +6103,11 @@ reStudy:
      /* testing for BRANCH here tells us whether there is "must appear"
         data in the pattern. If there is then we can use it for optimisations */
      if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
-       I32 fake;
+       SSize_t fake;
         STRLEN longest_float_length, longest_fixed_length;
         struct regnode_charclass_class ch_class; /* pointed to by data */
         int stclass_flag;
-       I32 last_close = 0; /* pointed to by data */
+       SSize_t last_close = 0; /* pointed to by data */
          regnode *first= scan;
          regnode *first_next= regnext(first);
         /*
@@ -6096,12 +6136,15 @@ reStudy:
                  * the only op that could be a regnode is PLUS, all the rest
                  * will be regnode_1 or regnode_2.
                  *
+                 * (yves doesn't think this is true)
                  */
                 if (OP(first) == PLUS)
                     sawplus = 1;
-               else
+                else {
+                    if (OP(first) == MINMOD)
+                        sawminmod = 1;
                     first += regarglen[OP(first)];
-
+                }
                 first = NEXTOPER(first);
                 first_next= regnext(first);
         }
@@ -6172,7 +6215,7 @@ reStudy:
             first = NEXTOPER(first);
             goto again;
         }
-       if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
+        if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
             /* x+ must match at the 1st pos of run of x's */
             r->intflags |= PREGf_SKIP;
@@ -6223,7 +6266,9 @@ reStudy:
          
         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
              &data, -1, NULL, NULL,
-            SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
+            SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
+                          | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
+            0);
  
  
          CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
@@ -6233,7 +6278,7 @@ reStudy:
              && data.last_start_min == 0 && data.last_end > 0
              && !RExC_seen_zerolen
              && !(RExC_seen & REG_SEEN_VERBARG)
-            && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
+            && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
             r->extflags |= RXf_CHECK_ALL;
         scan_commit(pRExC_state, &data,&minlen,0);
  
@@ -6256,7 +6301,7 @@ reStudy:
          {
             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
             r->float_max_offset = data.offset_float_max;
-           if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
+           if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
                 r->float_max_offset -= data.lookbehind_float;
             SvREFCNT_inc_simple_void_NN(data.longest_float);
         }
@@ -6330,9 +6375,7 @@ reStudy:
             r->check_offset_min = r->float_min_offset;
             r->check_offset_max = r->float_max_offset;
         }
-       /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
-          This should be changed ASAP!  */
-       if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
+       if ((r->check_substr || r->check_utf8) ) {
             r->extflags |= RXf_USE_INTUIT;
             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
                 r->extflags |= RXf_INTUIT_TAIL;
@@ -6346,9 +6389,9 @@ reStudy:
      }
      else {
         /* Several toplevels. Best we can is to set minlen. */
-       I32 fake;
+       SSize_t fake;
         struct regnode_charclass_class ch_class;
-       I32 last_close = 0;
+       SSize_t last_close = 0;
  
         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
  
@@ -6359,7 +6402,10 @@ reStudy:
  
          
         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
-           &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
+           &data, -1, NULL, NULL,
+            SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
+                              |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
+            0);
          
          CHECK_RESTUDY_GOTO_butfirst(NOOP);
  
@@ -6458,8 +6504,8 @@ reStudy:
      });
  #ifdef RE_TRACK_PATTERN_OFFSETS
      DEBUG_OFFSETS_r(if (ri->u.offsets) {
-        const U32 len = ri->u.offsets[0];
-        U32 i;
+        const STRLEN len = ri->u.offsets[0];
+        STRLEN i;
          GET_RE_DEBUG_FLAGS_DECL;
          PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
          for (i = 1; i <= len; i++) {
@@ -6647,7 +6693,7 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
  {
      SV *ret;
      AV *av;
-    I32 length;
+    SSize_t length;
      struct regexp *const rx = ReANY(r);
  
      PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
@@ -6710,19 +6756,29 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
  {
      struct regexp *const rx = ReANY(r);
      char *s = NULL;
-    I32 i = 0;
-    I32 s1, t1;
+    SSize_t i = 0;
+    SSize_t s1, t1;
      I32 n = paren;
  
      PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
          
-    if ( (    n == RX_BUFF_IDX_CARET_PREMATCH
+    if (      n == RX_BUFF_IDX_CARET_PREMATCH
             || n == RX_BUFF_IDX_CARET_FULLMATCH
             || n == RX_BUFF_IDX_CARET_POSTMATCH
-         )
-         && !(rx->extflags & RXf_PMf_KEEPCOPY)
-    )
-        goto ret_undef;
+       )
+    {
+        bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
+        if (!keepcopy) {
+            /* on something like
+             *    $r = qr/.../;
+             *    /$qr/p;
+             * the KEEPCOPY is set on the PMOP rather than the regex */
+            if (PL_curpm && r == PM_GETRE(PL_curpm))
+                 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
+        }
+        if (!keepcopy)
+            goto ret_undef;
+    }
  
      if (!rx->subbeg)
          goto ret_undef;
@@ -6759,7 +6815,7 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
      }          
  
      assert(s >= rx->subbeg);
-    assert(rx->sublen >= (s - rx->subbeg) + i );
+    assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
      if (i >= 0) {
  #if NO_TAINT_SUPPORT
          sv_setpvn(sv, s, i);
@@ -6828,13 +6884,27 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
  
      PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
  
+    if (   paren == RX_BUFF_IDX_CARET_PREMATCH
+        || paren == RX_BUFF_IDX_CARET_FULLMATCH
+        || paren == RX_BUFF_IDX_CARET_POSTMATCH
+    )
+    {
+        bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
+        if (!keepcopy) {
+            /* on something like
+             *    $r = qr/.../;
+             *    /$qr/p;
+             * the KEEPCOPY is set on the PMOP rather than the regex */
+            if (PL_curpm && r == PM_GETRE(PL_curpm))
+                 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
+        }
+        if (!keepcopy)
+            goto warn_undef;
+    }
+
      /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
      switch (paren) {
        case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
-         if (!(rx->extflags & RXf_PMf_KEEPCOPY))
-            goto warn_undef;
-        /*FALLTHROUGH*/
-
        case RX_BUFF_IDX_PREMATCH:       /* $` */
          if (rx->offs[0].start != -1) {
                         i = rx->offs[0].start;
@@ -6847,8 +6917,6 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
          return 0;
  
        case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
-         if (!(rx->extflags & RXf_PMf_KEEPCOPY))
-            goto warn_undef;
        case RX_BUFF_IDX_POSTMATCH:       /* $' */
             if (rx->offs[0].end != -1) {
                         i = rx->sublen - rx->offs[0].end;
@@ -6860,13 +6928,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
             }
          return 0;
  
-      case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
-         if (!(rx->extflags & RXf_PMf_KEEPCOPY))
-            goto warn_undef;
-        /*FALLTHROUGH*/
-
-      /* $& / ${^MATCH}, $1, $2, ... */
-      default:
+      default: /* $& / ${^MATCH}, $1, $2, ... */
             if (paren <= (I32)rx->nparens &&
              (s1 = rx->offs[paren].start) != -1 &&
              (t1 = rx->offs[paren].end) != -1)
@@ -7014,11 +7076,12 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
  /* This section of code defines the inversion list object and its methods.  The
   * interfaces are highly subject to change, so as much as possible is static to
   * this file.  An inversion list is here implemented as a malloc'd C UV array
- * with some added info that is placed as UVs at the beginning in a header
- * portion.  An inversion list for Unicode is an array of code points, sorted
- * by ordinal number.  The zeroth element is the first code point in the list.
- * The 1th element is the first element beyond that not in the list.  In other
- * words, the first range is
+ * as an SVt_INVLIST scalar.
+ *
+ * An inversion list for Unicode is an array of code points, sorted by ordinal
+ * number.  The zeroth element is the first code point in the list.  The 1th
+ * element is the first element beyond that not in the list.  In other words,
+ * the first range is
   *  invlist[0]..(invlist[1]-1)
   * The other ranges follow.  Thus every element whose index is divisible by two
   * marks the beginning of a range that is in the list, and every element not
@@ -7036,9 +7099,9 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
   * Taking the complement (inverting) an inversion list is quite simple, if the
   * first element is 0, remove it; otherwise add a 0 element at the beginning.
   * This implementation reserves an element at the beginning of each inversion
- * list to contain 0 when the list contains 0, and contains 1 otherwise.  The
- * actual beginning of the list is either that element if 0, or the next one if
- * 1.
+ * list to always contain 0; there is an additional flag in the header which
+ * indicates if the list begins at the 0, or is offset to begin at the next
+ * element.
   *
   * More about inversion lists can be found in "Unicode Demystified"
   * Chapter 13 by Richard Gillam, published by Addison-Wesley.
@@ -7053,32 +7116,31 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
   * should eventually be made public */
  
  /* The header definitions are in F<inline_invlist.c> */
-#define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV))
-#define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - HEADER_LENGTH)
-
-#define INVLIST_INITIAL_LEN 10
  
  PERL_STATIC_INLINE UV*
  S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
  {
      /* Returns a pointer to the first element in the inversion list's array.
       * This is called upon initialization of an inversion list.  Where the
-     * array begins depends on whether the list has the code point U+0000
-     * in it or not.  The other parameter tells it whether the code that
-     * follows this call is about to put a 0 in the inversion list or not.
-     * The first element is either the element with 0, if 0, or the next one,
-     * if 1 */
+     * array begins depends on whether the list has the code point U+0000 in it
+     * or not.  The other parameter tells it whether the code that follows this
+     * call is about to put a 0 in the inversion list or not.  The first
+     * element is either the element reserved for 0, if TRUE, or the element
+     * after it, if FALSE */
  
-    UV* zero = get_invlist_zero_addr(invlist);
+    bool* offset = get_invlist_offset_addr(invlist);
+    UV* zero_addr = (UV *) SvPVX(invlist);
  
      PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
  
      /* Must be empty */
-    assert(! *_get_invlist_len_addr(invlist));
+    assert(! _invlist_len(invlist));
+
+    *zero_addr = 0;
  
      /* 1^1 = 0; 1^0 = 1 */
-    *zero = 1 ^ will_have_0;
-    return zero + *zero;
+    *offset = 1 ^ will_have_0;
+    return zero_addr + *offset;
  }
  
  PERL_STATIC_INLINE UV*
@@ -7092,53 +7154,44 @@ S_invlist_array(pTHX_ SV* const invlist)
  
      /* Must not be empty.  If these fail, you probably didn't check for <len>
       * being non-zero before trying to get the array */
-    assert(*_get_invlist_len_addr(invlist));
-    assert(*get_invlist_zero_addr(invlist) == 0
-          || *get_invlist_zero_addr(invlist) == 1);
-
-    /* The array begins either at the element reserved for zero if the
-     * list contains 0 (that element will be set to 0), or otherwise the next
-     * element (in which case the reserved element will be set to 1). */
-    return (UV *) (get_invlist_zero_addr(invlist)
-                  + *get_invlist_zero_addr(invlist));
+    assert(_invlist_len(invlist));
+
+    /* The very first element always contains zero, The array begins either
+     * there, or if the inversion list is offset, at the element after it.
+     * The offset header field determines which; it contains 0 or 1 to indicate
+     * how much additionally to add */
+    assert(0 == *(SvPVX(invlist)));
+    return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
  }
  
  PERL_STATIC_INLINE void
-S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
+S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
  {
-    /* Sets the current number of elements stored in the inversion list */
+    /* Sets the current number of elements stored in the inversion list.
+     * Updates SvCUR correspondingly */
  
      PERL_ARGS_ASSERT_INVLIST_SET_LEN;
  
-    *_get_invlist_len_addr(invlist) = len;
-
-    assert(len <= SvLEN(invlist));
-
-    SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
-    /* If the list contains U+0000, that element is part of the header,
-     * and should not be counted as part of the array.  It will contain
-     * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
-     * subtract:
-     * SvCUR_set(invlist,
-     *           TO_INTERNAL_SIZE(len
-     *                            - (*get_invlist_zero_addr(inv_list) ^ 1)));
-     * But, this is only valid if len is not 0.  The consequences of not doing
-     * this is that the memory allocation code may think that 1 more UV is
-     * being used than actually is, and so might do an unnecessary grow.  That
-     * seems worth not bothering to make this the precise amount.
-     *
-     * Note that when inverting, SvCUR shouldn't change */
+    assert(SvTYPE(invlist) == SVt_INVLIST);
+
+    SvCUR_set(invlist,
+              (len == 0)
+               ? 0
+               : TO_INTERNAL_SIZE(len + offset));
+    assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
  }
  
  PERL_STATIC_INLINE IV*
  S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
  {
-    /* Return the address of the UV that is reserved to hold the cached index
+    /* Return the address of the IV that is reserved to hold the cached index
       * */
  
      PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
  
-    return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
+    assert(SvTYPE(invlist) == SVt_INVLIST);
+
+    return &(((XINVLIST*) SvANY(invlist))->prev_index);
  }
  
  PERL_STATIC_INLINE IV
@@ -7171,22 +7224,13 @@ S_invlist_max(pTHX_ SV* const invlist)
  
      PERL_ARGS_ASSERT_INVLIST_MAX;
  
-    return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
-           ? _invlist_len(invlist)
-           : FROM_INTERNAL_SIZE(SvLEN(invlist));
-}
-
-PERL_STATIC_INLINE UV*
-S_get_invlist_zero_addr(pTHX_ SV* invlist)
-{
-    /* Return the address of the UV that is reserved to hold 0 if the inversion
-     * list contains 0.  This has to be the last element of the heading, as the
-     * list proper starts with either it if 0, or the next element if not.
-     * (But we force it to contain either 0 or 1) */
+    assert(SvTYPE(invlist) == SVt_INVLIST);
  
-    PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
-
-    return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
+    /* Assumes worst case, in which the 0 element is not counted in the
+     * inversion list, so subtracts 1 for that */
+    return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
+           ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
+           : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
  }
  
  #ifndef PERL_IN_XSUB_RE
@@ -7201,56 +7245,75 @@ Perl__new_invlist(pTHX_ IV initial_size)
      SV* new_list;
  
      if (initial_size < 0) {
-       initial_size = INVLIST_INITIAL_LEN;
+       initial_size = 10;
      }
  
      /* Allocate the initial space */
-    new_list = newSV(TO_INTERNAL_SIZE(initial_size));
-    invlist_set_len(new_list, 0);
+    new_list = newSV_type(SVt_INVLIST);
  
-    /* Force iterinit() to be used to get iteration to work */
-    *get_invlist_iter_addr(new_list) = UV_MAX;
+    /* First 1 is in case the zero element isn't in the list; second 1 is for
+     * trailing NUL */
+    SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
+    invlist_set_len(new_list, 0, 0);
  
-    /* This should force a segfault if a method doesn't initialize this
-     * properly */
-    *get_invlist_zero_addr(new_list) = UV_MAX;
+    /* Force iterinit() to be used to get iteration to work */
+    *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
  
      *get_invlist_previous_index_addr(new_list) = 0;
-    *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
-#if HEADER_LENGTH != 5
-#   error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
-#endif
  
      return new_list;
  }
  #endif
  
  STATIC SV*
-S__new_invlist_C_array(pTHX_ UV* list)
+S__new_invlist_C_array(pTHX_ const UV* const list)
  {
      /* Return a pointer to a newly constructed inversion list, initialized to
       * point to <list>, which has to be in the exact correct inversion list
       * form, including internal fields.  Thus this is a dangerous routine that
-     * should not be used in the wrong hands */
+     * should not be used in the wrong hands.  The passed in 'list' contains
+     * several header fields at the beginning that are not part of the
+     * inversion list body proper */
+
+    const STRLEN length = (STRLEN) list[0];
+    const UV version_id =          list[1];
+    const bool offset   =    cBOOL(list[2]);
+#define HEADER_LENGTH 3
+    /* If any of the above changes in any way, you must change HEADER_LENGTH
+     * (if appropriate) and regenerate INVLIST_VERSION_ID by running
+     *      perl -E 'say int(rand 2**31-1)'
+     */
+#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
+                                        data structure type, so that one being
+                                        passed in can be validated to be an
+                                        inversion list of the correct vintage.
+                                       */
  
-    SV* invlist = newSV_type(SVt_PV);
+    SV* invlist = newSV_type(SVt_INVLIST);
  
      PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
  
-    SvPV_set(invlist, (char *) list);
+    if (version_id != INVLIST_VERSION_ID) {
+        Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
+    }
+
+    /* The generated array passed in includes header elements that aren't part
+     * of the list proper, so start it just after them */
+    SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
+
      SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
                                shouldn't touch it */
-    SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
  
-    if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
-        Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
-    }
+    *(get_invlist_offset_addr(invlist)) = offset;
+
+    /* The 'length' passed to us is the physical number of elements in the
+     * inversion list.  But if there is an offset the logical number is one
+     * less than that */
+    invlist_set_len(invlist, length  - offset, offset);
  
-    /* Initialize the iteration pointer.
-     * XXX This could be done at compile time in charclass_invlists.h, but I
-     * (khw) am not confident that the suffixes for specifying the C constant
-     * UV_MAX are portable, e.g.  'ull' on a 32 bit machine that is configured
-     * to use 64 bits; might need a Configure probe */
+    invlist_set_previous_index(invlist, 0);
+
+    /* Initialize the iteration pointer. */
      invlist_iterfinish(invlist);
  
      return invlist;
@@ -7263,7 +7326,11 @@ S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
  
      PERL_ARGS_ASSERT_INVLIST_EXTEND;
  
-    SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
+    assert(SvTYPE(invlist) == SVt_INVLIST);
+
+    /* Add one to account for the zero element at the beginning which may not
+     * be counted by the calling parameters */
+    SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
  }
  
  PERL_STATIC_INLINE void
@@ -7271,9 +7338,10 @@ S_invlist_trim(pTHX_ SV* const invlist)
  {
      PERL_ARGS_ASSERT_INVLIST_TRIM;
  
+    assert(SvTYPE(invlist) == SVt_INVLIST);
+
      /* Change the length of the inversion list to how many entries it currently
       * has */
-
      SvPV_shrink_to_cur((SV *) invlist);
  }
  
@@ -7289,11 +7357,13 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end
      UV* array;
      UV max = invlist_max(invlist);
      UV len = _invlist_len(invlist);
+    bool offset;
  
      PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
  
      if (len == 0) { /* Empty lists must be initialized */
-        array = _invlist_array_init(invlist, start == 0);
+        offset = start != 0;
+        array = _invlist_array_init(invlist, ! offset);
      }
      else {
         /* Here, the existing list is non-empty. The current max entry in the
@@ -7316,6 +7386,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end
          * value not in the set, it is extending the set, so the new first
          * value not in the set is one greater than the newly extended range.
          * */
+        offset = *get_invlist_offset_addr(invlist);
         if (array[final_element] == start) {
             if (end != UV_MAX) {
                 array[final_element] = end + 1;
@@ -7323,7 +7394,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end
             else {
                 /* But if the end is the maximum representable on the machine,
                  * just let the range that this would extend to have no end */
-               invlist_set_len(invlist, len - 1);
+               invlist_set_len(invlist, len - 1, offset);
             }
             return;
         }
@@ -7333,16 +7404,18 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end
  
      len += 2;  /* Includes an element each for the start and end of range */
  
-    /* If overflows the existing space, extend, which may cause the array to be
-     * moved */
+    /* If wll overflow the existing space, extend, which may cause the array to
+     * be moved */
      if (max < len) {
         invlist_extend(invlist, len);
-       invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
-                                          failure in invlist_array() */
+
+        /* Have to set len here to avoid assert failure in invlist_array() */
+        invlist_set_len(invlist, len, offset);
+
         array = invlist_array(invlist);
      }
      else {
-       invlist_set_len(invlist, len);
+       invlist_set_len(invlist, len, offset);
      }
  
      /* The next item on the list starts the range, the one after that is
@@ -7354,7 +7427,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end
      else {
         /* But if the end is the maximum representable on the machine, just let
          * the range have no end */
-       invlist_set_len(invlist, len - 1);
+       invlist_set_len(invlist, len - 1, offset);
      }
  }
  
@@ -7542,7 +7615,7 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV
  }
  
  void
-Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
+Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
  {
      /* Take the union of two inversion lists and point <output> to it.  *output
       * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
@@ -7564,8 +7637,8 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co
       * return the larger of the input lists, but then outside code might need
       * to keep track of whether to free the input list or not */
  
-    UV* array_a;    /* a's array */
-    UV* array_b;
+    const UV* array_a;    /* a's array */
+    const UV* array_b;
      UV len_a;      /* length of a's array */
      UV len_b;
  
@@ -7633,23 +7706,17 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co
      if (complement_b) {
  
         /* To complement, we invert: if the first element is 0, remove it.  To
-        * do this, we just pretend the array starts one later, and clear the
-        * flag as we don't have to do anything else later */
+        * do this, we just pretend the array starts one later */
          if (array_b[0] == 0) {
              array_b++;
              len_b--;
-            complement_b = FALSE;
          }
          else {
  
-            /* But if the first element is not zero, we unshift a 0 before the
-             * array.  The data structure reserves a space for that 0 (which
-             * should be a '1' right now), so physical shifting is unneeded,
-             * but temporarily change that element to 0.  Before exiting the
-             * routine, we must restore the element to '1' */
+            /* But if the first element is not zero, we pretend the list starts
+             * at the 0 that is always stored immediately before the array. */
              array_b--;
              len_b++;
-            array_b[0] = 0;
          }
      }
  
@@ -7745,7 +7812,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co
      /* Set result to final length, which can change the pointer to array_u, so
       * re-find it */
      if (len_u != _invlist_len(u)) {
-       invlist_set_len(u, len_u);
+       invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
         invlist_trim(u);
         array_u = invlist_array(u);
      }
@@ -7766,11 +7833,6 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co
         }
      }
  
-    /* If we've changed b, restore it */
-    if (complement_b) {
-        array_b[0] = 1;
-    }
-
      /*  We may be removing a reference to one of the inputs */
      if (a == *output || b == *output) {
          assert(! invlist_is_iterating(*output));
@@ -7782,7 +7844,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co
  }
  
  void
-Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
+Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
  {
      /* Take the intersection of two inversion lists and point <i> to it.  *i
       * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
@@ -7799,8 +7861,8 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
       * union above
       */
  
-    UV* array_a;               /* a's array */
-    UV* array_b;
+    const UV* array_a;         /* a's array */
+    const UV* array_b;
      UV len_a;  /* length of a's array */
      UV len_b;
  
@@ -7824,7 +7886,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
      assert(a != b);
  
      /* Special case if either one is empty */
-    len_a = _invlist_len(a);
+    len_a = (a == NULL) ? 0 : _invlist_len(a);
      if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
  
          if (len_a != 0 && complement_b) {
@@ -7834,11 +7896,11 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
               * must be every possible code point.  Thus the intersection is
               * simply 'a'. */
              if (*i != a) {
-                *i = invlist_clone(a);
-
                  if (*i == b) {
                      SvREFCNT_dec_NN(b);
                  }
+
+                *i = invlist_clone(a);
              }
              /* else *i is already 'a' */
              return;
@@ -7865,23 +7927,17 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
      if (complement_b) {
  
         /* To complement, we invert: if the first element is 0, remove it.  To
-        * do this, we just pretend the array starts one later, and clear the
-        * flag as we don't have to do anything else later */
+        * do this, we just pretend the array starts one later */
          if (array_b[0] == 0) {
              array_b++;
              len_b--;
-            complement_b = FALSE;
          }
          else {
  
-            /* But if the first element is not zero, we unshift a 0 before the
-             * array.  The data structure reserves a space for that 0 (which
-             * should be a '1' right now), so physical shifting is unneeded,
-             * but temporarily change that element to 0.  Before exiting the
-             * routine, we must restore the element to '1' */
+            /* But if the first element is not zero, we pretend the list starts
+             * at the 0 that is always stored immediately before the array. */
              array_b--;
              len_b++;
-            array_b[0] = 0;
          }
      }
  
@@ -7972,7 +8028,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
      /* Set result to final length, which can change the pointer to array_r, so
       * re-find it */
      if (len_r != _invlist_len(r)) {
-       invlist_set_len(r, len_r);
+       invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
         invlist_trim(r);
         array_r = invlist_array(r);
      }
@@ -7988,11 +8044,6 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
         }
      }
  
-    /* If we've changed b, restore it */
-    if (complement_b) {
-        array_b[0] = 1;
-    }
-
      /*  We may be removing a reference to one of the inputs */
      if (a == *i || b == *i) {
          assert(! invlist_is_iterating(*i));
@@ -8061,27 +8112,17 @@ Perl__invlist_invert(pTHX_ SV* const invlist)
       * have a zero; removes it otherwise.  As described above, the data
       * structure is set up so that this is very efficient */
  
-    UV* len_pos = _get_invlist_len_addr(invlist);
-
      PERL_ARGS_ASSERT__INVLIST_INVERT;
  
      assert(! invlist_is_iterating(invlist));
  
      /* The inverse of matching nothing is matching everything */
-    if (*len_pos == 0) {
+    if (_invlist_len(invlist) == 0) {
         _append_range_to_invlist(invlist, 0, UV_MAX);
         return;
      }
  
-    /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
-     * zero element was a 0, so it is being removed, so the length decrements
-     * by 1; and vice-versa.  SvCUR is unaffected */
-    if (*get_invlist_zero_addr(invlist) ^= 1) {
-       (*len_pos)--;
-    }
-    else {
-       (*len_pos)++;
-    }
+    *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
  }
  
  void
@@ -8111,11 +8152,11 @@ Perl__invlist_invert_prop(pTHX_ SV* const invlist)
                 invlist_extend(invlist, len);
                 array = invlist_array(invlist);
             }
-           invlist_set_len(invlist, len);
+           invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist));
             array[len - 1] = PERL_UNICODE_MAX + 1;
         }
         else {  /* Remove the 0x110000 */
-           invlist_set_len(invlist, len - 1);
+           invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist));
         }
      }
  
@@ -8133,17 +8174,19 @@ S_invlist_clone(pTHX_ SV* const invlist)
      /* Need to allocate extra space to accommodate Perl's addition of a
       * trailing NUL to SvPV's, since it thinks they are always strings */
      SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
-    STRLEN length = SvCUR(invlist);
+    STRLEN physical_length = SvCUR(invlist);
+    bool offset = *(get_invlist_offset_addr(invlist));
  
      PERL_ARGS_ASSERT_INVLIST_CLONE;
  
-    SvCUR_set(new_invlist, length); /* This isn't done automatically */
-    Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
+    *(get_invlist_offset_addr(new_invlist)) = offset;
+    invlist_set_len(new_invlist, _invlist_len(invlist), offset);
+    Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
  
      return new_invlist;
  }
  
-PERL_STATIC_INLINE UV*
+PERL_STATIC_INLINE STRLEN*
  S_get_invlist_iter_addr(pTHX_ SV* invlist)
  {
      /* Return the address of the UV that contains the current iteration
@@ -8151,17 +8194,9 @@ S_get_invlist_iter_addr(pTHX_ SV* invlist)
  
      PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
  
-    return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
-}
+    assert(SvTYPE(invlist) == SVt_INVLIST);
  
-PERL_STATIC_INLINE UV*
-S_get_invlist_version_id_addr(pTHX_ SV* invlist)
-{
-    /* Return the address of the UV that contains the version id. */
-
-    PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
-
-    return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
+    return &(((XINVLIST*) SvANY(invlist))->iterator);
  }
  
  PERL_STATIC_INLINE void
@@ -8185,7 +8220,7 @@ S_invlist_iterfinish(pTHX_ SV* invlist)
  
      PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
  
-    *get_invlist_iter_addr(invlist) = UV_MAX;
+    *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
  }
  
  STATIC bool
@@ -8198,14 +8233,14 @@ S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
       * <*start> and <*end> are unchanged, and the next call to this function
       * will start over at the beginning of the list */
  
-    UV* pos = get_invlist_iter_addr(invlist);
+    STRLEN* pos = get_invlist_iter_addr(invlist);
      UV len = _invlist_len(invlist);
      UV *array;
  
      PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
  
      if (*pos >= len) {
-       *pos = UV_MAX;  /* Force iterinit() to be required next time */
+       *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
         return FALSE;
      }
  
@@ -8228,7 +8263,7 @@ S_invlist_is_iterating(pTHX_ SV* const invlist)
  {
      PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
  
-    return *(get_invlist_iter_addr(invlist)) < UV_MAX;
+    return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
  }
  
  PERL_STATIC_INLINE UV
@@ -8294,51 +8329,67 @@ Perl__invlist_contents(pTHX_ SV* const invlist)
  }
  #endif
  
-#ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
+#ifndef PERL_IN_XSUB_RE
  void
-Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
+Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
  {
-    /* Dumps out the ranges in an inversion list.  The string 'header'
-     * if present is output on a line before the first range */
+    /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
+     * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
+     * the string 'indent'.  The output looks like this:
+         [0] 0x000A .. 0x000D
+         [2] 0x0085
+         [4] 0x2028 .. 0x2029
+         [6] 0x3104 .. INFINITY
+     * This means that the first range of code points matched by the list are
+     * 0xA through 0xD; the second range contains only the single code point
+     * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
+     * are used to define each range (except if the final range extends to
+     * infinity, only a single element is needed).  The array index of the
+     * first element for the corresponding range is given in brackets. */
  
      UV start, end;
+    STRLEN count = 0;
  
      PERL_ARGS_ASSERT__INVLIST_DUMP;
  
-    if (header && strlen(header)) {
-       PerlIO_printf(Perl_debug_log, "%s\n", header);
-    }
      if (invlist_is_iterating(invlist)) {
-        PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
+        Perl_dump_indent(aTHX_ level, file,
+             "%sCan't dump inversion list because is in middle of iterating\n",
+             indent);
          return;
      }
  
      invlist_iterinit(invlist);
      while (invlist_iternext(invlist, &start, &end)) {
         if (end == UV_MAX) {
-           PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
+           Perl_dump_indent(aTHX_ level, file,
+                                       "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
+                                   indent, (UV)count, start);
         }
         else if (end != start) {
-           PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
-                                                start,         end);
+           Perl_dump_indent(aTHX_ level, file,
+                                    "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
+                               indent, (UV)count, start,         end);
         }
         else {
-           PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
+           Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
+                                            indent, (UV)count, start);
         }
+        count += 2;
      }
  }
  #endif
  
-#if 0
+#ifdef PERL_ARGS_ASSERT__INVLISTEQ
  bool
-S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
+S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
  {
      /* Return a boolean as to if the two passed in inversion lists are
       * identical.  The final argument, if TRUE, says to take the complement of
       * the second inversion list before doing the comparison */
  
-    UV* array_a = invlist_array(a);
-    UV* array_b = invlist_array(b);
+    const UV* array_a = invlist_array(a);
+    const UV* array_b = invlist_array(b);
      UV len_a = _invlist_len(a);
      UV len_b = _invlist_len(b);
  
@@ -8360,23 +8411,17 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
  
              /* Otherwise, to complement, we invert.  Here, the first element is
               * 0, just remove it.  To do this, we just pretend the array starts
-             * one later, and clear the flag as we don't have to do anything
-             * else later */
+             * one later */
  
              array_b++;
              len_b--;
-            complement_b = FALSE;
          }
          else {
  
-            /* But if the first element is not zero, we unshift a 0 before the
-             * array.  The data structure reserves a space for that 0 (which
-             * should be a '1' right now), so physical shifting is unneeded,
-             * but temporarily change that element to 0.  Before exiting the
-             * routine, we must restore the element to '1' */
+            /* But if the first element is not zero, we pretend the list starts
+             * at the 0 that is always stored immediately before the array. */
              array_b--;
              len_b++;
-            array_b[0] = 0;
          }
      }
  
@@ -8393,22 +8438,14 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
          }
      }
  
-    if (complement_b) {
-        array_b[0] = 1;
-    }
      return retval;
  }
  #endif
  
  #undef HEADER_LENGTH
-#undef INVLIST_INITIAL_LENGTH
  #undef TO_INTERNAL_SIZE
  #undef FROM_INTERNAL_SIZE
-#undef INVLIST_LEN_OFFSET
-#undef INVLIST_ZERO_OFFSET
-#undef INVLIST_ITER_OFFSET
  #undef INVLIST_VERSION_ID
-#undef INVLIST_PREVIOUS_INDEX_OFFSET
  
  /* End of inversion list object */
  
@@ -9684,16 +9721,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  ret = reg_node(pRExC_state, OPFAIL);
                  return ret;
              }
-            else if (max == 0) {    /* replace {0} with a nothing node */
-                if (SIZE_ONLY) {
-                    RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING];
-                }
-                else {
-                    RExC_emit = orig_emit;
-                }
-                ret = reg_node(pRExC_state, NOTHING);
-                return ret;
-            }
  
         do_curly:
             if ((flags&SIMPLE)) {
@@ -9805,9 +9832,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
         reginsert(pRExC_state, MINMOD, ret, depth+1);
          REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
      }
-#ifndef REG_ALLOW_MINMOD_SUSPEND
      else
-#endif
      if (RExC_parse < RExC_end && *RExC_parse == '+') {
          regnode *ender;
          nextchar(pRExC_state);
@@ -9817,7 +9842,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
          ret->flags = 0;
          ender = reg_node(pRExC_state, TAIL);
          REGTAIL(pRExC_state, ret, ender);
-        /*ret= ender;*/
      }
  
      if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
@@ -10172,7 +10196,7 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32
      if (! len_passed_in) {
          if (UTF) {
              if (FOLD && (! LOC || code_point > 255)) {
-                _to_uni_fold_flags(NATIVE_TO_UNI(code_point),
+                _to_uni_fold_flags(code_point,
                                     character,
                                     &len,
                                     FOLD_FLAGS_FULL | ((LOC)
@@ -10689,7 +10713,7 @@ tryagain:
                      if (num < 1)
                          vFAIL("Reference to nonexistent or unclosed group");
                  }
-               if (!isg && num > 9 && num >= RExC_npar)
+                if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9')
                      /* Probably a character specified in octal, e.g. \35 */
                     goto defchar;
                 else {
@@ -10756,7 +10780,7 @@ tryagain:
  
         defchar: {
             STRLEN len = 0;
-           UV ender;
+           UV ender = 0;
             char *p;
             char *s;
  #define MAX_NODE_STRING_SIZE 127
@@ -10764,17 +10788,22 @@ tryagain:
             char *s0;
             U8 upper_parse = MAX_NODE_STRING_SIZE;
             STRLEN foldlen;
-            U8 node_type;
+            U8 node_type = compute_EXACTish(pRExC_state);
              bool next_is_quantifier;
              char * oldp = NULL;
  
+            /* We can convert EXACTF nodes to EXACTFU if they contain only
+             * characters that match identically regardless of the target
+             * string's UTF8ness.  The reason to do this is that EXACTF is not
+             * trie-able, EXACTFU is.  (We don't need to figure this out until
+             * pass 2) */
+            bool maybe_exactfu = node_type == EXACTF && PASS2;
+
              /* If a folding node contains only code points that don't
               * participate in folds, it can be changed into an EXACT node,
               * which allows the optimizer more things to look for */
              bool maybe_exact;
  
-           ender = 0;
-            node_type = compute_EXACTish(pRExC_state);
             ret = reg_node(pRExC_state, node_type);
  
              /* In pass1, folded, we use a temporary buffer instead of the
@@ -10787,8 +10816,8 @@ tryagain:
  
              /* We do the EXACTFish to EXACT node only if folding, and not if in
               * locale, as whether a character folds or not isn't known until
-             * runtime */
-            maybe_exact = FOLD && ! LOC;
+             * runtime.  (And we don't need to figure this out until pass 2) */
+            maybe_exact = FOLD && ! LOC && PASS2;
  
             /* XXX The node can hold up to 255 bytes, yet this only goes to
               * 127.  I (khw) do not know why.  Keeping it somewhat less than
@@ -10901,7 +10930,7 @@ tryagain:
                         p++;
                         break;
                     case 'a':
-                         ender = ASCII_TO_NATIVE('\007');
+                         ender = '\a';
                         p++;
                         break;
                     case 'o':
@@ -10966,10 +10995,30 @@ tryagain:
                         p++;
                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
                         break;
-                   case '0': case '1': case '2': case '3':case '4':
+                    case '8': case '9': /* must be a backreference */
+                        --p;
+                        goto loopdone;
+                    case '1': case '2': case '3':case '4':
                     case '5': case '6': case '7':
-                       if (*p == '0' ||
-                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
+                        /* When we parse backslash escapes there is ambiguity
+                         * between backreferences and octal escapes. Any escape
+                         * from \1 - \9 is a backreference, any multi-digit
+                         * escape which does not start with 0 and which when
+                         * evaluated as decimal could refer to an already
+                         * parsed capture buffer is a backslash. Anything else
+                         * is octal.
+                         *
+                         * Note this implies that \118 could be interpreted as
+                         * 118 OR as "\11" . "8" depending on whether there
+                         * were 118 capture buffers defined already in the
+                         * pattern.  */
+                        if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar )
+                        {  /* Not to be treated as an octal constant, go
+                                   find backref */
+                            --p;
+                            goto loopdone;
+                        }
+                    case '0':
                         {
                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
                             STRLEN numlen = 3;
@@ -10988,11 +11037,6 @@ tryagain:
                                           form_short_octal_warning(p, numlen));
                              }
                         }
-                        else {  /* Not to be treated as an octal constant, go
-                                   find backref */
-                           --p;
-                           goto loopdone;
-                       }
                         if (PL_encoding && ender < 0x100)
                             goto recode_encoding;
                         break;
@@ -11082,15 +11126,29 @@ tryagain:
                          REGC((char)ender, s++);
                      }
                  }
-                else /* FOLD */
-                     if (! ( UTF
+                else /* FOLD */ if (! ( UTF
                          /* See comments for join_exact() as to why we fold this
                           * non-UTF at compile time */
                          || (node_type == EXACTFU
                              && ender == LATIN_SMALL_LETTER_SHARP_S)))
                  {
+                    if (IS_IN_SOME_FOLD_L1(ender)) {
+                        maybe_exact = FALSE;
+
+                        /* See if the character's fold differs between /d and
+                         * /u.  This includes the multi-char fold SHARP S to
+                         * 'ss' */
+                        if (maybe_exactfu
+                            && (PL_fold[ender] != PL_fold_latin1[ender]
+                                || ender == LATIN_SMALL_LETTER_SHARP_S
+                                || (len > 0
+                                   && isARG2_lower_or_UPPER_ARG1('s', ender)
+                                   && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
+                        {
+                            maybe_exactfu = FALSE;
+                        }
+                    }
                      *(s++) = (char) ender;
-                    maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
                  }
                  else {  /* UTF */
  
@@ -11100,7 +11158,7 @@ tryagain:
                       * utf8.  If we start to fold non-UTF patterns, be sure to
                       * update join_exact() */
                      if (LOC && ender < 256) {
-                        if (UNI_IS_INVARIANT(ender)) {
+                        if (NATIVE_IS_INVARIANT(ender)) {
                              *s = (U8) ender;
                              foldlen = 1;
                          } else {
@@ -11241,8 +11299,8 @@ tryagain:
                              /* No Latin1 characters participate in multi-char
                               * folds under /l */
                              if (LOC
-                                || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
-                                                                *s, *(s+1))))
+                                || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
+                                                                  *s, *(s+1))))
                              {
                                  break;
                              }
@@ -11279,6 +11337,15 @@ tryagain:
                   * do any better */
                  if (len == 0) {
                      len = full_len;
+
+                    /* If the node ends in an 's' we make sure it stays EXACTF,
+                     * as if it turns into an EXACTFU, it could later get
+                     * joined with another 's' that would then wrongly match
+                     * the sharp s */
+                    if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
+                    {
+                        maybe_exactfu = FALSE;
+                    }
                  } else {
  
                      /* Here, the node does contain some characters that aren't
@@ -11331,19 +11398,26 @@ tryagain:
         loopdone:   /* Jumped to when encounters something that shouldn't be in
                        the node */
  
-            /* If 'maybe_exact' is still set here, means there are no
-             * code points in the node that participate in folds */
-            if (FOLD && maybe_exact) {
-                OP(ret) = EXACT;
-            }
-
              /* I (khw) don't know if you can get here with zero length, but the
               * old code handled this situation by creating a zero-length EXACT
               * node.  Might as well be NOTHING instead */
              if (len == 0) {
                  OP(ret) = NOTHING;
              }
-            else{
+            else {
+                if (FOLD) {
+                    /* If 'maybe_exact' is still set here, means there are no
+                     * code points in the node that participate in folds;
+                     * similarly for 'maybe_exactfu' and code points that match
+                     * differently depending on UTF8ness of the target string
+                     * */
+                    if (maybe_exact) {
+                        OP(ret) = EXACT;
+                    }
+                    else if (maybe_exactfu) {
+                        OP(ret) = EXACTFU;
+                    }
+                }
                  alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
              }
  
@@ -11664,6 +11738,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f
       * these things, we need to realize that something preceded by a backslash
       * is escaped, so we have to keep track of backslashes */
      if (SIZE_ONLY) {
+        UV depth = 0; /* how many nested (?[...]) constructs */
  
          Perl_ck_warner_d(aTHX_
              packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
@@ -11675,6 +11750,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f
              RExC_parse = regpatws(pRExC_state, RExC_parse,
                                  TRUE); /* means recognize comments */
              switch (*RExC_parse) {
+                case '?':
+                    if (RExC_parse[1] == '[') depth++, RExC_parse++;
+                    /* FALL THROUGH */
                  default:
                      break;
                  case '\\':
@@ -11722,6 +11800,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f
                  }
  
                  case ']':
+                    if (depth--) break;
                      RExC_parse++;
                      if (RExC_parse < RExC_end
                          && *RExC_parse == ')')
@@ -11949,6 +12028,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f
                      || IS_OPERAND(lparen)
                      || SvUV(lparen) != '(')
                  {
+                    SvREFCNT_dec(current);
                      RExC_parse++;
                      vFAIL("Unexpected ')'");
                  }
@@ -11971,6 +12051,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f
                      char current_operator;
  
                      if (IS_OPERAND(top)) {
+                        SvREFCNT_dec_NN(top);
+                        SvREFCNT_dec_NN(current);
                          vFAIL("Operand with no preceding operator");
                      }
                      current_operator = (char) SvUV(top);
@@ -12212,7 +12294,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
          case we need to change the emitted regop to an EXACT. */
      const char * orig_parse = RExC_parse;
-    const I32 orig_size = RExC_size;
+    const SSize_t orig_size = RExC_size;
      GET_RE_DEBUG_FLAGS_DECL;
  
      PERL_ARGS_ASSERT_REGCLASS;
@@ -12510,7 +12592,7 @@ parseit:
             case 'f':   value = '\f';                   break;
             case 'b':   value = '\b';                   break;
             case 'e':   value = ASCII_TO_NATIVE('\033');break;
-           case 'a':   value = ASCII_TO_NATIVE('\007');break;
+           case 'a':   value = '\a';                   break;
             case 'o':
                 RExC_parse--;   /* function expects to be pointed at the 'o' */
                 {
@@ -13121,9 +13203,11 @@ parseit:
               * included.  literal_endpoint==2 means both ends of the range used
               * a literal character, not \x{foo} */
             if (literal_endpoint == 2
-                && (prevvalue >= 'a' && value <= 'z')
-                    || (prevvalue >= 'A' && value <= 'Z'))
+                && ((prevvalue >= 'a' && value <= 'z')
+                    || (prevvalue >= 'A' && value <= 'Z')))
              {
+                _invlist_intersection(this_range, PL_ASCII,
+                                      &this_range);
                  _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
                                        &this_range);
              }
@@ -13420,7 +13504,7 @@ parseit:
                      /* If the folds haven't been read in, call a fold function
                       * to force that */
                      if (! PL_utf8_tofold) {
-                        U8 dummy[UTF8_MAXBYTES+1];
+                        U8 dummy[UTF8_MAXBYTES_CASE+1];
  
                          /* This string is just a short named one above \xff */
                          to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
@@ -13542,7 +13626,7 @@ parseit:
                              default:
                                  /* Use deprecated warning to increase the
                                   * chances of this being output */
-                                ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
+                                ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
                                  break;
                          }
                      }
@@ -13906,8 +13990,6 @@ parseit:
             for (i = start; i <= (int) high; i++) {
                 if (! ANYOF_BITMAP_TEST(ret, i)) {
                     ANYOF_BITMAP_SET(ret, i);
-                   prevvalue = value;
-                   value = i;
                 }
             }
         }
@@ -14373,10 +14455,10 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
              switch (OP(scan)) {
                  case EXACT:
                  case EXACTF:
+                case EXACTFA_NO_TRIE:
                  case EXACTFA:
                  case EXACTFU:
                  case EXACTFU_SS:
-                case EXACTFU_TRICKYFOLD:
                  case EXACTFL:
                          if( exact == PSEUDO )
                              exact= OP(scan);
@@ -14426,6 +14508,28 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
   - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
   */
  #ifdef DEBUGGING
+
+static void
+S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
+{
+    int bit;
+    int set=0;
+
+    for (bit=0; bit<32; bit++) {
+        if (flags & (1<<bit)) {
+            if (!set++ && lead)
+                PerlIO_printf(Perl_debug_log, "%s",lead);
+            PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
+        }
+    }
+    if (lead)  {
+        if (set)
+            PerlIO_printf(Perl_debug_log, "\n");
+        else
+            PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
+    }
+}
+
  static void 
  S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
  {
@@ -14558,7 +14662,10 @@ Perl_regdump(pTHX_ const regexp *r)
      if (r->extflags & RXf_EVAL_SEEN)
         PerlIO_printf(Perl_debug_log, "with eval ");
      PerlIO_printf(Perl_debug_log, "\n");
-    DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));            
+    DEBUG_FLAGS_r({
+        regdump_extflags("r->extflags: ",r->extflags);
+        regdump_intflags("r->intflags: ",r->intflags);
+    });
  #else
      PERL_ARGS_ASSERT_REGDUMP;
      PERL_UNUSED_CONTEXT;
@@ -14685,26 +14792,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
              )
          );
          if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
-            int i;
-            int rangestart = -1;
-            U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
              sv_catpvs(sv, "[");
-            for (i = 0; i <= 256; i++) {
-                if (i < 256 && BITMAP_TEST(bitmap,i)) {
-                    if (rangestart == -1)
-                        rangestart = i;
-                } else if (rangestart != -1) {
-                    if (i <= rangestart + 3)
-                        for (; rangestart < i; rangestart++)
-                            put_byte(sv, rangestart);
-                    else {
-                        put_byte(sv, rangestart);
-                        sv_catpvs(sv, "-");
-                        put_byte(sv, i - 1);
-                    }
-                    rangestart = -1;
-                }
-            }
+            (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
+                                                   ? ANYOF_BITMAP(o)
+                                                   : TRIE_BITMAP(trie));
              sv_catpvs(sv, "]");
          } 
          
@@ -14748,7 +14839,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
      } else if (k == LOGICAL)
         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
      else if (k == ANYOF) {
-       int i, rangestart = -1;
         const U8 flags = ANYOF_FLAGS(o);
         int do_sep = 0;
  
@@ -14762,32 +14852,19 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
             sv_catpvs(sv, "^");
  
         /* output what the standard cp 0-255 bitmap matches */
-       for (i = 0; i <= 256; i++) {
-           if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
-               if (rangestart == -1)
-                   rangestart = i;
-           } else if (rangestart != -1) {
-               if (i <= rangestart + 3)
-                   for (; rangestart < i; rangestart++)
-                       put_byte(sv, rangestart);
-               else {
-                   put_byte(sv, rangestart);
-                   sv_catpvs(sv, "-");
-                   put_byte(sv, i - 1);
-               }
-               do_sep = 1;
-               rangestart = -1;
-           }
-       }
+        do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
          
          EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
          /* output any special charclass tests (used entirely under use locale) */
-       if (ANYOF_CLASS_TEST_ANY_SET(o))
-           for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
+       if (ANYOF_CLASS_TEST_ANY_SET(o)) {
+            int i;
+           for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) {
                 if (ANYOF_CLASS_TEST(o,i)) {
                     sv_catpv(sv, anyofs[i]);
                     do_sep = 1;
                 }
+            }
+        }
          
          EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
          
@@ -14798,91 +14875,61 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
          /* output information about the unicode matching */
         if (flags & ANYOF_UNICODE_ALL)
             sv_catpvs(sv, "{unicode_all}");
-       else if (ANYOF_NONBITMAP(o))
-           sv_catpvs(sv, "{unicode}");
-       if (flags & ANYOF_NONBITMAP_NON_UTF8)
-           sv_catpvs(sv, "{outside bitmap}");
-
-       if (ANYOF_NONBITMAP(o)) {
-           SV *lv; /* Set if there is something outside the bit map */
-           SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
+       else if (ANYOF_NONBITMAP(o)) {
+            SV *lv; /* Set if there is something outside the bit map. */
              bool byte_output = FALSE;   /* If something in the bitmap has been
                                             output */
  
-           if (lv && lv != &PL_sv_undef) {
-               if (sw) {
-                   U8 s[UTF8_MAXBYTES_CASE+1];
-
-                   for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
-                       uvchr_to_utf8(s, i);
-
-                       if (i < 256
-                            && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
-                                                               things already
-                                                               output as part
-                                                               of the bitmap */
-                            && swash_fetch(sw, s, TRUE))
-                        {
-                           if (rangestart == -1)
-                               rangestart = i;
-                       } else if (rangestart != -1) {
-                            byte_output = TRUE;
-                           if (i <= rangestart + 3)
-                               for (; rangestart < i; rangestart++) {
-                                   put_byte(sv, rangestart);
-                               }
-                           else {
-                               put_byte(sv, rangestart);
-                               sv_catpvs(sv, "-");
-                               put_byte(sv, i-1);
-                           }
-                           rangestart = -1;
-                       }
-                   }
-               }
+            if (flags & ANYOF_NONBITMAP_NON_UTF8) {
+                sv_catpvs(sv, "{outside bitmap}");
+            }
+            else {
+                sv_catpvs(sv, "{utf8}");
+            }
  
-               {
-                   char *s = savesvpv(lv);
-                   char * const origs = s;
+            /* Get the stuff that wasn't in the bitmap */
+           (void) regclass_swash(prog, o, FALSE, &lv, NULL);
+           if (lv && lv != &PL_sv_undef) {
+                char *s = savesvpv(lv);
+                char * const origs = s;
  
-                   while (*s && *s != '\n')
-                       s++;
+                while (*s && *s != '\n')
+                    s++;
  
-                   if (*s == '\n') {
-                       const char * const t = ++s;
+                if (*s == '\n') {
+                    const char * const t = ++s;
  
-                        if (byte_output) {
-                            sv_catpvs(sv, " ");
-                        }
+                    if (byte_output) {
+                        sv_catpvs(sv, " ");
+                    }
  
-                       while (*s) {
-                           if (*s == '\n') {
+                    while (*s) {
+                        if (*s == '\n') {
  
-                                /* Truncate very long output */
-                               if (s - origs > 256) {
-                                   Perl_sv_catpvf(aTHX_ sv,
-                                                  "%.*s...",
-                                                  (int) (s - origs - 1),
-                                                  t);
-                                   goto out_dump;
-                               }
-                               *s = ' ';
-                           }
-                           else if (*s == '\t') {
-                               *s = '-';
-                           }
-                           s++;
-                       }
-                       if (s[-1] == ' ')
-                           s[-1] = 0;
+                            /* Truncate very long output */
+                            if (s - origs > 256) {
+                                Perl_sv_catpvf(aTHX_ sv,
+                                               "%.*s...",
+                                               (int) (s - origs - 1),
+                                               t);
+                                goto out_dump;
+                            }
+                            *s = ' ';
+                        }
+                        else if (*s == '\t') {
+                            *s = '-';
+                        }
+                        s++;
+                    }
+                    if (s[-1] == ' ')
+                        s[-1] = 0;
  
-                       sv_catpv(sv, t);
-                   }
+                    sv_catpv(sv, t);
+                }
  
-               out_dump:
+            out_dump:
  
-                   Safefree(origs);
-               }
+                Safefree(origs);
                 SvREFCNT_dec_NN(lv);
             }
         }
@@ -15279,7 +15326,6 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
                so we need to copy it locally.  */
      RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
      ret->mother_re   = NULL;
-    ret->gofs = 0;
  }
  #endif /* PERL_IN_XSUB_RE */
  
@@ -15512,12 +15558,17 @@ S_put_byte(pTHX_ SV *sv, int c)
  
         So the old condition can be simplified to !isPRINT(c)  */
      if (!isPRINT(c)) {
-       if (c < 256) {
-           Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
-       }
-       else {
-           Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
-       }
+        switch (c) {
+            case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
+            case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
+            case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
+            case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
+            case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
+
+            default:
+                Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
+                break;
+        }
      }
      else {
         const char string = c;
@@ -15527,6 +15578,63 @@ S_put_byte(pTHX_ SV *sv, int c)
      }
  }
  
+STATIC bool
+S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
+{
+    /* Appends to 'sv' a displayable version of the innards of the bracketed
+     * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
+     * output anything */
+
+    int i;
+    int rangestart = -1;
+    bool has_output_anything = FALSE;
+
+    PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
+
+    for (i = 0; i <= 256; i++) {
+        if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
+            if (rangestart == -1)
+                rangestart = i;
+        } else if (rangestart != -1) {
+            int j = i - 1;
+            if (i <= rangestart + 3) {  /* Individual chars in short ranges */
+                for (; rangestart < i; rangestart++)
+                    put_byte(sv, rangestart);
+            }
+            else if (   j > 255
+                     || ! isALPHANUMERIC(rangestart)
+                     || ! isALPHANUMERIC(j)
+                     || isDIGIT(rangestart) != isDIGIT(j)
+                     || isUPPER(rangestart) != isUPPER(j)
+                     || isLOWER(rangestart) != isLOWER(j)
+
+                        /* This final test should get optimized out except
+                         * on EBCDIC platforms, where it causes ranges that
+                         * cross discontinuities like i/j to be shown as hex
+                         * instead of the misleading, e.g. H-K (since that
+                         * range includes more than H, I, J, K). */
+                     || (j - rangestart)
+                         != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
+            {
+                Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
+                               rangestart,
+                               (j < 256) ? j : 255);
+            }
+            else { /* Here, the ends of the range are both digits, or both
+                      uppercase, or both lowercase; and there's no
+                      discontinuity in the range (which could happen on EBCDIC
+                      platforms) */
+                put_byte(sv, rangestart);
+                sv_catpvs(sv, "-");
+                put_byte(sv, j);
+            }
+            rangestart = -1;
+            has_output_anything = TRUE;
+        }
+    }
+
+    return has_output_anything;
+}
  
  #define CLEAR_OPTSTART \
      if (optstart) STMT_START { \