Add Bo Johansson to AUTHORS

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 6f3bfe8..a067af6 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -134,10 +134,16 @@ typedef struct RExC_state_t {
      I32                orig_utf8;      /* whether the pattern was originally in utf8 */
                                 /* XXX use this for future optimisation of case
                                  * where pattern must be upgraded to utf8. */
+    I32                uni_semantics;  /* If a d charset modifier should use unicode
+                                  rules, even if the pattern is not in
+                                  utf8 */
      HV         *paren_names;           /* Paren names */
      
      regnode    **recurse;              /* Recurse regops */
      I32                recurse_count;          /* Number of recurse regops */
+    I32                in_lookbehind;
+    I32                contains_locale;
+    I32                override_recoding;
  #if ADD_TO_REGEXEC
      char       *starttry;              /* -Dr: where regtry was called. */
  #define RExC_starttry  (pRExC_state->starttry)
@@ -177,6 +183,7 @@ typedef struct RExC_state_t {
  #define RExC_seen_zerolen      (pRExC_state->seen_zerolen)
  #define RExC_seen_evals        (pRExC_state->seen_evals)
  #define RExC_utf8      (pRExC_state->utf8)
+#define RExC_uni_semantics     (pRExC_state->uni_semantics)
  #define RExC_orig_utf8 (pRExC_state->orig_utf8)
  #define RExC_open_parens       (pRExC_state->open_parens)
  #define RExC_close_parens      (pRExC_state->close_parens)
@@ -184,6 +191,9 @@ typedef struct RExC_state_t {
  #define RExC_paren_names       (pRExC_state->paren_names)
  #define RExC_recurse   (pRExC_state->recurse)
  #define RExC_recurse_count     (pRExC_state->recurse_count)
+#define RExC_in_lookbehind     (pRExC_state->in_lookbehind)
+#define RExC_contains_locale   (pRExC_state->contains_locale)
+#define RExC_override_recoding (pRExC_state->override_recoding)
  
  
  #define        ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
@@ -200,7 +210,7 @@ typedef struct RExC_state_t {
  #define        HASWIDTH        0x01    /* Known to match non-null strings. */
  
  /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
- * character, and if utf8, must be invariant. */
+ * character, and if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
  #define        SIMPLE          0x02
  #define        SPSTART         0x04    /* Starts with * or +. */
  #define TRYAGAIN       0x08    /* Weeded out a declaration. */
@@ -280,7 +290,7 @@ typedef struct RExC_state_t {
      minimum length for the F is 1. This is important as the minimum length
      is used to determine offsets in front of and behind the string being 
      looked for.  Since strings can be composites this is the length of the
-    pattern at the time it was commited with a scan_commit. Note that
+    pattern at the time it was committed with a scan_commit. Note that
      the length is calculated by study_chunk, so that the minimum lengths
      are not known until the full pattern has been compiled, thus the 
      pointer to the value.
@@ -316,12 +326,12 @@ typedef struct scan_data_t {
      SV **longest;          /* Either &l_fixed, or &l_float. */
      SV *longest_fixed;      /* longest fixed string found in pattern */
      I32 offset_fixed;       /* offset where it starts */
-    I32 *minlen_fixed;      /* pointer to the minlen relevent to the string */
+    I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
      I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
      SV *longest_float;      /* longest floating string found in pattern */
      I32 offset_float_min;   /* earliest point in string it can appear */
      I32 offset_float_max;   /* latest point in string it can appear */
-    I32 *minlen_float;      /* pointer to the minlen relevent to the string */
+    I32 *minlen_float;      /* pointer to the minlen relevant to the string */
      I32 lookbehind_float;   /* is the position of the string modified by LB */
      I32 flags;
      I32 whilem_c;
@@ -369,8 +379,14 @@ static const scan_data_t zero_scan_data =
  #define SCF_SEEN_ACCEPT         0x8000 
  
  #define UTF cBOOL(RExC_utf8)
-#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
-#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
+#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
+#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
+#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
+#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
+#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
+#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
+#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
+
  #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
  
  #define OOB_UNICODE            12345678
@@ -497,6 +513,13 @@ static const scan_data_t zero_scan_data =
             (int)offset, RExC_precomp, RExC_precomp + offset);          \
  } STMT_END
  
+#define        ckWARN2regdep(loc,m, a1) STMT_START {                           \
+    const IV offset = loc - RExC_precomp;                              \
+    Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),    \
+           m REPORT_LOCATION,                                          \
+           a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
+} STMT_END
+
  #define        ckWARN2reg(loc, m, a1) STMT_START {                             \
      const IV offset = loc - RExC_precomp;                              \
      Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
@@ -705,12 +728,24 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c
  {
      PERL_ARGS_ASSERT_CL_ANYTHING;
  
-    ANYOF_CLASS_ZERO(cl);
      ANYOF_BITMAP_SETALL(cl);
-    cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
-    if (LOC)
+    cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
+               |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
+
+    /* If any portion of the regex is to operate under locale rules,
+     * initialization includes it.  The reason this isn't done for all regexes
+     * is that the optimizer was written under the assumption that locale was
+     * all-or-nothing.  Given the complexity and lack of documentation in the
+     * optimizer, and that there are inadequate test cases for locale, so many
+     * parts of it may not work properly, it is safest to avoid locale unless
+     * necessary. */
+    if (RExC_contains_locale) {
+       ANYOF_CLASS_SETALL(cl);     /* /l uses class */
         cl->flags |= ANYOF_LOCALE;
-    cl->flags |= ANYOF_FOLD;
+    }
+    else {
+       ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
+    }
  }
  
  /* Can match anything (initialization) */
@@ -740,22 +775,15 @@ S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
      Zero(cl, 1, struct regnode_charclass_class);
      cl->type = ANYOF;
      cl_anything(pRExC_state, cl);
+    ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
  }
  
-STATIC void
-S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
-{
-    PERL_ARGS_ASSERT_CL_INIT_ZERO;
-
-    Zero(cl, 1, struct regnode_charclass_class);
-    cl->type = ANYOF;
-    cl_anything(pRExC_state, cl);
-    if (LOC)
-       cl->flags |= ANYOF_LOCALE;
-}
+/* These two functions currently do the exact same thing */
+#define cl_init_zero           S_cl_init
  
-/* 'And' a given class with another one.  Can create false positives */
-/* We assume that cl is not inverted */
+/* 'AND' a given class with another one.  Can create false positives.  'cl'
+ * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
+ * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
  STATIC void
  S_cl_and(struct regnode_charclass_class *cl,
         const struct regnode_charclass_class *and_with)
@@ -763,11 +791,13 @@ S_cl_and(struct regnode_charclass_class *cl,
      PERL_ARGS_ASSERT_CL_AND;
  
      assert(and_with->type == ANYOF);
-    if (!(and_with->flags & ANYOF_CLASS)
-       && !(cl->flags & ANYOF_CLASS)
+
+    /* I (khw) am not sure all these restrictions are necessary XXX */
+    if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
+       && !(ANYOF_CLASS_TEST_ANY_SET(cl))
         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
-       && !(and_with->flags & ANYOF_FOLD)
-       && !(cl->flags & ANYOF_FOLD)) {
+       && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
+       && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
         int i;
  
         if (and_with->flags & ANYOF_INVERT)
@@ -777,36 +807,119 @@ S_cl_and(struct regnode_charclass_class *cl,
             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
                 cl->bitmap[i] &= and_with->bitmap[i];
      } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
-    if (!(and_with->flags & ANYOF_EOS))
-       cl->flags &= ~ANYOF_EOS;
-
-    if (!(and_with->flags & ANYOF_FOLD))
-       cl->flags &= ~ANYOF_FOLD;
-
-    if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
-       !(and_with->flags & ANYOF_INVERT)) {
-       cl->flags &= ~ANYOF_UNICODE_ALL;
-       cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
-                                                          only the one(s)
-                                                          actually set */
-       ARG_SET(cl, ARG(and_with));
-    }
-    if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
-       !(and_with->flags & ANYOF_INVERT))
-       cl->flags &= ~ANYOF_UNICODE_ALL;
-    if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
-       !(and_with->flags & ANYOF_INVERT))
-       cl->flags &= ~ANYOF_NONBITMAP;
+
+    if (and_with->flags & ANYOF_INVERT) {
+
+        /* Here, the and'ed node is inverted.  Get the AND of the flags that
+         * aren't affected by the inversion.  Those that are affected are
+         * handled individually below */
+       U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
+       cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
+       cl->flags |= affected_flags;
+
+        /* We currently don't know how to deal with things that aren't in the
+         * bitmap, but we know that the intersection is no greater than what
+         * is already in cl, so let there be false positives that get sorted
+         * out after the synthetic start class succeeds, and the node is
+         * matched for real. */
+
+        /* The inversion of these two flags indicates that the resulting
+         * intersection doesn't have them */
+       if (and_with->flags & ANYOF_UNICODE_ALL) {
+           cl->flags &= ~ANYOF_UNICODE_ALL;
+       }
+       if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
+           cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
+       }
+    }
+    else {   /* and'd node is not inverted */
+       U8 outside_bitmap_but_not_utf8; /* Temp variable */
+
+       if (! ANYOF_NONBITMAP(and_with)) {
+
+            /* Here 'and_with' doesn't match anything outside the bitmap
+             * (except possibly ANYOF_UNICODE_ALL), which means the
+             * intersection can't either, except for ANYOF_UNICODE_ALL, in
+             * which case we don't know what the intersection is, but it's no
+             * greater than what cl already has, so can just leave it alone,
+             * with possible false positives */
+            if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
+                ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
+               cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
+            }
+       }
+       else if (! ANYOF_NONBITMAP(cl)) {
+
+           /* Here, 'and_with' does match something outside the bitmap, and cl
+            * doesn't have a list of things to match outside the bitmap.  If
+             * cl can match all code points above 255, the intersection will
+             * be those above-255 code points that 'and_with' matches.  If cl
+             * can't match all Unicode code points, it means that it can't
+             * match anything outside the bitmap (since the 'if' that got us
+             * into this block tested for that), so we leave the bitmap empty.
+             */
+           if (cl->flags & ANYOF_UNICODE_ALL) {
+               ARG_SET(cl, ARG(and_with));
+
+                /* and_with's ARG may match things that don't require UTF8.
+                 * And now cl's will too, in spite of this being an 'and'.  See
+                 * the comments below about the kludge */
+               cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
+           }
+       }
+       else {
+            /* Here, both 'and_with' and cl match something outside the
+             * bitmap.  Currently we do not do the intersection, so just match
+             * whatever cl had at the beginning.  */
+       }
+
+
+        /* Take the intersection of the two sets of flags.  However, the
+         * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
+         * kludge around the fact that this flag is not treated like the others
+         * which are initialized in cl_anything().  The way the optimizer works
+         * is that the synthetic start class (SSC) is initialized to match
+         * anything, and then the first time a real node is encountered, its
+         * values are AND'd with the SSC's with the result being the values of
+         * the real node.  However, there are paths through the optimizer where
+         * the AND never gets called, so those initialized bits are set
+         * inappropriately, which is not usually a big deal, as they just cause
+         * false positives in the SSC, which will just mean a probably
+         * imperceptible slow down in execution.  However this bit has a
+         * higher false positive consequence in that it can cause utf8.pm,
+         * utf8_heavy.pl ... to be loaded when not necessary, which is a much
+         * bigger slowdown and also causes significant extra memory to be used.
+         * In order to prevent this, the code now takes a different tack.  The
+         * bit isn't set unless some part of the regular expression needs it,
+         * but once set it won't get cleared.  This means that these extra
+         * modules won't get loaded unless there was some path through the
+         * pattern that would have required them anyway, and  so any false
+         * positives that occur by not ANDing them out when they could be
+         * aren't as severe as they would be if we treated this bit like all
+         * the others */
+        outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
+                                      & ANYOF_NONBITMAP_NON_UTF8;
+       cl->flags &= and_with->flags;
+       cl->flags |= outside_bitmap_but_not_utf8;
+    }
  }
  
-/* 'OR' a given class with another one.  Can create false positives */
-/* We assume that cl is not inverted */
+/* 'OR' a given class with another one.  Can create false positives.  'cl'
+ * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
+ * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
  STATIC void
  S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
  {
      PERL_ARGS_ASSERT_CL_OR;
  
      if (or_with->flags & ANYOF_INVERT) {
+
+        /* Here, the or'd node is to be inverted.  This means we take the
+         * complement of everything not in the bitmap, but currently we don't
+         * know what that is, so give up and match anything */
+       if (ANYOF_NONBITMAP(or_with)) {
+           cl_anything(pRExC_state, cl);
+       }
         /* We do not use
          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
          *   <= (B1 | !B2) | (CL1 | !CL2)
@@ -816,9 +929,9 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
          */
-       if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
-            && !(or_with->flags & ANYOF_FOLD)
-            && !(cl->flags & ANYOF_FOLD) ) {
+       else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
+            && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
+            && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
             int i;
  
             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
@@ -827,17 +940,31 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
         else {
             cl_anything(pRExC_state, cl);
         }
-    } else {
+
+        /* And, we can just take the union of the flags that aren't affected
+         * by the inversion */
+       cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
+
+        /* For the remaining flags:
+            ANYOF_UNICODE_ALL and inverted means to not match anything above
+                    255, which means that the union with cl should just be
+                    what cl has in it, so can ignore this flag
+            ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
+                    is 127-255 to match them, but then invert that, so the
+                    union with cl should just be what cl has in it, so can
+                    ignore this flag
+         */
+    } else {    /* 'or_with' is not inverted */
         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
-            && (!(or_with->flags & ANYOF_FOLD)
-                || (cl->flags & ANYOF_FOLD)) ) {
+            && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
+                || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
             int i;
  
             /* OR char bitmap and class bitmap separately */
             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
                 cl->bitmap[i] |= or_with->bitmap[i];
-           if (or_with->flags & ANYOF_CLASS) {
+           if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
                     cl->classflags[i] |= or_with->classflags[i];
                 cl->flags |= ANYOF_CLASS;
@@ -846,23 +973,32 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
         else { /* XXXX: logic is complicated, leave it along for a moment. */
             cl_anything(pRExC_state, cl);
         }
-    }
-    if (or_with->flags & ANYOF_EOS)
-       cl->flags |= ANYOF_EOS;
  
-    if (or_with->flags & ANYOF_FOLD)
-       cl->flags |= ANYOF_FOLD;
+       if (ANYOF_NONBITMAP(or_with)) {
+
+           /* Use the added node's outside-the-bit-map match if there isn't a
+            * conflict.  If there is a conflict (both nodes match something
+            * outside the bitmap, but what they match outside is not the same
+            * pointer, and hence not easily compared until XXX we extend
+            * inversion lists this far), give up and allow the start class to
+            * match everything outside the bitmap.  If that stuff is all above
+            * 255, can just set UNICODE_ALL, otherwise could be anything. */
+           if (! ANYOF_NONBITMAP(cl)) {
+               ARG_SET(cl, ARG(or_with));
+           }
+           else if (ARG(cl) != ARG(or_with)) {
  
-    /* If both nodes match something outside the bitmap, but what they match
-     * outside is not the same pointer, and hence not easily compared, give up
-     * and allow the start class to match everything outside the bitmap */
-    if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
-       ARG(cl) != ARG(or_with)) {
-       cl->flags |= ANYOF_UNICODE_ALL;
-    }
+               if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
+                   cl_anything(pRExC_state, cl);
+               }
+               else {
+                   cl->flags |= ANYOF_UNICODE_ALL;
+               }
+           }
+       }
  
-    if (or_with->flags & ANYOF_UNICODE_ALL) {
-       cl->flags |= ANYOF_UNICODE_ALL;
+        /* Take the union */
+       cl->flags |= or_with->flags;
      }
  }
  
@@ -883,7 +1019,7 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
     tables that are used to generate the final compressed
     representation which is what dump_trie expects.
  
-   Part of the reason for their existance is to provide a form
+   Part of the reason for their existence is to provide a form
     of documentation as to how the different representations function.
  
  */
@@ -1158,7 +1294,7 @@ the silent ignoring of duplicate alternations which are of the form:
  
  Thus EVAL blocks following a trie may be called a different number of times with
  and without the optimisation. With the optimisations dupes will be silently
-ignored. This inconsistant behaviour of EVAL type nodes is well established as
+ignored. This inconsistent behaviour of EVAL type nodes is well established as
  the following demonstrates:
  
   'words'=~/(word|word|word)(?{ print $1 })[xyz]/
@@ -1171,7 +1307,7 @@ which doesnt print it out at all. This is due to other optimisations kicking in.
  
  Example of what happens on a structural level:
  
-The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
+The regexp /(ac|ad|ab)+/ will produce the following debug output:
  
     1: CURLYM[1] {1,32767}(18)
     5:   BRANCH(8)
@@ -1379,6 +1515,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
  #endif
  
      switch (flags) {
+       case EXACTFA:
         case EXACTFU: folder = PL_fold_latin1; break;
         case EXACTF:  folder = PL_fold; break;
         case EXACTFL: folder = PL_fold_locale; break;
@@ -1439,7 +1576,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
         middle and the least common are on the outside.  IMO this would be better
         than a most to least common mapping as theres a decent chance the most
         common letter will share a node with the least common, meaning the node
-       will not be compressable. With a middle is most common approach the worst
+       will not be compressible. With a middle is most common approach the worst
         case is when we have the least common nodes twice.
  
       */
@@ -2454,7 +2591,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
  #define UPSILON_D_T    GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
  
      if (UTF
-       && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
+       && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
         && ( STR_LEN(scan) >= 6 ) )
      {
      /*
@@ -2524,7 +2661,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
      return stopnow;
  }
  
-/* REx optimizer.  Converts nodes into quickier variants "in place".
+/* REx optimizer.  Converts nodes into quicker variants "in place".
     Finds fixed substrings.  */
  
  /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
@@ -2536,7 +2673,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
      SAVEFREEPV(and_withp)
  
  /* this is a chain of data about sub patterns we are processing that
-   need to be handled seperately/specially in study_chunk. Its so
+   need to be handled separately/specially in study_chunk. Its so
     we can simulate recursion without losing state.  */
  struct scan_frame;
  typedef struct scan_frame {
@@ -2777,7 +2914,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
  
                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
  
-                  If we can find such a subseqence we need to turn the first
+                  If we can find such a subsequence we need to turn the first
                    element into a trie and then add the subsequent branch exact
                    strings to the trie.
  
@@ -2800,7 +2937,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
  
                   If x(1..n)==tail then we can do a simple trie, if not we make
                   a "jump" trie, such that when we match the appropriate word
-                 we "jump" to the appopriate tail node. Essentailly we turn
+                 we "jump" to the appropriate tail node. Essentially we turn
                   a nested if into a case structure of sorts.
  
                 */
@@ -2854,7 +2991,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                             and noper_next is the same as scan (our current
                             position in the regex) then the EXACT branch is
                             a possible optimization target. Once we have
-                           two or more consequetive such branches we can
+                           two or more consecutive such branches we can
                             create a trie of the EXACT's contents and stich
                             it in place. If the sequence represents all of
                             the branches we eliminate the whole thing and
@@ -2911,9 +3048,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                  }
                              } else {
  /* 
-    Currently we do not believe that the trie logic can
-    handle case insensitive matching properly when the
-    pattern is not unicode (thus forcing unicode semantics).
+    Currently the trie logic handles case insensitive matching properly only
+    when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
+    semantics).
  
      If/when this is fixed the following define can be swapped
      in below to fully enable trie logic.
@@ -2921,7 +3058,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
  #define TRIE_TYPE_IS_SAFE 1
  
  */
-#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
+#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
  
                                  if ( last && TRIE_TYPE_IS_SAFE ) {
                                      make_trie( pRExC_state, 
@@ -3075,7 +3212,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 int compat = 1;
  
  
-               /* If compatibile, we or it in below.  It is compatible if is
+               /* If compatible, we or it in below.  It is compatible if is
                  * in the bitmp and either 1) its bit or its fold is set, or 2)
                  * it's for a locale.  Even if there isn't unicode semantics
                  * here, at runtime there may be because of matching against a
@@ -3084,14 +3221,32 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 if (uc >= 0x100 ||
                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
-                   && (!(data->start_class->flags & ANYOF_FOLD)
+                   && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
                      )
+               {
                     compat = 0;
+               }
                 ANYOF_CLASS_ZERO(data->start_class);
                 ANYOF_BITMAP_ZERO(data->start_class);
                 if (compat)
                     ANYOF_BITMAP_SET(data->start_class, uc);
+               else if (uc >= 0x100) {
+                   int i;
+
+                   /* Some Unicode code points fold to the Latin1 range; as
+                    * XXX temporary code, instead of figuring out if this is
+                    * one, just assume it is and set all the start class bits
+                    * that could be some such above 255 code point's fold
+                    * which will generate false positives.  As the code
+                    * elsewhere that does compute the fold settles down, it
+                    * can be extracted out and re-used here */
+                   for (i = 0; i < 256; i++){
+                       if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
+                           ANYOF_BITMAP_SET(data->start_class, i);
+                       }
+                   }
+               }
                 data->start_class->flags &= ~ANYOF_EOS;
                 if (uc < 0x100)
                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
@@ -3139,8 +3294,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 if (compat) {
                     ANYOF_BITMAP_SET(data->start_class, uc);
                     data->start_class->flags &= ~ANYOF_EOS;
-                   data->start_class->flags |= ANYOF_FOLD;
+                   data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
                     if (OP(scan) == EXACTFL) {
+                       /* XXX This set is probably no longer necessary, and
+                        * probably wrong as LOCALE now is on in the initial
+                        * state */
                         data->start_class->flags |= ANYOF_LOCALE;
                     }
                     else {
@@ -3152,9 +3310,17 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
                     }
                 }
+               else if (uc >= 0x100) {
+                   int i;
+                   for (i = 0; i < 256; i++){
+                       if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
+                           ANYOF_BITMAP_SET(data->start_class, i);
+                       }
+                   }
+               }
             }
             else if (flags & SCF_DO_STCLASS_OR) {
-               if (data->start_class->flags & ANYOF_FOLD) {
+               if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
                     /* false positive possible if the class is case-folded.
                        Assume that the locale settings are the same... */
                     if (uc < 0x100) {
@@ -3392,8 +3558,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
  #ifdef DEBUGGING
                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
                         OP(nxt + 1) = OPTIMIZED; /* was count. */
-                       NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
-                       NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
+                       NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
+                       NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
  #endif
  #if 0
                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
@@ -3532,7 +3698,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         NEXT_OFF(oscan) += NEXT_OFF(next);
                 }
                 continue;
-           default:                    /* REF and CLUMP only? */
+           default:                    /* REF, ANYOFV, and CLUMP only? */
                 if (flags & SCF_DO_SUBSTR) {
                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
                     data->longest = &(data->longest_float);
@@ -3609,8 +3775,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         goto do_default;
                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
-                                || ((data->start_class->flags & ANYOF_CLASS)
-                                     && ANYOF_CLASS_TEST_ANY_SET(data->start_class)));
+                                || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
                         cl_anything(pRExC_state, data->start_class);
                     }
                     if (flags & SCF_DO_STCLASS_AND || !value)
@@ -3628,7 +3793,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     if (flags & SCF_DO_STCLASS_AND) {
                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
-                            if (FLAGS(scan) & USE_UNI) {
+                            if (OP(scan) == ALNUMU) {
                                  for (value = 0; value < 256; value++) {
                                      if (!isWORDCHAR_L1(value)) {
                                          ANYOF_BITMAP_CLEAR(data->start_class, value);
@@ -3646,7 +3811,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     else {
                         if (data->start_class->flags & ANYOF_LOCALE)
                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
-                        else if (FLAGS(scan) & USE_UNI) {
+
+                       /* Even if under locale, set the bits for non-locale
+                        * in case it isn't a true locale-node.  This will
+                        * create false positives if it truly is locale */
+                        if (OP(scan) == ALNUMU) {
                              for (value = 0; value < 256; value++) {
                                  if (isWORDCHAR_L1(value)) {
                                      ANYOF_BITMAP_SET(data->start_class, value);
@@ -3661,21 +3830,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                          }
                     }
                     break;
-               case ALNUML:
-                   if (flags & SCF_DO_STCLASS_AND) {
-                       if (data->start_class->flags & ANYOF_LOCALE)
-                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
-                   }
-                   else {
-                       ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
-                       data->start_class->flags |= ANYOF_LOCALE;
-                   }
-                   break;
                 case NALNUM:
                     if (flags & SCF_DO_STCLASS_AND) {
                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
-                            if (FLAGS(scan) & USE_UNI) {
+                            if (OP(scan) == NALNUMU) {
                                  for (value = 0; value < 256; value++) {
                                      if (isWORDCHAR_L1(value)) {
                                          ANYOF_BITMAP_CLEAR(data->start_class, value);
@@ -3693,28 +3852,30 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     else {
                         if (data->start_class->flags & ANYOF_LOCALE)
                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (!isALNUM(value))
+
+                       /* Even if under locale, set the bits for non-locale in
+                        * case it isn't a true locale-node.  This will create
+                        * false positives if it truly is locale */
+                       if (OP(scan) == NALNUMU) {
+                           for (value = 0; value < 256; value++) {
+                               if (! isWORDCHAR_L1(value)) {
+                                   ANYOF_BITMAP_SET(data->start_class, value);
+                               }
+                           }
+                       } else {
+                           for (value = 0; value < 256; value++) {
+                               if (! isALNUM(value)) {
                                     ANYOF_BITMAP_SET(data->start_class, value);
+                               }
+                           }
                         }
                     }
                     break;
-               case NALNUML:
-                   if (flags & SCF_DO_STCLASS_AND) {
-                       if (data->start_class->flags & ANYOF_LOCALE)
-                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
-                   }
-                   else {
-                       data->start_class->flags |= ANYOF_LOCALE;
-                       ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
-                   }
-                   break;
                 case SPACE:
                     if (flags & SCF_DO_STCLASS_AND) {
                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
-                           if (FLAGS(scan) & USE_UNI) {
+                           if (OP(scan) == SPACEU) {
                                  for (value = 0; value < 256; value++) {
                                      if (!isSPACE_L1(value)) {
                                          ANYOF_BITMAP_CLEAR(data->start_class, value);
@@ -3733,7 +3894,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                          if (data->start_class->flags & ANYOF_LOCALE) {
                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
                          }
-                        else if (FLAGS(scan) & USE_UNI) {
+                        if (OP(scan) == SPACEU) {
                              for (value = 0; value < 256; value++) {
                                  if (isSPACE_L1(value)) {
                                      ANYOF_BITMAP_SET(data->start_class, value);
@@ -3748,21 +3909,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         }
                     }
                     break;
-               case SPACEL:
-                   if (flags & SCF_DO_STCLASS_AND) {
-                       if (data->start_class->flags & ANYOF_LOCALE)
-                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
-                   }
-                   else {
-                       data->start_class->flags |= ANYOF_LOCALE;
-                       ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
-                   }
-                   break;
                 case NSPACE:
                     if (flags & SCF_DO_STCLASS_AND) {
                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
-                            if (FLAGS(scan) & USE_UNI) {
+                            if (OP(scan) == NSPACEU) {
                                  for (value = 0; value < 256; value++) {
                                      if (isSPACE_L1(value)) {
                                          ANYOF_BITMAP_CLEAR(data->start_class, value);
@@ -3780,7 +3931,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     else {
                         if (data->start_class->flags & ANYOF_LOCALE)
                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
-                        else if (FLAGS(scan) & USE_UNI) {
+                        if (OP(scan) == NSPACEU) {
                              for (value = 0; value < 256; value++) {
                                  if (!isSPACE_L1(value)) {
                                      ANYOF_BITMAP_SET(data->start_class, value);
@@ -3796,40 +3947,27 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                          }
                     }
                     break;
-               case NSPACEL:
+               case DIGIT:
                     if (flags & SCF_DO_STCLASS_AND) {
-                       if (data->start_class->flags & ANYOF_LOCALE) {
-                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
+                       if (!(data->start_class->flags & ANYOF_LOCALE)) {
+                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
                             for (value = 0; value < 256; value++)
-                               if (!isSPACE(value))
+                               if (!isDIGIT(value))
                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
                         }
                     }
                     else {
-                       data->start_class->flags |= ANYOF_LOCALE;
-                       ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
-                   }
-                   break;
-               case DIGIT:
-                   if (flags & SCF_DO_STCLASS_AND) {
-                       ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
-                       for (value = 0; value < 256; value++)
-                           if (!isDIGIT(value))
-                               ANYOF_BITMAP_CLEAR(data->start_class, value);
-                   }
-                   else {
                         if (data->start_class->flags & ANYOF_LOCALE)
                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (isDIGIT(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);
-                       }
+                       for (value = 0; value < 256; value++)
+                           if (isDIGIT(value))
+                               ANYOF_BITMAP_SET(data->start_class, value);
                     }
                     break;
                 case NDIGIT:
                     if (flags & SCF_DO_STCLASS_AND) {
-                       ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
+                       if (!(data->start_class->flags & ANYOF_LOCALE))
+                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
                         for (value = 0; value < 256; value++)
                             if (isDIGIT(value))
                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
@@ -3837,11 +3975,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     else {
                         if (data->start_class->flags & ANYOF_LOCALE)
                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (!isDIGIT(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);
-                       }
+                       for (value = 0; value < 256; value++)
+                           if (!isDIGIT(value))
+                               ANYOF_BITMAP_SET(data->start_class, value);
                     }
                     break;
                 CASE_SYNST_FNC(VERTWS);
@@ -4400,6 +4536,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
      I32 sawplus = 0;
      I32 sawopen = 0;
      bool used_setjump = FALSE;
+    regex_charset initial_charset = get_regex_charset(orig_pm_flags);
  
      U8 jump_ret = 0;
      dJMPENV;
@@ -4417,6 +4554,8 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
      DEBUG_r(if (!PL_colorset) reginitcolors());
  
      RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
+    RExC_uni_semantics = 0;
+    RExC_contains_locale = 0;
  
      /****************** LONG JUMP TARGET HERE***********************/
      /* Longjmp back to here if have to switch in midstream to utf8 */
@@ -4473,11 +4612,16 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
      restudied = 0;
  #endif
  
-    /* Set to use unicode semantics if the pattern is in utf8 and has the
-     * 'dual' charset specified, as it means unicode when utf8  */
      pm_flags = orig_pm_flags;
-    if (RExC_utf8  && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
-       pm_flags |= RXf_PMf_UNICODE;
+
+    if (initial_charset == REGEX_LOCALE_CHARSET) {
+       RExC_contains_locale = 1;
+    }
+    else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
+
+       /* Set to use unicode semantics if the pattern is in utf8 and has the
+        * 'depends' charset specified, as it means unicode when utf8  */
+       set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
      }
  
      RExC_precomp = exp;
@@ -4485,9 +4629,11 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
      RExC_sawback = 0;
  
      RExC_seen = 0;
+    RExC_in_lookbehind = 0;
      RExC_seen_zerolen = *exp == '^' ? -1 : 0;
      RExC_seen_evals = 0;
      RExC_extralen = 0;
+    RExC_override_recoding = 0;
  
      /* First pass: determine size, legality. */
      RExC_parse = exp;
@@ -4523,6 +4669,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
      if (used_setjump) {
         JMPENV_POP;
      }
+
      DEBUG_PARSE_r({
          PerlIO_printf(Perl_debug_log, 
              "Required size %"IVdf" nodes\n"
@@ -4531,6 +4678,14 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
          RExC_lastnum=0; 
          RExC_lastparse=NULL; 
      });
+
+    /* The first pass could have found things that force Unicode semantics */
+    if ((RExC_utf8 || RExC_uni_semantics)
+        && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
+    {
+       set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
+    }
+
      /* Small enough for pointer-storage convention?
         If extralen==0, this means that we will not need long jumps. */
      if (RExC_size >= 0x10000L && RExC_extralen)
@@ -4563,7 +4718,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
      r->extflags = pm_flags;
      {
          bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
-        bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
+        bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
  
          /* The caret is output if there are any defaults: if not all the STD
           * flags are set, or if no character set specifier is needed */
@@ -4584,7 +4739,9 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
           * covered by the caret */
         const STRLEN wraplen = plen + has_p + has_runon
              + has_default       /* If needs a caret */
-            + has_charset       /* If needs a character set specifier */
+
+               /* If needs a character set specifier */
+           + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
              + (sizeof(STD_PAT_MODS) - 1)
              + (sizeof("(?:)") - 1);
  
@@ -4598,11 +4755,10 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
              *p++= DEFAULT_PAT_MOD;
          }
          if (has_charset) {
-            if (r->extflags & RXf_PMf_LOCALE) {
-                *p++ = LOCALE_PAT_MOD;
-            } else {
-                *p++ = UNICODE_PAT_MOD;
-            }
+           STRLEN len;
+           const char* const name = get_regex_charset_name(r->extflags, &len);
+           Copy(name, p, len, char);
+           p += len;
          }
          if (has_p)
              *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
@@ -4985,12 +5141,14 @@ reStudy:
         if (ri->regstclass
             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
             ri->regstclass = NULL;
+
         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
             && stclass_flag
             && !(data.start_class->flags & ANYOF_EOS)
             && !cl_is_anything(data.start_class))
         {
             const U32 n = add_data(pRExC_state, 1, "f");
+           data.start_class->flags |= ANYOF_IS_SYNTHETIC;
  
             Newx(RExC_rxi->data->data[n], 1,
                 struct regnode_charclass_class);
@@ -5057,10 +5215,12 @@ reStudy:
  
         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
                 = r->float_substr = r->float_utf8 = NULL;
+
         if (!(data.start_class->flags & ANYOF_EOS)
             && !cl_is_anything(data.start_class))
         {
             const U32 n = add_data(pRExC_state, 1, "f");
+           data.start_class->flags |= ANYOF_IS_SYNTHETIC;
  
             Newx(RExC_rxi->data->data[n], 1,
                 struct regnode_charclass_class);
@@ -5121,13 +5281,13 @@ reStudy:
      else {
          regnode *first = ri->program + 1;
          U8 fop = OP(first);
-        U8 nop = OP(NEXTOPER(first));
-        
-        if (PL_regkind[fop] == NOTHING && nop == END)
+
+        if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
              r->extflags |= RXf_NULL;
-        else if (PL_regkind[fop] == BOL && nop == END)
+        else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
              r->extflags |= RXf_START_ONLY;
-        else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
+        else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
+                            && OP(regnext(first)) == END)
              r->extflags |= RXf_WHITE;    
      }
  #endif
@@ -5661,83 +5821,628 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
      DEBUG_PARSE_MSG((funcname));                            \
      PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
  })
-/*
- - reg - regular expression, i.e. main body or parenthesized thing
+
+/* This section of code defines the inversion list object and its methods.  The
+ * interfaces are highly subject to change, so as much as possible is static to
+ * this file.  An inversion list is here implemented as a malloc'd C array with
+ * some added info.  More will be coming when functionality is added later.
   *
- * Caller must absorb opening parenthesis.
+ * It is currently implemented as an HV to the outside world, but is actually
+ * an SV pointing to an array of UVs that the SV thinks are bytes.  This allows
+ * us to have an array of UV whose memory management is automatically handled
+ * by the existing facilities for SV's.
   *
- * Combining parenthesis handling with the base level of regular expression
- * is a trifle forced, but the need to tie the tails of the branches to what
- * follows makes it hard to avoid.
- */
-#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
-#ifdef DEBUGGING
-#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
-#else
-#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
+ * Some of the methods should always be private to the implementation, and some
+ * should eventually be made public */
+
+#define INVLIST_INITIAL_LEN 10
+
+PERL_STATIC_INLINE UV*
+S_invlist_array(pTHX_ HV* const invlist)
+{
+    /* Returns the pointer to the inversion list's array.  Every time the
+     * length changes, this needs to be called in case malloc or realloc moved
+     * it */
+
+    PERL_ARGS_ASSERT_INVLIST_ARRAY;
+
+    return (UV *) SvPVX(invlist);
+}
+
+PERL_STATIC_INLINE UV
+S_invlist_len(pTHX_ HV* const invlist)
+{
+    /* Returns the current number of elements in the inversion list's array */
+
+    PERL_ARGS_ASSERT_INVLIST_LEN;
+
+    return SvCUR(invlist) / sizeof(UV);
+}
+
+PERL_STATIC_INLINE UV
+S_invlist_max(pTHX_ HV* const invlist)
+{
+    /* Returns the maximum number of elements storable in the inversion list's
+     * array, without having to realloc() */
+
+    PERL_ARGS_ASSERT_INVLIST_MAX;
+
+    return SvLEN(invlist) / sizeof(UV);
+}
+
+PERL_STATIC_INLINE void
+S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
+{
+    /* Sets the current number of elements stored in the inversion list */
+
+    PERL_ARGS_ASSERT_INVLIST_SET_LEN;
+
+    SvCUR_set(invlist, len * sizeof(UV));
+}
+
+PERL_STATIC_INLINE void
+S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
+{
+    /* Sets the maximum number of elements storable in the inversion list
+     * without having to realloc().  Croaks (reporting the rejected 'max' and
+     * the current length) if 'max' is less than the current length */
+
+    PERL_ARGS_ASSERT_INVLIST_SET_MAX;
+
+    if (max < invlist_len(invlist)) {
+       Perl_croak(aTHX_ "panic: Can't make max size '%"UVuf"' less than current length %"UVuf" in inversion list", max, invlist_len(invlist));
+    }
+
+    SvLEN_set(invlist, max * sizeof(UV));
+}
+
+#ifndef PERL_IN_XSUB_RE
+HV*
+Perl__new_invlist(pTHX_ IV initial_size)
+{
+
+    /* Return a pointer to a newly constructed inversion list, with enough
+     * space to store 'initial_size' elements.  If that number is negative, a
+     * system default is used instead */
+
+    if (initial_size < 0) {
+       initial_size = INVLIST_INITIAL_LEN;
+    }
+
+    /* Allocate the initial space */
+    return (HV *) newSV(initial_size * sizeof(UV));
+}
  #endif
  
-STATIC regnode *
-S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
-    /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
+PERL_STATIC_INLINE void
+S_invlist_destroy(pTHX_ HV* const invlist)
  {
-    dVAR;
-    register regnode *ret;             /* Will be the head of the group. */
-    register regnode *br;
-    register regnode *lastbr;
-    register regnode *ender = NULL;
-    register I32 parno = 0;
-    I32 flags;
-    U32 oregflags = RExC_flags;
-    bool have_branch = 0;
-    bool is_open = 0;
-    I32 freeze_paren = 0;
-    I32 after_freeze = 0;
+   /* Inversion list destructor */
  
-    /* for (?g), (?gc), and (?o) warnings; warning
-       about (?c) will warn about (?g) -- japhy    */
+    PERL_ARGS_ASSERT_INVLIST_DESTROY;
  
-#define WASTED_O  0x01
-#define WASTED_G  0x02
-#define WASTED_C  0x04
-#define WASTED_GC (0x02|0x04)
-    I32 wastedflags = 0x00;
+    SvREFCNT_dec(invlist);
+}
  
-    char * parse_start = RExC_parse; /* MJD */
-    char * const oregcomp_parse = RExC_parse;
+STATIC void
+S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
+{
+    /* Grow the maximum size of an inversion list */
  
-    GET_RE_DEBUG_FLAGS_DECL;
+    PERL_ARGS_ASSERT_INVLIST_EXTEND;
  
-    PERL_ARGS_ASSERT_REG;
-    DEBUG_PARSE("reg ");
+    SvGROW((SV *)invlist, new_max * sizeof(UV));
+}
  
-    *flagp = 0;                                /* Tentatively. */
+PERL_STATIC_INLINE void
+S_invlist_trim(pTHX_ HV* const invlist)
+{
+    PERL_ARGS_ASSERT_INVLIST_TRIM;
  
+    /* Change the length of the inversion list to how many entries it currently
+     * has */
  
-    /* Make an OPEN node, if parenthesized. */
-    if (paren) {
-        if ( *RExC_parse == '*') { /* (*VERB:ARG) */
-           char *start_verb = RExC_parse;
-           STRLEN verb_len = 0;
-           char *start_arg = NULL;
-           unsigned char op = 0;
-           int argok = 1;
-           int internal_argval = 0; /* internal_argval is only useful if !argok */
-           while ( *RExC_parse && *RExC_parse != ')' ) {
-               if ( *RExC_parse == ':' ) {
-                   start_arg = RExC_parse + 1;
-                   break;
-               }
-               RExC_parse++;
+    SvPV_shrink_to_cur((SV *) invlist);
+}
+
+/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
+ * etc */
+
+#define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
+#define PREV_ELEMENT_IN_INVLIST_SET(i) ! ELEMENT_IN_INVLIST_SET(i)
+
+#ifndef PERL_IN_XSUB_RE
+void
+Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
+{
+   /* Subject to change or removal.  Append the range from 'start' to 'end' at
+    * the end of the inversion list.  The range must be above any existing
+    * ones. */
+
+    UV* array = invlist_array(invlist);
+    UV max = invlist_max(invlist);
+    UV len = invlist_len(invlist);
+
+    PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
+
+    if (len > 0) {
+
+       /* Here, the existing list is non-empty. The current max entry in the
+        * list is generally the first value not in the set, except when the
+        * set extends to the end of permissible values, in which case it is
+        * the first entry in that final set, and so this call is an attempt to
+        * append out-of-order */
+
+       UV final_element = len - 1;
+       if (array[final_element] > start
+           || ELEMENT_IN_INVLIST_SET(final_element))
+       {
+           Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
+       }
+
+       /* Here, it is a legal append.  If the new range begins with the first
+        * value not in the set, it is extending the set, so the new first
+        * value not in the set is one greater than the newly extended range.
+        * */
+       if (array[final_element] == start) {
+           if (end != UV_MAX) {
+               array[final_element] = end + 1;
             }
-           ++start_verb;
-           verb_len = RExC_parse - start_verb;
-           if ( start_arg ) {
-               RExC_parse++;
-               while ( *RExC_parse && *RExC_parse != ')' ) 
-                   RExC_parse++;
-               if ( *RExC_parse != ')' ) 
-                   vFAIL("Unterminated verb pattern argument");
+           else {
+               /* But if the end is the maximum representable on the machine,
+                * just let the range that this would extend have no end */
+               invlist_set_len(invlist, len - 1);
+           }
+           return;
+       }
+    }
+
+    /* Here the new range doesn't extend any existing set.  Add it */
+
+    len += 2;  /* Includes an element each for the start and end of range */
+
+    /* If overflows the existing space, extend, which may cause the array to be
+     * moved */
+    if (max < len) {
+       invlist_extend(invlist, len);
+       array = invlist_array(invlist);
+    }
+
+    invlist_set_len(invlist, len);
+
+    /* The next item on the list starts the range, the one after that is
+     * one past the new range.  */
+    array[len - 2] = start;
+    if (end != UV_MAX) {
+       array[len - 1] = end + 1;
+    }
+    else {
+       /* But if the end is the maximum representable on the machine, just let
+        * the range have no end */
+       invlist_set_len(invlist, len - 1);
+    }
+}
+#endif
+
+STATIC HV*
+S_invlist_union(pTHX_ HV* const a, HV* const b)
+{
+    /* Return a new inversion list which is the union of two inversion lists.
+     * The basis for this comes from "Unicode Demystified" Chapter 13 by
+     * Richard Gillam, published by Addison-Wesley, and explained at some
+     * length there.  The preface says to incorporate its examples into your
+     * code at your own risk.
+     *
+     * The algorithm is like a merge sort.
+     *
+     * XXX A potential performance improvement is to keep track as we go along
+     * if only one of the inputs contributes to the result, meaning the other
+     * is a subset of that one.  In that case, we can skip the final copy and
+     * return the larger of the input lists */
+
+    UV* array_a = invlist_array(a);   /* a's array */
+    UV* array_b = invlist_array(b);
+    UV len_a = invlist_len(a); /* length of a's array */
+    UV len_b = invlist_len(b);
+
+    HV* u;                     /* the resulting union */
+    UV* array_u;
+    UV len_u;
+
+    UV i_a = 0;                    /* current index into a's array */
+    UV i_b = 0;
+    UV i_u = 0;
+
+    /* running count, as explained in the algorithm source book; items are
+     * stopped accumulating and are output when the count changes to/from 0.
+     * The count is incremented when we start a range that's in the set, and
+     * decremented when we start a range that's not in the set.  So its range
+     * is 0 to 2.  Only when the count is zero is something not in the set.
+     */
+    UV count = 0;
+
+    PERL_ARGS_ASSERT_INVLIST_UNION;
+
+    /* Size the union for the worst case: that the sets are completely
+     * disjoint */
+    u = _new_invlist(len_a + len_b);
+    array_u = invlist_array(u);
+
+    /* Go through each list item by item, stopping when one of them is
+     * exhausted */
+    while (i_a < len_a && i_b < len_b) {
+       UV cp;      /* The element to potentially add to the union's array */
+       bool cp_in_set;   /* is it in the input list's set or not */
+
+       /* We need to take one or the other of the two inputs for the union.
+        * Since we are merging two sorted lists, we take the smaller of the
+        * next items.  In case of a tie, we take the one that is in its set
+        * first.  If we took one not in the set first, it would decrement the
+        * count, possibly to 0 which would cause it to be output as ending the
+        * range, and the next time through we would take the same number, and
+        * output it again as beginning the next range.  By doing it the
+        * opposite way, there is no possibility that the count will be
+        * momentarily decremented to 0, and thus the two adjoining ranges will
+        * be seamlessly merged.  (In a tie and both are in the set or both not
+        * in the set, it doesn't matter which we take first.) */
+       if (array_a[i_a] < array_b[i_b]
+           || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
+       {
+           cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
+           cp= array_a[i_a++];
+       }
+       else {
+           cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
+           cp= array_b[i_b++];
+       }
+
+       /* Here, have chosen which of the two inputs to look at.  Only output
+        * if the running count changes to/from 0, which marks the
+        * beginning/end of a range that's in the set */
+       if (cp_in_set) {
+           if (count == 0) {
+               array_u[i_u++] = cp;
+           }
+           count++;
+       }
+       else {
+           count--;
+           if (count == 0) {
+               array_u[i_u++] = cp;
+           }
+       }
+    }
+
+    /* Here, we are finished going through at least one of the lists, which
+     * means there is something remaining in at most one.  We check if the list
+     * that hasn't been exhausted is positioned such that we are in the middle
+     * of a range in its set or not.  (i_a and i_b point to the element beyond
+     * the one we care about.) If in the set, we decrement 'count'; if 0, there
+     * is potentially more to output.
+     * There are four cases:
+     * 1) Both weren't in their sets, count is 0, and remains 0.  What's left
+     *    in the union is entirely from the non-exhausted set.
+     * 2) Both were in their sets, count is 2.  Nothing further should
+     *    be output, as everything that remains will be in the exhausted
+     *    list's set, hence in the union; decrementing to 1 but not 0 ensures
+     *    that
+     * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
+     *    Nothing further should be output because the union includes
+     *    everything from the exhausted set.  Not decrementing ensures that.
+     * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
+     *    decrementing to 0 ensures that we look at the remainder of the
+     *    non-exhausted set */
+    if ((i_a != len_a && PREV_ELEMENT_IN_INVLIST_SET(i_a))
+       || (i_b != len_b && PREV_ELEMENT_IN_INVLIST_SET(i_b)))
+    {
+       count--;
+    }
+
+    /* The final length is what we've output so far, plus what else is about to
+     * be output.  (If 'count' is non-zero, then the input list we exhausted
+     * has everything remaining up to the machine's limit in its set, and hence
+     * in the union, so there will be no further output.) */
+    len_u = i_u;
+    if (count == 0) {
+       /* At most one of the subexpressions will be non-zero */
+       len_u += (len_a - i_a) + (len_b - i_b);
+    }
+
+    /* Set result to final length, which can change the pointer to array_u, so
+     * re-find it */
+    if (len_u != invlist_len(u)) {
+       invlist_set_len(u, len_u);
+       invlist_trim(u);
+       array_u = invlist_array(u);
+    }
+
+    /* When 'count' is 0, the list that was exhausted (if one was shorter than
+     * the other) ended with everything above it not in its set.  That means
+     * that the remaining part of the union is precisely the same as the
+     * non-exhausted list, so can just copy it unchanged.  (If both lists were
+     * exhausted at the same time, then the operations below will be both 0.)
+     */
+    if (count == 0) {
+       IV copy_count; /* At most one will have a non-zero copy count */
+       if ((copy_count = len_a - i_a) > 0) {
+           Copy(array_a + i_a, array_u + i_u, copy_count, UV);
+       }
+       else if ((copy_count = len_b - i_b) > 0) {
+           Copy(array_b + i_b, array_u + i_u, copy_count, UV);
+       }
+    }
+
+    return u;
+}
+
+STATIC HV*
+S_invlist_intersection(pTHX_ HV* const a, HV* const b)
+{
+    /* Return a new inversion list that is the intersection of 'a' and 'b';
+     * the arrays of 'a' and 'b' are only read.  The basis for this comes from
+     * "Unicode Demystified" Chapter 13 by Richard Gillam, published by
+     * Addison-Wesley, and explained at some length there.  The preface says
+     * to incorporate its examples into your code at your own risk.  In fact,
+     * it had bugs.
+     *
+     * The algorithm is like a merge sort, and is essentially the same as the
+     * union above. */
+
+    UV* array_a = invlist_array(a);   /* a's array */
+    UV* array_b = invlist_array(b);
+    UV len_a = invlist_len(a); /* length of a's array */
+    UV len_b = invlist_len(b);
+
+    HV* r;                  /* the resulting intersection */
+    UV* array_r;
+    UV len_r;
+
+    UV i_a = 0;                    /* current index into a's array */
+    UV i_b = 0;
+    UV i_r = 0;
+
+    /* Running count, as explained in the algorithm source book: the number of
+     * input lists whose current range is in its set.  It is incremented when
+     * we start a range that's in a set, and decremented when we start a range
+     * that's not.  In the main loop it stays within 0 to 2, and an element is
+     * output exactly when the count changes to or from 2, since only then
+     * does the intersection begin or end a range. */
+    UV count = 0;
+
+    PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
+
+    /* Size the intersection for the worst case: that the intersection ends up
+     * fragmenting everything to be completely disjoint */
+    r= _new_invlist(len_a + len_b);
+    array_r = invlist_array(r);
+
+    /* Go through each list item by item, stopping when one of them is
+     * exhausted */
+    while (i_a < len_a && i_b < len_b) {
+       UV cp;      /* The element to potentially add to the intersection's
+                      array */
+       bool cp_in_set; /* Does 'cp' begin a range in its input list's set */
+
+       /* We need to take one or the other of the two inputs for the
+        * intersection.  Since we are merging two sorted lists, we take the
+        * smaller of the next items.  In case of a tie, we take the one that
+        * is not in its set first (a difference from the union algorithm).  If
+        * we took one in the set first, it would increment the count, possibly
+        * to 2 which would cause it to be output as starting a range in the
+        * intersection, and the next time through we would take that same
+        * number, and output it again as ending the set.  By doing it the
+        * opposite of this, there is no possibility that the count will be
+        * momentarily incremented to 2.  (In a tie and both are in the set or
+        * both not in the set, it doesn't matter which we take first.) */
+       if (array_a[i_a] < array_b[i_b]
+           || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
+       {
+           cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
+           cp= array_a[i_a++];
+       }
+       else {
+           cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
+           cp= array_b[i_b++];
+       }
+
+       /* Here, have chosen which of the two inputs to look at.  Only output
+        * if the running count changes to/from 2, which marks the
+        * beginning/end of a range that's in the intersection */
+       if (cp_in_set) {
+           count++;
+           if (count == 2) {
+               array_r[i_r++] = cp;
+           }
+       }
+       else {
+           if (count == 2) {
+               array_r[i_r++] = cp;
+           }
+           count--;
+       }
+    }
+
+    /* Here, we are finished going through at least one of the lists, which
+     * means there is something remaining in at most one.  We check if the list
+     * that has been exhausted is positioned such that we are in the middle
+     * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
+     * the ones we care about.)  There are four cases:
+     * 1) Both weren't in their sets, count is 0, and remains 0.  There's
+     *    nothing left in the intersection.
+     * 2) Both were in their sets, count is 2 and is incremented to above 2
+     *    (hence the '>= 2' tests below).  What should be output is exactly
+     *    that which is in the non-exhausted set, as everything it has is also
+     *    in the intersection set, and everything it doesn't have can't be.
+     * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
+     *    gets incremented to 2.  Like the previous case, the intersection is
+     *    everything that remains in the non-exhausted set.
+     * 4) The exhausted wasn't in its set, non-exhausted is, count is 1, and
+     *    remains 1.  And the intersection has nothing more. */
+    if ((i_a == len_a && PREV_ELEMENT_IN_INVLIST_SET(i_a))
+       || (i_b == len_b && PREV_ELEMENT_IN_INVLIST_SET(i_b)))
+    {
+       count++;
+    }
+
+    /* The final length is what we've output so far plus what else is in the
+     * intersection.  At most one of the subexpressions below will be non-zero */
+    len_r = i_r;
+    if (count >= 2) {
+       len_r += (len_a - i_a) + (len_b - i_b);
+    }
+
+    /* Set the result to its final length and trim it; doing so can change the
+     * pointer to array_r, so re-find it */
+    if (len_r != invlist_len(r)) {
+       invlist_set_len(r, len_r);
+       invlist_trim(r);
+       array_r = invlist_array(r);
+    }
+
+    /* Copy whatever remains of the non-exhausted list to the result */
+    if (count >= 2) { /* At most one will have a non-zero copy count */
+       IV copy_count;
+       if ((copy_count = len_a - i_a) > 0) {
+           Copy(array_a + i_a, array_r + i_r, copy_count, UV);
+       }
+       else if ((copy_count = len_b - i_b) > 0) {
+           Copy(array_b + i_b, array_r + i_r, copy_count, UV);
+       }
+    }
+
+    return r;
+}
+
+STATIC HV*
+S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end)
+{
+    /* Add the range from 'start' to 'end' inclusive to the inversion list's
+     * set, and return a pointer to the resulting list.  This may actually be
+     * a new list, in which case the passed-in one has been destroyed, so
+     * callers must use the returned pointer.  The passed-in inversion list can
+     * be NULL, in which case a new one is created with just this range in it */
+
+    HV* range_invlist;
+    HV* added_invlist;
+    UV len;
+
+    if (invlist == NULL) {
+       invlist = _new_invlist(2);
+       len = 0;
+    }
+    else {
+       len = invlist_len(invlist);
+    }
+
+    /* If empty, or 'start' is at or after the final entry, can just append */
+    if (len == 0
+       || start >= invlist_array(invlist)
+                                   [invlist_len(invlist) - 1])
+    {
+       _append_range_to_invlist(invlist, start, end);
+       return invlist;
+    }
+
+    /* Here, can't just append things; create and return a new inversion list
+     * which is the union of this range and the existing inversion list */
+    range_invlist = _new_invlist(2);
+    _append_range_to_invlist(range_invlist, start, end);
+
+    added_invlist = invlist_union(invlist, range_invlist);
+
+    /* Free our temporary, and the passed-in list too unless it is the result */
+    invlist_destroy(range_invlist);
+    if (invlist != added_invlist) {
+       invlist_destroy(invlist);
+    }
+
+    return added_invlist;
+}
+
+PERL_STATIC_INLINE HV*
+S_add_cp_to_invlist(pTHX_ HV* invlist, const UV cp) {
+    return add_range_to_invlist(invlist, cp, cp); /* one code point == range cp..cp */
+}
+
+/* End of inversion list object */
+
+/*
+ - reg - regular expression, i.e. main body or parenthesized thing
+ *
+ * Caller must absorb opening parenthesis.
+ *
+ * Combining parenthesis handling with the base level of regular expression
+ * is a trifle forced, but the need to tie the tails of the branches to what
+ * follows makes it hard to avoid.
+ */
+#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
+#ifdef DEBUGGING
+#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
+#else /* ! DEBUGGING: REGTAIL_STUDY falls back to plain regtail() */
+#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
+#endif /* DEBUGGING */
+
+STATIC regnode *
+S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
+    /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
+{
+    dVAR;
+    register regnode *ret;             /* Will be the head of the group. */
+    register regnode *br;
+    register regnode *lastbr;
+    register regnode *ender = NULL;
+    register I32 parno = 0;
+    I32 flags;
+    U32 oregflags = RExC_flags;
+    bool have_branch = 0;
+    bool is_open = 0;
+    I32 freeze_paren = 0;
+    I32 after_freeze = 0;
+
+    /* for (?g), (?gc), and (?o) warnings; warning
+       about (?c) will warn about (?g) -- japhy    */
+
+#define WASTED_O  0x01
+#define WASTED_G  0x02
+#define WASTED_C  0x04
+#define WASTED_GC (0x02|0x04)
+    I32 wastedflags = 0x00;
+
+    char * parse_start = RExC_parse; /* MJD */
+    char * const oregcomp_parse = RExC_parse;
+
+    GET_RE_DEBUG_FLAGS_DECL;
+
+    PERL_ARGS_ASSERT_REG;
+    DEBUG_PARSE("reg ");
+
+    *flagp = 0;                                /* Tentatively. */
+
+
+    /* Make an OPEN node, if parenthesized. */
+    if (paren) {
+        if ( *RExC_parse == '*') { /* (*VERB:ARG) */
+           char *start_verb = RExC_parse;
+           STRLEN verb_len = 0;
+           char *start_arg = NULL;
+           unsigned char op = 0;
+           int argok = 1;
+           int internal_argval = 0; /* internal_argval is only useful if !argok */
+           while ( *RExC_parse && *RExC_parse != ')' ) {
+               if ( *RExC_parse == ':' ) {
+                   start_arg = RExC_parse + 1;
+                   break;
+               }
+               RExC_parse++;
+           }
+           ++start_verb;
+           verb_len = RExC_parse - start_verb;
+           if ( start_arg ) {
+               RExC_parse++;
+               while ( *RExC_parse && *RExC_parse != ')' ) 
+                   RExC_parse++;
+               if ( *RExC_parse != ')' ) 
+                   vFAIL("Unterminated verb pattern argument");
                 if ( RExC_parse == start_arg )
                     start_arg = NULL;
             } else {
@@ -5857,11 +6562,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     ret = reganode(pRExC_state,
                                    ((! FOLD)
                                      ? NREF
-                                    : (UNI_SEMANTICS)
-                                      ? NREFFU
-                                      : (LOC)
-                                        ? NREFFL
-                                        : NREFF),
+                                    : (MORE_ASCII_RESTRICTED)
+                                      ? NREFFA
+                                       : (AT_LEAST_UNI_SEMANTICS)
+                                         ? NREFFU
+                                         : (LOC)
+                                           ? NREFFL
+                                           : NREFF),
                                     num);
                      *flagp |= HASWIDTH;
  
@@ -5900,7 +6607,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     if (SIZE_ONLY) {
                         HE *he_str;
                         SV *sv_dat = NULL;
-                        if (!svname) /* shouldnt happen */
+                        if (!svname) /* shouldn't happen */
                              Perl_croak(aTHX_
                                  "panic: reg_scan_name returned NULL");
                          if (!RExC_paren_names) {
@@ -5946,6 +6653,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                              SvIV_set(sv_dat, 1);
                          }
  #ifdef DEBUGGING
+                       /* Yes this does cause a memory leak in debugging Perls */
                          if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
                              SvREFCNT_dec(svname);
  #endif
@@ -5957,6 +6665,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     goto capturing_parens;
                 }
                  RExC_seen |= REG_SEEN_LOOKBEHIND;
+               RExC_in_lookbehind++;
                 RExC_parse++;
             case '=':           /* (?=...) */
                 RExC_seen_zerolen++;
@@ -6289,10 +6998,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                                        that follow */
                  has_use_defaults = TRUE;
                  STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
-               if (RExC_utf8) {    /* But the default for a utf8 pattern is
-                                      unicode semantics */
-                   RExC_flags |= RXf_PMf_UNICODE;
-               }
+               set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
+                                               ? REGEX_UNICODE_CHARSET
+                                               : REGEX_DEPENDS_CHARSET);
                  goto parse_flags;
             default:
                 --RExC_parse;
@@ -6300,7 +7008,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
             {
                  U32 posflags = 0, negflags = 0;
                 U32 *flagsp = &posflags;
-                bool has_charset_modifier = 0;
+                char has_charset_modifier = '\0';
+               regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
+                                   ? REGEX_UNICODE_CHARSET
+                                   : REGEX_DEPENDS_CHARSET;
  
                 while (*RExC_parse) {
                     /* && strchr("iogcmsx", *RExC_parse) */
@@ -6309,41 +7020,78 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                      switch (*RExC_parse) {
                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
                      case LOCALE_PAT_MOD:
-                        if (has_charset_modifier || flagsp == &negflags) {
-                            goto fail_modifiers;
+                        if (has_charset_modifier) {
+                           goto excess_modifier;
+                       }
+                       else if (flagsp == &negflags) {
+                            goto neg_modifier;
                          }
-                        posflags |= RXf_PMf_LOCALE;
-                        negflags |= RXf_PMf_UNICODE;
-                        has_charset_modifier = 1;
+                       cs = REGEX_LOCALE_CHARSET;
+                        has_charset_modifier = LOCALE_PAT_MOD;
+                       RExC_contains_locale = 1;
                          break;
                      case UNICODE_PAT_MOD:
-                        if (has_charset_modifier || flagsp == &negflags) {
-                            goto fail_modifiers;
+                        if (has_charset_modifier) {
+                           goto excess_modifier;
+                       }
+                       else if (flagsp == &negflags) {
+                            goto neg_modifier;
                          }
-                        posflags |= RXf_PMf_UNICODE;
-                        negflags |= RXf_PMf_LOCALE;
-                        has_charset_modifier = 1;
+                       cs = REGEX_UNICODE_CHARSET;
+                        has_charset_modifier = UNICODE_PAT_MOD;
                          break;
-                    case DUAL_PAT_MOD:
-                        if (has_use_defaults
-                            || has_charset_modifier
-                            || flagsp == &negflags)
-                        {
+                    case ASCII_RESTRICT_PAT_MOD:
+                        if (flagsp == &negflags) {
+                            goto neg_modifier;
+                        }
+                        if (has_charset_modifier) {
+                            if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
+                                goto excess_modifier;
+                            }
+                           /* Doubled modifier implies more restricted */
+                            cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
+                        }
+                       else {
+                           cs = REGEX_ASCII_RESTRICTED_CHARSET;
+                       }
+                        has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
+                        break;
+                    case DEPENDS_PAT_MOD:
+                        if (has_use_defaults) {
                              goto fail_modifiers;
+                       }
+                       else if (flagsp == &negflags) {
+                            goto neg_modifier;
+                       }
+                       else if (has_charset_modifier) {
+                           goto excess_modifier;
                          }
  
                         /* The dual charset means unicode semantics if the
                          * pattern (or target, not known until runtime) are
-                        * utf8 */
-                       if (RExC_utf8) {
-                           posflags |= RXf_PMf_UNICODE;
-                           negflags |= RXf_PMf_LOCALE;
+                        * utf8, or something in the pattern indicates unicode
+                        * semantics */
+                       cs = (RExC_utf8 || RExC_uni_semantics)
+                            ? REGEX_UNICODE_CHARSET
+                            : REGEX_DEPENDS_CHARSET;
+                        has_charset_modifier = DEPENDS_PAT_MOD;
+                        break;
+                   excess_modifier:
+                       RExC_parse++;
+                       if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
+                           vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
+                       }
+                       else if (has_charset_modifier == *(RExC_parse - 1)) {
+                           vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
                         }
                         else {
-                           negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+                           vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
                         }
-                        has_charset_modifier = 1;
-                        break;
+                       /*NOTREACHED*/
+                   neg_modifier:
+                       RExC_parse++;
+                       vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
+                       /*NOTREACHED*/
                      case ONCE_PAT_MOD: /* 'o' */
                      case GLOBAL_PAT_MOD: /* 'g' */
                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
@@ -6402,9 +7150,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                      case ')':
                          RExC_flags |= posflags;
                          RExC_flags &= ~negflags;
+                       set_regex_charset(&RExC_flags, cs);
                          if (paren != ':') {
                              oregflags |= posflags;
                              oregflags &= ~negflags;
+                           set_regex_charset(&oregflags, cs);
                          }
                          nextchar(pRExC_state);
                         if (paren != ':') {
@@ -6455,12 +7205,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
      parse_start = RExC_parse;   /* MJD */
      br = regbranch(pRExC_state, &flags, 1,depth+1);
  
-    if (freeze_paren) {
-        if (RExC_npar > after_freeze)
-            after_freeze = RExC_npar;
-        RExC_npar = freeze_paren;
-    }
-
      /*     branch_len = (paren != 0); */
  
      if (br == NULL)
@@ -6600,7 +7344,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
             FAIL("Junk on end of regexp");      /* "Can't happen". */
         /* NOTREACHED */
      }
-    if (after_freeze)
+
+    if (RExC_in_lookbehind) {
+       RExC_in_lookbehind--;
+    }
+    if (after_freeze > RExC_npar)
          RExC_npar = after_freeze;
      return(ret);
  }
@@ -6694,7 +7442,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
      const char * const origparse = RExC_parse;
      I32 min;
      I32 max = REG_INFTY;
+#ifdef RE_TRACK_PATTERN_OFFSETS
      char *parse_start;
+#endif
      const char *maxpos = NULL;
      GET_RE_DEBUG_FLAGS_DECL;
  
@@ -6713,7 +7463,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
      if (op == '{' && regcurly(RExC_parse)) {
         maxpos = NULL;
+#ifdef RE_TRACK_PATTERN_OFFSETS
          parse_start = RExC_parse; /* MJD */
+#endif
         next = RExC_parse + 1;
         while (isDIGIT(*next) || *next == ',') {
             if (*next == ',') {
@@ -6809,7 +7561,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
        vFAIL("Regexp *+ operand could be empty");
  #endif
  
+#ifdef RE_TRACK_PATTERN_OFFSETS
      parse_start = RExC_parse;
+#endif
      nextchar(pRExC_state);
  
      *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
@@ -6873,7 +7627,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  }
  
  
-/* reg_namedseq(pRExC_state,UVp)
+/* reg_namedseq(pRExC_state,UVp, UV depth)
     
     This is expected to be called by a parser routine that has 
     recognized '\N' and needs to handle the rest. RExC_parse is
@@ -6916,13 +7670,10 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     Parsing failures will generate a fatal error via vFAIL(...)
   */
  STATIC regnode *
-S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
+S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
  {
      char * endbrace;    /* '}' following the name */
      regnode *ret = NULL;
-#ifdef DEBUGGING
-    char* parse_start = RExC_parse - 2;            /* points to the '\N' */
-#endif
      char* p;
  
      GET_RE_DEBUG_FLAGS_DECL;
@@ -7035,117 +7786,55 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
          ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
      }
      else {     /* Not a char class */
-       char *s;            /* String to put in generated EXACT node */
-       STRLEN len = 0;     /* Its current byte length */
+
+       /* What is done here is to convert this to a sub-pattern of the form
+        * (?:\x{char1}\x{char2}...)
+        * and then call reg recursively.  That way, it retains its atomicness,
+        * while not having to worry about special handling that some code
+        * points may have.  toke.c has converted the original Unicode values
+        * to native, so that we can just pass on the hex values unchanged.  We
+        * do have to set a flag to keep recoding from happening in the
+        * recursion */
+
+       SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
+       STRLEN len;
         char *endchar;      /* Points to '.' or '}' ending cur char in the input
                                stream */
+       char *orig_end = RExC_end;
  
-       ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
-                                                  : (LOC)
-                                                     ? EXACTFL
-                                                     : UNI_SEMANTICS
-                                                       ? EXACTFU
-                                                       : EXACTF));
-       s= STRING(ret);
-
-       /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
-        * the input which is of the form now 'c1.c2.c3...}' until find the
-        * ending brace or exceed length 255.  The characters that exceed this
-        * limit are dropped.  The limit could be relaxed should it become
-        * desirable by reparsing this as (?:\N{NAME}), so could generate
-        * multiple EXACT nodes, as is done for just regular input.  But this
-        * is primarily a named character, and not intended to be a huge long
-        * string, so 255 bytes should be good enough */
-       while (1) {
-           STRLEN length_of_hex;
-           I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
-                           | PERL_SCAN_DISALLOW_PREFIX
-                           | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
-           UV cp;  /* Ord of current character */
+       while (RExC_parse < endbrace) {
  
             /* Code points are separated by dots.  If none, there is only one
              * code point, and is terminated by the brace */
             endchar = RExC_parse + strcspn(RExC_parse, ".}");
  
-           /* The values are Unicode even on EBCDIC machines */
-           length_of_hex = (STRLEN)(endchar - RExC_parse);
-           cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
-           if ( length_of_hex == 0 
-               || length_of_hex != (STRLEN)(endchar - RExC_parse) )
-           {
-               RExC_parse += length_of_hex;        /* Includes all the valid */
-               RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
-                               ? UTF8SKIP(RExC_parse)
-                               : 1;
-               /* Guard against malformed utf8 */
-               if (RExC_parse >= endchar) RExC_parse = endchar;
-               vFAIL("Invalid hexadecimal number in \\N{U+...}");
-           }    
-
-           if (! FOLD) {       /* Not folding, just append to the string */
-               STRLEN unilen;
-
-               /* Quit before adding this character if would exceed limit */
-               if (len + UNISKIP(cp) > U8_MAX) break;
-
-               unilen = reguni(pRExC_state, cp, s);
-               if (unilen > 0) {
-                   s   += unilen;
-                   len += unilen;
-               }
-           } else {    /* Folding, output the folded equivalent */
-               STRLEN foldlen,numlen;
-               U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
-               cp = toFOLD_uni(cp, tmpbuf, &foldlen);
-
-               /* Quit before exceeding size limit */
-               if (len + foldlen > U8_MAX) break;
-               
-               for (foldbuf = tmpbuf;
-                   foldlen;
-                   foldlen -= numlen) 
-               {
-                   cp = utf8_to_uvchr(foldbuf, &numlen);
-                   if (numlen > 0) {
-                       const STRLEN unilen = reguni(pRExC_state, cp, s);
-                       s       += unilen;
-                       len     += unilen;
-                       /* In EBCDIC the numlen and unilen can differ. */
-                       foldbuf += numlen;
-                       if (numlen >= foldlen)
-                           break;
-                   }
-                   else
-                       break; /* "Can't happen." */
-               }                          
-           }
+           /* Convert to notation the rest of the code understands */
+           sv_catpv(substitute_parse, "\\x{");
+           sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
+           sv_catpv(substitute_parse, "}");
  
             /* Point to the beginning of the next character in the sequence. */
             RExC_parse = endchar + 1;
-
-           /* Quit if no more characters */
-           if (RExC_parse >= endbrace) break;
         }
+       sv_catpv(substitute_parse, ")");
  
+       RExC_parse = SvPV(substitute_parse, len);
  
-       if (SIZE_ONLY) {
-           if (RExC_parse < endbrace) {
-               ckWARNreg(RExC_parse - 1,
-                         "Using just the first characters returned by \\N{}");
-           }
-
-           RExC_size += STR_SZ(len);
-       } else {
-           STR_LEN(ret) = len;
-           RExC_emit += STR_SZ(len);
+       /* Don't allow empty number */
+       if (len < 8) {
+           vFAIL("Invalid hexadecimal number in \\N{U+...}");
         }
+       RExC_end = RExC_parse + len;
  
-       RExC_parse = endbrace + 1;
+       /* The values are Unicode, and therefore not subject to recoding */
+       RExC_override_recoding = 1;
+
+       ret = reg(pRExC_state, 1, flagp, depth+1);
+
+       RExC_parse = endbrace;
+       RExC_end = orig_end;
+       RExC_override_recoding = 0;
  
-       *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
-                              with malformed in t/re/pat_advanced.t */
-       RExC_parse --;
-       Set_Node_Cur_Length(ret); /* MJD */
         nextchar(pRExC_state);
      }
  
@@ -7203,7 +7892,7 @@ S_reg_recode(pTHX_ const char value, SV **encp)
  
     Note: we have to be careful with escapes, as they can be both literal
     and special, and in the case of \10 and friends can either, depending
-   on context. Specifically there are two seperate switches for handling
+   on context. Specifically there are two separate switches for handling
     escape sequences, with the one for handling literal escapes requiring
     a dummy entry for all of the special escapes that are actually handled
     by the other.
@@ -7216,6 +7905,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
      register regnode *ret = NULL;
      I32 flags;
      char *parse_start = RExC_parse;
+    U8 op;
      GET_RE_DEBUG_FLAGS_DECL;
      DEBUG_PARSE("atom");
      *flagp = WORST;            /* Tentatively. */
@@ -7307,27 +7997,6 @@ tryagain:
         RExC_parse++;
         vFAIL("Quantifier follows nothing");
         break;
-    case LATIN_SMALL_LETTER_SHARP_S:
-    case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
-    case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
-#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
-#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ.  Other instances in this code should have the case statement below.
-    case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
-#endif
-        do_foldchar:
-        if (!LOC && FOLD) {
-            U32 len,cp;
-           len=0; /* silence a spurious compiler warning */
-            if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
-                *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
-                RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
-                ret = reganode(pRExC_state, FOLDCHAR, cp);
-                Set_Node_Length(ret, 1); /* MJD */
-                nextchar(pRExC_state); /* kill whitespace under /x */
-                return ret;
-            }
-        }
-        goto outer_default;
      case '\\':
         /* Special Escapes
  
@@ -7342,10 +8011,6 @@ tryagain:
            literal text handling code.
         */
         switch ((U8)*++RExC_parse) {
-       case LATIN_SMALL_LETTER_SHARP_S:
-       case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
-       case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
-                  goto do_foldchar;        
         /* Special Escapes */
         case 'A':
             RExC_seen_zerolen++;
@@ -7387,69 +8052,179 @@ tryagain:
             *flagp |= HASWIDTH;
             goto finish_meta_pat;
         case 'w':
-           if (LOC) {
-                ret = reg_node(pRExC_state, (U8)(ALNUML));
-            } else {
-                ret = reg_node(pRExC_state, (U8)(ALNUM));
-                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = ALNUML;
+                   break;
+               case REGEX_UNICODE_CHARSET:
+                   op = ALNUMU;
+                   break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+               case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+                   op = ALNUMA;
+                   break;
+               case REGEX_DEPENDS_CHARSET:
+                   op = ALNUM;
+                   break;
+               default:
+                   goto bad_charset;
              }
+           ret = reg_node(pRExC_state, op);
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
         case 'W':
-            if (LOC) {
-                ret = reg_node(pRExC_state, (U8)(NALNUML));
-            } else {
-                ret = reg_node(pRExC_state, (U8)(NALNUM));
-                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = NALNUML;
+                   break;
+               case REGEX_UNICODE_CHARSET:
+                   op = NALNUMU;
+                   break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+               case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+                   op = NALNUMA;
+                   break;
+               case REGEX_DEPENDS_CHARSET:
+                   op = NALNUM;
+                   break;
+               default:
+                   goto bad_charset;
              }
+           ret = reg_node(pRExC_state, op);
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
         case 'b':
             RExC_seen_zerolen++;
             RExC_seen |= REG_SEEN_LOOKBEHIND;
-            if (LOC) {
-                ret = reg_node(pRExC_state, (U8)(BOUNDL));
-            } else {
-                ret = reg_node(pRExC_state, (U8)(BOUND));
-                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = BOUNDL;
+                   break;
+               case REGEX_UNICODE_CHARSET:
+                   op = BOUNDU;
+                   break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+               case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+                   op = BOUNDA;
+                   break;
+               case REGEX_DEPENDS_CHARSET:
+                   op = BOUND;
+                   break;
+               default:
+                   goto bad_charset;
              }
+           ret = reg_node(pRExC_state, op);
+           FLAGS(ret) = get_regex_charset(RExC_flags);
             *flagp |= SIMPLE;
+           if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
+               ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
+           }
             goto finish_meta_pat;
         case 'B':
             RExC_seen_zerolen++;
             RExC_seen |= REG_SEEN_LOOKBEHIND;
-            if (LOC) {
-                ret = reg_node(pRExC_state, (U8)(NBOUNDL));
-            } else {
-                ret = reg_node(pRExC_state, (U8)(NBOUND));
-                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = NBOUNDL;
+                   break;
+               case REGEX_UNICODE_CHARSET:
+                   op = NBOUNDU;
+                   break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+               case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+                   op = NBOUNDA;
+                   break;
+               case REGEX_DEPENDS_CHARSET:
+                   op = NBOUND;
+                   break;
+               default:
+                   goto bad_charset;
              }
+           ret = reg_node(pRExC_state, op);
+           FLAGS(ret) = get_regex_charset(RExC_flags);
             *flagp |= SIMPLE;
+           if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
+               ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
+           }
             goto finish_meta_pat;
         case 's':
-            if (LOC) {
-                ret = reg_node(pRExC_state, (U8)(SPACEL));
-            } else {
-                ret = reg_node(pRExC_state, (U8)(SPACE));
-                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = SPACEL;
+                   break;
+               case REGEX_UNICODE_CHARSET:
+                   op = SPACEU;
+                   break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+               case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+                   op = SPACEA;
+                   break;
+               case REGEX_DEPENDS_CHARSET:
+                   op = SPACE;
+                   break;
+               default:
+                   goto bad_charset;
              }
+           ret = reg_node(pRExC_state, op);
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
         case 'S':
-            if (LOC) {
-                ret = reg_node(pRExC_state, (U8)(NSPACEL));
-            } else {
-                ret = reg_node(pRExC_state, (U8)(NSPACE));
-                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = NSPACEL;
+                   break;
+               case REGEX_UNICODE_CHARSET:
+                   op = NSPACEU;
+                   break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+               case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+                   op = NSPACEA;
+                   break;
+               case REGEX_DEPENDS_CHARSET:
+                   op = NSPACE;
+                   break;
+               default:
+                   goto bad_charset;
              }
+           ret = reg_node(pRExC_state, op);
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
         case 'd':
-           ret = reg_node(pRExC_state, DIGIT);
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = DIGITL;
+                   break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+               case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+                   op = DIGITA;
+                   break;
+               case REGEX_DEPENDS_CHARSET: /* No difference between these */
+               case REGEX_UNICODE_CHARSET:
+                   op = DIGIT;
+                   break;
+               default:
+                   goto bad_charset;
+            }
+           ret = reg_node(pRExC_state, op);
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
         case 'D':
-           ret = reg_node(pRExC_state, NDIGIT);
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = NDIGITL;
+                   break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+               case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+                   op = NDIGITA;
+                   break;
+               case REGEX_DEPENDS_CHARSET: /* No difference between these */
+               case REGEX_UNICODE_CHARSET:
+                   op = NDIGIT;
+                   break;
+               default:
+                   goto bad_charset;
+            }
+           ret = reg_node(pRExC_state, op);
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
         case 'R':
@@ -7518,7 +8293,7 @@ tryagain:
              Also this makes sure that things like /\N{BLAH}+/ and 
              \N{BLAH} being multi char Just Happen. dmq*/
              ++RExC_parse;
-            ret= reg_namedseq(pRExC_state, NULL, flagp); 
+            ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
              break;
         case 'k':    /* Handle \k<NAME> and \k'NAME' */
         parse_named_seq:
@@ -7548,11 +8323,13 @@ tryagain:
                  ret = reganode(pRExC_state,
                                 ((! FOLD)
                                   ? NREF
-                                 : (UNI_SEMANTICS)
-                                   ? NREFFU
-                                   : (LOC)
-                                     ? NREFFL
-                                     : NREFF),
+                                : (MORE_ASCII_RESTRICTED)
+                                  ? NREFFA
+                                   : (AT_LEAST_UNI_SEMANTICS)
+                                     ? NREFFU
+                                     : (LOC)
+                                       ? NREFFL
+                                       : NREFF),
                                  num);
                  *flagp |= HASWIDTH;
  
@@ -7616,11 +8393,13 @@ tryagain:
                     ret = reganode(pRExC_state,
                                    ((! FOLD)
                                      ? REF
-                                    : (UNI_SEMANTICS)
-                                      ? REFFU
-                                      : (LOC)
-                                        ? REFFL
-                                        : REFF),
+                                    : (MORE_ASCII_RESTRICTED)
+                                      ? REFFA
+                                       : (AT_LEAST_UNI_SEMANTICS)
+                                         ? REFFU
+                                         : (LOC)
+                                           ? REFFL
+                                           : REFF),
                                     num);
                     *flagp |= HASWIDTH;
  
@@ -7652,27 +8431,42 @@ tryagain:
         /* FALL THROUGH */
  
      default:
-        outer_default:{
+
+            parse_start = RExC_parse - 1;
+
+           RExC_parse++;
+
+       defchar: {
+           typedef enum {
+               generic_char = 0,
+               char_s,
+               upsilon_1,
+               upsilon_2,
+               iota_1,
+               iota_2,
+           } char_state;
+           char_state latest_char_state = generic_char;
             register STRLEN len;
             register UV ender;
             register char *p;
             char *s;
             STRLEN foldlen;
             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
+           regnode * orig_emit;
  
-            parse_start = RExC_parse - 1;
-
-           RExC_parse++;
-
-       defchar:
             ender = 0;
+           orig_emit = RExC_emit; /* Save the original output node position in
+                                     case we need to output a different node
+                                     type */
             ret = reg_node(pRExC_state,
                            (U8) ((! FOLD) ? EXACT
                                           : (LOC)
                                              ? EXACTFL
-                                            : (UNI_SEMANTICS)
-                                              ? EXACTFU
-                                              : EXACTF)
+                                            : (MORE_ASCII_RESTRICTED)
+                                              ? EXACTFA
+                                              : (AT_LEAST_UNI_SEMANTICS)
+                                                ? EXACTFU
+                                                : EXACTF)
                     );
             s = STRING(ret);
             for (len = 0, p = RExC_parse - 1;
@@ -7684,11 +8478,6 @@ tryagain:
                 if (RExC_flags & RXf_PMf_EXTENDED)
                     p = regwhite( pRExC_state, p );
                 switch ((U8)*p) {
-               case LATIN_SMALL_LETTER_SHARP_S:
-               case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
-               case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
-                          if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
-                               goto normal_default;
                 case '^':
                 case '$':
                 case '.':
@@ -7713,11 +8502,6 @@ tryagain:
  
                     switch ((U8)*++p) {
                     /* These are all the special escapes. */
-                    case LATIN_SMALL_LETTER_SHARP_S:
-                    case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
-                    case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
-                          if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
-                               goto normal_default;                
                     case 'A':             /* Start assertion */
                     case 'b': case 'B':   /* Word-boundary assertion*/
                     case 'C':             /* Single char !DANGEROUS! */
@@ -7821,7 +8605,7 @@ tryagain:
                         break;
                     case 'c':
                         p++;
-                       ender = grok_bslash_c(*p++, SIZE_ONLY);
+                       ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
                         break;
                     case '0': case '1': case '2': case '3':case '4':
                     case '5': case '6': case '7': case '8':case '9':
@@ -7844,7 +8628,7 @@ tryagain:
                             goto recode_encoding;
                         break;
                     recode_encoding:
-                       {
+                       if (! RExC_override_recoding) {
                             SV* enc = PL_encoding;
                             ender = reg_recode((const char)(U8)ender, &enc);
                             if (!enc && SIZE_ONLY)
@@ -7857,8 +8641,13 @@ tryagain:
                             FAIL("Trailing \\");
                         /* FALL THROUGH */
                     default:
-                       if (!SIZE_ONLY&& isALPHA(*p))
-                           ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
+                       if (!SIZE_ONLY&& isALPHA(*p)) {
+                           /* Include any { following the alpha to emphasize
+                            * that it could be part of an escape at some point
+                            * in the future */
+                           int len = (*(p + 1) == '{') ? 2 : 1;
+                           ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
+                       }
                         goto normal_default;
                     }
                     break;
@@ -7871,14 +8660,281 @@ tryagain:
                         p += numlen;
                     }
                     else
-                       ender = *p++;
+                       ender = (U8) *p++;
                     break;
+               } /* End of switch on the literal */
+
+               /* Certain characters are problematic because their folded
+                * length is so different from their original length that it
+                * isn't handleable by the optimizer.  They are therefore not
+                * placed in an EXACTish node; and are here handled specially.
+                * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
+                * putting it in a special node keeps regexec from having to
+               * deal with a non-utf8 multi-char fold) */
+               if (FOLD
+                   && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC)))
+               {
+                   /* We look for either side of the fold.  For example \xDF
+                    * folds to 'ss'.  We look for both the single character
+                    * \xDF and the sequence 'ss'.  When we find something that
+                    * could be one of those, we stop and flush whatever we
+                    * have output so far into the EXACTish node that was being
+                    * built.  Then restore the input pointer to what it was.
+                    * regatom will return that EXACT node, and will be called
+                    * again, positioned so the first character is the one in
+                    * question, which we return in a different node type.
+                    * The multi-char folds are a sequence, so the occurrence
+                    * of the first character in that sequence doesn't
+                    * necessarily mean that what follows is the rest of the
+                    * sequence.  We keep track of that with a state machine,
+                    * with the state being set to the latest character
+                    * processed before the current one.  Most characters will
+                    * set the state to 0, but if one occurs that is part of a
+                    * potential tricky fold sequence, the state is set to that
+                    * character, and the next loop iteration sees if the state
+                    * should progress towards the final folded-from character,
+                    * or if it was a false alarm.  If it turns out to be a
+                    * false alarm, the character(s) will be output in a new
+                    * EXACTish node, and join_exact() will later combine them.
+                    * In the case of the 'ss' sequence, which is more common
+                    * and more easily checked, some look-ahead is done to
+                    * save time by ruling-out some false alarms */
+                   switch (ender) {
+                       default:
+                           latest_char_state = generic_char;
+                           break;
+                       case 's':
+                       case 'S':
+                       case 0x17F: /* LATIN SMALL LETTER LONG S */
+                            if (AT_LEAST_UNI_SEMANTICS) {
+                               if (latest_char_state == char_s) {  /* 'ss' */
+                                   ender = LATIN_SMALL_LETTER_SHARP_S;
+                                   goto do_tricky;
+                               }
+                               else if (p < RExC_end) {
+
+                                   /* Look-ahead at the next character.  If it
+                                    * is also an s, we handle as a sharp s
+                                    * tricky regnode.  */
+                                   if (*p == 's' || *p == 'S') {
+
+                                       /* But first flush anything in the
+                                        * EXACTish buffer */
+                                       if (len != 0) {
+                                           p = oldp;
+                                           goto loopdone;
+                                       }
+                                       p++;    /* Account for swallowing this
+                                                  's' up */
+                                       ender = LATIN_SMALL_LETTER_SHARP_S;
+                                       goto do_tricky;
+                                   }
+                                       /* Here, the next character is not a
+                                        * literal 's', but still could
+                                        * evaluate to one if part of a \o{},
+                                        * \x or \OCTAL-DIGIT.  The minimum
+                                        * length required for that is 4, eg
+                                        * \x53 or \123 */
+                                   else if (*p == '\\'
+                                            && p < RExC_end - 4
+                                            && (isDIGIT(*(p + 1))
+                                                || *(p + 1) == 'x'
+                                                || *(p + 1) == 'o' ))
+                                   {
+
+                                       /* Here, it could be an 's', but it is
+                                        * too much bother to work that out now.
+                                        * Flush the buffer, if any; when we come
+                                        * back here, set the state to record
+                                        * that the previous char was an 's' */
+                                       if (len != 0) {
+                                           latest_char_state = generic_char;
+                                           p = oldp;
+                                           goto loopdone;
+                                       }
+                                       latest_char_state = char_s;
+                                       break;
+                                   }
+                               }
+                           }
+
+                           /* Here, can't be an 'ss' sequence, or at least not
+                            * one that could fold to/from the sharp ss */
+                           latest_char_state = generic_char;
+                           break;
+                       case 0x03C5:    /* First char in upsilon series */
+                       case 0x03A5:    /* Also capital UPSILON, which folds to
+                                          03C5, and hence exhibits the same
+                                          problem */
+                           if (p < RExC_end - 4) { /* Need >= 4 bytes left */
+                               latest_char_state = upsilon_1;
+                               if (len != 0) {
+                                   p = oldp;
+                                   goto loopdone;
+                               }
+                           }
+                           else {
+                               latest_char_state = generic_char;
+                           }
+                           break;
+                       case 0x03B9:    /* First char in iota series */
+                       case 0x0399:    /* Also capital IOTA */
+                       case 0x1FBE:    /* GREEK PROSGEGRAMMENI folds to 3B9 */
+                       case 0x0345:    /* COMBINING GREEK YPOGEGRAMMENI folds
+                                          to 3B9 */
+                           if (p < RExC_end - 4) {
+                               latest_char_state = iota_1;
+                               if (len != 0) {
+                                   p = oldp;
+                                   goto loopdone;
+                               }
+                           }
+                           else {
+                               latest_char_state = generic_char;
+                           }
+                           break;
+                       case 0x0308:
+                           if (latest_char_state == upsilon_1) {
+                               latest_char_state = upsilon_2;
+                           }
+                           else if (latest_char_state == iota_1) {
+                               latest_char_state = iota_2;
+                           }
+                           else {
+                               latest_char_state = generic_char;
+                           }
+                           break;
+                       case 0x301:
+                           if (latest_char_state == upsilon_2) {
+                               ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS;
+                               goto do_tricky;
+                           }
+                           else if (latest_char_state == iota_2) {
+                               ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS;
+                               goto do_tricky;
+                           }
+                           latest_char_state = generic_char;
+                           break;
+
+                       /* These are the tricky fold characters.  Flush any
+                        * buffer first.  (When adding to this list, also add
+                        * them to fold_grind.t to make sure they get tested) */
+                       case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS:
+                       case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS:
+                       case LATIN_SMALL_LETTER_SHARP_S:
+                       case LATIN_CAPITAL_LETTER_SHARP_S:
+                       case 0x1FD3: /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA */
+                       case 0x1FE3: /* GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA */
+                           if (len != 0) {
+                               p = oldp;
+                               goto loopdone;
+                           }
+                           /* FALL THROUGH */
+                       do_tricky: {
+                           char* const oldregxend = RExC_end;
+                           U8 tmpbuf[UTF8_MAXBYTES+1];
+
+                           /* Here, we know we need to generate a special
+                            * regnode, and 'ender' contains the tricky
+                            * character.  What's done is to pretend it's in a
+                            * [bracketed] class, and let the code that deals
+                            * with those handle it, as that code has all the
+                            * intelligence necessary.  First save the current
+                            * parse state, get rid of the already allocated
+                            * but empty EXACT node that the ANYOFV node will
+                            * replace, and point the parse to a buffer which
+                            * we fill with the character we want the regclass
+                            * code to think is being parsed */
+                           RExC_emit = orig_emit;
+                           RExC_parse = (char *) tmpbuf;
+                           if (UTF) {
+                               U8 *d = uvchr_to_utf8(tmpbuf, ender);
+                               *d = '\0';
+                               RExC_end = (char *) d;
+                           }
+                           else {  /* ender above 255 already excluded */
+                               tmpbuf[0] = (U8) ender;
+                               tmpbuf[1] = '\0';
+                               RExC_end = RExC_parse + 1;
+                           }
+
+                           ret = regclass(pRExC_state,depth+1);
+
+                           /* Here, have parsed the buffer.  Reset the parse to
+                            * the actual input, and return */
+                           RExC_end = oldregxend;
+                           RExC_parse = p - 1;
+
+                           Set_Node_Offset(ret, RExC_parse);
+                           Set_Node_Cur_Length(ret);
+                           nextchar(pRExC_state);
+                           *flagp |= HASWIDTH|SIMPLE;
+                           return ret;
+                       }
+                   }
                 }
+
                 if ( RExC_flags & RXf_PMf_EXTENDED)
                     p = regwhite( pRExC_state, p );
                 if (UTF && FOLD) {
-                   /* Prime the casefolded buffer. */
-                   ender = toFOLD_uni(ender, tmpbuf, &foldlen);
+                   /* Prime the casefolded buffer.  Locale rules, which apply
+                    * only to code points < 256, aren't known until execution,
+                    * so for them, just output the original character using
+                    * utf8 */
+                   if (LOC && ender < 256) {
+                       if (UNI_IS_INVARIANT(ender)) {
+                           *tmpbuf = (U8) ender;
+                           foldlen = 1;
+                       } else {
+                           *tmpbuf = UTF8_TWO_BYTE_HI(ender);
+                           *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
+                           foldlen = 2;
+                       }
+                   }
+                   else if (isASCII(ender)) {  /* Note: Here can't also be LOC
+                                                */
+                       ender = toLOWER(ender);
+                       *tmpbuf = (U8) ender;
+                       foldlen = 1;
+                   }
+                   else if (! MORE_ASCII_RESTRICTED && ! LOC) {
+
+                       /* Locale and /aa require more selectivity about the
+                        * fold, so are handled below.  Otherwise, here, just
+                        * use the fold */
+                       ender = toFOLD_uni(ender, tmpbuf, &foldlen);
+                   }
+                   else {
+                       /* Under locale rules or /aa we are not to mix,
+                        * respectively, ords < 256 or ASCII with non-.  So
+                        * reject folds that mix them, using only the
+                        * non-folded code point.  So do the fold to a
+                        * temporary, and inspect each character in it. */
+                       U8 trialbuf[UTF8_MAXBYTES_CASE+1];
+                       U8* s = trialbuf;
+                       UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
+                       U8* e = s + foldlen;
+                       bool fold_ok = TRUE;
+
+                       while (s < e) {
+                           if (isASCII(*s)
+                               || (LOC && (UTF8_IS_INVARIANT(*s)
+                                          || UTF8_IS_DOWNGRADEABLE_START(*s))))
+                           {
+                               fold_ok = FALSE;
+                               break;
+                           }
+                           s += UTF8SKIP(s);
+                       }
+                       if (fold_ok) {
+                           Copy(trialbuf, tmpbuf, foldlen, U8);
+                           ender = tmpender;
+                       }
+                       else {
+                           uvuni_to_utf8(tmpbuf, ender);
+                           foldlen = UNISKIP(ender);
+                       }
+                   }
                 }
                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
                     if (len)
@@ -7950,10 +9006,12 @@ tryagain:
                      }
                      len--;
                 }
-               else
+               else {
                     REGC((char)ender, s++);
+               }
             }
-       loopdone:
+       loopdone:   /* Jumped to when encounters something that shouldn't be in
+                      the node */
             RExC_parse = p - 1;
              Set_Node_Cur_Length(ret); /* MJD */
             nextchar(pRExC_state);
@@ -7979,6 +9037,11 @@ tryagain:
      }
  
      return(ret);
+
+/* Jumped to when an unrecognized character set is encountered */
+bad_charset:
+    Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
+    return(NULL);
  }
  
  STATIC char *
@@ -8166,125 +9229,213 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
      }
  }
  
-/* No locale test */
-#define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
-ANYOF_##NAME:                                           \
-       for (value = 0; value < 256; value++)           \
-           if (TEST)                                   \
-               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
-    yesno = '+';                                        \
-    what = WORD;                                        \
-    break;                                              \
-case ANYOF_N##NAME:                                     \
-       for (value = 0; value < 256; value++)           \
-           if (!TEST)                                  \
-               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
-    yesno = '!';                                        \
-    what = WORD;                                        \
+/* No locale test, and always Unicode semantics */
+#define _C_C_T_NOLOC_(NAME,TEST,WORD)                                          \
+ANYOF_##NAME:                                                                  \
+       for (value = 0; value < 256; value++)                                  \
+           if (TEST)                                                          \
+           stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
+    yesno = '+';                                                               \
+    what = WORD;                                                               \
+    break;                                                                     \
+case ANYOF_N##NAME:                                                            \
+       for (value = 0; value < 256; value++)                                  \
+           if (!TEST)                                                         \
+           stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
+    yesno = '!';                                                               \
+    what = WORD;                                                               \
      break
  
  /* Like the above, but there are differences if we are in uni-8-bit or not, so
   * there are two tests passed in, to use depending on that. There aren't any
   * cases where the label is different from the name, so no need for that
   * parameter */
-#define _C_C_T_(NAME,TEST_8,TEST_7,WORD)       \
-ANYOF_##NAME:                                           \
-    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);        \
-    else if (UNI_SEMANTICS) {                           \
-        for (value = 0; value < 256; value++) {         \
-            if (TEST_8) stored +=                       \
-                      S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
-        }                                               \
-    }                                                   \
-    else {                                              \
-        for (value = 0; value < 128; value++) {         \
-            if (TEST_7) stored +=                       \
-                       S_set_regclass_bit(aTHX_ pRExC_state, ret, UNI_TO_NATIVE(value)); \
-        }                                               \
-    }                                                   \
-    yesno = '+';                                        \
-    what = WORD;                                        \
-    break;                                              \
-case ANYOF_N##NAME:                                     \
-    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);       \
-    else if (UNI_SEMANTICS) {                           \
-        for (value = 0; value < 256; value++) {         \
-            if (! TEST_8) stored +=                     \
-                        S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
-        }                                               \
-    }                                                   \
-    else {                                              \
-        for (value = 0; value < 128; value++) {         \
-            if (! TEST_7) stored +=                     \
-                        S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
-        }                                               \
-        for (value = 128; value < 256; value++) {         \
-                        S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
-        }                                               \
-    }                                                   \
-    yesno = '!';                                        \
-    what = WORD;                                        \
+#define _C_C_T_(NAME, TEST_8, TEST_7, WORD)                                    \
+ANYOF_##NAME:                                                                  \
+    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);                               \
+    else if (UNI_SEMANTICS) {                                                  \
+        for (value = 0; value < 256; value++) {                                \
+            if (TEST_8(value)) stored +=                                       \
+                      set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
+        }                                                                      \
+    }                                                                          \
+    else {                                                                     \
+        for (value = 0; value < 128; value++) {                                \
+            if (TEST_7(UNI_TO_NATIVE(value))) stored +=                        \
+               set_regclass_bit(pRExC_state, ret,                     \
+                                  (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);                 \
+        }                                                                      \
+    }                                                                          \
+    yesno = '+';                                                               \
+    what = WORD;                                                               \
+    break;                                                                     \
+case ANYOF_N##NAME:                                                            \
+    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);                              \
+    else if (UNI_SEMANTICS) {                                                  \
+        for (value = 0; value < 256; value++) {                                \
+            if (! TEST_8(value)) stored +=                                     \
+                   set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);    \
+        }                                                                      \
+    }                                                                          \
+    else {                                                                     \
+        for (value = 0; value < 128; value++) {                                \
+            if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit(  \
+                       pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);    \
+        }                                                                      \
+       if (AT_LEAST_ASCII_RESTRICTED) {                                       \
+           for (value = 128; value < 256; value++) {                          \
+             stored += set_regclass_bit(                                     \
+                          pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
+           }                                                                  \
+           ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;                             \
+       }                                                                      \
+       else {                                                                 \
+           /* For a non-utf8 target string with DEPENDS semantics, all above  \
+            * ASCII Latin1 code points match the complement of any of the     \
+            * classes.  But in utf8, they have their Unicode semantics, so    \
+            * can't just set them in the bitmap, or else regexec.c will think \
+            * they matched when they shouldn't. */                            \
+           ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;                     \
+       }                                                                      \
+    }                                                                          \
+    yesno = '!';                                                               \
+    what = WORD;                                                               \
      break
  
-/* 
-   We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
-   so that it is possible to override the option here without having to 
-   rebuild the entire core. as we are required to do if we change regcomp.h
-   which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
-*/
-#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
-#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
-#endif
-
-#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
-#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
-#else
-#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
-#endif
-
  STATIC U8
-S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
+S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
  {
  
      /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
       * Locale folding is done at run-time, so this function should not be
       * called for nodes that are for locales.
       *
-     * This function simply sets the bit corresponding to the fold of the input
+     * This function sets the bit corresponding to the fold of the input
       * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
       * 'F' is 'f'.
       *
-     * It also sets any necessary flags, and returns the number of bits that
-     * actually changed from 0 to 1 */
+     * It also knows about the characters that are in the bitmap that have
+     * folds that are matchable only outside it, and sets the appropriate lists
+     * and flags.
+     *
+     * It returns the number of bits that actually changed from 0 to 1 */
  
      U8 stored = 0;
      U8 fold;
  
-    fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
-                           : PL_fold[value];
+    PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
+
+    fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
+                                    : PL_fold[value];
  
      /* It assumes the bit for 'value' has already been set */
      if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
          ANYOF_BITMAP_SET(node, fold);
          stored++;
      }
-
-    /* The fold of the German sharp s is two ASCII characters, so isn't in the
-     * bitmap and doesn't have to be in utf8, but we only process it if unicode
-     * semantics are called for */
-    if (UNI_SEMANTICS && value == LATIN_SMALL_LETTER_SHARP_S) {
-       ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
+    if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
+       /* Certain Latin1 characters have matches outside the bitmap.  To get
+        * here, 'value' is one of those characters.   None of these matches is
+        * valid for ASCII characters under /aa, which have been excluded by
+        * the 'if' above.  The matches fall into three categories:
+        * 1) They are singly folded-to or -from an above 255 character, as
+        *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
+        *    WITH DIAERESIS;
+        * 2) They are part of a multi-char fold with another character in the
+        *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
+        * 3) They are part of a multi-char fold with a character not in the
+        *    bitmap, such as various ligatures.
+        * We aren't dealing fully with multi-char folds, except we do deal
+        * with the pattern containing a character that has a multi-char fold
+        * (not so much the inverse).
+        * For types 1) and 3), the matches only happen when the target string
+        * is utf8; that's not true for 2), and we set a flag for it.
+        *
+        * The code below adds to the passed in inversion list the single fold
+        * closures for 'value'.  The values are hard-coded here so that an
+        * innocent-looking character class, like /[ks]/i won't have to go out
+        * to disk to find the possible matches.  XXX It would be better to
+        * generate these via regen, in case a new version of the Unicode
+        * standard adds new mappings, though that is not really likely. */
+       switch (value) {
+           case 'k':
+           case 'K':
+               /* KELVIN SIGN */
+               *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
+               break;
+           case 's':
+           case 'S':
+               /* LATIN SMALL LETTER LONG S */
+               *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
+               break;
+           case MICRO_SIGN:
+               *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+                                                GREEK_SMALL_LETTER_MU);
+               *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+                                                GREEK_CAPITAL_LETTER_MU);
+               break;
+           case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
+           case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
+               /* ANGSTROM SIGN */
+               *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
+               if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
+                   *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+                                                    PL_fold_latin1[value]);
+               }
+               break;
+           case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
+               *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+                                       LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
+               break;
+           case LATIN_SMALL_LETTER_SHARP_S:
+               *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+                                       LATIN_CAPITAL_LETTER_SHARP_S);
+
+               /* Under /a, /d, and /u, this can match the two chars "ss" */
+               if (! MORE_ASCII_RESTRICTED) {
+                   add_alternate(alternate_ptr, (U8 *) "ss", 2);
+
+                   /* And under /u or /a, it can match even if the target is
+                    * not utf8 */
+                   if (AT_LEAST_UNI_SEMANTICS) {
+                       ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
+                   }
+               }
+               break;
+           case 'F': case 'f':
+           case 'I': case 'i':
+           case 'L': case 'l':
+           case 'T': case 't':
+           case 'A': case 'a':
+           case 'H': case 'h':
+           case 'J': case 'j':
+           case 'N': case 'n':
+           case 'W': case 'w':
+           case 'Y': case 'y':
+                /* These all are targets of multi-character folds from code
+                 * points that require UTF8 to express, so they can't match
+                 * unless the target string is in UTF-8, so no action here is
+                 * necessary, as regexec.c properly handles the general case
+                 * for UTF-8 matching */
+               break;
+           default:
+               /* Use deprecated warning to increase the chances of this
+                * being output */
+               ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
+               break;
+       }
      }
-    else if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
-            || (! UNI_SEMANTICS
-                 && ! isASCII(value)
-                 && PL_fold_latin1[value] != value))
-    {   /* A character that has a fold outside of Latin1 matches outside the
-           bitmap, but only when the target string is utf8.  Similarly when we
-           don't have unicode semantics for the above ASCII Latin-1 characters,
-           and they have a fold, they should match if the target is utf8, and
-           not otherwise */
-       ANYOF_FLAGS(node) |= ANYOF_UTF8;
+    else if (DEPENDS_SEMANTICS
+           && ! isASCII(value)
+           && PL_fold_latin1[value] != value)
+    {
+          /* Under DEPENDS rules, non-ASCII Latin1 characters match their
+           * folds only when the target string is in UTF-8.  We add the fold
+           * here to the list of things to match outside the bitmap, which
+           * won't be looked at unless it is UTF8 (or else if something else
+           * says to look even if not utf8, but those things better not happen
+           * under DEPENDS semantics). */
+       *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
      }
  
      return stored;
@@ -8292,7 +9443,7 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8
  
  
  PERL_STATIC_INLINE U8
-S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U32 value)
+S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
  {
      /* This inline function sets a bit in the bitmap if not already set, and if
       * appropriate, its fold, returning the number of bits that actually
@@ -8300,6 +9451,8 @@ S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U32 val
  
      U8 stored;
  
+    PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
+
      if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
         return 0;
      }
@@ -8308,18 +9461,36 @@ S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U32 val
      stored = 1;
  
      if (FOLD && ! LOC) {       /* Locale folds aren't known until runtime */
-       stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
+       stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
      }
  
      return stored;
  }
  
+STATIC void
+S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
+{
+    /* Pushes a new SV built from 'string' (length 'len') onto the ANYOF node's
+     * unicode alternate list, first creating that list if '*alternate_ptr' is
+     * NULL.  The list holds the multi-character folds of chars in the node */
+    SV *sv;
+
+    PERL_ARGS_ASSERT_ADD_ALTERNATE;
+
+    if (! *alternate_ptr) {
+       *alternate_ptr = newAV();
+    }
+    sv = newSVpvn_utf8((char*)string, len, TRUE);
+    av_push(*alternate_ptr, sv);
+    return;
+}
+
  /*
     parse a class specification and produce either an ANYOF node that
-   matches the pattern or if the pattern matches a single char only and
-   that char is < 256 and we are case insensitive then we produce an 
-   EXACT node instead.
-*/
+   matches the pattern or perhaps will be optimized into an EXACTish node
+   instead. The node contains a bit map for the first 256 characters, with the
+   corresponding bit set if that character is in the list.  For characters
+   above 255, a range list is used */
  
  STATIC regnode *
  S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
@@ -8334,14 +9505,37 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
      IV namedclass;
      char *rangebegin = NULL;
      bool need_class = 0;
+    bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
      SV *listsv = NULL;
+    STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
+                                     than just initialized.  */
      UV n;
-    bool optimize_invert   = TRUE;
+
+    /* code points this node matches that can't be stored in the bitmap */
+    HV* nonbitmap = NULL;
+
+    /* The items that are to match that aren't stored in the bitmap, but are a
+     * result of things that are stored there.  This is the fold closure of
+     * such a character, either because it has DEPENDS semantics and shouldn't
+     * be matched unless the target string is utf8, or is a code point that is
+     * too large for the bit map, as for example, the fold of the MICRO SIGN is
+     * above 255.  This all is solely for performance reasons.  By having this
+     * code know the outside-the-bitmap folds that the bitmapped characters are
+     * involved with, we don't have to go out to disk to find the list of
+     * matches, unless the character class includes code points that aren't
+     * storable in the bit map.  That means that a character class with an 's'
+     * in it, for example, doesn't need to go out to disk to find everything
+     * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
+     * empty unless there is something whose fold we don't know about, and will
+     * have to go out to the disk to find. */
+    HV* l1_fold_invlist = NULL;
+
+    /* List of multi-character folds that are matched by this node */
      AV* unicode_alternate  = NULL;
  #ifdef EBCDIC
      UV literal_endpoint = 0;
  #endif
-    UV stored = 0;  /* 0, 1, or more than 1 chars stored in the class */
+    UV stored = 0;  /* how many chars stored in the bitmap */
  
      regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
          case we need to change the emitted regop to an EXACT. */
@@ -8358,14 +9552,26 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
      /* Assume we are going to generate an ANYOF node. */
      ret = reganode(pRExC_state, ANYOF, 0);
  
-    if (!SIZE_ONLY)
+
+    if (!SIZE_ONLY) {
         ANYOF_FLAGS(ret) = 0;
+    }
  
      if (UCHARAT(RExC_parse) == '^') {  /* Complement of range. */
         RExC_naughty++;
         RExC_parse++;
         if (!SIZE_ONLY)
             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
+
+        /* We have decided to not allow multi-char folds in inverted character
+        * classes, due to the confusion that can happen, especially with
+        * classes that are designed for a non-Unicode world:  You have the
+        * peculiar case that:
+            "s s" =~ /^[^\xDF]+$/i => Y
+            "ss"  =~ /^[^\xDF]+$/i => N
+         *
+         * See [perl #89750] */
+        allow_full_fold = FALSE;
      }
  
      if (SIZE_ONLY) {
@@ -8374,12 +9580,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
      }
      else {
         RExC_emit += ANYOF_SKIP;
-       if (FOLD)
-           ANYOF_FLAGS(ret) |= ANYOF_FOLD;
-       if (LOC)
+       if (LOC) {
             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
+       }
         ANYOF_BITMAP_ZERO(ret);
         listsv = newSVpvs("# comment\n");
+       initial_listsv_len = SvCUR(listsv);
      }
  
      nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
@@ -8445,7 +9651,7 @@ parseit:
                      from earlier versions, OTOH that behaviour was broken
                      as well. */
                      UV v; /* value is register so we cant & it /grrr */
-                    if (reg_namedseq(pRExC_state, &v, NULL)) {
+                    if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
                          goto parseit;
                      }
                      value= v; 
@@ -8484,15 +9690,28 @@ parseit:
                               n--;
                          }
                     }
-                   Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
-                       (value=='p' ? '+' : '!'), (int)n, RExC_parse);
+
+                   /* Add the property name to the list.  If /i matching, give
+                    * a different name which consists of the normal name
+                    * sandwiched between two underscores and '_i'.  The design
+                    * is discussed in the commit message for this. */
+                   Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
+                                       (value=='p' ? '+' : '!'),
+                                       (FOLD) ? "__" : "",
+                                       (int)n,
+                                       RExC_parse,
+                                       (FOLD) ? "_i" : ""
+                                   );
                 }
                 RExC_parse = e + 1;
  
                 /* The \p could match something in the Latin1 range, hence
                  * something that isn't utf8 */
-               ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
+               ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
+
+               /* \p means they want Unicode semantics */
+               RExC_uni_semantics = 1;
                 }
                 break;
             case 'n':   value = '\n';                   break;
@@ -8542,7 +9761,7 @@ parseit:
                     goto recode_encoding;
                 break;
             case 'c':
-               value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
+               value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
                 break;
             case '0': case '1': case '2': case '3': case '4':
             case '5': case '6': case '7':
@@ -8557,7 +9776,7 @@ parseit:
                     break;
                 }
             recode_encoding:
-               {
+               if (! RExC_override_recoding) {
                     SV* enc = PL_encoding;
                     value = reg_recode((const char)(U8)value, &enc);
                     if (!enc && SIZE_ONLY)
@@ -8591,16 +9810,18 @@ parseit:
             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
                 need_class = 1;
                 if (SIZE_ONLY) {
-                   RExC_size += ANYOF_CLASS_ADD_SKIP;
+                   RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
                 }
                 else {
-                   RExC_emit += ANYOF_CLASS_ADD_SKIP;
+                   RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
                     ANYOF_CLASS_ZERO(ret);
                 }
                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
             }
  
-           /* a bad range like a-\d, a-[:digit:] ? */
+           /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
+            * literal, as is the character that began the false range, i.e.
+            * the 'a' in the examples */
             if (range) {
                 if (!SIZE_ONLY) {
                     const int w =
@@ -8610,16 +9831,14 @@ parseit:
                                "False [] range \"%*.*s\"",
                                w, w, rangebegin);
  
+                   stored +=
+                         set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
                     if (prevvalue < 256) {
                         stored +=
-                         S_set_regclass_bit(aTHX_ pRExC_state, ret, prevvalue);
-                       stored +=
-                         S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
+                         set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
                     }
                     else {
-                       ANYOF_FLAGS(ret) |= ANYOF_UTF8;
-                       Perl_sv_catpvf(aTHX_ listsv,
-                                      "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
+                       nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
                     }
                 }
  
@@ -8632,34 +9851,26 @@ parseit:
                 const char *what = NULL;
                 char yesno = 0;
  
-               if (namedclass > OOB_NAMEDCLASS)
-                   optimize_invert = FALSE;
                 /* Possible truncation here but in some 64-bit environments
                  * the compiler gets heartburn about switch on 64-bit values.
                  * A similar issue a little earlier when switching on value.
                  * --jhi */
                 switch ((I32)namedclass) {
                 
-               case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
-               case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
-               case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
-               case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
-               case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
-               case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
-               case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
-               case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
-               case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
-               case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
-#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
+               case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
+               case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
+               case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
+               case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
+               case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
+               case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
+               case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
+               case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
+               case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
+               case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
                  /* \s, \w match all unicode if utf8. */
-                case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
-                case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
-#else
-                /* \s, \w match ascii and locale only */
-                case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
-                case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
-#endif         
-               case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
+                case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
+                case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
+               case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
                 case ANYOF_ASCII:
@@ -8668,10 +9879,11 @@ parseit:
                     else {
                         for (value = 0; value < 128; value++)
                             stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value));
+                              set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
                     }
                     yesno = '+';
-                   what = "ASCII";
+                   what = NULL;        /* Doesn't match outside ascii, so
+                                          don't want to add +utf8:: */
                     break;
                 case ANYOF_NASCII:
                     if (LOC)
@@ -8679,8 +9891,9 @@ parseit:
                     else {
                         for (value = 128; value < 256; value++)
                             stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value));
+                              set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
                     }
+                   ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
                     yesno = '!';
                     what = "ASCII";
                     break;              
@@ -8691,10 +9904,10 @@ parseit:
                         /* consecutive digits assumed */
                         for (value = '0'; value <= '9'; value++)
                             stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
+                              set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
                     }
                     yesno = '+';
-                   what = POSIX_CC_UNI_NAME("Digit");
+                   what = "Digit";
                     break;
                 case ANYOF_NDIGIT:
                     if (LOC)
@@ -8703,13 +9916,16 @@ parseit:
                         /* consecutive digits assumed */
                         for (value = 0; value < '0'; value++)
                             stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
+                              set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
                         for (value = '9' + 1; value < 256; value++)
                             stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
+                              set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
                     }
                     yesno = '!';
-                   what = POSIX_CC_UNI_NAME("Digit");
+                   what = "Digit";
+                   if (AT_LEAST_ASCII_RESTRICTED ) {
+                       ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
+                   }
                     break;              
                 case ANYOF_MAX:
                     /* this is to handle \p and \P */
@@ -8718,17 +9934,11 @@ parseit:
                     vFAIL("Invalid [::] class");
                     break;
                 }
-               if (what) {
+               if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
                     /* Strings such as "+utf8::IsWord\n" */
                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
                 }
-               stored+=2; /* can't optimize this class */
  
-               /* All but ASCII can match Unicode characters, but all the ones
-                * that aren't in utf8 are in the bitmap */
-               if (namedclass != ANYOF_ASCII) {
-                   ANYOF_FLAGS(ret) |= ANYOF_UTF8;
-               }
                 continue;
             }
         } /* end of namedclass \blah */
@@ -8742,8 +9952,10 @@ parseit:
         }
         else {
             prevvalue = value; /* save the beginning of the range */
-           if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
-               RExC_parse[1] != ']') {
+           if (RExC_parse+1 < RExC_end
+               && *RExC_parse == '-'
+               && RExC_parse[1] != ']')
+           {
                 RExC_parse++;
  
                 /* a bad range like \w-, [:word:]- ? */
@@ -8758,15 +9970,20 @@ parseit:
                     }
                     if (!SIZE_ONLY)
                         stored +=
-                            S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
+                            set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
                 } else
                     range = 1;  /* yeah, it's a range! */
                 continue;       /* but do it the next time */
             }
         }
  
+       /* non-Latin1 code point implies unicode semantics.  Must be set in
+        * pass1 so is there for the whole of pass 2 */
+       if (value > 255) {
+           RExC_uni_semantics = 1;
+       }
+
         /* now is the next time */
-        /*stored += (value - prevvalue + 1);*/
         if (!SIZE_ONLY) {
             if (prevvalue < 256) {
                 const IV ceilvalue = value < 256 ? value : 255;
@@ -8782,109 +9999,27 @@ parseit:
                         for (i = prevvalue; i <= ceilvalue; i++)
                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
                                 stored +=
-                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
+                                  set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
                             }
                     } else {
                         for (i = prevvalue; i <= ceilvalue; i++)
                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
                                 stored +=
-                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
+                                  set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
                             }
                     }
                 }
                 else
  #endif
                       for (i = prevvalue; i <= ceilvalue; i++) {
-                       stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
+                       stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
                       }
           }
-         if (value > 255 || UTF) {
-               const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
-               const UV natvalue      = NATIVE_TO_UNI(value);
-                stored+=2; /* can't optimize this class */
-
-               /* If the code point requires utf8 to represent, and we are not
-                * folding, it can't match unless the target is in utf8.  Only
-                * a few code points above 255 fold to below it, so XXX an
-                * optimization would be to know which ones and set the flag
-                * appropriately. */
-               ANYOF_FLAGS(ret) |= (FOLD || value < 256)
-                                   ? ANYOF_NONBITMAP
-                                   : ANYOF_UTF8;
-               if (prevnatvalue < natvalue) { /* what about > ? */
-                   Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
-                                  prevnatvalue, natvalue);
-               }
-               else if (prevnatvalue == natvalue) {
-                   Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
-                   if (FOLD) {
-                        U8 foldbuf[UTF8_MAXBYTES_CASE+1];
-                        STRLEN foldlen;
-                        const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
-
-#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
-                        if (RExC_precomp[0] == ':' &&
-                            RExC_precomp[1] == '[' &&
-                            (f == 0xDF || f == 0x92)) {
-                            f = NATIVE_TO_UNI(f);
-                        }
-#endif
-                        /* If folding and foldable and a single
-                         * character, insert also the folded version
-                         * to the charclass. */
-                        if (f != value) {
-#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
-                            if ((RExC_precomp[0] == ':' &&
-                                 RExC_precomp[1] == '[' &&
-                                 (f == 0xA2 &&
-                                  (value == 0xFB05 || value == 0xFB06))) ?
-                                foldlen == ((STRLEN)UNISKIP(f) - 1) :
-                                foldlen == (STRLEN)UNISKIP(f) )
-#else
-                             if (foldlen == (STRLEN)UNISKIP(f))
-#endif
-                                 Perl_sv_catpvf(aTHX_ listsv,
-                                                "%04"UVxf"\n", f);
-                             else {
-                                 /* Any multicharacter foldings
-                                  * require the following transform:
-                                  * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
-                                  * where E folds into "pq" and F folds
-                                  * into "rst", all other characters
-                                  * fold to single characters.  We save
-                                  * away these multicharacter foldings,
-                                  * to be later saved as part of the
-                                  * additional "s" data. */
-                                 SV *sv;
-
-                                 if (!unicode_alternate)
-                                     unicode_alternate = newAV();
-                                 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
-                                                    TRUE);
-                                 av_push(unicode_alternate, sv);
-                             }
-                        }
-
-                        /* If folding and the value is one of the Greek
-                         * sigmas insert a few more sigmas to make the
-                         * folding rules of the sigmas to work right.
-                         * Note that not all the possible combinations
-                         * are handled here: some of them are handled
-                         * by the standard folding rules, and some of
-                         * them (literal or EXACTF cases) are handled
-                         * during runtime in regexec.c:S_find_byclass(). */
-                        if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
-                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
-                                            (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
-                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
-                                            (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
-                        }
-                        else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
-                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
-                                            (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
-                   }
-               }
-           }
+         if (value > 255) {
+           const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
+           const UV natvalue      = NATIVE_TO_UNI(value);
+           nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
+       }
  #ifdef EBCDIC
             literal_endpoint = 0;
  #endif
@@ -8899,53 +10034,367 @@ parseit:
          return ret;
      /****** !SIZE_ONLY AFTER HERE *********/
  
-    if( stored == 1 && (value < 128 || (value < 256 && !UTF))
-        && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
-    ) {
-       /* optimize single char class to an EXACT node but *only* when its not
-        * a UTF/high char.  Note that the information needed to decide to do
-        * this optimization is not currently available until the 2nd pass, and
-        * that the actually used EXACT node takes less space than the
-        * calculated ANYOF node, and hence the amount of space calculated in
-         * the first pass is larger than actually used.  Currently we don't
-         * keep track of enough information to do this for nodes which contain
-         * matches outside the bitmap */
+    /* If folding and there are code points above 255, we calculate all
+     * characters that could fold to or from the ones already on the list */
+    if (FOLD && nonbitmap) {
+       UV i;
+
+       HV* fold_intersection;
+       UV* fold_list;
+
+       /* This is a list of all the characters that participate in folds
+           * (except marks, etc. in multi-char folds) */
+       if (! PL_utf8_foldable) {
+           SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
+           PL_utf8_foldable = _swash_to_invlist(swash);
+       }
+
+       /* This is a hash that for a particular fold gives all characters
+           * that are involved in it */
+       if (! PL_utf8_foldclosures) {
+
+           /* If we were unable to find any folds, then we likely won't be
+            * able to find the closures.  So just create an empty list.
+            * Folding will effectively be restricted to the non-Unicode rules
+            * hard-coded into Perl.  (This case happens legitimately during
+            * compilation of Perl itself before the Unicode tables are
+            * generated) */
+           if (invlist_len(PL_utf8_foldable) == 0) {
+               PL_utf8_foldclosures = _new_invlist(0);
+           } else {
+               /* If the folds haven't been read in, call a fold function
+                   * to force that */
+               if (! PL_utf8_tofold) {
+                   U8 dummy[UTF8_MAXBYTES+1];
+                   STRLEN dummy_len;
+                   to_utf8_fold((U8*) "A", dummy, &dummy_len);
+               }
+               PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
+           }
+       }
+
+       /* Only the characters in this class that participate in folds need
+           * be checked.  Get the intersection of this class and all the
+           * possible characters that are foldable.  This can quickly narrow
+           * down a large class */
+       fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
+
+       /* Now look at the foldable characters in this class individually */
+       fold_list = invlist_array(fold_intersection);
+       for (i = 0; i < invlist_len(fold_intersection); i++) {
+           UV j;
+
+           /* The next entry is the beginning of the range that is in the
+            * class */
+           UV start = fold_list[i++];
+
+
+           /* The next entry is the beginning of the next range, which
+               * isn't in the class, so the end of the current range is one
+               * less than that */
+           UV end = fold_list[i] - 1;
+
+           /* Look at every character in the range */
+           for (j = start; j <= end; j++) {
+
+               /* Get its fold */
+               U8 foldbuf[UTF8_MAXBYTES_CASE+1];
+               STRLEN foldlen;
+               const UV f =
+                    _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
+
+               if (foldlen > (STRLEN)UNISKIP(f)) {
+
+                   /* Any multicharacter foldings (disallowed in
+                       * lookbehind patterns) require the following
+                       * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
+                       * E folds into "pq" and F folds into "rst", all other
+                       * characters fold to single characters.  We save away
+                       * these multicharacter foldings, to be later saved as
+                       * part of the additional "s" data. */
+                   if (! RExC_in_lookbehind) {
+                       U8* loc = foldbuf;
+                       U8* e = foldbuf + foldlen;
+
+                       /* If any of the folded characters of this are in
+                           * the Latin1 range, tell the regex engine that
+                           * this can match a non-utf8 target string.  The
+                           * only multi-byte fold whose source is in the
+                           * Latin1 range (U+00DF) applies only when the
+                           * target string is utf8, or under unicode rules */
+                       if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
+                           while (loc < e) {
+
+                               /* Can't mix ascii with non- under /aa */
+                               if (MORE_ASCII_RESTRICTED
+                                   && (isASCII(*loc) != isASCII(j)))
+                               {
+                                   goto end_multi_fold;
+                               }
+                               if (UTF8_IS_INVARIANT(*loc)
+                                   || UTF8_IS_DOWNGRADEABLE_START(*loc))
+                               {
+                                   /* Can't mix above and below 256 under
+                                       * LOC */
+                                   if (LOC) {
+                                       goto end_multi_fold;
+                                   }
+                                   ANYOF_FLAGS(ret)
+                                           |= ANYOF_NONBITMAP_NON_UTF8;
+                                   break;
+                               }
+                               loc += UTF8SKIP(loc);
+                           }
+                       }
+
+                       add_alternate(&unicode_alternate, foldbuf, foldlen);
+                   end_multi_fold: ;
+                   }
+
+                   /* This is special-cased, as it is the only letter which
+                    * has both a multi-fold and single-fold in Latin1.  All
+                    * the other chars that have single and multi-folds are
+                    * always in utf8, and the utf8 folding algorithm catches
+                    * them */
+                   if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
+                       stored += set_regclass_bit(pRExC_state,
+                                       ret,
+                                       LATIN_SMALL_LETTER_SHARP_S,
+                                       &l1_fold_invlist, &unicode_alternate);
+                   }
+               }
+               else {
+                   /* Single character fold.  Add everything in its fold
+                       * closure to the list that this node should match */
+                   SV** listp;
+
+                   /* The fold closures data structure is a hash with the
+                       * keys being every character that is folded to, like
+                       * 'k', and the values each an array of everything that
+                       * folds to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
+                   if ((listp = hv_fetch(PL_utf8_foldclosures,
+                                   (char *) foldbuf, foldlen, FALSE)))
+                   {
+                       AV* list = (AV*) *listp;
+                       IV k;
+                       for (k = 0; k <= av_len(list); k++) {
+                           SV** c_p = av_fetch(list, k, FALSE);
+                           UV c;
+                           if (c_p == NULL) {
+                               Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+                           }
+                           c = SvUV(*c_p);
+
+                           /* /aa doesn't allow folds between ASCII and
+                               * non-; /l doesn't allow them between above
+                               * and below 256 */
+                           if ((MORE_ASCII_RESTRICTED
+                                && (isASCII(c) != isASCII(j)))
+                                   || (LOC && ((c < 256) != (j < 256))))
+                           {
+                               continue;
+                           }
+
+                           if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
+                               stored += set_regclass_bit(pRExC_state,
+                                       ret,
+                                       (U8) c,
+                                       &l1_fold_invlist, &unicode_alternate);
+                           }
+                               /* It may be that the code point is already
+                                   * in this range or already in the bitmap,
+                                   * in which case we need do nothing */
+                           else if ((c < start || c > end)
+                                       && (c > 255
+                                           || ! ANYOF_BITMAP_TEST(ret, c)))
+                           {
+                               nonbitmap = add_cp_to_invlist(nonbitmap, c);
+                           }
+                       }
+                   }
+               }
+           }
+       }
+       invlist_destroy(fold_intersection);
+    }
+
+    /* Combine the two lists into one. */
+    if (l1_fold_invlist) {
+       if (nonbitmap) {
+           HV* temp = invlist_union(nonbitmap, l1_fold_invlist);
+           invlist_destroy(nonbitmap);
+           nonbitmap = temp;
+           invlist_destroy(l1_fold_invlist);
+       }
+       else {
+           nonbitmap = l1_fold_invlist;
+       }
+    }
+
+    /* Here, we have calculated what code points should be in the character
+     * class.   Now we can see about various optimizations.  Fold calculation
+     * needs to take place before inversion.  Otherwise /[^k]/i would invert to
+     * include K, which under /i would match k. */
+
+    /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
+     * set the FOLD flag yet, so this does optimize those.  It doesn't
+     * optimize locale.  Doing so perhaps could be done as long as there is
+     * nothing like \w in it; some thought also would have to be given to the
+     * interaction with above 0x100 chars */
+    if (! LOC
+       && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT
+       && ! unicode_alternate
+       && ! nonbitmap
+       && SvCUR(listsv) == initial_listsv_len)
+    {
+       for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
+           ANYOF_BITMAP(ret)[value] ^= 0xFF;
+       stored = 256 - stored;
+
+       /* The inversion means that everything above 255 is matched; and at the
+        * same time we clear the invert flag */
+       ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
+    }
+
+    /* Folding in the bitmap is taken care of above, but not for locale (for
+     * which we have to wait to see what folding is in effect at runtime), and
+     * for things not in the bitmap.  Set run-time fold flag for these */
+    if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
+       ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
+    }
+
+    /* A single character class can be "optimized" into an EXACTish node.
+     * Note that since we don't currently count how many characters there are
+     * outside the bitmap, we are XXX missing optimization possibilities for
+     * them.  This optimization can't happen unless this is a truly single
+     * character class, which means that it can't be an inversion into a
+     * many-character class, and there must be no possibility of there being
+     * things outside the bitmap.  'stored' (only) for locales doesn't include
+     * \w, etc, so have to make a special test that they aren't present
+     *
+     * Similarly, a 2-character class of the very special form like [bB] can be
+     * optimized into an EXACTFish node, but only for non-locales, and for
+     * characters which only have the two folds; so things like 'fF' and 'Ii'
+     * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
+     * FI'. */
+    if (! nonbitmap
+       && ! unicode_alternate
+       && SvCUR(listsv) == initial_listsv_len
+       && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
+        && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
+                              || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
+           || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
+                                && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
+                                /* If the latest code point has a fold whose
+                                 * bit is set, it must be the only other one */
+                               && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
+                                && ANYOF_BITMAP_TEST(ret, prevvalue)))))
+    {
+        /* Note that the information needed to decide to do this optimization
+         * is not currently available until the 2nd pass, and that the actually
+        * used EXACTish node takes less space than the calculated ANYOF node,
+        * and hence the amount of space calculated in the first pass is larger
+         * than actually used, so this optimization doesn't gain us any space.
+        * But an EXACT node is faster than an ANYOF node, and can be combined
+        * with any adjacent EXACT nodes later by the optimizer for further
+        * gains.  The speed of executing an EXACTF is similar to an ANYOF
+        * node, so the optimization advantage comes from the ability to join
+        * it to adjacent EXACT nodes */
+
          const char * cur_parse= RExC_parse;
+       U8 op;
          RExC_emit = (regnode *)orig_emit;
          RExC_parse = (char *)orig_parse;
-        ret = reg_node(pRExC_state,
-                       (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
+
+       if (stored == 1) {
+
+           /* A locale node with one point can be folded; all the other cases
+            * with folding will have two points, since we calculate them above
+            */
+           if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
+                op = EXACTFL;
+           }
+           else {
+               op = EXACT;
+           }
+       }   /* else 2 chars in the bit map: the folds of each other */
+       else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
+
+           /* To join adjacent nodes, they must be the exact EXACTish type.
+            * Try to use the most likely type, by using EXACTFU if the regex
+            * calls for them, or is required because the character is
+            * non-ASCII */
+           op = EXACTFU;
+       }
+       else {    /* Otherwise, more likely to be EXACTF type */
+           op = EXACTF;
+       }
+
+       ret = reg_node(pRExC_state, op);
          RExC_parse = (char *)cur_parse;
-        *STRING(ret)= (char)value;
-        STR_LEN(ret)= 1;
-        RExC_emit += STR_SZ(1);
+       if (UTF && ! NATIVE_IS_INVARIANT(value)) {
+           *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
+           *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
+           STR_LEN(ret)= 2;
+           RExC_emit += STR_SZ(2);
+       }
+       else {
+           *STRING(ret)= (char)value;
+           STR_LEN(ret)= 1;
+           RExC_emit += STR_SZ(1);
+       }
         SvREFCNT_dec(listsv);
          return ret;
      }
-    /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
-    if ( /* If the only flag is folding (plus possibly inversion). */
-       ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
-       ) {
-       for (value = 0; value < 256; ++value) {
-           if (ANYOF_BITMAP_TEST(ret, value)) {
-               UV fold = PL_fold[value];
  
-               if (fold != value)
-                   ANYOF_BITMAP_SET(ret, fold);
+    if (nonbitmap) {
+       UV* nonbitmap_array = invlist_array(nonbitmap);
+       UV nonbitmap_len = invlist_len(nonbitmap);
+       UV i;
+
+       /*  Here have the full list of items to match that aren't in the
+        *  bitmap.  Convert to the structure that the rest of the code is
+        *  expecting.   XXX The rest of the code should convert to this
+        *  structure */
+       for (i = 0; i < nonbitmap_len; i++) {
+
+           /* The next entry is the beginning of the range that is in the
+            * class */
+           UV start = nonbitmap_array[i++];
+           UV end;
+
+           /* The next entry is the beginning of the next range, which isn't
+            * in the class, so the end of the current range is one less than
+            * that.  But if there is no next range, it means that the range
+            * begun by 'start' extends to infinity, which for this platform
+            * ends at UV_MAX */
+           if (i == nonbitmap_len) {
+               end = UV_MAX;
+           }
+           else {
+               end = nonbitmap_array[i] - 1;
+           }
+
+           if (start == end) {
+               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
+           }
+           else {
+               /* The \t sets the whole range */
+               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
+                       /* XXX EBCDIC */
+                                  start, end);
             }
         }
-       ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
+       invlist_destroy(nonbitmap);
      }
  
-    /* optimize inverted simple patterns (e.g. [^a-z]) */
-    if (optimize_invert &&
-       /* If the only flag is inversion. */
-       (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
-       for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
-           ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
-       ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
+    if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
+       ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
+       SvREFCNT_dec(listsv);
+       SvREFCNT_dec(unicode_alternate);
      }
-    {
+    else {
+
         AV * const av = newAV();
         SV *rv;
         /* The 0th element stores the character class description
@@ -8956,7 +10405,18 @@ parseit:
          * used later (regexec.c:S_reginclass()). */
         av_store(av, 0, listsv);
         av_store(av, 1, NULL);
-       av_store(av, 2, MUTABLE_SV(unicode_alternate));
+
+        /* Store any computed multi-char folds only if we are allowing
+         * them */
+        if (allow_full_fold) {
+            av_store(av, 2, MUTABLE_SV(unicode_alternate));
+            if (unicode_alternate) { /* This node is variable length */
+                OP(ret) = ANYOFV;
+            }
+        }
+        else {
+            av_store(av, 2, NULL);
+        }
         rv = newRV_noinc(MUTABLE_SV(av));
         n = add_data(pRExC_state, 1, "s");
         RExC_rxi->data->data[n] = (void*)rv;
@@ -9296,7 +10756,7 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 de
  - Look for optimizable sequences at the same time.
  - currently only looks for EXACT chains.
  
-This is expermental code. The idea is to use this routine to perform 
+This is experimental code. The idea is to use this routine to perform 
  in place optimizations on branches and groups as they are constructed,
  with the long term intention of removing optimization from study_chunk so
  that it is purely analytical.
@@ -9338,6 +10798,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
              switch (OP(scan)) {
                  case EXACT:
                  case EXACTF:
+                case EXACTFA:
                  case EXACTFU:
                  case EXACTFL:
                          if( exact == PSEUDO )
@@ -9393,14 +10854,40 @@ S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
  {
      int bit;
      int set=0;
+    regex_charset cs;
  
      for (bit=0; bit<32; bit++) {
          if (flags & (1<<bit)) {
+           if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
+               continue;
+           }
              if (!set++ && lead) 
                  PerlIO_printf(Perl_debug_log, "%s",lead);
              PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
          }              
      }     
+    if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
+            if (!set++ && lead) {
+                PerlIO_printf(Perl_debug_log, "%s",lead);
+            }
+            switch (cs) {
+                case REGEX_UNICODE_CHARSET:
+                    PerlIO_printf(Perl_debug_log, "UNICODE");
+                    break;
+                case REGEX_LOCALE_CHARSET:
+                    PerlIO_printf(Perl_debug_log, "LOCALE");
+                    break;
+                case REGEX_ASCII_RESTRICTED_CHARSET:
+                    PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
+                    break;
+                case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+                    PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
+                    break;
+                default:
+                    PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
+                    break;
+            }
+    }
      if (lead)  {
          if (set) 
              PerlIO_printf(Perl_debug_log, "\n");
@@ -9545,6 +11032,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
          * --jhi */
         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
                   PERL_PV_ESCAPE_UNI_DETECT |
+                 PERL_PV_ESCAPE_NONASCII   |
                   PERL_PV_PRETTY_ELLIPSES   |
                   PERL_PV_PRETTY_LTGT       |
                   PERL_PV_PRETTY_NOCLEAR
@@ -9679,7 +11167,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
  
         if (flags & ANYOF_LOCALE)
             sv_catpvs(sv, "{loc}");
-       if (flags & ANYOF_FOLD)
+       if (flags & ANYOF_LOC_NONBITMAP_FOLD)
             sv_catpvs(sv, "{i}");
         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
         if (flags & ANYOF_INVERT)
@@ -9705,8 +11193,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
         }
          
          EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
-        /* output any special charclass tests (used mostly under use locale) */
-       if (o->flags & ANYOF_CLASS && ANYOF_CLASS_TEST_ANY_SET(o))
+        /* output any special charclass tests (used entirely under use locale) */
+       if (ANYOF_CLASS_TEST_ANY_SET(o))
             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
                 if (ANYOF_CLASS_TEST(o,i)) {
                     sv_catpv(sv, anyofs[i]);
@@ -9715,15 +11203,19 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
          
          EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
          
+       if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
+           sv_catpvs(sv, "{non-utf8-latin1-all}");
+       }
+
          /* output information about the unicode matching */
         if (flags & ANYOF_UNICODE_ALL)
             sv_catpvs(sv, "{unicode_all}");
-       else if (flags & ANYOF_UTF8)
+       else if (ANYOF_NONBITMAP(o))
             sv_catpvs(sv, "{unicode}");
         if (flags & ANYOF_NONBITMAP_NON_UTF8)
             sv_catpvs(sv, "{outside bitmap}");
  
-       {
+       if (ANYOF_NONBITMAP(o)) {
             SV *lv;
             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
         
@@ -9888,7 +11380,7 @@ Perl_pregfree2(pTHX_ REGEXP *rx)
      
      The solution is to make a lightweight copy of the regexp structure 
      when a qr// is returned from the code executed by (??{$qr}) this
-    lightweight copy doesnt actually own any of its data except for
+    lightweight copy doesn't actually own any of its data except for
      the starp/end and the actual regexp structure itself. 
      
  */    
@@ -10196,12 +11688,11 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
      dVAR;
      struct regexp *const r = (struct regexp *)SvANY(rx);
      regexp_internal *reti;
-    int len, npar;
+    int len;
      RXi_GET_DECL(r,ri);
  
      PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
      
-    npar = r->nparens+1;
      len = ProgLen(ri);
      
      Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
@@ -10580,7 +12071,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
         else if ( op == PLUS || op == STAR) {
             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
         }
-       else if (op == ANYOF) {
+       else if (PL_regkind[(U8)op] == ANYOF) {
             /* arglen 1 + class block */
             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);