X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/216bfc0a080a7190b7235110e12114b87f6e7b56..868625d2e6637edf38415dd2a2648c512c379d47:/regcomp.c

diff --git a/regcomp.c b/regcomp.c
index 62e9448..560696e 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -142,6 +142,8 @@ typedef struct RExC_state_t {
     regnode	**recurse;		/* Recurse regops */
     I32		recurse_count;		/* Number of recurse regops */
     I32		in_lookbehind;
+    I32		contains_locale;
+    I32		override_recoding;
 #if ADD_TO_REGEXEC
     char 	*starttry;		/* -Dr: where regtry was called. */
 #define RExC_starttry	(pRExC_state->starttry)
@@ -190,6 +192,8 @@ typedef struct RExC_state_t {
 #define RExC_recurse	(pRExC_state->recurse)
 #define RExC_recurse_count	(pRExC_state->recurse_count)
 #define RExC_in_lookbehind	(pRExC_state->in_lookbehind)
+#define RExC_contains_locale	(pRExC_state->contains_locale)
+#define RExC_override_recoding	(pRExC_state->override_recoding)
 
 
 #define	ISMULT1(c)	((c) == '*' || (c) == '+' || (c) == '?')
@@ -380,6 +384,8 @@ static const scan_data_t zero_scan_data =
 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
+#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
+#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 
 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 
@@ -507,6 +513,13 @@ static const scan_data_t zero_scan_data =
 	    (int)offset, RExC_precomp, RExC_precomp + offset);		\
 } STMT_END
 
+#define	ckWARN2regdep(loc,m, a1) STMT_START {				\
+    const IV offset = loc - RExC_precomp;				\
+    Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),	\
+	    m REPORT_LOCATION,						\
+	    a1, (int)offset, RExC_precomp, RExC_precomp + offset);	\
+} STMT_END
+
 #define	ckWARN2reg(loc, m, a1) STMT_START {				\
     const IV offset = loc - RExC_precomp;				\
     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,	\
@@ -715,11 +728,32 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c
 {
     PERL_ARGS_ASSERT_CL_ANYTHING;
 
-    ANYOF_CLASS_ZERO(cl);
     ANYOF_BITMAP_SETALL(cl);
-    cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
-    if (LOC)
+    cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
+		|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL
+		    /* Even though no bitmap is in use here, we need to set
+		     * the flag below so an AND with a node that does have one
+		     * doesn't lose that one.  The flag should get cleared if
+		     * the other one doesn't; and the code in regexec.c is
+		     * structured so this being set when not needed does no
+		     * harm.  It seemed a little cleaner to set it here than do
+		     * a special case in cl_and() */
+		|ANYOF_NONBITMAP_NON_UTF8;
+
+    /* If any portion of the regex is to operate under locale rules,
+     * initialization includes it.  The reason this isn't done for all regexes
+     * is that the optimizer was written under the assumption that locale was
+     * all-or-nothing.  Given the complexity and lack of documentation in the
+     * optimizer, and that there are inadequate test cases for locale, so many
+     * parts of it may not work properly, it is safest to avoid locale unless
+     * necessary. */
+    if (RExC_contains_locale) {
+	ANYOF_CLASS_SETALL(cl);	    /* /l uses class */
 	cl->flags |= ANYOF_LOCALE;
+    }
+    else {
+	ANYOF_CLASS_ZERO(cl);	    /* Only /l uses class now */
+    }
 }
 
 /* Can match anything (initialization) */
@@ -749,22 +783,15 @@ S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
     Zero(cl, 1, struct regnode_charclass_class);
     cl->type = ANYOF;
     cl_anything(pRExC_state, cl);
+    ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 }
 
-STATIC void
-S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
-{
-    PERL_ARGS_ASSERT_CL_INIT_ZERO;
-
-    Zero(cl, 1, struct regnode_charclass_class);
-    cl->type = ANYOF;
-    cl_anything(pRExC_state, cl);
-    if (LOC)
-	cl->flags |= ANYOF_LOCALE;
-}
+/* These two functions currently do the exact same thing */
+#define cl_init_zero		S_cl_init
 
-/* 'And' a given class with another one.  Can create false positives */
-/* We assume that cl is not inverted */
+/* 'AND' a given class with another one.  Can create false positives.  'cl'
+ * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
+ * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 STATIC void
 S_cl_and(struct regnode_charclass_class *cl,
 	const struct regnode_charclass_class *and_with)
@@ -773,6 +800,7 @@ S_cl_and(struct regnode_charclass_class *cl,
 
     assert(and_with->type == ANYOF);
 
+    /* I (khw) am not sure all these restrictions are necessary XXX */
     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 	&& !(ANYOF_CLASS_TEST_ANY_SET(cl))
 	&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
@@ -787,42 +815,88 @@ S_cl_and(struct regnode_charclass_class *cl,
 	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 		cl->bitmap[i] &= and_with->bitmap[i];
     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
-    if (!(and_with->flags & ANYOF_EOS))
-	cl->flags &= ~ANYOF_EOS;
 
-    if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
-	cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
-    if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
-	cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
+    if (and_with->flags & ANYOF_INVERT) {
 
-    if (cl->flags & ANYOF_UNICODE_ALL
-	&& and_with->flags & ANYOF_NONBITMAP
-	&& !(and_with->flags & ANYOF_INVERT))
-    {
-	if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
+        /* Here, the and'ed node is inverted.  Get the AND of the flags that
+         * aren't affected by the inversion.  Those that are affected are
+         * handled individually below */
+	U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
+	cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
+	cl->flags |= affected_flags;
+
+        /* We currently don't know how to deal with things that aren't in the
+         * bitmap, but we know that the intersection is no greater than what
+         * is already in cl, so let there be false positives that get sorted
+         * out after the synthetic start class succeeds, and the node is
+         * matched for real. */
+
+        /* The inversion of these two flags indicates that the resulting
+         * intersection doesn't have them */
+	if (and_with->flags & ANYOF_UNICODE_ALL) {
 	    cl->flags &= ~ANYOF_UNICODE_ALL;
 	}
-	cl->flags |= and_with->flags & ANYOF_NONBITMAP;	/* field is 2 bits; use
-							   only the one(s)
-							   actually set */
-	ARG_SET(cl, ARG(and_with));
-    }
-    if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
-	!(and_with->flags & ANYOF_INVERT))
-	cl->flags &= ~ANYOF_UNICODE_ALL;
-    if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
-	!(and_with->flags & ANYOF_INVERT))
-	cl->flags &= ~ANYOF_NONBITMAP;
+	if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
+	    cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
+	}
+    }
+    else {   /* and'd node is not inverted */
+	if (! ANYOF_NONBITMAP(and_with)) {
+
+            /* Here 'and_with' doesn't match anything outside the bitmap
+             * (except possibly ANYOF_UNICODE_ALL), which means the
+             * intersection can't either, except for ANYOF_UNICODE_ALL, in
+             * which case we don't know what the intersection is, but it's no
+             * greater than what cl already has, so can just leave it alone,
+             * with possible false positives */
+            if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
+                ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
+		cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
+            }
+	}
+	else if (! ANYOF_NONBITMAP(cl)) {
+
+	    /* Here, 'and_with' does match something outside the bitmap, and cl
+	     * doesn't have a list of things to match outside the bitmap.  If
+             * cl can match all code points above 255, the intersection will
+             * be those above-255 code points that 'and_with' matches.  There
+             * may be false positives from code points in 'and_with' that are
+             * outside the bitmap but below 256, but those get sorted out
+             * after the synthetic start class succeeds.  If cl can't match
+             * all Unicode code points, it means here that it can't match
+             * anything outside the bitmap, so we leave the bitmap empty */
+	    if (cl->flags & ANYOF_UNICODE_ALL) {
+		ARG_SET(cl, ARG(and_with));
+	    }
+	}
+	else {
+            /* Here, both 'and_with' and cl match something outside the
+             * bitmap.  Currently we do not do the intersection, so just match
+             * whatever cl had at the beginning.  */
+	}
+
+
+        /* Take the intersection of the two sets of flags */
+	cl->flags &= and_with->flags;
+    }
 }
 
-/* 'OR' a given class with another one.  Can create false positives */
-/* We assume that cl is not inverted */
+/* 'OR' a given class with another one.  Can create false positives.  'cl'
+ * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
+ * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 STATIC void
 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 {
     PERL_ARGS_ASSERT_CL_OR;
 
     if (or_with->flags & ANYOF_INVERT) {
+
+        /* Here, the or'd node is to be inverted.  This means we take the
+         * complement of everything not in the bitmap, but currently we don't
+         * know what that is, so give up and match anything */
+	if (ANYOF_NONBITMAP(or_with)) {
+	    cl_anything(pRExC_state, cl);
+	}
 	/* We do not use
 	 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 	 *   <= (B1 | !B2) | (CL1 | !CL2)
@@ -832,7 +906,7 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
 	 *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 	 *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 	 */
-	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
+	else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 	     && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 	     && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 	    int i;
@@ -843,7 +917,21 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
 	else {
 	    cl_anything(pRExC_state, cl);
 	}
-    } else {
+
+        /* And, we can just take the union of the flags that aren't affected
+         * by the inversion */
+	cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
+
+        /* For the remaining flags:
+            ANYOF_UNICODE_ALL and inverted means to not match anything above
+                    255, which means that the union with cl should just be
+                    what cl has in it, so can ignore this flag
+            ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
+                    is 128-255 to match them, but then invert that, so the
+                    union with cl should just be what cl has in it, so can
+                    ignore this flag
+         */
+    } else {    /* 'or_with' is not inverted */
 	/* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 	     && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
@@ -862,25 +950,32 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
 	else { /* XXXX: logic is complicated, leave it along for a moment. */
 	    cl_anything(pRExC_state, cl);
 	}
-    }
-    if (or_with->flags & ANYOF_EOS)
-	cl->flags |= ANYOF_EOS;
-    if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
-	cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
 
-    if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
-	cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
+	if (ANYOF_NONBITMAP(or_with)) {
+
+	    /* Use the added node's outside-the-bit-map match if there isn't a
+	     * conflict.  If there is a conflict (both nodes match something
+	     * outside the bitmap, but what they match outside is not the same
+	     * pointer, and hence not easily compared until XXX we extend
+	     * inversion lists this far), give up and allow the start class to
+	     * match everything outside the bitmap.  If that stuff is all above
+	     * 255, can just set UNICODE_ALL, otherwise could be anything. */
+	    if (! ANYOF_NONBITMAP(cl)) {
+		ARG_SET(cl, ARG(or_with));
+	    }
+	    else if (ARG(cl) != ARG(or_with)) {
 
-    /* If both nodes match something outside the bitmap, but what they match
-     * outside is not the same pointer, and hence not easily compared, give up
-     * and allow the start class to match everything outside the bitmap */
-    if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
-	ARG(cl) != ARG(or_with)) {
-	cl->flags |= ANYOF_UNICODE_ALL;
-    }
+		if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
+		    cl_anything(pRExC_state, cl);
+		}
+		else {
+		    cl->flags |= ANYOF_UNICODE_ALL;
+		}
+	    }
 
-    if (or_with->flags & ANYOF_UNICODE_ALL) {
-	cl->flags |= ANYOF_UNICODE_ALL;
+	}
+        /* Always take the union of the flags, even if 'or_with' is bitmap-only */
+	cl->flags |= or_with->flags;
     }
 }
 
@@ -1397,6 +1492,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
 #endif
 
     switch (flags) {
+	case EXACTFA:
 	case EXACTFU: folder = PL_fold_latin1; break;
 	case EXACTF:  folder = PL_fold; break;
 	case EXACTFL: folder = PL_fold_locale; break;
@@ -2472,7 +2568,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
 #define UPSILON_D_T	GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
 
     if (UTF
-	&& ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
+	&& ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
 	&& ( STR_LEN(scan) >= 6 ) )
     {
     /*
@@ -2936,10 +3032,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
     If/when this is fixed the following define can be swapped
     in below to fully enable trie logic.
 
+    XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps
+    not /aa
+
 #define TRIE_TYPE_IS_SAFE 1
 
 */
-#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
+#define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) || optype==EXACT)
 
                                 if ( last && TRIE_TYPE_IS_SAFE ) {
                                     make_trie( pRExC_state, 
@@ -3105,11 +3204,29 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 		    && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
 			|| !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
                     )
+		{
 		    compat = 0;
+		}
 		ANYOF_CLASS_ZERO(data->start_class);
 		ANYOF_BITMAP_ZERO(data->start_class);
 		if (compat)
 		    ANYOF_BITMAP_SET(data->start_class, uc);
+		else if (uc >= 0x100) {
+		    int i;
+
+		    /* Some Unicode code points fold to the Latin1 range; as
+		     * XXX temporary code, instead of figuring out if this is
+		     * one, just assume it is and set all the start class bits
+		     * that could be some such above 255 code point's fold
+		     * which will generate false positives.  As the code
+		     * elsewhere that does compute the fold settles down, it
+		     * can be extracted out and re-used here */
+		    for (i = 0; i < 256; i++){
+			if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
+			    ANYOF_BITMAP_SET(data->start_class, i);
+			}
+		    }
+		}
 		data->start_class->flags &= ~ANYOF_EOS;
 		if (uc < 0x100)
 		  data->start_class->flags &= ~ANYOF_UNICODE_ALL;
@@ -3159,6 +3276,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 		    data->start_class->flags &= ~ANYOF_EOS;
 		    data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
 		    if (OP(scan) == EXACTFL) {
+			/* XXX This set is probably no longer necessary, and
+			 * probably wrong as LOCALE now is on in the initial
+			 * state */
 			data->start_class->flags |= ANYOF_LOCALE;
 		    }
 		    else {
@@ -3170,6 +3290,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 			ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
 		    }
 		}
+		else if (uc >= 0x100) {
+		    int i;
+		    for (i = 0; i < 256; i++){
+			if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
+			    ANYOF_BITMAP_SET(data->start_class, i);
+			}
+		    }
+		}
 	    }
 	    else if (flags & SCF_DO_STCLASS_OR) {
 		if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
@@ -3663,7 +3791,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 		    else {
 			if (data->start_class->flags & ANYOF_LOCALE)
 			    ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
-                        else if (OP(scan) == ALNUMU) {
+
+			/* Even if under locale, set the bits for non-locale
+			 * in case it isn't a true locale-node.  This will
+			 * create false positives if it truly is locale */
+                        if (OP(scan) == ALNUMU) {
                             for (value = 0; value < 256; value++) {
                                 if (isWORDCHAR_L1(value)) {
                                     ANYOF_BITMAP_SET(data->start_class, value);
@@ -3700,19 +3832,21 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 		    else {
 			if (data->start_class->flags & ANYOF_LOCALE)
 			    ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
-			else {
-                            if (OP(scan) == NALNUMU) {
-                                for (value = 0; value < 256; value++) {
-                                    if (! isWORDCHAR_L1(value)) {
-                                        ANYOF_BITMAP_SET(data->start_class, value);
-                                    }
-                                }
-                            } else {
-                                for (value = 0; value < 256; value++) {
-                                    if (! isALNUM(value)) {
-                                        ANYOF_BITMAP_SET(data->start_class, value);
-                                    }
-                                }
+
+			/* Even if under locale, set the bits for non-locale in
+			 * case it isn't a true locale-node.  This will create
+			 * false positives if it truly is locale */
+			if (OP(scan) == NALNUMU) {
+			    for (value = 0; value < 256; value++) {
+				if (! isWORDCHAR_L1(value)) {
+				    ANYOF_BITMAP_SET(data->start_class, value);
+				}
+			    }
+			} else {
+			    for (value = 0; value < 256; value++) {
+				if (! isALNUM(value)) {
+				    ANYOF_BITMAP_SET(data->start_class, value);
+				}
 			    }
 			}
 		    }
@@ -3740,7 +3874,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         if (data->start_class->flags & ANYOF_LOCALE) {
 			    ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
                         }
-                        else if (OP(scan) == SPACEU) {
+                        if (OP(scan) == SPACEU) {
                             for (value = 0; value < 256; value++) {
                                 if (isSPACE_L1(value)) {
                                     ANYOF_BITMAP_SET(data->start_class, value);
@@ -3777,7 +3911,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 		    else {
 			if (data->start_class->flags & ANYOF_LOCALE)
 			    ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
-                        else if (OP(scan) == NSPACEU) {
+                        if (OP(scan) == NSPACEU) {
                             for (value = 0; value < 256; value++) {
                                 if (!isSPACE_L1(value)) {
                                     ANYOF_BITMAP_SET(data->start_class, value);
@@ -3795,24 +3929,25 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 		    break;
 		case DIGIT:
 		    if (flags & SCF_DO_STCLASS_AND) {
-			ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
-			for (value = 0; value < 256; value++)
-			    if (!isDIGIT(value))
-				ANYOF_BITMAP_CLEAR(data->start_class, value);
+			if (!(data->start_class->flags & ANYOF_LOCALE)) {
+                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
+			    for (value = 0; value < 256; value++)
+				if (!isDIGIT(value))
+				    ANYOF_BITMAP_CLEAR(data->start_class, value);
+			}
 		    }
 		    else {
 			if (data->start_class->flags & ANYOF_LOCALE)
 			    ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
-			else {
-			    for (value = 0; value < 256; value++)
-				if (isDIGIT(value))
-				    ANYOF_BITMAP_SET(data->start_class, value);
-			}
+			for (value = 0; value < 256; value++)
+			    if (isDIGIT(value))
+				ANYOF_BITMAP_SET(data->start_class, value);
 		    }
 		    break;
 		case NDIGIT:
 		    if (flags & SCF_DO_STCLASS_AND) {
-			ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
+			if (!(data->start_class->flags & ANYOF_LOCALE))
+                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
 			for (value = 0; value < 256; value++)
 			    if (isDIGIT(value))
 				ANYOF_BITMAP_CLEAR(data->start_class, value);
@@ -3820,11 +3955,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 		    else {
 			if (data->start_class->flags & ANYOF_LOCALE)
 			    ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
-			else {
-			    for (value = 0; value < 256; value++)
-				if (!isDIGIT(value))
-				    ANYOF_BITMAP_SET(data->start_class, value);
-			}
+			for (value = 0; value < 256; value++)
+			    if (!isDIGIT(value))
+				ANYOF_BITMAP_SET(data->start_class, value);
 		    }
 		    break;
 		CASE_SYNST_FNC(VERTWS);
@@ -4383,6 +4516,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
     I32 sawplus = 0;
     I32 sawopen = 0;
     bool used_setjump = FALSE;
+    regex_charset initial_charset = get_regex_charset(orig_pm_flags);
 
     U8 jump_ret = 0;
     dJMPENV;
@@ -4401,6 +4535,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
 
     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
     RExC_uni_semantics = 0;
+    RExC_contains_locale = 0;
 
     /****************** LONG JUMP TARGET HERE***********************/
     /* Longjmp back to here if have to switch in midstream to utf8 */
@@ -4457,11 +4592,15 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
     restudied = 0;
 #endif
 
-    /* Set to use unicode semantics if the pattern is in utf8 and has the
-     * 'depends' charset specified, as it means unicode when utf8  */
     pm_flags = orig_pm_flags;
 
-    if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
+    if (initial_charset == REGEX_LOCALE_CHARSET) {
+	RExC_contains_locale = 1;
+    }
+    else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
+
+	/* Set to use unicode semantics if the pattern is in utf8 and has the
+	 * 'depends' charset specified, as it means unicode when utf8  */
 	set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
     }
 
@@ -4474,6 +4613,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
     RExC_seen_evals = 0;
     RExC_extralen = 0;
+    RExC_override_recoding = 0;
 
     /* First pass: determine size, legality. */
     RExC_parse = exp;
@@ -4982,14 +5122,13 @@ reStudy:
 	    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
 	    ri->regstclass = NULL;
 
-	/* If the synthetic start class were to ever be used when EOS is set,
-	 * that bit would have to be cleared, as it is shared with another */
 	if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
 	    && stclass_flag
 	    && !(data.start_class->flags & ANYOF_EOS)
 	    && !cl_is_anything(data.start_class))
 	{
 	    const U32 n = add_data(pRExC_state, 1, "f");
+	    data.start_class->flags |= ANYOF_IS_SYNTHETIC;
 
 	    Newx(RExC_rxi->data->data[n], 1,
 		struct regnode_charclass_class);
@@ -5057,12 +5196,11 @@ reStudy:
 	r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
 		= r->float_substr = r->float_utf8 = NULL;
 
-	/* If the synthetic start class were to ever be used when EOS is set,
-	 * that bit would have to be cleared, as it is shared with another */
 	if (!(data.start_class->flags & ANYOF_EOS)
 	    && !cl_is_anything(data.start_class))
 	{
 	    const U32 n = add_data(pRExC_state, 1, "f");
+	    data.start_class->flags |= ANYOF_IS_SYNTHETIC;
 
 	    Newx(RExC_rxi->data->data[n], 1,
 		struct regnode_charclass_class);
@@ -5940,7 +6078,7 @@ Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV
 }
 #endif
 
-PERL_STATIC_INLINE HV*
+STATIC HV*
 S_invlist_union(pTHX_ HV* const a, HV* const b)
 {
     /* Return a new inversion list which is the union of two inversion lists.
@@ -6091,7 +6229,7 @@ S_invlist_union(pTHX_ HV* const a, HV* const b)
     return u;
 }
 
-PERL_STATIC_INLINE HV*
+STATIC HV*
 S_invlist_intersection(pTHX_ HV* const a, HV* const b)
 {
     /* Return the intersection of two inversion lists.  The basis for this
@@ -6216,18 +6354,25 @@ S_invlist_intersection(pTHX_ HV* const a, HV* const b)
 }
 
 STATIC HV*
-S_add_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
+S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end)
 {
     /* Add the range from 'start' to 'end' inclusive to the inversion list's
      * set.  A pointer to the inversion list is returned.  This may actually be
-     * a new list, in which case the passed in one has been destroyed */
+     * a new list, in which case the passed in one has been destroyed.  The
+     * passed in inversion list can be NULL, in which case a new one is created
+     * with just the one range in it */
 
     HV* range_invlist;
     HV* added_invlist;
+    UV len;
 
-    UV len = invlist_len(invlist);
-
-    PERL_ARGS_ASSERT_ADD_RANGE_TO_INVLIST;
+    if (invlist == NULL) {
+	invlist = _new_invlist(2);
+	len = 0;
+    }
+    else {
+	len = invlist_len(invlist);
+    }
 
     /* If comes after the final entry, can just append it to the end */
     if (len == 0
@@ -6254,6 +6399,11 @@ S_add_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
     return added_invlist;
 }
 
+PERL_STATIC_INLINE HV*
+S_add_cp_to_invlist(pTHX_ HV* invlist, const UV cp) { /* Add the single code point 'cp' to 'invlist'; returns the resulting list */
+    return add_range_to_invlist(invlist, cp, cp); /* expressed as the one-element range cp..cp */
+}
+
 /* End of inversion list object */
 
 /*
@@ -6452,11 +6602,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 		    ret = reganode(pRExC_state,
 				   ((! FOLD)
 				     ? NREF
-				     : (UNI_SEMANTICS)
-				       ? NREFFU
-				       : (LOC)
-				         ? NREFFL
-					 : NREFF),
+				     : (MORE_ASCII_RESTRICTED)
+				       ? NREFFA
+                                       : (AT_LEAST_UNI_SEMANTICS)
+                                         ? NREFFU
+                                         : (LOC)
+                                           ? NREFFL
+                                           : NREFF),
 				    num);
                     *flagp |= HASWIDTH;
 
@@ -6896,7 +7048,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 U32 posflags = 0, negflags = 0;
 	        U32 *flagsp = &posflags;
                 bool has_charset_modifier = 0;
-		regex_charset cs = REGEX_DEPENDS_CHARSET;
+		regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
+				    ? REGEX_UNICODE_CHARSET
+				    : REGEX_DEPENDS_CHARSET;
 
 		while (*RExC_parse) {
 		    /* && strchr("iogcmsx", *RExC_parse) */
@@ -6910,6 +7064,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         }
 			cs = REGEX_LOCALE_CHARSET;
                         has_charset_modifier = 1;
+			RExC_contains_locale = 1;
                         break;
                     case UNICODE_PAT_MOD:
                         if (has_charset_modifier || flagsp == &negflags) {
@@ -6922,7 +7077,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         if (has_charset_modifier || flagsp == &negflags) {
                             goto fail_modifiers;
                         }
-			cs = REGEX_ASCII_RESTRICTED_CHARSET;
+			if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) {
+			    /* Doubled modifier implies more restricted */
+			    cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
+			    RExC_parse++;
+			}
+			else {
+			    cs = REGEX_ASCII_RESTRICTED_CHARSET;
+			}
                         has_charset_modifier = 1;
                         break;
                     case DEPENDS_PAT_MOD:
@@ -7055,12 +7217,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
     parse_start = RExC_parse;   /* MJD */
     br = regbranch(pRExC_state, &flags, 1,depth+1);
 
-    if (freeze_paren) {
-        if (RExC_npar > after_freeze)
-            after_freeze = RExC_npar;
-        RExC_npar = freeze_paren;
-    }
-
     /*     branch_len = (paren != 0); */
 
     if (br == NULL)
@@ -7204,7 +7360,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
     if (RExC_in_lookbehind) {
 	RExC_in_lookbehind--;
     }
-    if (after_freeze)
+    if (after_freeze > RExC_npar)
         RExC_npar = after_freeze;
     return(ret);
 }
@@ -7477,7 +7633,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 }
 
 
-/* reg_namedseq(pRExC_state,UVp)
+/* reg_namedseq(pRExC_state, UVp, flagp, U32 depth)
    
    This is expected to be called by a parser routine that has 
    recognized '\N' and needs to handle the rest. RExC_parse is
@@ -7520,13 +7676,10 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
    Parsing failures will generate a fatal error via vFAIL(...)
  */
 STATIC regnode *
-S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
+S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
 {
     char * endbrace;    /* '}' following the name */
     regnode *ret = NULL;
-#ifdef DEBUGGING
-    char* parse_start = RExC_parse - 2;	    /* points to the '\N' */
-#endif
     char* p;
 
     GET_RE_DEBUG_FLAGS_DECL;
@@ -7639,117 +7792,55 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
         ret = (regnode *) &RExC_parse;	/* Invalid regnode pointer */
     }
     else {	/* Not a char class */
-	char *s;	    /* String to put in generated EXACT node */
-	STRLEN len = 0;	    /* Its current byte length */
+
+	/* What is done here is to convert this to a sub-pattern of the form
+	 * (?:\x{char1}\x{char2}...)
+	 * and then call reg recursively.  That way, it retains its atomicness,
+	 * while not having to worry about special handling that some code
+	 * points may have.  toke.c has converted the original Unicode values
+	 * to native, so that we can just pass on the hex values unchanged.  We
+	 * do have to set a flag to keep recoding from happening in the
+	 * recursion */
+
+	SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
+	STRLEN len;
 	char *endchar;	    /* Points to '.' or '}' ending cur char in the input
 			       stream */
+	char *orig_end = RExC_end;
 
-	ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
-						   : (LOC)
-						      ? EXACTFL
-						      : UNI_SEMANTICS
-						        ? EXACTFU
-						        : EXACTF));
-	s= STRING(ret);
-
-	/* Exact nodes can hold only a U8 length's of text = 255.  Loop through
-	 * the input which is of the form now 'c1.c2.c3...}' until find the
-	 * ending brace or exceed length 255.  The characters that exceed this
-	 * limit are dropped.  The limit could be relaxed should it become
-	 * desirable by reparsing this as (?:\N{NAME}), so could generate
-	 * multiple EXACT nodes, as is done for just regular input.  But this
-	 * is primarily a named character, and not intended to be a huge long
-	 * string, so 255 bytes should be good enough */
-	while (1) {
-	    STRLEN length_of_hex;
-	    I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
-			    | PERL_SCAN_DISALLOW_PREFIX
-			    | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
-	    UV cp;  /* Ord of current character */
+	while (RExC_parse < endbrace) {
 
 	    /* Code points are separated by dots.  If none, there is only one
 	     * code point, and is terminated by the brace */
 	    endchar = RExC_parse + strcspn(RExC_parse, ".}");
 
-	    /* The values are Unicode even on EBCDIC machines */
-	    length_of_hex = (STRLEN)(endchar - RExC_parse);
-	    cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
-	    if ( length_of_hex == 0 
-		|| length_of_hex != (STRLEN)(endchar - RExC_parse) )
-	    {
-		RExC_parse += length_of_hex;	    /* Includes all the valid */
-		RExC_parse += (RExC_orig_utf8)	/* point to after 1st invalid */
-				? UTF8SKIP(RExC_parse)
-				: 1;
-		/* Guard against malformed utf8 */
-		if (RExC_parse >= endchar) RExC_parse = endchar;
-		vFAIL("Invalid hexadecimal number in \\N{U+...}");
-	    }    
-
-	    if (! FOLD) {	/* Not folding, just append to the string */
-		STRLEN unilen;
-
-		/* Quit before adding this character if would exceed limit */
-		if (len + UNISKIP(cp) > U8_MAX) break;
-
-		unilen = reguni(pRExC_state, cp, s);
-		if (unilen > 0) {
-		    s   += unilen;
-		    len += unilen;
-		}
-	    } else {	/* Folding, output the folded equivalent */
-		STRLEN foldlen,numlen;
-		U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
-		cp = toFOLD_uni(cp, tmpbuf, &foldlen);
-
-		/* Quit before exceeding size limit */
-		if (len + foldlen > U8_MAX) break;
-		
-		for (foldbuf = tmpbuf;
-		    foldlen;
-		    foldlen -= numlen) 
-		{
-		    cp = utf8_to_uvchr(foldbuf, &numlen);
-		    if (numlen > 0) {
-			const STRLEN unilen = reguni(pRExC_state, cp, s);
-			s       += unilen;
-			len     += unilen;
-			/* In EBCDIC the numlen and unilen can differ. */
-			foldbuf += numlen;
-			if (numlen >= foldlen)
-			    break;
-		    }
-		    else
-			break; /* "Can't happen." */
-		}                          
-	    }
+	    /* Convert to notation the rest of the code understands */
+	    sv_catpv(substitute_parse, "\\x{");
+	    sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
+	    sv_catpv(substitute_parse, "}");
 
 	    /* Point to the beginning of the next character in the sequence. */
 	    RExC_parse = endchar + 1;
-
-	    /* Quit if no more characters */
-	    if (RExC_parse >= endbrace) break;
 	}
+	sv_catpv(substitute_parse, ")");
 
+	RExC_parse = SvPV(substitute_parse, len);
 
-	if (SIZE_ONLY) {
-	    if (RExC_parse < endbrace) {
-		ckWARNreg(RExC_parse - 1,
-			  "Using just the first characters returned by \\N{}");
-	    }
-
-	    RExC_size += STR_SZ(len);
-	} else {
-	    STR_LEN(ret) = len;
-	    RExC_emit += STR_SZ(len);
+	/* Don't allow empty number */
+	if (len < 8) {
+	    vFAIL("Invalid hexadecimal number in \\N{U+...}");
 	}
+	RExC_end = RExC_parse + len;
 
-	RExC_parse = endbrace + 1;
+	/* The values are Unicode, and therefore not subject to recoding */
+	RExC_override_recoding = 1;
+
+	ret = reg(pRExC_state, 1, flagp, depth+1);
+
+	RExC_parse = endbrace;
+	RExC_end = orig_end;
+	RExC_override_recoding = 0;
 
-	*flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
-			       with malformed in t/re/pat_advanced.t */
-	RExC_parse --;
-	Set_Node_Cur_Length(ret); /* MJD */
 	nextchar(pRExC_state);
     }
 
@@ -7912,27 +8003,6 @@ tryagain:
 	RExC_parse++;
 	vFAIL("Quantifier follows nothing");
 	break;
-    case LATIN_SMALL_LETTER_SHARP_S:
-    case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
-    case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
-#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
-#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ.  Other instances in this code should have the case statement below.
-    case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
-#endif
-        do_foldchar:
-        if (!LOC && FOLD) {
-            U32 len,cp;
-	    len=0; /* silence a spurious compiler warning */
-            if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
-                *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
-                RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
-                ret = reganode(pRExC_state, FOLDCHAR, cp);
-                Set_Node_Length(ret, 1); /* MJD */
-                nextchar(pRExC_state); /* kill whitespace under /x */
-                return ret;
-            }
-        }
-        goto outer_default;
     case '\\':
 	/* Special Escapes
 
@@ -7947,10 +8017,6 @@ tryagain:
 	   literal text handling code.
 	*/
 	switch ((U8)*++RExC_parse) {
-	case LATIN_SMALL_LETTER_SHARP_S:
-	case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
-	case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
-	           goto do_foldchar;	    
 	/* Special Escapes */
 	case 'A':
 	    RExC_seen_zerolen++;
@@ -8000,6 +8066,7 @@ tryagain:
 		    op = ALNUMU;
 		    break;
 		case REGEX_ASCII_RESTRICTED_CHARSET:
+		case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
 		    op = ALNUMA;
 		    break;
 		case REGEX_DEPENDS_CHARSET:
@@ -8020,6 +8087,7 @@ tryagain:
 		    op = NALNUMU;
 		    break;
 		case REGEX_ASCII_RESTRICTED_CHARSET:
+		case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
 		    op = NALNUMA;
 		    break;
 		case REGEX_DEPENDS_CHARSET:
@@ -8042,6 +8110,7 @@ tryagain:
 		    op = BOUNDU;
 		    break;
 		case REGEX_ASCII_RESTRICTED_CHARSET:
+		case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
 		    op = BOUNDA;
 		    break;
 		case REGEX_DEPENDS_CHARSET:
@@ -8053,6 +8122,9 @@ tryagain:
 	    ret = reg_node(pRExC_state, op);
 	    FLAGS(ret) = get_regex_charset(RExC_flags);
 	    *flagp |= SIMPLE;
+	    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
+		ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
+	    }
 	    goto finish_meta_pat;
 	case 'B':
 	    RExC_seen_zerolen++;
@@ -8065,6 +8137,7 @@ tryagain:
 		    op = NBOUNDU;
 		    break;
 		case REGEX_ASCII_RESTRICTED_CHARSET:
+		case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
 		    op = NBOUNDA;
 		    break;
 		case REGEX_DEPENDS_CHARSET:
@@ -8076,6 +8149,9 @@ tryagain:
 	    ret = reg_node(pRExC_state, op);
 	    FLAGS(ret) = get_regex_charset(RExC_flags);
 	    *flagp |= SIMPLE;
+	    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
+		ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
+	    }
 	    goto finish_meta_pat;
 	case 's':
 	    switch (get_regex_charset(RExC_flags)) {
@@ -8086,6 +8162,7 @@ tryagain:
 		    op = SPACEU;
 		    break;
 		case REGEX_ASCII_RESTRICTED_CHARSET:
+		case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
 		    op = SPACEA;
 		    break;
 		case REGEX_DEPENDS_CHARSET:
@@ -8106,6 +8183,7 @@ tryagain:
 		    op = NSPACEU;
 		    break;
 		case REGEX_ASCII_RESTRICTED_CHARSET:
+		case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
 		    op = NSPACEA;
 		    break;
 		case REGEX_DEPENDS_CHARSET:
@@ -8123,6 +8201,7 @@ tryagain:
 		    op = DIGITL;
 		    break;
 		case REGEX_ASCII_RESTRICTED_CHARSET:
+		case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
 		    op = DIGITA;
 		    break;
 		case REGEX_DEPENDS_CHARSET: /* No difference between these */
@@ -8141,6 +8220,7 @@ tryagain:
 		    op = NDIGITL;
 		    break;
 		case REGEX_ASCII_RESTRICTED_CHARSET:
+		case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
 		    op = NDIGITA;
 		    break;
 		case REGEX_DEPENDS_CHARSET: /* No difference between these */
@@ -8219,7 +8299,7 @@ tryagain:
             Also this makes sure that things like /\N{BLAH}+/ and 
             \N{BLAH} being multi char Just Happen. dmq*/
             ++RExC_parse;
-            ret= reg_namedseq(pRExC_state, NULL, flagp); 
+            ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
             break;
 	case 'k':    /* Handle \k<NAME> and \k'NAME' */
 	parse_named_seq:
@@ -8249,11 +8329,13 @@ tryagain:
                 ret = reganode(pRExC_state,
                                ((! FOLD)
                                  ? NREF
-                                 : (AT_LEAST_UNI_SEMANTICS)
-                                   ? NREFFU
-                                   : (LOC)
-                                     ? NREFFL
-                                     : NREFF),
+				 : (MORE_ASCII_RESTRICTED)
+				   ? NREFFA
+                                   : (AT_LEAST_UNI_SEMANTICS)
+                                     ? NREFFU
+                                     : (LOC)
+                                       ? NREFFL
+                                       : NREFF),
                                 num);
                 *flagp |= HASWIDTH;
 
@@ -8317,11 +8399,13 @@ tryagain:
 		    ret = reganode(pRExC_state,
 				   ((! FOLD)
 				     ? REF
-				     : (AT_LEAST_UNI_SEMANTICS)
-				       ? REFFU
-				       : (LOC)
-				         ? REFFL
-					 : REFF),
+				     : (MORE_ASCII_RESTRICTED)
+				       ? REFFA
+                                       : (AT_LEAST_UNI_SEMANTICS)
+                                         ? REFFU
+                                         : (LOC)
+                                           ? REFFL
+                                           : REFF),
 				    num);
 		    *flagp |= HASWIDTH;
 
@@ -8353,27 +8437,42 @@ tryagain:
 	/* FALL THROUGH */
 
     default:
-        outer_default:{
+
+            parse_start = RExC_parse - 1;
+
+	    RExC_parse++;
+
+	defchar: {
+	    typedef enum {
+		generic_char = 0,
+		char_s,
+		upsilon_1,
+		upsilon_2,
+		iota_1,
+		iota_2,
+	    } char_state;
+	    char_state latest_char_state = generic_char;
 	    register STRLEN len;
 	    register UV ender;
 	    register char *p;
 	    char *s;
 	    STRLEN foldlen;
 	    U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
+	    regnode * orig_emit;
 
-            parse_start = RExC_parse - 1;
-
-	    RExC_parse++;
-
-	defchar:
 	    ender = 0;
+	    orig_emit = RExC_emit; /* Save the original output node position in
+				      case we need to output a different node
+				      type */
 	    ret = reg_node(pRExC_state,
 			   (U8) ((! FOLD) ? EXACT
 					  : (LOC)
 					     ? EXACTFL
-					     : (AT_LEAST_UNI_SEMANTICS)
-					       ? EXACTFU
-					       : EXACTF)
+					     : (MORE_ASCII_RESTRICTED)
+					       ? EXACTFA
+					       : (AT_LEAST_UNI_SEMANTICS)
+					         ? EXACTFU
+					         : EXACTF)
 		    );
 	    s = STRING(ret);
 	    for (len = 0, p = RExC_parse - 1;
@@ -8385,11 +8484,6 @@ tryagain:
 		if (RExC_flags & RXf_PMf_EXTENDED)
 		    p = regwhite( pRExC_state, p );
 		switch ((U8)*p) {
-		case LATIN_SMALL_LETTER_SHARP_S:
-		case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
-		case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
-		           if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
-		                goto normal_default;
 		case '^':
 		case '$':
 		case '.':
@@ -8414,11 +8508,6 @@ tryagain:
 
 		    switch ((U8)*++p) {
 		    /* These are all the special escapes. */
-                    case LATIN_SMALL_LETTER_SHARP_S:
-                    case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
-                    case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
-    		           if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
-    		                goto normal_default;		    
 		    case 'A':             /* Start assertion */
 		    case 'b': case 'B':   /* Word-boundary assertion*/
 		    case 'C':             /* Single char !DANGEROUS! */
@@ -8545,7 +8634,7 @@ tryagain:
 			    goto recode_encoding;
 			break;
 		    recode_encoding:
-			{
+			if (! RExC_override_recoding) {
 			    SV* enc = PL_encoding;
 			    ender = reg_recode((const char)(U8)ender, &enc);
 			    if (!enc && SIZE_ONLY)
@@ -8577,14 +8666,272 @@ tryagain:
 			p += numlen;
 		    }
 		    else
-			ender = *p++;
+			ender = (U8) *p++;
 		    break;
+		} /* End of switch on the literal */
+
+		/* Certain characters are problematic because their folded
+		 * length is so different from their original length that it
+		 * isn't handleable by the optimizer.  They are therefore not
+		 * placed in an EXACTish node; and are here handled specially.
+		 * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
+		 * putting it in a special node keeps regexec from having to
+		 * deal with a non-utf8 multi-char fold) */
+		if (FOLD
+		    && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC)))
+		{
+		    /* We look for either side of the fold.  For example \xDF
+		     * folds to 'ss'.  We look for both the single character
+		     * \xDF and the sequence 'ss'.  When we find something that
+		     * could be one of those, we stop and flush whatever we
+		     * have output so far into the EXACTish node that was being
+		     * built.  Then restore the input pointer to what it was.
+		     * regatom will return that EXACT node, and will be called
+		     * again, positioned so the first character is the one in
+		     * question, which we return in a different node type.
+		     * The multi-char folds are a sequence, so the occurrence
+		     * of the first character in that sequence doesn't
+		     * necessarily mean that what follows is the rest of the
+		     * sequence.  We keep track of that with a state machine,
+		     * with the state being set to the latest character
+		     * processed before the current one.  Most characters will
+		     * set the state to 0, but if one occurs that is part of a
+		     * potential tricky fold sequence, the state is set to that
+		     * character, and the next loop iteration sees if the state
+		     * should progress towards the final folded-from character,
+		     * or if it was a false alarm.  If it turns out to be a
+		     * false alarm, the character(s) will be output in a new
+		     * EXACTish node, and join_exact() will later combine them.
+		     * In the case of the 'ss' sequence, which is more common
+		     * and more easily checked, some look-ahead is done to
+		     * save time by ruling-out some false alarms */
+		    switch (ender) {
+			default:
+			    latest_char_state = generic_char;
+			    break;
+			case 's':
+			case 'S':
+			     if (AT_LEAST_UNI_SEMANTICS) {
+				if (latest_char_state == char_s) {  /* 'ss' */
+				    ender = LATIN_SMALL_LETTER_SHARP_S;
+				    goto do_tricky;
+				}
+				else if (p < RExC_end) {
+
+				    /* Look-ahead at the next character.  If it
+				     * is also an s, we handle as a sharp s
+				     * tricky regnode.  */
+				    if (*p == 's' || *p == 'S') {
+
+					/* But first flush anything in the
+					 * EXACTish buffer */
+					if (len != 0) {
+					    p = oldp;
+					    goto loopdone;
+					}
+					p++;	/* Account for swallowing this
+						   's' up */
+					ender = LATIN_SMALL_LETTER_SHARP_S;
+					goto do_tricky;
+				    }
+					/* Here, the next character is not a
+					 * literal 's', but still could
+					 * evaluate to one if part of a \o{},
+					 * \x or \OCTAL-DIGIT.  The minimum
+					 * length required for that is 4, eg
+					 * \x53 or \123 */
+				    else if (*p == '\\'
+					     && p < RExC_end - 4
+					     && (isDIGIT(*(p + 1))
+						 || *(p + 1) == 'x'
+						 || *(p + 1) == 'o' ))
+				    {
+
+					/* Here, it could be an 's', but it is
+					 * too much bother to figure that out
+					 * here.  Flush the buffer, if any; when
+					 * we come back here, set the state so we
+					 * know the previous char was an 's' */
+					if (len != 0) {
+					    latest_char_state = generic_char;
+					    p = oldp;
+					    goto loopdone;
+					}
+					latest_char_state = char_s;
+					break;
+				    }
+				}
+			    }
+
+			    /* Here, can't be an 'ss' sequence, or at least not
+			     * one that could fold to/from the sharp ss */
+			    latest_char_state = generic_char;
+			    break;
+			case 0x03C5:	/* First char in upsilon series */
+			    if (p < RExC_end - 4) { /* Need >= 4 bytes left */
+				latest_char_state = upsilon_1;
+				if (len != 0) {
+				    p = oldp;
+				    goto loopdone;
+				}
+			    }
+			    else {
+				latest_char_state = generic_char;
+			    }
+			    break;
+			case 0x03B9:	/* First char in iota series */
+			    if (p < RExC_end - 4) {
+				latest_char_state = iota_1;
+				if (len != 0) {
+				    p = oldp;
+				    goto loopdone;
+				}
+			    }
+			    else {
+				latest_char_state = generic_char;
+			    }
+			    break;
+			case 0x0308:
+			    if (latest_char_state == upsilon_1) {
+				latest_char_state = upsilon_2;
+			    }
+			    else if (latest_char_state == iota_1) {
+				latest_char_state = iota_2;
+			    }
+			    else {
+				latest_char_state = generic_char;
+			    }
+			    break;
+			case 0x301:
+			    if (latest_char_state == upsilon_2) {
+				ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS;
+				goto do_tricky;
+			    }
+			    else if (latest_char_state == iota_2) {
+				ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS;
+				goto do_tricky;
+			    }
+			    latest_char_state = generic_char;
+			    break;
+
+			/* These are the tricky fold characters.  Flush any
+			 * buffer first. */
+			case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS:
+			case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS:
+			case LATIN_SMALL_LETTER_SHARP_S:
+			case LATIN_CAPITAL_LETTER_SHARP_S:
+			case 0x1FD3:
+			case 0x1FE3:
+			    if (len != 0) {
+				p = oldp;
+				goto loopdone;
+			    }
+			    /* FALL THROUGH */
+			do_tricky: {
+			    char* const oldregxend = RExC_end;
+			    U8 tmpbuf[UTF8_MAXBYTES+1];
+
+			    /* Here, we know we need to generate a special
+			     * regnode, and 'ender' contains the tricky
+			     * character.  What's done is to pretend it's in a
+			     * [bracketed] class, and let the code that deals
+			     * with those handle it, as that code has all the
+			     * intelligence necessary.  First save the current
+			     * parse state, get rid of the already allocated
+			     * but empty EXACT node that the ANYOFV node will
+			     * replace, and point the parse to a buffer which
+			     * we fill with the character we want the regclass
+			     * code to think is being parsed */
+			    RExC_emit = orig_emit;
+			    RExC_parse = (char *) tmpbuf;
+			    if (UTF) {
+				U8 *d = uvchr_to_utf8(tmpbuf, ender);
+				*d = '\0';
+				RExC_end = (char *) d;
+			    }
+			    else {  /* ender above 255 already excluded */
+				tmpbuf[0] = (U8) ender;
+				tmpbuf[1] = '\0';
+				RExC_end = RExC_parse + 1;
+			    }
+
+			    ret = regclass(pRExC_state,depth+1);
+
+			    /* Here, have parsed the buffer.  Reset the parse to
+			     * the actual input, and return */
+			    RExC_end = oldregxend;
+			    RExC_parse = p - 1;
+
+			    Set_Node_Offset(ret, RExC_parse);
+			    Set_Node_Cur_Length(ret);
+			    nextchar(pRExC_state);
+			    *flagp |= HASWIDTH|SIMPLE;
+			    return ret;
+			}
+		    }
 		}
+
 		if ( RExC_flags & RXf_PMf_EXTENDED)
 		    p = regwhite( pRExC_state, p );
 		if (UTF && FOLD) {
-		    /* Prime the casefolded buffer. */
-		    ender = toFOLD_uni(ender, tmpbuf, &foldlen);
+		    /* Prime the casefolded buffer.  Locale rules, which apply
+		     * only to code points < 256, aren't known until execution,
+		     * so for them, just output the original character using
+		     * utf8 */
+		    if (LOC && ender < 256) {
+			if (UNI_IS_INVARIANT(ender)) {
+			    *tmpbuf = (U8) ender;
+			    foldlen = 1;
+			} else {
+			    *tmpbuf = UTF8_TWO_BYTE_HI(ender);
+			    *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
+			    foldlen = 2;
+			}
+		    }
+		    else if (isASCII(ender)) {	/* Note: Here can't also be LOC
+						 */
+			ender = toLOWER(ender);
+			*tmpbuf = (U8) ender;
+			foldlen = 1;
+		    }
+		    else if (! MORE_ASCII_RESTRICTED && ! LOC) {
+
+			/* Locale and /aa require more selectivity about the
+			 * fold, so are handled below.  Otherwise, here, just
+			 * use the fold */
+			ender = toFOLD_uni(ender, tmpbuf, &foldlen);
+		    }
+		    else {
+			/* Under locale rules or /aa we are not to mix,
+			 * respectively, ords < 256 or ASCII with non-.  So
+			 * reject folds that mix them, using only the
+			 * non-folded code point.  So do the fold to a
+			 * temporary, and inspect each character in it. */
+			U8 trialbuf[UTF8_MAXBYTES_CASE+1];
+			U8* s = trialbuf;
+			UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
+			U8* e = s + foldlen;
+			bool fold_ok = TRUE;
+
+			while (s < e) {
+			    if (isASCII(*s)
+				|| (LOC && (UTF8_IS_INVARIANT(*s)
+					   || UTF8_IS_DOWNGRADEABLE_START(*s))))
+			    {
+				fold_ok = FALSE;
+				break;
+			    }
+			    s += UTF8SKIP(s);
+			}
+			if (fold_ok) {
+			    Copy(trialbuf, tmpbuf, foldlen, U8);
+			    ender = tmpender;
+			}
+			else {
+			    uvuni_to_utf8(tmpbuf, ender);
+			    foldlen = UNISKIP(ender);
+			}
+		    }
 		}
 		if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
 		    if (len)
@@ -8656,10 +9003,12 @@ tryagain:
 		     }
 		     len--;
 		}
-		else
+		else {
 		    REGC((char)ender, s++);
+		}
 	    }
-	loopdone:
+	loopdone:   /* Jumped to when we encounter something that shouldn't be
+		       in the node */
 	    RExC_parse = p - 1;
             Set_Node_Cur_Length(ret); /* MJD */
 	    nextchar(pRExC_state);
@@ -8882,14 +9231,14 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
 ANYOF_##NAME:                                                                  \
 	for (value = 0; value < 256; value++)                                  \
 	    if (TEST)                                                          \
-	    stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);  \
+	    stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
     yesno = '+';                                                               \
     what = WORD;                                                               \
     break;                                                                     \
 case ANYOF_N##NAME:                                                            \
 	for (value = 0; value < 256; value++)                                  \
 	    if (!TEST)                                                         \
-	    stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);  \
+	    stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
     yesno = '!';                                                               \
     what = WORD;                                                               \
     break
@@ -8904,14 +9253,14 @@ ANYOF_##NAME:                                                                  \
     else if (UNI_SEMANTICS) {                                                  \
         for (value = 0; value < 256; value++) {                                \
             if (TEST_8(value)) stored +=                                       \
-                      S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);  \
+                      set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
         }                                                                      \
     }                                                                          \
     else {                                                                     \
         for (value = 0; value < 128; value++) {                                \
             if (TEST_7(UNI_TO_NATIVE(value))) stored +=                        \
-		S_set_regclass_bit(aTHX_ pRExC_state, ret,                     \
-			           (U8) UNI_TO_NATIVE(value), &nonbitmap);                 \
+		set_regclass_bit(pRExC_state, ret,                     \
+			           (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);                 \
         }                                                                      \
     }                                                                          \
     yesno = '+';                                                               \
@@ -8922,20 +9271,20 @@ case ANYOF_N##NAME:                                                            \
     else if (UNI_SEMANTICS) {                                                  \
         for (value = 0; value < 256; value++) {                                \
             if (! TEST_8(value)) stored +=                                     \
-		    S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);    \
+		    set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);    \
         }                                                                      \
     }                                                                          \
     else {                                                                     \
         for (value = 0; value < 128; value++) {                                \
-            if (! TEST_7(UNI_TO_NATIVE(value))) stored += S_set_regclass_bit(  \
-			aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap);    \
+            if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit(  \
+			pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);    \
         }                                                                      \
-	if (ASCII_RESTRICTED) {                                                \
+	if (AT_LEAST_ASCII_RESTRICTED) {                                       \
 	    for (value = 128; value < 256; value++) {                          \
-             stored += S_set_regclass_bit(                                     \
-			   aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \
+             stored += set_regclass_bit(                                     \
+			   pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
 	    }                                                                  \
-	    ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8;                  \
+	    ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;                             \
 	}                                                                      \
 	else {                                                                 \
 	    /* For a non-ut8 target string with DEPENDS semantics, all above   \
@@ -8943,69 +9292,151 @@ case ANYOF_N##NAME:                                                            \
 	     * classes.  But in utf8, they have their Unicode semantics, so    \
 	     * can't just set them in the bitmap, or else regexec.c will think \
 	     * they matched when they shouldn't. */                            \
-	    ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8;          \
+	    ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;                     \
 	}                                                                      \
     }                                                                          \
     yesno = '!';                                                               \
     what = WORD;                                                               \
     break
 
-/* 
-   We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
-   so that it is possible to override the option here without having to 
-   rebuild the entire core. as we are required to do if we change regcomp.h
-   which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
-*/
-#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
-#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
-#endif
-
-#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
-#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
-#else
-#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
-#endif
-
 STATIC U8
-S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr)
+S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
 {
 
     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
      * Locale folding is done at run-time, so this function should not be
      * called for nodes that are for locales.
      *
-     * This function simply sets the bit corresponding to the fold of the input
+     * This function sets the bit corresponding to the fold of the input
      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
      * 'F' is 'f'.
      *
-     * It also sets any necessary flags, and returns the number of bits that
-     * actually changed from 0 to 1 */
+     * It also knows about the characters that are in the bitmap that have
+     * folds that are matchable only outside it, and sets the appropriate lists
+     * and flags.
+     *
+     * It returns the number of bits that actually changed from 0 to 1 */
 
     U8 stored = 0;
     U8 fold;
 
+    PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
+
     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
-                           : PL_fold[value];
+                                    : PL_fold[value];
 
     /* It assumes the bit for 'value' has already been set */
     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
         ANYOF_BITMAP_SET(node, fold);
         stored++;
     }
-    if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
-	|| (! UNI_SEMANTICS
-	    && ! isASCII(value)
-	    && PL_fold_latin1[value] != value))
-    {   /* A character that has a fold outside of Latin1 matches outside the
-           bitmap, but only when the target string is utf8.  Similarly when we
-           don't have unicode semantics for the above ASCII Latin-1 characters,
-           and they have a fold, they should match if the target is utf8, and
-           not otherwise */
-	if (! *nonbitmap_ptr) {
-	    *nonbitmap_ptr = _new_invlist(2);
+    if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
+	/* Certain Latin1 characters have matches outside the bitmap.  To get
+	 * here, 'value' is one of those characters.   None of these matches is
+	 * valid for ASCII characters under /aa, which have been excluded by
+	 * the 'if' above.  The matches fall into three categories:
+	 * 1) They are singly folded-to or -from an above 255 character, as
+	 *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
+	 *    WITH DIAERESIS;
+	 * 2) They are part of a multi-char fold with another character in the
+	 *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
+	 * 3) They are part of a multi-char fold with a character not in the
+	 *    bitmap, such as various ligatures.
+	 * We aren't dealing fully with multi-char folds, except we do deal
+	 * with the pattern containing a character that has a multi-char fold
+	 * (not so much the inverse).
+	 * For types 1) and 3), the matches only happen when the target string
+	 * is utf8; that's not true for 2), and we set a flag for it.
+	 *
+	 * The code below adds to the passed in inversion list the single fold
+	 * closures for 'value'.  The values are hard-coded here so that an
+	 * innocent-looking character class, like /[ks]/i won't have to go out
+	 * to disk to find the possible matches.  XXX It would be better to
+	 * generate these via regen, in case a new version of the Unicode
+	 * standard adds new mappings, though that is not really likely. */
+	switch (value) {
+	    case 'k':
+	    case 'K':
+		/* KELVIN SIGN */
+		*invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
+		break;
+	    case 's':
+	    case 'S':
+		/* LATIN SMALL LETTER LONG S */
+		*invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
+		break;
+	    case MICRO_SIGN:
+		*invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+						 GREEK_SMALL_LETTER_MU);
+		*invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+						 GREEK_CAPITAL_LETTER_MU);
+		break;
+	    case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
+	    case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
+		/* ANGSTROM SIGN */
+		*invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
+		if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
+		    *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+						     PL_fold_latin1[value]);
+		}
+		break;
+	    case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
+		*invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+					LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
+		break;
+	    case LATIN_SMALL_LETTER_SHARP_S:
+		*invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+					LATIN_CAPITAL_LETTER_SHARP_S);
+
+		/* Under /a, /d, and /u, this can match the two chars "ss" */
+		if (! MORE_ASCII_RESTRICTED) {
+		    add_alternate(alternate_ptr, (U8 *) "ss", 2);
+
+		    /* And under /u or /a, it can match even if the target is
+		     * not utf8 */
+		    if (AT_LEAST_UNI_SEMANTICS) {
+			ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
+		    }
+		}
+		break;
+	    case 'F': case 'f':
+	    case 'I': case 'i':
+	    case 'L': case 'l':
+	    case 'T': case 't':
+		/* These all are targets of multi-character folds, which can
+		 * occur with only Latin1 characters in the fold, so they
+		 * can match even if the target string isn't UTF-8 */
+		ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
+		break;
+	    case 'A': case 'a':
+	    case 'H': case 'h':
+	    case 'J': case 'j':
+	    case 'N': case 'n':
+	    case 'W': case 'w':
+	    case 'Y': case 'y':
+		/* These all are targets of multi-character folds, which occur
+		 * only with a non-Latin1 character as part of the fold, so
+		 * they can't match unless the target string is in UTF-8, so no
+		 * action here is necessary */
+		break;
+	    default:
+		/* Use deprecated warning to increase the chances of this
+		 * being output */
+		ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
+		break;
 	}
-	*nonbitmap_ptr = add_range_to_invlist(*nonbitmap_ptr, value, value);
-	ANYOF_FLAGS(node) |= ANYOF_UTF8;
+    }
+    else if (DEPENDS_SEMANTICS
+	    && ! isASCII(value)
+	    && PL_fold_latin1[value] != value)
+    {
+	   /* Under DEPENDS rules, non-ASCII Latin1 characters match their
+	    * folds only when the target string is in UTF-8.  We add the fold
+	    * here to the list of things to match outside the bitmap, which
+	    * won't be looked at unless the target is UTF-8 (or unless something
+	    * else says to look even if not UTF-8, but such things had better
+	    * not happen under DEPENDS semantics). */
+	*invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
     }
 
     return stored;
@@ -9013,7 +9444,7 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8
 
 
 PERL_STATIC_INLINE U8
-S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr)
+S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
 {
     /* This inline function sets a bit in the bitmap if not already set, and if
      * appropriate, its fold, returning the number of bits that actually
@@ -9021,6 +9452,8 @@ S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 valu
 
     U8 stored;
 
+    PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
+
     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
 	return 0;
     }
@@ -9029,18 +9462,36 @@ S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 valu
     stored = 1;
 
     if (FOLD && ! LOC) {	/* Locale folds aren't known until runtime */
-	stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value, nonbitmap_ptr);
+	stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
     }
 
     return stored;
 }
 
+STATIC void
+S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
+{
+    /* Adds input 'string' with length 'len' to the ANYOF node's unicode
+     * alternate list, pointed to by 'alternate_ptr'.  This is an array of
+     * the multi-character folds of characters in the node */
+    SV *sv;
+
+    PERL_ARGS_ASSERT_ADD_ALTERNATE;
+
+    if (! *alternate_ptr) {
+	*alternate_ptr = newAV();
+    }
+    sv = newSVpvn_utf8((char*)string, len, TRUE);
+    av_push(*alternate_ptr, sv);
+    return;
+}
+
 /*
    parse a class specification and produce either an ANYOF node that
-   matches the pattern or if the pattern matches a single char only and
-   that char is < 256 and we are case insensitive then we produce an 
-   EXACT node instead.
-*/
+   matches the pattern or perhaps will be optimized into an EXACTish node
+   instead. The node contains a bit map for the first 256 characters, with the
+   corresponding bit set if that character is in the list.  For characters
+   above 255, a range list is used */
 
 STATIC regnode *
 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
@@ -9056,8 +9507,30 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
     char *rangebegin = NULL;
     bool need_class = 0;
     SV *listsv = NULL;
+    STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
+				      than just initialized.  */
     UV n;
+
+    /* code points this node matches that can't be stored in the bitmap */
     HV* nonbitmap = NULL;
+
+    /* The items that are to match that aren't stored in the bitmap, but are a
+     * result of things that are stored there.  This is the fold closure of
+     * such a character, either because it has DEPENDS semantics and shouldn't
+     * be matched unless the target string is utf8, or is a code point that is
+     * too large for the bit map, as for example, the fold of the MICRO SIGN is
+     * above 255.  This all is solely for performance reasons.  By having this
+     * code know the outside-the-bitmap folds that the bitmapped characters are
+     * involved with, we don't have to go out to disk to find the list of
+     * matches, unless the character class includes code points that aren't
+     * storable in the bit map.  That means that a character class with an 's'
+     * in it, for example, doesn't need to go out to disk to find everything
+     * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
+     * empty unless there is something whose fold we don't know about, and will
+     * have to go out to the disk to find. */
+    HV* l1_fold_invlist = NULL;
+
+    /* List of multi-character folds that are matched by this node */
     AV* unicode_alternate  = NULL;
 #ifdef EBCDIC
     UV literal_endpoint = 0;
@@ -9093,23 +9566,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
 
     if (SIZE_ONLY) {
 	RExC_size += ANYOF_SKIP;
-#ifdef ANYOF_ADD_LOC_SKIP
-	if (LOC) {
-	    RExC_size += ANYOF_ADD_LOC_SKIP;
-	}
-#endif
 	listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
     }
     else {
  	RExC_emit += ANYOF_SKIP;
 	if (LOC) {
 	    ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
-#ifdef ANYOF_ADD_LOC_SKIP
-	    RExC_emit += ANYOF_ADD_LOC_SKIP;
-#endif
 	}
 	ANYOF_BITMAP_ZERO(ret);
 	listsv = newSVpvs("# comment\n");
+	initial_listsv_len = SvCUR(listsv);
     }
 
     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
@@ -9175,7 +9641,7 @@ parseit:
                     from earlier versions, OTOH that behaviour was broken
                     as well. */
                     UV v; /* value is register so we cant & it /grrr */
-                    if (reg_namedseq(pRExC_state, &v, NULL)) {
+                    if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
                         goto parseit;
                     }
                     value= v; 
@@ -9204,14 +9670,7 @@ parseit:
 		    e = RExC_parse;
 		    n = 1;
 		}
-		if (SIZE_ONLY) {
-		    if (LOC) {
-			ckWARN2reg(RExC_parse,
-				"\\%c uses Unicode rules, not locale rules",
-				(int) value);
-		    }
-		}
-		else {
+		if (!SIZE_ONLY) {
 		    if (UCHARAT(RExC_parse) == '^') {
 			 RExC_parse++;
 			 n--;
@@ -9238,7 +9697,7 @@ parseit:
 
 		/* The \p could match something in the Latin1 range, hence
 		 * something that isn't utf8 */
-		ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
+		ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
 		namedclass = ANYOF_MAX;  /* no official name, but it's named */
 
 		/* \p means they want Unicode semantics */
@@ -9307,7 +9766,7 @@ parseit:
 		    break;
 		}
 	    recode_encoding:
-		{
+		if (! RExC_override_recoding) {
 		    SV* enc = PL_encoding;
 		    value = reg_recode((const char)(U8)value, &enc);
 		    if (!enc && SIZE_ONLY)
@@ -9341,21 +9800,18 @@ parseit:
 	    if (LOC && namedclass < ANYOF_MAX && ! need_class) {
 		need_class = 1;
 		if (SIZE_ONLY) {
-#ifdef ANYOF_CLASS_ADD_SKIP
-		    RExC_size += ANYOF_CLASS_ADD_SKIP;
-#endif
+		    RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
 		}
 		else {
-#ifdef ANYOF_CLASS_ADD_SKIP
-		    RExC_emit += ANYOF_CLASS_ADD_SKIP;
-#endif
+		    RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
 		    ANYOF_CLASS_ZERO(ret);
 		}
 		ANYOF_FLAGS(ret) |= ANYOF_CLASS;
 	    }
 
 	    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
-	     * literal */
+	     * literal, as is the character that began the false range, i.e.
+	     * the 'a' in the examples */
 	    if (range) {
 		if (!SIZE_ONLY) {
 		    const int w =
@@ -9365,16 +9821,14 @@ parseit:
 			       "False [] range \"%*.*s\"",
 			       w, w, rangebegin);
 
+		    stored +=
+                         set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
 		    if (prevvalue < 256) {
 			stored +=
-                         S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue, &nonbitmap);
-			stored +=
-                         S_set_regclass_bit(aTHX_ pRExC_state, ret, '-', &nonbitmap);
+                         set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
 		    }
 		    else {
-			ANYOF_FLAGS(ret) |= ANYOF_UTF8;
-			Perl_sv_catpvf(aTHX_ listsv,
-			   "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
+			nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
 		    }
 		}
 
@@ -9403,15 +9857,9 @@ parseit:
 		case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
 		case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
 		case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
-#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
                 /* \s, \w match all unicode if utf8. */
                 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
                 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
-#else
-                /* \s, \w match ascii and locale only */
-                case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "PerlSpace");
-                case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "PerlWord");
-#endif		
 		case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
 		case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
 		case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
@@ -9421,7 +9869,7 @@ parseit:
 		    else {
 			for (value = 0; value < 128; value++)
 			    stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap);
+                              set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
 		    }
 		    yesno = '+';
 		    what = NULL;	/* Doesn't match outside ascii, so
@@ -9433,7 +9881,7 @@ parseit:
 		    else {
 			for (value = 128; value < 256; value++)
 			    stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap);
+                              set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
 		    }
 		    ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
 		    yesno = '!';
@@ -9446,10 +9894,10 @@ parseit:
 			/* consecutive digits assumed */
 			for (value = '0'; value <= '9'; value++)
 			    stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);
+                              set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
 		    }
 		    yesno = '+';
-		    what = POSIX_CC_UNI_NAME("Digit");
+		    what = "Digit";
 		    break;
 		case ANYOF_NDIGIT:
 		    if (LOC)
@@ -9458,14 +9906,14 @@ parseit:
 			/* consecutive digits assumed */
 			for (value = 0; value < '0'; value++)
 			    stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);
+                              set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
 			for (value = '9' + 1; value < 256; value++)
 			    stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);
+                              set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
 		    }
 		    yesno = '!';
-		    what = POSIX_CC_UNI_NAME("Digit");
-		    if (ASCII_RESTRICTED ) {
+		    what = "Digit";
+		    if (AT_LEAST_ASCII_RESTRICTED ) {
 			ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
 		    }
 		    break;		
@@ -9476,10 +9924,9 @@ parseit:
 		    vFAIL("Invalid [::] class");
 		    break;
 		}
-		if (what && ! (ASCII_RESTRICTED)) {
+		if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
 		    /* Strings such as "+utf8::isWord\n" */
 		    Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
-		    ANYOF_FLAGS(ret) |= ANYOF_UTF8;
 		}
 
 		continue;
@@ -9495,8 +9942,10 @@ parseit:
 	}
 	else {
 	    prevvalue = value; /* save the beginning of the range */
-	    if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
-		RExC_parse[1] != ']') {
+	    if (RExC_parse+1 < RExC_end
+		&& *RExC_parse == '-'
+		&& RExC_parse[1] != ']')
+	    {
 		RExC_parse++;
 
 		/* a bad range like \w-, [:word:]- ? */
@@ -9511,13 +9960,15 @@ parseit:
 		    }
 		    if (!SIZE_ONLY)
 			stored +=
-                            S_set_regclass_bit(aTHX_ pRExC_state, ret, '-', &nonbitmap);
+                            set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
 		} else
 		    range = 1;	/* yeah, it's a range! */
 		continue;	/* but do it the next time */
 	    }
 	}
 
+	/* A non-Latin1 code point implies unicode semantics.  This must be set
+	 * in pass 1 so that it is in effect for the whole of pass 2 */
 	if (value > 255) {
 	    RExC_uni_semantics = 1;
 	}
@@ -9538,129 +9989,27 @@ parseit:
 			for (i = prevvalue; i <= ceilvalue; i++)
 			    if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
 				stored +=
-                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap);
+                                  set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
 			    }
 		    } else {
 			for (i = prevvalue; i <= ceilvalue; i++)
 			    if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
 				stored +=
-                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap);
+                                  set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
 			    }
 		    }
 		}
 		else
 #endif
 		      for (i = prevvalue; i <= ceilvalue; i++) {
-			stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap);
+			stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
 	              }
 	  }
 	  if (value > 255) {
 	    const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
 	    const UV natvalue      = NATIVE_TO_UNI(value);
-	    if (! nonbitmap) {
-		nonbitmap = _new_invlist(2);
-	    }
 	    nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
-	    ANYOF_FLAGS(ret) |= ANYOF_UTF8;
 	}
-#if 0
-
-		/* If the code point requires utf8 to represent, and we are not
-		 * folding, it can't match unless the target is in utf8.  Only
-		 * a few code points above 255 fold to below it, so XXX an
-		 * optimization would be to know which ones and set the flag
-		 * appropriately. */
-		ANYOF_FLAGS(ret) |= (FOLD || value < 256)
-				    ? ANYOF_NONBITMAP
-				    : ANYOF_UTF8;
-		if (prevnatvalue < natvalue) { /* '>' case is fatal error above */
-
-		    /* The \t sets the whole range */
-		    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
-				   prevnatvalue, natvalue);
-
-		    /* Currently, we don't look at every value in the range.
-		     * Therefore we have to assume the worst case: that if
-		     * folding, it will match more than one character.  But in
-		     * lookbehind patterns, can only be single character
-		     * length, so disallow those folds */
-		    if (FOLD && ! RExC_in_lookbehind) {
-		      OP(ret) = ANYOFV;
-		    }
-		}
-		else if (prevnatvalue == natvalue) {
-		    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
-		    if (FOLD) {
-			 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
-			 STRLEN foldlen;
-			 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
-
-#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
-			 if (RExC_precomp[0] == ':' &&
-			     RExC_precomp[1] == '[' &&
-			     (f == 0xDF || f == 0x92)) {
-			     f = NATIVE_TO_UNI(f);
-                        }
-#endif
-			 /* If folding and foldable and a single
-			  * character, insert also the folded version
-			  * to the charclass. */
-			 if (f != value) {
-#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
-			     if ((RExC_precomp[0] == ':' &&
-				  RExC_precomp[1] == '[' &&
-				  (f == 0xA2 &&
-				   (value == 0xFB05 || value == 0xFB06))) ?
-				 foldlen == ((STRLEN)UNISKIP(f) - 1) :
-				 foldlen == (STRLEN)UNISKIP(f) )
-#else
-			      if (foldlen == (STRLEN)UNISKIP(f))
-#endif
-				  Perl_sv_catpvf(aTHX_ listsv,
-						 "%04"UVxf"\n", f);
-			      else if (! RExC_in_lookbehind) {
-				  /* Any multicharacter foldings
-				   * (disallowed in lookbehind patterns)
-				   * require the following transform:
-				   * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
-				   * where E folds into "pq" and F folds
-				   * into "rst", all other characters
-				   * fold to single characters.  We save
-				   * away these multicharacter foldings,
-				   * to be later saved as part of the
-				   * additional "s" data. */
-				  SV *sv;
-
-				  if (!unicode_alternate)
-				      unicode_alternate = newAV();
-				  sv = newSVpvn_utf8((char*)foldbuf, foldlen,
-						     TRUE);
-				  av_push(unicode_alternate, sv);
-				  OP(ret) = ANYOFV;
-			      }
-			 }
-
-			 /* If folding and the value is one of the Greek
-			  * sigmas insert a few more sigmas to make the
-			  * folding rules of the sigmas to work right.
-			  * Note that not all the possible combinations
-			  * are handled here: some of them are handled
-			  * by the standard folding rules, and some of
-			  * them (literal or EXACTF cases) are handled
-			  * during runtime in regexec.c:S_find_byclass(). */
-			 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
-			      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
-					     (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
-			      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
-					     (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
-			 }
-			 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
-			      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
-					     (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
-		    }
-		}
-	    }
-#endif
 #ifdef EBCDIC
 	    literal_endpoint = 0;
 #endif
@@ -9675,227 +10024,229 @@ parseit:
         return ret;
     /****** !SIZE_ONLY AFTER HERE *********/
 
-    /* Finish up the non-bitmap entries */
-    if (nonbitmap) {
-	UV* nonbitmap_array;
+    /* If folding and there are code points above 255, we calculate all
+     * characters that could fold to or from the ones already on the list */
+    if (FOLD && nonbitmap) {
 	UV i;
 
-	/* If folding, we add to the list all characters that could fold to or
-	 * from the ones already on the list */
-	if (FOLD) {
-	    HV* fold_intersection;
-	    UV* fold_list;
-
-	    /* This is a list of all the characters that participate in folds
-	     * (except marks, etc in multi-char folds */
-	    if (! PL_utf8_foldable) {
-		SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
-		PL_utf8_foldable = _swash_to_invlist(swash);
-	    }
+	HV* fold_intersection;
+	UV* fold_list;
 
-	    /* This is a hash that for a particular fold gives all characters
-	     * that are involved in it */
-	    if (! PL_utf8_foldclosures) {
-
-		/* If we were unable to find any folds, then we likely won't be
-		 * able to find the closures.  So just create an empty list.
-		 * Folding will effectively be restricted to the non-Unicode
-		 * rules hard-coded into Perl.  (This case happens legitimately
-		 * during compilation of Perl itself before the Unicode tables
-		 * are generated) */
-		if (invlist_len(PL_utf8_foldable) == 0) {
-		    PL_utf8_foldclosures = _new_invlist(0);
-		} else {
-		    /* If the folds haven't been read in, call a fold function
-		     * to force that */
-		    if (! PL_utf8_tofold) {
-			U8 dummy[UTF8_MAXBYTES+1];
-			STRLEN dummy_len;
-			to_utf8_fold((U8*) "A", dummy, &dummy_len);
-		    }
-		    PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
+	/* This is a list of all the characters that participate in folds
+	 * (except marks, etc. in multi-char folds) */
+	if (! PL_utf8_foldable) {
+	    SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
+	    PL_utf8_foldable = _swash_to_invlist(swash);
+	}
+
+	/* This is a hash that for a particular fold gives all characters
+	    * that are involved in it */
+	if (! PL_utf8_foldclosures) {
+
+	    /* If we were unable to find any folds, then we likely won't be
+	     * able to find the closures.  So just create an empty list.
+	     * Folding will effectively be restricted to the non-Unicode rules
+	     * hard-coded into Perl.  (This case happens legitimately during
+	     * compilation of Perl itself before the Unicode tables are
+	     * generated) */
+	    if (invlist_len(PL_utf8_foldable) == 0) {
+		PL_utf8_foldclosures = _new_invlist(0);
+	    } else {
+		/* If the folds haven't been read in, call a fold function
+		    * to force that */
+		if (! PL_utf8_tofold) {
+		    U8 dummy[UTF8_MAXBYTES+1];
+		    STRLEN dummy_len;
+		    to_utf8_fold((U8*) "A", dummy, &dummy_len);
 		}
+		PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
 	    }
+	}
 
-	    /* Only the characters in this class that participate in folds need
-	     * be checked.  Get the intersection of this class and all the
-	     * possible characters that are foldable.  This can quickly narrow
-	     * down a large class */
-	    fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
-
-	    /* Now look at the foldable characters in this class individually */
-	    fold_list = invlist_array(fold_intersection);
-	    for (i = 0; i < invlist_len(fold_intersection); i++) {
-		UV j;
-
-		/* The next entry is the beginning of the range that is in the
-		 * class */
-		UV start = fold_list[i++];
-
-
-		/* The next entry is the beginning of the next range, which
-		 * isn't in the class, so the end of the current range is one
-		 * less than that */
-		UV end = fold_list[i] - 1;
-
-		/* Look at every character in the range */
-		for (j = start; j <= end; j++) {
-
-		    /* Get its fold */
-		    U8 foldbuf[UTF8_MAXBYTES_CASE+1];
-		    STRLEN foldlen;
-		    const UV f = to_uni_fold(j, foldbuf, &foldlen);
-
-		    if (foldlen > (STRLEN)UNISKIP(f)) {
-
-			/* Any multicharacter foldings (disallowed in
-			 * lookbehind patterns) require the following
-			 * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
-			 * E folds into "pq" and F folds into "rst", all other
-			 * characters fold to single characters.  We save away
-			 * these multicharacter foldings, to be later saved as
-			 * part of the additional "s" data. */
-			if (! RExC_in_lookbehind) {
-			    /* XXX Discard this fold if any are latin1 and LOC */
-			    SV *sv;
-
-			    if (!unicode_alternate) {
-				unicode_alternate = newAV();
-			    }
-			    sv = newSVpvn_utf8((char*)foldbuf, foldlen, TRUE);
-			    av_push(unicode_alternate, sv);
+	/* Only the characters in this class that participate in folds need
+	    * be checked.  Get the intersection of this class and all the
+	    * possible characters that are foldable.  This can quickly narrow
+	    * down a large class */
+	fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
 
-			    /* This node is variable length */
-			    OP(ret) = ANYOFV;
-			    ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
-			}
-		    }
-		    else { /* Single character fold */
-			SV** listp;
-
-			/* Consider "k" =~ /[K]/i.  The line above would have
-			 * just folded the 'k' to itself, and that isn't going
-			 * to match 'K'.  So we look through the closure of
-			 * everything that folds to 'k'.  That will find the
-			 * 'K'.  Initialize the list, if necessary */
-
-			/* The data structure is a hash with the keys every
-			 * character that is folded to, like 'k', and the
-			 * values each an array of everything that folds to its
-			 * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
-			if ((listp = hv_fetch(PL_utf8_foldclosures,
-				      (char *) foldbuf, foldlen, FALSE)))
-			{
-			    AV* list = (AV*) *listp;
-			    IV k;
-			    for (k = 0; k <= av_len(list); k++) {
-				SV** c_p = av_fetch(list, k, FALSE);
-				UV c;
-				if (c_p == NULL) {
-				    Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
-				}
-				c = SvUV(*c_p);
+	/* Now look at the foldable characters in this class individually */
+	fold_list = invlist_array(fold_intersection);
+	for (i = 0; i < invlist_len(fold_intersection); i++) {
+	    UV j;
 
-				if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
-				    stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) c, &nonbitmap);
+	    /* The next entry is the beginning of the range that is in the
+	     * class */
+	    UV start = fold_list[i++];
+
+
+	    /* The next entry is the beginning of the next range, which
+		* isn't in the class, so the end of the current range is one
+		* less than that */
+	    UV end = fold_list[i] - 1;
+
+	    /* Look at every character in the range */
+	    for (j = start; j <= end; j++) {
+
+		/* Get its fold */
+		U8 foldbuf[UTF8_MAXBYTES_CASE+1];
+		STRLEN foldlen;
+		const UV f = to_uni_fold(j, foldbuf, &foldlen);
+
+		if (foldlen > (STRLEN)UNISKIP(f)) {
+
+		    /* Any multicharacter foldings (disallowed in
+			* lookbehind patterns) require the following
+			* transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
+			* E folds into "pq" and F folds into "rst", all other
+			* characters fold to single characters.  We save away
+			* these multicharacter foldings, to be later saved as
+			* part of the additional "s" data. */
+		    if (! RExC_in_lookbehind) {
+			U8* loc = foldbuf;
+			U8* e = foldbuf + foldlen;
+
+			/* If any of the folded characters of this are in
+			    * the Latin1 range, tell the regex engine that
+			    * this can match a non-utf8 target string.  The
+			    * only multi-byte fold whose source is in the
+			    * Latin1 range (U+00DF) applies only when the
+			    * target string is utf8, or under unicode rules */
+			if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
+			    while (loc < e) {
+
+				/* Can't mix ascii with non- under /aa */
+				if (MORE_ASCII_RESTRICTED
+				    && (isASCII(*loc) != isASCII(j)))
+				{
+				    goto end_multi_fold;
 				}
-				    /* It may be that the code point is already
-				     * in this range or already in the bitmap,
-				     * XXX THink about LOC
-				     * in which case we need do nothing */
-				else if ((c < start || c > end)
-					 && (c > 255
-					     || ! ANYOF_BITMAP_TEST(ret, c)))
+				if (UTF8_IS_INVARIANT(*loc)
+				    || UTF8_IS_DOWNGRADEABLE_START(*loc))
 				{
-				    nonbitmap = add_range_to_invlist(nonbitmap, c, c);
+				    /* Can't mix above and below 256 under
+					* LOC */
+				    if (LOC) {
+					goto end_multi_fold;
+				    }
+				    ANYOF_FLAGS(ret)
+					    |= ANYOF_NONBITMAP_NON_UTF8;
+				    break;
 				}
+				loc += UTF8SKIP(loc);
 			    }
 			}
-		    }
-		}
-	    }
-	    invlist_destroy(fold_intersection);
-	} /* End of processing all the folds */
-
-	/*  Here have the full list of items to match that aren't in the
-	 *  bitmap.  Convert to the structure that the rest of the code is
-	 *  expecting.   XXX That rest of the code should convert to this
-	 *  structure */
-	nonbitmap_array = invlist_array(nonbitmap);
-	for (i = 0; i < invlist_len(nonbitmap); i++) {
 
-	    /* The next entry is the beginning of the range that is in the
-	     * class */
-	    UV start = nonbitmap_array[i++];
+			add_alternate(&unicode_alternate, foldbuf, foldlen);
+		    end_multi_fold: ;
+		    }
 
-	    /* The next entry is the beginning of the next range, which isn't
-	     * in the class, so the end of the current range is one less than
-	     * that */
-	    UV end = nonbitmap_array[i] - 1;
+		    /* This is special-cased, as it is the only letter which
+		     * has both a multi-fold and single-fold in Latin1.  All
+		     * the other chars that have single and multi-folds are
+		     * always in utf8, and the utf8 folding algorithm catches
+		     * them */
+		    if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
+			stored += set_regclass_bit(pRExC_state,
+					ret,
+					LATIN_SMALL_LETTER_SHARP_S,
+					&l1_fold_invlist, &unicode_alternate);
+		    }
+		}
+		else {
+		    /* Single character fold.  Add everything in its fold
+			* closure to the list that this node should match */
+		    SV** listp;
+
+		    /* The fold closures data structure is a hash with the
+			* keys being every character that is folded to, like
+			* 'k', and the values each an array of everything that
+			* folds to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
+		    if ((listp = hv_fetch(PL_utf8_foldclosures,
+				    (char *) foldbuf, foldlen, FALSE)))
+		    {
+			AV* list = (AV*) *listp;
+			IV k;
+			for (k = 0; k <= av_len(list); k++) {
+			    SV** c_p = av_fetch(list, k, FALSE);
+			    UV c;
+			    if (c_p == NULL) {
+				Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+			    }
+			    c = SvUV(*c_p);
+
+			    /* /aa doesn't allow folds between ASCII and
+				* non-; /l doesn't allow them between above
+				* and below 256 */
+			    if ((MORE_ASCII_RESTRICTED
+				 && (isASCII(c) != isASCII(j)))
+				    || (LOC && ((c < 256) != (j < 256))))
+			    {
+				continue;
+			    }
 
-	    if (start == end) {
-		Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
-	    }
-	    else {
-		/* The \t sets the whole range */
-		Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
-			/* XXX EBCDIC */
-				   start, end);
+			    if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
+				stored += set_regclass_bit(pRExC_state,
+					ret,
+					(U8) c,
+					&l1_fold_invlist, &unicode_alternate);
+			    }
+				/* It may be that the code point is already
+				    * in this range or already in the bitmap,
+				    * in which case we need do nothing */
+			    else if ((c < start || c > end)
+					&& (c > 255
+					    || ! ANYOF_BITMAP_TEST(ret, c)))
+			    {
+				nonbitmap = add_cp_to_invlist(nonbitmap, c);
+			    }
+			}
+		    }
+		}
 	    }
 	}
-	invlist_destroy(nonbitmap);
+	invlist_destroy(fold_intersection);
+    }
+
+    /* Combine the two lists into one. */
+    if (l1_fold_invlist) {
+	if (nonbitmap) {
+	    nonbitmap = invlist_union(nonbitmap, l1_fold_invlist);
+	}
+	else {
+	    nonbitmap = l1_fold_invlist;
+	}
     }
 
+    /* Here, we have calculated what code points should be in the character
+     * class.   Now we can see about various optimizations.  Fold calculation
+     * needs to take place before inversion.  Otherwise /[^k]/i would invert to
+     * include K, which under /i would match k. */
+
     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
      * set the FOLD flag yet, so this this does optimize those.  It doesn't
      * optimize locale.  Doing so perhaps could be done as long as there is
      * nothing like \w in it; some thought also would have to be given to the
      * interaction with above 0x100 chars */
-    if (! LOC && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
+    if (! LOC
+	&& (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT
+	&& ! unicode_alternate
+	&& ! nonbitmap
+	&& SvCUR(listsv) == initial_listsv_len)
+    {
 	for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
 	    ANYOF_BITMAP(ret)[value] ^= 0xFF;
 	stored = 256 - stored;
 
 	/* The inversion means that everything above 255 is matched; and at the
 	 * same time we clear the invert flag */
-	ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
+	ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
     }
 
-    if (FOLD) {
-	SV *sv;
-
-	/* This is the one character in the bitmap that needs special handling
-	 * under non-locale folding, as it folds to two characters 'ss'.  This
-	 * happens if it is set and not inverting, or isn't set and are
-	 * inverting (disallowed in lookbehind patterns because they can't be
-	 * variable length) */
-	if (! LOC
-	    && ! RExC_in_lookbehind
-	    && (cBOOL(ANYOF_BITMAP_TEST(ret, LATIN_SMALL_LETTER_SHARP_S))
-		^ cBOOL(ANYOF_FLAGS(ret) & ANYOF_INVERT)))
-	{
-	    OP(ret) = ANYOFV;	/* Can match more than a single char */
-
-	    /* Under Unicode semantics), it can do this when the target string
-	     * isn't in utf8 */
-	    if (UNI_SEMANTICS) {
-		ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
-	    }
-
-	    if (!unicode_alternate) {
-		unicode_alternate = newAV();
-	    }
-	    sv = newSVpvn_utf8("ss", 2, TRUE);
-	    av_push(unicode_alternate, sv);
-	}
-
-	/* Folding in the bitmap is taken care of above, but not for locale
-	 * (for which we have to wait to see what folding is in effect at
-	 * runtime), and for things not in the bitmap.  Set run-time fold flag
-	 * for these */
-	if ((LOC || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP))) {
-	    ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
-	}
+    /* Folding in the bitmap is taken care of above, but not for locale (for
+     * which we have to wait to see what folding is in effect at runtime), and
+     * for things not in the bitmap.  Set run-time fold flag for these */
+    if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
+	ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
     }
 
     /* A single character class can be "optimized" into an EXACTish node.
@@ -9912,7 +10263,10 @@ parseit:
      * characters which only have the two folds; so things like 'fF' and 'Ii'
      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
      * FI'. */
-    if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
+    if (! nonbitmap
+	&& ! unicode_alternate
+	&& SvCUR(listsv) == initial_listsv_len
+	&& ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
 	    || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
@@ -9979,7 +10333,54 @@ parseit:
         return ret;
     }
 
-    {
+    if (nonbitmap) {
+	UV* nonbitmap_array = invlist_array(nonbitmap);
+	UV nonbitmap_len = invlist_len(nonbitmap);
+	UV i;
+
+	/*  Here we have the full list of items to match that aren't in the
+	 *  bitmap.  Convert to the structure that the rest of the code is
+	 *  expecting.   XXX The rest of the code should convert to this
+	 *  structure */
+	for (i = 0; i < nonbitmap_len; i++) {
+
+	    /* The next entry is the beginning of the range that is in the
+	     * class */
+	    UV start = nonbitmap_array[i++];
+	    UV end;
+
+	    /* The next entry is the beginning of the next range, which isn't
+	     * in the class, so the end of the current range is one less than
+	     * that.  But if there is no next range, it means that the range
+	     * begun by 'start' extends to infinity, which for this platform
+	     * ends at UV_MAX */
+	    if (i == nonbitmap_len) {
+		end = UV_MAX;
+	    }
+	    else {
+		end = nonbitmap_array[i] - 1;
+	    }
+
+	    if (start == end) {
+		Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
+	    }
+	    else {
+		/* The \t sets the whole range */
+		Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
+			/* XXX EBCDIC */
+				   start, end);
+	    }
+	}
+	invlist_destroy(nonbitmap);
+    }
+
+    if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
+	ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
+	SvREFCNT_dec(listsv);
+	SvREFCNT_dec(unicode_alternate);
+    }
+    else {
+
 	AV * const av = newAV();
 	SV *rv;
 	/* The 0th element stores the character class description
@@ -9991,6 +10392,9 @@ parseit:
 	av_store(av, 0, listsv);
 	av_store(av, 1, NULL);
 	av_store(av, 2, MUTABLE_SV(unicode_alternate));
+	if (unicode_alternate) { /* This node is variable length */
+	    OP(ret) = ANYOFV;
+	}
 	rv = newRV_noinc(MUTABLE_SV(av));
 	n = add_data(pRExC_state, 1, "s");
 	RExC_rxi->data->data[n] = (void*)rv;
@@ -10372,6 +10776,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
             switch (OP(scan)) {
                 case EXACT:
                 case EXACTF:
+                case EXACTFA:
                 case EXACTFU:
                 case EXACTFL:
                         if( exact == PSEUDO )
@@ -10453,6 +10858,9 @@ S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
                 case REGEX_ASCII_RESTRICTED_CHARSET:
                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
                     break;
+                case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+                    PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
+                    break;
                 default:
                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
                     break;
@@ -10780,12 +11188,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
         /* output information about the unicode matching */
 	if (flags & ANYOF_UNICODE_ALL)
 	    sv_catpvs(sv, "{unicode_all}");
-	else if (flags & ANYOF_UTF8)
+	else if (ANYOF_NONBITMAP(o))
 	    sv_catpvs(sv, "{unicode}");
 	if (flags & ANYOF_NONBITMAP_NON_UTF8)
 	    sv_catpvs(sv, "{outside bitmap}");
 
-	{
+	if (ANYOF_NONBITMAP(o)) {
 	    SV *lv;
 	    SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);