X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/1e37780e62d722a9041ac13ce793f8f2173b8c12..91abb413c86e04a93ab807cac8a8d3ff68cbb345:/regcomp.c?ds=sidebyside

diff --git a/regcomp.c b/regcomp.c
index 5dff17c..bba5a2b 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -199,6 +199,7 @@ struct RExC_state_t {
     scan_frame *frame_head;
     scan_frame *frame_last;
     U32         frame_count;
+    AV         *warn_text;
 #ifdef ADD_TO_REGEXEC
     char 	*starttry;		/* -Dr: where regtry was called. */
 #define RExC_starttry	(pRExC_state->starttry)
@@ -222,6 +223,7 @@ struct RExC_state_t {
 #endif
     bool        seen_unfolded_sharp_s;
     bool        strict;
+    bool        study_started;
 };
 
 #define RExC_flags	(pRExC_state->flags)
@@ -288,6 +290,8 @@ struct RExC_state_t {
 #define RExC_frame_last (pRExC_state->frame_last)
 #define RExC_frame_count (pRExC_state->frame_count)
 #define RExC_strict (pRExC_state->strict)
+#define RExC_study_started      (pRExC_state->study_started)
+#define RExC_warn_text (pRExC_state->warn_text)
 
 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
  * a flag to disable back-off on the fixed/floating substrings - if it's
@@ -502,7 +506,20 @@ static const scan_data_t zero_scan_data =
 #define SF_HAS_PAR		0x0080
 #define SF_IN_PAR		0x0100
 #define SF_HAS_EVAL		0x0200
+
+
+/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
+ * longest substring in the pattern. When it is not set the optimiser keeps
+ * track of position, but does not keep track of the actual strings seen,
+ *
+ * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
+ * /foo/i will not.
+ *
+ * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
+ * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
+ * turned off because of the alternation (BRANCH). */
 #define SCF_DO_SUBSTR		0x0400
+
 #define SCF_DO_STCLASS_AND	0x0800
 #define SCF_DO_STCLASS_OR	0x1000
 #define SCF_DO_STCLASS		(SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
@@ -920,7 +937,7 @@ Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
     PerlIO *f= Perl_debug_log;
     PERL_ARGS_ASSERT_RE_INDENTF;
     va_start(ap, depth);
-    PerlIO_printf(f, "%*s", ( depth % 20 ) * 2, "");
+    PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
     result = PerlIO_vprintf(f, fmt, ap);
     va_end(ap);
     return result;
@@ -1266,8 +1283,8 @@ S_ssc_anything(pTHX_ regnode_ssc *ssc)
 
     assert(is_ANYOF_SYNTHETIC(ssc));
 
-    ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
-    _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
+    /* mortalize so won't leak */
+    ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
 }
 
@@ -2153,7 +2170,8 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
         }
     }
 
-    Perl_re_printf( aTHX_  "\n%*sState+-",depth+1 );
+    Perl_re_printf( aTHX_ "\n");
+    Perl_re_indentf( aTHX_  "State+-", depth+1 );
 
     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
         Perl_re_printf( aTHX_  "%.*s", colwidth,"--------");
@@ -4088,6 +4106,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
     GET_RE_DEBUG_FLAGS_DECL;
 
     PERL_ARGS_ASSERT_STUDY_CHUNK;
+    RExC_study_started= 1;
 
 
     if ( depth == 0 ) {
@@ -4144,9 +4163,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
         DEBUG_PEEP("Peep", scan, depth);
 
 
-        /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
-         * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
-         * by a different invocation of reg() -- Yves
+        /* The reason we do this here is that we need to deal with things like
+         * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
+         * parsing code, as each (?:..) is handled by a different invocation of
+         * reg() -- Yves
          */
         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
 
@@ -5884,15 +5904,10 @@ Perl_re_printf( aTHX_  "LHS=%"UVuf" RHS=%"UVuf"\n",
 	/* Else: zero-length, ignore. */
 	scan = regnext(scan);
     }
-    /* If we are exiting a recursion we can unset its recursed bit
-     * and allow ourselves to enter it again - no danger of an
-     * infinite loop there.
-    if (stopparen > -1 && recursed) {
-	DEBUG_STUDYDATA("unset:", data,depth);
-        PAREN_UNSET( recursed, stopparen);
-    }
-    */
+
+  finish:
     if (frame) {
+        /* we need to unwind recursion. */
         depth = depth - 1;
 
         DEBUG_STUDYDATA("frame-end:",data,depth);
@@ -5909,7 +5924,6 @@ Perl_re_printf( aTHX_  "LHS=%"UVuf" RHS=%"UVuf"\n",
         goto fake_study_recurse;
     }
 
-  finish:
     assert(!frame);
     DEBUG_STUDYDATA("pre-fin:",data,depth);
 
@@ -6752,6 +6766,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
 #endif
     }
 
+    pRExC_state->warn_text = NULL;
     pRExC_state->code_blocks = NULL;
     pRExC_state->num_code_blocks = 0;
 
@@ -6868,6 +6883,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     RExC_contains_locale = 0;
     RExC_contains_i = 0;
     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
+    RExC_study_started = 0;
     pRExC_state->runtime_code_qr = NULL;
     RExC_frame_head= NULL;
     RExC_frame_last= NULL;
@@ -8308,33 +8324,47 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
  * as an SVt_INVLIST scalar.
  *
  * An inversion list for Unicode is an array of code points, sorted by ordinal
- * number.  The zeroth element is the first code point in the list.  The 1th
- * element is the first element beyond that not in the list.  In other words,
- * the first range is
- *  invlist[0]..(invlist[1]-1)
- * The other ranges follow.  Thus every element whose index is divisible by two
- * marks the beginning of a range that is in the list, and every element not
- * divisible by two marks the beginning of a range not in the list.  A single
- * element inversion list that contains the single code point N generally
- * consists of two elements
- *  invlist[0] == N
- *  invlist[1] == N+1
- * (The exception is when N is the highest representable value on the
- * machine, in which case the list containing just it would be a single
- * element, itself.  By extension, if the last range in the list extends to
- * infinity, then the first element of that range will be in the inversion list
- * at a position that is divisible by two, and is the final element in the
- * list.)
+ * number.  Each element gives the code point that begins a range that extends
+ * up-to but not including the code point given by the next element.  The final
+ * element gives the first code point of a range that extends to the platform's
+ * infinity.  The even-numbered elements (invlist[0], invlist[2], invlist[4],
+ * ...) give ranges whose code points are all in the inversion list.  We say
+ * that those ranges are in the set.  The odd-numbered elements give ranges
+ * whose code points are not in the inversion list, and hence not in the set.
+ * Thus, element [0] is the first code point in the list.  Element [1]
+ * is the first code point beyond that not in the list; and element [2] is the
+ * first code point beyond that that is in the list.  In other words, the first
+ * range is invlist[0]..(invlist[1]-1), and all code points in that range are
+ * in the inversion list.  The second range is invlist[1]..(invlist[2]-1), and
+ * all code points in that range are not in the inversion list.  The third
+ * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
+ * list, and so forth.  Thus every element whose index is divisible by two
+ * gives the beginning of a range that is in the list, and every element whose
+ * index is not divisible by two gives the beginning of a range not in the
+ * list.  If the final element's index is divisible by two, the inversion list
+ * extends to the platform's infinity; otherwise the highest code point in the
+ * inversion list is the contents of that element minus 1.
+ *
+ * A range that contains just a single code point N will look like
+ *  invlist[i]   == N
+ *  invlist[i+1] == N+1
+ *
+ * If N is UV_MAX (the highest representable code point on the machine), N+1 is
+ * impossible to represent, so element [i+1] is omitted.  The single element
+ * inversion list
+ *  invlist[0] == UV_MAX
+ * contains just UV_MAX, but is interpreted as matching to infinity.
+ *
  * Taking the complement (inverting) an inversion list is quite simple, if the
  * first element is 0, remove it; otherwise add a 0 element at the beginning.
  * This implementation reserves an element at the beginning of each inversion
  * list to always contain 0; there is an additional flag in the header which
  * indicates if the list begins at the 0, or is offset to begin at the next
- * element.
+ * element.  This means that the inversion list can be inverted without any
+ * copying; just flip the flag.
  *
  * More about inversion lists can be found in "Unicode Demystified"
  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
- * More will be coming when functionality is added later.
  *
  * The inversion list data structure is currently implemented as an SV pointing
  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
@@ -8346,6 +8376,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
 
 /* The header definitions are in F<invlist_inline.h> */
 
+#ifndef PERL_IN_XSUB_RE
+
 PERL_STATIC_INLINE UV*
 S__invlist_array_init(SV* const invlist, const bool will_have_0)
 {
@@ -8372,6 +8404,8 @@ S__invlist_array_init(SV* const invlist, const bool will_have_0)
     return zero_addr + *offset;
 }
 
+#endif
+
 PERL_STATIC_INLINE void
 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
 {
@@ -8508,6 +8542,8 @@ S_invlist_is_iterating(SV* const invlist)
     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
 }
 
+#ifndef PERL_IN_XSUB_RE
+
 PERL_STATIC_INLINE UV
 S_invlist_max(SV* const invlist)
 {
@@ -8524,8 +8560,6 @@ S_invlist_max(SV* const invlist)
            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
 }
-
-#ifndef PERL_IN_XSUB_RE
 SV*
 Perl__new_invlist(pTHX_ IV initial_size)
 {
@@ -8611,7 +8645,6 @@ Perl__new_invlist_C_array(pTHX_ const UV* const list)
 
     return invlist;
 }
-#endif /* ifndef PERL_IN_XSUB_RE */
 
 STATIC void
 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
@@ -8655,7 +8688,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist,
 
 	UV final_element = len - 1;
 	array = invlist_array(invlist);
-	if (array[final_element] > start
+	if (   array[final_element] > start
 	    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
 	{
 	    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
@@ -8663,10 +8696,10 @@ S__append_range_to_invlist(pTHX_ SV* const invlist,
 		     ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
 	}
 
-	/* Here, it is a legal append.  If the new range begins with the first
-	 * value not in the set, it is extending the set, so the new first
-	 * value not in the set is one greater than the newly extended range.
-	 * */
+        /* Here, it is a legal append.  If the new range begins 1 above the end
+         * of the range below it, it is extending the range below it, so the
+         * new first value not in the set is one greater than the newly
+         * extended range.  */
         offset = *get_invlist_offset_addr(invlist);
 	if (array[final_element] == start) {
 	    if (end != UV_MAX) {
@@ -8674,7 +8707,8 @@ S__append_range_to_invlist(pTHX_ SV* const invlist,
 	    }
 	    else {
 		/* But if the end is the maximum representable on the machine,
-		 * just let the range that this would extend to have no end */
+                 * assume that infinity was actually what was meant.  Just let
+                 * the range that this would extend to have no end */
 		invlist_set_len(invlist, len - 1, offset);
 	    }
 	    return;
@@ -8712,9 +8746,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist,
     }
 }
 
-#ifndef PERL_IN_XSUB_RE
-
-IV
+SSize_t
 Perl__invlist_search(SV* const invlist, const UV cp)
 {
     /* Searches the inversion list for the entry that contains the input code
@@ -8918,13 +8950,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
      * length there.  The preface says to incorporate its examples into your
      * code at your own risk.
      *
-     * The algorithm is like a merge sort.
-     *
-     * XXX A potential performance improvement is to keep track as we go along
-     * if only one of the inputs contributes to the result, meaning the other
-     * is a subset of that one.  In that case, we can skip the final copy and
-     * return the larger of the input lists, but then outside code might need
-     * to keep track of whether to free the input list or not */
+     * The algorithm is like a merge sort. */
 
     const UV* array_a;    /* a's array */
     const UV* array_b;
@@ -8941,10 +8967,12 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
 
     /* running count, as explained in the algorithm source book; items are
      * stopped accumulating and are output when the count changes to/from 0.
-     * The count is incremented when we start a range that's in the set, and
-     * decremented when we start a range that's not in the set.  So its range
-     * is 0 to 2.  Only when the count is zero is something not in the set.
-     */
+     * The count is incremented when we start a range that's in an input's set,
+     * and decremented when we start a range that's not in a set.  So this
+     * variable can be 0, 1, or 2.  When it is 0 neither input is in their set,
+     * and hence nothing goes into the union; 1, just one of the inputs is in
+     * its set (and its current range gets added to the union); and 2 when both
+     * inputs are in their sets.  */
     UV count = 0;
 
     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
@@ -8958,8 +8986,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
          * It's easiest to create a new inversion list that matches everything.
          * */
         if (complement_b) {
-            SV* everything = _new_invlist(1);
-            _append_range_to_invlist(everything, 0, UV_MAX);
+            SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
 
             /* If the output didn't exist, just point it at the new list */
             if (*output == NULL) {
@@ -9007,7 +9034,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
         invlist_replace_list_destroys_src(*output, u);
         SvREFCNT_dec_NN(u);
 
-	return;
+        return;
     }
 
     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
@@ -9078,10 +9105,10 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
     u = _new_invlist(len_a + len_b);
 
     /* Will contain U+0000 if either component does */
-    array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
-				      || (len_b > 0 && array_b[0] == 0));
+    array_u = _invlist_array_init(u, (    len_a > 0 && array_a[0] == 0)
+                                      || (len_b > 0 && array_b[0] == 0));
 
-    /* Go through each list item by item, stopping when exhausted one of
+    /* Go through each input list item by item, stopping when exhausted one of
      * them */
     while (i_a < len_a && i_b < len_b) {
 	UV cp;	    /* The element to potentially add to the union's array */
@@ -9089,21 +9116,21 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
 
 	/* We need to take one or the other of the two inputs for the union.
 	 * Since we are merging two sorted lists, we take the smaller of the
-	 * next items.  In case of a tie, we take the one that is in its set
-	 * first.  If we took one not in the set first, it would decrement the
-	 * count, possibly to 0 which would cause it to be output as ending the
-	 * range, and the next time through we would take the same number, and
-	 * output it again as beginning the next range.  By doing it the
-	 * opposite way, there is no possibility that the count will be
-	 * momentarily decremented to 0, and thus the two adjoining ranges will
-	 * be seamlessly merged.  (In a tie and both are in the set or both not
-	 * in the set, it doesn't matter which we take first.) */
-	if (array_a[i_a] < array_b[i_b]
-	    || (array_a[i_a] == array_b[i_b]
+         * next items.  In case of a tie, we take first the one that is in its
+         * set.  If we first took the one not in its set, it would decrement
+         * the count, possibly to 0 which would cause it to be output as ending
+         * the range, and the next time through we would take the same number,
+         * and output it again as beginning the next range.  By doing it the
+         * opposite way, there is no possibility that the count will be
+         * momentarily decremented to 0, and thus the two adjoining ranges will
+         * be seamlessly merged.  (In a tie and both are in the set or both not
+         * in the set, it doesn't matter which we take first.) */
+	if (       array_a[i_a] < array_b[i_b]
+	    || (   array_a[i_a] == array_b[i_b]
 		&& ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
 	{
 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
-	    cp= array_a[i_a++];
+	    cp = array_a[i_a++];
 	}
 	else {
 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
@@ -9127,39 +9154,53 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
 	}
     }
 
-    /* Here, we are finished going through at least one of the lists, which
-     * means there is something remaining in at most one.  We check if the list
-     * that hasn't been exhausted is positioned such that we are in the middle
-     * of a range in its set or not.  (i_a and i_b point to the element beyond
-     * the one we care about.) If in the set, we decrement 'count'; if 0, there
-     * is potentially more to output.
-     * There are four cases:
-     *	1) Both weren't in their sets, count is 0, and remains 0.  What's left
-     *	   in the union is entirely from the non-exhausted set.
-     *	2) Both were in their sets, count is 2.  Nothing further should
-     *	   be output, as everything that remains will be in the exhausted
-     *	   list's set, hence in the union; decrementing to 1 but not 0 insures
-     *	   that
-     *	3) the exhausted was in its set, non-exhausted isn't, count is 1.
-     *	   Nothing further should be output because the union includes
-     *	   everything from the exhausted set.  Not decrementing ensures that.
-     *	4) the exhausted wasn't in its set, non-exhausted is, count is 1;
-     *	   decrementing to 0 insures that we look at the remainder of the
-     *	   non-exhausted set */
+
+    /* The loop above increments the index into exactly one of the input lists
+     * each iteration, and ends when either index gets to its list end.  That
+     * means the other index is lower than its end, and so something is
+     * remaining in that one.  We decrement 'count', as explained below, if
+     * that list is in its set.  (i_a and i_b each currently index the element
+     * beyond the one we care about.) */
     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
 	|| (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
     {
 	count--;
     }
 
-    /* The final length is what we've output so far, plus what else is about to
-     * be output.  (If 'count' is non-zero, then the input list we exhausted
-     * has everything remaining up to the machine's limit in its set, and hence
-     * in the union, so there will be no further output. */
-    len_u = i_u;
-    if (count == 0) {
-	/* At most one of the subexpressions will be non-zero */
-	len_u += (len_a - i_a) + (len_b - i_b);
+    /* Above we decremented 'count' if the list that had unexamined elements in
+     * it was in its set.  This has made it so that 'count' being non-zero
+     * means there isn't anything left to output; and 'count' equal to 0 means
+     * that what is left to output is precisely that which is left in the
+     * non-exhausted input list.
+     *
+     * To see why, note first that the exhausted input obviously has nothing
+     * left to add to the union.  If it was in its set at its end, that means
+     * the set extends from here to the platform's infinity, and hence so does
+     * the union and the non-exhausted set is irrelevant.  The exhausted set
+     * also contributed 1 to 'count'.  If 'count' was 2, it got decremented to
+     * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
+     * 'count' remains at 1.  This is consistent with the decremented 'count'
+     * != 0 meaning there's nothing left to add to the union.
+     *
+     * But if the exhausted input wasn't in its set, it contributed 0 to
+     * 'count', and the rest of the union will be whatever the other input is.
+     * If 'count' was 0, neither list was in its set, and 'count' remains 0;
+     * otherwise it gets decremented to 0.  This is consistent with 'count'
+     * == 0 meaning the remainder of the union is whatever is left in the
+     * non-exhausted list. */
+    if (count != 0) {
+        len_u = i_u;
+    }
+    else {
+        IV copy_count = len_a - i_a;
+        if (copy_count > 0) {   /* The non-exhausted input is 'a' */
+	    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
+        }
+        else { /* The non-exhausted input is b */
+            copy_count = len_b - i_b;
+	    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
+        }
+        len_u = i_u + copy_count;
     }
 
     /* Set the result to the final length, which can change the pointer to
@@ -9171,22 +9212,6 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
 	array_u = invlist_array(u);
     }
 
-    /* When 'count' is 0, the list that was exhausted (if one was shorter than
-     * the other) ended with everything above it not in its set.  That means
-     * that the remaining part of the union is precisely the same as the
-     * non-exhausted list, so can just copy it unchanged.  (If both lists were
-     * exhausted at the same time, then the operations below will be both 0.)
-     */
-    if (count == 0) {
-	IV copy_count; /* At most one will have a non-zero copy count */
-	if ((copy_count = len_a - i_a) > 0) {
-	    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
-	}
-	else if ((copy_count = len_b - i_b) > 0) {
-	    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
-	}
-    }
-
     /* If the output is not to overwrite either of the inputs, just return the
      * calculated union */
     if (a != *output && b != *output) {
@@ -9199,7 +9224,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
          *  shown [perl #127392] that if the input is a mortal, we can get a
          *  huge build-up of these during regex compilation before they get
          *  freed.  So for that case, replace just the input's interior with
-         *  the output's, and then free the output */
+         *  the union's, and then free the union */
 
         assert(! invlist_is_iterating(*output));
 
@@ -9251,12 +9276,12 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
     UV i_b = 0;
     UV i_r = 0;
 
-    /* running count, as explained in the algorithm source book; items are
-     * stopped accumulating and are output when the count changes to/from 2.
-     * The count is incremented when we start a range that's in the set, and
-     * decremented when we start a range that's not in the set.  So its range
-     * is 0 to 2.  Only when the count is 2 is something in the intersection.
-     */
+    /* running count of how many of the two inputs are postitioned at ranges
+     * that are in their sets.  As explained in the algorithm source book,
+     * items are stopped accumulating and are output when the count changes
+     * to/from 2.  The count is incremented when we start a range that's in an
+     * input's set, and decremented when we start a range that's not in a set.
+     * Only when it is 2 are we in the intersection. */
     UV count = 0;
 
     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
@@ -9328,8 +9353,8 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
     r= _new_invlist(len_a + len_b);
 
     /* Will contain U+0000 iff both components do */
-    array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
-				     && len_b > 0 && array_b[0] == 0);
+    array_r = _invlist_array_init(r,    len_a > 0 && array_a[0] == 0
+                                     && len_b > 0 && array_b[0] == 0);
 
     /* Go through each list item by item, stopping when exhausted one of
      * them */
@@ -9340,21 +9365,21 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
 
 	/* We need to take one or the other of the two inputs for the
 	 * intersection.  Since we are merging two sorted lists, we take the
-	 * smaller of the next items.  In case of a tie, we take the one that
-	 * is not in its set first (a difference from the union algorithm).  If
-	 * we took one in the set first, it would increment the count, possibly
-	 * to 2 which would cause it to be output as starting a range in the
-	 * intersection, and the next time through we would take that same
-	 * number, and output it again as ending the set.  By doing it the
-	 * opposite of this, there is no possibility that the count will be
-	 * momentarily incremented to 2.  (In a tie and both are in the set or
-	 * both not in the set, it doesn't matter which we take first.) */
-	if (array_a[i_a] < array_b[i_b]
-	    || (array_a[i_a] == array_b[i_b]
+         * smaller of the next items.  In case of a tie, we take first the one
+         * that is not in its set (a difference from the union algorithm).  If
+         * we first took the one in its set, it would increment the count,
+         * possibly to 2 which would cause it to be output as starting a range
+         * in the intersection, and the next time through we would take that
+         * same number, and output it again as ending the set.  By doing the
+         * opposite of this, there is no possibility that the count will be
+         * momentarily incremented to 2.  (In a tie and both are in the set or
+         * both not in the set, it doesn't matter which we take first.) */
+	if (       array_a[i_a] < array_b[i_b]
+	    || (   array_a[i_a] == array_b[i_b]
 		&& ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
 	{
 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
-	    cp= array_a[i_a++];
+	    cp = array_a[i_a++];
 	}
 	else {
 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
@@ -9376,36 +9401,55 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
 	    }
 	    count--;
 	}
+
     }
 
-    /* Here, we are finished going through at least one of the lists, which
-     * means there is something remaining in at most one.  We check if the list
-     * that has been exhausted is positioned such that we are in the middle
-     * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
-     * the ones we care about.)  There are four cases:
-     *	1) Both weren't in their sets, count is 0, and remains 0.  There's
-     *	   nothing left in the intersection.
-     *	2) Both were in their sets, count is 2 and perhaps is incremented to
-     *	   above 2.  What should be output is exactly that which is in the
-     *	   non-exhausted set, as everything it has is also in the intersection
-     *	   set, and everything it doesn't have can't be in the intersection
-     *	3) The exhausted was in its set, non-exhausted isn't, count is 1, and
-     *	   gets incremented to 2.  Like the previous case, the intersection is
-     *	   everything that remains in the non-exhausted set.
-     *	4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
-     *	   remains 1.  And the intersection has nothing more. */
+    /* The loop above increments the index into exactly one of the input lists
+     * each iteration, and ends when either index gets to its list end.  That
+     * means the other index is lower than its end, and so something is
+     * remaining in that one.  We increment 'count', as explained below, if the
+     * exhausted list was in its set.  (i_a and i_b each currently index the
+     * element beyond the one we care about.) */
     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
-	|| (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
+        || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
     {
 	count++;
     }
 
-    /* The final length is what we've output so far plus what else is in the
-     * intersection.  At most one of the subexpressions below will be non-zero
-     * */
-    len_r = i_r;
-    if (count >= 2) {
-	len_r += (len_a - i_a) + (len_b - i_b);
+    /* Above we incremented 'count' if the exhausted list was in its set.  This
+     * has made it so that 'count' being below 2 means there is nothing left to
+     * output; otheriwse what's left to add to the intersection is precisely
+     * that which is left in the non-exhausted input list.
+     *
+     * To see why, note first that the exhausted input obviously has nothing
+     * left to affect the intersection.  If it was in its set at its end, that
+     * means the set extends from here to the platform's infinity, and hence
+     * anything in the non-exhausted's list will be in the intersection, and
+     * anything not in it won't be.  Hence, the rest of the intersection is
+     * precisely what's in the non-exhausted list  The exhausted set also
+     * contributed 1 to 'count', meaning 'count' was at least 1.  Incrementing
+     * it means 'count' is now at least 2.  This is consistent with the
+     * incremented 'count' being >= 2 means to add the non-exhausted list to
+     * the intersection.
+     *
+     * But if the exhausted input wasn't in its set, it contributed 0 to
+     * 'count', and the intersection can't include anything further; the
+     * non-exhausted set is irrelevant.  'count' was at most 1, and doesn't get
+     * incremented.  This is consistent with 'count' being < 2 meaning nothing
+     * further to add to the intersection. */
+    if (count < 2) { /* Nothing left to put in the intersection. */
+        len_r = i_r;
+    }
+    else { /* copy the non-exhausted list, unchanged. */
+        IV copy_count = len_a - i_a;
+        if (copy_count > 0) {   /* a is the one with stuff left */
+	    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
+        }
+        else {  /* b is the one with stuff left */
+            copy_count = len_b - i_b;
+	    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
+        }
+        len_r = i_r + copy_count;
     }
 
     /* Set the result to the final length, which can change the pointer to
@@ -9464,50 +9508,261 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
 }
 
 SV*
-Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
+Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
 {
     /* Add the range from 'start' to 'end' inclusive to the inversion list's
      * set.  A pointer to the inversion list is returned.  This may actually be
      * a new list, in which case the passed in one has been destroyed.  The
      * passed-in inversion list can be NULL, in which case a new one is created
-     * with just the one range in it */
-
-    SV* range_invlist;
-    UV len;
-
+     * with just the one range in it.  The new list is not necessarily
+     * NUL-terminated.  Space is not freed if the inversion list shrinks as a
+     * result of this function.  The gain would not be large, and in many
+     * cases, this is called multiple times on a single inversion list, so
+     * anything freed may almost immediately be needed again.
+     *
+     * This used to mostly call the 'union' routine, but that is much more
+     * heavyweight than really needed for a single range addition */
+
+    UV* array;              /* The array implementing the inversion list */
+    UV len;                 /* How many elements in 'array' */
+    SSize_t i_s;            /* index into the invlist array where 'start'
+                               should go */
+    SSize_t i_e = 0;        /* And the index where 'end' should go */
+    UV cur_highest;         /* The highest code point in the inversion list
+                               upon entry to this function */
+
+    /* This range becomes the whole inversion list if none already existed */
     if (invlist == NULL) {
 	invlist = _new_invlist(2);
-	len = 0;
+        _append_range_to_invlist(invlist, start, end);
+        return invlist;
     }
-    else {
-	len = _invlist_len(invlist);
+
+    /* Likewise, if the inversion list is currently empty */
+    len = _invlist_len(invlist);
+    if (len == 0) {
+        _append_range_to_invlist(invlist, start, end);
+        return invlist;
     }
 
-    /* If comes after the final entry actually in the list, can just append it
-     * to the end, */
-    if (len == 0
-	|| (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
-            && start >= invlist_array(invlist)[len - 1]))
-    {
-	_append_range_to_invlist(invlist, start, end);
-	return invlist;
+    /* Starting here, we have to know the internals of the list */
+    array = invlist_array(invlist);
+
+    /* If the new range ends higher than the current highest ... */
+    cur_highest = invlist_highest(invlist);
+    if (end > cur_highest) {
+
+        /* If the whole range is higher, we can just append it */
+        if (start > cur_highest) {
+            _append_range_to_invlist(invlist, start, end);
+            return invlist;
+        }
+
+        /* Otherwise, add the portion that is higher ... */
+        _append_range_to_invlist(invlist, cur_highest + 1, end);
+
+        /* ... and continue on below to handle the rest.  As a result of the
+         * above append, we know that the index of the end of the range is the
+         * final even numbered one of the array.  Recall that the final element
+         * always starts a range that extends to infinity.  If that range is in
+         * the set (meaning the set goes from here to infinity), it will be an
+         * even index, but if it isn't in the set, it's odd, and the final
+         * range in the set is one less, which is even. */
+        if (end == UV_MAX) {
+            i_e = len;
+        }
+        else {
+            i_e = len - 2;
+        }
+    }
+
+    /* We have dealt with appending, now see about prepending.  If the new
+     * range starts lower than the current lowest ... */
+    if (start < array[0]) {
+
+        /* Adding something which has 0 in it is somewhat tricky, and uncommon.
+         * Let the union code handle it, rather than having to know the
+         * trickiness in two code places.  */
+        if (UNLIKELY(start == 0)) {
+            SV* range_invlist;
+
+            range_invlist = _new_invlist(2);
+            _append_range_to_invlist(range_invlist, start, end);
+
+            _invlist_union(invlist, range_invlist, &invlist);
+
+            SvREFCNT_dec_NN(range_invlist);
+
+            return invlist;
+        }
+
+        /* If the whole new range comes before the first entry, and doesn't
+         * extend it, we have to insert it as an additional range */
+        if (end < array[0] - 1) {
+            i_s = i_e = -1;
+            goto splice_in_new_range;
+        }
+
+        /* Here the new range adjoins the existing first range, extending it
+         * downwards. */
+        array[0] = start;
+
+        /* And continue on below to handle the rest.  We know that the index of
+         * the beginning of the range is the first one of the array */
+        i_s = 0;
+    }
+    else { /* Not prepending any part of the new range to the existing list.
+            * Find where in the list it should go.  This finds i_s, such that:
+            *     invlist[i_s] <= start < array[i_s+1]
+            */
+        i_s = _invlist_search(invlist, start);
+    }
+
+    /* At this point, any extending before the beginning of the inversion list
+     * and/or after the end has been done.  This has made it so that, in the
+     * code below, each endpoint of the new range is either in a range that is
+     * in the set, or is in a gap between two ranges that are.  This means we
+     * don't have to worry about exceeding the array bounds.
+     *
+     * Find where in the list the new range ends (but we can skip this if we
+     * have already determined what it is, or if it will be the same as i_s,
+     * which we already have computed) */
+    if (i_e == 0) {
+        i_e = (start == end)
+              ? i_s
+              : _invlist_search(invlist, end);
+    }
+
+    /* Here generally invlist[i_e] <= end < array[i_e+1].  But if invlist[i_e]
+     * is a range that goes to infinity there is no element at invlist[i_e+1],
+     * so only the first relation holds. */
+
+    if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
+
+        /* Here, the ranges on either side of the beginning of the new range
+         * are in the set, and this range starts in the gap between them.
+         *
+         * The new range extends the range above it downwards if the new range
+         * ends at or above that range's start */
+        const bool extends_the_range_above = (   end == UV_MAX
+                                              || end + 1 >= array[i_s+1]);
+
+        /* The new range extends the range below it upwards if it begins just
+         * after where that range ends */
+        if (start == array[i_s]) {
+
+            /* If the new range fills the entire gap between the other ranges,
+             * they will get merged together.  Other ranges may also get
+             * merged, depending on how many of them the new range spans.  In
+             * the general case, we do the merge later, just once, after we
+             * figure out how many to merge.  But in the case where the new
+             * range exactly spans just this one gap (possibly extending into
+             * the one above), we do the merge here, and an early exit.  This
+             * is done here to avoid having to special case later. */
+            if (i_e - i_s <= 1) {
+
+                /* If i_e - i_s == 1, it means that the new range terminates
+                 * within the range above, and hence 'extends_the_range_above'
+                 * must be true.  (If the range above it extends to infinity,
+                 * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
+                 * will be 0, so no harm done.) */
+                if (extends_the_range_above) {
+                    Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
+                    invlist_set_len(invlist,
+                                    len - 2,
+                                    *(get_invlist_offset_addr(invlist)));
+                    return invlist;
+                }
+
+                /* Here, i_e must == i_s.  We keep them in sync, as they apply
+                 * to the same range, and below we are about to decrement i_s
+                 * */
+                i_e--;
+            }
+
+            /* Here, the new range is adjacent to the one below.  (It may also
+             * span beyond the range above, but that will get resolved later.)
+             * Extend the range below to include this one. */
+            array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
+            i_s--;
+            start = array[i_s];
+        }
+        else if (extends_the_range_above) {
+
+            /* Here the new range only extends the range above it, but not the
+             * one below.  It merges with the one above.  Again, we keep i_e
+             * and i_s in sync if they point to the same range */
+            if (i_e == i_s) {
+                i_e++;
+            }
+            i_s++;
+            array[i_s] = start;
+        }
+    }
+
+    /* Here, we've dealt with the new range start extending any adjoining
+     * existing ranges.
+     *
+     * If the new range extends to infinity, it is now the final one,
+     * regardless of what was there before */
+    if (UNLIKELY(end == UV_MAX)) {
+        invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
+        return invlist;
     }
 
-    /* Here, can't just append things, create and return a new inversion list
-     * which is the union of this range and the existing inversion list.  (If
-     * the new range is well-behaved wrt to the old one, we could just insert
-     * it, doing a Move() down on the tail of the old one (potentially growing
-     * it first).  But to determine that means we would have the extra
-     * (possibly throw-away) work of first finding where the new one goes and
-     * whether it disrupts (splits) an existing range, so it doesn't appear to
-     * me (khw) that it's worth it) */
-    range_invlist = _new_invlist(2);
-    _append_range_to_invlist(range_invlist, start, end);
+    /* If i_e started as == i_s, it has also been dealt with,
+     * and been updated to the new i_s, which will fail the following if */
+    if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
 
-    _invlist_union(invlist, range_invlist, &invlist);
+        /* Here, the ranges on either side of the end of the new range are in
+         * the set, and this range ends in the gap between them.
+         *
+         * If this range is adjacent to (hence extends) the range above it, it
+         * becomes part of that range; likewise if it extends the range below,
+         * it becomes part of that range */
+        if (end + 1 == array[i_e+1]) {
+            i_e++;
+            array[i_e] = start;
+        }
+        else if (start <= array[i_e]) {
+            array[i_e] = end + 1;
+            i_e--;
+        }
+    }
+
+    if (i_s == i_e) {
+
+        /* If the range fits entirely in an existing range (as possibly already
+         * extended above), it doesn't add anything new */
+        if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
+            return invlist;
+        }
 
-    /* The temporary can be freed */
-    SvREFCNT_dec_NN(range_invlist);
+        /* Here, no part of the range is in the list.  Must add it.  It will
+         * occupy 2 more slots */
+      splice_in_new_range:
+
+        invlist_extend(invlist, len + 2);
+        array = invlist_array(invlist);
+        /* Move the rest of the array down two slots. Don't include any
+         * trailing NUL */
+        Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
+
+        /* Do the actual splice */
+        array[i_e+1] = start;
+        array[i_e+2] = end + 1;
+        invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
+        return invlist;
+    }
+
+    /* Here the new range crossed the boundaries of a pre-existing range.  The
+     * code above has adjusted things so that both ends are in ranges that are
+     * in the set.  This means everything in between must also be in the set.
+     * Just squash things together */
+    Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
+    invlist_set_len(invlist,
+                    len - i_e + i_s,
+                    *(get_invlist_offset_addr(invlist)));
 
     return invlist;
 }
@@ -9533,7 +9788,7 @@ Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
 
     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
 
-    _append_range_to_invlist(invlist, element0, element0);
+    invlist = add_cp_to_invlist(invlist, element0);
     offset = *get_invlist_offset_addr(invlist);
 
     invlist_set_len(invlist, size, offset);
@@ -10218,8 +10473,9 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
                 if (RExC_flags & RXf_PMf_FOLD) {
                     RExC_contains_i = 1;
                 }
-                if (PASS2) {
-                    STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
+
+                if (UNLIKELY((x_mod_count) > 1)) {
+                    vFAIL("Only one /x regex modifier is allowed");
                 }
                 return;
                 /*NOTREACHED*/
@@ -10604,7 +10860,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 RExC_seen |= REG_LOOKBEHIND_SEEN;
 		RExC_in_lookbehind++;
 		RExC_parse++;
-                assert(RExC_parse < RExC_end);
+                if (RExC_parse >= RExC_end) {
+                    vFAIL("Sequence (?... not terminated");
+                }
+
                 /* FALLTHROUGH */
 	    case '=':           /* (?=...) */
 		RExC_seen_zerolen++;
@@ -11340,7 +11599,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
 	}
 	else if (ret == NULL)
-	    ret = latest;
+            ret = latest;
 	*flagp |= flags&(HASWIDTH|POSTPONED);
 	if (chain == NULL) 	/* First piece. */
 	    *flagp |= flags&SPSTART;
@@ -11950,39 +12209,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
 }
 
 
-/*
- * reg_recode
- *
- * It returns the code point in utf8 for the value in *encp.
- *    value: a code value in the source encoding
- *    encp:  a pointer to an Encode object
- *
- * If the result from Encode is not a single character,
- * it returns U+FFFD (Replacement character) and sets *encp to NULL.
- */
-STATIC UV
-S_reg_recode(pTHX_ const U8 value, SV **encp)
-{
-    STRLEN numlen = 1;
-    SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP);
-    const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
-    const STRLEN newlen = SvCUR(sv);
-    UV uv = UNICODE_REPLACEMENT;
-
-    PERL_ARGS_ASSERT_REG_RECODE;
-
-    if (newlen)
-	uv = SvUTF8(sv)
-	     ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
-	     : *(U8*)s;
-
-    if (!newlen || numlen != newlen) {
-	uv = UNICODE_REPLACEMENT;
-	*encp = NULL;
-    }
-    return uv;
-}
-
 PERL_STATIC_INLINE U8
 S_compute_EXACTish(RExC_state_t *pRExC_state)
 {
@@ -12196,13 +12422,15 @@ S_backref_value(char *p)
 /*
  - regatom - the lowest level
 
-   Try to identify anything special at the start of the pattern. If there
-   is, then handle it as required. This may involve generating a single regop,
-   such as for an assertion; or it may involve recursing, such as to
-   handle a () structure.
+   Try to identify anything special at the start of the current parse position.
+   If there is, then handle it as required. This may involve generating a
+   single regop, such as for an assertion; or it may involve recursing, such as
+   to handle a () structure.
 
    If the string doesn't start with something special then we gobble up
-   as much literal text as we can.
+   as much literal text as we can.  If we encounter a quantifier, we have to
+   back off the final literal character, as that quantifier applies to just it
+   and not to the whole string of literals.
 
    Once we have been able to handle whatever type of thing started the
    sequence, we return.
@@ -12917,6 +13145,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
                    || UTF8_IS_START(UCHARAT(RExC_parse)));
 
+            /* Here, we have a literal character.  Find the maximal string of
+             * them in the input that we can fit into a single EXACTish node.
+             * We quit at the first non-literal or when the node gets full */
 	    for (p = RExC_parse;
 	         len < upper_parse && p < RExC_end;
 	         len++)
@@ -13047,9 +13278,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 				vFAIL(error_msg);
 			    }
                             ender = result;
-			    if (IN_ENCODING && ender < 0x100) {
-				goto recode_encoding;
-			    }
 			    if (ender > 0xff) {
 				REQUIRE_UTF8(flagp);
 			    }
@@ -13082,11 +13310,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                                 if (RExC_recode_x_to_native) {
                                     ender = LATIN1_TO_NATIVE(ender);
                                 }
-                                else
 #endif
-                                if (IN_ENCODING) {
-                                    goto recode_encoding;
-                                }
 			    }
                             else {
 				REQUIRE_UTF8(flagp);
@@ -13146,17 +13370,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                                          form_short_octal_warning(p, numlen));
                             }
 			}
-			if (IN_ENCODING && ender < 0x100)
-			    goto recode_encoding;
-			break;
-		      recode_encoding:
-			if (! RExC_override_recoding) {
-			    SV* enc = _get_encoding();
-			    ender = reg_recode((U8)ender, &enc);
-			    if (!enc && PASS2)
-				ckWARNreg(p, "Invalid escape in the specified encoding");
-			    REQUIRE_UTF8(flagp);
-			}
 			break;
 		    case '\0':
 			if (p >= RExC_end)
@@ -13174,14 +13387,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 		    } /* End of switch on '\' */
 		    break;
 		case '{':
-		    /* Currently we don't warn when the lbrace is at the start
+		    /* Currently we don't care if the lbrace is at the start
 		     * of a construct.  This catches it in the middle of a
 		     * literal string, or when it's the first thing after
 		     * something like "\b" */
-		    if (! SIZE_ONLY
-			&& (len || (p > RExC_start && isALPHA_A(*(p -1)))))
-		    {
-			ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
+		    if (len || (p > RExC_start && isALPHA_A(*(p -1)))) {
+                        RExC_parse = p + 1;
+			vFAIL("Unescaped left brace in regex is illegal here");
 		    }
 		    /*FALLTHROUGH*/
 		default:    /* A literal character */
@@ -13213,6 +13425,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  * the node, close the node with just them, and set up to do
                  * this character again next time through, when it will be the
                  * only thing in its new node */
+
                 if ((next_is_quantifier = (   LIKELY(p < RExC_end)
                                            && UNLIKELY(ISMULT2(p))))
                     && LIKELY(len))
@@ -13586,8 +13799,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 	    RExC_parse = p - 1;
             Set_Node_Cur_Length(ret, parse_start);
 	    RExC_parse = p;
-            skip_to_be_ignored_text(pRExC_state, &RExC_parse,
-                                    FALSE /* Don't force to /x */ );
 	    {
 		/* len is STRLEN which is unsigned, need to copy to signed */
 		IV iv = len;
@@ -13599,6 +13810,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 	break;
     } /* End of giant switch on input character */
 
+    /* Position parse to next real character */
+    skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                                            FALSE /* Don't force to /x */ );
+    if (PASS2 && *RExC_parse == '{' && OP(ret) != SBOL && ! regcurly(RExC_parse)) {
+        ckWARNregdep(RExC_parse + 1, "Unescaped left brace in regex is deprecated here, passed through");
+    }
+
     return(ret);
 }
 
@@ -13689,8 +13907,8 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
  * routine. q.v. */
 #define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
         if (posix_warnings) {                                               \
-            if (! warn_text) warn_text = newAV();                           \
-            av_push(warn_text, Perl_newSVpvf(aTHX_                          \
+            if (! RExC_warn_text ) RExC_warn_text = (AV *) sv_2mortal((SV *) newAV()); \
+            av_push(RExC_warn_text, Perl_newSVpvf(aTHX_                          \
                                              WARNING_PREFIX                 \
                                              text                           \
                                              REPORT_LOCATION,               \
@@ -13821,7 +14039,6 @@ S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
     bool has_opening_colon    = FALSE;
     int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
                                                    valid class */
-    AV* warn_text             = NULL;   /* any warning messages */
     const char * possible_end = NULL;   /* used for a 2nd parse pass */
     const char* name_start;             /* ptr to class name first char */
 
@@ -13837,6 +14054,9 @@ S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
 
     PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
 
+    if (posix_warnings && RExC_warn_text)
+        av_clear(RExC_warn_text);
+
     if (p >= e) {
         return NOT_MEANT_TO_BE_A_POSIX_CLASS;
     }
@@ -14454,14 +14674,8 @@ S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
                 ADD_POSIX_WARNING(p, "there is no terminating ']'");
             }
 
-            if (warn_text) {
-                if (posix_warnings) {
-                    /* mortalize to avoid a leak with FATAL warnings */
-                    *posix_warnings = (AV *) sv_2mortal((SV *) warn_text);
-                }
-                else {
-                    SvREFCNT_dec_NN(warn_text);
-                }
+            if (posix_warnings && RExC_warn_text && av_top_index(RExC_warn_text) > -1) {
+                *posix_warnings = RExC_warn_text;
             }
         }
         else if (class_number != OOB_NAMEDCLASS) {
@@ -14533,7 +14747,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                                        'stack' of where the undealt-with left
                                        parens would be if they were actually
                                        put there */
-    IV fence = 0;                   /* Position of where most recent undealt-
+    /* The 'VOL' (expanding to 'volatile') is a workaround for an optimiser bug
+     * in Solaris Studio 12.3. See RT #127455 */
+    VOL IV fence = 0;               /* Position of where most recent undealt-
                                        with left paren in stack is; -1 if none.
                                      */
     STRLEN len;                     /* Temporary */
@@ -15138,21 +15354,7 @@ redo_curchar:
              * may have altered the stack in the time since we earlier set
              * 'top_index'.  */
 
-            {
-                /* Work round an optimiser bug in Solaris Studio 12.3:
-                 * for some reason, the presence of the __assert() in
-                 * av_tindex_nomg() causes the value of fence to get
-                 * corrupted, even though the assert is never called. So
-                 * save the value then restore afterwards.
-                 * Note that in fact merely accessing the value of fence
-                 * prior to the statement containing the assert is enough
-                 * to make the bug go away.
-                 */
-                IV f = fence;
-                top_index = av_tindex_nomg(stack);
-                fence = f;
-            }
-
+            top_index = av_tindex_nomg(stack);
             if (top_index - fence >= 0) {
                 /* If the top entry on the stack is an operator, it had better
                  * be a '!', otherwise the entry below the top operand should
@@ -15972,6 +16174,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     SAVEFREEPV(name);
                     if (FOLD) {
                         lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name));
+
+                        /* The function call just below that uses this can fail
+                         * to return, leaking memory if we don't do this */
+                        SAVEFREEPV(lookup_name);
                     }
 
                     /* Look up the property name, and get its swash and
@@ -15987,9 +16193,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                              NULL, /* No inversion list */
                                              &swash_init_flags
                                             );
-                    if (lookup_name) {
-                        Safefree(lookup_name);
-                    }
                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
                         HV* curpkg = (IN_PERL_COMPILETIME)
                                       ? PL_curstash
@@ -16141,9 +16344,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 		    }
 		}
                 non_portable_endpoint++;
-		if (IN_ENCODING && value < 0x100) {
-		    goto recode_encoding;
-		}
 		break;
 	    case 'x':
 		RExC_parse--;	/* function expects to be pointed at the 'x' */
@@ -16161,8 +16361,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 		    }
 		}
                 non_portable_endpoint++;
-		if (IN_ENCODING && value < 0x100)
-		    goto recode_encoding;
 		break;
 	    case 'c':
 		value = grok_bslash_c(*RExC_parse++, PASS2);
@@ -16195,23 +16393,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                         }
                     }
                     non_portable_endpoint++;
-		    if (IN_ENCODING && value < 0x100)
-			goto recode_encoding;
-		    break;
-		}
-	      recode_encoding:
-		if (! RExC_override_recoding) {
-		    SV* enc = _get_encoding();
-		    value = reg_recode((U8)value, &enc);
-		    if (!enc) {
-                        if (strict) {
-                            vFAIL("Invalid escape in the specified encoding");
-                        }
-                        else if (PASS2) {
-                            ckWARNreg(RExC_parse,
-				  "Invalid escape in the specified encoding");
-                        }
-                    }
 		    break;
 		}
 	    default:
@@ -16352,9 +16533,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             else if (! SIZE_ONLY) {
 
                 /* Here, not in pass1 (in that pass we skip calculating the
-                 * contents of this class), and is /l, or is a POSIX class for
-                 * which /l doesn't matter (or is a Unicode property, which is
-                 * skipped here). */
+                 * contents of this class), and is not /l, or is a POSIX class
+                 * for which /l doesn't matter (or is a Unicode property, which
+                 * is skipped here). */
                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
 
@@ -16379,9 +16560,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                 &cp_list);
                     }
                 }
-                else if (UNI_SEMANTICS
+                else if (  UNI_SEMANTICS
                         || classnum == _CC_ASCII
-                        || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
+                        || (DEPENDS_SEMANTICS && (   classnum == _CC_DIGIT
                                                   || classnum == _CC_XDIGIT)))
                 {
                     /* We usually have to worry about /d and /a affecting what
@@ -17178,76 +17359,152 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 	SvREFCNT_dec_NN(cp_foldable_list);
     }
 
-    /* And combine the result (if any) with any inversion list from posix
+    /* And combine the result (if any) with any inversion lists from posix
      * classes.  The lists are kept separate up to now because we don't want to
      * fold the classes (folding of those is automatically handled by the swash
      * fetching code) */
-    if (simple_posixes) {
-        _invlist_union(cp_list, simple_posixes, &cp_list);
-        SvREFCNT_dec_NN(simple_posixes);
+    if (simple_posixes) {   /* These are the classes known to be unaffected by
+                               /a, /aa, and /d */
+        if (cp_list) {
+            _invlist_union(cp_list, simple_posixes, &cp_list);
+            SvREFCNT_dec_NN(simple_posixes);
+        }
+        else {
+            cp_list = simple_posixes;
+        }
     }
     if (posixes || nposixes) {
-        if (posixes && AT_LEAST_ASCII_RESTRICTED) {
+
+        /* We have to adjust /a and /aa */
+        if (AT_LEAST_ASCII_RESTRICTED) {
+
             /* Under /a and /aa, nothing above ASCII matches these */
-            _invlist_intersection(posixes,
-                                  PL_XPosix_ptrs[_CC_ASCII],
-                                  &posixes);
-        }
-        if (nposixes) {
-            if (DEPENDS_SEMANTICS) {
-                /* Under /d, everything in the upper half of the Latin1 range
-                 * matches these complements */
-                ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+            if (posixes) {
+                _invlist_intersection(posixes,
+                                    PL_XPosix_ptrs[_CC_ASCII],
+                                    &posixes);
             }
-            else if (AT_LEAST_ASCII_RESTRICTED) {
-                /* Under /a and /aa, everything above ASCII matches these
-                 * complements */
+
+            /* Under /a and /aa, everything above ASCII matches these
+             * complements */
+            if (nposixes) {
                 _invlist_union_complement_2nd(nposixes,
                                               PL_XPosix_ptrs[_CC_ASCII],
                                               &nposixes);
             }
-            if (posixes) {
-                _invlist_union(posixes, nposixes, &posixes);
-                SvREFCNT_dec_NN(nposixes);
-            }
-            else {
-                posixes = nposixes;
-            }
         }
+
         if (! DEPENDS_SEMANTICS) {
-            if (cp_list) {
-                _invlist_union(cp_list, posixes, &cp_list);
-                SvREFCNT_dec_NN(posixes);
+
+            /* For everything but /d, we can just add the current 'posixes' and
+             * 'nposixes' to the main list */
+            if (posixes) {
+                if (cp_list) {
+                    _invlist_union(cp_list, posixes, &cp_list);
+                    SvREFCNT_dec_NN(posixes);
+                }
+                else {
+                    cp_list = posixes;
+                }
             }
-            else {
-                cp_list = posixes;
+            if (nposixes) {
+                if (cp_list) {
+                    _invlist_union(cp_list, nposixes, &cp_list);
+                    SvREFCNT_dec_NN(nposixes);
+                }
+                else {
+                    cp_list = nposixes;
+                }
             }
         }
         else {
-            /* Under /d, we put into a separate list the Latin1 things that
-             * match only when the target string is utf8 */
-            SV* nonascii_but_latin1_properties = NULL;
-            _invlist_intersection(posixes, PL_UpperLatin1,
-                                  &nonascii_but_latin1_properties);
-            _invlist_subtract(posixes, nonascii_but_latin1_properties,
-                              &posixes);
-            if (cp_list) {
-                _invlist_union(cp_list, posixes, &cp_list);
-                SvREFCNT_dec_NN(posixes);
+            /* Under /d, things like \w match upper Latin1 characters only if
+             * the target string is in UTF-8.  But things like \W match all the
+             * upper Latin1 characters if the target string is not in UTF-8.
+             *
+             * Handle the case where there something like \W separately */
+            if (nposixes) {
+                SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
+
+                /* A complemented posix class matches all upper Latin1
+                 * characters if not in UTF-8.  And it matches just certain
+                 * ones when in UTF-8.  That means those certain ones are
+                 * matched regardless, so can just be added to the
+                 * unconditional list */
+                if (cp_list) {
+                    _invlist_union(cp_list, nposixes, &cp_list);
+                    SvREFCNT_dec_NN(nposixes);
+                    nposixes = NULL;
+                }
+                else {
+                    cp_list = nposixes;
+                }
+
+                /* Likewise for 'posixes' */
+                _invlist_union(posixes, cp_list, &cp_list);
+
+                /* Likewise for anything else in the range that matched only
+                 * under UTF-8 */
+                if (has_upper_latin1_only_utf8_matches) {
+                    _invlist_union(cp_list,
+                                   has_upper_latin1_only_utf8_matches,
+                                   &cp_list);
+                    SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
+                    has_upper_latin1_only_utf8_matches = NULL;
+                }
+
+                /* If we don't match all the upper Latin1 characters regardless
+                 * of UTF-8ness, we have to set a flag to match the rest when
+                 * not in UTF-8 */
+                _invlist_subtract(only_non_utf8_list, cp_list,
+                                  &only_non_utf8_list);
+                if (_invlist_len(only_non_utf8_list) != 0) {
+                    ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+                }
             }
             else {
-                cp_list = posixes;
-            }
-
-            if (has_upper_latin1_only_utf8_matches) {
+                /* Here there were no complemented posix classes.  That means
+                 * the upper Latin1 characters in 'posixes' match only when the
+                 * target string is in UTF-8.  So we have to add them to the
+                 * list of those types of code points, while adding the
+                 * remainder to the unconditional list.
+                 *
+                 * First calculate what they are */
+                SV* nonascii_but_latin1_properties = NULL;
+                _invlist_intersection(posixes, PL_UpperLatin1,
+                                      &nonascii_but_latin1_properties);
+
+                /* And add them to the final list of such characters. */
                 _invlist_union(has_upper_latin1_only_utf8_matches,
                                nonascii_but_latin1_properties,
                                &has_upper_latin1_only_utf8_matches);
-                SvREFCNT_dec_NN(nonascii_but_latin1_properties);
-            }
-            else {
-                has_upper_latin1_only_utf8_matches
-                                            = nonascii_but_latin1_properties;
+
+                /* Remove them from what now becomes the unconditional list */
+                _invlist_subtract(posixes, nonascii_but_latin1_properties,
+                                  &posixes);
+
+                /* And add those unconditional ones to the final list */
+                if (cp_list) {
+                    _invlist_union(cp_list, posixes, &cp_list);
+                    SvREFCNT_dec_NN(posixes);
+                    posixes = NULL;
+                }
+                else {
+                    cp_list = posixes;
+                }
+
+                SvREFCNT_dec(nonascii_but_latin1_properties);
+
+                /* Get rid of any characters that we now know are matched
+                 * unconditionally from the conditional list, which may make
+                 * that list empty */
+                _invlist_subtract(has_upper_latin1_only_utf8_matches,
+                                  cp_list,
+                                  &has_upper_latin1_only_utf8_matches);
+                if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
+                    SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
+                    has_upper_latin1_only_utf8_matches = NULL;
+                }
             }
         }
     }
@@ -17337,79 +17594,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             invlist_iterfinish(cp_list);
         }
     }
-
-#define MATCHES_ALL_NON_UTF8_NON_ASCII(ret)                                 \
-    (   DEPENDS_SEMANTICS                                                   \
-     && (ANYOF_FLAGS(ret)                                                   \
-        & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
-
-    /* See if we can simplify things under /d */
-    if (   has_upper_latin1_only_utf8_matches
-        || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
+    else if (   DEPENDS_SEMANTICS
+             && (    has_upper_latin1_only_utf8_matches
+                 || (ANYOF_FLAGS(ret) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
     {
-        /* But not if we are inverting, as that screws it up */
-        if (! invert) {
-            if (has_upper_latin1_only_utf8_matches) {
-                if (MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
-
-                    /* Here, we have both the flag and inversion list.  Any
-                     * character in 'has_upper_latin1_only_utf8_matches'
-                     * matches when UTF-8 is in effect, but it also matches
-                     * when UTF-8 is not in effect because of
-                     * MATCHES_ALL_NON_UTF8_NON_ASCII.  Therefore it matches
-                     * unconditionally, so can be added to the regular list,
-                     * and 'has_upper_latin1_only_utf8_matches' cleared */
-                    _invlist_union(cp_list,
-                                   has_upper_latin1_only_utf8_matches,
-                                   &cp_list);
-                    SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
-                    has_upper_latin1_only_utf8_matches = NULL;
-                }
-                else if (cp_list) {
-
-                    /* Here, 'cp_list' gives chars that always match, and
-                     * 'has_upper_latin1_only_utf8_matches' gives chars that
-                     * were specified to match only if the target string is in
-                     * UTF-8.  It may be that these overlap, so we can subtract
-                     * the unconditionally matching from the conditional ones,
-                     * to make the conditional list as small as possible,
-                     * perhaps even clearing it, in which case more
-                     * optimizations are possible later */
-                    _invlist_subtract(has_upper_latin1_only_utf8_matches,
-                                      cp_list,
-                                      &has_upper_latin1_only_utf8_matches);
-                    if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
-                        SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
-                        has_upper_latin1_only_utf8_matches = NULL;
-                    }
-                }
-            }
-
-            /* Similarly, if the unconditional matches include every upper
-             * latin1 character, we can clear that flag to permit later
-             * optimizations */
-            if (cp_list && MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
-                SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
-                _invlist_subtract(only_non_utf8_list, cp_list,
-                                  &only_non_utf8_list);
-                if (_invlist_len(only_non_utf8_list) == 0) {
-                    ANYOF_FLAGS(ret) &= ~ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
-                }
-                SvREFCNT_dec_NN(only_non_utf8_list);
-                only_non_utf8_list = NULL;;
-            }
-        }
-
-        /* If we haven't gotten rid of all conditional matching, we change the
-         * regnode type to indicate that */
-        if (   has_upper_latin1_only_utf8_matches
-            || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
-        {
-            OP(ret) = ANYOFD;
-            optimizable = FALSE;
-        }
+        OP(ret) = ANYOFD;
+        optimizable = FALSE;
     }
-#undef MATCHES_ALL_NON_UTF8_NON_ASCII
+
 
     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
      * at compile time.  Besides not inverting folded locale now, we can't
@@ -18107,7 +18299,7 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
 
         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
-                                FALSE /* Don't assume /x */ );
+                                FALSE /* Don't force /x */ );
     }
 }
 
@@ -18237,7 +18429,9 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
 	RExC_size += size;
 	return;
     }
-
+    assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
+                                    studying. If this is wrong then we need to adjust RExC_recurse
+                                    below like we do with RExC_open_parens/RExC_close_parens. */
     src = RExC_emit;
     RExC_emit += size;
     dst = RExC_emit;
@@ -18248,7 +18442,10 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
          * iow it is 1 more than the number of parens seen in
          * the pattern so far. */
         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
-            if ( RExC_open_parens[paren] >= opnd ) {
+            /* note, RExC_open_parens[0] is the start of the
+             * regex, it can't move. RExC_close_parens[0] is the end
+             * of the regex, it *can* move. */
+            if ( paren && RExC_open_parens[paren] >= opnd ) {
                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
                 RExC_open_parens[paren] += size;
             } else {
@@ -18743,7 +18940,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                                                  : TRIE_BITMAP(trie)),
                                                 NULL,
                                                 NULL,
-                                                NULL
+                                                NULL,
+                                                FALSE
                                                );
             sv_catpvs(sv, "]");
         }
@@ -18842,6 +19040,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         /* And things that aren't in the bitmap, but are small enough to be */
         SV* bitmap_range_not_in_bitmap = NULL;
 
+        const bool inverted = flags & ANYOF_INVERT;
+
 	if (OP(o) == ANYOFL) {
             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
                 sv_catpvs(sv, "{utf8-locale-reqd}");
@@ -18886,21 +19086,37 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                                               ANYOF_BITMAP(o),
                                               bitmap_range_not_in_bitmap,
                                               only_utf8_locale_invlist,
-                                              o);
+                                              o,
+
+                                              /* Can't try inverting for a
+                                               * better display if there are
+                                               * things that haven't been
+                                               * resolved */
+                                              unresolved != NULL);
         SvREFCNT_dec(bitmap_range_not_in_bitmap);
 
         /* If there are user-defined properties which haven't been defined yet,
-         * output them, in a separate [] from the bitmap range stuff */
+         * output them.  If the result is not to be inverted, it is clearest to
+         * output them in a separate [] from the bitmap range stuff.  If the
+         * result is to be complemented, we have to show everything in one [],
+         * as the inversion applies to the whole thing.  Use {braces} to
+         * separate them from anything in the bitmap and anything above the
+         * bitmap. */
         if (unresolved) {
-            if (do_sep) {
-                Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
+            if (inverted) {
+                if (! do_sep) { /* If didn't output anything in the bitmap */
+                    sv_catpvs(sv, "^");
+                }
+                sv_catpvs(sv, "{");
             }
-            if (flags & ANYOF_INVERT) {
-                sv_catpvs(sv, "^");
+            else if (do_sep) {
+                Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
             }
             sv_catsv(sv, unresolved);
-            do_sep = TRUE;
-            SvREFCNT_dec_NN(unresolved);
+            if (inverted) {
+                sv_catpvs(sv, "}");
+            }
+            do_sep = ! inverted;
         }
 
         /* And, finally, add the above-the-bitmap stuff */
@@ -18917,9 +19133,11 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
             }
 
-            /* And, for easy of understanding, it is always output not-shown as
-             * complemented */
-            if (flags & ANYOF_INVERT) {
+            /* And, for easy of understanding, it is shown in the
+             * uncomplemented form if possible.  The one exception being if
+             * there are unresolved items, where the inversion has to be
+             * delayed until runtime */
+            if (inverted && ! unresolved) {
                 _invlist_invert(nonbitmap_invlist);
                 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
             }
@@ -18956,6 +19174,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
 
         /* And finally the matching, closing ']' */
 	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
+
+        SvREFCNT_dec(unresolved);
     }
     else if (k == POSIXD || k == NPOSIXD) {
         U8 index = FLAGS(o) * 2;
@@ -19769,30 +19989,36 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
          * mnemonic names.  Split off any of those at the beginning and end of
          * the range to print mnemonically.  It isn't possible for many of
          * these to be in a row, so this won't overwhelm with output */
-        while (isMNEMONIC_CNTRL(start) && start <= end) {
-            put_code_point(sv, start);
-            start++;
-        }
-        if (start < end && isMNEMONIC_CNTRL(end)) {
-
-            /* Here, the final character in the range has a mnemonic name.
-             * Work backwards from the end to find the final non-mnemonic */
-            UV temp_end = end - 1;
-            while (isMNEMONIC_CNTRL(temp_end)) {
-                temp_end--;
+        if (   start <= end
+            && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
+        {
+            while (isMNEMONIC_CNTRL(start) && start <= end) {
+                put_code_point(sv, start);
+                start++;
             }
 
-            /* And separately output the interior range that doesn't start or
-             * end with mnemonics */
-            put_range(sv, start, temp_end, FALSE);
+            /* If this didn't take care of the whole range ... */
+            if (start <= end) {
 
-            /* Then output the mnemonic trailing controls */
-            start = temp_end + 1;
-            while (start <= end) {
-                put_code_point(sv, start);
-                start++;
+                /* Look backwards from the end to find the final non-mnemonic
+                 * */
+                UV temp_end = end;
+                while (isMNEMONIC_CNTRL(temp_end)) {
+                    temp_end--;
+                }
+
+                /* And separately output the interior range that doesn't start
+                 * or end with mnemonics */
+                put_range(sv, start, temp_end, FALSE);
+
+                /* Then output the mnemonic trailing controls */
+                start = temp_end + 1;
+                while (start <= end) {
+                    put_code_point(sv, start);
+                    start++;
+                }
+                break;
             }
-            break;
         }
 
         /* As a final resort, output the range or subrange as hex. */
@@ -19880,7 +20106,9 @@ S_put_charclass_bitmap_innards_common(pTHX_
 )
 {
     /* Create and return an SV containing a displayable version of the bitmap
-     * and associated information determined by the input parameters. */
+     * and associated information determined by the input parameters.  If the
+     * output would have been only the inversion indicator '^', NULL is instead
+     * returned. */
 
     SV * output;
 
@@ -19939,9 +20167,8 @@ S_put_charclass_bitmap_innards_common(pTHX_
         }
     }
 
-    /* If the only thing we output is the '^', clear it */
     if (invert && SvCUR(output) == 1) {
-        SvCUR_set(output, 0);
+        return NULL;
     }
 
     return output;
@@ -19952,7 +20179,8 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
                                      char *bitmap,
                                      SV *nonbitmap_invlist,
                                      SV *only_utf8_locale_invlist,
-                                     const regnode * const node)
+                                     const regnode * const node,
+                                     const bool force_as_is_display)
 {
     /* Appends to 'sv' a displayable version of the innards of the bracketed
      * character class defined by the other arguments:
@@ -19968,13 +20196,16 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
      *  'node' is the regex pattern node.  It is needed only when the above two
      *      parameters are not null, and is passed so that this routine can
      *      tease apart the various reasons for them.
+     *  'force_as_is_display' is TRUE if this routine should definitely NOT try
+     *      to invert things to see if that leads to a cleaner display.  If
+     *      FALSE, this routine is free to use its judgment about doing this.
      *
      * It returns TRUE if there was actually something output.  (It may be that
      * the bitmap, etc is empty.)
      *
      * When called for outputting the bitmap of a non-ANYOF node, just pass the
-     * bitmap, with the succeeding parameters set to NULL.
-     *
+     * bitmap, with the succeeding parameters set to NULL, and the final one to
+     * FALSE.
      */
 
     /* In general, it tries to display the 'cleanest' representation of the
@@ -19982,7 +20213,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
      * whether the class itself is to be inverted.  However,  there are some
      * cases where it can't try inverting, as what actually matches isn't known
      * until runtime, and hence the inversion isn't either. */
-    bool inverting_allowed = TRUE;
+    bool inverting_allowed = ! force_as_is_display;
 
     int i;
     STRLEN orig_sv_cur = SvCUR(sv);
@@ -19998,7 +20229,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
                                        is UTF-8 */
 
     SV* as_is_display;      /* The output string when we take the inputs
-                              literally */
+                               literally */
     SV* inverted_display;   /* The output string when we invert the inputs */
 
     U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
@@ -20036,14 +20267,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
             /* And this flag for matching all non-ASCII 0xFF and below */
             if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
             {
-                if (invert) {
-                    not_utf8 = _new_invlist(0);
-                }
-                else {
-                    not_utf8 = invlist_clone(PL_UpperLatin1);
-                }
-                inverting_allowed = FALSE;  /* XXX needs more work to be able
-                                               to allow this */
+                not_utf8 = invlist_clone(PL_UpperLatin1);
             }
         }
         else if (OP(node) == ANYOFL) {
@@ -20118,7 +20342,10 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
 
     /* If have to take the output as-is, just do that */
     if (! inverting_allowed) {
-        sv_catsv(sv, as_is_display);
+        if (as_is_display) {
+            sv_catsv(sv, as_is_display);
+            SvREFCNT_dec_NN(as_is_display);
+        }
     }
     else { /* But otherwise, create the output again on the inverted input, and
               use whichever version is shorter */
@@ -20145,6 +20372,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
          * conditional code points, so that when inverted, they will be gone
          * from it */
         _invlist_union(only_utf8, invlist, &invlist);
+        _invlist_union(not_utf8, invlist, &invlist);
         _invlist_union(only_utf8_locale, invlist, &invlist);
         _invlist_invert(invlist);
         _invlist_intersection(invlist, PL_InBitmap, &invlist);
@@ -20153,10 +20381,13 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
             _invlist_invert(only_utf8);
             _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
         }
+        else if (not_utf8) {
 
-        if (not_utf8) {
-            _invlist_invert(not_utf8);
-            _invlist_intersection(not_utf8, PL_UpperLatin1, &not_utf8);
+            /* If a code point matches iff the target string is not in UTF-8,
+             * then complementing the result has it not match iff not in UTF-8,
+             * which is the same thing as matching iff it is UTF-8. */
+            only_utf8 = not_utf8;
+            not_utf8 = NULL;
         }
 
         if (only_utf8_locale) {
@@ -20175,17 +20406,19 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
 
         /* Use the shortest representation, taking into account our bias
          * against showing it inverted */
-        if (SvCUR(inverted_display) + inverted_bias
-            < SvCUR(as_is_display) + as_is_bias)
+        if (   inverted_display
+            && (   ! as_is_display
+                || (  SvCUR(inverted_display) + inverted_bias
+                    < SvCUR(as_is_display)    + as_is_bias)))
         {
 	    sv_catsv(sv, inverted_display);
         }
-        else {
+        else if (as_is_display) {
 	    sv_catsv(sv, as_is_display);
         }
 
-        SvREFCNT_dec_NN(as_is_display);
-        SvREFCNT_dec_NN(inverted_display);
+        SvREFCNT_dec(as_is_display);
+        SvREFCNT_dec(inverted_display);
     }
 
     SvREFCNT_dec_NN(invlist);