X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/711b303b46e294e0fd67c7f4f1c7a525c6ca76b4..ff26755128a5a8772dc97867f605a19f44cc9abb:/regcomp.c

diff --git a/regcomp.c b/regcomp.c
index 8c73c53..bb93631 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -105,6 +105,10 @@ EXTERN_C const struct regexp_engine my_reg_engine;
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #endif
 
+#ifndef MAX
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#endif
+
 /* this is a chain of data about sub patterns we are processing that
    need to be handled separately/specially in study_chunk. Its so
    we can simulate recursion without losing state.  */
@@ -139,6 +143,8 @@ struct RExC_state_t {
     char	*start;			/* Start of input for compile */
     char	*end;			/* End of input for compile */
     char	*parse;			/* Input-scan pointer. */
+    char        *adjusted_start;        /* 'start', adjusted.  See code use */
+    STRLEN      precomp_adj;            /* an offset beyond precomp.  See code use */
     SSize_t	whilem_seen;		/* number of WHILEM in this expr */
     regnode	*emit_start;		/* Start of emitted-code area */
     regnode	*emit_bound;		/* First regnode outside of the
@@ -193,7 +199,6 @@ struct RExC_state_t {
     scan_frame *frame_head;
     scan_frame *frame_last;
     U32         frame_count;
-    U32         strict;
 #ifdef ADD_TO_REGEXEC
     char 	*starttry;		/* -Dr: where regtry was called. */
 #define RExC_starttry	(pRExC_state->starttry)
@@ -216,11 +221,14 @@ struct RExC_state_t {
 
 #endif
     bool        seen_unfolded_sharp_s;
+    bool        strict;
 };
 
 #define RExC_flags	(pRExC_state->flags)
 #define RExC_pm_flags	(pRExC_state->pm_flags)
 #define RExC_precomp	(pRExC_state->precomp)
+#define RExC_precomp_adj (pRExC_state->precomp_adj)
+#define RExC_adjusted_start  (pRExC_state->adjusted_start)
 #define RExC_precomp_end (pRExC_state->precomp_end)
 #define RExC_rx_sv	(pRExC_state->rx_sv)
 #define RExC_rx		(pRExC_state->rx)
@@ -556,19 +564,67 @@ static const scan_data_t zero_scan_data =
 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 
-#define REPORT_LOCATION_ARGS(loc)                                           \
-                UTF8fARG(UTF,                                               \
-                         ((loc) > RExC_end)                                 \
-                          ? RExC_precomp_end - RExC_precomp                 \
-                          : (loc) - RExC_precomp,                           \
-                         RExC_precomp),                                     \
-                UTF8fARG(UTF,                                               \
-                         ((loc) > RExC_end)                                 \
-                          ? 0                                               \
-                          : RExC_precomp_end - (loc),                       \
-                         ((loc) > RExC_end)                                 \
-                          ? RExC_precomp_end                                \
-                          : (loc))
+/* The code in this file in places uses one level of recursion with parsing
+ * rebased to an alternate string constructed by us in memory.  This can take
+ * the form of something that is completely different from the input, or
+ * something that uses the input as part of the alternate.  In the first case,
+ * there should be no possibility of an error, as we are in complete control of
+ * the alternate string.  But in the second case we don't control the input
+ * portion, so there may be errors in that.  Here's an example:
+ *      /[abc\x{DF}def]/ui
+ * is handled specially because \x{df} folds to a sequence of more than one
+ * character, 'ss'.  What is done is to create and parse an alternate string,
+ * which looks like this:
+ *      /(?:\x{DF}|[abc\x{DF}def])/ui
+ * where it uses the input unchanged in the middle of something it constructs,
+ * which is a branch for the DF outside the character class, and clustering
+ * parens around the whole thing. (It knows enough to skip the DF inside the
+ * class while in this substitute parse.) 'abc' and 'def' may have errors that
+ * need to be reported.  The general situation looks like this:
+ *
+ *              sI                       tI               xI       eI
+ * Input:       ----------------------------------------------------
+ * Constructed:         ---------------------------------------------------
+ *                      sC               tC               xC       eC     EC
+ *
+ * The input string sI..eI is the input pattern.  The string sC..EC is the
+ * constructed substitute parse string.  The portions sC..tC and eC..EC are
+ * constructed by us.  The portion tC..eC is an exact duplicate of the input
+ * pattern tI..eI.  In the diagram, these are vertically aligned.  Suppose that
+ * while parsing, we find an error at xC.  We want to display a message showing
+ * the real input string.  Thus we need to find the point xI in it which
+ * corresponds to xC.  xC >= tC, since the portion of the string sC..tC has
+ * been constructed by us, and so shouldn't have errors.  We get:
+ *
+ *      xI = sI + (tI - sI) + (xC - tC)
+ *
+ * and, the offset into sI is:
+ *
+ *      (xI - sI) = (tI - sI) + (xC - tC)
+ *
+ * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
+ * and we save tC as RExC_adjusted_start.
+ *
+ * During normal processing of the input pattern, everything points to that,
+ * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
+ */
+
+#define tI_sI           RExC_precomp_adj
+#define tC              RExC_adjusted_start
+#define sC              RExC_precomp
+#define xI_offset(xC)   ((IV) (tI_sI + (xC - tC)))
+#define xI(xC)          (sC + xI_offset(xC))
+#define eC              RExC_precomp_end
+
+#define REPORT_LOCATION_ARGS(xC)                                            \
+    UTF8fARG(UTF,                                                           \
+             (xI(xC) > eC) /* Don't run off end */                          \
+              ? eC - sC   /* Length before the <--HERE */                   \
+              : xI_offset(xC),                                              \
+             sC),         /* The input pattern printed up to the <--HERE */ \
+    UTF8fARG(UTF,                                                           \
+             (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */    \
+             (xI(xC) > eC) ? eC : xI(xC))     /* pattern after <--HERE */
 
 /* Used to point after bad bytes for an error message, but avoid skipping
  * past a nul byte. */
@@ -945,6 +1001,151 @@ DEBUG_OPTIMISE_MORE_r(if(data){                                      \
     PerlIO_printf(Perl_debug_log,"\n");                              \
 });
 
+/* =========================================================
+ * BEGIN edit_distance stuff.
+ *
+ * This calculates how many single character changes of any type are needed to
+ * transform a string into another one.  It is taken from version 3.1 of
+ *
+ * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
+ */
+
+/* Our unsorted dictionary linked list.   */
+/* Note we use UVs, not chars. */
+
+struct dictionary{
+  UV key;
+  UV value;
+  struct dictionary* next;
+};
+typedef struct dictionary item;
+
+
+PERL_STATIC_INLINE item*
+push(UV key,item* curr)
+{
+    item* head;
+    Newxz(head, 1, item);
+    head->key = key;
+    head->value = 0;
+    head->next = curr;
+    return head;
+}
+
+
+PERL_STATIC_INLINE item*
+find(item* head, UV key)
+{
+    item* iterator = head;
+    while (iterator){
+        if (iterator->key == key){
+            return iterator;
+        }
+        iterator = iterator->next;
+    }
+
+    return NULL;
+}
+
+PERL_STATIC_INLINE item*
+uniquePush(item* head,UV key)
+{
+    item* iterator = head;
+
+    while (iterator){
+        if (iterator->key == key) {
+            return head;
+        }
+        iterator = iterator->next;
+    }
+
+    return push(key,head);
+}
+
+PERL_STATIC_INLINE void
+dict_free(item* head)
+{
+    item* iterator = head;
+
+    while (iterator) {
+        item* temp = iterator;
+        iterator = iterator->next;
+        Safefree(temp);
+    }
+
+    head = NULL;
+}
+
+/* End of Dictionary Stuff */
+
+/* All calculations/work are done here */
+STATIC int
+S_edit_distance(const UV* src,
+                const UV* tgt,
+                const STRLEN x,             /* length of src[] */
+                const STRLEN y,             /* length of tgt[] */
+                const SSize_t maxDistance
+)
+{
+    item *head = NULL;
+    UV swapCount,swapScore,targetCharCount,i,j;
+    UV *scores;
+    UV score_ceil = x + y;
+
+    PERL_ARGS_ASSERT_EDIT_DISTANCE;
+
+    /* intialize matrix start values */
+    Newxz(scores, ( (x + 2) * (y + 2)), UV);
+    scores[0] = score_ceil;
+    scores[1 * (y + 2) + 0] = score_ceil;
+    scores[0 * (y + 2) + 1] = score_ceil;
+    scores[1 * (y + 2) + 1] = 0;
+    head = uniquePush(uniquePush(head,src[0]),tgt[0]);
+
+    /* work loops    */
+    /* i = src index */
+    /* j = tgt index */
+    for (i=1;i<=x;i++) {
+        if (i < x)
+            head = uniquePush(head,src[i]);
+        scores[(i+1) * (y + 2) + 1] = i;
+        scores[(i+1) * (y + 2) + 0] = score_ceil;
+        swapCount = 0;
+
+        for (j=1;j<=y;j++) {
+            if (i == 1) {
+                if(j < y)
+                head = uniquePush(head,tgt[j]);
+                scores[1 * (y + 2) + (j + 1)] = j;
+                scores[0 * (y + 2) + (j + 1)] = score_ceil;
+            }
+
+            targetCharCount = find(head,tgt[j-1])->value;
+            swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
+
+            if (src[i-1] != tgt[j-1]){
+                scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
+            }
+            else {
+                swapCount = j;
+                scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
+            }
+        }
+
+        find(head,src[i-1])->value = i;
+    }
+
+    {
+        IV score = scores[(x+1) * (y + 2) + (y + 1)];
+        dict_free(head);
+        Safefree(scores);
+        return (maxDistance != 0 && maxDistance < score)?(-1):score;
+    }
+}
+
+/* END of edit_distance() stuff
+ * ========================================================= */
+
 /* is c a control character for which we have a mnemonic? */
 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 
@@ -1159,7 +1360,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
      * returned list must, and will, contain every code point that is a
      * possibility. */
 
-    SV* invlist = sv_2mortal(_new_invlist(0));
+    SV* invlist = NULL;
     SV* only_utf8_locale_invlist = NULL;
     unsigned int i;
     const U32 n = ARG(node);
@@ -1181,6 +1382,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
 
             /* Here, no compile-time swash, and there are things that won't be
              * known until runtime -- we have to assume it could be anything */
+            invlist = sv_2mortal(_new_invlist(1));
             return _add_range_to_invlist(invlist, 0, UV_MAX);
         }
         else if (ary[3] && ary[3] != &PL_sv_undef) {
@@ -1191,13 +1393,17 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
         }
 
         /* Get the code points valid only under UTF-8 locales */
-        if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
+        if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
             && ary[2] && ary[2] != &PL_sv_undef)
         {
             only_utf8_locale_invlist = ary[2];
         }
     }
 
+    if (! invlist) {
+        invlist = sv_2mortal(_new_invlist(0));
+    }
+
     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
      * code points, and an inversion list for the others, but if there are code
      * points that should match only conditionally on the target string being
@@ -1223,8 +1429,10 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
     }
 
     /* If this can match all upper Latin1 code points, have to add them
-     * as well */
-    if (OP(node) == ANYOFD
+     * as well.  But don't add them if inverting, as when that gets done below,
+     * it would exclude all these characters, including the ones it shouldn't
+     * that were added just above */
+    if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
         && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
     {
         _invlist_union(invlist, PL_UpperLatin1, &invlist);
@@ -1238,7 +1446,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
         _invlist_invert(invlist);
     }
-    else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
+    else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
 
         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
          * locale.  We can skip this if there are no 0-255 at all. */
@@ -1317,6 +1525,10 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
             &( ANYOF_COMMON_FLAGS
               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
+            if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
+                anded_flags &=
+                    ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
+            }
         }
     }
 
@@ -1473,6 +1685,10 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
             |= ANYOF_FLAGS(or_with)
              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
+            if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
+                ored_flags |=
+                    ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
+            }
         }
     }
 
@@ -1617,35 +1833,28 @@ S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
      *      (unassigned, private use, surrogates, controls and formats).  This
      *      is a much large number. */
 
-    const U32 max_match = (LOC)
-                          ? 127
-                          : (! UNI_SEMANTICS)
-                            ? 63
-                            : (invlist_highest(ssc->invlist) < 256)
-                              ? 127
-                              : ((NON_OTHER_COUNT + 1) / 2) - 1;
     U32 count = 0;      /* Running total of number of code points matched by
                            'ssc' */
     UV start, end;      /* Start and end points of current range in inversion
                            list */
+    const U32 max_code_points = (LOC)
+                                ?  256
+                                : ((   ! UNI_SEMANTICS
+                                     || invlist_highest(ssc->invlist) < 256)
+                                  ? 128
+                                  : NON_OTHER_COUNT);
+    const U32 max_match = max_code_points / 2;
 
     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
 
     invlist_iterinit(ssc->invlist);
     while (invlist_iternext(ssc->invlist, &start, &end)) {
-
-        /* /u is the only thing that we expect to match above 255; so if not /u
-         * and even if there are matches above 255, ignore them.  This catches
-         * things like \d under /d which does match the digits above 255, but
-         * since the pattern is /d, it is not likely to be expecting them */
-        if (! UNI_SEMANTICS) {
-            if (start > 255) {
-                break;
-            }
-            end = MIN(end, 255);
+        if (start >= max_code_points) {
+            break;
         }
+        end = MIN(end, max_code_points - 1);
         count += end - start + 1;
-        if (count > max_match) {
+        if (count >= max_match) {
             invlist_iterfinish(ssc->invlist);
             return FALSE;
         }
@@ -6500,6 +6709,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     /* Initialize these here instead of as-needed, as is quick and avoids
      * having to test them each time otherwise */
     if (! PL_AboveLatin1) {
+#ifdef DEBUGGING
+        char * dump_len_string;
+#endif
+
 	PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
 	PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
 	PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
@@ -6513,6 +6726,14 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
 	PL_InBitmap = _new_invlist(2);
 	PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
                                                     NUM_ANYOF_CODE_POINTS - 1);
+#ifdef DEBUGGING
+        dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
+        if (   ! dump_len_string
+            || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
+        {
+            PL_dump_re_max_len = 0;
+        }
+#endif
     }
 
     pRExC_state->code_blocks = NULL;
@@ -6693,6 +6914,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     }
 
     RExC_precomp = exp;
+    RExC_precomp_adj = 0;
     RExC_flags = rx_flags;
     RExC_pm_flags = pm_flags;
 
@@ -6726,7 +6948,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
 
     /* First pass: determine size, legality. */
     RExC_parse = exp;
-    RExC_start = exp;
+    RExC_start = RExC_adjusted_start = exp;
     RExC_end = exp + plen;
     RExC_precomp_end = RExC_end;
     RExC_naughty = 0;
@@ -7958,7 +8180,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
     assert (RExC_parse <= RExC_end);
     if (RExC_parse == RExC_end) NOOP;
     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
-	 /* skip IDFIRST by using do...while */
+         /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
+          * using do...while */
 	if (UTF)
 	    do {
 		RExC_parse += UTF8SKIP(RExC_parse);
@@ -8133,6 +8356,52 @@ S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
 
 #ifndef PERL_IN_XSUB_RE
 
+STATIC void
+S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
+{
+    /* Replaces the inversion list in 'src' with the one in 'dest'.  It steals
+     * the list from 'src', so 'src' is made to have a NULL list.  This is
+     * similar to what SvSetMagicSV() would do, if it were implemented on
+     * inversion lists, though this routine avoids a copy */
+
+    const UV src_len          = _invlist_len(src);
+    const bool src_offset     = *get_invlist_offset_addr(src);
+    const STRLEN src_byte_len = SvLEN(src);
+    char * array              = SvPVX(src);
+
+    const int oldtainted = TAINT_get;
+
+    PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
+
+    assert(SvTYPE(src) == SVt_INVLIST);
+    assert(SvTYPE(dest) == SVt_INVLIST);
+    assert(! invlist_is_iterating(src));
+    assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
+
+    /* Make sure it ends in the right place with a NUL, as our inversion list
+     * manipulations aren't careful to keep this true, but sv_usepvn_flags()
+     * asserts it */
+    array[src_byte_len - 1] = '\0';
+
+    TAINT_NOT;      /* Otherwise it breaks */
+    sv_usepvn_flags(dest,
+                    (char *) array,
+                    src_byte_len - 1,
+
+                    /* This flag is documented to cause a copy to be avoided */
+                    SV_HAS_TRAILING_NUL);
+    TAINT_set(oldtainted);
+    SvPV_set(src, 0);
+    SvLEN_set(src, 0);
+    SvCUR_set(src, 0);
+
+    /* Finish up copying over the other fields in an inversion list */
+    *get_invlist_offset_addr(dest) = src_offset;
+    invlist_set_len(dest, src_len, src_offset);
+    *get_invlist_previous_index_addr(dest) = 0;
+    invlist_iterfinish(dest);
+}
+
 PERL_STATIC_INLINE IV*
 S_get_invlist_previous_index_addr(SV* invlist)
 {
@@ -8168,17 +8437,35 @@ S_invlist_set_previous_index(SV* const invlist, const IV index)
 }
 
 PERL_STATIC_INLINE void
-S_invlist_trim(SV* const invlist)
+S_invlist_trim(SV* invlist)
 {
+    /* Free the not currently-being-used space in an inversion list */
+
+    /* But don't free up the space needed for 0 UV that is always at the
+     * beginning of the list, nor the trailing NUL */
+    const UV min_size = TO_INTERNAL_SIZE(1) + 1;
+
     PERL_ARGS_ASSERT_INVLIST_TRIM;
 
     assert(SvTYPE(invlist) == SVt_INVLIST);
 
-    /* Change the length of the inversion list to how many entries it currently
-     * has */
-    SvPV_shrink_to_cur((SV *) invlist);
+    SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
+
+}
+
+PERL_STATIC_INLINE void
+S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
+{
+    PERL_ARGS_ASSERT_INVLIST_CLEAR;
+
+    assert(SvTYPE(invlist) == SVt_INVLIST);
+
+    invlist_set_len(invlist, 0, 0);
+    invlist_trim(invlist);
 }
 
+#endif /* ifndef PERL_IN_XSUB_RE */
+
 PERL_STATIC_INLINE bool
 S_invlist_is_iterating(SV* const invlist)
 {
@@ -8187,8 +8474,6 @@ S_invlist_is_iterating(SV* const invlist)
     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
 }
 
-#endif /* ifndef PERL_IN_XSUB_RE */
-
 PERL_STATIC_INLINE UV
 S_invlist_max(SV* const invlist)
 {
@@ -8401,7 +8686,9 @@ Perl__invlist_search(SV* const invlist, const UV cp)
     /* Searches the inversion list for the entry that contains the input code
      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
      * return value is the index into the list's array of the range that
-     * contains <cp> */
+     * contains <cp>, that is, 'i' such that
+     *	array[i] <= cp < array[i+1]
+     */
 
     IV low = 0;
     IV mid;
@@ -8420,7 +8707,10 @@ Perl__invlist_search(SV* const invlist, const UV cp)
     array = invlist_array(invlist);
 
     mid = invlist_previous_index(invlist);
-    assert(mid >=0 && mid <= highest_element);
+    assert(mid >=0);
+    if (mid > highest_element) {
+        mid = highest_element;
+    }
 
     /* <mid> contains the cache of the result of the previous call to this
      * function (0 the first time).  See if this call is for the same result,
@@ -8584,10 +8874,10 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
     /* Take the union of two inversion lists and point <output> to it.  *output
      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
      * the reference count to that list will be decremented if not already a
-     * temporary (mortal); otherwise *output will be made correspondingly
-     * mortal.  The first list, <a>, may be NULL, in which case a copy of the
-     * second list is returned.  If <complement_b> is TRUE, the union is taken
-     * of the complement (inversion) of <b> instead of b itself.
+     * temporary (mortal); otherwise just its contents will be modified to be
+     * the union.  The first list, <a>, may be NULL, in which case a copy of
+     * the second list is returned.  If <complement_b> is TRUE, the union is
+     * taken of the complement (inversion) of <b> instead of b itself.
      *
      * The basis for this comes from "Unicode Demystified" Chapter 13 by
      * Richard Gillam, published by Addison-Wesley, and explained at some
@@ -8815,21 +9105,30 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
 	}
     }
 
-    /*  We may be removing a reference to one of the inputs.  If so, the output
-     *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
-     *  count decremented) */
-    if (a == *output || b == *output) {
+    if (a != *output && b != *output) {
+        *output = u;
+    }
+    else {
+        /*  Here, the output is to be the same as one of the input scalars,
+         *  hence replacing it.  The simple thing to do is to free the input
+         *  scalar, making it instead be the output one.  But experience has
+         *  shown [perl #127392] that if the input is a mortal, we can get a
+         *  huge build-up of these during regex compilation before they get
+         *  freed.  So for that case, replace just the input's interior with
+         *  the output's, and then free the output */
+
         assert(! invlist_is_iterating(*output));
-        if ((SvTEMP(*output))) {
-            sv_2mortal(u);
+
+        if (! SvTEMP(*output)) {
+            SvREFCNT_dec_NN(*output);
+            *output = u;
         }
         else {
-            SvREFCNT_dec_NN(*output);
+            invlist_replace_list_destroys_src(*output, u);
+            SvREFCNT_dec_NN(u);
         }
     }
 
-    *output = u;
-
     return;
 }
 
@@ -8840,11 +9139,11 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
     /* Take the intersection of two inversion lists and point <i> to it.  *i
      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
      * the reference count to that list will be decremented if not already a
-     * temporary (mortal); otherwise *i will be made correspondingly mortal.
-     * The first list, <a>, may be NULL, in which case an empty list is
-     * returned.  If <complement_b> is TRUE, the result will be the
-     * intersection of <a> and the complement (or inversion) of <b> instead of
-     * <b> directly.
+     * temporary (mortal); otherwise just its contents will be modified to be
+     * the intersection.  The first list, <a>, may be NULL, in which case an
+     * empty list is returned.  If <complement_b> is TRUE, the result will be
+     * the intersection of <a> and the complement (or inversion) of <b> instead
+     * of <b> directly.
      *
      * The basis for this comes from "Unicode Demystified" Chapter 13 by
      * Richard Gillam, published by Addison-Wesley, and explained at some
@@ -8910,8 +9209,10 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
          * intersection must be empty */
 	if (*i == a) {
-            if (! (make_temp = cBOOL(SvTEMP(a)))) {
-                SvREFCNT_dec_NN(a);
+            if (a != NULL) {
+                if (! (make_temp = cBOOL(SvTEMP(a)))) {
+                    SvREFCNT_dec_NN(a);
+                }
             }
 	}
 	else if (*i == b) {
@@ -9054,21 +9355,36 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
 	}
     }
 
-    /*  We may be removing a reference to one of the inputs.  If so, the output
-     *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
-     *  count decremented) */
-    if (a == *i || b == *i) {
+    if (a != *i && b != *i) {
+        *i = r;
+    }
+    else {
+        /*  Here, the output is to be the same as one of the input scalars,
+         *  hence replacing it.  The simple thing to do is to free the input
+         *  scalar, making it instead be the output one.  But experience has
+         *  shown [perl #127392] that if the input is a mortal, we can get a
+         *  huge build-up of these during regex compilation before they get
+         *  freed.  So for that case, replace just the input's interior with
+         *  the output's, and then free the output.  A short-cut in this case
+         *  is if the output is empty, we can just set the input to be empty */
+
         assert(! invlist_is_iterating(*i));
-        if (SvTEMP(*i)) {
-            sv_2mortal(r);
+
+        if (! SvTEMP(*i)) {
+            SvREFCNT_dec_NN(*i);
+            *i = r;
         }
         else {
-            SvREFCNT_dec_NN(*i);
+            if (len_r) {
+                invlist_replace_list_destroys_src(*i, r);
+            }
+            else {
+                invlist_clear(*i);
+            }
+            SvREFCNT_dec_NN(r);
         }
     }
 
-    *i = r;
-
     return;
 }
 
@@ -9304,38 +9620,56 @@ S_invlist_highest(SV* const invlist)
            : array[len - 1] - 1;
 }
 
-#ifndef PERL_IN_XSUB_RE
-SV *
-Perl__invlist_contents(pTHX_ SV* const invlist)
+STATIC SV *
+S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
 {
     /* Get the contents of an inversion list into a string SV so that they can
-     * be printed out.  It uses the format traditionally done for debug tracing
-     */
+     * be printed out.  If 'traditional_style' is TRUE, it uses the format
+     * traditionally done for debug tracing; otherwise it uses a format
+     * suitable for just copying to the output, with blanks between ranges and
+     * a dash between range components */
 
     UV start, end;
-    SV* output = newSVpvs("\n");
+    SV* output;
+    const char intra_range_delimiter = (traditional_style ? '\t' : '-');
+    const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
+
+    if (traditional_style) {
+        output = newSVpvs("\n");
+    }
+    else {
+        output = newSVpvs("");
+    }
 
-    PERL_ARGS_ASSERT__INVLIST_CONTENTS;
+    PERL_ARGS_ASSERT_INVLIST_CONTENTS;
 
     assert(! invlist_is_iterating(invlist));
 
     invlist_iterinit(invlist);
     while (invlist_iternext(invlist, &start, &end)) {
 	if (end == UV_MAX) {
-	    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
+	    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%cINFINITY%c",
+                                          start, intra_range_delimiter,
+                                                 inter_range_delimiter);
 	}
 	else if (end != start) {
-	    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
-		    start,       end);
+	    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c%04"UVXf"%c",
+		                          start,
+                                                   intra_range_delimiter,
+                                                  end, inter_range_delimiter);
 	}
 	else {
-	    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
+	    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c",
+                                          start, inter_range_delimiter);
 	}
     }
 
+    if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
+        SvCUR_set(output, SvCUR(output) - 1);
+    }
+
     return output;
 }
-#endif
 
 #ifndef PERL_IN_XSUB_RE
 void
@@ -9665,7 +9999,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
         cs = REGEX_UNICODE_CHARSET;
     }
 
-    while (*RExC_parse) {
+    while (RExC_parse < RExC_end) {
         /* && strchr("iogcmsx", *RExC_parse) */
         /* (?g), (?gc) and (?o) are useless here
            and must be globally applied -- japhy */
@@ -9823,7 +10157,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
                 NOT_REACHED; /*NOTREACHED*/
         }
 
-        ++RExC_parse;
+        RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
     }
 
     vFAIL("Sequence (?... not terminated");
@@ -9845,6 +10179,54 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
 #endif
 
+PERL_STATIC_INLINE regnode *
+S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
+                             I32 *flagp,
+                             char * parse_start,
+                             char ch
+                      )
+{
+    regnode *ret;
+    char* name_start = RExC_parse;
+    U32 num = 0;
+    SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY
+                                            ? REG_RSN_RETURN_NULL
+                                            : REG_RSN_RETURN_DATA);
+    GET_RE_DEBUG_FLAGS_DECL;
+
+    PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
+
+    if (RExC_parse == name_start || *RExC_parse != ch) {
+        /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
+        vFAIL2("Sequence %.3s... not terminated",parse_start);
+    }
+
+    if (!SIZE_ONLY) {
+        num = add_data( pRExC_state, STR_WITH_LEN("S"));
+        RExC_rxi->data->data[num]=(void*)sv_dat;
+        SvREFCNT_inc_simple_void(sv_dat);
+    }
+    RExC_sawback = 1;
+    ret = reganode(pRExC_state,
+                   ((! FOLD)
+                     ? NREF
+                     : (ASCII_FOLD_RESTRICTED)
+                       ? NREFFA
+                       : (AT_LEAST_UNI_SEMANTICS)
+                         ? NREFFU
+                         : (LOC)
+                           ? NREFFL
+                           : NREFF),
+                    num);
+    *flagp |= HASWIDTH;
+
+    Set_Node_Offset(ret, parse_start+1);
+    Set_Node_Cur_Length(ret, parse_start);
+
+    nextchar(pRExC_state);
+    return ret;
+}
+
 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
    flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan
    needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be
@@ -9897,40 +10279,47 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
          * indivisible */
         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
 
+        assert(RExC_parse < RExC_end);
+
         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
-	    char *start_verb = RExC_parse;
-	    STRLEN verb_len = 0;
+	    char *start_verb = RExC_parse + 1;
+	    STRLEN verb_len;
 	    char *start_arg = NULL;
 	    unsigned char op = 0;
             int arg_required = 0;
             int internal_argval = -1; /* if >-1 we are not allowed an argument*/
 
             if (has_intervening_patws) {
-                RExC_parse++;
+                RExC_parse++;   /* past the '*' */
                 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
             }
-	    while ( *RExC_parse && *RExC_parse != ')' ) {
+	    while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
 	        if ( *RExC_parse == ':' ) {
 	            start_arg = RExC_parse + 1;
 	            break;
 	        }
-	        RExC_parse++;
+	        RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
 	    }
-	    ++start_verb;
 	    verb_len = RExC_parse - start_verb;
 	    if ( start_arg ) {
-	        RExC_parse++;
-	        while ( *RExC_parse && *RExC_parse != ')' )
-	            RExC_parse++;
-	        if ( *RExC_parse != ')' )
+                if (RExC_parse >= RExC_end) {
+                    goto unterminated_verb_pattern;
+                }
+	        RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+	        while ( RExC_parse < RExC_end && *RExC_parse != ')' )
+                    RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+	        if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
+                  unterminated_verb_pattern:
 	            vFAIL("Unterminated verb pattern argument");
 	        if ( RExC_parse == start_arg )
 	            start_arg = NULL;
 	    } else {
-	        if ( *RExC_parse != ')' )
+	        if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
 	            vFAIL("Unterminated verb pattern");
 	    }
 
+            /* Here, we know that RExC_parse < RExC_end */
+
 	    switch ( *start_verb ) {
             case 'A':  /* (*ACCEPT) */
                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
@@ -10012,56 +10401,37 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
             }
 
-	    RExC_parse++;
-	    paren = *RExC_parse++;
-	    ret = NULL;			/* For lookahead/behind. */
+	    RExC_parse++;           /* past the '?' */
+            paren = *RExC_parse;    /* might be a trailing NUL, if not
+                                       well-formed */
+            RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+            if (RExC_parse > RExC_end) {
+                paren = '\0';
+            }
+	    ret = NULL;			/* For look-ahead/behind. */
 	    switch (paren) {
 
 	    case 'P':	/* (?P...) variants for those used to PCRE/Python */
-	        paren = *RExC_parse++;
-		if ( paren == '<')         /* (?P<...>) named capture */
+	        paren = *RExC_parse;
+		if ( paren == '<') {    /* (?P<...>) named capture */
+                    RExC_parse++;
+                    if (RExC_parse >= RExC_end) {
+                        vFAIL("Sequence (?P<... not terminated");
+                    }
 		    goto named_capture;
+                }
                 else if (paren == '>') {   /* (?P>name) named recursion */
+                    RExC_parse++;
+                    if (RExC_parse >= RExC_end) {
+                        vFAIL("Sequence (?P>... not terminated");
+                    }
                     goto named_recursion;
                 }
                 else if (paren == '=') {   /* (?P=...)  named backref */
-                    /* this pretty much dupes the code for \k<NAME> in
-                     * regatom(), if you change this make sure you change that
-                     * */
-                    char* name_start = RExC_parse;
-		    U32 num = 0;
-                    SV *sv_dat = reg_scan_name(pRExC_state,
-                        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
-                    if (RExC_parse == name_start || *RExC_parse != ')')
-                        /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
-                        vFAIL2("Sequence %.3s... not terminated",parse_start);
-
-                    if (!SIZE_ONLY) {
-                        num = add_data( pRExC_state, STR_WITH_LEN("S"));
-                        RExC_rxi->data->data[num]=(void*)sv_dat;
-                        SvREFCNT_inc_simple_void(sv_dat);
-                    }
-                    RExC_sawback = 1;
-		    ret = reganode(pRExC_state,
-				   ((! FOLD)
-				     ? NREF
-				     : (ASCII_FOLD_RESTRICTED)
-				       ? NREFFA
-                                       : (AT_LEAST_UNI_SEMANTICS)
-                                         ? NREFFU
-                                         : (LOC)
-                                           ? NREFFL
-                                           : NREFF),
-				    num);
-                    *flagp |= HASWIDTH;
-
-                    Set_Node_Offset(ret, parse_start+1);
-                    Set_Node_Cur_Length(ret, parse_start);
-
-                    nextchar(pRExC_state);
-                    return ret;
+                    RExC_parse++;
+                    return handle_named_backref(pRExC_state, flagp,
+                                                parse_start, ')');
                 }
-                --RExC_parse;
                 RExC_parse += SKIP_IF_CHAR(RExC_parse);
                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
 		vFAIL3("Sequence (%.*s...) not recognized",
@@ -10076,15 +10446,20 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 		    char *name_start;
 		    SV *svname;
 		    paren= '>';
+                /* FALLTHROUGH */
             case '\'':          /* (?'...') */
-    		    name_start= RExC_parse;
-    		    svname = reg_scan_name(pRExC_state,
+                    name_start = RExC_parse;
+                    svname = reg_scan_name(pRExC_state,
                         SIZE_ONLY    /* reverse test from the others */
                         ? REG_RSN_RETURN_NAME
                         : REG_RSN_RETURN_NULL);
-		    if (RExC_parse == name_start || *RExC_parse != paren)
+		    if (   RExC_parse == name_start
+                        || RExC_parse >= RExC_end
+                        || *RExC_parse != paren)
+                    {
 		        vFAIL2("Sequence (?%c... not terminated",
 		            paren=='>' ? '<' : paren);
+                    }
 		    if (SIZE_ONLY) {
 			HE *he_str;
 			SV *sv_dat = NULL;
@@ -10152,6 +10527,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 RExC_seen |= REG_LOOKBEHIND_SEEN;
 		RExC_in_lookbehind++;
 		RExC_parse++;
+                assert(RExC_parse < RExC_end);
                 /* FALLTHROUGH */
 	    case '=':           /* (?=...) */
 		RExC_seen_zerolen++;
@@ -10199,7 +10575,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
     		        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
     		     num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
                 }
-                if (RExC_parse == RExC_end || *RExC_parse != ')')
+                if (RExC_parse >= RExC_end || *RExC_parse != ')')
                     vFAIL("Sequence (?&... not terminated");
                 goto gen_recurse_regop;
                 /* NOTREACHED */
@@ -10218,7 +10594,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 /* FALLTHROUGH */
             case '1': case '2': case '3': case '4': /* (?1) */
 	    case '5': case '6': case '7': case '8': case '9':
-	        RExC_parse--;
+	        RExC_parse = (char *) seqstart + 1;  /* Point to the digit */
               parse_recursion:
                 {
                     bool is_neg = FALSE;
@@ -10298,7 +10674,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 		    NOT_REACHED; /*NOTREACHED*/
 		}
 		*flagp |= POSTPONED;
-		paren = *RExC_parse++;
+		paren = '{';
+                RExC_parse++;
 		/* FALLTHROUGH */
 	    case '{':           /* (?{...}) */
 	    {
@@ -10365,11 +10742,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 	        int is_define= 0;
                 const int DEFINE_len = sizeof("DEFINE") - 1;
 		if (RExC_parse[0] == '?') {        /* (?(?...)) */
-                    if (
-                        RExC_parse[1] == '=' ||
-                        RExC_parse[1] == '!' ||
-                        RExC_parse[1] == '<' ||
-                        RExC_parse[1] == '{'
+                    if (   RExC_parse < RExC_end - 1
+                        && (   RExC_parse[1] == '='
+                            || RExC_parse[1] == '!'
+                            || RExC_parse[1] == '<'
+                            || RExC_parse[1] == '{')
                     ) { /* Lookahead or eval. */
 			I32 flag;
                         regnode *tail;
@@ -10397,9 +10774,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 	            U32 num = 0;
 	            SV *sv_dat=reg_scan_name(pRExC_state,
 	                SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
-	            if (RExC_parse == name_start || *RExC_parse != ch)
+	            if (   RExC_parse == name_start
+                        || RExC_parse >= RExC_end
+                        || *RExC_parse != ch)
+                    {
                         vFAIL2("Sequence (?(%c... not terminated",
                             (ch == '>' ? '<' : ch));
+                    }
                     RExC_parse++;
 	            if (!SIZE_ONLY) {
                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
@@ -10503,7 +10884,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 		    else
 			lastbr = NULL;
                     if (c != ')') {
-                        if (RExC_parse>RExC_end)
+                        if (RExC_parse >= RExC_end)
                             vFAIL("Switch (?(condition)... not terminated");
                         else
                             vFAIL("Switch (?(condition)... contains too many branches");
@@ -10527,16 +10908,16 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 	    case '[':           /* (?[ ... ]) */
                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
                                          oregcomp_parse);
-            case 0:
+            case 0: /* A NUL */
 		RExC_parse--; /* for vFAIL to print correctly */
                 vFAIL("Sequence (? incomplete");
                 break;
 	    default: /* e.g., (?i) */
-	        --RExC_parse;
+	        RExC_parse = (char *) seqstart + 1;
               parse_flags:
 		parse_lparen_question_flags(pRExC_state);
                 if (UCHARAT(RExC_parse) != ':') {
-                    if (*RExC_parse)
+                    if (RExC_parse < RExC_end)
                         nextchar(pRExC_state);
                     *flagp = TRYAGAIN;
                     return NULL;
@@ -10972,7 +11353,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
                     /* We can't back off the size because we have to reserve
                      * enough space for all the things we are about to throw
-                     * away, but we can shrink it by the ammount we are about
+                     * away, but we can shrink it by the amount we are about
                      * to re-use here */
                     RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
                 }
@@ -10983,7 +11364,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 ret = reganode(pRExC_state, OPFAIL, 0);
                 return ret;
             }
-            else if (min == max && RExC_parse < RExC_end && *RExC_parse == '?')
+            else if (min == max && *RExC_parse == '?')
             {
                 if (PASS2) {
                     ckWARN2reg(RExC_parse + 1,
@@ -11105,13 +11486,12 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 	(void)ReREFCNT_inc(RExC_rx_sv);
     }
 
-    if (RExC_parse < RExC_end && *RExC_parse == '?') {
+    if (*RExC_parse == '?') {
 	nextchar(pRExC_state);
 	reginsert(pRExC_state, MINMOD, ret, depth+1);
         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
     }
-    else
-    if (RExC_parse < RExC_end && *RExC_parse == '+') {
+    else if (*RExC_parse == '+') {
         regnode *ender;
         nextchar(pRExC_state);
         ender = reg_node(pRExC_state, SUCCEED);
@@ -11122,7 +11502,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
         REGTAIL(pRExC_state, ret, ender);
     }
 
-    if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
+    if (ISMULT2(RExC_parse)) {
 	RExC_parse++;
 	vFAIL("Nested quantifiers");
     }
@@ -11136,6 +11516,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
                 UV * code_point_p,
                 int * cp_count,
                 I32 * flagp,
+                const bool strict,
                 const U32 depth
     )
 {
@@ -11285,6 +11666,10 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
                                         semantics */
 
     if (endbrace == RExC_parse) {   /* empty: \N{} */
+        if (strict) {
+            RExC_parse++;   /* Position after the "}" */
+            vFAIL("Zero length \\N{}");
+        }
         if (cp_count) {
             *cp_count = 0;
         }
@@ -11299,6 +11684,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
 
     RExC_parse += 2;	/* Skip past the 'U+' */
 
+    /* Because toke.c has generated a special construct for us guaranteed not
+     * to have NULs, we can use a str function */
     endchar = RExC_parse + strcspn(RExC_parse, ".}");
 
     /* Code points are separated by dots.  If none, there is only one code
@@ -11358,6 +11745,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
 	SV * substitute_parse;
 	STRLEN len;
 	char *orig_end = RExC_end;
+	char *save_start = RExC_start;
         I32 flags;
 
         /* Count the code points, if desired, in the sequence */
@@ -11403,7 +11791,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
 	}
         sv_catpv(substitute_parse, ")");
 
-	RExC_parse = SvPV(substitute_parse, len);
+        RExC_parse = RExC_start = RExC_adjusted_start = SvPV(substitute_parse,
+                                                             len);
 
 	/* Don't allow empty number */
 	if (len < (STRLEN) 8) {
@@ -11433,6 +11822,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
         }
 
         /* Restore the saved values */
+	RExC_start = RExC_adjusted_start = save_start;
 	RExC_parse = endbrace;
 	RExC_end = orig_end;
 	RExC_override_recoding = 0;
@@ -11777,6 +12167,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
   tryagain:
     parse_start = RExC_parse;
+    assert(RExC_parse < RExC_end);
     switch ((U8)*RExC_parse) {
     case '^':
 	RExC_seen_zerolen++;
@@ -11816,6 +12207,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                        FALSE, /* don't silence non-portable warnings. */
                        (bool) RExC_strict,
                        TRUE, /* Allow an optimized regnode result */
+                       NULL,
                        NULL);
         if (ret == NULL) {
             if (*flagp & (RESTART_PASS1|NEED_UTF8))
@@ -11836,7 +12228,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
         ret = reg(pRExC_state, 2, &flags,depth+1);
 	if (ret == NULL) {
 		if (flags & TRYAGAIN) {
-		    if (RExC_parse == RExC_end) {
+		    if (RExC_parse >= RExC_end) {
 			 /* Make parent create an empty node if needed. */
 			*flagp |= TRYAGAIN;
 			return(NULL);
@@ -11880,7 +12272,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 	   required, as the default for this switch is to jump to the
 	   literal text handling code.
 	*/
-	switch ((U8)*++RExC_parse) {
+	RExC_parse++;
+	switch ((U8)*RExC_parse) {
 	/* Special Escapes */
 	case 'A':
 	    RExC_seen_zerolen++;
@@ -11948,7 +12341,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
 	    ret = reg_node(pRExC_state, op);
 	    *flagp |= SIMPLE;
-	    if (*(RExC_parse + 1) != '{') {
+	    if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
                 FLAGS(ret) = TRADITIONAL_BOUND;
                 if (PASS2 && op > BOUNDA) {  /* /aa is same as /a */
                     OP(ret) = BOUNDA;
@@ -11987,6 +12380,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         }
                         FLAGS(ret) = GCB_BOUND;
                         break;
+                    case 'l':
+                        if (length != 2 || *(RExC_parse + 1) != 'b') {
+                            goto bad_bound_type;
+                        }
+                        FLAGS(ret) = LB_BOUND;
+                        break;
                     case 's':
                         if (length != 2 || *(RExC_parse + 1) != 'b') {
                             goto bad_bound_type;
@@ -12114,6 +12513,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                                      non-portables */
                            (bool) RExC_strict,
                            TRUE, /* Allow an optimized regnode result */
+                           NULL,
                            NULL);
             if (*flagp & RESTART_PASS1)
                 return NULL;
@@ -12149,6 +12549,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                               NULL,     /* Don't need a count of how many code
                                            points */
                               flagp,
+                              RExC_strict,
                               depth)
             ) {
                 break;
@@ -12164,48 +12565,25 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 	case 'k':    /* Handle \k<NAME> and \k'NAME' */
       parse_named_seq:
         {
-            char ch= RExC_parse[1];
-	    if (ch != '<' && ch != '\'' && ch != '{') {
+            char ch;
+            if (   RExC_parse >= RExC_end - 1
+                || ((   ch = RExC_parse[1]) != '<'
+                                      && ch != '\''
+                                      && ch != '{'))
+            {
 	        RExC_parse++;
 		/* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
 	        vFAIL2("Sequence %.2s... not terminated",parse_start);
 	    } else {
-	        /* this pretty much dupes the code for (?P=...) in reg(), if
-                   you change this make sure you change that */
-		char* name_start = (RExC_parse += 2);
-		U32 num = 0;
-                SV *sv_dat = reg_scan_name(pRExC_state,
-                    SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
-                ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
-                if (RExC_parse == name_start || *RExC_parse != ch)
-                    /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
-                    vFAIL2("Sequence %.3s... not terminated",parse_start);
-
-                if (!SIZE_ONLY) {
-                    num = add_data( pRExC_state, STR_WITH_LEN("S"));
-                    RExC_rxi->data->data[num]=(void*)sv_dat;
-                    SvREFCNT_inc_simple_void(sv_dat);
-                }
-
-                RExC_sawback = 1;
-                ret = reganode(pRExC_state,
-                               ((! FOLD)
-                                 ? NREF
-				 : (ASCII_FOLD_RESTRICTED)
-				   ? NREFFA
-                                   : (AT_LEAST_UNI_SEMANTICS)
-                                     ? NREFFU
-                                     : (LOC)
-                                       ? NREFFL
-                                       : NREFF),
-                                num);
-                *flagp |= HASWIDTH;
-
-                /* override incorrect value set in reganode MJD */
-                Set_Node_Offset(ret, parse_start+1);
-                Set_Node_Cur_Length(ret, parse_start);
-                nextchar(pRExC_state);
-
+		RExC_parse += 2;
+                ret = handle_named_backref(pRExC_state,
+                                           flagp,
+                                           parse_start,
+                                           (ch == '<')
+                                           ? '>'
+                                           : (ch == '{')
+                                             ? '}'
+                                             : '\'');
             }
             break;
 	}
@@ -12234,6 +12612,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 		        goto parse_named_seq;
                     }
 
+                    if (RExC_parse >= RExC_end) {
+                        goto unterminated_g;
+                    }
                     num = S_backref_value(RExC_parse);
                     if (num == 0)
                         vFAIL("Reference to invalid group 0");
@@ -12241,6 +12622,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          if (isDIGIT(*RExC_parse))
 			    vFAIL("Reference to nonexistent group");
                         else
+                          unterminated_g:
                             vFAIL("Unterminated \\g... pattern");
                     }
 
@@ -12494,6 +12876,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                                             NULL,   /* Don't need a count of
                                                        how many code points */
                                             flagp,
+                                            RExC_strict,
                                             depth)
                         ) {
                             if (*flagp & NEED_UTF8)
@@ -12644,7 +13027,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 			    p += numlen;
                             if (PASS2   /* like \08, \178 */
                                 && numlen < 3
-                                && p < RExC_end
                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
                             {
 				reg_warn_non_literal_string(
@@ -13183,202 +13565,835 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
 #define POSIXCC_DONE(c)   ((c) == ':')
 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
+#define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
+
+#define WARNING_PREFIX              "Assuming NOT a POSIX class since "
+#define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
+#define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
+
+#define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
+
+/* 'posix_warnings' and 'warn_text' are names of variables in the following
+ * routine. q.v. */
+#define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
+        if (posix_warnings && (   posix_warnings != (AV **) -1              \
+                               || (PASS2 && ckWARN(WARN_REGEXP))))          \
+        {                                                                   \
+            if (! warn_text) warn_text = newAV();                           \
+            av_push(warn_text, Perl_newSVpvf(aTHX_                          \
+                                             WARNING_PREFIX                 \
+                                             text                           \
+                                             REPORT_LOCATION,               \
+                                             REPORT_LOCATION_ARGS(p)));     \
+        }                                                                   \
+    } STMT_END
 
-PERL_STATIC_INLINE I32
-S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
+STATIC int
+S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
+
+    const char * const s,      /* Where the putative posix class begins.
+                                  Normally, this is one past the '['.  This
+                                  parameter exists so it can be somewhere
+                                  besides RExC_parse. */
+    char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
+                                  NULL */
+    AV ** posix_warnings       /* Where to place any generated warnings, or -1
+                                  if to output them, or NULL */
+)
 {
-    I32 namedclass = OOB_NAMEDCLASS;
+    /* This parses what the caller thinks may be one of the three POSIX
+     * constructs:
+     *  1) a character class, like [:blank:]
+     *  2) a collating symbol, like [. .]
+     *  3) an equivalence class, like [= =]
+     * In the latter two cases, it croaks if it finds a syntactically legal
+     * one, as these are not handled by Perl.
+     *
+     * The main purpose is to look for a POSIX character class.  It returns:
+     *  a) the class number
+     *      if it is a completely syntactically and semantically legal class.
+     *      'updated_parse_ptr', if not NULL, is set to point to just after the
+     *      closing ']' of the class
+     *  b) OOB_NAMEDCLASS
+     *      if it appears that one of the three POSIX constructs was meant, but
+     *      its specification was somehow defective.  'updated_parse_ptr', if
+     *      not NULL, is set to point to the character just after the end
+     *      character of the class.  See below for handling of warnings.
+     *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
+     *      if it  doesn't appear that a POSIX construct was intended.
+     *      'updated_parse_ptr' is not changed.  No warnings nor errors are
+     *      raised.
+     *
+     * In b) there may be warnings and even errors generated.  What to do about
+     * these is determined by the 'posix_warnings' parameter.  If it is NULL,
+     * this call is treated as a check-only, scouting-out-the-territory call,
+     * and no warnings nor errors are generated at all.  Otherwise, any errors
+     * are raised if found.  If 'posix_warnings' is -1 (appropriately cast),
+     * warnings are generated and displayed (in pass 2), just as they would be
+     * for any other message of the same type from this file.  If it isn't NULL
+     * and not -1, warnings aren't displayed, but instead an AV is generated
+     * with all the warning messages (that aren't to be ignored) stored into
+     * it, so that the caller can output them if it wants.  This is done in all
+     * passes.  The reason for this is that the rest of the parsing is heavily
+     * dependent on whether this routine found a valid posix class or not.  If
+     * it did, the closing ']' is absorbed as part of the class.  If no class
+     * or an invalid one is found, any ']' will be considered the terminator of
+     * the outer bracketed character class, leading to very different results.
+     * In particular, a '(?[ ])' construct will likely have a syntax error if
+     * the class is parsed other than intended, and this will happen in pass1,
+     * before the warnings would normally be output.  This mechanism allows the
+     * caller to output those warnings in pass1 just before dieing, giving a
+     * much better clue as to what is wrong.
+     *
+     * The reason for this function, and its complexity is that a bracketed
+     * character class can contain just about anything.  But it's easy to
+     * mistype the very specific posix class syntax but yielding a valid
+     * regular bracketed class, so it silently gets compiled into something
+     * quite unintended.
+     *
+     * The solution adopted here maintains backward compatibility except that
+     * it adds a warning if it looks like a posix class was intended but
+     * improperly specified.  The warning is not raised unless what is input
+     * very closely resembles one of the 14 legal posix classes.  To do this,
+     * it uses fuzzy parsing.  It calculates how many single-character edits it
+     * would take to transform what was input into a legal posix class.  Only
+     * if that number is quite small does it think that the intention was a
+     * posix class.  Obviously these are heuristics, and there will be cases
+     * where it errs on one side or another, and they can be tweaked as
+     * experience informs.
+     *
+     * The syntax for a legal posix class is:
+     *
+     * qr/(?xa: \[ : \^? [:lower:]{4,6} : \] )/
+     *
+     * What this routine considers syntactically to be an intended posix class
+     * is this (the comments indicate some restrictions that the pattern
+     * doesn't show):
+     *
+     *  qr/(?x: \[?                         # The left bracket, possibly
+     *                                      # omitted
+     *          \h*                         # possibly followed by blanks
+     *          (?: \^ \h* )?               # possibly a misplaced caret
+     *          [:;]?                       # The opening class character,
+     *                                      # possibly omitted.  A typo
+     *                                      # semi-colon can also be used.
+     *          \h*
+     *          \^?                         # possibly a correctly placed
+     *                                      # caret, but not if there was also
+     *                                      # a misplaced one
+     *          \h*
+     *          .{3,15}                     # The class name.  If there are
+     *                                      # deviations from the legal syntax,
+     *                                      # its edit distance must be close
+     *                                      # to a real class name in order
+     *                                      # for it to be considered to be
+     *                                      # an intended posix class.
+     *          \h*
+     *          [:punct:]?                  # The closing class character,
+     *                                      # possibly omitted.  If not a colon
+     *                                      # nor semi colon, the class name
+     *                                      # must be even closer to a valid
+     *                                      # one
+     *          \h*
+     *          \]?                         # The right bracket, possibly
+     *                                      # omitted.
+     *     )/
+     *
+     * In the above, \h must be ASCII-only.
+     *
+     * These are heuristics, and can be tweaked as field experience dictates.
+     * There will be cases when someone didn't intend to specify a posix class
+     * that this warns as being so.  The goal is to minimize these, while
+     * maximizing the catching of things intended to be a posix class that
+     * aren't parsed as such.
+     */
 
-    PERL_ARGS_ASSERT_REGPPOSIXCC;
+    const char* p             = s;
+    const char * const e      = RExC_end;
+    unsigned complement       = 0;      /* If to complement the class */
+    bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
+    bool has_opening_bracket  = FALSE;
+    bool has_opening_colon    = FALSE;
+    int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
+                                                   valid class */
+    AV* warn_text             = NULL;   /* any warning messages */
+    const char * possible_end = NULL;   /* used for a 2nd parse pass */
+    const char* name_start;             /* ptr to class name first char */
 
-    if (value == '[' && RExC_parse + 1 < RExC_end &&
-	/* I smell either [: or [= or [. -- POSIX has been here, right? */
-	POSIXCC(UCHARAT(RExC_parse)))
-    {
-	const char c = UCHARAT(RExC_parse);
-	char* const s = RExC_parse++;
+    /* If the number of single-character typos the input name is away from a
+     * legal name is no more than this number, it is considered to have meant
+     * the legal name */
+    int max_distance          = 2;
 
-	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
-	    RExC_parse++;
-	if (RExC_parse == RExC_end) {
-            if (strict) {
+    /* to store the name.  The size determines the maximum length before we
+     * decide that no posix class was intended.  Should be at least
+     * sizeof("alphanumeric") */
+    UV input_text[15];
 
-                /* Try to give a better location for the error (than the end of
-                 * the string) by looking for the matching ']' */
-                RExC_parse = s;
-                while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
-                    RExC_parse++;
-                }
-                vFAIL2("Unmatched '%c' in POSIX class", c);
+    PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
+
+    if (p >= e) {
+        return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+    }
+
+    if (*(p - 1) != '[') {
+        ADD_POSIX_WARNING(p, "it doesn't start with a '['");
+        found_problem = TRUE;
+    }
+    else {
+        has_opening_bracket = TRUE;
+    }
+
+    /* They could be confused and think you can put spaces between the
+     * components */
+    if (isBLANK(*p)) {
+        found_problem = TRUE;
+
+        do {
+            p++;
+        } while (p < e && isBLANK(*p));
+
+        ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
+    }
+
+    /* For [. .] and [= =].  These are quite different internally from [: :],
+     * so they are handled separately.  */
+    if (POSIXCC_NOTYET(*p)) {
+        const char open_char  = *p;
+        const char * temp_ptr = p + 1;
+        unsigned int len      = 0;
+
+        /* These two constructs are not handled by perl, and if we find a
+         * syntactically valid one, we croak.  It looks like just about any
+         * byte can be in them, but they are likely very short, like [.ch.] to
+         * denote a ligature 'ch' single character.  If we find something that
+         * started out to look like one of these constructs, but isn't, we
+         * break so that it can be checked for being a class name with a typo
+         * of '.' or '=' instead of a colon */
+        while (temp_ptr < e) {
+            len++;
+
+            /* qr/[[.].]]/, for example, is valid.  But otherwise we quit on an
+             * unexpected ']'.  It is possible, it appears, for such a ']' to
+             * be not in the final position, but that's so unlikely that that
+             * case is not handled. */
+            if (*temp_ptr == ']' && temp_ptr[1] != open_char) {
+                break;
             }
-	    /* Grandfather lone [:, [=, [. */
-	    RExC_parse = s;
-        }
-	else {
-	    const char* const t = RExC_parse++; /* skip over the c */
-	    assert(*t == c);
-
-  	    if (UCHARAT(RExC_parse) == ']') {
-		const char *posixcc = s + 1;
-  		RExC_parse++; /* skip over the ending ] */
-
-		if (*s == ':') {
-		    const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
-		    const I32 skip = t - posixcc;
-
-		    /* Initially switch on the length of the name.  */
-		    switch (skip) {
-		    case 4:
-                        if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
-                                                          this is the Perl \w
-                                                        */
-			    namedclass = ANYOF_WORDCHAR;
-			break;
-		    case 5:
-			/* Names all of length 5.  */
-			/* alnum alpha ascii blank cntrl digit graph lower
-			   print punct space upper  */
-			/* Offset 4 gives the best switch position.  */
-			switch (posixcc[4]) {
-			case 'a':
-			    if (memEQ(posixcc, "alph", 4)) /* alpha */
-				namedclass = ANYOF_ALPHA;
-			    break;
-			case 'e':
-			    if (memEQ(posixcc, "spac", 4)) /* space */
-				namedclass = ANYOF_SPACE;
-			    break;
-			case 'h':
-			    if (memEQ(posixcc, "grap", 4)) /* graph */
-				namedclass = ANYOF_GRAPH;
-			    break;
-			case 'i':
-			    if (memEQ(posixcc, "asci", 4)) /* ascii */
-				namedclass = ANYOF_ASCII;
-			    break;
-			case 'k':
-			    if (memEQ(posixcc, "blan", 4)) /* blank */
-				namedclass = ANYOF_BLANK;
-			    break;
-			case 'l':
-			    if (memEQ(posixcc, "cntr", 4)) /* cntrl */
-				namedclass = ANYOF_CNTRL;
-			    break;
-			case 'm':
-			    if (memEQ(posixcc, "alnu", 4)) /* alnum */
-				namedclass = ANYOF_ALPHANUMERIC;
-			    break;
-			case 'r':
-			    if (memEQ(posixcc, "lowe", 4)) /* lower */
-				namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
-			    else if (memEQ(posixcc, "uppe", 4)) /* upper */
-				namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
-			    break;
-			case 't':
-			    if (memEQ(posixcc, "digi", 4)) /* digit */
-				namedclass = ANYOF_DIGIT;
-			    else if (memEQ(posixcc, "prin", 4)) /* print */
-				namedclass = ANYOF_PRINT;
-			    else if (memEQ(posixcc, "punc", 4)) /* punct */
-				namedclass = ANYOF_PUNCT;
-			    break;
-			}
-			break;
-		    case 6:
-			if (memEQ(posixcc, "xdigit", 6))
-			    namedclass = ANYOF_XDIGIT;
-			break;
-		    }
 
-		    if (namedclass == OOB_NAMEDCLASS)
-			vFAIL2utf8f(
-                            "POSIX class [:%"UTF8f":] unknown",
-			    UTF8fARG(UTF, t - s - 1, s + 1));
+            /* XXX this could be cut down, but this value is certainly large
+             * enough */
+            if (len > 10) {
+                break;
+            }
 
-                    /* The #defines are structured so each complement is +1 to
-                     * the normal one */
-                    if (complement) {
-                        namedclass++;
+            if (*temp_ptr == open_char) {
+                temp_ptr++;
+                if (*temp_ptr == ']') {
+                    temp_ptr++;
+                    if (! found_problem && posix_warnings) {
+                        RExC_parse = (char *) temp_ptr;
+                        vFAIL3("POSIX syntax [%c %c] is reserved for future "
+                               "extensions", open_char, open_char);
                     }
-		    assert (posixcc[skip] == ':');
-		    assert (posixcc[skip+1] == ']');
-		} else if (!SIZE_ONLY) {
-		    /* [[=foo=]] and [[.foo.]] are still future. */
-
-		    /* adjust RExC_parse so the warning shows after
-		       the class closes */
-		    while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
-			RExC_parse++;
-		    vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
-		}
-	    } else {
-		/* Maternal grandfather:
-		 * "[:" ending in ":" but not in ":]" */
-                if (strict) {
-                    vFAIL("Unmatched '[' in POSIX class");
-                }
 
-                /* Grandfather lone [:, [=, [. */
-		RExC_parse = s;
-	    }
-	}
-    }
+                    /* Here, the syntax wasn't completely valid, or else the
+                     * call is to check-only */
+                    if (updated_parse_ptr) {
+                        *updated_parse_ptr = (char *) temp_ptr;
+                    }
 
-    return namedclass;
-}
+                    return OOB_NAMEDCLASS;
+                }
+            }
+            else if (*temp_ptr == '\\') {
+
+                /* A backslash is treate as like any other character, unless it
+                 * precedes a comment starter.  XXX multiple backslashes in a
+                 * row are not handled specially here, nor would they ever
+                 * likely to be handled specially in one of these constructs */
+                if (temp_ptr[1] == '#' && (RExC_flags & RXf_PMf_EXTENDED)) {
+                    temp_ptr++;
+                }
+                temp_ptr++;
+            }
+            else if (*temp_ptr == '#' && (RExC_flags & RXf_PMf_EXTENDED)) {
+                break;  /* Under no circumstances can we look at the interior
+                           of a comment */
+            }
+            else if (*temp_ptr == '\n') {   /* And we don't allow newlines
+                                               either as it's extremely
+                                               unlikely that one could be in an
+                                               intended class */
+                break;
+            }
+            else if (UTF && ! UTF8_IS_INVARIANT(*temp_ptr)) {
+                /* XXX Since perl will never handle multi-byte locales, except
+                 * for UTF-8, we could break if we found a byte above latin1,
+                 * but perhaps the person intended to use one. */
+                temp_ptr += UTF8SKIP(temp_ptr);
+            }
+            else {
+                temp_ptr++;
+            }
+        }
+    }
 
-STATIC bool
-S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
-{
-    /* This applies some heuristics at the current parse position (which should
-     * be at a '[') to see if what follows might be intended to be a [:posix:]
-     * class.  It returns true if it really is a posix class, of course, but it
-     * also can return true if it thinks that what was intended was a posix
-     * class that didn't quite make it.
-     *
-     * It will return true for
-     *      [:alphanumerics:
-     *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
-     *                         ')' indicating the end of the (?[
-     *      [:any garbage including %^&$ punctuation:]
-     *
-     * This is designed to be called only from S_handle_regex_sets; it could be
-     * easily adapted to be called from the spot at the beginning of regclass()
-     * that checks to see in a normal bracketed class if the surrounding []
-     * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
-     * change long-standing behavior, so I (khw) didn't do that */
-    char* p = RExC_parse + 1;
-    char first_char = *p;
+    /* Here, we think there is a possibility that a [: :] class was meant, and
+     * we have the first real character.  It could be they think the '^' comes
+     * first */
+    if (*p == '^') {
+        found_problem = TRUE;
+        ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
+        complement = 1;
+        p++;
 
-    PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
+        if (isBLANK(*p)) {
+            found_problem = TRUE;
 
-    assert(*(p - 1) == '[');
+            do {
+                p++;
+            } while (p < e && isBLANK(*p));
 
-    if (! POSIXCC(first_char)) {
-        return FALSE;
+            ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
+        }
     }
 
-    p++;
-    while (p < RExC_end && isWORDCHAR(*p)) p++;
+    /* But the first character should be a colon, which they could have easily
+     * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
+     * distinguish from a colon, so treat that as a colon).  */
+    if (*p == ':') {
+        p++;
+        has_opening_colon = TRUE;
+    }
+    else if (*p == ';') {
+        found_problem = TRUE;
+        p++;
+        ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
+        has_opening_colon = TRUE;
+    }
+    else {
+        found_problem = TRUE;
+        ADD_POSIX_WARNING(p, "there must be a starting ':'");
 
-    if (p >= RExC_end) {
-        return FALSE;
+        /* Consider an initial punctuation (not one of the recognized ones) to
+         * be a left terminator */
+        if (*p != '^' && *p != ']' && isPUNCT(*p)) {
+            p++;
+        }
     }
 
-    if (p - RExC_parse > 2    /* Got at least 1 word character */
-        && (*p == first_char
-            || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
-    {
-        return TRUE;
+    /* They may think that you can put spaces between the components */
+    if (isBLANK(*p)) {
+        found_problem = TRUE;
+
+        do {
+            p++;
+        } while (p < e && isBLANK(*p));
+
+        ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
     }
 
-    p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
+    if (*p == '^') {
+
+        /* We consider something like [^:^alnum:]] to not have been intended to
+         * be a posix class, but XXX maybe we should */
+        if (complement) {
+            return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+        }
+
+        complement = 1;
+        p++;
+    }
+
+    /* Again, they may think that you can put spaces between the components */
+    if (isBLANK(*p)) {
+        found_problem = TRUE;
+
+        do {
+            p++;
+        } while (p < e && isBLANK(*p));
+
+        ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
+    }
+
+    if (*p == ']') {
+
+        /* XXX This ']' may be a typo, and something else was meant.  But
+         * treating it as such creates enough complications, that that
+         * possibility isn't currently considered here.  So we assume that the
+         * ']' is what is intended, and if we've already found an initial '[',
+         * this leaves this construct looking like [:] or [:^], which almost
+         * certainly weren't intended to be posix classes */
+        if (has_opening_bracket) {
+            return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+        }
+
+        /* But this function can be called when we parse the colon for
+         * something like qr/[alpha:]]/, so we back up to look for the
+         * beginning */
+        p--;
+
+        if (*p == ';') {
+            found_problem = TRUE;
+            ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
+        }
+        else if (*p != ':') {
+
+            /* XXX We are currently very restrictive here, so this code doesn't
+             * consider the possibility that, say, /[alpha.]]/ was intended to
+             * be a posix class. */
+            return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+        }
+
+        /* Here we have something like 'foo:]'.  There was no initial colon,
+         * and we back up over 'foo.  XXX Unlike the going forward case, we
+         * don't handle typos of non-word chars in the middle */
+        has_opening_colon = FALSE;
+        p--;
+
+        while (p > RExC_start && isWORDCHAR(*p)) {
+            p--;
+        }
+        p++;
+
+        /* Here, we have positioned ourselves to where we think the first
+         * character in the potential class is */
+    }
+
+    /* Now the interior really starts.  There are certain key characters that
+     * can end the interior, or these could just be typos.  To catch both
+     * cases, we may have to do two passes.  In the first pass, we keep on
+     * going unless we come to a sequence that matches
+     *      qr/ [[:punct:]] [[:blank:]]* \] /xa
+     * This means it takes a sequence to end the pass, so two typos in a row if
+     * that wasn't what was intended.  If the class is perfectly formed, just
+     * this one pass is needed.  We also stop if there are too many characters
+     * being accumulated, but this number is deliberately set higher than any
+     * real class.  It is set high enough so that someone who thinks that
+     * 'alphanumeric' is a correct name would get warned that it wasn't.
+     * While doing the pass, we keep track of where the key characters were in
+     * it.  If we don't find an end to the class, and one of the key characters
+     * was found, we redo the pass, but stop when we get to that character.
+     * Thus the key character was considered a typo in the first pass, but a
+     * terminator in the second.  If two key characters are found, we stop at
+     * the second one in the first pass.  Again this can miss two typos, but
+     * catches a single one
+     *
+     * In the first pass, 'possible_end' starts as NULL, and then gets set to
+     * point to the first key character.  For the second pass, it starts as -1.
+     * */
+
+    name_start = p;
+  parse_name:
+    {
+        bool has_blank               = FALSE;
+        bool has_upper               = FALSE;
+        bool has_terminating_colon   = FALSE;
+        bool has_terminating_bracket = FALSE;
+        bool has_semi_colon          = FALSE;
+        unsigned int name_len        = 0;
+        int punct_count              = 0;
+
+        while (p < e) {
+
+            /* Squeeze out blanks when looking up the class name below */
+            if (isBLANK(*p) ) {
+                has_blank = TRUE;
+                found_problem = TRUE;
+                p++;
+                continue;
+            }
 
-    return (p
-            && p - RExC_parse > 2 /* [:] evaluates to colon;
-                                      [::] is a bad posix class. */
-            && first_char == *(p - 1));
+            /* The name will end with a punctuation */
+            if (isPUNCT(*p)) {
+                const char * peek = p + 1;
+
+                /* Treat any non-']' punctuation followed by a ']' (possibly
+                 * with intervening blanks) as trying to terminate the class.
+                 * ']]' is very likely to mean a class was intended (but
+                 * missing the colon), but the warning message that gets
+                 * generated shows the error position better if we exit the
+                 * loop at the bottom (eventually), so skip it here. */
+                if (*p != ']') {
+                    if (peek < e && isBLANK(*peek)) {
+                        has_blank = TRUE;
+                        found_problem = TRUE;
+                        do {
+                            peek++;
+                        } while (peek < e && isBLANK(*peek));
+                    }
+
+                    if (peek < e && *peek == ']') {
+                        has_terminating_bracket = TRUE;
+                        if (*p == ':') {
+                            has_terminating_colon = TRUE;
+                        }
+                        else if (*p == ';') {
+                            has_semi_colon = TRUE;
+                            has_terminating_colon = TRUE;
+                        }
+                        else {
+                            found_problem = TRUE;
+                        }
+                        p = peek + 1;
+                        goto try_posix;
+                    }
+                }
+
+                /* Here we have punctuation we thought didn't end the class.
+                 * Keep track of the position of the key characters that are
+                 * more likely to have been class-enders */
+                if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
+
+                    /* Allow just one such possible class-ender not actually
+                     * ending the class. */
+                    if (possible_end) {
+                        break;
+                    }
+                    possible_end = p;
+                }
+
+                /* If we have too many punctuation characters, no use in
+                 * keeping going */
+                if (++punct_count > max_distance) {
+                    break;
+                }
+
+                /* Treat the punctuation as a typo. */
+                input_text[name_len++] = *p;
+                p++;
+            }
+            else if (isUPPER(*p)) { /* Use lowercase for lookup */
+                input_text[name_len++] = toLOWER(*p);
+                has_upper = TRUE;
+                found_problem = TRUE;
+                p++;
+            } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
+                input_text[name_len++] = *p;
+                p++;
+            }
+            else {
+                input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
+                p+= UTF8SKIP(p);
+            }
+
+            /* The declaration of 'input_text' is how long we allow a potential
+             * class name to be, before saying they didn't mean a class name at
+             * all */
+            if (name_len >= C_ARRAY_LENGTH(input_text)) {
+                break;
+            }
+        }
+
+        /* We get to here when the possible class name hasn't been properly
+         * terminated before:
+         *   1) we ran off the end of the pattern; or
+         *   2) found two characters, each of which might have been intended to
+         *      be the name's terminator
+         *   3) found so many punctuation characters in the purported name,
+         *      that the edit distance to a valid one is exceeded
+         *   4) we decided it was more characters than anyone could have
+         *      intended to be one. */
+
+        found_problem = TRUE;
+
+        /* In the final two cases, we know that looking up what we've
+         * accumulated won't lead to a match, even a fuzzy one. */
+        if (   name_len >= C_ARRAY_LENGTH(input_text)
+            || punct_count > max_distance)
+        {
+            /* If there was an intermediate key character that could have been
+             * an intended end, redo the parse, but stop there */
+            if (possible_end && possible_end != (char *) -1) {
+                possible_end = (char *) -1; /* Special signal value to say
+                                               we've done a first pass */
+                p = name_start;
+                goto parse_name;
+            }
+
+            /* Otherwise, it can't have meant to have been a class */
+            return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+        }
+
+        /* If we ran off the end, and the final character was a punctuation
+         * one, back up one, to look at that final one just below.  Later, we
+         * will restore the parse pointer if appropriate */
+        if (name_len && p == e && isPUNCT(*(p-1))) {
+            p--;
+            name_len--;
+        }
+
+        if (p < e && isPUNCT(*p)) {
+            if (*p == ']') {
+                has_terminating_bracket = TRUE;
+
+                /* If this is a 2nd ']', and the first one is just below this
+                 * one, consider that to be the real terminator.  This gives a
+                 * uniform and better positioning for the warning message  */
+                if (   possible_end
+                    && possible_end != (char *) -1
+                    && *possible_end == ']'
+                    && name_len && input_text[name_len - 1] == ']')
+                {
+                    name_len--;
+                    p = possible_end;
+
+                    /* And this is actually equivalent to having done the 2nd
+                     * pass now, so set it to not try again */
+                    possible_end = (char *) -1;
+                }
+            }
+            else {
+                if (*p == ':') {
+                    has_terminating_colon = TRUE;
+                }
+                else if (*p == ';') {
+                    has_semi_colon = TRUE;
+                    has_terminating_colon = TRUE;
+                }
+                p++;
+            }
+        }
+
+    try_posix:
+
+        /* Here, we have a class name to look up.  We can short circuit the
+         * stuff below for short names that can't possibly be meant to be a
+         * class name.  (We can do this on the first pass, as any second pass
+         * will yield an even shorter name) */
+        if (name_len < 3) {
+            return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+        }
+
+        /* Find which class it is.  Initially switch on the length of the name.
+         * */
+        switch (name_len) {
+            case 4:
+                if (memEQ(name_start, "word", 4)) {
+                    /* this is not POSIX, this is the Perl \w */
+                    class_number = ANYOF_WORDCHAR;
+                }
+                break;
+            case 5:
+                /* Names all of length 5: alnum alpha ascii blank cntrl digit
+                 *                        graph lower print punct space upper
+                 * Offset 4 gives the best switch position.  */
+                switch (name_start[4]) {
+                    case 'a':
+                        if (memEQ(name_start, "alph", 4)) /* alpha */
+                            class_number = ANYOF_ALPHA;
+                        break;
+                    case 'e':
+                        if (memEQ(name_start, "spac", 4)) /* space */
+                            class_number = ANYOF_SPACE;
+                        break;
+                    case 'h':
+                        if (memEQ(name_start, "grap", 4)) /* graph */
+                            class_number = ANYOF_GRAPH;
+                        break;
+                    case 'i':
+                        if (memEQ(name_start, "asci", 4)) /* ascii */
+                            class_number = ANYOF_ASCII;
+                        break;
+                    case 'k':
+                        if (memEQ(name_start, "blan", 4)) /* blank */
+                            class_number = ANYOF_BLANK;
+                        break;
+                    case 'l':
+                        if (memEQ(name_start, "cntr", 4)) /* cntrl */
+                            class_number = ANYOF_CNTRL;
+                        break;
+                    case 'm':
+                        if (memEQ(name_start, "alnu", 4)) /* alnum */
+                            class_number = ANYOF_ALPHANUMERIC;
+                        break;
+                    case 'r':
+                        if (memEQ(name_start, "lowe", 4)) /* lower */
+                            class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
+                        else if (memEQ(name_start, "uppe", 4)) /* upper */
+                            class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
+                        break;
+                    case 't':
+                        if (memEQ(name_start, "digi", 4)) /* digit */
+                            class_number = ANYOF_DIGIT;
+                        else if (memEQ(name_start, "prin", 4)) /* print */
+                            class_number = ANYOF_PRINT;
+                        else if (memEQ(name_start, "punc", 4)) /* punct */
+                            class_number = ANYOF_PUNCT;
+                        break;
+                }
+                break;
+            case 6:
+                if (memEQ(name_start, "xdigit", 6))
+                    class_number = ANYOF_XDIGIT;
+                break;
+        }
+
+        /* If the name exactly matches a posix class name the class number will
+         * here be set to it, and the input almost certainly was meant to be a
+         * posix class, so we can skip further checking.  If instead the syntax
+         * is exactly correct, but the name isn't one of the legal ones, we
+         * will return that as an error below.  But if neither of these apply,
+         * it could be that no posix class was intended at all, or that one
+         * was, but there was a typo.  We tease these apart by doing fuzzy
+         * matching on the name */
+        if (class_number == OOB_NAMEDCLASS && found_problem) {
+            const UV posix_names[][6] = {
+                                                { 'a', 'l', 'n', 'u', 'm' },
+                                                { 'a', 'l', 'p', 'h', 'a' },
+                                                { 'a', 's', 'c', 'i', 'i' },
+                                                { 'b', 'l', 'a', 'n', 'k' },
+                                                { 'c', 'n', 't', 'r', 'l' },
+                                                { 'd', 'i', 'g', 'i', 't' },
+                                                { 'g', 'r', 'a', 'p', 'h' },
+                                                { 'l', 'o', 'w', 'e', 'r' },
+                                                { 'p', 'r', 'i', 'n', 't' },
+                                                { 'p', 'u', 'n', 'c', 't' },
+                                                { 's', 'p', 'a', 'c', 'e' },
+                                                { 'u', 'p', 'p', 'e', 'r' },
+                                                { 'w', 'o', 'r', 'd' },
+                                                { 'x', 'd', 'i', 'g', 'i', 't' }
+                                            };
+            /* The names of the above all have added NULs to make them the same
+             * size, so we need to also have the real lengths */
+            const UV posix_name_lengths[] = {
+                                                sizeof("alnum") - 1,
+                                                sizeof("alpha") - 1,
+                                                sizeof("ascii") - 1,
+                                                sizeof("blank") - 1,
+                                                sizeof("cntrl") - 1,
+                                                sizeof("digit") - 1,
+                                                sizeof("graph") - 1,
+                                                sizeof("lower") - 1,
+                                                sizeof("print") - 1,
+                                                sizeof("punct") - 1,
+                                                sizeof("space") - 1,
+                                                sizeof("upper") - 1,
+                                                sizeof("word")  - 1,
+                                                sizeof("xdigit")- 1
+                                            };
+            unsigned int i;
+            int temp_max = max_distance;    /* Use a temporary, so if we
+                                               reparse, we haven't changed the
+                                               outer one */
+
+            /* Use a smaller max edit distance if we are missing one of the
+             * delimiters */
+            if (   has_opening_bracket + has_opening_colon < 2
+                || has_terminating_bracket + has_terminating_colon < 2)
+            {
+                temp_max--;
+            }
+
+            /* See if the input name is close to a legal one */
+            for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
+
+                /* Short circuit call if the lengths are too far apart to be
+                 * able to match */
+                if (abs( (int) (name_len - posix_name_lengths[i]))
+                    > temp_max)
+                {
+                    continue;
+                }
+
+                if (edit_distance(input_text,
+                                  posix_names[i],
+                                  name_len,
+                                  posix_name_lengths[i],
+                                  temp_max
+                                 )
+                    > -1)
+                { /* If it is close, it probably was intended to be a class */
+                    goto probably_meant_to_be;
+                }
+            }
+
+            /* Here the input name is not close enough to a valid class name
+             * for us to consider it to be intended to be a posix class.  If
+             * we haven't already done so, and the parse found a character that
+             * could have been terminators for the name, but which we absorbed
+             * as typos during the first pass, repeat the parse, signalling it
+             * to stop at that character */
+            if (possible_end && possible_end != (char *) -1) {
+                possible_end = (char *) -1;
+                p = name_start;
+                goto parse_name;
+            }
+
+            /* Here neither pass found a close-enough class name */
+            return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+        }
+
+    probably_meant_to_be:
+
+        /* Here we think that a posix specification was intended.  Update any
+         * parse pointer */
+        if (updated_parse_ptr) {
+            *updated_parse_ptr = (char *) p;
+        }
+
+        /* If a posix class name was intended but incorrectly specified, we
+         * output or return the warnings */
+        if (found_problem) {
+
+            /* We set flags for these issues in the parse loop above instead of
+             * adding them to the list of warnings, because we can parse it
+             * twice, and we only want one warning instance */
+            if (has_upper) {
+                ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
+            }
+            if (has_blank) {
+                ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
+            }
+            if (has_semi_colon) {
+                ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
+            }
+            else if (! has_terminating_colon) {
+                ADD_POSIX_WARNING(p, "there is no terminating ':'");
+            }
+            if (! has_terminating_bracket) {
+                ADD_POSIX_WARNING(p, "there is no terminating ']'");
+            }
+
+            if (warn_text) {
+                if (posix_warnings != (AV **) -1) {
+                    *posix_warnings = warn_text;
+                }
+                else {
+                    SV * msg;
+                    while ((msg = av_shift(warn_text)) != &PL_sv_undef) {
+                        Perl_warner(aTHX_ packWARN(WARN_REGEXP),
+                                    "%s", SvPVX(msg));
+                        SvREFCNT_dec_NN(msg);
+                    }
+                    SvREFCNT_dec_NN(warn_text);
+                }
+            }
+        }
+        else if (class_number != OOB_NAMEDCLASS) {
+            /* If it is a known class, return the class.  The class number
+             * #defines are structured so each complement is +1 to the normal
+             * one */
+            return class_number + complement;
+        }
+        else if (posix_warnings) {
+
+            /* Here, it is an unrecognized class.  This is an error (unless the
+            * call is to check only, which we've already handled above) */
+            const char * const complement_string = (complement)
+                                                   ? "^"
+                                                   : "";
+            RExC_parse = (char *) p;
+            vFAIL3utf8f("POSIX class [:%s%"UTF8f":] unknown",
+                        complement_string,
+                        UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
+        }
+    }
+
+    return OOB_NAMEDCLASS;
 }
+#undef ADD_POSIX_WARNING
 
 STATIC unsigned  int
 S_regex_set_precedence(const U8 my_operator) {
@@ -13434,6 +14449,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
     const bool save_fold = FOLD;    /* Temporary */
     char *save_end, *save_parse;    /* Temporaries */
     const bool in_locale = LOC;     /* we turn off /l during processing */
+    AV* posix_warnings = NULL;
 
     GET_RE_DEBUG_FLAGS_DECL;
 
@@ -13471,27 +14487,26 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                 default:
                     break;
                 case '\\':
-                    /* Skip the next byte (which could cause us to end up in
-                     * the middle of a UTF-8 character, but since none of those
-                     * are confusable with anything we currently handle in this
-                     * switch (invariants all), it's safe.  We'll just hit the
-                     * default: case next time and keep on incrementing until
-                     * we find one of the invariants we do handle. */
+                    /* Skip past this, so the next character gets skipped, after
+                     * the switch */
                     RExC_parse++;
                     if (*RExC_parse == 'c') {
                             /* Skip the \cX notation for control characters */
                             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
                     }
                     break;
+
                 case '[':
                 {
-                    /* If this looks like it is a [:posix:] class, leave the
-                     * parse pointer at the '[' to fool regclass() into
-                     * thinking it is part of a '[[:posix:]]'.  That function
-                     * will use strict checking to force a syntax error if it
-                     * doesn't work out to a legitimate class */
-                    bool is_posix_class
-                                    = could_it_be_a_POSIX_class(pRExC_state);
+                    /* See if this is a [:posix:] class. */
+                    bool is_posix_class = (OOB_NAMEDCLASS
+                            < handle_possible_posix(pRExC_state,
+                                                RExC_parse + 1,
+                                                NULL,
+                                                NULL));
+                    /* If it is a posix class, leave the parse pointer at the
+                     * '[' to fool regclass() into thinking it is part of a
+                     * '[[:posix:]]'. */
                     if (! is_posix_class) {
                         RExC_parse++;
                     }
@@ -13506,7 +14521,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                                   TRUE, /* silence non-portable warnings. */
                                   TRUE, /* strict */
                                   FALSE, /* Require return to be an ANYOF */
-                                  &current
+                                  &current,
+                                  &posix_warnings
                                  ))
                         FAIL2("panic: regclass returned NULL to handle_sets, "
                               "flags=%#"UVxf"", (UV) *flagp);
@@ -13524,9 +14540,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                 case ']':
                     if (depth--) break;
                     RExC_parse++;
-                    if (RExC_parse < RExC_end
-                        && *RExC_parse == ')')
-                    {
+                    if (*RExC_parse == ')') {
                         node = reganode(pRExC_state, ANYOF, 0);
                         RExC_size += ANYOF_SKIP;
                         nextchar(pRExC_state);
@@ -13545,6 +14559,17 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
         }
 
       no_close:
+        /* We output the messages even if warnings are off, because we'll fail
+         * the very next thing, and these give a likely diagnosis for that */
+        if (posix_warnings) {
+            SV * msg;
+            while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
+                Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
+                SvREFCNT_dec_NN(msg);
+            }
+            SvREFCNT_dec_NN(posix_warnings);
+        }
+
         FAIL("Syntax error in (?[...])");
     }
 
@@ -13662,7 +14687,8 @@ redo_curchar:
 
             case '(':
 
-                if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?'))
+                if (   RExC_parse < RExC_end - 1
+                    && (UCHARAT(RExC_parse + 1) == '?'))
                 {
                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
                      * This happens when we have some thing like
@@ -13718,12 +14744,12 @@ redo_curchar:
                      * inversion list, and RExC_parse points to the trailing
                      * ']'; the next character should be the ')' */
                     RExC_parse++;
-                    assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
+                    assert(UCHARAT(RExC_parse) == ')');
 
                     /* Then the ')' matching the original '(' handled by this
                      * case: statement */
                     RExC_parse++;
-                    assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
+                    assert(UCHARAT(RExC_parse) == ')');
 
                     RExC_parse++;
                     RExC_flags = save_flags;
@@ -13763,7 +14789,8 @@ redo_curchar:
                               FALSE, /* don't silence non-portable warnings.  */
                               TRUE,  /* strict */
                               FALSE, /* Require return to be an ANYOF */
-                              &current))
+                              &current,
+                              NULL))
                 {
                     FAIL2("panic: regclass returned NULL to handle_sets, "
                           "flags=%#"UVxf"", (UV) *flagp);
@@ -13776,23 +14803,32 @@ redo_curchar:
 
             case '[':   /* Is a bracketed character class */
             {
-                bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
-
+                /* See if this is a [:posix:] class. */
+                bool is_posix_class = (OOB_NAMEDCLASS
+                            < handle_possible_posix(pRExC_state,
+                                                RExC_parse + 1,
+                                                NULL,
+                                                NULL));
+                /* If it is a posix class, leave the parse pointer at the '['
+                 * to fool regclass() into thinking it is part of a
+                 * '[[:posix:]]'. */
                 if (! is_posix_class) {
                     RExC_parse++;
                 }
 
                 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
                  * multi-char folds are allowed.  */
-                if(!regclass(pRExC_state, flagp,depth+1,
-                             is_posix_class, /* parse the whole char class
-                                                only if not a posix class */
-                             FALSE, /* don't allow multi-char folds */
-                             FALSE, /* don't silence non-portable warnings.  */
-                             TRUE,   /* strict */
-                             FALSE, /* Require return to be an ANYOF */
-                             &current
-                            ))
+                if (!regclass(pRExC_state, flagp,depth+1,
+                                is_posix_class, /* parse the whole char
+                                                    class only if not a
+                                                    posix class */
+                                FALSE, /* don't allow multi-char folds */
+                                TRUE, /* silence non-portable warnings. */
+                                TRUE, /* strict */
+                                FALSE, /* Require return to be an ANYOF */
+                                &current,
+                                NULL
+                                ))
                 {
                     FAIL2("panic: regclass returned NULL to handle_sets, "
                           "flags=%#"UVxf"", (UV) *flagp);
@@ -14120,6 +15156,7 @@ redo_curchar:
                              they're valid on this machine */
                     FALSE, /* similarly, no need for strict */
                     FALSE, /* Require return to be an ANYOF */
+                    NULL,
                     NULL
                 );
     if (!node)
@@ -14148,7 +15185,8 @@ redo_curchar:
         assert(OP(node) == ANYOF);
 
         OP(node) = ANYOFL;
-        ANYOF_FLAGS(node) |= ANYOF_LOC_REQ_UTF8;
+        ANYOF_FLAGS(node)
+                |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
     }
 
     if (save_fold) {
@@ -14307,8 +15345,7 @@ S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN c
 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p)                          \
     STMT_START {                                                        \
         if (do_skip) {                                                  \
-            while (   p < RExC_end                                      \
-                   && isBLANK_A(UCHARAT(p)))                            \
+            while (isBLANK_A(UCHARAT(p)))                               \
             {                                                           \
                 p++;                                                    \
             }                                                           \
@@ -14326,7 +15363,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                  const bool strict,
                  bool optimizable,                  /* ? Allow a non-ANYOF return
                                                        node */
-                 SV** ret_invlist  /* Return an inversion list, not a node */
+                 SV** ret_invlist, /* Return an inversion list, not a node */
+                 AV** return_posix_warnings
           )
 {
     /* parse a bracketed class specification.  Most of these will produce an
@@ -14360,7 +15398,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
     regnode *ret;
     STRLEN numlen;
-    IV namedclass = OOB_NAMEDCLASS;
+    int namedclass = OOB_NAMEDCLASS;
     char *rangebegin = NULL;
     bool need_class = 0;
     SV *listsv = NULL;
@@ -14439,6 +15477,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
     const char * orig_parse = RExC_parse;
     const SSize_t orig_size = RExC_size;
     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
+
+    /* This variable is used to mark where in the input something that looks
+     * like a POSIX construct ends.  During the parse, when something looks
+     * like it could be such a construct is encountered, it is checked for
+     * being one, but not if we've already checked this area of the input.
+     * Only after this position is reached do we check again */
+    char *not_posix_region_end = RExC_parse - 1;
+
     GET_RE_DEBUG_FLAGS_DECL;
 
     PERL_ARGS_ASSERT_REGCLASS;
@@ -14454,6 +15500,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
     allow_multi_folds = FALSE;
 #endif
 
+    if (return_posix_warnings == NULL) {
+        return_posix_warnings = (AV **) -1;
+    }
+
     /* Assume we are going to generate an ANYOF node. */
     ret = reganode(pRExC_state,
                    (LOC)
@@ -14476,7 +15526,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
 
-    if (UCHARAT(RExC_parse) == '^') {	/* Complement of range. */
+    assert(RExC_parse <= RExC_end);
+
+    if (UCHARAT(RExC_parse) == '^') {	/* Complement the class */
 	RExC_parse++;
         invert = TRUE;
         allow_multi_folds = FALSE;
@@ -14485,21 +15537,27 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
     }
 
     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
-    if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
-	const char *s = RExC_parse;
-	const char  c = *s++;
-
-        if (*s == '^') {
-            s++;
-        }
-	while (isWORDCHAR(*s))
-	    s++;
-	if (*s && c == *s && s[1] == ']') {
-	    SAVEFREESV(RExC_rx_sv);
-	    ckWARN3reg(s+2,
-		       "POSIX syntax [%c %c] belongs inside character classes",
-		       c, c);
-	    (void)ReREFCNT_inc(RExC_rx_sv);
+    if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
+        char *class_end;
+        int maybe_class = handle_possible_posix(pRExC_state,
+                                                RExC_parse,
+                                                &class_end,
+                                                NULL);
+        if (maybe_class >= OOB_NAMEDCLASS) {
+            not_posix_region_end = class_end;
+            if (PASS2 && return_posix_warnings == (AV **) -1) {
+                SAVEFREESV(RExC_rx_sv);
+                ckWARN4reg(class_end,
+                        "POSIX syntax [%c %c] belongs inside character classes%s",
+                        *RExC_parse, *RExC_parse,
+                        (maybe_class == OOB_NAMEDCLASS)
+                        ? ((POSIXCC_NOTYET(*RExC_parse))
+                            ? " (but this one isn't implemented)"
+                            : " (but this one isn't fully valid)")
+                        : ""
+                        );
+                (void)ReREFCNT_inc(RExC_rx_sv);
+            }
 	}
     }
 
@@ -14535,7 +15593,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 	    element_count++;
             non_portable_endpoint = 0;
 	}
-	if (UTF) {
+	if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
 	    value = utf8n_to_uvchr((U8*)RExC_parse,
 				   RExC_end - RExC_parse,
 				   &numlen, UTF8_ALLOW_DEFAULT);
@@ -14544,11 +15602,27 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 	else
 	    value = UCHARAT(RExC_parse++);
 
-        if (value == '['
-            && RExC_parse < RExC_end
-            && POSIXCC(UCHARAT(RExC_parse)))
+        if (value == '[') {
+            namedclass = handle_possible_posix(pRExC_state,
+                                               RExC_parse,
+                                               &not_posix_region_end,
+                                               return_posix_warnings);
+            if (namedclass > OOB_NAMEDCLASS) {
+                RExC_parse = not_posix_region_end;
+            }
+            else {
+                namedclass = OOB_NAMEDCLASS;
+            }
+        }
+        else if (   RExC_parse - 1 > not_posix_region_end
+                 && MAYBE_POSIXCC(value))
         {
-            namedclass = regpposixcc(pRExC_state, value, strict);
+            (void) handle_possible_posix(
+                        pRExC_state,
+                        RExC_parse - 1,  /* -1 because parse has already been
+                                            advanced */
+                        &not_posix_region_end,
+                        return_posix_warnings);
         }
         else if (value == '\\') {
             /* Is a backslash; get the code point of the char after it */
@@ -14593,6 +15667,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                         &value,    /* Yes single value */
                                         &cp_count, /* Multiple code pt count */
                                         flagp,
+                                        strict,
                                         depth)
                     ) {
 
@@ -14605,11 +15680,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                             vFAIL("\\N in a character class must be a named character: \\N{...}");
                         }
                         else if (cp_count == 0) {
-                            if (strict) {
-                                RExC_parse++;   /* Position after the "}" */
-                                vFAIL("Zero length \\N{}");
-                            }
-                            else if (PASS2) {
+                            if (PASS2) {
                                 ckWARNreg(RExC_parse,
                                         "Ignoring zero length \\N{} in character class");
                             }
@@ -14665,7 +15736,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
 
 		if (RExC_parse >= RExC_end)
-		    vFAIL2("Empty \\%c{}", (U8)value);
+		    vFAIL2("Empty \\%c", (U8)value);
 		if (*RExC_parse == '{') {
 		    const U8 c = (U8)value;
 		    e = strchr(RExC_parse, '}');
@@ -14730,9 +15801,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
                     /* Look up the property name, and get its swash and
                      * inversion list, if the property is found  */
-                    if (swash) {    /* Return any left-overs */
-                        SvREFCNT_dec_NN(swash);
-                    }
+                    SvREFCNT_dec(swash); /* Free any left-overs */
                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
                                              1, /* binary */
                                              0, /* not tr/// */
@@ -15408,7 +16477,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                            "\"%.*s\" is more clearly written simply as \"%s\"",
                            (int) (RExC_parse - rangebegin),
                            rangebegin,
-                           cntrl_to_mnemonic((char) value)
+                           cntrl_to_mnemonic((U8) value)
                         );
                 }
             }
@@ -15470,11 +16539,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 	STRLEN len;
 	char *save_end = RExC_end;
 	char *save_parse = RExC_parse;
+	char *save_start = RExC_start;
+        STRLEN prefix_end = 0;      /* We copy the character class after a
+                                       prefix supplied here.  This is the size
+                                       + 1 of that prefix */
         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
                                        a "|" */
         I32 reg_flags;
 
         assert(! invert);
+        assert(RExC_precomp_adj == 0); /* Only one level of recursion allowed */
+
 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
            because too confusing */
         if (invert) {
@@ -15508,8 +16583,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
          * multi-character folds, have to include it in recursive parsing */
         if (element_count) {
             sv_catpv(substitute_parse, "|[");
+            prefix_end = SvCUR(substitute_parse);
             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
-            sv_catpv(substitute_parse, "]");
+
+            /* Put in a closing ']' only if not going off the end, as otherwise
+             * we are adding something that really isn't there */
+            if (RExC_parse < RExC_end) {
+                sv_catpv(substitute_parse, "]");
+            }
         }
 
         sv_catpv(substitute_parse, ")");
@@ -15522,7 +16603,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         }
 #endif
 
-	RExC_parse = SvPV(substitute_parse, len);
+        /* Set up the data structure so that any errors will be properly
+         * reported.  See the comments at the definition of
+         * REPORT_LOCATION_ARGS for details */
+        RExC_precomp_adj = orig_parse - RExC_precomp;
+	RExC_start =  RExC_parse = SvPV(substitute_parse, len);
+        RExC_adjusted_start = RExC_start + prefix_end;
 	RExC_end = RExC_parse + len;
         RExC_in_multi_char_class = 1;
 	RExC_override_recoding = 1;
@@ -15532,7 +16618,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
 	*flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8);
 
-	RExC_parse = save_parse;
+        /* And restore so can parse the rest of the pattern */
+        RExC_parse = save_parse;
+	RExC_start = RExC_adjusted_start = save_start;
+        RExC_precomp_adj = 0;
 	RExC_end = save_end;
 	RExC_in_multi_char_class = 0;
 	RExC_override_recoding = 0;
@@ -15561,9 +16650,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         if (UNLIKELY(posixl_matches_all)) {
             op = SANY;
         }
-        else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
-                                                   \w or [:digit:] or \p{foo}
-                                                 */
+        else if (namedclass > OOB_NAMEDCLASS) { /* this is a single named
+                                                   class, like \w or [:digit:]
+                                                   or \p{foo} */
 
             /* All named classes are mapped into POSIXish nodes, with its FLAG
              * argument giving which class it is */
@@ -15988,7 +17077,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             /* If it matters to the final outcome, see if a non-property
              * component of the class matches above Unicode.  If so, the
              * warning gets suppressed.  This is true even if just a single
-             * such code point is specified, as though not strictly correct if
+             * such code point is specified, as, though not strictly correct if
              * another such code point is matched against, the fact that they
              * are using above-Unicode code points indicates they should know
              * the issues involved */
@@ -16029,15 +17118,29 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
      * locales, or the class matches at least one 0-255 range code point */
     if (LOC && FOLD) {
+
+        /* Some things on the list might be unconditionally included because of
+         * other components.  Remove them, and clean up the list if it goes to
+         * 0 elements */
+        if (only_utf8_locale_list && cp_list) {
+            _invlist_subtract(only_utf8_locale_list, cp_list,
+                              &only_utf8_locale_list);
+
+            if (_invlist_len(only_utf8_locale_list) == 0) {
+                SvREFCNT_dec_NN(only_utf8_locale_list);
+                only_utf8_locale_list = NULL;
+            }
+        }
         if (only_utf8_locale_list) {
-            ANYOF_FLAGS(ret) |=  ANYOF_LOC_FOLD
-                                |ANYOF_ONLY_UTF8_LOC_FOLD_MATCHES;
+            ANYOF_FLAGS(ret)
+                 |=  ANYOFL_FOLD
+                    |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
         }
         else if (cp_list) { /* Look to see if a 0-255 code point is in list */
             UV start, end;
             invlist_iterinit(cp_list);
             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
-                ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
+                ANYOF_FLAGS(ret) |= ANYOFL_FOLD;
             }
             invlist_iterfinish(cp_list);
         }
@@ -16045,60 +17148,64 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
 #define MATCHES_ALL_NON_UTF8_NON_ASCII(ret)                                 \
     (   DEPENDS_SEMANTICS                                                   \
-     && ANYOF_FLAGS(ret)                                                    \
-        & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
+     && (ANYOF_FLAGS(ret)                                                   \
+        & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
 
     /* See if we can simplify things under /d */
     if (   has_upper_latin1_only_utf8_matches
         || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
     {
-        if (has_upper_latin1_only_utf8_matches) {
-            if (MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
-
-                /* Here, we have two, almost opposite, constraints in effect
-                 * for upper latin1 characters.  The macro means they all match
-                 * when the target string ISN'T in UTF-8.
-                 * 'has_upper_latin1_only_utf8_matches' contains the chars that
-                 * match only if the target string IS UTF-8.  Therefore the
-                 * ones in 'has_upper_latin1_only_utf8_matches' match
-                 * regardless of UTF-8, so can be added to the regular list,
-                 * and 'has_upper_latin1_only_utf8_matches' cleared */
-                _invlist_union(cp_list,
-                               has_upper_latin1_only_utf8_matches,
-                               &cp_list);
-                SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
-                has_upper_latin1_only_utf8_matches = NULL;
-            }
-            else if (cp_list) {
-
-                /* Here, 'cp_list' gives chars that always match, and
-                 * 'has_upper_latin1_only_utf8_matches' gives chars that were
-                 * specified to match only if the target string is in UTF-8.
-                 * It may be that these overlap, so we can subtract the
-                 * unconditionally matching from the conditional ones, to make
-                 * the conditional list as small as possible, perhaps even
-                 * clearing it, in which case more optimizations are possible
-                 * later */
-                _invlist_subtract(has_upper_latin1_only_utf8_matches,
-                                  cp_list,
-                                  &has_upper_latin1_only_utf8_matches);
-                if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
+        /* But not if we are inverting, as that screws it up */
+        if (! invert) {
+            if (has_upper_latin1_only_utf8_matches) {
+                if (MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
+
+                    /* Here, we have both the flag and inversion list.  Any
+                     * character in 'has_upper_latin1_only_utf8_matches'
+                     * matches when UTF-8 is in effect, but it also matches
+                     * when UTF-8 is not in effect because of
+                     * MATCHES_ALL_NON_UTF8_NON_ASCII.  Therefore it matches
+                     * unconditionally, so can be added to the regular list,
+                     * and 'has_upper_latin1_only_utf8_matches' cleared */
+                    _invlist_union(cp_list,
+                                   has_upper_latin1_only_utf8_matches,
+                                   &cp_list);
                     SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
                     has_upper_latin1_only_utf8_matches = NULL;
                 }
+                else if (cp_list) {
+
+                    /* Here, 'cp_list' gives chars that always match, and
+                     * 'has_upper_latin1_only_utf8_matches' gives chars that
+                     * were specified to match only if the target string is in
+                     * UTF-8.  It may be that these overlap, so we can subtract
+                     * the unconditionally matching from the conditional ones,
+                     * to make the conditional list as small as possible,
+                     * perhaps even clearing it, in which case more
+                     * optimizations are possible later */
+                    _invlist_subtract(has_upper_latin1_only_utf8_matches,
+                                      cp_list,
+                                      &has_upper_latin1_only_utf8_matches);
+                    if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
+                        SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
+                        has_upper_latin1_only_utf8_matches = NULL;
+                    }
+                }
             }
-        }
 
-        /* Similarly, if the unconditional matches include every upper latin1
-         * character, we can clear that flag to permit later optimizations */
-        if (cp_list && MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
-            SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
-            _invlist_subtract(only_non_utf8_list, cp_list, &only_non_utf8_list);
-            if (_invlist_len(only_non_utf8_list) == 0) {
-                ANYOF_FLAGS(ret) &= ~ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+            /* Similarly, if the unconditional matches include every upper
+             * latin1 character, we can clear that flag to permit later
+             * optimizations */
+            if (cp_list && MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
+                SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
+                _invlist_subtract(only_non_utf8_list, cp_list,
+                                  &only_non_utf8_list);
+                if (_invlist_len(only_non_utf8_list) == 0) {
+                    ANYOF_FLAGS(ret) &= ~ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+                }
+                SvREFCNT_dec_NN(only_non_utf8_list);
+                only_non_utf8_list = NULL;;
             }
-            SvREFCNT_dec_NN(only_non_utf8_list);
-            only_non_utf8_list = NULL;;
         }
 
         /* If we haven't gotten rid of all conditional matching, we change the
@@ -16462,7 +17569,7 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
                                         bool doinit,
                                         SV** listsvp,
                                         SV** only_utf8_locale_ptr,
-                                        SV*  exclude_list)
+                                        SV** output_invlist)
 
 {
     /* For internal core use only.
@@ -16474,20 +17581,32 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
      *    swash exists, by calling this function with 'doinit' set to false, in
      *    which case the components that will be used to eventually create the
      *    swash are returned  (in a printable form).
-     * If <exclude_list> is not NULL, it is an inversion list of things to
-     *    exclude from what's returned in <listsvp>.
+     * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
+     *    store an inversion list of code points that should match only if the
+     *    execution-time locale is a UTF-8 one.
+     * If <output_invlist> is not NULL, it is where this routine is to store an
+     *    inversion list of the code points that would be instead returned in
+     *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
+     *    when this parameter is used, is just the non-code point data that
+     *    will go into creating the swash.  This currently should be just
+     *    user-defined properties whose definitions were not known at compile
+     *    time.  Using this parameter allows for easier manipulation of the
+     *    swash's data by the caller.  It is illegal to call this function with
+     *    this parameter set, but not <listsvp>
+     *
      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
      * that, in spite of this function's name, the swash it returns may include
      * the bitmap data as well */
 
     SV *sw  = NULL;
     SV *si  = NULL;         /* Input swash initialization string */
-    SV*  invlist = NULL;
+    SV* invlist = NULL;
 
     RXi_GET_DECL(prog,progi);
     const struct reg_data * const data = prog ? progi->data : NULL;
 
     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
+    assert(! output_invlist || listsvp);
 
     if (data && data->count) {
 	const U32 n = ARG(node);
@@ -16549,7 +17668,7 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
 
     /* If requested, return a printable version of what this swash matches */
     if (listsvp) {
-	SV* matches_string = newSVpvs("");
+	SV* matches_string = NULL;
 
         /* The swash should be used, if possible, to get the data, as it
          * contains the resolved data.  But this function can be called at
@@ -16559,22 +17678,124 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
 	if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
             && (si && si != &PL_sv_undef))
         {
-	    sv_catsv(matches_string, si);
+            /* Here, we only have 'si' (and possibly some passed-in data in
+             * 'invlist', which is handled below)  If the caller only wants
+             * 'si', use that.  */
+            if (! output_invlist) {
+                matches_string = newSVsv(si);
+            }
+            else {
+                /* But if the caller wants an inversion list of the node, we
+                 * need to parse 'si' and place as much as possible in the
+                 * desired output inversion list, making 'matches_string' only
+                 * contain the currently unresolvable things */
+                const char *si_string = SvPVX(si);
+                STRLEN remaining = SvCUR(si);
+                UV prev_cp = 0;
+                U8 count = 0;
+
+                /* Ignore everything before the first new-line */
+                while (*si_string != '\n' && remaining > 0) {
+                    si_string++;
+                    remaining--;
+                }
+                assert(remaining > 0);
+
+                si_string++;
+                remaining--;
+
+                while (remaining > 0) {
+
+                    /* The data consists of just strings defining user-defined
+                     * property names, but in prior incarnations, and perhaps
+                     * somehow from pluggable regex engines, it could still
+                     * hold hex code point definitions.  Each component of a
+                     * range would be separated by a tab, and each range by a
+                     * new-line.  If these are found, instead add them to the
+                     * inversion list */
+                    I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
+                                     |PERL_SCAN_SILENT_NON_PORTABLE;
+                    STRLEN len = remaining;
+                    UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
+
+                    /* If the hex decode routine found something, it should go
+                     * up to the next \n */
+                    if (   *(si_string + len) == '\n') {
+                        if (count) {    /* 2nd code point on line */
+                            *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
+                        }
+                        else {
+                            *output_invlist = add_cp_to_invlist(*output_invlist, cp);
+                        }
+                        count = 0;
+                        goto prepare_for_next_iteration;
+                    }
+
+                    /* If the hex decode was instead for the lower range limit,
+                     * save it, and go parse the upper range limit */
+                    if (*(si_string + len) == '\t') {
+                        assert(count == 0);
+
+                        prev_cp = cp;
+                        count = 1;
+                      prepare_for_next_iteration:
+                        si_string += len + 1;
+                        remaining -= len + 1;
+                        continue;
+                    }
+
+                    /* Here, didn't find a legal hex number.  Just add it from
+                     * here to the next \n */
+
+                    remaining -= len;
+                    while (*(si_string + len) != '\n' && remaining > 0) {
+                        remaining--;
+                        len++;
+                    }
+                    if (*(si_string + len) == '\n') {
+                        len++;
+                        remaining--;
+                    }
+                    if (matches_string) {
+                        sv_catpvn(matches_string, si_string, len - 1);
+                    }
+                    else {
+                        matches_string = newSVpvn(si_string, len - 1);
+                    }
+                    si_string += len;
+                    sv_catpvs(matches_string, " ");
+                } /* end of loop through the text */
+
+                assert(matches_string);
+                if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
+                    SvCUR_set(matches_string, SvCUR(matches_string) - 1);
+                }
+            } /* end of has an 'si' but no swash */
 	}
 
-	/* Add the inversion list to whatever we have.  This may have come from
-	 * the swash, or from an input parameter */
-	if (invlist) {
-            if (exclude_list) {
-                SV* clone = invlist_clone(invlist);
-                _invlist_subtract(clone, exclude_list, &clone);
-                sv_catsv(matches_string, _invlist_contents(clone));
-                SvREFCNT_dec_NN(clone);
+        /* If we have a swash in place, its equivalent inversion list was above
+         * placed into 'invlist'.  If not, this variable may contain a stored
+         * inversion list which is information beyond what is in 'si' */
+        if (invlist) {
+
+            /* Again, if the caller doesn't want the output inversion list, put
+             * everything in 'matches-string' */
+            if (! output_invlist) {
+                if ( ! matches_string) {
+                    matches_string = newSVpvs("\n");
+                }
+                sv_catsv(matches_string, invlist_contents(invlist,
+                                                  TRUE /* traditional style */
+                                                  ));
+            }
+            else if (! *output_invlist) {
+                *output_invlist = invlist_clone(invlist);
             }
             else {
-                sv_catsv(matches_string, _invlist_contents(invlist));
+                _invlist_union(*output_invlist, invlist, output_invlist);
             }
-	}
+        }
+
 	*listsvp = matches_string;
     }
 
@@ -16686,14 +17907,16 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
 {
     PERL_ARGS_ASSERT_NEXTCHAR;
 
-    assert(   ! UTF
-           || UTF8_IS_INVARIANT(*RExC_parse)
-           || UTF8_IS_START(*RExC_parse));
+    if (RExC_parse < RExC_end) {
+        assert(   ! UTF
+               || UTF8_IS_INVARIANT(*RExC_parse)
+               || UTF8_IS_START(*RExC_parse));
 
-    RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
 
-    skip_to_be_ignored_text(pRExC_state, &RExC_parse,
-                            FALSE /* Don't assume /x */ );
+        skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                                FALSE /* Don't assume /x */ );
+    }
 }
 
 STATIC regnode *
@@ -17206,6 +18429,53 @@ Perl_regdump(pTHX_ const regexp *r)
 #endif	/* DEBUGGING */
 }
 
+/* Should be synchronized with ANYOF_ #defines in regcomp.h */
+#ifdef DEBUGGING
+
+#  if   _CC_WORDCHAR != 0 || _CC_DIGIT != 1        || _CC_ALPHA != 2    \
+     || _CC_LOWER != 3    || _CC_UPPER != 4        || _CC_PUNCT != 5    \
+     || _CC_PRINT != 6    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8    \
+     || _CC_CASED != 9    || _CC_SPACE != 10       || _CC_BLANK != 11   \
+     || _CC_XDIGIT != 12  || _CC_CNTRL != 13       || _CC_ASCII != 14   \
+     || _CC_VERTSPACE != 15
+#   error Need to adjust order of anyofs[]
+#  endif
+static const char * const anyofs[] = {
+    "\\w",
+    "\\W",
+    "\\d",
+    "\\D",
+    "[:alpha:]",
+    "[:^alpha:]",
+    "[:lower:]",
+    "[:^lower:]",
+    "[:upper:]",
+    "[:^upper:]",
+    "[:punct:]",
+    "[:^punct:]",
+    "[:print:]",
+    "[:^print:]",
+    "[:alnum:]",
+    "[:^alnum:]",
+    "[:graph:]",
+    "[:^graph:]",
+    "[:cased:]",
+    "[:^cased:]",
+    "\\s",
+    "\\S",
+    "[:blank:]",
+    "[:^blank:]",
+    "[:xdigit:]",
+    "[:^xdigit:]",
+    "[:cntrl:]",
+    "[:^cntrl:]",
+    "[:ascii:]",
+    "[:^ascii:]",
+    "\\v",
+    "\\V"
+};
+#endif
+
 /*
 - regprop - printable representation of opcode, with run time support
 */
@@ -17215,49 +18485,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
 {
 #ifdef DEBUGGING
     int k;
-
-    /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
-    static const char * const anyofs[] = {
-#if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
-    || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
-    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
-    || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
-    || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
-  #error Need to adjust order of anyofs[]
-#endif
-        "\\w",
-        "\\W",
-        "\\d",
-        "\\D",
-        "[:alpha:]",
-        "[:^alpha:]",
-        "[:lower:]",
-        "[:^lower:]",
-        "[:upper:]",
-        "[:^upper:]",
-        "[:punct:]",
-        "[:^punct:]",
-        "[:print:]",
-        "[:^print:]",
-        "[:alnum:]",
-        "[:^alnum:]",
-        "[:graph:]",
-        "[:^graph:]",
-        "[:cased:]",
-        "[:^cased:]",
-        "\\s",
-        "\\S",
-        "[:blank:]",
-        "[:^blank:]",
-        "[:xdigit:]",
-        "[:^xdigit:]",
-        "[:cntrl:]",
-        "[:^cntrl:]",
-        "[:ascii:]",
-        "[:^ascii:]",
-        "\\v",
-        "\\V"
-    };
     RXi_GET_DECL(prog,progi);
     GET_RE_DEBUG_FLAGS_DECL;
 
@@ -17314,10 +18541,13 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
             sv_catpvs(sv, "[");
             (void) put_charclass_bitmap_innards(sv,
-                                                (IS_ANYOF_TRIE(op))
+                                                ((IS_ANYOF_TRIE(op))
                                                  ? ANYOF_BITMAP(o)
-                                                 : TRIE_BITMAP(trie),
-                                                NULL);
+                                                 : TRIE_BITMAP(trie)),
+                                                NULL,
+                                                NULL,
+                                                NULL
+                                               );
             sv_catpvs(sv, "]");
         }
 
@@ -17400,152 +18630,133 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
 	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
     else if (k == ANYOF) {
 	const U8 flags = ANYOF_FLAGS(o);
-	int do_sep = 0;
-        SV* bitmap_invlist = NULL;  /* Will hold what the bit map contains */
+        bool do_sep = FALSE;    /* Do we need to separate various components of
+                                   the output? */
+        /* Set if there is still an unresolved user-defined property */
+        SV *unresolved                = NULL;
 
+        /* Things that are ignored except when the runtime locale is UTF-8 */
+        SV *only_utf8_locale_invlist = NULL;
+
+        /* Code points that don't fit in the bitmap */
+        SV *nonbitmap_invlist = NULL;
+
+        /* And things that aren't in the bitmap, but are small enough to be */
+        SV* bitmap_range_not_in_bitmap = NULL;
 
 	if (OP(o) == ANYOFL) {
-            if (flags & ANYOF_LOC_REQ_UTF8) {
-                sv_catpvs(sv, "{utf8-loc}");
+            if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
+                sv_catpvs(sv, "{utf8-locale-reqd}");
             }
-            else {
-                sv_catpvs(sv, "{loc}");
+            if (flags & ANYOFL_FOLD) {
+                sv_catpvs(sv, "{i}");
             }
         }
-	if (flags & ANYOF_LOC_FOLD)
-	    sv_catpvs(sv, "{i}");
-	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
-	if (flags & ANYOF_INVERT)
-	    sv_catpvs(sv, "^");
 
-        /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
-         * */
-        do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
-                                                            &bitmap_invlist);
+        /* If there is stuff outside the bitmap, get it */
+        if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
+            (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
+                                                &unresolved,
+                                                &only_utf8_locale_invlist,
+                                                &nonbitmap_invlist);
+            /* The non-bitmap data may contain stuff that could fit in the
+             * bitmap.  This could come from a user-defined property being
+             * finally resolved when this call was done; or much more likely
+             * because there are matches that require UTF-8 to be valid, and so
+             * aren't in the bitmap.  This is teased apart later */
+            _invlist_intersection(nonbitmap_invlist,
+                                  PL_InBitmap,
+                                  &bitmap_range_not_in_bitmap);
+            /* Leave just the things that don't fit into the bitmap */
+            _invlist_subtract(nonbitmap_invlist,
+                              PL_InBitmap,
+                              &nonbitmap_invlist);
+        }
 
-        /* output any special charclass tests (used entirely under use
-         * locale) * */
-	if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
-            int i;
-	    for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
-		if (ANYOF_POSIXL_TEST(o,i)) {
-		    sv_catpv(sv, anyofs[i]);
-		    do_sep = 1;
-		}
-            }
+        /* Obey this flag to add all above-the-bitmap code points */
+        if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
+            nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
+                                                      NUM_ANYOF_CODE_POINTS,
+                                                      UV_MAX);
         }
 
-        if (    ARG(o) != ANYOF_ONLY_HAS_BITMAP
-	    || (flags
-                & ( ANYOF_MATCHES_ALL_ABOVE_BITMAP
-                   |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP
-                   |ANYOF_LOC_FOLD)))
-        {
+        /* Ready to start outputting.  First, the initial left bracket */
+	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
+
+        /* Then all the things that could fit in the bitmap */
+        do_sep = put_charclass_bitmap_innards(sv,
+                                              ANYOF_BITMAP(o),
+                                              bitmap_range_not_in_bitmap,
+                                              only_utf8_locale_invlist,
+                                              o);
+        SvREFCNT_dec(bitmap_range_not_in_bitmap);
+
+        /* If there are user-defined properties which haven't been defined yet,
+         * output them, in a separate [] from the bitmap range stuff */
+        if (unresolved) {
             if (do_sep) {
                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
-                if (flags & ANYOF_INVERT)
-                    /*make sure the invert info is in each */
-                    sv_catpvs(sv, "^");
             }
-
-            if (OP(o) == ANYOFD
-                && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
-            {
-                sv_catpvs(sv, "{non-utf8-latin1-all}");
+            if (flags & ANYOF_INVERT) {
+                sv_catpvs(sv, "^");
             }
+            sv_catsv(sv, unresolved);
+            do_sep = TRUE;
+            SvREFCNT_dec_NN(unresolved);
+        }
 
-            if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
-                sv_catpvs(sv, "{above_bitmap_all}");
-
-            if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
-                SV *lv; /* Set if there is something outside the bit map. */
-                bool byte_output = FALSE;   /* If something has been output */
-                SV *only_utf8_locale;
-
-                /* Get the stuff that wasn't in the bitmap.  'bitmap_invlist'
-                 * is used to guarantee that nothing in the bitmap gets
-                 * returned */
-                (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
-                                                    &lv, &only_utf8_locale,
-                                                    bitmap_invlist);
-                if (lv && lv != &PL_sv_undef) {
-                    char *s = savesvpv(lv);
-                    char * const origs = s;
-
-                    while (*s && *s != '\n')
-                        s++;
-
-                    if (*s == '\n') {
-                        const char * const t = ++s;
-
-                        if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP) {
-                            if (OP(o) == ANYOFD) {
-                                sv_catpvs(sv, "{utf8}");
-                            }
-                            else {
-                                sv_catpvs(sv, "{outside bitmap}");
-                            }
-                        }
+        /* And, finally, add the above-the-bitmap stuff */
+        if (nonbitmap_invlist) {
+            SV* contents;
 
-                        if (byte_output) {
-                            sv_catpvs(sv, " ");
-                        }
+            /* See if truncation size is overridden */
+            const STRLEN dump_len = (PL_dump_re_max_len)
+                                    ? PL_dump_re_max_len
+                                    : 256;
 
-                        while (*s) {
-                            if (*s == '\n') {
+            /* This is output in a separate [] */
+            if (do_sep) {
+                Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
+            }
 
-                                /* Truncate very long output */
-                                if (s - origs > 256) {
-                                    Perl_sv_catpvf(aTHX_ sv,
-                                                "%.*s...",
-                                                (int) (s - origs - 1),
-                                                t);
-                                    goto out_dump;
-                                }
-                                *s = ' ';
-                            }
-                            else if (*s == '\t') {
-                                *s = '-';
-                            }
-                            s++;
-                        }
-                        if (s[-1] == ' ')
-                            s[-1] = 0;
+            /* And, for easy of understanding, it is always output not-shown as
+             * complemented */
+            if (flags & ANYOF_INVERT) {
+                _invlist_invert(nonbitmap_invlist);
+                _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
+            }
 
-                        sv_catpv(sv, t);
-                    }
+            contents = invlist_contents(nonbitmap_invlist,
+                                        FALSE /* output suitable for catsv */
+                                       );
 
-                  out_dump:
+            /* If the output is shorter than the permissible maximum, just do it. */
+            if (SvCUR(contents) <= dump_len) {
+                sv_catsv(sv, contents);
+            }
+            else {
+                const char * contents_string = SvPVX(contents);
+                STRLEN i = dump_len;
 
-                    Safefree(origs);
-                    SvREFCNT_dec_NN(lv);
+                /* Otherwise, start at the permissible max and work back to the
+                 * first break possibility */
+                while (i > 0 && contents_string[i] != ' ') {
+                    i--;
                 }
-
-                if ((flags & ANYOF_LOC_FOLD)
-                     && only_utf8_locale
-                     && only_utf8_locale != &PL_sv_undef)
-                {
-                    UV start, end;
-                    int max_entries = 256;
-
-                    sv_catpvs(sv, "{utf8 locale}");
-                    invlist_iterinit(only_utf8_locale);
-                    while (invlist_iternext(only_utf8_locale,
-                                            &start, &end)) {
-                        put_range(sv, start, end, FALSE);
-                        max_entries --;
-                        if (max_entries < 0) {
-                            sv_catpvs(sv, "...");
-                            break;
-                        }
-                    }
-                    invlist_iterfinish(only_utf8_locale);
+                if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
+                                       find a legal break */
+                    i = dump_len;
                 }
+
+                sv_catpvn(sv, contents_string, i);
+                sv_catpvs(sv, "...");
             }
-	}
-        SvREFCNT_dec(bitmap_invlist);
 
+            SvREFCNT_dec_NN(contents);
+            SvREFCNT_dec_NN(nonbitmap_invlist);
+        }
 
+        /* And finally the matching, closing ']' */
 	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
     }
     else if (k == POSIXD || k == NPOSIXD) {
@@ -17568,9 +18779,11 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         const char * const bounds[] = {
             "",      /* Traditional */
             "{gcb}",
+            "{lb}",
             "{sb}",
             "{wb}"
         };
+        assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
         sv_catpv(sv, bounds[FLAGS(o)]);
     }
     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
@@ -18210,18 +19423,18 @@ S_put_code_point(pTHX_ SV *sv, UV c)
     }
     else if (isPRINT(c)) {
 	const char string = (char) c;
-	if (isBACKSLASHED_PUNCT(c))
+
+        /* We use {phrase} as metanotation in the class, so also escape literal
+         * braces */
+	if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
 	    sv_catpvs(sv, "\\");
 	sv_catpvn(sv, &string, 1);
     }
+    else if (isMNEMONIC_CNTRL(c)) {
+        Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
+    }
     else {
-        const char * const mnemonic = cntrl_to_mnemonic((char) c);
-        if (mnemonic) {
-            Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
-        }
-        else {
-            Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
-        }
+        Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
     }
 }
 
@@ -18231,8 +19444,15 @@ STATIC void
 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
 {
     /* Appends to 'sv' a displayable version of the range of code points from
-     * 'start' to 'end'.  It assumes that only ASCII printables are displayable
-     * as-is (though some of these will be escaped by put_code_point()). */
+     * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
+     * that have them, when they occur at the beginning or end of the range.
+     * It uses hex to output the remaining code points, unless 'allow_literals'
+     * is true, in which case the printable ASCII ones are output as-is (though
+     * some of these will be escaped by put_code_point()).
+     *
+     * NOTE:  This is designed only for printing ranges of code points that fit
+     *        inside an ANYOF bitmap.  Higher code points are simply suppressed
+     */
 
     const unsigned int min_range_count = 3;
 
@@ -18246,7 +19466,7 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
 
         if (end - start < min_range_count) {
 
-            /* Individual chars in short ranges */
+            /* Output chars individually when they occur in short ranges */
             for (; start <= end; start++) {
                 put_code_point(sv, start);
             }
@@ -18255,11 +19475,11 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
 
         /* If permitted by the input options, and there is a possibility that
          * this range contains a printable literal, look to see if there is
-         * one.  */
+         * one. */
         if (allow_literals && start <= MAX_PRINT_A) {
 
-            /* If the range begin isn't an ASCII printable, effectively split
-             * the range into two parts:
+            /* If the character at the beginning of the range isn't an ASCII
+             * printable, effectively split the range into two parts:
              *  1) the portion before the first such printable,
              *  2) the rest
              * and output them separately. */
@@ -18281,18 +19501,18 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
                     temp_end = end + 1;
                 }
 
-                /* Output the first part of the split range, the part that
-                 * doesn't have printables, with no looking for literals
-                 * (otherwise we would infinitely recurse) */
+                /* Output the first part of the split range: the part that
+                 * doesn't have printables, with the parameter set to not look
+                 * for literals (otherwise we would infinitely recurse) */
                 put_range(sv, start, temp_end - 1, FALSE);
 
                 /* The 2nd part of the range (if any) starts here. */
                 start = temp_end;
 
-                /* We continue instead of dropping down because even if the 2nd
-                 * part is non-empty, it could be so short that we want to
-                 * output it specially, as tested for at the top of this loop.
-                 * */
+                /* We do a continue, instead of dropping down, because even if
+                 * the 2nd part is non-empty, it could be so short that we want
+                 * to output it as individual characters, as tested for at the
+                 * top of this loop.  */
                 continue;
             }
 
@@ -18357,7 +19577,8 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
                 temp_end--;
             }
 
-            /* And separately output the range that doesn't have mnemonics */
+            /* And separately output the interior range that doesn't start or
+             * end with mnemonics */
             put_range(sv, start, temp_end, FALSE);
 
             /* Then output the mnemonic trailing controls */
@@ -18376,10 +19597,10 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
                     : NUM_ANYOF_CODE_POINTS - 1;
 #if NUM_ANYOF_CODE_POINTS > 256
         format = (this_end < 256)
-                 ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
+                 ? "\\x%02"UVXf"-\\x%02"UVXf""
                  : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
 #else
-        format = "\\x{%02"UVXf"}-\\x{%02"UVXf"}";
+        format = "\\x%02"UVXf"-\\x%02"UVXf"";
 #endif
         GCC_DIAG_IGNORE(-Wformat-nonliteral);
         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
@@ -18388,55 +19609,26 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
     }
 }
 
-STATIC bool
-S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
+STATIC void
+S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
 {
-    /* Appends to 'sv' a displayable version of the innards of the bracketed
-     * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
-     * output anything, and bitmap_invlist, if not NULL, will point to an
-     * inversion list of what is in the bit map */
+    /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
+     * 'invlist' */
 
-    int i;
     UV start, end;
-    unsigned int punct_count = 0;
-    SV* invlist;
     bool allow_literals = TRUE;
-    bool inverted_for_output = FALSE;
 
-    PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
-
-    /* Worst case is exactly every-other code point is in the list */
-    invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
-
-    /* Convert the bit map to an inversion list, keeping track of how many
-     * ASCII puncts are set, including an extra amount for the backslashed
-     * ones.  */
-    for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
-        if (BITMAP_TEST(bitmap, i)) {
-            invlist = add_cp_to_invlist(invlist, i);
-            if (isPUNCT_A(i)) {
-                punct_count++;
-                if isBACKSLASHED_PUNCT(i) {
-                    punct_count++;
-                }
-            }
-        }
-    }
-
-    /* Nothing to output */
-    if (_invlist_len(invlist) == 0) {
-        SvREFCNT_dec_NN(invlist);
-        return FALSE;
-    }
+    PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
 
     /* Generally, it is more readable if printable characters are output as
      * literals, but if a range (nearly) spans all of them, it's best to output
      * it as a single range.  This code will use a single range if all but 2
-     * printables are in it */
+     * ASCII printables are in it */
     invlist_iterinit(invlist);
     while (invlist_iternext(invlist, &start, &end)) {
 
-        /* If range starts beyond final printable, it doesn't have any in it */
+        /* If the range starts beyond the final printable, it doesn't have any
+         * in it */
         if (start > MAX_PRINT_A) {
             break;
         }
@@ -18459,24 +19651,6 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
     }
     invlist_iterfinish(invlist);
 
-    /* The legibility of the output depends mostly on how many punctuation
-     * characters are output.  There are 32 possible ASCII ones, and some have
-     * an additional backslash, bringing it to currently 36, so if any more
-     * than 18 are to be output, we can instead output it as its complement,
-     * yielding fewer puncts, and making it more legible.  But give some weight
-     * to the fact that outputting it as a complement is less legible than a
-     * straight output, so don't complement unless we are somewhat over the 18
-     * mark */
-    if (allow_literals && punct_count > 22) {
-        sv_catpvs(sv, "^");
-
-        /* Add everything remaining to the list, so when we invert it just
-         * below, it will be excluded */
-        _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
-        _invlist_invert(invlist);
-        inverted_for_output = TRUE;
-    }
-
     /* Here we have figured things out.  Output each range */
     invlist_iterinit(invlist);
     while (invlist_iternext(invlist, &start, &end)) {
@@ -18487,22 +19661,331 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
     }
     invlist_iterfinish(invlist);
 
-    if (bitmap_invlist) {
+    return;
+}
+
+STATIC SV*
+S_put_charclass_bitmap_innards_common(pTHX_
+        SV* invlist,            /* The bitmap */
+        SV* posixes,            /* Under /l, things like [:word:], \S */
+        SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
+        SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
+        SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
+        const bool invert       /* Is the result to be inverted? */
+)
+{
+    /* Create and return an SV containing a displayable version of the bitmap
+     * and associated information determined by the input parameters. */
+
+    SV * output;
 
-        /* Here, wants the inversion list returned.  If we inverted it, we have
-         * to restore it to the original */
-        if (inverted_for_output) {
-            _invlist_invert(invlist);
-            _invlist_intersection(invlist, PL_InBitmap, &invlist);
-        }
+    PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
 
-        *bitmap_invlist = invlist;
+    if (invert) {
+        output = newSVpvs("^");
     }
     else {
-        SvREFCNT_dec_NN(invlist);
+        output = newSVpvs("");
     }
 
-    return TRUE;
+    /* First, the code points in the bitmap that are unconditionally there */
+    put_charclass_bitmap_innards_invlist(output, invlist);
+
+    /* Traditionally, these have been placed after the main code points */
+    if (posixes) {
+        sv_catsv(output, posixes);
+    }
+
+    if (only_utf8 && _invlist_len(only_utf8)) {
+        Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
+        put_charclass_bitmap_innards_invlist(output, only_utf8);
+    }
+
+    if (not_utf8 && _invlist_len(not_utf8)) {
+        Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
+        put_charclass_bitmap_innards_invlist(output, not_utf8);
+    }
+
+    if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
+        Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
+        put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
+
+        /* This is the only list in this routine that can legally contain code
+         * points outside the bitmap range.  The call just above to
+         * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
+         * output them here.  There's about a half-dozen possible, and none in
+         * contiguous ranges longer than 2 */
+        if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
+            UV start, end;
+            SV* above_bitmap = NULL;
+
+            _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
+
+            invlist_iterinit(above_bitmap);
+            while (invlist_iternext(above_bitmap, &start, &end)) {
+                UV i;
+
+                for (i = start; i <= end; i++) {
+                    put_code_point(output, i);
+                }
+            }
+            invlist_iterfinish(above_bitmap);
+            SvREFCNT_dec_NN(above_bitmap);
+        }
+    }
+
+    /* If the only thing we output is the '^', clear it */
+    if (invert && SvCUR(output) == 1) {
+        SvCUR_set(output, 0);
+    }
+
+    return output;
+}
+
+STATIC bool
+S_put_charclass_bitmap_innards(pTHX_ SV *sv,
+                                     char *bitmap,
+                                     SV *nonbitmap_invlist,
+                                     SV *only_utf8_locale_invlist,
+                                     const regnode * const node)
+{
+    /* Appends to 'sv' a displayable version of the innards of the bracketed
+     * character class defined by the other arguments:
+     *  'bitmap' points to the bitmap.
+     *  'nonbitmap_invlist' is an inversion list of the code points that are in
+     *      the bitmap range, but for some reason aren't in the bitmap; NULL if
+     *      none.  The reasons for this could be that they require some
+     *      condition such as the target string being or not being in UTF-8
+     *      (under /d), or because they came from a user-defined property that
+     *      was not resolved at the time of the regex compilation (under /u)
+     *  'only_utf8_locale_invlist' is an inversion list of the code points that
+     *      are valid only if the runtime locale is a UTF-8 one; NULL if none
+     *  'node' is the regex pattern node.  It is needed only when the above two
+     *      parameters are not null, and is passed so that this routine can
+     *      tease apart the various reasons for them.
+     *
+     * It returns TRUE if there was actually something output.  (It may be that
+     * the bitmap, etc is empty.)
+     *
+     * When called for outputting the bitmap of a non-ANYOF node, just pass the
+     * bitmap, with the succeeding parameters set to NULL.
+     *
+     */
+
+    /* In general, it tries to display the 'cleanest' representation of the
+     * innards, choosing whether to display them inverted or not, regardless of
+     * whether the class itself is to be inverted.  However,  there are some
+     * cases where it can't try inverting, as what actually matches isn't known
+     * until runtime, and hence the inversion isn't either. */
+    bool inverting_allowed = TRUE;
+
+    int i;
+    STRLEN orig_sv_cur = SvCUR(sv);
+
+    SV* invlist;            /* Inversion list we accumulate of code points that
+                               are unconditionally matched */
+    SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
+                               UTF-8 */
+    SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
+                             */
+    SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
+    SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
+                                       is UTF-8 */
+
+    SV* as_is_display;      /* The output string when we take the inputs
+                              literally */
+    SV* inverted_display;   /* The output string when we invert the inputs */
+
+    U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
+
+    bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
+                                                   to match? */
+    /* We are biased in favor of displaying things without them being inverted,
+     * as that is generally easier to understand */
+    const int bias = 5;
+
+    PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
+
+    /* Start off with whatever code points are passed in.  (We clone, so we
+     * don't change the caller's list) */
+    if (nonbitmap_invlist) {
+        assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
+        invlist = invlist_clone(nonbitmap_invlist);
+    }
+    else {  /* Worst case size is every other code point is matched */
+        invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
+    }
+
+    if (flags) {
+        if (OP(node) == ANYOFD) {
+
+            /* This flag indicates that the code points below 0x100 in the
+             * nonbitmap list are precisely the ones that match only when the
+             * target is UTF-8 (they should all be non-ASCII). */
+            if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
+            {
+                _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
+                _invlist_subtract(invlist, only_utf8, &invlist);
+            }
+
+            /* And this flag for matching all non-ASCII 0xFF and below */
+            if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
+            {
+                if (invert) {
+                    not_utf8 = _new_invlist(0);
+                }
+                else {
+                    not_utf8 = invlist_clone(PL_UpperLatin1);
+                }
+                inverting_allowed = FALSE;  /* XXX needs more work to be able
+                                               to allow this */
+            }
+        }
+        else if (OP(node) == ANYOFL) {
+
+            /* If either of these flags are set, what matches isn't
+             * determinable except during execution, so don't know enough here
+             * to invert */
+            if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
+                inverting_allowed = FALSE;
+            }
+
+            /* What the posix classes match also varies at runtime, so these
+             * will be output symbolically. */
+            if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
+                int i;
+
+                posixes = newSVpvs("");
+                for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
+                    if (ANYOF_POSIXL_TEST(node,i)) {
+                        sv_catpv(posixes, anyofs[i]);
+                    }
+                }
+            }
+        }
+    }
+
+    /* Accumulate the bit map into the unconditional match list */
+    for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
+        if (BITMAP_TEST(bitmap, i)) {
+            invlist = add_cp_to_invlist(invlist, i);
+        }
+    }
+
+    /* Make sure that the conditional match lists don't have anything in them
+     * that match unconditionally; otherwise the output is quite confusing.
+     * This could happen if the code that populates these misses some
+     * duplication. */
+    if (only_utf8) {
+        _invlist_subtract(only_utf8, invlist, &only_utf8);
+    }
+    if (not_utf8) {
+        _invlist_subtract(not_utf8, invlist, &not_utf8);
+    }
+
+    if (only_utf8_locale_invlist) {
+
+        /* Since this list is passed in, we have to make a copy before
+         * modifying it */
+        only_utf8_locale = invlist_clone(only_utf8_locale_invlist);
+
+        _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
+
+        /* And, it can get really weird for us to try outputting an inverted
+         * form of this list when it has things above the bitmap, so don't even
+         * try */
+        if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
+            inverting_allowed = FALSE;
+        }
+    }
+
+    /* Calculate what the output would be if we take the input as-is */
+    as_is_display = put_charclass_bitmap_innards_common(invlist,
+                                                    posixes,
+                                                    only_utf8,
+                                                    not_utf8,
+                                                    only_utf8_locale,
+                                                    invert);
+
+    /* If have to take the output as-is, just do that */
+    if (! inverting_allowed) {
+        sv_catsv(sv, as_is_display);
+    }
+    else { /* But otherwise, create the output again on the inverted input, and
+              use whichever version is shorter */
+
+        int inverted_bias, as_is_bias;
+
+        /* We will apply our bias to whichever of the the results doesn't have
+         * the '^' */
+        if (invert) {
+            invert = FALSE;
+            as_is_bias = bias;
+            inverted_bias = 0;
+        }
+        else {
+            invert = TRUE;
+            as_is_bias = 0;
+            inverted_bias = bias;
+        }
+
+        /* Now invert each of the lists that contribute to the output,
+         * excluding from the result things outside the possible range */
+
+        /* For the unconditional inversion list, we have to add in all the
+         * conditional code points, so that when inverted, they will be gone
+         * from it */
+        _invlist_union(only_utf8, invlist, &invlist);
+        _invlist_union(only_utf8_locale, invlist, &invlist);
+        _invlist_invert(invlist);
+        _invlist_intersection(invlist, PL_InBitmap, &invlist);
+
+        if (only_utf8) {
+            _invlist_invert(only_utf8);
+            _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
+        }
+
+        if (not_utf8) {
+            _invlist_invert(not_utf8);
+            _invlist_intersection(not_utf8, PL_UpperLatin1, &not_utf8);
+        }
+
+        if (only_utf8_locale) {
+            _invlist_invert(only_utf8_locale);
+            _invlist_intersection(only_utf8_locale,
+                                  PL_InBitmap,
+                                  &only_utf8_locale);
+        }
+
+        inverted_display = put_charclass_bitmap_innards_common(
+                                            invlist,
+                                            posixes,
+                                            only_utf8,
+                                            not_utf8,
+                                            only_utf8_locale, invert);
+
+        /* Use the shortest representation, taking into account our bias
+         * against showing it inverted */
+        if (SvCUR(inverted_display) + inverted_bias
+            < SvCUR(as_is_display) + as_is_bias)
+        {
+	    sv_catsv(sv, inverted_display);
+        }
+        else {
+	    sv_catsv(sv, as_is_display);
+        }
+
+        SvREFCNT_dec_NN(as_is_display);
+        SvREFCNT_dec_NN(inverted_display);
+    }
+
+    SvREFCNT_dec_NN(invlist);
+    SvREFCNT_dec(only_utf8);
+    SvREFCNT_dec(not_utf8);
+    SvREFCNT_dec(posixes);
+    SvREFCNT_dec(only_utf8_locale);
+
+    return SvCUR(sv) > orig_sv_cur;
 }
 
 #define CLEAR_OPTSTART \