Peek.t: Test that DeadCode doesn’t crash

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index a03c7d2..b5ed584 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -87,11 +87,10 @@ extern const struct regexp_engine my_reg_engine;
  #endif
  
  #include "dquote_static.c"
-#ifndef PERL_IN_XSUB_RE
-#  include "charclass_invlists.h"
-#endif
+#include "charclass_invlists.h"
  
  #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
+#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  
  #ifdef op
  #undef op
@@ -2583,12 +2582,12 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
   * one, and looks for problematic sequences of characters whose folds vs.
   * non-folds have sufficiently different lengths, that the optimizer would be
   * fooled into rejecting legitimate matches of them, and the trie construction
- * code can't cope with them.  The joining is only done if:
+ * code needs to handle specially.  The joining is only done if:
   * 1) there is room in the current conglomerated node to entirely contain the
   *    next one.
   * 2) they are the exact same node type
   *
- * The adjacent nodes actually may be separated by NOTHING kind nodes, and
+ * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
   * these get optimized out
   *
   * If there are problematic code sequences, *min_subtract is set to the delta
@@ -2601,26 +2600,27 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
   *
   * This is as good a place as any to discuss the design of handling these
   * problematic sequences.  It's been wrong in Perl for a very long time.  There
- * are three code points in Unicode whose folded lengths differ so much from
- * the un-folded lengths that it causes problems for the optimizer and trie
- * construction.  Why only these are problematic, and not others where lengths
- * also differ is something I (khw) do not understand.  New versions of Unicode
- * might add more such code points.  Hopefully the logic in fold_grind.t that
- * figures out what to test (in part by verifying that each size-combination
- * gets tested) will catch any that do come along, so they can be added to the
- * special handling below.  The chances of new ones are actually rather small,
- * as most, if not all, of the world's scripts that have casefolding have
- * already been encoded by Unicode.  Also, a number of Unicode's decisions were
- * made to allow compatibility with pre-existing standards, and almost all of
- * those have already been dealt with.  These would otherwise be the most
- * likely candidates for generating further tricky sequences.  In other words,
- * Unicode by itself is unlikely to add new ones unless it is for compatibility
- * with pre-existing standards, and there aren't many of those left.
+ * are three code points currently in Unicode whose folded lengths differ so
+ * much from the un-folded lengths that it causes problems for the optimizer
+ * and trie construction.  Why only these are problematic, and not others where
+ * lengths also differ is something I (khw) do not understand.  New versions of
+ * Unicode might add more such code points.  Hopefully the logic in
+ * fold_grind.t that figures out what to test (in part by verifying that each
+ * size-combination gets tested) will catch any that do come along, so they can
+ * be added to the special handling below.  The chances of new ones are
+ * actually rather small, as most, if not all, of the world's scripts that have
+ * casefolding have already been encoded by Unicode.  Also, a number of
+ * Unicode's decisions were made to allow compatibility with pre-existing
+ * standards, and almost all of those have already been dealt with.  These
+ * would otherwise be the most likely candidates for generating further tricky
+ * sequences.  In other words, Unicode by itself is unlikely to add new ones
+ * unless it is for compatibility with pre-existing standards, and there aren't
+ * many of those left.
   *
   * The previous designs for dealing with these involved assigning a special
   * node for them.  This approach doesn't work, as evidenced by this example:
   *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
- * Both these fold to "sss", but if the pattern is parsed to create a node of
+ * Both these fold to "sss", but if the pattern is parsed to create a node
   * that would match just the \xDF, it won't be able to handle the case where a
   * successful match would have to cross the node's boundary.  The new approach
   * that hopefully generally solves the problem generates an EXACTFU_SS node
@@ -2635,9 +2635,9 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
   *      problematic sequences.  This delta is used by the caller to adjust the
   *      min length of the match, and the delta between min and max, so that the
   *      optimizer doesn't reject these possibilities based on size constraints.
- * 2)   These sequences require special handling by the trie code, so it
- *      changes the joined node type to ops for the trie's benefit, those new
- *      ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
+ * 2)   These sequences require special handling by the trie code, so this code
+ *      changes the joined node type to special ops: EXACTFU_TRICKYFOLD and
+ *      EXACTFU_SS.
   * 3)   This is sufficient for the two Greek sequences (described below), but
   *      the one involving the Sharp s (\xDF) needs more.  The node type
   *      EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
@@ -2647,22 +2647,21 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
   *      itself with length changes, and so can be processed faster.  regexec.c
   *      takes advantage of this.  Generally, an EXACTFish node that is in UTF-8
   *      is pre-folded by regcomp.c.  This saves effort in regex matching.
- *      However, probably mostly for historical reasons, the pre-folding isn't
- *      done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
- *      nodes, as what they fold to isn't known until runtime.)  The fold
- *      possibilities for the non-UTF8 patterns are quite simple, except for
- *      the sharp s.  All the ones that don't involve a UTF-8 target string
- *      are members of a fold-pair, and arrays are set up for all of them
- *      that quickly find the other member of the pair.  It might actually
- *      be faster to pre-fold these, but it isn't currently done, except for
- *      the sharp s.  Code elsewhere in this file makes sure that it gets
- *      folded to 'ss', even if the pattern isn't UTF-8.  This avoids the
- *      issues described in the next item.
+ *      However, the pre-folding isn't done for non-UTF8 patterns because the
+ *      fold of the MICRO SIGN requires UTF-8.  Also what EXACTF and EXACTFL
+ *      nodes fold to isn't known until runtime.  The fold possibilities for
+ *      the non-UTF8 patterns are quite simple, except for the sharp s.  All
+ *      the ones that don't involve a UTF-8 target string are members of a
+ *      fold-pair, and arrays are set up for all of them so that the other
+ *      member of the pair can be found quickly.  Code elsewhere in this file
+ *      makes sure that in EXACTFU nodes, the sharp s gets folded to 'ss', even
+ *      if the pattern isn't UTF-8.  This avoids the issues described in the
+ *      next item.
   * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
   *      'ss' or not is not knowable at compile time.  It will match iff the
   *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
   *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
- *      it can't be folded to "ss" at compile time, unlike EXACTFU does as
+ *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
   *      described in item 3).  An assumption that the optimizer part of
   *      regexec.c (probably unwittingly) makes is that a character in the
   *      pattern corresponds to at most a single character in the target string.
@@ -2734,6 +2733,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
              const unsigned int oldl = STR_LEN(scan);
              regnode * const nnext = regnext(n);
  
+            /* XXX I (khw) kind of doubt that this works on platforms where
+             * U8_MAX is above 255 because of lots of other assumptions */
              if (oldl + STR_LEN(n) > U8_MAX)
                  break;
              
@@ -2884,7 +2885,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                       greek_sequence:
                         *min_subtract += 4;
  
-                       /* This can't currently be handled by trie's, so change
+                       /* This requires special handling by trie's, so change
                          * the node type to indicate this.  If EXACTFA and
                          * EXACTFL were ever to be handled by trie's, this
                          * would have to be changed.  If this node has already
@@ -2920,9 +2921,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                             /* EXACTF nodes need to know that the minimum
                              * length changed so that a sharp s in the string
                              * can match this ss in the pattern, but they
-                            * remain EXACTF nodes, as they are not trie'able,
-                            * so don't have to invent a new node type to
-                            * exclude them from the trie code */
+                             * remain EXACTF nodes, as they won't match this
+                             * unless the target string is in UTF-8, which we
+                             * don't know until runtime */
                             if (OP(scan) != EXACTF) {
                                 OP(scan) = EXACTFU_SS;
                             }
@@ -3662,7 +3663,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
                 l = utf8_length(s, s + l);
             }
-           else if (has_exactf_sharp_s) {
+           if (has_exactf_sharp_s) {
                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
             }
             min += l - min_subtract;
@@ -3959,6 +3960,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                       && !(data->flags & SF_HAS_EVAL)
                       && !deltanext     /* atom is fixed width */
                       && minnext != 0   /* CURLYM can't handle zero width */
+                      && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
                 ) {
                     /* XXXX How to optimize if data == 0? */
                     /* Optimize to a simpler form.  */
@@ -5205,6 +5207,50 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
  }
  
  
+STATIC bool
+S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
+{
+    /* This is the common code for setting up the floating and fixed length
+     * string data extracted from Perl_re_op_compile() below.  Returns a boolean
+     * indicating whether or not it succeeded */
+
+    I32 t,ml;
+
+    if (! (longest_length
+           || (eol /* Can't have SEOL and MULTI */
+               && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
+          )
+            /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
+        || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
+    {
+        return FALSE;
+    }
+
+    /* copy the information about the longest from the reg_scan_data
+        over to the program. */
+    if (SvUTF8(sv_longest)) {
+        *rx_utf8 = sv_longest;
+        *rx_substr = NULL;
+    } else {
+        *rx_substr = sv_longest;
+        *rx_utf8 = NULL;
+    }
+    /* end_shift is how many chars that must be matched that
+        follow this item. We calculate it ahead of time as once the
+        lookbehind offset is added in we lose the ability to correctly
+        calculate it.*/
+    ml = minlen ? *(minlen) : (I32)longest_length;
+    *rx_end_shift = ml - offset
+        - longest_length + (SvTAIL(sv_longest) != 0)
+        + lookbehind;
+
+    t = (eol/* Can't have SEOL and MULTI */
+         && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
+    fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
+
+    return TRUE;
+}
+
  /*
   * Perl_re_op_compile - the perl internal RE engine's function to compile a
   * regular expression into internal code.
@@ -5257,7 +5303,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
      dVAR;
      REGEXP *rx;
      struct regexp *r;
-    register regexp_internal *ri;
+    regexp_internal *ri;
      STRLEN plen;
      char  * VOL exp;
      char* xend;
@@ -6172,105 +6218,56 @@ reStudy:
         scan_commit(pRExC_state, &data,&minlen,0);
         SvREFCNT_dec(data.last_found);
  
-        /* Note that code very similar to this but for anchored string 
-           follows immediately below, changes may need to be made to both. 
-           Be careful. 
-         */
         longest_float_length = CHR_SVLEN(data.longest_float);
-       if (longest_float_length
-           || (data.flags & SF_FL_BEFORE_EOL
-               && (!(data.flags & SF_FL_BEFORE_MEOL)
-                   || (RExC_flags & RXf_PMf_MULTILINE)))) 
-        {
-            I32 t,ml;
  
-            /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
-           if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
-               || (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
-                   && data.offset_fixed == data.offset_float_min
-                   && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
-                   goto remove_float;          /* As in (a)+. */
-
-            /* copy the information about the longest float from the reg_scan_data
-               over to the program. */
-           if (SvUTF8(data.longest_float)) {
-               r->float_utf8 = data.longest_float;
-               r->float_substr = NULL;
-           } else {
-               r->float_substr = data.longest_float;
-               r->float_utf8 = NULL;
-           }
-           /* float_end_shift is how many chars that must be matched that 
-              follow this item. We calculate it ahead of time as once the
-              lookbehind offset is added in we lose the ability to correctly
-              calculate it.*/
-           ml = data.minlen_float ? *(data.minlen_float) 
-                                  : (I32)longest_float_length;
-           r->float_end_shift = ml - data.offset_float_min
-               - longest_float_length + (SvTAIL(data.longest_float) != 0)
-               + data.lookbehind_float;
+        if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
+                   && data.offset_fixed == data.offset_float_min
+                   && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
+            && S_setup_longest (aTHX_ pRExC_state,
+                                    data.longest_float,
+                                    &(r->float_utf8),
+                                    &(r->float_substr),
+                                    &(r->float_end_shift),
+                                    data.lookbehind_float,
+                                    data.offset_float_min,
+                                    data.minlen_float,
+                                    longest_float_length,
+                                    data.flags & SF_FL_BEFORE_EOL,
+                                    data.flags & SF_FL_BEFORE_MEOL))
+        {
             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
             r->float_max_offset = data.offset_float_max;
             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
                 r->float_max_offset -= data.lookbehind_float;
-           
-           t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
-                      && (!(data.flags & SF_FL_BEFORE_MEOL)
-                          || (RExC_flags & RXf_PMf_MULTILINE)));
-           fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
         }
         else {
-         remove_float:
             r->float_substr = r->float_utf8 = NULL;
             SvREFCNT_dec(data.longest_float);
             longest_float_length = 0;
         }
  
-        /* Note that code very similar to this but for floating string 
-           is immediately above, changes may need to be made to both. 
-           Be careful. 
-         */
         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
  
-        /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
-       if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
-           && (longest_fixed_length
-               || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
-                   && (!(data.flags & SF_FIX_BEFORE_MEOL)
-                       || (RExC_flags & RXf_PMf_MULTILINE)))) )
+        if (S_setup_longest (aTHX_ pRExC_state,
+                                data.longest_fixed,
+                                &(r->anchored_utf8),
+                                &(r->anchored_substr),
+                                &(r->anchored_end_shift),
+                                data.lookbehind_fixed,
+                                data.offset_fixed,
+                                data.minlen_fixed,
+                                longest_fixed_length,
+                                data.flags & SF_FIX_BEFORE_EOL,
+                                data.flags & SF_FIX_BEFORE_MEOL))
          {
-            I32 t,ml;
-
-            /* copy the information about the longest fixed 
-               from the reg_scan_data over to the program. */
-           if (SvUTF8(data.longest_fixed)) {
-               r->anchored_utf8 = data.longest_fixed;
-               r->anchored_substr = NULL;
-           } else {
-               r->anchored_substr = data.longest_fixed;
-               r->anchored_utf8 = NULL;
-           }
-           /* fixed_end_shift is how many chars that must be matched that 
-              follow this item. We calculate it ahead of time as once the
-              lookbehind offset is added in we lose the ability to correctly
-              calculate it.*/
-            ml = data.minlen_fixed ? *(data.minlen_fixed) 
-                                   : (I32)longest_fixed_length;
-            r->anchored_end_shift = ml - data.offset_fixed
-               - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
-               + data.lookbehind_fixed;
             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
-
-           t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
-                && (!(data.flags & SF_FIX_BEFORE_MEOL)
-                    || (RExC_flags & RXf_PMf_MULTILINE)));
-           fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
         }
         else {
             r->anchored_substr = r->anchored_utf8 = NULL;
             SvREFCNT_dec(data.longest_fixed);
             longest_fixed_length = 0;
         }
+
         if (ri->regstclass
             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
             ri->regstclass = NULL;
@@ -7314,8 +7311,10 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end
      }
  }
  
-STATIC IV
-S_invlist_search(pTHX_ SV* const invlist, const UV cp)
+#ifndef PERL_IN_XSUB_RE
+
+IV
+Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
  {
      /* Searches the inversion list for the entry that contains the input code
       * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
@@ -7326,7 +7325,7 @@ S_invlist_search(pTHX_ SV* const invlist, const UV cp)
      IV high = invlist_len(invlist);
      const UV * const array = invlist_array(invlist);
  
-    PERL_ARGS_ASSERT_INVLIST_SEARCH;
+    PERL_ARGS_ASSERT__INVLIST_SEARCH;
  
      /* If list is empty or the code point is before the first element, return
       * failure. */
@@ -7356,8 +7355,6 @@ S_invlist_search(pTHX_ SV* const invlist, const UV cp)
      return high - 1;
  }
  
-#ifndef PERL_IN_XSUB_RE
-
  void
  Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
  {
@@ -7382,7 +7379,7 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV
      array = invlist_array(invlist);
  
      /* Find which element it is */
-    i = invlist_search(invlist, start);
+    i = _invlist_search(invlist, start);
  
      /* We populate from <start> to <end> */
      while (current < end) {
@@ -7945,12 +7942,12 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
  
  #endif
  
-STATIC bool
+PERL_STATIC_INLINE bool
  S__invlist_contains_cp(pTHX_ SV* const invlist, const UV cp)
  {
      /* Does <invlist> contain code point <cp> as part of the set? */
  
-    IV index = invlist_search(invlist, cp);
+    IV index = _invlist_search(invlist, cp);
  
      PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP;
  
@@ -8302,11 +8299,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
      /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
  {
      dVAR;
-    register regnode *ret;             /* Will be the head of the group. */
-    register regnode *br;
-    register regnode *lastbr;
-    register regnode *ender = NULL;
-    register I32 parno = 0;
+    regnode *ret;              /* Will be the head of the group. */
+    regnode *br;
+    regnode *lastbr;
+    regnode *ender = NULL;
+    I32 parno = 0;
      I32 flags;
      U32 oregflags = RExC_flags;
      bool have_branch = 0;
@@ -9312,9 +9309,9 @@ STATIC regnode *
  S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
  {
      dVAR;
-    register regnode *ret;
-    register regnode *chain = NULL;
-    register regnode *latest;
+    regnode *ret;
+    regnode *chain = NULL;
+    regnode *latest;
      I32 flags = 0, c = 0;
      GET_RE_DEBUG_FLAGS_DECL;
  
@@ -9385,9 +9382,9 @@ STATIC regnode *
  S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  {
      dVAR;
-    register regnode *ret;
-    register char op;
-    register char *next;
+    regnode *ret;
+    char op;
+    char *next;
      I32 flags;
      const char * const origparse = RExC_parse;
      I32 min;
@@ -9576,86 +9573,95 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
      return(ret);
  }
  
-
-/* reg_namedseq(pRExC_state,UVp, UV depth)
+STATIC bool
+S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
+{
     
-   This is expected to be called by a parser routine that has 
-   recognized '\N' and needs to handle the rest. RExC_parse is
-   expected to point at the first char following the N at the time
-   of the call.
+ /* This is expected to be called by a parser routine that has recognized '\N'
+   and needs to handle the rest. RExC_parse is expected to point at the first
+   char following the N at the time of the call.  On successful return,
+   RExC_parse has been updated to point to just after the sequence identified
+   by this routine, and <*flagp> has been updated.
  
-   The \N may be inside (indicated by valuep not being NULL) or outside a
+   The \N may be inside (indicated by the boolean <in_char_class>) or outside a
     character class.
  
     \N may begin either a named sequence, or if outside a character class, mean
     to match a non-newline.  For non single-quoted regexes, the tokenizer has
-   attempted to decide which, and in the case of a named sequence converted it
+   attempted to decide which, and in the case of a named sequence, converted it
     into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
     where c1... are the characters in the sequence.  For single-quoted regexes,
     the tokenizer passes the \N sequence through unchanged; this code will not
-   attempt to determine this nor expand those.  The net effect is that if the
-   beginning of the passed-in pattern isn't '{U+' or there is no '}', it
-   signals that this \N occurrence means to match a non-newline.
-   
+   attempt to determine this nor expand those, instead raising a syntax error.
+   The net effect is that if the beginning of the passed-in pattern isn't '{U+'
+   or there is no '}', it signals that this \N occurrence means to match a
+   non-newline.
+
     Only the \N{U+...} form should occur in a character class, for the same
     reason that '.' inside a character class means to just match a period: it
     just doesn't make sense.
-   
-   If valuep is non-null then it is assumed that we are parsing inside 
-   of a charclass definition and the first codepoint in the resolved
-   string is returned via *valuep and the routine will return NULL. 
-   In this mode if a multichar string is returned from the charnames 
-   handler, a warning will be issued, and only the first char in the 
-   sequence will be examined. If the string returned is zero length
-   then the value of *valuep is undefined and NON-NULL will 
-   be returned to indicate failure. (This will NOT be a valid pointer 
-   to a regnode.)
-   
-   If valuep is null then it is assumed that we are parsing normal text and a
-   new EXACT node is inserted into the program containing the resolved string,
-   and a pointer to the new node is returned.  But if the string is zero length
-   a NOTHING node is emitted instead.
  
-   On success RExC_parse is set to the char following the endbrace.
-   Parsing failures will generate a fatal error via vFAIL(...)
+   The function raises an error (via vFAIL), and doesn't return for various
+   syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
+   success; it returns FALSE otherwise.
+
+   If <valuep> is non-null, it means the caller can accept an input sequence
+   consisting of just a single code point; <*valuep> is set to that value
+   if the input is such.
+
+   If <node_p> is non-null it signifies that the caller can accept any other
+   legal sequence (i.e., one that isn't just a single code point).  <*node_p>
+   is set as follows:
+    1) \N means not-a-NL: points to a newly created REG_ANY node;
+    2) \N{}:              points to a new NOTHING node;
+    3) otherwise:         points to a new EXACT node containing the resolved
+                          string.
+   Note that FALSE is returned for single code point sequences if <valuep> is
+   null.
   */
-STATIC regnode *
-S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
-{
+
      char * endbrace;    /* '}' following the name */
-    regnode *ret = NULL;
      char* p;
+    char *endchar;     /* Points to '.' or '}' ending cur char in the input
+                           stream */
+    bool has_multiple_chars; /* true if the input stream contains a sequence of
+                                more than one character */
  
      GET_RE_DEBUG_FLAGS_DECL;
   
-    PERL_ARGS_ASSERT_REG_NAMEDSEQ;
+    PERL_ARGS_ASSERT_GROK_BSLASH_N;
  
      GET_RE_DEBUG_FLAGS;
  
+    assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
+
      /* The [^\n] meaning of \N ignores spaces and comments under the /x
       * modifier.  The other meaning does not */
      p = (RExC_flags & RXf_PMf_EXTENDED)
         ? regwhite( pRExC_state, RExC_parse )
         : RExC_parse;
-   
+
      /* Disambiguate between \N meaning a named character versus \N meaning
       * [^\n].  The former is assumed when it can't be the latter. */
      if (*p != '{' || regcurly(p)) {
         RExC_parse = p;
-       if (valuep) {
+       if (! node_p) {
             /* no bare \N in a charclass */
-           vFAIL("\\N in a character class must be a named character: \\N{...}");
-       }
+            if (in_char_class) {
+                vFAIL("\\N in a character class must be a named character: \\N{...}");
+            }
+            return FALSE;
+        }
         nextchar(pRExC_state);
-       ret = reg_node(pRExC_state, REG_ANY);
+       *node_p = reg_node(pRExC_state, REG_ANY);
         *flagp |= HASWIDTH|SIMPLE;
         RExC_naughty++;
         RExC_parse--;
-        Set_Node_Length(ret, 1); /* MJD */
-       return ret;
+        Set_Node_Length(*node_p, 1); /* MJD */
+       return TRUE;
      }
  
-    /* Here, we have decided it should be a named sequence */
+    /* Here, we have decided it should be a named character or sequence */
  
      /* The test above made sure that the next real character is a '{', but
       * under the /x modifier, it could be separated by space (or a comment and
@@ -9677,44 +9683,48 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept
      }
  
      if (endbrace == RExC_parse) {   /* empty: \N{} */
-       if (! valuep) {
-           RExC_parse = endbrace + 1;  
-           return reg_node(pRExC_state,NOTHING);
-       }
-
-       if (SIZE_ONLY) {
-           ckWARNreg(RExC_parse,
-                   "Ignoring zero length \\N{} in character class"
-           );
-           RExC_parse = endbrace + 1;  
+        bool ret = TRUE;
+       if (node_p) {
+           *node_p = reg_node(pRExC_state,NOTHING);
+       }
+        else if (in_char_class) {
+            if (SIZE_ONLY && in_char_class) {
+                ckWARNreg(RExC_parse,
+                        "Ignoring zero length \\N{} in character class"
+                );
+            }
+            ret = FALSE;
         }
-       *valuep = 0;
-       return (regnode *) &RExC_parse; /* Invalid regnode pointer */
+        else {
+            return FALSE;
+        }
+        nextchar(pRExC_state);
+        return ret;
      }
  
-    REQUIRE_UTF8;      /* named sequences imply Unicode semantics */
+    RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
      RExC_parse += 2;   /* Skip past the 'U+' */
  
-    if (valuep) {   /* In a bracketed char class */
-       /* We only pay attention to the first char of 
-       multichar strings being returned. I kinda wonder
+    endchar = RExC_parse + strcspn(RExC_parse, ".}");
+
+    /* Code points are separated by dots.  If none, there is only one code
+     * point, and it is terminated by the brace */
+    has_multiple_chars = (endchar < endbrace);
+
+    if (valuep && (! has_multiple_chars || in_char_class)) {
+       /* We only pay attention to the first char of
+        multichar strings being returned in char classes. I kinda wonder
         if this makes sense as it does change the behaviour
         from earlier versions, OTOH that behaviour was broken
         as well. XXX Solution is to recharacterize as
         [rest-of-class]|multi1|multi2... */
  
-       STRLEN length_of_hex;
-       I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+       STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
+       I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
             | PERL_SCAN_DISALLOW_PREFIX
             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
-    
-       char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
-       if (endchar < endbrace) {
-           ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
-       }
  
-       length_of_hex = (STRLEN)(endchar - RExC_parse);
-       *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
+       *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
  
         /* The tokenizer should have guaranteed validity, but it's possible to
          * bypass it by using single quoting, so check */
@@ -9726,16 +9736,26 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept
                             ? UTF8SKIP(RExC_parse)
                             : 1;
             /* Guard against malformed utf8 */
-           if (RExC_parse >= endchar) RExC_parse = endchar;
+           if (RExC_parse >= endchar) {
+                RExC_parse = endchar;
+            }
             vFAIL("Invalid hexadecimal number in \\N{U+...}");
-       }    
+       }
  
-       RExC_parse = endbrace + 1;
-       if (endchar == endbrace) return NULL;
+        if (in_char_class && has_multiple_chars) {
+           ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
+        }
+        RExC_parse = endbrace + 1;
+    }
+    else if (! node_p || ! has_multiple_chars) {
  
-        ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
+        /* Here, the input is legal, but not according to the caller's
+         * options.  We fail without advancing the parse, so that the
+         * caller can try again */
+        RExC_parse = p;
+        return FALSE;
      }
-    else {     /* Not a char class */
+    else {
  
         /* What is done here is to convert this to a sub-pattern of the form
          * (?:\x{char1}\x{char2}...)
@@ -9748,16 +9768,11 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept
  
         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
         STRLEN len;
-       char *endchar;      /* Points to '.' or '}' ending cur char in the input
-                              stream */
         char *orig_end = RExC_end;
+        I32 flags;
  
         while (RExC_parse < endbrace) {
  
-           /* Code points are separated by dots.  If none, there is only one
-            * code point, and is terminated by the brace */
-           endchar = RExC_parse + strcspn(RExC_parse, ".}");
-
             /* Convert to notation the rest of the code understands */
             sv_catpv(substitute_parse, "\\x{");
             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
@@ -9765,6 +9780,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept
  
             /* Point to the beginning of the next character in the sequence. */
             RExC_parse = endchar + 1;
+           endchar = RExC_parse + strcspn(RExC_parse, ".}");
         }
         sv_catpv(substitute_parse, ")");
  
@@ -9779,16 +9795,17 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept
         /* The values are Unicode, and therefore not subject to recoding */
         RExC_override_recoding = 1;
  
-       ret = reg(pRExC_state, 1, flagp, depth+1);
+       *node_p = reg(pRExC_state, 1, &flags, depth+1);
+       *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
  
         RExC_parse = endbrace;
         RExC_end = orig_end;
         RExC_override_recoding = 0;
  
-       nextchar(pRExC_state);
+        nextchar(pRExC_state);
      }
  
-    return ret;
+    return TRUE;
  }
  
  
@@ -9846,15 +9863,23 @@ S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
  }
  
  PERL_STATIC_INLINE void
-S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, STRLEN len, UV code_point)
+S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
  {
-    /* This knows the details about sizing an EXACTish node, and potentially
-     * populating it with a single character.  If <len> is non-zero, it assumes
-     * that the node has already been populated, and just does the sizing,
-     * ignoring <code_point>.  Otherwise it looks at <code_point> and
-     * calculates what <len> should be.  In pass 1, it sizes the node
-     * appropriately.  In pass 2, it additionally will populate the node's
-     * STRING with <code_point>, if <len> is 0.
+    /* This knows the details about sizing an EXACTish node, setting flags for
+     * it (by setting <*flagp>), and potentially populating it with a single
+     * character.
+     *
+     * If <len> is non-zero, this function assumes that the node has already
+     * been populated, and just does the sizing.  In this case <code_point>
+     * should be the final code point that has already been placed into the
+     * node.  This value will be ignored except that under some circumstances
+     * <*flagp> is set based on it.
+     *
+     * If <len> is zero, the function assumes that the node is to contain only
+     * the single character given by <code_point> and calculates what <len>
+     * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
+     * additionally will populate the node's STRING with <code_point>, if <len>
+     * is 0.  In both cases <*flagp> is appropriately set
       *
       * It knows that under FOLD, UTF characters and the Latin Sharp S must be
       * folded (the latter only when the rules indicate it can match 'ss') */
@@ -9899,6 +9924,10 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, STR
              Copy((char *) character, STRING(node), len, char);
          }
      }
+
+    *flagp |= HASWIDTH;
+    if (len == 1 && UNI_IS_INVARIANT(code_point))
+        *flagp |= SIMPLE;
  }
  
  /*
@@ -9965,7 +9994,7 @@ STATIC regnode *
  S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  {
      dVAR;
-    register regnode *ret = NULL;
+    regnode *ret = NULL;
      I32 flags;
      char *parse_start = RExC_parse;
      U8 op;
@@ -10013,13 +10042,12 @@ tryagain:
      case '[':
      {
         char * const oregcomp_parse = ++RExC_parse;
-        ret = regclass(pRExC_state,depth+1);
+        ret = regclass(pRExC_state, flagp,depth+1);
         if (*RExC_parse != ']') {
             RExC_parse = oregcomp_parse;
             vFAIL("Unmatched [");
         }
         nextchar(pRExC_state);
-       *flagp |= HASWIDTH|SIMPLE;
          Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
         break;
      }
@@ -10230,7 +10258,7 @@ tryagain:
                 }
                 RExC_parse--;
  
-                ret = regclass(pRExC_state,depth+1);
+                ret = regclass(pRExC_state, flagp,depth+1);
  
                 RExC_end = oldregxend;
                 RExC_parse--;
@@ -10238,16 +10266,24 @@ tryagain:
                 Set_Node_Offset(ret, parse_start + 2);
                 Set_Node_Cur_Length(ret);
                 nextchar(pRExC_state);
-               *flagp |= HASWIDTH|SIMPLE;
             }
             break;
          case 'N': 
-            /* Handle \N and \N{NAME} here and not below because it can be
-            multicharacter. join_exact() will join them up later on. 
-            Also this makes sure that things like /\N{BLAH}+/ and 
-            \N{BLAH} being multi char Just Happen. dmq*/
+            /* Handle \N and \N{NAME} with multiple code points here and not
+             * below because it can be multicharacter. join_exact() will join
+             * them up later on.  Also this makes sure that things like
+             * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
+             * The options to the grok function call cause it to fail if the
+             * sequence is just a single code point.  We then go treat it as
+             * just another character in the current EXACT node, and hence it
+             * gets uniform treatment with all the other characters.  The
+             * special treatment for quantifiers is not needed for such single
+             * character sequences */
              ++RExC_parse;
-            ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
+            if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
+                RExC_parse--;
+                goto defchar;
+            }
              break;
         case 'k':    /* Handle \k<NAME> and \k'NAME' */
         parse_named_seq:
@@ -10392,19 +10428,18 @@ tryagain:
             RExC_parse++;
  
         defchar: {
-           register STRLEN len;
-           register UV ender;
-           register char *p;
+           STRLEN len = 0;
+           UV ender;
+           char *p;
             char *s;
  #define MAX_NODE_STRING_SIZE 127
-           char foldbuf[MAX_NODE_STRING_SIZE];
+           char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
+           char *s0;
+           U8 upper_parse = MAX_NODE_STRING_SIZE;
             STRLEN foldlen;
              U8 node_type;
              bool next_is_quantifier;
-
-           /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node?  If so,
-            * it is folded to 'ss' even if not utf8 */
-           bool is_exactfu_sharp_s;
+            char * oldp;
  
             ender = 0;
              node_type = compute_EXACTish(pRExC_state);
@@ -10414,6 +10449,10 @@ tryagain:
               * actual node, as the node doesn't exist yet */
             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
  
+            s0 = s;
+
+       reparse:
+
             /* XXX The node can hold up to 255 bytes, yet this only goes to
               * 127.  I (khw) do not know why.  Keeping it somewhat less than
               * 255 allows us to not have to worry about overflow due to
@@ -10432,11 +10471,11 @@ tryagain:
               * could back off to end with only a code point that isn't such a
               * non-final, but it is possible for there not to be any in the
               * entire node. */
-           for (len = 0, p = RExC_parse - 1;
-                len < MAX_NODE_STRING_SIZE && p < RExC_end;
+           for (p = RExC_parse - 1;
+                len < upper_parse && p < RExC_end;
                  len++)
             {
-               char * const oldp = p;
+               oldp = p;
  
                 if (RExC_flags & RXf_PMf_EXTENDED)
                     p = regwhite( pRExC_state, p );
@@ -10472,7 +10511,6 @@ tryagain:
                     case 'g': case 'G':   /* generic-backref, pos assertion */
                     case 'h': case 'H':   /* HORIZWS */
                     case 'k': case 'K':   /* named backref, keep marker */
-                   case 'N':             /* named char sequence */
                     case 'p': case 'P':   /* Unicode property */
                               case 'R':   /* LNBREAK */
                     case 's': case 'S':   /* space class */
@@ -10490,6 +10528,22 @@ tryagain:
                         ender = '\n';
                         p++;
                         break;
+                   case 'N': /* Handle a single-code point named character. */
+                        /* The options cause it to fail if this is a multiple
+                         * code point sequence.  Those are handled in the
+                         * switch() above */
+                        RExC_parse = p + 1;
+                        if (! grok_bslash_N(pRExC_state, NULL, &ender,
+                                            flagp, depth, FALSE))
+                        {
+                            RExC_parse = p = oldp;
+                            goto loopdone;
+                        }
+                        p = RExC_parse;
+                        if (ender > 0xff) {
+                            REQUIRE_UTF8;
+                        }
+                        break;
                     case 'r':
                         ender = '\r';
                         p++;
@@ -10635,8 +10689,10 @@ tryagain:
                     break;
                 } /* End of switch on the literal */
  
-                is_exactfu_sharp_s = (node_type == EXACTFU
-                                     && ender == LATIN_SMALL_LETTER_SHARP_S);
+               /* Here, have looked at the literal character and <ender>
+                * contains its ordinal, <p> points to the character after it
+                */
+
                 if ( RExC_flags & RXf_PMf_EXTENDED)
                     p = regwhite( pRExC_state, p );
  
@@ -10654,32 +10710,38 @@ tryagain:
                  }
  
                 if (FOLD) {
-                   if (UTF || is_exactfu_sharp_s) {
-
-                   /* Prime the casefolded buffer.  Locale rules, which apply
-                    * only to code points < 256, aren't known until execution,
-                    * so for them, just output the original character using
-                     * utf8.  If we start to fold non-UTF patterns, be sure to
-                     * update join_exact() */
-                   if (LOC && ender < 256) {
-                       if (UNI_IS_INVARIANT(ender)) {
-                           *s = (U8) ender;
-                           foldlen = 1;
-                       } else {
-                           *s = UTF8_TWO_BYTE_HI(ender);
-                           *(s + 1) = UTF8_TWO_BYTE_LO(ender);
-                           foldlen = 2;
-                       }
-                   }
-                   else {
-                       ender = _to_uni_fold_flags(ender, (U8 *) s, &foldlen,
-                                FOLD_FLAGS_FULL
-                                 | ((LOC) ?  FOLD_FLAGS_LOCALE
-                                          : (ASCII_FOLD_RESTRICTED)
-                                            ? FOLD_FLAGS_NOMIX_ASCII
-                                            : 0)
-                            );
-                   }
+                    if (UTF
+                            /* See comments for join_exact() as to why we fold
+                             * this non-UTF at compile time */
+                        || (node_type == EXACTFU
+                            && ender == LATIN_SMALL_LETTER_SHARP_S))
+                    {
+
+
+                        /* Prime the casefolded buffer.  Locale rules, which
+                         * apply only to code points < 256, aren't known until
+                         * execution, so for them, just output the original
+                         * character using utf8.  If we start to fold non-UTF
+                         * patterns, be sure to update join_exact() */
+                        if (LOC && ender < 256) {
+                            if (UNI_IS_INVARIANT(ender)) {
+                                *s = (U8) ender;
+                                foldlen = 1;
+                            } else {
+                                *s = UTF8_TWO_BYTE_HI(ender);
+                                *(s + 1) = UTF8_TWO_BYTE_LO(ender);
+                                foldlen = 2;
+                            }
+                        }
+                        else {
+                            ender = _to_uni_fold_flags(ender, (U8 *) s, &foldlen,
+                                    FOLD_FLAGS_FULL
+                                     | ((LOC) ?  FOLD_FLAGS_LOCALE
+                                              : (ASCII_FOLD_RESTRICTED)
+                                                ? FOLD_FLAGS_NOMIX_ASCII
+                                                : 0)
+                                );
+                        }
                         s += foldlen;
  
                         /* The loop increments <len> each time, as all but this
@@ -10689,20 +10751,20 @@ tryagain:
                          * subtract one to cancel out the increment that
                          * follows */
                         len += foldlen - 1;
-               }
-               else {
-                   REGC((char)ender, s++);
-               }
+                    }
+                    else {
+                        *(s++) = ender;
+                    }
                 }
                 else if (UTF) {
-                         const STRLEN unilen = reguni(pRExC_state, ender, s);
-                         if (unilen > 0) {
-                              s   += unilen;
-                              len += unilen;
-                         }
+                    const STRLEN unilen = reguni(pRExC_state, ender, s);
+                    if (unilen > 0) {
+                       s   += unilen;
+                       len += unilen;
+                    }
  
                     /* See comment just above for - 1 */
-                    len--;
+                   len--;
                 }
                 else {
                     REGC((char)ender, s++);
@@ -10717,9 +10779,180 @@ tryagain:
                      len++;
                      goto loopdone;
                 }
-           }
+
+           } /* End of loop through literal characters */
+
+            /* Here we have either exhausted the input or run out of room in
+             * the node.  (If we encountered a character that can't be in the
+             * node, transfer is made directly to <loopdone>, and so we
+             * wouldn't have fallen off the end of the loop.)  In the latter
+             * case, we artificially have to split the node into two, because
+             * we just don't have enough space to hold everything.  This
+             * creates a problem if the final character participates in a
+             * multi-character fold in the non-final position, as a match that
+             * should have occurred won't, due to the way nodes are matched,
+             * and our artificial boundary.  So back off until we find a non-
+             * problematic character -- one that isn't at the beginning or
+             * middle of such a fold.  (Either it doesn't participate in any
+             * folds, or appears only in the final position of all the folds it
+             * does participate in.)  A better solution with far fewer false
+             * positives, and that would fill the nodes more completely, would
+             * be to actually have available all the multi-character folds to
+             * test against, and to back-off only far enough to be sure that
+             * this node isn't ending with a partial one.  <upper_parse> is set
+             * further below (if we need to reparse the node) to include just
+             * up through that final non-problematic character that this code
+             * identifies, so when it is set to less than the full node, we can
+             * skip the rest of this */
+            if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
+
+                const STRLEN full_len = len;
+
+               assert(len >= MAX_NODE_STRING_SIZE);
+
+                /* Here, <s> points just past the final byte of the final
+                 * character.  Look backwards through the string until we
+                 * find a non-problematic character */
+
+               if (! UTF) {
+
+                    /* These two have no multi-char folds to non-UTF characters
+                     */
+                    if (ASCII_FOLD_RESTRICTED || LOC) {
+                        goto loopdone;
+                    }
+
+                    while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
+                    len = s - s0 + 1;
+               }
+                else {
+                    if (!  PL_NonL1NonFinalFold) {
+                        PL_NonL1NonFinalFold = _new_invlist_C_array(
+                                        NonL1_Perl_Non_Final_Folds_invlist);
+                    }
+
+                    /* Point to the first byte of the final character */
+                    s = (char *) utf8_hop((U8 *) s, -1);
+
+                    while (s >= s0) {   /* Search backwards until find
+                                           non-problematic char */
+                        if (UTF8_IS_INVARIANT(*s)) {
+
+                            /* There are no ascii characters that participate
+                             * in multi-char folds under /aa.  In EBCDIC, the
+                             * non-ascii invariants are all control characters,
+                             * so don't ever participate in any folds. */
+                            if (ASCII_FOLD_RESTRICTED
+                                || ! IS_NON_FINAL_FOLD(*s))
+                            {
+                                break;
+                            }
+                        }
+                        else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+
+                            /* No Latin1 characters participate in multi-char
+                             * folds under /l */
+                            if (LOC
+                                || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
+                                                                *s, *(s+1))))
+                            {
+                                break;
+                            }
+                        }
+                        else if (! _invlist_contains_cp(
+                                        PL_NonL1NonFinalFold,
+                                        valid_utf8_to_uvchr((U8 *) s, NULL)))
+                        {
+                            break;
+                        }
+
+                        /* Here, the current character is problematic in that
+                         * it does occur in the non-final position of some
+                         * fold, so try the character before it, but have to
+                         * special case the very first byte in the string, so
+                         * we don't read outside the string */
+                        s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
+                    } /* End of loop backwards through the string */
+
+                    /* If there were only problematic characters in the string,
+                     * <s> will point to before s0, in which case the length
+                     * should be 0, otherwise include the length of the
+                     * non-problematic character just found */
+                    len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
+               }
+
+                /* Here, have found the final character, if any, that is
+                 * non-problematic as far as ending the node without splitting
+                 * it across a potential multi-char fold.  <len> contains the
+                 * number of bytes in the node up-to and including that
+                 * character, or is 0 if there is no such character, meaning
+                 * the whole node contains only problematic characters.  In
+                 * this case, give up and just take the node as-is.  We can't
+                 * do any better */
+                if (len == 0) {
+                    len = full_len;
+                } else {
+
+                    /* Here, the node does contain some characters that aren't
+                     * problematic.  If one such is the final character in the
+                     * node, we are done */
+                    if (len == full_len) {
+                        goto loopdone;
+                    }
+                    else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
+
+                        /* If the final character is problematic, but the
+                         * penultimate is not, back-off that last character to
+                         * later start a new node with it */
+                        p = oldp;
+                        goto loopdone;
+                    }
+
+                    /* Here, the final non-problematic character is earlier
+                     * in the input than the penultimate character.  What we do
+                     * is reparse from the beginning, going up only as far as
+                     * this final ok one, thus guaranteeing that the node ends
+                     * in an acceptable character.  The reason we reparse is
+                     * that we know how far in the character is, but we don't
+                     * know how to correlate its position with the input parse.
+                     * An alternate implementation would be to build that
+                     * correlation as we go along during the original parse,
+                     * but that would entail extra work for every node, whereas
+                     * this code gets executed only when the string is too
+                     * large for the node, and the final two characters are
+                     * problematic, an infrequent occurrence.  Yet another
+                     * possible strategy would be to save the tail of the
+                     * string, and the next time regatom is called, initialize
+                     * with that.  The problem with this is that unless you
+                     * back off one more character, you won't be guaranteed
+                     * regatom will get called again, unless regbranch,
+                     * regpiece ... are also changed.  If you do back off that
+                     * extra character, so that there is input guaranteed to
+                     * force calling regatom, you can't handle the case where
+                     * just the first character in the node is acceptable.  I
+                     * (khw) decided to try this method which doesn't have that
+                     * pitfall; if performance issues are found, we can do a
+                     * combination of the current approach plus that one */
+                    upper_parse = len;
+                    len = 0;
+                    s = s0;
+                    goto reparse;
+                }
+           }   /* End of verifying node ends with an appropriate char */
+
         loopdone:   /* Jumped to when encounters something that shouldn't be in
                        the node */
+
+            /* I (khw) don't know if you can get here with zero length, but the
+             * old code handled this situation by creating a zero-length EXACT
+             * node.  Might as well be NOTHING instead */
+            if (len == 0) {
+                OP(ret) = NOTHING;
+            }
+            else{
+                alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
+            }
+
             RExC_parse = p - 1;
              Set_Node_Cur_Length(ret); /* MJD */
             nextchar(pRExC_state);
@@ -10729,15 +10962,10 @@ tryagain:
                 if (iv < 0)
                     vFAIL("Internal disaster");
             }
-           if (len > 0)
-               *flagp |= HASWIDTH;
-           if (len == 1 && UNI_IS_INVARIANT(ender))
-               *flagp |= SIMPLE;
  
-            alloc_maybe_populate_EXACT(pRExC_state, ret, len, 0);
-       }
+       } /* End of label 'defchar:' */
         break;
-    }
+    } /* End of giant switch on input character */
  
      return(ret);
  }
@@ -11101,14 +11329,14 @@ S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
     above 255, a range list is used */
  
  STATIC regnode *
-S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
+S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  {
      dVAR;
-    register UV nextvalue;
-    register UV prevvalue = OOB_UNICODE;
-    register IV range = 0;
-    UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
-    register regnode *ret;
+    UV nextvalue;
+    UV prevvalue = OOB_UNICODE;
+    IV range = 0;
+    UV value = 0;
+    regnode *ret;
      STRLEN numlen;
      IV namedclass = OOB_NAMEDCLASS;
      char *rangebegin = NULL;
@@ -11272,11 +11500,11 @@ parseit:
                      if this makes sense as it does change the behaviour
                      from earlier versions, OTOH that behaviour was broken
                      as well. */
-                    UV v; /* value is register so we cant & it /grrr */
-                    if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
+                    if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
+                                      TRUE /* => charclass */))
+                    {
                          goto parseit;
                      }
-                    value= v; 
                  }
                  break;
             case 'p':
@@ -11778,8 +12006,8 @@ parseit:
                     }
                      if (!SIZE_ONLY) {
                          cp_list = add_cp_to_invlist(cp_list, '-');
-                        element_count++;
                      }
+                    element_count++;
                 } else
                     range = 1;  /* yeah, it's a range! */
                 continue;       /* but do it the next time */
@@ -11891,6 +12119,7 @@ parseit:
                      if (invert) {
                          op += NALNUM - ALNUM;
                      }
+                    *flagp |= HASWIDTH|SIMPLE;
                      break;
  
                  /* The second group doesn't depend of the charset modifiers.
@@ -11901,6 +12130,7 @@ parseit:
                  case ANYOF_HORIZWS:
                    is_horizws:
                      op = (invert) ? NHORIZWS : HORIZWS;
+                    *flagp |= HASWIDTH|SIMPLE;
                      break;
  
                  case ANYOF_NVERTWS:
@@ -11908,6 +12138,7 @@ parseit:
                      /* FALLTHROUGH */
                  case ANYOF_VERTWS:
                      op = (invert) ? NVERTWS : VERTWS;
+                    *flagp |= HASWIDTH|SIMPLE;
                      break;
  
                  case ANYOF_MAX:
@@ -11947,6 +12178,8 @@ parseit:
              if (invert) {
                  if (! LOC && value == '\n') {
                      op = REG_ANY; /* Optimize [^\n] */
+                    *flagp |= HASWIDTH|SIMPLE;
+                    RExC_naughty++;
                  }
              }
              else if (value < 256 || UTF) {
@@ -11960,6 +12193,7 @@ parseit:
              if (prevvalue == '0') {
                  if (value == '9') {
                      op = (invert) ? NDIGITA : DIGITA;
+                    *flagp |= HASWIDTH|SIMPLE;
                  }
              }
          }
@@ -11993,9 +12227,10 @@ parseit:
                  if (! SIZE_ONLY) {
                      FLAGS(ret) = arg;
                  }
+                *flagp |= HASWIDTH|SIMPLE;
              }
              else if (PL_regkind[op] == EXACT) {
-                alloc_maybe_populate_EXACT(pRExC_state, ret, 0, value);
+                alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
              }
  
              RExC_parse = (char *) cur_parse;
@@ -12007,7 +12242,7 @@ parseit:
  
      if (SIZE_ONLY)
          return ret;
-    /****** !SIZE_ONLY AFTER HERE *********/
+    /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
  
      /* If folding, we calculate all characters that could fold to or from the
       * ones already on the list */
@@ -12463,6 +12698,7 @@ parseit:
               * it doesn't match anything.  (perluniprops.pod notes such
               * properties) */
              op = OPFAIL;
+            *flagp |= HASWIDTH|SIMPLE;
          }
          else if (start == end) {    /* The range is a single code point */
              if (! invlist_iternext(cp_list, &start, &end)
@@ -12528,12 +12764,16 @@ parseit:
          else if (start == 0) {
              if (end == UV_MAX) {
                  op = SANY;
+                *flagp |= HASWIDTH|SIMPLE;
+                RExC_naughty++;
              }
              else if (end == '\n' - 1
                      && invlist_iternext(cp_list, &start, &end)
                      && start == '\n' + 1 && end == UV_MAX)
              {
                  op = REG_ANY;
+                *flagp |= HASWIDTH|SIMPLE;
+                RExC_naughty++;
              }
          }
  
@@ -12546,7 +12786,7 @@ parseit:
              RExC_parse = (char *)cur_parse;
  
              if (PL_regkind[op] == EXACT) {
-                alloc_maybe_populate_EXACT(pRExC_state, ret, 0, value);
+                alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
              }
  
              SvREFCNT_dec(listsv);
@@ -12687,6 +12927,8 @@ parseit:
         RExC_rxi->data->data[n] = (void*)rv;
         ARG_SET(ret, n);
      }
+
+    *flagp |= HASWIDTH|SIMPLE;
      return ret;
  }
  #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
@@ -12780,7 +13022,7 @@ STATIC regnode *                        /* Location. */
  S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
  {
      dVAR;
-    register regnode *ptr;
+    regnode *ptr;
      regnode * const ret = RExC_emit;
      GET_RE_DEBUG_FLAGS_DECL;
  
@@ -12822,7 +13064,7 @@ STATIC regnode *                        /* Location. */
  S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
  {
      dVAR;
-    register regnode *ptr;
+    regnode *ptr;
      regnode * const ret = RExC_emit;
      GET_RE_DEBUG_FLAGS_DECL;
  
@@ -12894,9 +13136,9 @@ STATIC void
  S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
  {
      dVAR;
-    register regnode *src;
-    register regnode *dst;
-    register regnode *place;
+    regnode *src;
+    regnode *dst;
+    regnode *place;
      const int offset = regarglen[(U8)op];
      const int size = NODE_STEP_REGNODE + offset;
      GET_RE_DEBUG_FLAGS_DECL;
@@ -12982,7 +13224,7 @@ STATIC void
  S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
  {
      dVAR;
-    register regnode *scan;
+    regnode *scan;
      GET_RE_DEBUG_FLAGS_DECL;
  
      PERL_ARGS_ASSERT_REGTAIL;
@@ -13041,7 +13283,7 @@ STATIC U8
  S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
  {
      dVAR;
-    register regnode *scan;
+    regnode *scan;
      U8 exact = PSEUDO;
  #ifdef EXPERIMENTAL_INPLACESCAN
      I32 min = 0;
@@ -13282,7 +13524,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
  {
  #ifdef DEBUGGING
      dVAR;
-    register int k;
+    int k;
  
      /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
      static const char * const anyofs[] = {
@@ -14086,7 +14328,7 @@ regnode *
  Perl_regnext(pTHX_ register regnode *p)
  {
      dVAR;
-    register I32 offset;
+    I32 offset;
  
      if (!p)
         return(NULL);
@@ -14245,8 +14487,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
             SV* sv, I32 indent, U32 depth)
  {
      dVAR;
-    register U8 op = PSEUDO;   /* Arbitrary non-END op. */
-    register const regnode *next;
+    U8 op = PSEUDO;    /* Arbitrary non-END op. */
+    const regnode *next;
      const regnode *optstart= NULL;
      
      RXi_GET_DECL(r,ri);
@@ -14297,9 +14539,9 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
         if (PL_regkind[(U8)op] == BRANCHJ) {
             assert(next);
             {
-                register const regnode *nnode = (OP(next) == LONGJMP
-                                            ? regnext((regnode *)next)
-                                            : next);
+                const regnode *nnode = (OP(next) == LONGJMP
+                                       ? regnext((regnode *)next)
+                                       : next);
                  if (last && nnode > last)
                      nnode = last;
                  DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);