regcomp.c: Make invlist_search() usable from re_comp.c

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index ab9c705..488ebeb 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -81,6 +81,7 @@
  #define REG_COMP_C
  #ifdef PERL_IN_XSUB_RE
  #  include "re_comp.h"
+extern const struct regexp_engine my_reg_engine;
  #else
  #  include "regcomp.h"
  #endif
@@ -2582,12 +2583,12 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
   * one, and looks for problematic sequences of characters whose folds vs.
   * non-folds have sufficiently different lengths, that the optimizer would be
   * fooled into rejecting legitimate matches of them, and the trie construction
- * code can't cope with them.  The joining is only done if:
+ * code needs to handle specially.  The joining is only done if:
   * 1) there is room in the current conglomerated node to entirely contain the
   *    next one.
   * 2) they are the exact same node type
   *
- * The adjacent nodes actually may be separated by NOTHING kind nodes, and
+ * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
   * these get optimized out
   *
   * If there are problematic code sequences, *min_subtract is set to the delta
@@ -2600,26 +2601,27 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
   *
   * This is as good a place as any to discuss the design of handling these
   * problematic sequences.  It's been wrong in Perl for a very long time.  There
- * are three code points in Unicode whose folded lengths differ so much from
- * the un-folded lengths that it causes problems for the optimizer and trie
- * construction.  Why only these are problematic, and not others where lengths
- * also differ is something I (khw) do not understand.  New versions of Unicode
- * might add more such code points.  Hopefully the logic in fold_grind.t that
- * figures out what to test (in part by verifying that each size-combination
- * gets tested) will catch any that do come along, so they can be added to the
- * special handling below.  The chances of new ones are actually rather small,
- * as most, if not all, of the world's scripts that have casefolding have
- * already been encoded by Unicode.  Also, a number of Unicode's decisions were
- * made to allow compatibility with pre-existing standards, and almost all of
- * those have already been dealt with.  These would otherwise be the most
- * likely candidates for generating further tricky sequences.  In other words,
- * Unicode by itself is unlikely to add new ones unless it is for compatibility
- * with pre-existing standards, and there aren't many of those left.
+ * are three code points currently in Unicode whose folded lengths differ so
+ * much from the un-folded lengths that it causes problems for the optimizer
+ * and trie construction.  Why only these are problematic, and not others where
+ * lengths also differ is something I (khw) do not understand.  New versions of
+ * Unicode might add more such code points.  Hopefully the logic in
+ * fold_grind.t that figures out what to test (in part by verifying that each
+ * size-combination gets tested) will catch any that do come along, so they can
+ * be added to the special handling below.  The chances of new ones are
+ * actually rather small, as most, if not all, of the world's scripts that have
+ * casefolding have already been encoded by Unicode.  Also, a number of
+ * Unicode's decisions were made to allow compatibility with pre-existing
+ * standards, and almost all of those have already been dealt with.  These
+ * would otherwise be the most likely candidates for generating further tricky
+ * sequences.  In other words, Unicode by itself is unlikely to add new ones
+ * unless it is for compatibility with pre-existing standards, and there aren't
+ * many of those left.
   *
   * The previous designs for dealing with these involved assigning a special
   * node for them.  This approach doesn't work, as evidenced by this example:
   *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
- * Both these fold to "sss", but if the pattern is parsed to create a node of
+ * Both these fold to "sss", but if the pattern is parsed to create a node
   * that would match just the \xDF, it won't be able to handle the case where a
   * successful match would have to cross the node's boundary.  The new approach
   * that hopefully generally solves the problem generates an EXACTFU_SS node
@@ -2634,9 +2636,9 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
   *      problematic sequences.  This delta is used by the caller to adjust the
   *      min length of the match, and the delta between min and max, so that the
   *      optimizer doesn't reject these possibilities based on size constraints.
- * 2)   These sequences require special handling by the trie code, so it
- *      changes the joined node type to ops for the trie's benefit, those new
- *      ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
+ * 2)   These sequences require special handling by the trie code, so this code
+ *      changes the joined node type to special ops: EXACTFU_TRICKYFOLD and
+ *      EXACTFU_SS.
   * 3)   This is sufficient for the two Greek sequences (described below), but
   *      the one involving the Sharp s (\xDF) needs more.  The node type
   *      EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
@@ -2646,17 +2648,16 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
   *      itself with length changes, and so can be processed faster.  regexec.c
   *      takes advantage of this.  Generally, an EXACTFish node that is in UTF-8
   *      is pre-folded by regcomp.c.  This saves effort in regex matching.
- *      However, probably mostly for historical reasons, the pre-folding isn't
- *      done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
- *      nodes, as what they fold to isn't known until runtime.)  The fold
- *      possibilities for the non-UTF8 patterns are quite simple, except for
- *      the sharp s.  All the ones that don't involve a UTF-8 target string
- *      are members of a fold-pair, and arrays are set up for all of them
- *      that quickly find the other member of the pair.  It might actually
- *      be faster to pre-fold these, but it isn't currently done, except for
- *      the sharp s.  Code elsewhere in this file makes sure that it gets
- *      folded to 'ss', even if the pattern isn't UTF-8.  This avoids the
- *      issues described in the next item.
+ *      However, the pre-folding isn't done for non-UTF8 patterns because the
+ *      fold of the MICRO SIGN requires UTF-8.  Also what EXACTF and EXACTFL
+ *      nodes fold to isn't known until runtime.  The fold possibilities for
+ *      the non-UTF8 patterns are quite simple, except for the sharp s.  All
+ *      the ones that don't involve a UTF-8 target string are members of a
+ *      fold-pair, and arrays are set up for all of them so that the other
+ *      member of the pair can be found quickly.  Code elsewhere in this file
+ *      makes sure that in EXACTFU nodes, the sharp s gets folded to 'ss', even
+ *      if the pattern isn't UTF-8.  This avoids the issues described in the
+ *      next item.
   * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
   *      'ss' or not is not knowable at compile time.  It will match iff the
   *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
@@ -2733,6 +2734,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
              const unsigned int oldl = STR_LEN(scan);
              regnode * const nnext = regnext(n);
  
+            /* XXX I (khw) kind of doubt that this works on platforms where
+             * U8_MAX is above 255 because of lots of other assumptions */
              if (oldl + STR_LEN(n) > U8_MAX)
                  break;
              
@@ -2883,7 +2886,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                       greek_sequence:
                         *min_subtract += 4;
  
-                       /* This can't currently be handled by trie's, so change
+                       /* This requires special handling by trie's, so change
                          * the node type to indicate this.  If EXACTFA and
                          * EXACTFL were ever to be handled by trie's, this
                          * would have to be changed.  If this node has already
@@ -2919,9 +2922,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                             /* EXACTF nodes need to know that the minimum
                              * length changed so that a sharp s in the string
                              * can match this ss in the pattern, but they
-                            * remain EXACTF nodes, as they are not trie'able,
-                            * so don't have to invent a new node type to
-                            * exclude them from the trie code */
+                             * remain EXACTF nodes, as they won't match this
+                             * unless the target string is in UTF-8, which we
+                             * don't know until runtime */
                             if (OP(scan) != EXACTF) {
                                 OP(scan) = EXACTFU_SS;
                             }
@@ -4961,16 +4964,23 @@ Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
  }
  #endif
  
-/* public(ish) wrapper for Perl_re_op_compile that only takes an SV
- * pattern rather than a list of OPs */
+/* public(ish) entry point for the perl core's own regex compiling code.
+ * It's actually a wrapper for Perl_re_op_compile that only takes an SV
+ * pattern rather than a list of OPs, and uses the internal engine rather
+ * than the current one */
  
  REGEXP *
  Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
  {
      SV *pat = pattern; /* defeat constness! */
      PERL_ARGS_ASSERT_RE_COMPILE;
-    return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(),
-                               NULL, NULL, rx_flags, 0);
+    return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
+#ifdef PERL_IN_XSUB_RE
+                                &my_reg_engine,
+#else
+                                &PL_core_reg_engine,
+#endif
+                                NULL, NULL, rx_flags, 0);
  }
  
  /* see if there are any run-time code blocks in the pattern.
@@ -5685,7 +5695,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
                                             exp, plen);
         if (!runtime_code) {
-           ReREFCNT_inc(old_re);
             if (used_setjump) {
                 JMPENV_POP;
             }
@@ -7309,8 +7318,8 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end
  
  #ifndef PERL_IN_XSUB_RE
  
-STATIC IV
-S_invlist_search(pTHX_ SV* const invlist, const UV cp)
+IV
+Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
  {
      /* Searches the inversion list for the entry that contains the input code
       * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
@@ -7321,7 +7330,7 @@ S_invlist_search(pTHX_ SV* const invlist, const UV cp)
      IV high = invlist_len(invlist);
      const UV * const array = invlist_array(invlist);
  
-    PERL_ARGS_ASSERT_INVLIST_SEARCH;
+    PERL_ARGS_ASSERT__INVLIST_SEARCH;
  
      /* If list is empty or the code point is before the first element, return
       * failure. */
@@ -7375,7 +7384,7 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV
      array = invlist_array(invlist);
  
      /* Find which element it is */
-    i = invlist_search(invlist, start);
+    i = _invlist_search(invlist, start);
  
      /* We populate from <start> to <end> */
      while (current < end) {
@@ -7938,6 +7947,18 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
  
  #endif
  
+STATIC bool
+S__invlist_contains_cp(pTHX_ SV* const invlist, const UV cp)
+{
+    /* Does <invlist> contain code point <cp> as part of the set? */
+
+    IV index = _invlist_search(invlist, cp);
+
+    PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP;
+
+    return index >= 0 && ELEMENT_RANGE_MATCHES_INVLIST(index);
+}
+
  PERL_STATIC_INLINE SV*
  S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
      return _add_range_to_invlist(invlist, cp, cp);
@@ -9558,7 +9579,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  }
  
  
-/* reg_namedseq(pRExC_state,UVp, UV depth)
+/* grok_bslash_N(pRExC_state,UVp, UV depth)
     
     This is expected to be called by a parser routine that has 
     recognized '\N' and needs to handle the rest. RExC_parse is
@@ -9601,7 +9622,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     Parsing failures will generate a fatal error via vFAIL(...)
   */
  STATIC regnode *
-S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
+S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
  {
      char * endbrace;    /* '}' following the name */
      regnode *ret = NULL;
@@ -9609,7 +9630,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept
  
      GET_RE_DEBUG_FLAGS_DECL;
   
-    PERL_ARGS_ASSERT_REG_NAMEDSEQ;
+    PERL_ARGS_ASSERT_GROK_BSLASH_N;
  
      GET_RE_DEBUG_FLAGS;
  
@@ -9636,7 +9657,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept
         return ret;
      }
  
-    /* Here, we have decided it should be a named sequence */
+    /* Here, we have decided it should be a named character or sequence */
  
      /* The test above made sure that the next real character is a '{', but
       * under the /x modifier, it could be separated by space (or a comment and
@@ -9673,7 +9694,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept
         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
      }
  
-    REQUIRE_UTF8;      /* named sequences imply Unicode semantics */
+    RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
      RExC_parse += 2;   /* Skip past the 'U+' */
  
      if (valuep) {   /* In a bracketed char class */
@@ -10228,7 +10249,7 @@ tryagain:
              Also this makes sure that things like /\N{BLAH}+/ and 
              \N{BLAH} being multi char Just Happen. dmq*/
              ++RExC_parse;
-            ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
+            ret= grok_bslash_N(pRExC_state, NULL, flagp, depth);
              break;
         case 'k':    /* Handle \k<NAME> and \k'NAME' */
         parse_named_seq:
@@ -10377,18 +10398,19 @@ tryagain:
             register UV ender;
             register char *p;
             char *s;
+#define MAX_NODE_STRING_SIZE 127
+           char foldbuf[MAX_NODE_STRING_SIZE];
             STRLEN foldlen;
-           U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
              U8 node_type;
-
-           /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node?  If so,
-            * it is folded to 'ss' even if not utf8 */
-           bool is_exactfu_sharp_s;
+            bool next_is_quantifier;
  
             ender = 0;
              node_type = compute_EXACTish(pRExC_state);
             ret = reg_node(pRExC_state, node_type);
-           s = STRING(ret);
+
+            /* In pass1, folded, we use a temporary buffer instead of the
+             * actual node, as the node doesn't exist yet */
+           s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
  
             /* XXX The node can hold up to 255 bytes, yet this only goes to
               * 127.  I (khw) do not know why.  Keeping it somewhat less than
@@ -10409,7 +10431,7 @@ tryagain:
               * non-final, but it is possible for there not to be any in the
               * entire node. */
             for (len = 0, p = RExC_parse - 1;
-                len < 127 && p < RExC_end;
+                len < MAX_NODE_STRING_SIZE && p < RExC_end;
                  len++)
             {
                 char * const oldp = p;
@@ -10611,148 +10633,95 @@ tryagain:
                     break;
                 } /* End of switch on the literal */
  
-                is_exactfu_sharp_s = (node_type == EXACTFU
-                                     && ender == LATIN_SMALL_LETTER_SHARP_S);
                 if ( RExC_flags & RXf_PMf_EXTENDED)
                     p = regwhite( pRExC_state, p );
-               if ((UTF && FOLD) || is_exactfu_sharp_s) {
-                   /* Prime the casefolded buffer.  Locale rules, which apply
-                    * only to code points < 256, aren't known until execution,
-                    * so for them, just output the original character using
-                     * utf8.  If we start to fold non-UTF patterns, be sure to
-                     * update join_exact() */
-                   if (LOC && ender < 256) {
-                       if (UNI_IS_INVARIANT(ender)) {
-                           *tmpbuf = (U8) ender;
-                           foldlen = 1;
-                       } else {
-                           *tmpbuf = UTF8_TWO_BYTE_HI(ender);
-                           *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
-                           foldlen = 2;
-                       }
-                   }
-                   else if (isASCII(ender)) {  /* Note: Here can't also be LOC
-                                                */
-                       ender = toLOWER(ender);
-                       *tmpbuf = (U8) ender;
-                       foldlen = 1;
-                   }
-                   else if (! ASCII_FOLD_RESTRICTED && ! LOC) {
  
-                       /* Locale and /aa require more selectivity about the
-                        * fold, so are handled below.  Otherwise, here, just
-                        * use the fold */
-                       ender = toFOLD_uni(ender, tmpbuf, &foldlen);
-                   }
-                   else {
-                       /* Under locale rules or /aa we are not to mix,
-                        * respectively, ords < 256 or ASCII with non-.  So
-                        * reject folds that mix them, using only the
-                        * non-folded code point.  So do the fold to a
-                        * temporary, and inspect each character in it. */
-                       U8 trialbuf[UTF8_MAXBYTES_CASE+1];
-                       U8* s = trialbuf;
-                       UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
-                       U8* e = s + foldlen;
-                       bool fold_ok = TRUE;
-
-                       while (s < e) {
-                           if (isASCII(*s)
-                               || (LOC && (UTF8_IS_INVARIANT(*s)
-                                          || UTF8_IS_DOWNGRADEABLE_START(*s))))
-                           {
-                               fold_ok = FALSE;
-                               break;
-                           }
-                           s += UTF8SKIP(s);
-                       }
-                       if (fold_ok) {
-                           Copy(trialbuf, tmpbuf, foldlen, U8);
-                           ender = tmpender;
-                       }
-                       else {
-                           uvuni_to_utf8(tmpbuf, ender);
-                           foldlen = UNISKIP(ender);
-                       }
-                   }
-               }
-               if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
-                   if (len)
-                       p = oldp;
-                   else if (UTF || is_exactfu_sharp_s) {
-                        if (FOLD) {
-                             /* Emit all the Unicode characters. */
-                             STRLEN numlen;
-                             for (foldbuf = tmpbuf;
-                                  foldlen;
-                                  foldlen -= numlen) {
-
-                                  /* tmpbuf has been constructed by us, so we
-                                   * know it is valid utf8 */
-                                  ender = valid_utf8_to_uvchr(foldbuf, &numlen);
-                                  if (numlen > 0) {
-                                       const STRLEN unilen = reguni(pRExC_state, ender, s);
-                                       s       += unilen;
-                                       len     += unilen;
-                                       /* In EBCDIC the numlen
-                                        * and unilen can differ. */
-                                       foldbuf += numlen;
-                                       if (numlen >= foldlen)
-                                            break;
-                                  }
-                                  else
-                                       break; /* "Can't happen." */
-                             }
-                        }
-                        else {
-                             const STRLEN unilen = reguni(pRExC_state, ender, s);
-                             if (unilen > 0) {
-                                  s   += unilen;
-                                  len += unilen;
-                             }
-                        }
-                   }
-                   else {
-                       len++;
-                       REGC((char)ender, s++);
-                   }
-                   break;
+                /* If the next thing is a quantifier, it applies to this
+                 * character only, which means that this character has to be in
+                 * its own node and can't just be appended to the string in an
+                 * existing node, so if there are already other characters in
+                 * the node, close the node with just them, and set up to do
+                 * this character again next time through, when it will be the
+                 * only thing in its new node */
+                if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
+               {
+                    p = oldp;
+                    goto loopdone;
+                }
+
+               if (FOLD) {
+                    if (UTF
+                            /* See comments for join_exact() as to why we fold
+                             * this non-UTF at compile time */
+                        || (node_type == EXACTFU
+                            && ender == LATIN_SMALL_LETTER_SHARP_S))
+                    {
+
+
+                        /* Prime the casefolded buffer.  Locale rules, which
+                         * apply only to code points < 256, aren't known until
+                         * execution, so for them, just output the original
+                         * character using utf8.  If we start to fold non-UTF
+                         * patterns, be sure to update join_exact() */
+                        if (LOC && ender < 256) {
+                            if (UNI_IS_INVARIANT(ender)) {
+                                *s = (U8) ender;
+                                foldlen = 1;
+                            } else {
+                                *s = UTF8_TWO_BYTE_HI(ender);
+                                *(s + 1) = UTF8_TWO_BYTE_LO(ender);
+                                foldlen = 2;
+                            }
+                        }
+                        else {
+                            ender = _to_uni_fold_flags(ender, (U8 *) s, &foldlen,
+                                    FOLD_FLAGS_FULL
+                                     | ((LOC) ?  FOLD_FLAGS_LOCALE
+                                              : (ASCII_FOLD_RESTRICTED)
+                                                ? FOLD_FLAGS_NOMIX_ASCII
+                                                : 0)
+                                );
+                        }
+                       s += foldlen;
+
+                       /* The loop increments <len> each time, as all but this
+                        * path (and the one just below for UTF) through it add
+                        * a single byte to the EXACTish node.  But this one
+                        * has changed len to be the correct final value, so
+                        * subtract one to cancel out the increment that
+                        * follows */
+                       len += foldlen - 1;
+                    }
+                    else {
+                        REGC((char)ender, s++);
+                    }
                 }
-                if (UTF || is_exactfu_sharp_s) {
-                    if (FOLD) {
-                         /* Emit all the Unicode characters. */
-                         STRLEN numlen;
-                         for (foldbuf = tmpbuf;
-                              foldlen;
-                              foldlen -= numlen) {
-                              ender = valid_utf8_to_uvchr(foldbuf, &numlen);
-                              if (numlen > 0) {
-                                   const STRLEN unilen = reguni(pRExC_state, ender, s);
-                                   len     += unilen;
-                                   s       += unilen;
-                                   /* In EBCDIC the numlen
-                                    * and unilen can differ. */
-                                   foldbuf += numlen;
-                                   if (numlen >= foldlen)
-                                        break;
-                              }
-                              else
-                                   break;
-                         }
-                    }
-                    else {
-                         const STRLEN unilen = reguni(pRExC_state, ender, s);
-                         if (unilen > 0) {
-                              s   += unilen;
-                              len += unilen;
-                         }
-                    }
-                    len--;
+               else if (UTF) {
+                    const STRLEN unilen = reguni(pRExC_state, ender, s);
+                    if (unilen > 0) {
+                       s   += unilen;
+                       len += unilen;
+                    }
+
+                   /* See comment just above for - 1 */
+                   len--;
                 }
                 else {
                     REGC((char)ender, s++);
+                }
+
+               if (next_is_quantifier) {
+
+                    /* Here, the next input is a quantifier, and to get here,
+                     * the current character is the only one in the node.
+                     * Also, here <len> doesn't include the final byte for this
+                     * character */
+                    len++;
+                    goto loopdone;
                 }
-           }
+
+           } /* End of loop through literal characters */
+
         loopdone:   /* Jumped to when encounters something that shouldn't be in
                        the node */
             RExC_parse = p - 1;
@@ -10770,9 +10739,9 @@ tryagain:
                 *flagp |= SIMPLE;
  
              alloc_maybe_populate_EXACT(pRExC_state, ret, len, 0);
-       }
+       } /* End of label 'defchar:' */
         break;
-    }
+    } /* End of giant switch on input character */
  
      return(ret);
  }
@@ -11187,7 +11156,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
       * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
      UV literal_endpoint = 0;
  #endif
-    UV stored = 0;  /* how many chars stored in the bitmap */
      bool invert = FALSE;    /* Is this class to be complemented */
  
      /* Is there any thing like \W or [:^digit:] that matches above the legal
@@ -11309,7 +11277,7 @@ parseit:
                      from earlier versions, OTOH that behaviour was broken
                      as well. */
                      UV v; /* value is register so we cant & it /grrr */
-                    if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
+                    if (grok_bslash_N(pRExC_state, &v, NULL, depth)) {
                          goto parseit;
                      }
                      value= v; 
@@ -12343,7 +12311,7 @@ parseit:
       * fold the classes (folding of those is automatically handled by the swash
       * fetching code) */
      if (posixes) {
-        if (AT_LEAST_UNI_SEMANTICS) {
+        if (! DEPENDS_SEMANTICS) {
              if (cp_list) {
                  _invlist_union(cp_list, posixes, &cp_list);
                  SvREFCNT_dec(posixes);
@@ -12353,7 +12321,6 @@ parseit:
              }
          }
          else {
-
              /* Under /d, we put into a separate list the Latin1 things that
               * match only when the target string is utf8 */
              SV* nonascii_but_latin1_properties = NULL;
@@ -12457,10 +12424,145 @@ parseit:
         invert = FALSE;
      }
  
+    /* If we didn't do folding, it's because some information isn't available
+     * until runtime; set the run-time fold flag for these.  (We don't have to
+     * worry about properties folding, as that is taken care of by the swash
+     * fetching) */
+    if (FOLD && (LOC || unicode_alternate))
+    {
+       ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
+    }
+
+    /* Some character classes are equivalent to other nodes.  Such nodes take
+     * up less room and generally fewer operations to execute than ANYOF nodes.
+     * Above, we checked for and optimized into some such equivalents for
+     * certain common classes that are easy to test.  Getting to this point in
+     * the code means that the class didn't get optimized there.  Since this
+     * code is only executed in Pass 2, it is too late to save space--it has
+     * been allocated in Pass 1, and currently isn't given back.  But turning
+     * things into an EXACTish node can allow the optimizer to join it to any
+     * adjacent such nodes.  And if the class is equivalent to things like /./,
+     * expensive run-time swashes can be avoided.  Now that we have more
+     * complete information, we can find things necessarily missed by the
+     * earlier code.  I (khw) am not sure how much to look for here.  It would
+     * be easy, but perhaps too slow, to check any candidates against all the
+     * node types they could possibly match using _invlistEQ(). */
+
+    if (cp_list
+        && ! unicode_alternate
+        && ! invert
+        && ! depends_list
+        && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
+        && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
+    {
+       UV start, end;
+       U8 op = END;  /* The optimization node-type */
+        const char * cur_parse= RExC_parse;
+
+       invlist_iterinit(cp_list);
+       if (! invlist_iternext(cp_list, &start, &end)) {
+
+            /* Here, the list is empty.  This happens, for example, when a
+             * Unicode property is the only thing in the character class, and
+             * it doesn't match anything.  (perluniprops.pod notes such
+             * properties) */
+            op = OPFAIL;
+        }
+        else if (start == end) {    /* The range is a single code point */
+            if (! invlist_iternext(cp_list, &start, &end)
+
+                    /* Don't do this optimization if it would require changing
+                     * the pattern to UTF-8 */
+                && (start < 256 || UTF))
+            {
+                /* Here, the list contains a single code point.  Can optimize
+                 * into an EXACT node */
+
+                value = start;
+
+                if (! FOLD) {
+                    op = EXACT;
+                }
+                else if (LOC) {
+
+                    /* A locale node under folding with one code point can be
+                     * an EXACTFL, as its fold won't be calculated until
+                     * runtime */
+                    op = EXACTFL;
+                }
+                else {
+
+                    /* Here, we are generally folding, but there is only one
+                     * code point to match.  If we have to, we use an EXACT
+                     * node, but it would be better for joining with adjacent
+                     * nodes in the optimization pass if we used the same
+                     * EXACTFish node that any such are likely to be.  We can
+                     * do this iff the code point doesn't participate in any
+                     * folds.  For example, an EXACTF of a colon is the same as
+                     * an EXACT one, since nothing folds to or from a colon.
+                     * In the Latin1 range, being an alpha means that the
+                     * character participates in a fold (except for the
+                     * feminine and masculine ordinals, which I (khw) don't
+                     * think are worth worrying about optimizing for). */
+                    if (value < 256) {
+                        if (isALPHA_L1(value)) {
+                            op = EXACT;
+                        }
+                    }
+                    else {
+                        if (! PL_utf8_foldable) {
+                            SV* swash = swash_init("utf8", "_Perl_Any_Folds",
+                                                &PL_sv_undef, 1, 0);
+                            PL_utf8_foldable = _get_swash_invlist(swash);
+                            SvREFCNT_dec(swash);
+                        }
+                        if (_invlist_contains_cp(PL_utf8_foldable, value)) {
+                            op = EXACT;
+                        }
+                    }
+
+                    /* If we haven't found the node type, above, it means we
+                     * can use the prevailing one */
+                    if (op == END) {
+                        op = compute_EXACTish(pRExC_state);
+                    }
+                }
+            }
+        }
+        else if (start == 0) {
+            if (end == UV_MAX) {
+                op = SANY;
+            }
+            else if (end == '\n' - 1
+                    && invlist_iternext(cp_list, &start, &end)
+                    && start == '\n' + 1 && end == UV_MAX)
+            {
+                op = REG_ANY;
+            }
+        }
+
+        if (op != END) {
+            RExC_parse = (char *)orig_parse;
+            RExC_emit = (regnode *)orig_emit;
+
+            ret = reg_node(pRExC_state, op);
+
+            RExC_parse = (char *)cur_parse;
+
+            if (PL_regkind[op] == EXACT) {
+                alloc_maybe_populate_EXACT(pRExC_state, ret, 0, value);
+            }
+
+            SvREFCNT_dec(listsv);
+            return ret;
+        }
+    }
+
      /* Here, <cp_list> contains all the code points we can determine at
       * compile time that match under all conditions.  Go through it, and
       * for things that belong in the bitmap, put them there, and delete from
-     * <cp_list> */
+     * <cp_list>.  While we are at it, see if everything above 255 is in the
+     * list, and if so, set a flag to speed up execution */
      ANYOF_BITMAP_ZERO(ret);
      if (cp_list) {
  
@@ -12475,6 +12577,10 @@ parseit:
             UV high;
             int i;
  
+            if (end == UV_MAX && start <= 256) {
+                ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
+            }
+
             /* Quit if are above what we should change */
             if (start > 255) {
                 break;
@@ -12487,7 +12593,6 @@ parseit:
             for (i = start; i <= (int) high; i++) {
                 if (! ANYOF_BITMAP_TEST(ret, i)) {
                     ANYOF_BITMAP_SET(ret, i);
-                   stored++;
                     prevvalue = value;
                     value = i;
                 }
@@ -12511,7 +12616,9 @@ parseit:
          ANYOF_FLAGS(ret) |= ANYOF_INVERT;
      }
  
-    /* Combine the two lists into one. */
+    /* Here, the bitmap has been populated with all the Latin1 code points that
+     * always match.  Can now add to the overall list those that match only
+     * when the target string is UTF-8 (<depends_list>). */
      if (depends_list) {
         if (cp_list) {
             _invlist_union(cp_list, depends_list, &cp_list);
@@ -12522,121 +12629,13 @@ parseit:
         }
      }
  
-    /* Folding in the bitmap is taken care of above, but not for locale (for
-     * which we have to wait to see what folding is in effect at runtime), and
-     * for some things not in the bitmap (only the upper latin folds in this
-     * case, as all other single-char folding has been set above).  Set
-     * run-time fold flag for these */
-    if (FOLD && (LOC
-               || (DEPENDS_SEMANTICS
-                   && cp_list
-                   && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
-               || unicode_alternate))
-    {
-       ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
-    }
-
-    /* A single character class can be "optimized" into an EXACTish node.
-     * Note that since we don't currently count how many characters there are
-     * outside the bitmap, we are XXX missing optimization possibilities for
-     * them.  This optimization can't happen unless this is a truly single
-     * character class, which means that it can't be an inversion into a
-     * many-character class, and there must be no possibility of there being
-     * things outside the bitmap.  'stored' (only) for locales doesn't include
-     * \w, etc, so have to make a special test that they aren't present
-     *
-     * Similarly A 2-character class of the very special form like [bB] can be
-     * optimized into an EXACTFish node, but only for non-locales, and for
-     * characters which only have the two folds; so things like 'fF' and 'Ii'
-     * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
-     * FI'. */
-    if (! cp_list
-       && ! unicode_alternate
-       && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
-       && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
-        && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
-                              || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
-           || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
-                                && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
-                                /* If the latest code point has a fold whose
-                                 * bit is set, it must be the only other one */
-                               && ((prevvalue = PL_fold_latin1[value]) != value)
-                                && ANYOF_BITMAP_TEST(ret, prevvalue)))))
-    {
-        /* Note that the information needed to decide to do this optimization
-         * is not currently available until the 2nd pass, and that the actually
-        * used EXACTish node takes less space than the calculated ANYOF node,
-        * and hence the amount of space calculated in the first pass is larger
-         * than actually used, so this optimization doesn't gain us any space.
-        * But an EXACT node is faster than an ANYOF node, and can be combined
-        * with any adjacent EXACT nodes later by the optimizer for further
-        * gains.  The speed of executing an EXACTF is similar to an ANYOF
-        * node, so the optimization advantage comes from the ability to join
-        * it to adjacent EXACT nodes */
-
-        const char * cur_parse= RExC_parse;
-       U8 op;
-        RExC_emit = (regnode *)orig_emit;
-        RExC_parse = (char *)orig_parse;
-
-       if (stored == 1) {
-
-           /* A locale node with one point can be folded; all the other cases
-            * with folding will have two points, since we calculate them above
-            */
-           if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
-                op = EXACTFL;
-           }
-           else {
-               op = EXACT;
-           }
-       }
-       else {   /* else 2 chars in the bit map: the folds of each other */
-
-           /* Use the folded value, which for the cases where we get here,
-            * is just the lower case of the current one (which may resolve to
-            * itself, or to the other one */
-           value = toLOWER_LATIN1(value);
-
-           /* To join adjacent nodes, they must be the exact EXACTish type.
-            * Try to use the most likely type, by using EXACTFA if possible,
-            * then EXACTFU if the regex calls for it, or is required because
-            * the character is non-ASCII.  (If <value> is ASCII, its fold is
-            * also ASCII for the cases where we get here.) */
-           if (ASCII_FOLD_RESTRICTED && isASCII(value)) {
-               op = EXACTFA;
-           }
-           else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
-               op = EXACTFU;
-           }
-           else {    /* Otherwise, more likely to be EXACTF type */
-               op = EXACTF;
-           }
-       }
-
-       ret = reg_node(pRExC_state, op);
-        RExC_parse = (char *)cur_parse;
-       if (UTF && ! NATIVE_IS_INVARIANT(value)) {
-           *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
-           *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
-           STR_LEN(ret)= 2;
-           RExC_emit += STR_SZ(2);
-       }
-       else {
-           *STRING(ret)= (char)value;
-           STR_LEN(ret)= 1;
-           RExC_emit += STR_SZ(1);
-       }
-       SvREFCNT_dec(listsv);
-        return ret;
-    }
-
      /* If there is a swash and more than one element, we can't use the swash in
       * the optimization below. */
      if (swash && element_count > 1) {
         SvREFCNT_dec(swash);
         swash = NULL;
      }
+
      if (! cp_list
         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
         && ! unicode_alternate)