fix [perl #76546] regex engine slowdown bug

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 19ecee6..403d5b2 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -86,6 +86,11 @@
  #endif
  
  #include "dquote_static.c"
+#ifndef PERL_IN_XSUB_RE
+#  include "charclass_invlists.h"
+#endif
+
+#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  
  #ifdef op
  #undef op
@@ -210,7 +215,8 @@ typedef struct RExC_state_t {
  #define        HASWIDTH        0x01    /* Known to match non-null strings. */
  
  /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
- * character, and if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
+ * character, and if utf8, must be invariant.  Note that this is not the same
+ * thing as REGNODE_SIMPLE */
  #define        SIMPLE          0x02
  #define        SPSTART         0x04    /* Starts with * or +. */
  #define TRYAGAIN       0x08    /* Weeded out a declaration. */
@@ -379,9 +385,11 @@ static const scan_data_t zero_scan_data =
  #define SCF_SEEN_ACCEPT         0x8000 
  
  #define UTF cBOOL(RExC_utf8)
+
+/* The enums for all these are ordered so things work out correctly */
  #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
-#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
  #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
+#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
  #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
  #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
  #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
@@ -1361,44 +1369,47 @@ is the recommended Unicode-aware way of saying
      *(d++) = uv;
  */
  
-#define TRIE_STORE_REVCHAR                                                 \
+#define TRIE_STORE_REVCHAR(val)                                            \
      STMT_START {                                                           \
         if (UTF) {                                                         \
-           SV *zlopp = newSV(2);                                          \
+            SV *zlopp = newSV(7); /* XXX: optimize me */                   \
             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
-           unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
+            unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
             SvPOK_on(zlopp);                                               \
             SvUTF8_on(zlopp);                                              \
             av_push(revcharmap, zlopp);                                    \
         } else {                                                           \
-           char ooooff = (char)uvc;                                               \
+            char ooooff = (char)val;                                           \
             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
         }                                                                  \
          } STMT_END
  
-#define TRIE_READ_CHAR STMT_START {                                           \
-    wordlen++;                                                                \
-    if ( UTF ) {                                                              \
-       if ( folder ) {                                                       \
-           if ( foldlen > 0 ) {                                              \
-              uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
-              foldlen -= len;                                                \
-              scan += len;                                                   \
-              len = 0;                                                       \
-           } else {                                                          \
-               len = UTF8SKIP(uc);\
-               uvc = to_utf8_fold( uc, foldbuf, &foldlen);                   \
-               foldlen -= UNISKIP( uvc );                                    \
-               scan = foldbuf + UNISKIP( uvc );                              \
-           }                                                                 \
-       } else {                                                              \
-           uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
-       }                                                                     \
-    } else {                                                                  \
-       uvc = (U32)*uc;                                                       \
-       len = 1;                                                              \
-    }                                                                         \
+#define TRIE_READ_CHAR STMT_START {                                                     \
+    wordlen++;                                                                          \
+    if ( UTF ) {                                                                        \
+        /* if it is UTF then it is either already folded, or does not need folding */   \
+        uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
+    }                                                                                   \
+    else if (folder == PL_fold_latin1) {                                                \
+        /* if we use this folder we have to obey unicode rules on latin-1 data */       \
+        if ( foldlen > 0 ) {                                                            \
+           uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
+           foldlen -= len;                                                              \
+           scan += len;                                                                 \
+           len = 0;                                                                     \
+        } else {                                                                        \
+            len = 1;                                                                    \
+            uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
+            skiplen = UNISKIP(uvc);                                                     \
+            foldlen -= skiplen;                                                         \
+            scan = foldbuf + skiplen;                                                   \
+        }                                                                               \
+    } else {                                                                            \
+        /* raw data, will be folded later if needed */                                  \
+        uvc = (U32)*uc;                                                                 \
+        len = 1;                                                                        \
+    }                                                                                   \
  } STMT_END
  
  
@@ -1517,10 +1528,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
      switch (flags) {
         case EXACT: break;
         case EXACTFA:
+        case EXACTFU_SS:
+        case EXACTFU_TRICKYFOLD:
         case EXACTFU: folder = PL_fold_latin1; break;
         case EXACTF:  folder = PL_fold; break;
         case EXACTFL: folder = PL_fold_locale; break;
-        default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags );
+        default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
      }
  
      trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
@@ -1529,7 +1542,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
      trie->wordcount = word_count;
      RExC_rxi->data->data[ data_slot ] = (void*)trie;
      trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
-    if (!(UTF && folder))
+    if (flags == EXACT)
         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
      trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
                         trie->wordcount+1, sizeof(reg_trie_wordinfo));
@@ -1589,6 +1602,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
          const U8 * const e  = uc + STR_LEN( noper );
          STRLEN foldlen = 0;
          U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+        STRLEN skiplen = 0;
          const U8 *scan = (U8*)NULL;
          U32 wordlen      = 0;         /* required init */
          STRLEN chars = 0;
@@ -1598,28 +1612,37 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
              trie->minlen= 0;
              continue;
          }
-        if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
+        if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
              TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
                                            regardless of encoding */
-
+            if (OP( noper ) == EXACTFU_SS) {
+                /* false positives are ok, so just set this */
+                TRIE_BITMAP_SET(trie,0xDF);
+            }
+        }
          for ( ; uc < e ; uc += len ) {
              TRIE_CHARCOUNT(trie)++;
              TRIE_READ_CHAR;
              chars++;
              if ( uvc < 256 ) {
+                if ( folder ) {
+                    U8 folded= folder[ (U8) uvc ];
+                    if ( !trie->charmap[ folded ] ) {
+                        trie->charmap[ folded ]=( ++trie->uniquecharcount );
+                        TRIE_STORE_REVCHAR( folded );
+                    }
+                }
                  if ( !trie->charmap[ uvc ] ) {
                      trie->charmap[ uvc ]=( ++trie->uniquecharcount );
-                    if ( folder )
-                        trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
-                    TRIE_STORE_REVCHAR;
+                    TRIE_STORE_REVCHAR( uvc );
                  }
                  if ( set_bit ) {
                     /* store the codepoint in the bitmap, and its folded
                      * equivalent. */
-                    TRIE_BITMAP_SET(trie,uvc);
+                    TRIE_BITMAP_SET(trie, uvc);
  
                     /* store the folded codepoint */
-                   if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
+                    if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
  
                     if ( !UTF ) {
                         /* store first byte of utf8 representation of
@@ -1642,17 +1665,28 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
  
                  if ( !SvTRUE( *svpp ) ) {
                      sv_setiv( *svpp, ++trie->uniquecharcount );
-                    TRIE_STORE_REVCHAR;
+                    TRIE_STORE_REVCHAR(uvc);
                  }
              }
          }
          if( cur == first ) {
-            trie->minlen=chars;
-            trie->maxlen=chars;
+            trie->minlen = chars;
+            trie->maxlen = chars;
          } else if (chars < trie->minlen) {
-            trie->minlen=chars;
+            trie->minlen = chars;
          } else if (chars > trie->maxlen) {
-            trie->maxlen=chars;
+            trie->maxlen = chars;
+        }
+        if (OP( noper ) == EXACTFU_SS) {
+            /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
+           if (trie->minlen > 1)
+                trie->minlen= 1;
+        }
+       if (OP( noper ) == EXACTFU_TRICKYFOLD) {
+           /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}" 
+            *                - We assume that any such sequence might match a 2 byte string */
+            if (trie->minlen > 2 )
+                trie->minlen= 2;
          }
  
      } /* end first pass */
@@ -1725,6 +1759,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
             STRLEN foldlen   = 0;         /* required init */
              U32 wordlen      = 0;         /* required init */
             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+            STRLEN skiplen   = 0;
  
              if (OP(noper) != NOTHING) {
                  for ( ; uc < e ; uc += len ) {
@@ -1925,8 +1960,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
  
              STRLEN foldlen   = 0;         /* required init */
              U32 wordlen      = 0;         /* required init */
+            STRLEN skiplen   = 0;
              U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
  
+
              if ( OP(noper) != NOTHING ) {
                  for ( ; uc < e ; uc += len ) {
  
@@ -2563,7 +2600,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
   *      optimizer doesn't reject these possibilities based on size constraints.
   * 2)   These sequences are not currently correctly handled by the trie code
   *      either, so it changes the joined node type to ops that are not handled
- *      by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
+ *      by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
   * 3)   This is sufficient for the two Greek sequences (described below), but
   *      the one involving the Sharp s (\xDF) needs more.  The node type
   *      EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
@@ -2712,7 +2749,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
          * This uses an exclusive 'or' to find that bit and then inverts it to
          * form a mask, with just a single 0, in the bit position where 'S' and
          * 's' differ. */
-       const U8 S_or_s_mask = ~ ('S' ^ 's');
+       const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
         const U8 s_masked = 's' & S_or_s_mask;
  
         /* One pass is made over the node's string looking for all the
@@ -2818,7 +2855,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                          * is.  (I (khw) think it doesn't matter in regexec.c
                          * for UTF patterns, but no need to change it */
                         if (OP(scan) == EXACTFU) {
-                           OP(scan) = EXACTFU_NO_TRIE;
+                           OP(scan) = EXACTFU_TRICKYFOLD;
                         }
                         s += 6; /* We already know what this sequence is.  Skip
                                    the rest of it */
@@ -3180,7 +3217,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                          regnode *first = (regnode *)NULL;
                          regnode *last = (regnode *)NULL;
                          regnode *tail = scan;
-                        U8 optype = 0;
+                        U8 trietype = 0;
                          U32 count=0;
  
  #ifdef DEBUGGING
@@ -3211,32 +3248,59 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                          
                          /*
  
-                           step through the branches, cur represents each
-                           branch, noper is the first thing to be matched
-                           as part of that branch and noper_next is the
-                           regnext() of that node. if noper is an EXACT
-                           and noper_next is the same as scan (our current
-                           position in the regex) then the EXACT branch is
-                           a possible optimization target. Once we have
-                           two or more consecutive such branches we can
-                           create a trie of the EXACT's contents and stich
-                           it in place. If the sequence represents all of
-                           the branches we eliminate the whole thing and
-                           replace it with a single TRIE. If it is a
-                           subsequence then we need to stitch it in. This
-                           means the first branch has to remain, and needs
-                           to be repointed at the item on the branch chain
-                           following the last branch optimized. This could
-                           be either a BRANCH, in which case the
-                           subsequence is internal, or it could be the
-                           item following the branch sequence in which
-                           case the subsequence is at the end.
+                            Step through the branches
+                                cur represents each branch,
+                                noper is the first thing to be matched as part of that branch
+                                noper_next is the regnext() of that node.
+
+                            We normally handle a case like this /FOO[xyz]|BAR[pqr]/
+                            via a "jump trie" but we also support building with NOJUMPTRIE,
+                            which restricts the trie logic to structures like /FOO|BAR/.
+
+                            If noper is a trieable nodetype then the branch is a possible optimization
+                            target. If we are building under NOJUMPTRIE then we require that noper_next
+                            is the same as scan (our current position in the regex program).
+
+                            Once we have two or more consecutive such branches we can create a
+                            trie of the EXACT's contents and stitch it in place into the program.
+
+                            If the sequence represents all of the branches in the alternation we
+                            replace the entire thing with a single TRIE node.
+
+                            Otherwise when it is a subsequence we need to stitch it in place and
+                            replace only the relevant branches. This means the first branch has
+                            to remain as it is used by the alternation logic, and its next pointer,
+                            and needs to be repointed at the item on the branch chain following
+                            the last branch we have optimized away.
+
+                            This could be either a BRANCH, in which case the subsequence is internal,
+                            or it could be the item following the branch sequence in which case the
+                            subsequence is at the end (which does not necessarily mean the first node
+                            is the start of the alternation).
+
+                            TRIE_TYPE(X) is a define which maps the optype to a trietype.
+
+                                optype          |  trietype
+                                ----------------+-----------
+                                NOTHING         | NOTHING
+                                EXACT           | EXACT
+                                EXACTFU         | EXACTFU
+                                EXACTFU_SS      | EXACTFU
+                                EXACTFU_TRICKYFOLD | EXACTFU
+                                EXACTFA         | 0
+
  
                          */
+#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
+                       ( EXACT == (X) )   ? EXACT :        \
+                       ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
+                       0 )
  
                          /* dont use tail as the end marker for this traverse */
                          for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
                              regnode * const noper = NEXTOPER( cur );
+                            U8 noper_type = OP( noper );
+                            U8 noper_trietype = TRIE_TYPE( noper_type );
  #if defined(DEBUGGING) || defined(NOJUMPTRIE)
                              regnode * const noper_next = regnext( noper );
  #endif
@@ -3258,59 +3322,83 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                  PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
                                     REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
                              });
-                            if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
-                                         : PL_regkind[ OP( noper ) ] == EXACT )
-                                  || OP(noper) == NOTHING )
+
+                            /* Is noper a trieable nodetype that can be merged with the
+                             * current trie (if there is one)? */
+                            if ( noper_trietype
+                                  &&
+                                  (
+                                        /* XXX: Currently we cannot allow a NOTHING node to be the first element
+                                         * of a TRIEABLE sequence, Otherwise we will overwrite the regop following
+                                         * the NOTHING with the TRIE regop later on. This is because a NOTHING node
+                                         * is only one regnode wide, and a TRIE is two regnodes. An example of a
+                                         * problematic pattern is: "x" =~ /\A(?>(?:(?:)A|B|C?x))\z/
+                                         * At a later point of time we can somewhat workaround this by handling
+                                         * NOTHING -> EXACT sequences as generated by /(?:)A|(?:)B/ type patterns,
+                                         * as we can effectively ignore the NOTHING regop in that case.
+                                         * This clause, which allows NOTHING to start a sequence is left commented
+                                         * out as a reference.
+                                         * - Yves
+
+                                           ( noper_trietype == NOTHING)
+                                           || ( trietype == NOTHING )
+                                        */
+                                        ( noper_trietype == NOTHING && trietype )
+                                        || ( trietype == noper_trietype )
+                                  )
  #ifdef NOJUMPTRIE
                                    && noper_next == tail
  #endif
                                    && count < U16_MAX)
                              {
+                                /* Handle mergable triable node
+                                 * Either we are the first node in a new trieable sequence,
+                                 * in which case we do some bookkeeping, otherwise we update
+                                 * the end pointer. */
                                  count++;
-                                if ( !first || optype == NOTHING ) {
-                                    if (!first) first = cur;
-                                    optype = OP( noper );
+                                if ( !first ) {
+                                    first = cur;
+                                    trietype = noper_trietype;
                                  } else {
+                                    if ( trietype == NOTHING )
+                                        trietype = noper_trietype;
                                      last = cur;
                                  }
-                            } else {
-/* 
-    Currently the trie logic handles case insensitive matching properly only
-    when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
-    semantics).
-
-    If/when this is fixed the following define can be swapped
-    in below to fully enable trie logic.
-
-#define TRIE_TYPE_IS_SAFE 1
-
-Note that join_exact() assumes that the other types of EXACTFish nodes are not
-used in tries, so that would have to be updated if this changed
-
-*/
-#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
-
-                                if ( last && TRIE_TYPE_IS_SAFE ) {
-                                    make_trie( pRExC_state, 
-                                            startbranch, first, cur, tail, count, 
-                                            optype, depth+1 );
+                            } /* end handle mergable triable node */
+                            else {
+                                /* handle unmergable node -
+                                 * noper may either be a triable node which can not be tried
+                                 * together with the current trie, or a non triable node */
+                                if ( last ) {
+                                    /* If last is set and trietype is not NOTHING then we have found
+                                     * at least two triable branch sequences in a row of a similar
+                                     * trietype so we can turn them into a trie. If/when we
+                                     * allow NOTHING to start a trie sequence this condition will be
+                                     * required, and it isn't expensive so we leave it in for now. */
+                                    if ( trietype != NOTHING )
+                                        make_trie( pRExC_state,
+                                                startbranch, first, cur, tail, count,
+                                                trietype, depth+1 );
+                                    last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
                                  }
-                                if ( PL_regkind[ OP( noper ) ] == EXACT
+                                if ( noper_trietype
  #ifdef NOJUMPTRIE
                                       && noper_next == tail
  #endif
                                  ){
+                                    /* noper is triable, so we can start a new trie sequence */
                                      count = 1;
                                      first = cur;
-                                    optype = OP( noper );
-                                } else {
+                                    trietype = noper_trietype;
+                                } else if (first) {
+                                    /* if we already saw a first but the current node is not triable then we have
+                                     * to reset the first information. */
                                      count = 0;
                                      first = NULL;
-                                    optype = 0;
+                                    trietype = 0;
                                  }
-                                last = NULL;
-                            }
-                        }
+                            } /* end handle unmergable node */
+                        } /* loop over branches */
                          DEBUG_OPTIMISE_r({
                              regprop(RExC_rx, mysv, cur);
                              PerlIO_printf( Perl_debug_log,
@@ -3318,9 +3406,11 @@ used in tries, so that would have to be updated if this changed
                                "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
  
                          });
-                        
-                        if ( last && TRIE_TYPE_IS_SAFE ) {
-                            made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
+                        if ( last && trietype != NOTHING ) {
+                            /* the last branch of the sequence was part of a trie,
+                             * so we have to construct it here outside of the loop
+                             */
+                            made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
  #ifdef TRIE_STUDY_OPT
                              if ( ((made == MADE_EXACT_TRIE && 
                                   startbranch == first) 
@@ -3334,8 +3424,8 @@ used in tries, so that would have to be updated if this changed
                                  }
                              }
  #endif
-                        }
-                    }
+                        } /* end if ( last) */
+                    } /* TRIE_MAXBUF is non zero */
                      
                  } /* do trie */
                  
@@ -3408,8 +3498,8 @@ used in tries, so that would have to be updated if this changed
             UV uc;
             if (UTF) {
                 const U8 * const s = (U8*)STRING(scan);
+               uc = utf8_to_uvchr_buf(s, s + l, NULL);
                 l = utf8_length(s, s + l);
-               uc = utf8_to_uvchr(s, NULL);
             } else {
                 uc = *((U8*)STRING(scan));
             }
@@ -3472,7 +3562,7 @@ used in tries, so that would have to be updated if this changed
                      * elsewhere that does compute the fold settles down, it
                      * can be extracted out and re-used here */
                     for (i = 0; i < 256; i++){
-                       if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
+                       if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
                             ANYOF_BITMAP_SET(data->start_class, i);
                         }
                     }
@@ -3503,8 +3593,8 @@ used in tries, so that would have to be updated if this changed
             }
             if (UTF) {
                 const U8 * const s = (U8 *)STRING(scan);
+               uc = utf8_to_uvchr_buf(s, s + l, NULL);
                 l = utf8_length(s, s + l);
-               uc = utf8_to_uvchr(s, NULL);
             }
             else if (has_exactf_sharp_s) {
                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
@@ -4813,6 +4903,64 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
  
      DEBUG_r(if (!PL_colorset) reginitcolors());
  
+#ifndef PERL_IN_XSUB_RE
+    /* Initialize these here instead of as-needed, as is quick and avoids
+     * having to test them each time otherwise */
+    if (! PL_AboveLatin1) {
+       PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
+       PL_ASCII = _new_invlist_C_array(ASCII_invlist);
+       PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
+
+       PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
+       PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
+
+       PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
+       PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
+
+       PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
+       PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
+
+       PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
+
+       PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
+       PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
+
+       PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
+
+       PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
+       PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
+
+       PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
+       PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
+
+       PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
+       PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
+
+       PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
+       PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
+
+       PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
+       PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
+
+       PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
+       PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
+
+       PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
+       PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
+
+       PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
+       PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
+
+       PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
+
+       PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
+       PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
+
+       PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
+       PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
+    }
+#endif
+
      exp = SvPV(pattern, plen);
  
      if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
@@ -6140,7 +6288,19 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
  #define INVLIST_LEN_OFFSET 0   /* Number of elements in the inversion list */
  #define INVLIST_ITER_OFFSET 1  /* Current iteration position */
  
-#define INVLIST_ZERO_OFFSET 2  /* 0 or 1; must be last element in header */
+/* This is a combination of a version and data structure type, so that one
+ * being passed in can be validated to be an inversion list of the correct
+ * vintage.  When the structure of the header is changed, a new random number
+ * in the range 2**31-1 should be generated and the new() method changed to
+ * insert that at this location.  Then, if an auxiliary program doesn't change
+ * correspondingly, it will be discovered immediately */
+#define INVLIST_VERSION_ID_OFFSET 2
+#define INVLIST_VERSION_ID 1064334010
+
+/* For safety, when adding new elements, remember to #undef them at the end of
+ * the inversion list code section */
+
+#define INVLIST_ZERO_OFFSET 3  /* 0 or 1; must be last element in header */
  /* The UV at position ZERO contains either 0 or 1.  If 0, the inversion list
   * contains the code point U+00000, and begins here.  If 1, the inversion list
   * doesn't contain U+0000, and it begins at the next UV in the array.
@@ -6300,10 +6460,39 @@ Perl__new_invlist(pTHX_ IV initial_size)
       * properly */
      *get_invlist_zero_addr(new_list) = UV_MAX;
  
+    *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
+#if HEADER_LENGTH != 4
+#   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
+#endif
+
      return new_list;
  }
  #endif
  
+STATIC SV*
+S__new_invlist_C_array(pTHX_ UV* list)
+{
+    /* Return a pointer to a newly constructed inversion list, initialized to
+     * point to <list>, which has to be in the exact correct inversion list
+     * form, including internal fields.  Thus this is a dangerous routine that
+     * should not be used in the wrong hands */
+
+    SV* invlist = newSV_type(SVt_PV);
+
+    PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
+
+    SvPV_set(invlist, (char *) list);
+    SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
+                              shouldn't touch it */
+    SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
+
+    if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
+        Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
+    }
+
+    return invlist;
+}
+
  STATIC void
  S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
  {
@@ -6330,9 +6519,10 @@ S_invlist_trim(pTHX_ SV* const invlist)
  #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
  #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
  
-#ifndef PERL_IN_XSUB_RE
-void
-Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
+#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
+
+STATIC void
+S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
  {
     /* Subject to change or removal.  Append the range from 'start' to 'end' at
      * the end of the inversion list.  The range must be above any existing
@@ -6410,6 +6600,8 @@ Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV
      }
  }
  
+#ifndef PERL_IN_XSUB_RE
+
  STATIC IV
  S_invlist_search(pTHX_ SV* const invlist, const UV cp)
  {
@@ -6535,13 +6727,16 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV
      return;
  }
  
+
  void
-Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
+Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
  {
      /* Take the union of two inversion lists and point <output> to it.  *output
       * should be defined upon input, and if it points to one of the two lists,
       * the reference count to that list will be decremented.  The first list,
       * <a>, may be NULL, in which case a copy of the second list is returned.
+     * If <complement_b> is TRUE, the union is taken of the complement
+     * (inversion) of <b> instead of b itself.
       *
       * The basis for this comes from "Unicode Demystified" Chapter 13 by
       * Richard Gillam, published by Addison-Wesley, and explained at some
@@ -6577,7 +6772,7 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
       */
      UV count = 0;
  
-    PERL_ARGS_ASSERT__INVLIST_UNION;
+    PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
      assert(a != b);
  
      /* If either one is empty, the union is the other one */
@@ -6589,6 +6784,9 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
         }
         if (*output != b) {
             *output = invlist_clone(b);
+            if (complement_b) {
+                _invlist_invert(*output);
+            }
         } /* else *output already = b; */
         return;
      }
@@ -6596,10 +6794,20 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
         if (*output == b) {
             SvREFCNT_dec(b);
         }
-       if (*output != a) {
-           *output = invlist_clone(a);
-       }
-       /* else *output already = a; */
+
+        /* The complement of an empty list is a list that has everything in it,
+         * so the union with <a> includes everything too */
+        if (complement_b) {
+            if (a == *output) {
+                SvREFCNT_dec(a);
+            }
+            *output = _new_invlist(1);
+            _append_range_to_invlist(*output, 0, UV_MAX);
+        }
+        else if (*output != a) {
+            *output = invlist_clone(a);
+        }
+        /* else *output already = a; */
         return;
      }
  
@@ -6607,6 +6815,31 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
      array_a = invlist_array(a);
      array_b = invlist_array(b);
  
+    /* If are to take the union of 'a' with the complement of b, set it
+     * up so are looking at b's complement. */
+    if (complement_b) {
+
+       /* To complement, we invert: if the first element is 0, remove it.  To
+        * do this, we just pretend the array starts one later, and clear the
+        * flag as we don't have to do anything else later */
+        if (array_b[0] == 0) {
+            array_b++;
+            len_b--;
+            complement_b = FALSE;
+        }
+        else {
+
+            /* But if the first element is not zero, we unshift a 0 before the
+             * array.  The data structure reserves a space for that 0 (which
+             * should be a '1' right now), so physical shifting is unneeded,
+             * but temporarily change that element to 0.  Before exiting the
+             * routine, we must restore the element to '1' */
+            array_b--;
+            len_b++;
+            array_b[0] = 0;
+        }
+    }
+
      /* Size the union for the worst case: that the sets are completely
       * disjoint */
      u = _new_invlist(len_a + len_b);
@@ -6725,6 +6958,11 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
         SvREFCNT_dec(*output);
      }
  
+    /* If we've changed b, restore it */
+    if (complement_b) {
+        array_b[0] = 1;
+    }
+
      *output = u;
      return;
  }
@@ -6950,10 +7188,8 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
      return;
  }
  
-#endif
-
-STATIC SV*
-S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
+SV*
+Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
  {
      /* Add the range from 'start' to 'end' inclusive to the inversion list's
       * set.  A pointer to the inversion list is returned.  This may actually be
@@ -6994,9 +7230,11 @@ S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
      return invlist;
  }
  
+#endif
+
  PERL_STATIC_INLINE SV*
  S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
-    return add_range_to_invlist(invlist, cp, cp);
+    return _add_range_to_invlist(invlist, cp, cp);
  }
  
  #ifndef PERL_IN_XSUB_RE
@@ -7098,6 +7336,16 @@ S_get_invlist_iter_addr(pTHX_ SV* invlist)
      return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
  }
  
+PERL_STATIC_INLINE UV*
+S_get_invlist_version_id_addr(pTHX_ SV* invlist)
+{
+    /* Return the address of the UV that contains the version id. */
+
+    PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
+
+    return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
+}
+
  PERL_STATIC_INLINE void
  S_invlist_iterinit(pTHX_ SV* invlist)  /* Initialize iterator for invlist */
  {
@@ -7203,6 +7451,7 @@ S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
  #undef INVLIST_LEN_OFFSET
  #undef INVLIST_ZERO_OFFSET
  #undef INVLIST_ITER_OFFSET
+#undef INVLIST_VERSION_ID
  
  /* End of inversion list object */
  
@@ -7849,9 +8098,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                  U32 posflags = 0, negflags = 0;
                 U32 *flagsp = &posflags;
                  char has_charset_modifier = '\0';
-               regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
-                                   ? REGEX_UNICODE_CHARSET
-                                   : REGEX_DEPENDS_CHARSET;
+               regex_charset cs = get_regex_charset(RExC_flags);
+               if (cs == REGEX_DEPENDS_CHARSET
+                   && (RExC_utf8 || RExC_uni_semantics))
+               {
+                   cs = REGEX_UNICODE_CHARSET;
+               }
  
                 while (*RExC_parse) {
                     /* && strchr("iogcmsx", *RExC_parse) */
@@ -8825,12 +9077,6 @@ tryagain:
         vFAIL("Internal urp");
                                 /* Supposed to be caught earlier. */
         break;
-    case '{':
-       if (!regcurly(RExC_parse)) {
-           RExC_parse++;
-           goto defchar;
-       }
-       /* FALL THROUGH */
      case '?':
      case '+':
      case '*':
@@ -8956,9 +9202,6 @@ tryagain:
             ret = reg_node(pRExC_state, op);
             FLAGS(ret) = get_regex_charset(RExC_flags);
             *flagp |= SIMPLE;
-           if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
-               ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
-           }
             goto finish_meta_pat;
         case 'B':
             RExC_seen_zerolen++;
@@ -8983,9 +9226,6 @@ tryagain:
             ret = reg_node(pRExC_state, op);
             FLAGS(ret) = get_regex_charset(RExC_flags);
             *flagp |= SIMPLE;
-           if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
-               ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
-           }
             goto finish_meta_pat;
         case 's':
             switch (get_regex_charset(RExC_flags)) {
@@ -9492,15 +9732,22 @@ tryagain:
                         /* FALL THROUGH */
                     default:
                         if (!SIZE_ONLY&& isALPHA(*p)) {
-                           /* Include any { following the alpha to emphasize
-                            * that it could be part of an escape at some point
-                            * in the future */
-                           int len = (*(p + 1) == '{') ? 2 : 1;
-                           ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
+                           ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
                         }
                         goto normal_default;
                     }
                     break;
+               case '{':
+                   /* Currently we don't warn when the lbrace is at the start
+                    * of a construct.  This catches it in the middle of a
+                    * literal string, or when its the first thing after
+                    * something like "\b" */
+                   if (! SIZE_ONLY
+                       && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
+                   {
+                       ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
+                   }
+                   /*FALLTHROUGH*/
                 default:
                   normal_default:
                     if (UTF8_IS_START(*p) && UTF) {
@@ -9589,7 +9836,10 @@ tryagain:
                               for (foldbuf = tmpbuf;
                                    foldlen;
                                    foldlen -= numlen) {
-                                  ender = utf8_to_uvchr(foldbuf, &numlen);
+
+                                  /* tmpbuf has been constructed by us, so we
+                                   * know it is valid utf8 */
+                                  ender = valid_utf8_to_uvchr(foldbuf, &numlen);
                                    if (numlen > 0) {
                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
                                         s       += unilen;
@@ -9625,7 +9875,7 @@ tryagain:
                           for (foldbuf = tmpbuf;
                                foldlen;
                                foldlen -= numlen) {
-                              ender = utf8_to_uvchr(foldbuf, &numlen);
+                              ender = valid_utf8_to_uvchr(foldbuf, &numlen);
                                if (numlen > 0) {
                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
                                     len     += unilen;
@@ -9872,91 +10122,136 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
      }
  }
  
-/* No locale test, and always Unicode semantics, no ignore-case differences */
-#define _C_C_T_NOLOC_(NAME,TEST,WORD)                                          \
-ANYOF_##NAME:                                                                  \
-       for (value = 0; value < 256; value++)                                  \
-           if (TEST)                                                          \
-           stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
-    yesno = '+';                                                               \
-    what = WORD;                                                               \
-    break;                                                                     \
-case ANYOF_N##NAME:                                                            \
-       for (value = 0; value < 256; value++)                                  \
-           if (!TEST)                                                         \
-           stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
-    yesno = '!';                                                               \
-    what = WORD;                                                               \
-    break
-
-/* Like the above, but there are differences if we are in uni-8-bit or not, so
- * there are two tests passed in, to use depending on that. There aren't any
- * cases where the label is different from the name, so no need for that
- * parameter.
- * Sets 'what' to WORD which is the property name for non-bitmap code points;
- * But, uses FOLD_WORD instead if /i has been selected, to allow a different
- * property name */
-#define _C_C_T_(NAME, TEST_8, TEST_7, WORD, FOLD_WORD)                         \
-ANYOF_##NAME:                                                                  \
-    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);                               \
-    else if (UNI_SEMANTICS) {                                                  \
-        for (value = 0; value < 256; value++) {                                \
-            if (TEST_8(value)) stored +=                                       \
-                      set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
-        }                                                                      \
-    }                                                                          \
-    else {                                                                     \
-        for (value = 0; value < 128; value++) {                                \
-            if (TEST_7(UNI_TO_NATIVE(value))) stored +=                        \
-               set_regclass_bit(pRExC_state, ret,                     \
-                                  (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);                 \
-        }                                                                      \
-    }                                                                          \
-    yesno = '+';                                                               \
-    if (FOLD) {                                                                \
-        what = FOLD_WORD;                                                      \
-    }                                                                          \
-    else {                                                                     \
-        what = WORD;                                                           \
-    }                                                                          \
-    break;                                                                     \
-case ANYOF_N##NAME:                                                            \
-    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);                              \
-    else if (UNI_SEMANTICS) {                                                  \
-        for (value = 0; value < 256; value++) {                                \
-            if (! TEST_8(value)) stored +=                                     \
-                   set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);    \
-        }                                                                      \
-    }                                                                          \
-    else {                                                                     \
-        for (value = 0; value < 128; value++) {                                \
-            if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit(  \
-                       pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);    \
-        }                                                                      \
-       if (AT_LEAST_ASCII_RESTRICTED) {                                       \
-           for (value = 128; value < 256; value++) {                          \
-             stored += set_regclass_bit(                                     \
-                          pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
-           }                                                                  \
-           ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;                             \
-       }                                                                      \
-       else {                                                                 \
-           /* For a non-ut8 target string with DEPENDS semantics, all above   \
-            * ASCII Latin1 code points match the complement of any of the     \
-            * classes.  But in utf8, they have their Unicode semantics, so    \
-            * can't just set them in the bitmap, or else regexec.c will think \
-            * they matched when they shouldn't. */                            \
-           ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;                     \
-       }                                                                      \
-    }                                                                          \
-    yesno = '!';                                                               \
-    if (FOLD) {                                                                \
-        what = FOLD_WORD;                                                      \
-    }                                                                          \
-    else {                                                                     \
-        what = WORD;                                                           \
-    }                                                                          \
-    break
+/* Generate the code to add a full posix character <class> to the bracketed
+ * character class given by <node>.  (<node> is needed only under locale rules)
+ * destlist     is the inversion list for non-locale rules that this class is
+ *              to be added to
+ * sourcelist   is the ASCII-range inversion list to add under /a rules
+ * Xsourcelist  is the full Unicode range list to use otherwise. */
+#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist)           \
+    if (LOC) {                                                             \
+       SV* scratch_list = NULL;                                           \
+                                                                           \
+        /* Set this class in the node for runtime matching */              \
+        ANYOF_CLASS_SET(node, class);                                      \
+                                                                           \
+        /* For above Latin1 code points, we use the full Unicode range */  \
+        _invlist_intersection(PL_AboveLatin1,                              \
+                              Xsourcelist,                                 \
+                              &scratch_list);                              \
+        /* And set the output to it, adding instead if there already is an \
+        * output.  Checking if <destlist> is NULL first saves an extra    \
+        * clone.  Its reference count will be decremented at the next     \
+        * union, etc, or if this is the only instance, at the end of the  \
+        * routine */                                                      \
+        if (! destlist) {                                                  \
+            destlist = scratch_list;                                       \
+        }                                                                  \
+        else {                                                             \
+            _invlist_union(destlist, scratch_list, &destlist);             \
+            SvREFCNT_dec(scratch_list);                                    \
+        }                                                                  \
+    }                                                                      \
+    else {                                                                 \
+        /* For non-locale, just add it to any existing list */             \
+        _invlist_union(destlist,                                           \
+                       (AT_LEAST_ASCII_RESTRICTED)                         \
+                           ? sourcelist                                    \
+                           : Xsourcelist,                                  \
+                       &destlist);                                         \
+    }
+
+/* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
+ */
+#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist)         \
+    if (LOC) {                                                             \
+        SV* scratch_list = NULL;                                           \
+        ANYOF_CLASS_SET(node, class);                                     \
+        _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list);    \
+        if (! destlist) {                                                 \
+            destlist = scratch_list;                                      \
+        }                                                                  \
+        else {                                                             \
+            _invlist_union(destlist, scratch_list, &destlist);             \
+            SvREFCNT_dec(scratch_list);                                    \
+        }                                                                  \
+    }                                                                      \
+    else {                                                                 \
+        _invlist_union_complement_2nd(destlist,                            \
+                                    (AT_LEAST_ASCII_RESTRICTED)            \
+                                        ? sourcelist                       \
+                                        : Xsourcelist,                     \
+                                    &destlist);                            \
+        /* Under /d, everything in the upper half of the Latin1 range      \
+         * matches this complement */                                      \
+        if (DEPENDS_SEMANTICS) {                                           \
+            ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;                \
+        }                                                                  \
+    }
+
+/* Generate the code to add a posix character <class> to the bracketed
+ * character class given by <node>.  (<node> is needed only under locale rules)
+ * destlist       is the inversion list for non-locale rules that this class is
+ *                to be added to
+ * sourcelist     is the ASCII-range inversion list to add under /a rules
+ * l1_sourcelist  is the Latin1 range list to use otherwise.
+ * Xpropertyname  is the name to add to <run_time_list> of the property to
+ *                specify the code points above Latin1 that will have to be
+ *                determined at run-time
+ * run_time_list  is a SV* that contains text names of properties that are to
+ *                be computed at run time.  This concatenates <Xpropertyname>
+ *                to it, apppropriately
+ * This is essentially DO_POSIX, but we know only the Latin1 values at compile
+ * time */
+#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,      \
+                              l1_sourcelist, Xpropertyname, run_time_list) \
+       /* First, resolve whether to use the ASCII-only list or the L1     \
+        * list */                                                         \
+        DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist,      \
+                ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
+                Xpropertyname, run_time_list)
+
+#define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
+                Xpropertyname, run_time_list)                              \
+    /* If not /a matching, there are going to be code points we will have  \
+     * to defer to runtime to look-up */                                   \
+    if (! AT_LEAST_ASCII_RESTRICTED) {                                     \
+        Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
+    }                                                                      \
+    if (LOC) {                                                             \
+        ANYOF_CLASS_SET(node, class);                                      \
+    }                                                                      \
+    else {                                                                 \
+        _invlist_union(destlist, sourcelist, &destlist);                   \
+    }
+
+/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement.  A combination of
+ * this and DO_N_POSIX */
+#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,    \
+                              l1_sourcelist, Xpropertyname, run_time_list) \
+    if (AT_LEAST_ASCII_RESTRICTED) {                                       \
+        _invlist_union_complement_2nd(destlist, sourcelist, &destlist);    \
+    }                                                                      \
+    else {                                                                 \
+        Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
+       if (LOC) {                                                         \
+           ANYOF_CLASS_SET(node, namedclass);                             \
+       }                                                                  \
+       else {                                                             \
+            SV* scratch_list = NULL;                                       \
+           _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list);    \
+           if (! destlist) {                                              \
+               destlist = scratch_list;                                   \
+           }                                                              \
+           else {                                                         \
+               _invlist_union(destlist, scratch_list, &destlist);         \
+               SvREFCNT_dec(scratch_list);                                \
+           }                                                              \
+           if (DEPENDS_SEMANTICS) {                                       \
+               ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;            \
+           }                                                              \
+       }                                                                  \
+    }
  
  STATIC U8
  S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
@@ -10433,18 +10728,7 @@ parseit:
  
                          /* Invert if asking for the complement */
                          if (value == 'P') {
-
-                           /* Add to any existing list */
-                           if (! properties) {
-                               properties = invlist_clone(invlist);
-                               _invlist_invert(properties);
-                           }
-                           else {
-                               invlist = invlist_clone(invlist);
-                               _invlist_invert(invlist);
-                               _invlist_union(properties, invlist, &properties);
-                               SvREFCNT_dec(invlist);
-                           }
+                           _invlist_union_complement_2nd(properties, invlist, &properties);
  
                              /* The swash can't be used as-is, because we've
                              * inverted things; delay removing it to here after
@@ -10597,85 +10881,212 @@ parseit:
             }
  
             if (!SIZE_ONLY) {
-               const char *what = NULL;
-               char yesno = 0;
  
                 /* Possible truncation here but in some 64-bit environments
                  * the compiler gets heartburn about switch on 64-bit values.
                  * A similar issue a little earlier when switching on value.
                  * --jhi */
                 switch ((I32)namedclass) {
-               
-               case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum", "XPosixAlnum");
-               case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha", "XPosixAlpha");
-               case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank", "XPosixBlank");
-               case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl", "XPosixCntrl");
-               case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph", "XPosixGraph");
-               case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower", "__XPosixLower_i");
-               case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint", "XPosixPrint");
-               case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace", "XPosixSpace");
-               case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct", "XPosixPunct");
-               case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper", "__XPosixUpper_i");
-                /* \s, \w match all unicode if utf8. */
-                case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl", "SpacePerl");
-                case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word", "Word");
-               case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit", "XPosixXDigit");
-               case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
-               case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
+
+               case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
+                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                        PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
+                   break;
+               case ANYOF_NALNUMC:
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                        PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
+                   break;
+               case ANYOF_ALPHA:
+                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                        PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
+                   break;
+               case ANYOF_NALPHA:
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                        PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
+                   break;
                 case ANYOF_ASCII:
-                   if (LOC)
-                       ANYOF_CLASS_SET(ret, ANYOF_ASCII);
-                   else {
-                       for (value = 0; value < 128; value++)
-                           stored +=
-                              set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
+                   if (LOC) {
+                       ANYOF_CLASS_SET(ret, namedclass);
                     }
-                   yesno = '+';
-                   what = NULL;        /* Doesn't match outside ascii, so
-                                          don't want to add +utf8:: */
+                    else {
+                        _invlist_union(properties, PL_ASCII, &properties);
+                    }
                     break;
                 case ANYOF_NASCII:
-                   if (LOC)
-                       ANYOF_CLASS_SET(ret, ANYOF_NASCII);
-                   else {
-                       for (value = 128; value < 256; value++)
-                           stored +=
-                              set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
+                   if (LOC) {
+                       ANYOF_CLASS_SET(ret, namedclass);
                     }
-                   ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
-                   yesno = '!';
-                   what = "ASCII";
-                   break;              
+                    else {
+                        _invlist_union_complement_2nd(properties,
+                                                    PL_ASCII, &properties);
+                        if (DEPENDS_SEMANTICS) {
+                            ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
+                        }
+                    }
+                   break;
+               case ANYOF_BLANK:
+                    DO_POSIX(ret, namedclass, properties,
+                                            PL_PosixBlank, PL_XPosixBlank);
+                   break;
+               case ANYOF_NBLANK:
+                    DO_N_POSIX(ret, namedclass, properties,
+                                            PL_PosixBlank, PL_XPosixBlank);
+                   break;
+               case ANYOF_CNTRL:
+                    DO_POSIX(ret, namedclass, properties,
+                                            PL_PosixCntrl, PL_XPosixCntrl);
+                   break;
+               case ANYOF_NCNTRL:
+                    DO_N_POSIX(ret, namedclass, properties,
+                                            PL_PosixCntrl, PL_XPosixCntrl);
+                   break;
                 case ANYOF_DIGIT:
-                   if (LOC)
-                       ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
+                   /* There are no digits in the Latin1 range outside of
+                    * ASCII, so call the macro that doesn't have to resolve
+                    * them */
+                   DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
+                        PL_PosixDigit, "XPosixDigit", listsv);
+                   break;
+               case ANYOF_NDIGIT:
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                        PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
+                   break;
+               case ANYOF_GRAPH:
+                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                        PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
+                   break;
+               case ANYOF_NGRAPH:
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                        PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
+                   break;
+               case ANYOF_HORIZWS:
+                   /* For these, we use the nonbitmap, as /d doesn't make a
+                    * difference in what these match.  There would be problems
+                    * if these characters had folds other than themselves, as
+                    * nonbitmap is subject to folding.  It turns out that \h
+                    * is just a synonym for XPosixBlank */
+                   _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
+                   break;
+               case ANYOF_NHORIZWS:
+                    _invlist_union_complement_2nd(nonbitmap,
+                                                 PL_XPosixBlank, &nonbitmap);
+                   break;
+               case ANYOF_LOWER:
+               case ANYOF_NLOWER:
+                {   /* These require special handling, as they differ under
+                      folding, matching Cased there (which in the ASCII range
+                      is the same as Alpha */
+
+                   SV* ascii_source;
+                   SV* l1_source;
+                   const char *Xname;
+
+                   if (FOLD && ! LOC) {
+                       ascii_source = PL_PosixAlpha;
+                       l1_source = PL_L1Cased;
+                       Xname = "Cased";
+                   }
+                   else {
+                       ascii_source = PL_PosixLower;
+                       l1_source = PL_L1PosixLower;
+                       Xname = "XPosixLower";
+                   }
+                   if (namedclass == ANYOF_LOWER) {
+                       DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                                    ascii_source, l1_source, Xname, listsv);
+                   }
                     else {
-                       /* consecutive digits assumed */
-                       for (value = '0'; value <= '9'; value++)
-                           stored +=
-                              set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
+                       DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
+                            properties, ascii_source, l1_source, Xname, listsv);
                     }
-                   yesno = '+';
-                   what = "Digit";
                     break;
-               case ANYOF_NDIGIT:
-                   if (LOC)
-                       ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
+               }
+               case ANYOF_PRINT:
+                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                        PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
+                   break;
+               case ANYOF_NPRINT:
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                        PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
+                   break;
+               case ANYOF_PUNCT:
+                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                        PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
+                   break;
+               case ANYOF_NPUNCT:
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                        PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
+                   break;
+               case ANYOF_PSXSPC:
+                    DO_POSIX(ret, namedclass, properties,
+                                            PL_PosixSpace, PL_XPosixSpace);
+                   break;
+               case ANYOF_NPSXSPC:
+                    DO_N_POSIX(ret, namedclass, properties,
+                                            PL_PosixSpace, PL_XPosixSpace);
+                   break;
+               case ANYOF_SPACE:
+                    DO_POSIX(ret, namedclass, properties,
+                                            PL_PerlSpace, PL_XPerlSpace);
+                   break;
+               case ANYOF_NSPACE:
+                    DO_N_POSIX(ret, namedclass, properties,
+                                            PL_PerlSpace, PL_XPerlSpace);
+                   break;
+               case ANYOF_UPPER:   /* Same as LOWER, above */
+               case ANYOF_NUPPER:
+               {
+                   SV* ascii_source;
+                   SV* l1_source;
+                   const char *Xname;
+
+                   if (FOLD && ! LOC) {
+                       ascii_source = PL_PosixAlpha;
+                       l1_source = PL_L1Cased;
+                       Xname = "Cased";
+                   }
                     else {
-                       /* consecutive digits assumed */
-                       for (value = 0; value < '0'; value++)
-                           stored +=
-                              set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
-                       for (value = '9' + 1; value < 256; value++)
-                           stored +=
-                              set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
+                       ascii_source = PL_PosixUpper;
+                       l1_source = PL_L1PosixUpper;
+                       Xname = "XPosixUpper";
                     }
-                   yesno = '!';
-                   what = "Digit";
-                   if (AT_LEAST_ASCII_RESTRICTED ) {
-                       ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
+                   if (namedclass == ANYOF_UPPER) {
+                       DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                                    ascii_source, l1_source, Xname, listsv);
                     }
-                   break;              
+                   else {
+                       DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
+                        properties, ascii_source, l1_source, Xname, listsv);
+                   }
+                   break;
+               }
+               case ANYOF_ALNUM:   /* Really is 'Word' */
+                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                            PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+                   break;
+               case ANYOF_NALNUM:
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                            PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+                   break;
+               case ANYOF_VERTWS:
+                   /* For these, we use the nonbitmap, as /d doesn't make a
+                    * difference in what these match.  There would be problems
+                    * if these characters had folds other than themselves, as
+                    * nonbitmap is subject to folding */
+                   _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
+                   break;
+               case ANYOF_NVERTWS:
+                    _invlist_union_complement_2nd(nonbitmap,
+                                                    PL_VertSpace, &nonbitmap);
+                   break;
+               case ANYOF_XDIGIT:
+                    DO_POSIX(ret, namedclass, properties,
+                                            PL_PosixXDigit, PL_XPosixXDigit);
+                   break;
+               case ANYOF_NXDIGIT:
+                    DO_N_POSIX(ret, namedclass, properties,
+                                            PL_PosixXDigit, PL_XPosixXDigit);
+                   break;
                 case ANYOF_MAX:
                     /* this is to handle \p and \P */
                     break;
@@ -10683,10 +11094,6 @@ parseit:
                     vFAIL("Invalid [::] class");
                     break;
                 }
-               if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
-                   /* Strings such as "+utf8::isWord\n" */
-                   Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", yesno, what);
-               }
  
                 continue;
             }
@@ -10767,7 +11174,7 @@ parseit:
           if (value > 255) {
             const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
             const UV natvalue      = NATIVE_TO_UNI(value);
-           nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
+           nonbitmap = _add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
         }
  #ifdef EBCDIC
             literal_endpoint = 0;
@@ -10844,7 +11251,8 @@ parseit:
                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
                 STRLEN foldlen;
                 const UV f =
-                    _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
+                    _to_uni_fold_flags(j, foldbuf, &foldlen,
+                                       (allow_full_fold) ? FOLD_FLAGS_FULL : 0);
  
                 if (foldlen > (STRLEN)UNISKIP(f)) {
  
@@ -11024,13 +11432,14 @@ parseit:
             }
         }
  
-       /* Done with loop; set <nonbitmap> to not include any code points that
-        * are in the bitmap */
+        /* Done with loop; remove any code points that are in the bitmap from
+         * <nonbitmap> */
         if (change_invlist) {
-           SV* keep_list = _new_invlist(2);
-           _append_range_to_invlist(keep_list, max_cp_to_set + 1, UV_MAX);
-           _invlist_intersection(nonbitmap, keep_list, &nonbitmap);
-           SvREFCNT_dec(keep_list);
+           _invlist_subtract(nonbitmap,
+                             (DEPENDS_SEMANTICS)
+                               ? PL_ASCII
+                               : PL_Latin1,
+                              &nonbitmap);
         }
  
         /* If have completely emptied it, remove it completely */
@@ -11117,6 +11526,12 @@ parseit:
              * there should not be overlap unless is /d rules. */
             _invlist_invert(nonbitmap);
  
+           /* Any swash can't be used as-is, because we've inverted things */
+           if (swash) {
+               SvREFCNT_dec(swash);
+               swash = NULL;
+           }
+
             for (i = 0; i < 256; ++i) {
                 if (ANYOF_BITMAP_TEST(ret, i)) {
                     ANYOF_BITMAP_CLEAR(ret, i);
@@ -11144,10 +11559,7 @@ parseit:
             else {
                 /* There is no overlap for non-/d, so just delete anything
                  * below 256 */
-               SV* keep_list = _new_invlist(2);
-               _append_range_to_invlist(keep_list, 256, UV_MAX);
-               _invlist_intersection(nonbitmap, keep_list, &nonbitmap);
-               SvREFCNT_dec(keep_list);
+               _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
             }
         }
  
@@ -11329,7 +11741,6 @@ parseit:
      }
      return ret;
  }
-#undef _C_C_T_
  
  
  /* reg_skipcomment()
@@ -11713,7 +12124,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
                  case EXACTFA:
                  case EXACTFU:
                  case EXACTFU_SS:
-                case EXACTFU_NO_TRIE:
+                case EXACTFU_TRICKYFOLD:
                  case EXACTFL:
                          if( exact == PSEUDO )
                              exact= OP(scan);
@@ -13034,8 +13445,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
   * Local variables:
   * c-indentation-style: bsd
   * c-basic-offset: 4
- * indent-tabs-mode: t
+ * indent-tabs-mode: nil
   * End:
   *
- * ex: set ts=8 sts=4 sw=4 noet:
+ * ex: set ts=8 sts=4 sw=4 et:
   */