This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
remove redundant lex_end
[perl5.git] / regcomp.c
index 1a815c6..e0f65fa 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -85,6 +85,8 @@
 #  include "regcomp.h"
 #endif
 
+#include "dquote_static.c"
+
 #ifdef op
 #undef op
 #endif /* op */
@@ -196,7 +198,10 @@ typedef struct RExC_state_t {
  */
 #define        WORST           0       /* Worst case. */
 #define        HASWIDTH        0x01    /* Known to match non-null strings. */
-#define        SIMPLE          0x02    /* Simple enough to be STAR/PLUS operand. */
+
+/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
+ * character, and if utf8, must be invariant. */
+#define        SIMPLE          0x02
 #define        SPSTART         0x04    /* Starts with * or +. */
 #define TRYAGAIN       0x08    /* Weeded out a declaration. */
 #define POSTPONED      0x10    /* (?1),(?&name), (??{...}) or similar */
@@ -218,6 +223,11 @@ typedef struct RExC_state_t {
 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 
+/* If not already in utf8, do a longjmp back to the beginning */
+#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
+#define REQUIRE_UTF8   STMT_START {                                       \
+                                     if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
+                        } STMT_END
 
 /* About scan_data_t.
 
@@ -358,9 +368,10 @@ static const scan_data_t zero_scan_data =
 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 #define SCF_SEEN_ACCEPT         0x8000 
 
-#define UTF (RExC_utf8 != 0)
-#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
-#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
+#define UTF cBOOL(RExC_utf8)
+#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
+#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
+#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 
 #define OOB_UNICODE            12345678
 #define OOB_NAMEDCLASS         -1
@@ -870,7 +881,7 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
   Dumps the final compressed table form of the trie to Perl_debug_log.
   Used for debugging make_trie().
 */
+
 STATIC void
 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
            AV *revcharmap, U32 depth)
@@ -878,6 +889,7 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
     U32 state;
     SV *sv=sv_newmortal();
     int colwidth= widecharmap ? 6 : 4;
+    U16 word;
     GET_RE_DEBUG_FLAGS_DECL;
 
     PERL_ARGS_ASSERT_DUMP_TRIE;
@@ -947,6 +959,13 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
         }
         PerlIO_printf( Perl_debug_log, "\n" );
     }
+    PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
+    for (word=1; word <= trie->wordcount; word++) {
+       PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
+           (int)word, (int)(trie->wordinfo[word].prev),
+           (int)(trie->wordinfo[word].len));
+    }
+    PerlIO_printf(Perl_debug_log, "\n" );
 }    
 /*
   Dumps a fully constructed but uncompressed trie in list form.
@@ -1077,6 +1096,7 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
 
 #endif
 
+
 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
   startbranch: the first branch in the whole branch sequence
   first      : start branch of sequence of branch-exact nodes.
@@ -1257,8 +1277,6 @@ is the recommended Unicode-aware way of saying
     U16 dupe= trie->states[ state ].wordnum;                    \
     regnode * const noper_next = regnext( noper );              \
                                                                 \
-    if (trie->wordlen)                                          \
-        trie->wordlen[ curword ] = wordlen;                     \
     DEBUG_r({                                                   \
         /* store the word for dumping */                        \
         SV* tmp;                                                \
@@ -1270,6 +1288,9 @@ is the recommended Unicode-aware way of saying
     });                                                         \
                                                                 \
     curword++;                                                  \
+    trie->wordinfo[curword].prev   = 0;                         \
+    trie->wordinfo[curword].len    = wordlen;                   \
+    trie->wordinfo[curword].accept = state;                     \
                                                                 \
     if ( noper_next < tail ) {                                  \
         if (!trie->jump)                                        \
@@ -1282,16 +1303,11 @@ is the recommended Unicode-aware way of saying
     }                                                           \
                                                                 \
     if ( dupe ) {                                               \
-        /* So it's a dupe. This means we need to maintain a   */\
-        /* linked-list from the first to the next.            */\
-        /* we only allocate the nextword buffer when there    */\
-        /* a dupe, so first time we have to do the allocation */\
-        if (!trie->nextword)                                    \
-            trie->nextword = (U16 *)                                   \
-               PerlMemShared_calloc( word_count + 1, sizeof(U16));     \
-        while ( trie->nextword[dupe] )                          \
-            dupe= trie->nextword[dupe];                         \
-        trie->nextword[dupe]= curword;                          \
+        /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
+        /* chain, so that when the bits of chain are later    */\
+        /* linked together, the dups appear in the chain      */\
+       trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
+       trie->wordinfo[dupe].prev = curword;                    \
     } else {                                                    \
         /* we haven't inserted this word yet.                */ \
         trie->states[ state ].wordnum = curword;                \
@@ -1329,6 +1345,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
     regnode *jumper = NULL;
     regnode *nextbranch = NULL;
     regnode *convert = NULL;
+    U32 *prev_states; /* temp array mapping each state to previous one */
     /* we just use folder as a flag in utf8 */
     const U8 * const folder = ( flags == EXACTF
                        ? PL_fold
@@ -1364,6 +1381,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
     if (!(UTF && folder))
        trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
+    trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
+                       trie->wordcount+1, sizeof(reg_trie_wordinfo));
+
     DEBUG_r({
         trie_words = newAV();
     });
@@ -1496,7 +1516,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
                (int)trie->minlen, (int)trie->maxlen )
     );
-    trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
 
     /*
         We now know what we are dealing with in terms of unique chars and
@@ -1520,6 +1539,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
     */
 
 
+    Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
+    prev_states[1] = 0;
+
     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
         /*
             Second Pass -- Array Of Lists Representation
@@ -1590,6 +1612,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                         }
                         if ( ! newstate ) {
                             newstate = next_alloc++;
+                           prev_states[newstate] = state;
                             TRIE_LIST_PUSH( state, charid, newstate );
                             transcount++;
                         }
@@ -1773,6 +1796,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                         if ( !trie->trans[ state + charid ].next ) {
                             trie->trans[ state + charid ].next = next_alloc;
                             trie->trans[ state ].check++;
+                           prev_states[TRIE_NODENUM(next_alloc)]
+                                   = TRIE_NODENUM(state);
                             next_alloc += trie->uniquecharcount;
                         }
                         state = trie->trans[ state + charid ].next;
@@ -1920,9 +1945,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
        PerlMemShared_realloc( trie->trans, trie->lasttrans
                               * sizeof(reg_trie_trans) );
 
-    /* and now dump out the compressed format */
-    DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
-
     {   /* Modify the program and insert the new TRIE node*/ 
         U8 nodetype =(U8)(flags & 0xFF);
         char *str=NULL;
@@ -2052,6 +2074,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                    break;
                }
            }
+           trie->prefixlen = (state-1);
             if (str) {
                 regnode *n = convert+NODE_SZ_STR(convert);
                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
@@ -2147,6 +2170,42 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
         });
     } /* end node insert */
+
+    /*  Finish populating the prev field of the wordinfo array.  Walk back
+     *  from each accept state until we find another accept state, and if
+     *  so, point the first word's .prev field at the second word. If the
+     *  second already has a .prev field set, stop now. This will be the
+     *  case either if we've already processed that word's accept state,
+     *  or that that state had multiple words, and the overspill words
+     *  were already linked up earlier.
+     */
+    {
+       U16 word;
+       U32 state;
+       U16 prev;
+
+       for (word=1; word <= trie->wordcount; word++) {
+           prev = 0;
+           if (trie->wordinfo[word].prev)
+               continue;
+           state = trie->wordinfo[word].accept;
+           while (state) {
+               state = prev_states[state];
+               if (!state)
+                   break;
+               prev = trie->states[state].wordnum;
+               if (prev)
+                   break;
+           }
+           trie->wordinfo[word].prev = prev;
+       }
+       Safefree(prev_states);
+    }
+
+
+    /* and now dump out the compressed format */
+    DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
+
     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
 #ifdef DEBUGGING
     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
@@ -3068,7 +3127,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            }
            flags &= ~SCF_DO_STCLASS;
        }
-       else if (strchr((const char*)PL_varies,OP(scan))) {
+       else if (REGNODE_VARIES(OP(scan))) {
            I32 mincount, maxcount, minnext, deltanext, fl = 0;
            I32 f = flags, pos_before = 0;
            regnode * const oscan = scan;
@@ -3148,7 +3207,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                /* These are the cases when once a subexpression
                   fails at a particular position, it cannot succeed
                   even after backtracking at the enclosing scope.
-               
+
                   XXXX what if minimal match and we are at the
                        initial run of {n,m}? */
                if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
@@ -3220,7 +3279,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 
                    /* Skip open. */
                    nxt = regnext(nxt);
-                   if (!strchr((const char*)PL_simple,OP(nxt))
+                   if (!REGNODE_SIMPLE(OP(nxt))
                        && !(PL_regkind[OP(nxt)] == EXACT
                             && STR_LEN(nxt) == 1))
                        goto nogo;
@@ -3241,11 +3300,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 
 #ifdef DEBUGGING
                    OP(nxt1 + 1) = OPTIMIZED; /* was count. */
-                   NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
-                   NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
+                   NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
+                   NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
                    OP(nxt) = OPTIMIZED;        /* was CLOSE. */
                    OP(nxt + 1) = OPTIMIZED; /* was count. */
-                   NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
+                   NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
 #endif
                }
              nogo:
@@ -3268,12 +3327,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        nxt = nxt2;
                    OP(nxt2)  = SUCCEED; /* Whas WHILEM */
                    /* Need to optimize away parenths. */
-                   if (data->flags & SF_IN_PAR) {
+                   if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
                        /* Set the parenth number.  */
                        regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
 
-                       if (OP(nxt) != CLOSE)
-                           FAIL("Panic opt close");
                        oscan->flags = (U8)ARG(nxt);
                        if (RExC_open_parens) {
                            RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
@@ -3291,7 +3348,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 #if 0
                        while ( nxt1 && (OP(nxt1) != WHILEM)) {
                            regnode *nnxt = regnext(nxt1);
-                       
                            if (nnxt == nxt) {
                                if (reg_off_by_arg[OP(nxt1)])
                                    ARG_SET(nxt1, nxt2 - nxt1);
@@ -3358,7 +3414,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 
                        if (UTF)
                            old = utf8_hop((U8*)s, old) - (U8*)s;
-                       
                        l -= old;
                        /* Get the added string: */
                        last_str = newSVpvn_utf8(s  + old, l, UTF);
@@ -3446,13 +3501,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                if (flags & SCF_DO_STCLASS_AND) {
                     for (value = 0; value < 256; value++)
                         if (!is_VERTWS_cp(value))
-                            ANYOF_BITMAP_CLEAR(data->start_class, value);  
-                }                                                              
-                else {                                                         
+                            ANYOF_BITMAP_CLEAR(data->start_class, value);
+                }
+                else {
                     for (value = 0; value < 256; value++)
                         if (is_VERTWS_cp(value))
-                            ANYOF_BITMAP_SET(data->start_class, value);           
-                }                                                              
+                            ANYOF_BITMAP_SET(data->start_class, value);
+                }
                 if (flags & SCF_DO_STCLASS_OR)
                    cl_and(data->start_class, and_withp);
                flags &= ~SCF_DO_STCLASS;
@@ -3465,7 +3520,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                data->pos_delta += 1;
                data->longest = &(data->longest_float);
            }
-           
        }
        else if (OP(scan) == FOLDCHAR) {
            int d = ARG(scan)==0xDF ? 1 : 2;
@@ -3479,7 +3533,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                data->longest = &(data->longest_float);
            }
        }
-       else if (strchr((const char*)PL_simple,OP(scan))) {
+       else if (REGNODE_SIMPLE(OP(scan))) {
            int value = 0;
 
            if (flags & SCF_DO_SUBSTR) {
@@ -3523,19 +3577,37 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
-                           for (value = 0; value < 256; value++)
-                               if (!isALNUM(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                            if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isWORDCHAR_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isALNUM(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            }
                        }
                    }
                    else {
                        if (data->start_class->flags & ANYOF_LOCALE)
                            ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (isALNUM(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
-                       }
+                        else if (FLAGS(scan) & USE_UNI) {
+                            for (value = 0; value < 256; value++) {
+                                if (isWORDCHAR_L1(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        } else {
+                            for (value = 0; value < 256; value++) {
+                                if (isALNUM(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        }
                    }
                    break;
                case ALNUML:
@@ -3552,9 +3624,19 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
-                           for (value = 0; value < 256; value++)
-                               if (isALNUM(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                            if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (isWORDCHAR_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (isALNUM(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                           }
                        }
                    }
                    else {
@@ -3563,7 +3645,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        else {
                            for (value = 0; value < 256; value++)
                                if (!isALNUM(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
+                                   ANYOF_BITMAP_SET(data->start_class, value);
                        }
                    }
                    break;
@@ -3581,18 +3663,37 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
-                           for (value = 0; value < 256; value++)
-                               if (!isSPACE(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                           if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isSPACE_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isSPACE(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            }
                        }
                    }
                    else {
-                       if (data->start_class->flags & ANYOF_LOCALE)
+                        if (data->start_class->flags & ANYOF_LOCALE) {
                            ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (isSPACE(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
+                        }
+                        else if (FLAGS(scan) & USE_UNI) {
+                            for (value = 0; value < 256; value++) {
+                                if (isSPACE_L1(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        } else {
+                            for (value = 0; value < 256; value++) {
+                                if (isSPACE(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
                        }
                    }
                    break;
@@ -3610,19 +3711,38 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
-                           for (value = 0; value < 256; value++)
-                               if (isSPACE(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                            if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (isSPACE_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (isSPACE(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            }
                        }
                    }
                    else {
                        if (data->start_class->flags & ANYOF_LOCALE)
                            ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (!isSPACE(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
-                       }
+                        else if (FLAGS(scan) & USE_UNI) {
+                            for (value = 0; value < 256; value++) {
+                                if (!isSPACE_L1(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        }
+                        else {
+                            for (value = 0; value < 256; value++) {
+                                if (!isSPACE(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        }
                    }
                    break;
                case NSPACEL:
@@ -3652,7 +3772,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        else {
                            for (value = 0; value < 256; value++)
                                if (isDIGIT(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
+                                   ANYOF_BITMAP_SET(data->start_class, value);
                        }
                    }
                    break;
@@ -3669,7 +3789,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        else {
                            for (value = 0; value < 256; value++)
                                if (!isDIGIT(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
+                                   ANYOF_BITMAP_SET(data->start_class, value);
                        }
                    }
                    break;
@@ -4216,18 +4336,20 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
     struct regexp *r;
     register regexp_internal *ri;
     STRLEN plen;
-    char  *exp = SvPV(pattern, plen);
-    char* xend = exp + plen;
+    char  *exp;
+    char* xend;
     regnode *scan;
     I32 flags;
     I32 minlen = 0;
     I32 sawplus = 0;
     I32 sawopen = 0;
+    U8 jump_ret = 0;
+    dJMPENV;
     scan_data_t data;
     RExC_state_t RExC_state;
     RExC_state_t * const pRExC_state = &RExC_state;
 #ifdef TRIE_STUDY_OPT    
-    int restudied= 0;
+    int restudied;
     RExC_state_t copyRExC_state;
 #endif    
     GET_RE_DEBUG_FLAGS_DECL;
@@ -4238,15 +4360,56 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
 
     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
 
-    DEBUG_COMPILE_r({
-        SV *dsv= sv_newmortal();
-        RE_PV_QUOTED_DECL(s, RExC_utf8,
-            dsv, exp, plen, 60);
-        PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
-                      PL_colors[4],PL_colors[5],s);
-    });
 
-redo_first_pass:
+    /* Longjmp back to here if have to switch in midstream to utf8 */
+    if (! RExC_orig_utf8) {
+       JMPENV_PUSH(jump_ret);
+    }
+
+    if (jump_ret == 0) {    /* First time through */
+        exp = SvPV(pattern, plen);
+        xend = exp + plen;
+
+        DEBUG_COMPILE_r({
+            SV *dsv= sv_newmortal();
+            RE_PV_QUOTED_DECL(s, RExC_utf8,
+                dsv, exp, plen, 60);
+            PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
+                           PL_colors[4],PL_colors[5],s);
+        });
+    }
+    else {  /* longjumped back */
+        STRLEN len = plen;
+
+        /* If the cause for the longjmp was other than changing to utf8, pop
+         * our own setjmp, and longjmp to the correct handler */
+       if (jump_ret != UTF8_LONGJMP) {
+           JMPENV_POP;
+           JMPENV_JUMP(jump_ret);
+       }
+
+       GET_RE_DEBUG_FLAGS;
+
+        /* It's possible to write a regexp in ascii that represents Unicode
+        codepoints outside of the byte range, such as via \x{100}. If we
+        detect such a sequence we have to convert the entire pattern to utf8
+        and then recompile, as our sizing calculation will have been based
+        on 1 byte == 1 character, but we will need to use utf8 to encode
+        at least some part of the pattern, and therefore must convert the whole
+        thing.
+        -- dmq */
+        DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+           "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
+        exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
+        xend = exp + len;
+        RExC_orig_utf8 = RExC_utf8 = 1;
+        SAVEFREEPV(exp);
+    }
+
+#ifdef TRIE_STUDY_OPT
+    restudied = 0;
+#endif
+
     RExC_precomp = exp;
     RExC_flags = pm_flags;
     RExC_sawback = 0;
@@ -4285,24 +4448,14 @@ redo_first_pass:
        RExC_precomp = NULL;
        return(NULL);
     }
-    if (RExC_utf8 && !RExC_orig_utf8) {
-        /* It's possible to write a regexp in ascii that represents Unicode
-        codepoints outside of the byte range, such as via \x{100}. If we
-        detect such a sequence we have to convert the entire pattern to utf8
-        and then recompile, as our sizing calculation will have been based
-        on 1 byte == 1 character, but we will need to use utf8 to encode
-        at least some part of the pattern, and therefore must convert the whole
-        thing.
-        XXX: somehow figure out how to make this less expensive...
-        -- dmq */
-        STRLEN len = plen;
-        DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
-           "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
-        exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
-        xend = exp + len;
-        RExC_orig_utf8 = RExC_utf8;
-        SAVEFREEPV(exp);
-        goto redo_first_pass;
+
+    /* Here, finished first pass.  Get rid of our setjmp, which we added for
+     * efficiency only if the passed-in string wasn't in utf8, as shown by
+     * RExC_orig_utf8.  But if the first pass was redone, that variable will be
+     * 1 here even though the original string wasn't utf8, but in this case
+     * there will have been a long jump */
+    if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
+       JMPENV_POP;
     }
     DEBUG_PARSE_r({
         PerlIO_printf(Perl_debug_log, 
@@ -4344,39 +4497,56 @@ redo_first_pass:
     r->extflags = pm_flags;
     {
         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
-       bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
+        bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
+
+        /* The caret is output if there are any defaults: if not all the STD
+         * flags are set, or if no character set specifier is needed */
+        bool has_default =
+                    (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
+                    || ! has_charset);
        bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
        U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
                            >> RXf_PMf_STD_PMMOD_SHIFT);
        const char *fptr = STD_PAT_MODS;        /*"msix"*/
        char *p;
-       const STRLEN wraplen = plen + has_minus + has_p + has_runon
+        /* Allocate for the worst case, which is all the std flags are turned
+         * on.  If more precision is desired, we could do a population count of
+         * the flags set.  This could be done with a small lookup table, or by
+         * shifting, masking and adding, or even, when available, assembly
+         * language for a machine-language population count.
+         * We never output a minus, as all those are defaults, so are
+         * covered by the caret */
+       const STRLEN wraplen = plen + has_p + has_runon
+            + has_default       /* If needs a caret */
+            + has_charset       /* If needs a character set specifier */
             + (sizeof(STD_PAT_MODS) - 1)
             + (sizeof("(?:)") - 1);
 
-       p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
-       SvCUR_set(rx, wraplen);
+        p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
        SvPOK_on(rx);
        SvFLAGS(rx) |= SvUTF8(pattern);
         *p++='('; *p++='?';
+
+        /* If a default, cover it using the caret */
+        if (has_default) {
+            *p++= DEFAULT_PAT_MOD;
+        }
+        if (has_charset) {
+            if (r->extflags & RXf_PMf_LOCALE) {
+                *p++ = LOCALE_PAT_MOD;
+            } else {
+                *p++ = UNICODE_PAT_MOD;
+            }
+        }
         if (has_p)
             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
         {
-            char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
-            char *colon = r + 1;
             char ch;
-
             while((ch = *fptr++)) {
                 if(reganch & 1)
                     *p++ = ch;
-                else
-                    *r-- = ch;
                 reganch >>= 1;
             }
-            if(has_minus) {
-                *r = '-';
-                p = colon;
-            }
         }
 
         *p++ = ':';
@@ -4388,6 +4558,7 @@ redo_first_pass:
             *p++ = '\n';
         *p++ = ')';
         *p = 0;
+       SvCUR_set(rx, p - SvPVX_const(rx));
     }
 
     r->intflags = 0;
@@ -4556,7 +4727,7 @@ reStudy:
            ri->regstclass = trie_op;
        }
 #endif 
-       else if (strchr((const char*)PL_simple,OP(first)))
+       else if (REGNODE_SIMPLE(OP(first)))
            ri->regstclass = first;
        else if (PL_regkind[OP(first)] == BOUND ||
                 PL_regkind[OP(first)] == NBOUND)
@@ -4897,7 +5068,7 @@ reStudy:
 #endif
 #ifdef DEBUGGING
     if (RExC_paren_names) {
-        ri->name_list_idx = add_data( pRExC_state, 1, "p" );
+        ri->name_list_idx = add_data( pRExC_state, 1, "a" );
         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
     } else
 #endif
@@ -4947,7 +5118,7 @@ Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
     if (flags & RXapif_FETCH) {
         return reg_named_buff_fetch(rx, key, flags);
     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
-        Perl_croak(aTHX_ "%s", PL_no_modify);
+        Perl_croak_no_modify(aTHX);
         return NULL;
     } else if (flags & RXapif_EXISTS) {
         return reg_named_buff_exists(rx, key, flags)
@@ -5247,7 +5418,7 @@ Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
     PERL_UNUSED_ARG(value);
 
     if (!PL_localizing)
-        Perl_croak(aTHX_ "%s", PL_no_modify);
+        Perl_croak_no_modify(aTHX);
 }
 
 I32
@@ -5588,6 +5759,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
        if (*RExC_parse == '?') { /* (?...) */
            bool is_logical = 0;
            const char * const seqstart = RExC_parse;
+            bool has_use_defaults = FALSE;
 
            RExC_parse++;
            paren = *RExC_parse++;
@@ -5717,7 +5889,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                RExC_parse++;
            case '=':           /* (?=...) */
                RExC_seen_zerolen++;
-                       break;
+                break;
            case '!':           /* (?!...) */
                RExC_seen_zerolen++;
                if (*RExC_parse == ')') {
@@ -6042,12 +6214,19 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                RExC_parse--; /* for vFAIL to print correctly */
                 vFAIL("Sequence (? incomplete");
                 break;
+            case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
+                                      that follow */
+                has_use_defaults = TRUE;
+                STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
+                RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+                goto parse_flags;
            default:
                --RExC_parse;
                parse_flags:      /* (?i) */  
            {
                 U32 posflags = 0, negflags = 0;
                U32 *flagsp = &posflags;
+                bool has_charset_modifier = 0;
 
                while (*RExC_parse) {
                    /* && strchr("iogcmsx", *RExC_parse) */
@@ -6055,6 +6234,32 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                       and must be globally applied -- japhy */
                     switch (*RExC_parse) {
                    CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
+                    case LOCALE_PAT_MOD:
+                        if (has_charset_modifier || flagsp == &negflags) {
+                            goto fail_modifiers;
+                        }
+                        *flagsp &= ~RXf_PMf_UNICODE;
+                        *flagsp |= RXf_PMf_LOCALE;
+                        has_charset_modifier = 1;
+                        break;
+                    case UNICODE_PAT_MOD:
+                        if (has_charset_modifier || flagsp == &negflags) {
+                            goto fail_modifiers;
+                        }
+                        *flagsp &= ~RXf_PMf_LOCALE;
+                        *flagsp |= RXf_PMf_UNICODE;
+                        has_charset_modifier = 1;
+                        break;
+                    case DUAL_PAT_MOD:
+                        if (has_use_defaults
+                            || has_charset_modifier
+                            || flagsp == &negflags)
+                        {
+                            goto fail_modifiers;
+                        }
+                        *flagsp &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+                        has_charset_modifier = 1;
+                        break;
                     case ONCE_PAT_MOD: /* 'o' */
                     case GLOBAL_PAT_MOD: /* 'g' */
                        if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
@@ -6095,7 +6300,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         }
                        break;
                     case '-':
-                        if (flagsp == &negflags) {
+                        /* A flag is a default iff it is following a minus,  so
+                         * if there is a minus, it means will be trying to
+                         * re-specify a default which is an error */
+                        if (has_use_defaults || flagsp == &negflags) {
+            fail_modifiers:
                             RExC_parse++;
                            vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
                            /*NOTREACHED*/
@@ -6698,7 +6907,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
        return (regnode *) &RExC_parse; /* Invalid regnode pointer */
     }
 
-    RExC_utf8 = 1;     /* named sequences imply Unicode semantics */
+    REQUIRE_UTF8;      /* named sequences imply Unicode semantics */
     RExC_parse += 2;   /* Skip past the 'U+' */
 
     if (valuep) {   /* In a bracketed char class */
@@ -6714,11 +6923,10 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
            | PERL_SCAN_DISALLOW_PREFIX
            | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
     
-       char * endchar = strchr(RExC_parse, '.');
-       if (endchar) {
+       char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
+       if (endchar < endbrace) {
            ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
        }
-       else endchar = endbrace;
 
        length_of_hex = (STRLEN)(endchar - RExC_parse);
        *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
@@ -6744,7 +6952,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
     }
     else {     /* Not a char class */
        char *s;            /* String to put in generated EXACT node */
-       STRLEN len = 0;     /* Its current length */
+       STRLEN len = 0;     /* Its current byte length */
        char *endchar;      /* Points to '.' or '}' ending cur char in the input
                               stream */
 
@@ -6754,7 +6962,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
 
        /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
         * the input which is of the form now 'c1.c2.c3...}' until find the
-        * ending brace or exeed length 255.  The characters that exceed this
+        * ending brace or exceed length 255.  The characters that exceed this
         * limit are dropped.  The limit could be relaxed should it become
         * desirable by reparsing this as (?:\N{NAME}), so could generate
         * multiple EXACT nodes, as is done for just regular input.  But this
@@ -6769,8 +6977,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
 
            /* Code points are separated by dots.  If none, there is only one
             * code point, and is terminated by the brace */
-           endchar = strchr(RExC_parse, '.');
-           if (! endchar) endchar = endbrace;
+           endchar = RExC_parse + strcspn(RExC_parse, ".}");
 
            /* The values are Unicode even on EBCDIC machines */
            length_of_hex = (STRLEN)(endchar - RExC_parse);
@@ -7088,31 +7295,61 @@ tryagain:
            *flagp |= HASWIDTH;
            goto finish_meta_pat;
        case 'w':
-           ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
+           if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(ALNUML));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(ALNUM));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'W':
-           ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NALNUML));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NALNUM));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'b':
            RExC_seen_zerolen++;
            RExC_seen |= REG_SEEN_LOOKBEHIND;
-           ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(BOUNDL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(BOUND));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= SIMPLE;
            goto finish_meta_pat;
        case 'B':
            RExC_seen_zerolen++;
            RExC_seen |= REG_SEEN_LOOKBEHIND;
-           ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NBOUNDL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NBOUND));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= SIMPLE;
            goto finish_meta_pat;
        case 's':
-           ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(SPACEL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(SPACE));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'S':
-           ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NSPACEL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NSPACE));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'd':
@@ -7416,6 +7653,35 @@ tryagain:
                          ender = ASCII_TO_NATIVE('\007');
                        p++;
                        break;
+                   case 'o':
+                       {
+                           STRLEN brace_len = len;
+                           UV result;
+                           const char* error_msg;
+
+                           bool valid = grok_bslash_o(p,
+                                                      &result,
+                                                      &brace_len,
+                                                      &error_msg,
+                                                      1);
+                           p += brace_len;
+                           if (! valid) {
+                               RExC_parse = p; /* going to die anyway; point
+                                                  to exact spot of failure */
+                               vFAIL(error_msg);
+                           }
+                           else
+                           {
+                               ender = result;
+                           }
+                           if (PL_encoding && ender < 0x100) {
+                               goto recode_encoding;
+                           }
+                           if (ender > 0xff) {
+                               REQUIRE_UTF8;
+                           }
+                           break;
+                       }
                    case 'x':
                        if (*++p == '{') {
                            char* const e = strchr(p, '}');
@@ -7430,7 +7696,7 @@ tryagain:
                                 STRLEN numlen = e - p - 1;
                                ender = grok_hex(p + 1, &numlen, &flags, NULL);
                                if (ender > 0xff)
-                                   RExC_utf8 = 1;
+                                   REQUIRE_UTF8;
                                p = e + 1;
                            }
                        }
@@ -7450,21 +7716,13 @@ tryagain:
                    case '0': case '1': case '2': case '3':case '4':
                    case '5': case '6': case '7': case '8':case '9':
                        if (*p == '0' ||
-                         (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
-                            I32 flags = 0;
+                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
+                       {
+                           I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
                            STRLEN numlen = 3;
                            ender = grok_oct(p, &numlen, &flags, NULL);
-
-                           /* An octal above 0xff is interpreted differently
-                            * depending on if the re is in utf8 or not.  If it
-                            * is in utf8, the value will be itself, otherwise
-                            * it is interpreted as modulo 0x100.  It has been
-                            * decided to discourage the use of octal above the
-                            * single-byte range.  For now, warn only when
-                            * it ends up modulo */
-                           if (SIZE_ONLY && ender >= 0x100
-                                   && ! UTF && ! PL_encoding) {
-                               ckWARNregdep(p, "Use of octal value above 377 is deprecated");
+                           if (ender > 0xff) {
+                               REQUIRE_UTF8;
                            }
                            p += numlen;
                        }
@@ -7481,7 +7739,7 @@ tryagain:
                            ender = reg_recode((const char)(U8)ender, &enc);
                            if (!enc && SIZE_ONLY)
                                ckWARNreg(p, "Invalid escape in the specified encoding");
-                           RExC_utf8 = 1;
+                           REQUIRE_UTF8;
                        }
                        break;
                    case '\0':
@@ -7823,6 +8081,7 @@ case ANYOF_N##NAME:                                     \
     what = WORD;                                        \
     break
 
+/* Like above, but no locale test */
 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
 ANYOF_##NAME:                                           \
        for (value = 0; value < 256; value++)           \
@@ -7839,6 +8098,42 @@ case ANYOF_N##NAME:                                     \
     what = WORD;                                        \
     break
 
+/* Like the above, but there are differences if we are in uni-8-bit or not, so
+ * there are two tests passed in, to use depending on that. There aren't any
+ * cases where the label is different from the name, so no need for that
+ * parameter */
+#define _C_C_T_UNI_8_BIT(NAME,TEST_8,TEST_7,WORD)       \
+ANYOF_##NAME:                                           \
+    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);        \
+    else if (UNI_SEMANTICS) {                           \
+        for (value = 0; value < 256; value++) {         \
+            if (TEST_8) ANYOF_BITMAP_SET(ret, value);   \
+        }                                               \
+    }                                                   \
+    else {                                              \
+        for (value = 0; value < 256; value++) {         \
+            if (TEST_7) ANYOF_BITMAP_SET(ret, value);   \
+        }                                               \
+    }                                                   \
+    yesno = '+';                                        \
+    what = WORD;                                        \
+    break;                                              \
+case ANYOF_N##NAME:                                     \
+    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);       \
+    else if (UNI_SEMANTICS) {                           \
+        for (value = 0; value < 256; value++) {         \
+            if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \
+        }                                               \
+    }                                                   \
+    else {                                              \
+        for (value = 0; value < 256; value++) {         \
+            if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \
+        }                                               \
+    }                                                   \
+    yesno = '!';                                        \
+    what = WORD;                                        \
+    break
+
 /* 
    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
    so that it is possible to override the option here without having to 
@@ -8040,6 +8335,24 @@ parseit:
            case 'b':   value = '\b';                   break;
            case 'e':   value = ASCII_TO_NATIVE('\033');break;
            case 'a':   value = ASCII_TO_NATIVE('\007');break;
+           case 'o':
+               RExC_parse--;   /* function expects to be pointed at the 'o' */
+               {
+                   const char* error_msg;
+                   bool valid = grok_bslash_o(RExC_parse,
+                                              &value,
+                                              &numlen,
+                                              &error_msg,
+                                              SIZE_ONLY);
+                   RExC_parse += numlen;
+                   if (! valid) {
+                       vFAIL(error_msg);
+                   }
+               }
+               if (PL_encoding && value < 0x100) {
+                   goto recode_encoding;
+               }
+               break;
            case 'x':
                if (*RExC_parse == '{') {
                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
@@ -8065,9 +8378,10 @@ parseit:
                value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
                break;
            case '0': case '1': case '2': case '3': case '4':
-           case '5': case '6': case '7': case '8': case '9':
+           case '5': case '6': case '7':
                {
-                   I32 flags = 0;
+                   /* Take 1-3 octal digits */
+                   I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
                    numlen = 3;
                    value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
                    RExC_parse += numlen;
@@ -8085,10 +8399,12 @@ parseit:
                    break;
                }
            default:
-               if (!SIZE_ONLY && isALPHA(value))
+               /* Allow \_ to not give an error */
+               if (!SIZE_ONLY && isALNUM(value) && value != '_') {
                    ckWARN2reg(RExC_parse,
                               "Unrecognized escape \\%c in character class passed through",
                               (int)value);
+               }
                break;
            }
        } /* end of \blah */
@@ -8153,11 +8469,13 @@ parseit:
                case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
                case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
-               case _C_C_T_(ALNUM, isALNUM(value), "Word");
-               case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
+                /* \s, \w match all unicode if utf8. */
+                case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
+                case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
 #else
-               case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
-               case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
+                /* \s, \w match ascii and locale only */
+                case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
+                case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
 #endif         
                case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
                case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
@@ -8883,31 +9201,6 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
 #endif
 
 /*
- - regcurly - a little FSA that accepts {\d+,?\d*}
- */
-#ifndef PERL_IN_XSUB_RE
-I32
-Perl_regcurly(register const char *s)
-{
-    PERL_ARGS_ASSERT_REGCURLY;
-
-    if (*s++ != '{')
-       return FALSE;
-    if (!isDIGIT(*s))
-       return FALSE;
-    while (isDIGIT(*s))
-       s++;
-    if (*s == ',')
-       s++;
-    while (isDIGIT(*s))
-       s++;
-    if (*s != '}')
-       return FALSE;
-    return TRUE;
-}
-#endif
-
-/*
  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
  */
 #ifdef DEBUGGING
@@ -9510,6 +9803,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
        while (--n >= 0) {
           /* If you add a ->what type here, update the comment in regcomp.h */
            switch (ri->data->what[n]) {
+           case 'a':
            case 's':
            case 'S':
            case 'u':
@@ -9571,12 +9865,9 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
                         PerlMemShared_free(trie->trans);
                         if (trie->bitmap)
                             PerlMemShared_free(trie->bitmap);
-                        if (trie->wordlen)
-                            PerlMemShared_free(trie->wordlen);
                         if (trie->jump)
                             PerlMemShared_free(trie->jump);
-                        if (trie->nextword)
-                            PerlMemShared_free(trie->nextword);
+                       PerlMemShared_free(trie->wordinfo);
                         /* do this last!!!! */
                         PerlMemShared_free(ri->data->data[n]);
                    }
@@ -9593,9 +9884,8 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
     Safefree(ri);
 }
 
-#define sv_dup_inc(s,t)        SvREFCNT_inc(sv_dup(s,t))
-#define av_dup_inc(s,t)        MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
-#define hv_dup_inc(s,t)        MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
+#define av_dup_inc(s,t)        MUTABLE_AV(sv_dup_inc((const SV *)s,t))
+#define hv_dup_inc(s,t)        MUTABLE_HV(sv_dup_inc((const SV *)s,t))
 #define SAVEPVN(p,n)   ((p) ? savepvn(p,n) : NULL)
 
 /* 
@@ -9747,8 +10037,9 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
        for (i = 0; i < count; i++) {
            d->what[i] = ri->data->what[i];
            switch (d->what[i]) {
-               /* legal options are one of: sSfpontTu
+               /* legal options are one of: sSfpontTua
                   see also regcomp.h and pregfree() */
+           case 'a': /* actually an AV, but the dup function is identical.  */
            case 's':
            case 'S':
            case 'p': /* actually an AV, but the dup function is identical.  */
@@ -9824,6 +10115,10 @@ Perl_regnext(pTHX_ register regnode *p)
     if (!p)
        return(NULL);
 
+    if (OP(p) > REGNODE_MAX) {         /* regnode.type is unsigned */
+       Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
+    }
+
     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
     if (offset == 0)
        return(NULL);