This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: No longer need _C_C_T_ and variant macro
[perl5.git] / regcomp.c
index be5acdb..cbba23d 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -85,6 +85,8 @@
 #  include "regcomp.h"
 #endif
 
 #  include "regcomp.h"
 #endif
 
+#include "dquote_static.c"
+
 #ifdef op
 #undef op
 #endif /* op */
 #ifdef op
 #undef op
 #endif /* op */
@@ -196,7 +198,10 @@ typedef struct RExC_state_t {
  */
 #define        WORST           0       /* Worst case. */
 #define        HASWIDTH        0x01    /* Known to match non-null strings. */
  */
 #define        WORST           0       /* Worst case. */
 #define        HASWIDTH        0x01    /* Known to match non-null strings. */
-#define        SIMPLE          0x02    /* Simple enough to be STAR/PLUS operand. */
+
+/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
+ * character, and if utf8, must be invariant. */
+#define        SIMPLE          0x02
 #define        SPSTART         0x04    /* Starts with * or +. */
 #define TRYAGAIN       0x08    /* Weeded out a declaration. */
 #define POSTPONED      0x10    /* (?1),(?&name), (??{...}) or similar */
 #define        SPSTART         0x04    /* Starts with * or +. */
 #define TRYAGAIN       0x08    /* Weeded out a declaration. */
 #define POSTPONED      0x10    /* (?1),(?&name), (??{...}) or similar */
@@ -218,6 +223,11 @@ typedef struct RExC_state_t {
 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 
 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 
+/* If not already in utf8, do a longjmp back to the beginning */
+#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
+#define REQUIRE_UTF8   STMT_START {                                       \
+                                     if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
+                        } STMT_END
 
 /* About scan_data_t.
 
 
 /* About scan_data_t.
 
@@ -358,9 +368,10 @@ static const scan_data_t zero_scan_data =
 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 #define SCF_SEEN_ACCEPT         0x8000 
 
 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 #define SCF_SEEN_ACCEPT         0x8000 
 
-#define UTF (RExC_utf8 != 0)
-#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
-#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
+#define UTF cBOOL(RExC_utf8)
+#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
+#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
+#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 
 #define OOB_UNICODE            12345678
 #define OOB_NAMEDCLASS         -1
 
 #define OOB_UNICODE            12345678
 #define OOB_NAMEDCLASS         -1
@@ -870,7 +881,7 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
   Dumps the final compressed table form of the trie to Perl_debug_log.
   Used for debugging make_trie().
 */
   Dumps the final compressed table form of the trie to Perl_debug_log.
   Used for debugging make_trie().
 */
+
 STATIC void
 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
            AV *revcharmap, U32 depth)
 STATIC void
 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
            AV *revcharmap, U32 depth)
@@ -3116,7 +3127,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            }
            flags &= ~SCF_DO_STCLASS;
        }
            }
            flags &= ~SCF_DO_STCLASS;
        }
-       else if (strchr((const char*)PL_varies,OP(scan))) {
+       else if (REGNODE_VARIES(OP(scan))) {
            I32 mincount, maxcount, minnext, deltanext, fl = 0;
            I32 f = flags, pos_before = 0;
            regnode * const oscan = scan;
            I32 mincount, maxcount, minnext, deltanext, fl = 0;
            I32 f = flags, pos_before = 0;
            regnode * const oscan = scan;
@@ -3196,7 +3207,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                /* These are the cases when once a subexpression
                   fails at a particular position, it cannot succeed
                   even after backtracking at the enclosing scope.
                /* These are the cases when once a subexpression
                   fails at a particular position, it cannot succeed
                   even after backtracking at the enclosing scope.
-               
+
                   XXXX what if minimal match and we are at the
                        initial run of {n,m}? */
                if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
                   XXXX what if minimal match and we are at the
                        initial run of {n,m}? */
                if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
@@ -3268,7 +3279,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 
                    /* Skip open. */
                    nxt = regnext(nxt);
 
                    /* Skip open. */
                    nxt = regnext(nxt);
-                   if (!strchr((const char*)PL_simple,OP(nxt))
+                   if (!REGNODE_SIMPLE(OP(nxt))
                        && !(PL_regkind[OP(nxt)] == EXACT
                             && STR_LEN(nxt) == 1))
                        goto nogo;
                        && !(PL_regkind[OP(nxt)] == EXACT
                             && STR_LEN(nxt) == 1))
                        goto nogo;
@@ -3289,11 +3300,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 
 #ifdef DEBUGGING
                    OP(nxt1 + 1) = OPTIMIZED; /* was count. */
 
 #ifdef DEBUGGING
                    OP(nxt1 + 1) = OPTIMIZED; /* was count. */
-                   NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
-                   NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
+                   NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
+                   NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
                    OP(nxt) = OPTIMIZED;        /* was CLOSE. */
                    OP(nxt + 1) = OPTIMIZED; /* was count. */
                    OP(nxt) = OPTIMIZED;        /* was CLOSE. */
                    OP(nxt + 1) = OPTIMIZED; /* was count. */
-                   NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
+                   NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
 #endif
                }
              nogo:
 #endif
                }
              nogo:
@@ -3316,12 +3327,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        nxt = nxt2;
                    OP(nxt2)  = SUCCEED; /* Whas WHILEM */
                    /* Need to optimize away parenths. */
                        nxt = nxt2;
                    OP(nxt2)  = SUCCEED; /* Whas WHILEM */
                    /* Need to optimize away parenths. */
-                   if (data->flags & SF_IN_PAR) {
+                   if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
                        /* Set the parenth number.  */
                        regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
 
                        /* Set the parenth number.  */
                        regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
 
-                       if (OP(nxt) != CLOSE)
-                           FAIL("Panic opt close");
                        oscan->flags = (U8)ARG(nxt);
                        if (RExC_open_parens) {
                            RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
                        oscan->flags = (U8)ARG(nxt);
                        if (RExC_open_parens) {
                            RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
@@ -3339,7 +3348,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 #if 0
                        while ( nxt1 && (OP(nxt1) != WHILEM)) {
                            regnode *nnxt = regnext(nxt1);
 #if 0
                        while ( nxt1 && (OP(nxt1) != WHILEM)) {
                            regnode *nnxt = regnext(nxt1);
-                       
                            if (nnxt == nxt) {
                                if (reg_off_by_arg[OP(nxt1)])
                                    ARG_SET(nxt1, nxt2 - nxt1);
                            if (nnxt == nxt) {
                                if (reg_off_by_arg[OP(nxt1)])
                                    ARG_SET(nxt1, nxt2 - nxt1);
@@ -3406,7 +3414,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 
                        if (UTF)
                            old = utf8_hop((U8*)s, old) - (U8*)s;
 
                        if (UTF)
                            old = utf8_hop((U8*)s, old) - (U8*)s;
-                       
                        l -= old;
                        /* Get the added string: */
                        last_str = newSVpvn_utf8(s  + old, l, UTF);
                        l -= old;
                        /* Get the added string: */
                        last_str = newSVpvn_utf8(s  + old, l, UTF);
@@ -3494,13 +3501,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                if (flags & SCF_DO_STCLASS_AND) {
                     for (value = 0; value < 256; value++)
                         if (!is_VERTWS_cp(value))
                if (flags & SCF_DO_STCLASS_AND) {
                     for (value = 0; value < 256; value++)
                         if (!is_VERTWS_cp(value))
-                            ANYOF_BITMAP_CLEAR(data->start_class, value);  
-                }                                                              
-                else {                                                         
+                            ANYOF_BITMAP_CLEAR(data->start_class, value);
+                }
+                else {
                     for (value = 0; value < 256; value++)
                         if (is_VERTWS_cp(value))
                     for (value = 0; value < 256; value++)
                         if (is_VERTWS_cp(value))
-                            ANYOF_BITMAP_SET(data->start_class, value);           
-                }                                                              
+                            ANYOF_BITMAP_SET(data->start_class, value);
+                }
                 if (flags & SCF_DO_STCLASS_OR)
                    cl_and(data->start_class, and_withp);
                flags &= ~SCF_DO_STCLASS;
                 if (flags & SCF_DO_STCLASS_OR)
                    cl_and(data->start_class, and_withp);
                flags &= ~SCF_DO_STCLASS;
@@ -3513,7 +3520,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                data->pos_delta += 1;
                data->longest = &(data->longest_float);
            }
                data->pos_delta += 1;
                data->longest = &(data->longest_float);
            }
-           
        }
        else if (OP(scan) == FOLDCHAR) {
            int d = ARG(scan)==0xDF ? 1 : 2;
        }
        else if (OP(scan) == FOLDCHAR) {
            int d = ARG(scan)==0xDF ? 1 : 2;
@@ -3527,7 +3533,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                data->longest = &(data->longest_float);
            }
        }
                data->longest = &(data->longest_float);
            }
        }
-       else if (strchr((const char*)PL_simple,OP(scan))) {
+       else if (REGNODE_SIMPLE(OP(scan))) {
            int value = 0;
 
            if (flags & SCF_DO_SUBSTR) {
            int value = 0;
 
            if (flags & SCF_DO_SUBSTR) {
@@ -3571,19 +3577,37 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
-                           for (value = 0; value < 256; value++)
-                               if (!isALNUM(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                            if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isWORDCHAR_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isALNUM(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            }
                        }
                    }
                    else {
                        if (data->start_class->flags & ANYOF_LOCALE)
                            ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
                        }
                    }
                    else {
                        if (data->start_class->flags & ANYOF_LOCALE)
                            ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (isALNUM(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
-                       }
+                        else if (FLAGS(scan) & USE_UNI) {
+                            for (value = 0; value < 256; value++) {
+                                if (isWORDCHAR_L1(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        } else {
+                            for (value = 0; value < 256; value++) {
+                                if (isALNUM(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        }
                    }
                    break;
                case ALNUML:
                    }
                    break;
                case ALNUML:
@@ -3600,9 +3624,19 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
-                           for (value = 0; value < 256; value++)
-                               if (isALNUM(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                            if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (isWORDCHAR_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (isALNUM(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                           }
                        }
                    }
                    else {
                        }
                    }
                    else {
@@ -3611,7 +3645,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        else {
                            for (value = 0; value < 256; value++)
                                if (!isALNUM(value))
                        else {
                            for (value = 0; value < 256; value++)
                                if (!isALNUM(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
+                                   ANYOF_BITMAP_SET(data->start_class, value);
                        }
                    }
                    break;
                        }
                    }
                    break;
@@ -3629,18 +3663,37 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
-                           for (value = 0; value < 256; value++)
-                               if (!isSPACE(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                           if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isSPACE_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isSPACE(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            }
                        }
                    }
                    else {
                        }
                    }
                    else {
-                       if (data->start_class->flags & ANYOF_LOCALE)
+                        if (data->start_class->flags & ANYOF_LOCALE) {
                            ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
                            ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (isSPACE(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
+                        }
+                        else if (FLAGS(scan) & USE_UNI) {
+                            for (value = 0; value < 256; value++) {
+                                if (isSPACE_L1(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        } else {
+                            for (value = 0; value < 256; value++) {
+                                if (isSPACE(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
                        }
                    }
                    break;
                        }
                    }
                    break;
@@ -3658,19 +3711,38 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
-                           for (value = 0; value < 256; value++)
-                               if (isSPACE(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                            if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (isSPACE_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (isSPACE(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            }
                        }
                    }
                    else {
                        if (data->start_class->flags & ANYOF_LOCALE)
                            ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
                        }
                    }
                    else {
                        if (data->start_class->flags & ANYOF_LOCALE)
                            ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (!isSPACE(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
-                       }
+                        else if (FLAGS(scan) & USE_UNI) {
+                            for (value = 0; value < 256; value++) {
+                                if (!isSPACE_L1(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        }
+                        else {
+                            for (value = 0; value < 256; value++) {
+                                if (!isSPACE(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        }
                    }
                    break;
                case NSPACEL:
                    }
                    break;
                case NSPACEL:
@@ -3700,7 +3772,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        else {
                            for (value = 0; value < 256; value++)
                                if (isDIGIT(value))
                        else {
                            for (value = 0; value < 256; value++)
                                if (isDIGIT(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
+                                   ANYOF_BITMAP_SET(data->start_class, value);
                        }
                    }
                    break;
                        }
                    }
                    break;
@@ -3717,7 +3789,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        else {
                            for (value = 0; value < 256; value++)
                                if (!isDIGIT(value))
                        else {
                            for (value = 0; value < 256; value++)
                                if (!isDIGIT(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
+                                   ANYOF_BITMAP_SET(data->start_class, value);
                        }
                    }
                    break;
                        }
                    }
                    break;
@@ -4264,18 +4336,20 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
     struct regexp *r;
     register regexp_internal *ri;
     STRLEN plen;
     struct regexp *r;
     register regexp_internal *ri;
     STRLEN plen;
-    char  *exp = SvPV(pattern, plen);
-    char* xend = exp + plen;
+    char  *exp;
+    char* xend;
     regnode *scan;
     I32 flags;
     I32 minlen = 0;
     I32 sawplus = 0;
     I32 sawopen = 0;
     regnode *scan;
     I32 flags;
     I32 minlen = 0;
     I32 sawplus = 0;
     I32 sawopen = 0;
+    U8 jump_ret = 0;
+    dJMPENV;
     scan_data_t data;
     RExC_state_t RExC_state;
     RExC_state_t * const pRExC_state = &RExC_state;
 #ifdef TRIE_STUDY_OPT    
     scan_data_t data;
     RExC_state_t RExC_state;
     RExC_state_t * const pRExC_state = &RExC_state;
 #ifdef TRIE_STUDY_OPT    
-    int restudied= 0;
+    int restudied;
     RExC_state_t copyRExC_state;
 #endif    
     GET_RE_DEBUG_FLAGS_DECL;
     RExC_state_t copyRExC_state;
 #endif    
     GET_RE_DEBUG_FLAGS_DECL;
@@ -4286,15 +4360,56 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
 
     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
 
 
     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
 
-    DEBUG_COMPILE_r({
-        SV *dsv= sv_newmortal();
-        RE_PV_QUOTED_DECL(s, RExC_utf8,
-            dsv, exp, plen, 60);
-        PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
-                      PL_colors[4],PL_colors[5],s);
-    });
 
 
-redo_first_pass:
+    /* Longjmp back to here if have to switch in midstream to utf8 */
+    if (! RExC_orig_utf8) {
+       JMPENV_PUSH(jump_ret);
+    }
+
+    if (jump_ret == 0) {    /* First time through */
+        exp = SvPV(pattern, plen);
+        xend = exp + plen;
+
+        DEBUG_COMPILE_r({
+            SV *dsv= sv_newmortal();
+            RE_PV_QUOTED_DECL(s, RExC_utf8,
+                dsv, exp, plen, 60);
+            PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
+                           PL_colors[4],PL_colors[5],s);
+        });
+    }
+    else {  /* longjumped back */
+        STRLEN len = plen;
+
+        /* If the cause for the longjmp was other than changing to utf8, pop
+         * our own setjmp, and longjmp to the correct handler */
+       if (jump_ret != UTF8_LONGJMP) {
+           JMPENV_POP;
+           JMPENV_JUMP(jump_ret);
+       }
+
+       GET_RE_DEBUG_FLAGS;
+
+        /* It's possible to write a regexp in ascii that represents Unicode
+        codepoints outside of the byte range, such as via \x{100}. If we
+        detect such a sequence we have to convert the entire pattern to utf8
+        and then recompile, as our sizing calculation will have been based
+        on 1 byte == 1 character, but we will need to use utf8 to encode
+        at least some part of the pattern, and therefore must convert the whole
+        thing.
+        -- dmq */
+        DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+           "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
+        exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
+        xend = exp + len;
+        RExC_orig_utf8 = RExC_utf8 = 1;
+        SAVEFREEPV(exp);
+    }
+
+#ifdef TRIE_STUDY_OPT
+    restudied = 0;
+#endif
+
     RExC_precomp = exp;
     RExC_flags = pm_flags;
     RExC_sawback = 0;
     RExC_precomp = exp;
     RExC_flags = pm_flags;
     RExC_sawback = 0;
@@ -4333,24 +4448,14 @@ redo_first_pass:
        RExC_precomp = NULL;
        return(NULL);
     }
        RExC_precomp = NULL;
        return(NULL);
     }
-    if (RExC_utf8 && !RExC_orig_utf8) {
-        /* It's possible to write a regexp in ascii that represents Unicode
-        codepoints outside of the byte range, such as via \x{100}. If we
-        detect such a sequence we have to convert the entire pattern to utf8
-        and then recompile, as our sizing calculation will have been based
-        on 1 byte == 1 character, but we will need to use utf8 to encode
-        at least some part of the pattern, and therefore must convert the whole
-        thing.
-        XXX: somehow figure out how to make this less expensive...
-        -- dmq */
-        STRLEN len = plen;
-        DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
-           "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
-        exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
-        xend = exp + len;
-        RExC_orig_utf8 = RExC_utf8;
-        SAVEFREEPV(exp);
-        goto redo_first_pass;
+
+    /* Here, finished first pass.  Get rid of our setjmp, which we added for
+     * efficiency only if the passed-in string wasn't in utf8, as shown by
+     * RExC_orig_utf8.  But if the first pass was redone, that variable will be
+     * 1 here even though the original string wasn't utf8, but in this case
+     * there will have been a long jump */
+    if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
+       JMPENV_POP;
     }
     DEBUG_PARSE_r({
         PerlIO_printf(Perl_debug_log, 
     }
     DEBUG_PARSE_r({
         PerlIO_printf(Perl_debug_log, 
@@ -4392,39 +4497,56 @@ redo_first_pass:
     r->extflags = pm_flags;
     {
         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
     r->extflags = pm_flags;
     {
         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
-       bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
+        bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
+
+        /* The caret is output if there are any defaults: if not all the STD
+         * flags are set, or if no character set specifier is needed */
+        bool has_default =
+                    (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
+                    || ! has_charset);
        bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
        U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
                            >> RXf_PMf_STD_PMMOD_SHIFT);
        const char *fptr = STD_PAT_MODS;        /*"msix"*/
        char *p;
        bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
        U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
                            >> RXf_PMf_STD_PMMOD_SHIFT);
        const char *fptr = STD_PAT_MODS;        /*"msix"*/
        char *p;
-       const STRLEN wraplen = plen + has_minus + has_p + has_runon
+        /* Allocate for the worst case, which is all the std flags are turned
+         * on.  If more precision is desired, we could do a population count of
+         * the flags set.  This could be done with a small lookup table, or by
+         * shifting, masking and adding, or even, when available, assembly
+         * language for a machine-language population count.
+         * We never output a minus, as all those are defaults, so are
+         * covered by the caret */
+       const STRLEN wraplen = plen + has_p + has_runon
+            + has_default       /* If needs a caret */
+            + has_charset       /* If needs a character set specifier */
             + (sizeof(STD_PAT_MODS) - 1)
             + (sizeof("(?:)") - 1);
 
             + (sizeof(STD_PAT_MODS) - 1)
             + (sizeof("(?:)") - 1);
 
-       p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
-       SvCUR_set(rx, wraplen);
+        p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
        SvPOK_on(rx);
        SvFLAGS(rx) |= SvUTF8(pattern);
         *p++='('; *p++='?';
        SvPOK_on(rx);
        SvFLAGS(rx) |= SvUTF8(pattern);
         *p++='('; *p++='?';
+
+        /* If a default, cover it using the caret */
+        if (has_default) {
+            *p++= DEFAULT_PAT_MOD;
+        }
+        if (has_charset) {
+            if (r->extflags & RXf_PMf_LOCALE) {
+                *p++ = LOCALE_PAT_MOD;
+            } else {
+                *p++ = UNICODE_PAT_MOD;
+            }
+        }
         if (has_p)
             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
         {
         if (has_p)
             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
         {
-            char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
-            char *colon = r + 1;
             char ch;
             char ch;
-
             while((ch = *fptr++)) {
                 if(reganch & 1)
                     *p++ = ch;
             while((ch = *fptr++)) {
                 if(reganch & 1)
                     *p++ = ch;
-                else
-                    *r-- = ch;
                 reganch >>= 1;
             }
                 reganch >>= 1;
             }
-            if(has_minus) {
-                *r = '-';
-                p = colon;
-            }
         }
 
         *p++ = ':';
         }
 
         *p++ = ':';
@@ -4436,6 +4558,7 @@ redo_first_pass:
             *p++ = '\n';
         *p++ = ')';
         *p = 0;
             *p++ = '\n';
         *p++ = ')';
         *p = 0;
+       SvCUR_set(rx, p - SvPVX_const(rx));
     }
 
     r->intflags = 0;
     }
 
     r->intflags = 0;
@@ -4604,7 +4727,7 @@ reStudy:
            ri->regstclass = trie_op;
        }
 #endif 
            ri->regstclass = trie_op;
        }
 #endif 
-       else if (strchr((const char*)PL_simple,OP(first)))
+       else if (REGNODE_SIMPLE(OP(first)))
            ri->regstclass = first;
        else if (PL_regkind[OP(first)] == BOUND ||
                 PL_regkind[OP(first)] == NBOUND)
            ri->regstclass = first;
        else if (PL_regkind[OP(first)] == BOUND ||
                 PL_regkind[OP(first)] == NBOUND)
@@ -4945,7 +5068,7 @@ reStudy:
 #endif
 #ifdef DEBUGGING
     if (RExC_paren_names) {
 #endif
 #ifdef DEBUGGING
     if (RExC_paren_names) {
-        ri->name_list_idx = add_data( pRExC_state, 1, "p" );
+        ri->name_list_idx = add_data( pRExC_state, 1, "a" );
         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
     } else
 #endif
         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
     } else
 #endif
@@ -4995,7 +5118,7 @@ Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
     if (flags & RXapif_FETCH) {
         return reg_named_buff_fetch(rx, key, flags);
     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
     if (flags & RXapif_FETCH) {
         return reg_named_buff_fetch(rx, key, flags);
     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
-        Perl_croak(aTHX_ "%s", PL_no_modify);
+        Perl_croak_no_modify(aTHX);
         return NULL;
     } else if (flags & RXapif_EXISTS) {
         return reg_named_buff_exists(rx, key, flags)
         return NULL;
     } else if (flags & RXapif_EXISTS) {
         return reg_named_buff_exists(rx, key, flags)
@@ -5295,7 +5418,7 @@ Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
     PERL_UNUSED_ARG(value);
 
     if (!PL_localizing)
     PERL_UNUSED_ARG(value);
 
     if (!PL_localizing)
-        Perl_croak(aTHX_ "%s", PL_no_modify);
+        Perl_croak_no_modify(aTHX);
 }
 
 I32
 }
 
 I32
@@ -5636,6 +5759,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
        if (*RExC_parse == '?') { /* (?...) */
            bool is_logical = 0;
            const char * const seqstart = RExC_parse;
        if (*RExC_parse == '?') { /* (?...) */
            bool is_logical = 0;
            const char * const seqstart = RExC_parse;
+            bool has_use_defaults = FALSE;
 
            RExC_parse++;
            paren = *RExC_parse++;
 
            RExC_parse++;
            paren = *RExC_parse++;
@@ -5765,7 +5889,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                RExC_parse++;
            case '=':           /* (?=...) */
                RExC_seen_zerolen++;
                RExC_parse++;
            case '=':           /* (?=...) */
                RExC_seen_zerolen++;
-                       break;
+                break;
            case '!':           /* (?!...) */
                RExC_seen_zerolen++;
                if (*RExC_parse == ')') {
            case '!':           /* (?!...) */
                RExC_seen_zerolen++;
                if (*RExC_parse == ')') {
@@ -6090,12 +6214,19 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                RExC_parse--; /* for vFAIL to print correctly */
                 vFAIL("Sequence (? incomplete");
                 break;
                RExC_parse--; /* for vFAIL to print correctly */
                 vFAIL("Sequence (? incomplete");
                 break;
+            case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
+                                      that follow */
+                has_use_defaults = TRUE;
+                STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
+                RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+                goto parse_flags;
            default:
                --RExC_parse;
                parse_flags:      /* (?i) */  
            {
                 U32 posflags = 0, negflags = 0;
                U32 *flagsp = &posflags;
            default:
                --RExC_parse;
                parse_flags:      /* (?i) */  
            {
                 U32 posflags = 0, negflags = 0;
                U32 *flagsp = &posflags;
+                bool has_charset_modifier = 0;
 
                while (*RExC_parse) {
                    /* && strchr("iogcmsx", *RExC_parse) */
 
                while (*RExC_parse) {
                    /* && strchr("iogcmsx", *RExC_parse) */
@@ -6103,6 +6234,32 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                       and must be globally applied -- japhy */
                     switch (*RExC_parse) {
                    CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
                       and must be globally applied -- japhy */
                     switch (*RExC_parse) {
                    CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
+                    case LOCALE_PAT_MOD:
+                        if (has_charset_modifier || flagsp == &negflags) {
+                            goto fail_modifiers;
+                        }
+                        posflags |= RXf_PMf_LOCALE;
+                        negflags |= RXf_PMf_UNICODE;
+                        has_charset_modifier = 1;
+                        break;
+                    case UNICODE_PAT_MOD:
+                        if (has_charset_modifier || flagsp == &negflags) {
+                            goto fail_modifiers;
+                        }
+                        posflags |= RXf_PMf_UNICODE;
+                        negflags |= RXf_PMf_LOCALE;
+                        has_charset_modifier = 1;
+                        break;
+                    case DUAL_PAT_MOD:
+                        if (has_use_defaults
+                            || has_charset_modifier
+                            || flagsp == &negflags)
+                        {
+                            goto fail_modifiers;
+                        }
+                        negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+                        has_charset_modifier = 1;
+                        break;
                     case ONCE_PAT_MOD: /* 'o' */
                     case GLOBAL_PAT_MOD: /* 'g' */
                        if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
                     case ONCE_PAT_MOD: /* 'o' */
                     case GLOBAL_PAT_MOD: /* 'g' */
                        if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
@@ -6143,7 +6300,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         }
                        break;
                     case '-':
                         }
                        break;
                     case '-':
-                        if (flagsp == &negflags) {
+                        /* A flag is a default iff it is following a minus,  so
+                         * if there is a minus, it means will be trying to
+                         * re-specify a default which is an error */
+                        if (has_use_defaults || flagsp == &negflags) {
+            fail_modifiers:
                             RExC_parse++;
                            vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
                            /*NOTREACHED*/
                             RExC_parse++;
                            vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
                            /*NOTREACHED*/
@@ -6746,7 +6907,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
        return (regnode *) &RExC_parse; /* Invalid regnode pointer */
     }
 
        return (regnode *) &RExC_parse; /* Invalid regnode pointer */
     }
 
-    RExC_utf8 = 1;     /* named sequences imply Unicode semantics */
+    REQUIRE_UTF8;      /* named sequences imply Unicode semantics */
     RExC_parse += 2;   /* Skip past the 'U+' */
 
     if (valuep) {   /* In a bracketed char class */
     RExC_parse += 2;   /* Skip past the 'U+' */
 
     if (valuep) {   /* In a bracketed char class */
@@ -6791,7 +6952,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
     }
     else {     /* Not a char class */
        char *s;            /* String to put in generated EXACT node */
     }
     else {     /* Not a char class */
        char *s;            /* String to put in generated EXACT node */
-       STRLEN len = 0;     /* Its current length */
+       STRLEN len = 0;     /* Its current byte length */
        char *endchar;      /* Points to '.' or '}' ending cur char in the input
                               stream */
 
        char *endchar;      /* Points to '.' or '}' ending cur char in the input
                               stream */
 
@@ -6801,7 +6962,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
 
        /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
         * the input which is of the form now 'c1.c2.c3...}' until find the
 
        /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
         * the input which is of the form now 'c1.c2.c3...}' until find the
-        * ending brace or exeed length 255.  The characters that exceed this
+        * ending brace or exceed length 255.  The characters that exceed this
         * limit are dropped.  The limit could be relaxed should it become
         * desirable by reparsing this as (?:\N{NAME}), so could generate
         * multiple EXACT nodes, as is done for just regular input.  But this
         * limit are dropped.  The limit could be relaxed should it become
         * desirable by reparsing this as (?:\N{NAME}), so could generate
         * multiple EXACT nodes, as is done for just regular input.  But this
@@ -7134,31 +7295,61 @@ tryagain:
            *flagp |= HASWIDTH;
            goto finish_meta_pat;
        case 'w':
            *flagp |= HASWIDTH;
            goto finish_meta_pat;
        case 'w':
-           ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
+           if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(ALNUML));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(ALNUM));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'W':
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'W':
-           ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NALNUML));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NALNUM));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'b':
            RExC_seen_zerolen++;
            RExC_seen |= REG_SEEN_LOOKBEHIND;
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'b':
            RExC_seen_zerolen++;
            RExC_seen |= REG_SEEN_LOOKBEHIND;
-           ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(BOUNDL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(BOUND));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= SIMPLE;
            goto finish_meta_pat;
        case 'B':
            RExC_seen_zerolen++;
            RExC_seen |= REG_SEEN_LOOKBEHIND;
            *flagp |= SIMPLE;
            goto finish_meta_pat;
        case 'B':
            RExC_seen_zerolen++;
            RExC_seen |= REG_SEEN_LOOKBEHIND;
-           ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NBOUNDL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NBOUND));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= SIMPLE;
            goto finish_meta_pat;
        case 's':
            *flagp |= SIMPLE;
            goto finish_meta_pat;
        case 's':
-           ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(SPACEL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(SPACE));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'S':
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'S':
-           ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NSPACEL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NSPACE));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'd':
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'd':
@@ -7462,6 +7653,35 @@ tryagain:
                          ender = ASCII_TO_NATIVE('\007');
                        p++;
                        break;
                          ender = ASCII_TO_NATIVE('\007');
                        p++;
                        break;
+                   case 'o':
+                       {
+                           STRLEN brace_len = len;
+                           UV result;
+                           const char* error_msg;
+
+                           bool valid = grok_bslash_o(p,
+                                                      &result,
+                                                      &brace_len,
+                                                      &error_msg,
+                                                      1);
+                           p += brace_len;
+                           if (! valid) {
+                               RExC_parse = p; /* going to die anyway; point
+                                                  to exact spot of failure */
+                               vFAIL(error_msg);
+                           }
+                           else
+                           {
+                               ender = result;
+                           }
+                           if (PL_encoding && ender < 0x100) {
+                               goto recode_encoding;
+                           }
+                           if (ender > 0xff) {
+                               REQUIRE_UTF8;
+                           }
+                           break;
+                       }
                    case 'x':
                        if (*++p == '{') {
                            char* const e = strchr(p, '}');
                    case 'x':
                        if (*++p == '{') {
                            char* const e = strchr(p, '}');
@@ -7476,7 +7696,7 @@ tryagain:
                                 STRLEN numlen = e - p - 1;
                                ender = grok_hex(p + 1, &numlen, &flags, NULL);
                                if (ender > 0xff)
                                 STRLEN numlen = e - p - 1;
                                ender = grok_hex(p + 1, &numlen, &flags, NULL);
                                if (ender > 0xff)
-                                   RExC_utf8 = 1;
+                                   REQUIRE_UTF8;
                                p = e + 1;
                            }
                        }
                                p = e + 1;
                            }
                        }
@@ -7496,21 +7716,13 @@ tryagain:
                    case '0': case '1': case '2': case '3':case '4':
                    case '5': case '6': case '7': case '8':case '9':
                        if (*p == '0' ||
                    case '0': case '1': case '2': case '3':case '4':
                    case '5': case '6': case '7': case '8':case '9':
                        if (*p == '0' ||
-                         (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
-                            I32 flags = 0;
+                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
+                       {
+                           I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
                            STRLEN numlen = 3;
                            ender = grok_oct(p, &numlen, &flags, NULL);
                            STRLEN numlen = 3;
                            ender = grok_oct(p, &numlen, &flags, NULL);
-
-                           /* An octal above 0xff is interpreted differently
-                            * depending on if the re is in utf8 or not.  If it
-                            * is in utf8, the value will be itself, otherwise
-                            * it is interpreted as modulo 0x100.  It has been
-                            * decided to discourage the use of octal above the
-                            * single-byte range.  For now, warn only when
-                            * it ends up modulo */
-                           if (SIZE_ONLY && ender >= 0x100
-                                   && ! UTF && ! PL_encoding) {
-                               ckWARNregdep(p, "Use of octal value above 377 is deprecated");
+                           if (ender > 0xff) {
+                               REQUIRE_UTF8;
                            }
                            p += numlen;
                        }
                            }
                            p += numlen;
                        }
@@ -7527,7 +7739,7 @@ tryagain:
                            ender = reg_recode((const char)(U8)ender, &enc);
                            if (!enc && SIZE_ONLY)
                                ckWARNreg(p, "Invalid escape in the specified encoding");
                            ender = reg_recode((const char)(U8)ender, &enc);
                            if (!enc && SIZE_ONLY)
                                ckWARNreg(p, "Invalid escape in the specified encoding");
-                           RExC_utf8 = 1;
+                           REQUIRE_UTF8;
                        }
                        break;
                    case '\0':
                        }
                        break;
                    case '\0':
@@ -7844,43 +8056,55 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
     }
 }
 
     }
 }
 
-
-#define _C_C_T_(NAME,TEST,WORD)                         \
+/* No locale test */
+#define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
 ANYOF_##NAME:                                           \
 ANYOF_##NAME:                                           \
-    if (LOC)                                            \
-       ANYOF_CLASS_SET(ret, ANYOF_##NAME);             \
-    else {                                              \
        for (value = 0; value < 256; value++)           \
            if (TEST)                                   \
                ANYOF_BITMAP_SET(ret, value);           \
        for (value = 0; value < 256; value++)           \
            if (TEST)                                   \
                ANYOF_BITMAP_SET(ret, value);           \
-    }                                                   \
     yesno = '+';                                        \
     what = WORD;                                        \
     break;                                              \
 case ANYOF_N##NAME:                                     \
     yesno = '+';                                        \
     what = WORD;                                        \
     break;                                              \
 case ANYOF_N##NAME:                                     \
-    if (LOC)                                            \
-       ANYOF_CLASS_SET(ret, ANYOF_N##NAME);            \
-    else {                                              \
        for (value = 0; value < 256; value++)           \
            if (!TEST)                                  \
                ANYOF_BITMAP_SET(ret, value);           \
        for (value = 0; value < 256; value++)           \
            if (!TEST)                                  \
                ANYOF_BITMAP_SET(ret, value);           \
-    }                                                   \
     yesno = '!';                                        \
     what = WORD;                                        \
     break
 
     yesno = '!';                                        \
     what = WORD;                                        \
     break
 
-#define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
+/* Like the above, but there are differences if we are in uni-8-bit or not, so
+ * there are two tests passed in, to use depending on that. There aren't any
+ * cases where the label is different from the name, so no need for that
+ * parameter */
+#define _C_C_T_(NAME,TEST_8,TEST_7,WORD)       \
 ANYOF_##NAME:                                           \
 ANYOF_##NAME:                                           \
-       for (value = 0; value < 256; value++)           \
-           if (TEST)                                   \
-               ANYOF_BITMAP_SET(ret, value);           \
+    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);        \
+    else if (UNI_SEMANTICS) {                           \
+        for (value = 0; value < 256; value++) {         \
+            if (TEST_8) ANYOF_BITMAP_SET(ret, value);   \
+        }                                               \
+    }                                                   \
+    else {                                              \
+        for (value = 0; value < 256; value++) {         \
+            if (TEST_7) ANYOF_BITMAP_SET(ret, value);   \
+        }                                               \
+    }                                                   \
     yesno = '+';                                        \
     what = WORD;                                        \
     break;                                              \
 case ANYOF_N##NAME:                                     \
     yesno = '+';                                        \
     what = WORD;                                        \
     break;                                              \
 case ANYOF_N##NAME:                                     \
-       for (value = 0; value < 256; value++)           \
-           if (!TEST)                                  \
-               ANYOF_BITMAP_SET(ret, value);           \
+    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);       \
+    else if (UNI_SEMANTICS) {                           \
+        for (value = 0; value < 256; value++) {         \
+            if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \
+        }                                               \
+    }                                                   \
+    else {                                              \
+        for (value = 0; value < 256; value++) {         \
+            if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \
+        }                                               \
+    }                                                   \
     yesno = '!';                                        \
     what = WORD;                                        \
     break
     yesno = '!';                                        \
     what = WORD;                                        \
     break
@@ -7928,7 +8152,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
 #ifdef EBCDIC
     UV literal_endpoint = 0;
 #endif
 #ifdef EBCDIC
     UV literal_endpoint = 0;
 #endif
-    UV stored = 0;  /* number of chars stored in the class */
+    UV stored = 0;  /* 0, 1, or more than 1 chars stored in the class */
 
     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
         case we need to change the emitted regop to an EXACT. */
 
     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
         case we need to change the emitted regop to an EXACT. */
@@ -8086,6 +8310,24 @@ parseit:
            case 'b':   value = '\b';                   break;
            case 'e':   value = ASCII_TO_NATIVE('\033');break;
            case 'a':   value = ASCII_TO_NATIVE('\007');break;
            case 'b':   value = '\b';                   break;
            case 'e':   value = ASCII_TO_NATIVE('\033');break;
            case 'a':   value = ASCII_TO_NATIVE('\007');break;
+           case 'o':
+               RExC_parse--;   /* function expects to be pointed at the 'o' */
+               {
+                   const char* error_msg;
+                   bool valid = grok_bslash_o(RExC_parse,
+                                              &value,
+                                              &numlen,
+                                              &error_msg,
+                                              SIZE_ONLY);
+                   RExC_parse += numlen;
+                   if (! valid) {
+                       vFAIL(error_msg);
+                   }
+               }
+               if (PL_encoding && value < 0x100) {
+                   goto recode_encoding;
+               }
+               break;
            case 'x':
                if (*RExC_parse == '{') {
                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
            case 'x':
                if (*RExC_parse == '{') {
                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
@@ -8111,9 +8353,10 @@ parseit:
                value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
                break;
            case '0': case '1': case '2': case '3': case '4':
                value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
                break;
            case '0': case '1': case '2': case '3': case '4':
-           case '5': case '6': case '7': case '8': case '9':
+           case '5': case '6': case '7':
                {
                {
-                   I32 flags = 0;
+                   /* Take 1-3 octal digits */
+                   I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
                    numlen = 3;
                    value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
                    RExC_parse += numlen;
                    numlen = 3;
                    value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
                    RExC_parse += numlen;
@@ -8131,10 +8374,12 @@ parseit:
                    break;
                }
            default:
                    break;
                }
            default:
-               if (!SIZE_ONLY && isALPHA(value))
+               /* Allow \_ to not give an error */
+               if (!SIZE_ONLY && isALNUM(value) && value != '_') {
                    ckWARN2reg(RExC_parse,
                               "Unrecognized escape \\%c in character class passed through",
                               (int)value);
                    ckWARN2reg(RExC_parse,
                               "Unrecognized escape \\%c in character class passed through",
                               (int)value);
+               }
                break;
            }
        } /* end of \blah */
                break;
            }
        } /* end of \blah */
@@ -8145,10 +8390,23 @@ parseit:
 
        if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
 
 
        if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
 
-           if (!SIZE_ONLY && !need_class)
-               ANYOF_CLASS_ZERO(ret);
-
-           need_class = 1;
+           /* What matches in a locale is not known until runtime, so need to
+            * (one time per class) allocate extra space to pass to regexec.
+            * The space will contain a bit for each named class that is to be
+            * matched against.  This isn't needed for \p{} and pseudo-classes,
+            * as they are not affected by locale, and hence are dealt with
+            * separately */
+           if (LOC && namedclass < ANYOF_MAX && ! need_class) {
+               need_class = 1;
+               if (SIZE_ONLY) {
+                   RExC_size += ANYOF_CLASS_ADD_SKIP;
+               }
+               else {
+                   RExC_emit += ANYOF_CLASS_ADD_SKIP;
+                   ANYOF_CLASS_ZERO(ret);
+               }
+                   ANYOF_FLAGS(ret) |= ANYOF_CLASS|ANYOF_LARGE;
+           }
 
            /* a bad range like a-\d, a-[:digit:] ? */
            if (range) {
 
            /* a bad range like a-\d, a-[:digit:] ? */
            if (range) {
@@ -8188,24 +8446,26 @@ parseit:
                 * --jhi */
                switch ((I32)namedclass) {
                
                 * --jhi */
                switch ((I32)namedclass) {
                
-               case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
-               case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
-               case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
-               case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
-               case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
-               case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
-               case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
-               case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
-               case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
-               case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
+               case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
+               case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
+               case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
+               case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
+               case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
+               case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
+               case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
+               case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
+               case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
+               case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
-               case _C_C_T_(ALNUM, isALNUM(value), "Word");
-               case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
+                /* \s, \w match all unicode if utf8. */
+                case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
+                case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
 #else
 #else
-               case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
-               case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
+                /* \s, \w match ascii and locale only */
+                case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
+                case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
 #endif         
 #endif         
-               case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
+               case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
                case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
                case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
                case ANYOF_ASCII:
                case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
                case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
                case ANYOF_ASCII:
@@ -8277,8 +8537,7 @@ parseit:
                    /* Strings such as "+utf8::isWord\n" */
                    Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
                }
                    /* Strings such as "+utf8::isWord\n" */
                    Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
                }
-               if (LOC)
-                   ANYOF_FLAGS(ret) |= ANYOF_CLASS;
+               stored+=2; /* can't optimize this class */
                continue;
            }
        } /* end of namedclass \blah */
                continue;
            }
        } /* end of namedclass \blah */
@@ -8437,13 +8696,6 @@ parseit:
        range = 0; /* this range (if it was one) is done now */
     }
 
        range = 0; /* this range (if it was one) is done now */
     }
 
-    if (need_class) {
-       ANYOF_FLAGS(ret) |= ANYOF_LARGE;
-       if (SIZE_ONLY)
-           RExC_size += ANYOF_CLASS_ADD_SKIP;
-       else
-           RExC_emit += ANYOF_CLASS_ADD_SKIP;
-    }
 
 
     if (SIZE_ONLY)
 
 
     if (SIZE_ONLY)
@@ -8547,7 +8799,7 @@ S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
 
 /* nextchar()
 
 
 /* nextchar()
 
-   Advance that parse position, and optionall absorbs
+   Advance that parse position, and optionally absorbs
    "whitespace" from the inputstream.
 
    Without /x "whitespace" means (?#...) style comments only,
    "whitespace" from the inputstream.
 
    Without /x "whitespace" means (?#...) style comments only,
@@ -8929,31 +9181,6 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
 #endif
 
 /*
 #endif
 
 /*
- - regcurly - a little FSA that accepts {\d+,?\d*}
- */
-#ifndef PERL_IN_XSUB_RE
-I32
-Perl_regcurly(register const char *s)
-{
-    PERL_ARGS_ASSERT_REGCURLY;
-
-    if (*s++ != '{')
-       return FALSE;
-    if (!isDIGIT(*s))
-       return FALSE;
-    while (isDIGIT(*s))
-       s++;
-    if (*s == ',')
-       s++;
-    while (isDIGIT(*s))
-       s++;
-    if (*s != '}')
-       return FALSE;
-    return TRUE;
-}
-#endif
-
-/*
  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
  */
 #ifdef DEBUGGING
  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
  */
 #ifdef DEBUGGING
@@ -9556,6 +9783,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
        while (--n >= 0) {
           /* If you add a ->what type here, update the comment in regcomp.h */
            switch (ri->data->what[n]) {
        while (--n >= 0) {
           /* If you add a ->what type here, update the comment in regcomp.h */
            switch (ri->data->what[n]) {
+           case 'a':
            case 's':
            case 'S':
            case 'u':
            case 's':
            case 'S':
            case 'u':
@@ -9636,9 +9864,8 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
     Safefree(ri);
 }
 
     Safefree(ri);
 }
 
-#define sv_dup_inc(s,t)        SvREFCNT_inc(sv_dup(s,t))
-#define av_dup_inc(s,t)        MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
-#define hv_dup_inc(s,t)        MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
+#define av_dup_inc(s,t)        MUTABLE_AV(sv_dup_inc((const SV *)s,t))
+#define hv_dup_inc(s,t)        MUTABLE_HV(sv_dup_inc((const SV *)s,t))
 #define SAVEPVN(p,n)   ((p) ? savepvn(p,n) : NULL)
 
 /* 
 #define SAVEPVN(p,n)   ((p) ? savepvn(p,n) : NULL)
 
 /* 
@@ -9790,8 +10017,9 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
        for (i = 0; i < count; i++) {
            d->what[i] = ri->data->what[i];
            switch (d->what[i]) {
        for (i = 0; i < count; i++) {
            d->what[i] = ri->data->what[i];
            switch (d->what[i]) {
-               /* legal options are one of: sSfpontTu
+               /* legal options are one of: sSfpontTua
                   see also regcomp.h and pregfree() */
                   see also regcomp.h and pregfree() */
+           case 'a': /* actually an AV, but the dup function is identical.  */
            case 's':
            case 'S':
            case 'p': /* actually an AV, but the dup function is identical.  */
            case 's':
            case 'S':
            case 'p': /* actually an AV, but the dup function is identical.  */
@@ -9867,6 +10095,10 @@ Perl_regnext(pTHX_ register regnode *p)
     if (!p)
        return(NULL);
 
     if (!p)
        return(NULL);
 
+    if (OP(p) > REGNODE_MAX) {         /* regnode.type is unsigned */
+       Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
+    }
+
     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
     if (offset == 0)
        return(NULL);
     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
     if (offset == 0)
        return(NULL);