This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
remove redundant lex_end
[perl5.git] / regcomp.c
index 74f996b..e0f65fa 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -85,6 +85,8 @@
 #  include "regcomp.h"
 #endif
 
+#include "dquote_static.c"
+
 #ifdef op
 #undef op
 #endif /* op */
@@ -196,7 +198,10 @@ typedef struct RExC_state_t {
  */
 #define        WORST           0       /* Worst case. */
 #define        HASWIDTH        0x01    /* Known to match non-null strings. */
-#define        SIMPLE          0x02    /* Simple enough to be STAR/PLUS operand. */
+
+/* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single
+ * character, and if utf8, must be invariant. */
+#define        SIMPLE          0x02
 #define        SPSTART         0x04    /* Starts with * or +. */
 #define TRYAGAIN       0x08    /* Weeded out a declaration. */
 #define POSTPONED      0x10    /* (?1),(?&name), (??{...}) or similar */
@@ -218,6 +223,11 @@ typedef struct RExC_state_t {
 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 
+/* If not already in utf8, do a longjmp back to the beginning */
+#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
+#define REQUIRE_UTF8   STMT_START {                                       \
+                                     if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
+                        } STMT_END
 
 /* About scan_data_t.
 
@@ -358,9 +368,10 @@ static const scan_data_t zero_scan_data =
 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 #define SCF_SEEN_ACCEPT         0x8000 
 
-#define UTF (RExC_utf8 != 0)
-#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
-#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
+#define UTF cBOOL(RExC_utf8)
+#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
+#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
+#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 
 #define OOB_UNICODE            12345678
 #define OOB_NAMEDCLASS         -1
@@ -870,7 +881,7 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
   Dumps the final compressed table form of the trie to Perl_debug_log.
   Used for debugging make_trie().
 */
+
 STATIC void
 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
            AV *revcharmap, U32 depth)
@@ -3196,7 +3207,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                /* These are the cases when once a subexpression
                   fails at a particular position, it cannot succeed
                   even after backtracking at the enclosing scope.
-               
+
                   XXXX what if minimal match and we are at the
                        initial run of {n,m}? */
                if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
@@ -3289,11 +3300,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 
 #ifdef DEBUGGING
                    OP(nxt1 + 1) = OPTIMIZED; /* was count. */
-                   NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
-                   NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
+                   NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
+                   NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
                    OP(nxt) = OPTIMIZED;        /* was CLOSE. */
                    OP(nxt + 1) = OPTIMIZED; /* was count. */
-                   NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
+                   NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
 #endif
                }
              nogo:
@@ -3316,12 +3327,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        nxt = nxt2;
                    OP(nxt2)  = SUCCEED; /* Whas WHILEM */
                    /* Need to optimize away parenths. */
-                   if (data->flags & SF_IN_PAR) {
+                   if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
                        /* Set the parenth number.  */
                        regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
 
-                       if (OP(nxt) != CLOSE)
-                           FAIL("Panic opt close");
                        oscan->flags = (U8)ARG(nxt);
                        if (RExC_open_parens) {
                            RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
@@ -3339,7 +3348,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 #if 0
                        while ( nxt1 && (OP(nxt1) != WHILEM)) {
                            regnode *nnxt = regnext(nxt1);
-                       
                            if (nnxt == nxt) {
                                if (reg_off_by_arg[OP(nxt1)])
                                    ARG_SET(nxt1, nxt2 - nxt1);
@@ -3406,7 +3414,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 
                        if (UTF)
                            old = utf8_hop((U8*)s, old) - (U8*)s;
-                       
                        l -= old;
                        /* Get the added string: */
                        last_str = newSVpvn_utf8(s  + old, l, UTF);
@@ -3494,13 +3501,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                if (flags & SCF_DO_STCLASS_AND) {
                     for (value = 0; value < 256; value++)
                         if (!is_VERTWS_cp(value))
-                            ANYOF_BITMAP_CLEAR(data->start_class, value);  
-                }                                                              
-                else {                                                         
+                            ANYOF_BITMAP_CLEAR(data->start_class, value);
+                }
+                else {
                     for (value = 0; value < 256; value++)
                         if (is_VERTWS_cp(value))
-                            ANYOF_BITMAP_SET(data->start_class, value);           
-                }                                                              
+                            ANYOF_BITMAP_SET(data->start_class, value);
+                }
                 if (flags & SCF_DO_STCLASS_OR)
                    cl_and(data->start_class, and_withp);
                flags &= ~SCF_DO_STCLASS;
@@ -3513,7 +3520,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                data->pos_delta += 1;
                data->longest = &(data->longest_float);
            }
-           
        }
        else if (OP(scan) == FOLDCHAR) {
            int d = ARG(scan)==0xDF ? 1 : 2;
@@ -3571,19 +3577,37 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
-                           for (value = 0; value < 256; value++)
-                               if (!isALNUM(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                            if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isWORDCHAR_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isALNUM(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            }
                        }
                    }
                    else {
                        if (data->start_class->flags & ANYOF_LOCALE)
                            ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (isALNUM(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
-                       }
+                        else if (FLAGS(scan) & USE_UNI) {
+                            for (value = 0; value < 256; value++) {
+                                if (isWORDCHAR_L1(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        } else {
+                            for (value = 0; value < 256; value++) {
+                                if (isALNUM(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        }
                    }
                    break;
                case ALNUML:
@@ -3600,9 +3624,19 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
-                           for (value = 0; value < 256; value++)
-                               if (isALNUM(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                            if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (isWORDCHAR_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (isALNUM(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                           }
                        }
                    }
                    else {
@@ -3611,7 +3645,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        else {
                            for (value = 0; value < 256; value++)
                                if (!isALNUM(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
+                                   ANYOF_BITMAP_SET(data->start_class, value);
                        }
                    }
                    break;
@@ -3629,18 +3663,37 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
-                           for (value = 0; value < 256; value++)
-                               if (!isSPACE(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                           if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isSPACE_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isSPACE(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            }
                        }
                    }
                    else {
-                       if (data->start_class->flags & ANYOF_LOCALE)
+                        if (data->start_class->flags & ANYOF_LOCALE) {
                            ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (isSPACE(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
+                        }
+                        else if (FLAGS(scan) & USE_UNI) {
+                            for (value = 0; value < 256; value++) {
+                                if (isSPACE_L1(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        } else {
+                            for (value = 0; value < 256; value++) {
+                                if (isSPACE(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
                        }
                    }
                    break;
@@ -3658,19 +3711,38 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
-                           for (value = 0; value < 256; value++)
-                               if (isSPACE(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                            if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (isSPACE_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (isSPACE(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            }
                        }
                    }
                    else {
                        if (data->start_class->flags & ANYOF_LOCALE)
                            ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (!isSPACE(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
-                       }
+                        else if (FLAGS(scan) & USE_UNI) {
+                            for (value = 0; value < 256; value++) {
+                                if (!isSPACE_L1(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        }
+                        else {
+                            for (value = 0; value < 256; value++) {
+                                if (!isSPACE(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        }
                    }
                    break;
                case NSPACEL:
@@ -3700,7 +3772,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        else {
                            for (value = 0; value < 256; value++)
                                if (isDIGIT(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
+                                   ANYOF_BITMAP_SET(data->start_class, value);
                        }
                    }
                    break;
@@ -3717,7 +3789,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        else {
                            for (value = 0; value < 256; value++)
                                if (!isDIGIT(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
+                                   ANYOF_BITMAP_SET(data->start_class, value);
                        }
                    }
                    break;
@@ -4264,18 +4336,20 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
     struct regexp *r;
     register regexp_internal *ri;
     STRLEN plen;
-    char  *exp = SvPV(pattern, plen);
-    char* xend = exp + plen;
+    char  *exp;
+    char* xend;
     regnode *scan;
     I32 flags;
     I32 minlen = 0;
     I32 sawplus = 0;
     I32 sawopen = 0;
+    U8 jump_ret = 0;
+    dJMPENV;
     scan_data_t data;
     RExC_state_t RExC_state;
     RExC_state_t * const pRExC_state = &RExC_state;
 #ifdef TRIE_STUDY_OPT    
-    int restudied= 0;
+    int restudied;
     RExC_state_t copyRExC_state;
 #endif    
     GET_RE_DEBUG_FLAGS_DECL;
@@ -4286,15 +4360,56 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
 
     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
 
-    DEBUG_COMPILE_r({
-        SV *dsv= sv_newmortal();
-        RE_PV_QUOTED_DECL(s, RExC_utf8,
-            dsv, exp, plen, 60);
-        PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
-                      PL_colors[4],PL_colors[5],s);
-    });
 
-redo_first_pass:
+    /* Longjmp back to here if have to switch in midstream to utf8 */
+    if (! RExC_orig_utf8) {
+       JMPENV_PUSH(jump_ret);
+    }
+
+    if (jump_ret == 0) {    /* First time through */
+        exp = SvPV(pattern, plen);
+        xend = exp + plen;
+
+        DEBUG_COMPILE_r({
+            SV *dsv= sv_newmortal();
+            RE_PV_QUOTED_DECL(s, RExC_utf8,
+                dsv, exp, plen, 60);
+            PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
+                           PL_colors[4],PL_colors[5],s);
+        });
+    }
+    else {  /* longjumped back */
+        STRLEN len = plen;
+
+        /* If the cause for the longjmp was other than changing to utf8, pop
+         * our own setjmp, and longjmp to the correct handler */
+       if (jump_ret != UTF8_LONGJMP) {
+           JMPENV_POP;
+           JMPENV_JUMP(jump_ret);
+       }
+
+       GET_RE_DEBUG_FLAGS;
+
+        /* It's possible to write a regexp in ascii that represents Unicode
+        codepoints outside of the byte range, such as via \x{100}. If we
+        detect such a sequence we have to convert the entire pattern to utf8
+        and then recompile, as our sizing calculation will have been based
+        on 1 byte == 1 character, but we will need to use utf8 to encode
+        at least some part of the pattern, and therefore must convert the whole
+        thing.
+        -- dmq */
+        DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+           "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
+        exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
+        xend = exp + len;
+        RExC_orig_utf8 = RExC_utf8 = 1;
+        SAVEFREEPV(exp);
+    }
+
+#ifdef TRIE_STUDY_OPT
+    restudied = 0;
+#endif
+
     RExC_precomp = exp;
     RExC_flags = pm_flags;
     RExC_sawback = 0;
@@ -4333,24 +4448,14 @@ redo_first_pass:
        RExC_precomp = NULL;
        return(NULL);
     }
-    if (RExC_utf8 && !RExC_orig_utf8) {
-        /* It's possible to write a regexp in ascii that represents Unicode
-        codepoints outside of the byte range, such as via \x{100}. If we
-        detect such a sequence we have to convert the entire pattern to utf8
-        and then recompile, as our sizing calculation will have been based
-        on 1 byte == 1 character, but we will need to use utf8 to encode
-        at least some part of the pattern, and therefore must convert the whole
-        thing.
-        XXX: somehow figure out how to make this less expensive...
-        -- dmq */
-        STRLEN len = plen;
-        DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
-           "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
-        exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
-        xend = exp + len;
-        RExC_orig_utf8 = RExC_utf8;
-        SAVEFREEPV(exp);
-        goto redo_first_pass;
+
+    /* Here, finished first pass.  Get rid of our setjmp, which we added for
+     * efficiency only if the passed-in string wasn't in utf8, as shown by
+     * RExC_orig_utf8.  If the first pass was redone, that variable will be
+     * 1 here even though the original string wasn't utf8, but in that case
+     * there will have been a long jump */
+    if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
+       JMPENV_POP;
     }
     DEBUG_PARSE_r({
         PerlIO_printf(Perl_debug_log, 
@@ -4392,39 +4497,56 @@ redo_first_pass:
     r->extflags = pm_flags;
     {
         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
-       bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
+        bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
+
+        /* The caret is output if there are any defaults: if not all the STD
+         * flags are set, or if no character set specifier is needed */
+        bool has_default =
+                    (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
+                    || ! has_charset);
        bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
        U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
                            >> RXf_PMf_STD_PMMOD_SHIFT);
        const char *fptr = STD_PAT_MODS;        /*"msix"*/
        char *p;
-       const STRLEN wraplen = plen + has_minus + has_p + has_runon
+        /* Allocate for the worst case, which is all the std flags are turned
+         * on.  If more precision is desired, we could do a population count of
+         * the flags set.  This could be done with a small lookup table, or by
+         * shifting, masking and adding, or even, when available, assembly
+         * language for a machine-language population count.
+         * We never output a minus, as all those are defaults, so are
+         * covered by the caret */
+       const STRLEN wraplen = plen + has_p + has_runon
+            + has_default       /* If needs a caret */
+            + has_charset       /* If needs a character set specifier */
             + (sizeof(STD_PAT_MODS) - 1)
             + (sizeof("(?:)") - 1);
 
-       p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
-       SvCUR_set(rx, wraplen);
+        p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
        SvPOK_on(rx);
        SvFLAGS(rx) |= SvUTF8(pattern);
         *p++='('; *p++='?';
+
+        /* If a default, cover it using the caret */
+        if (has_default) {
+            *p++= DEFAULT_PAT_MOD;
+        }
+        if (has_charset) {
+            if (r->extflags & RXf_PMf_LOCALE) {
+                *p++ = LOCALE_PAT_MOD;
+            } else {
+                *p++ = UNICODE_PAT_MOD;
+            }
+        }
         if (has_p)
             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
         {
-            char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
-            char *colon = r + 1;
             char ch;
-
             while((ch = *fptr++)) {
                 if(reganch & 1)
                     *p++ = ch;
-                else
-                    *r-- = ch;
                 reganch >>= 1;
             }
-            if(has_minus) {
-                *r = '-';
-                p = colon;
-            }
         }
 
         *p++ = ':';
@@ -4436,6 +4558,7 @@ redo_first_pass:
             *p++ = '\n';
         *p++ = ')';
         *p = 0;
+       SvCUR_set(rx, p - SvPVX_const(rx));
     }
 
     r->intflags = 0;
@@ -5636,6 +5759,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
        if (*RExC_parse == '?') { /* (?...) */
            bool is_logical = 0;
            const char * const seqstart = RExC_parse;
+            bool has_use_defaults = FALSE;
 
            RExC_parse++;
            paren = *RExC_parse++;
@@ -5765,7 +5889,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                RExC_parse++;
            case '=':           /* (?=...) */
                RExC_seen_zerolen++;
-                       break;
+                break;
            case '!':           /* (?!...) */
                RExC_seen_zerolen++;
                if (*RExC_parse == ')') {
@@ -6090,12 +6214,19 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                RExC_parse--; /* for vFAIL to print correctly */
                 vFAIL("Sequence (? incomplete");
                 break;
+            case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
+                                      that follow */
+                has_use_defaults = TRUE;
+                STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
+                RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+                goto parse_flags;
            default:
                --RExC_parse;
                parse_flags:      /* (?i) */  
            {
                 U32 posflags = 0, negflags = 0;
                U32 *flagsp = &posflags;
+                bool has_charset_modifier = 0;
 
                while (*RExC_parse) {
                    /* && strchr("iogcmsx", *RExC_parse) */
@@ -6103,6 +6234,32 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                       and must be globally applied -- japhy */
                     switch (*RExC_parse) {
                    CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
+                    case LOCALE_PAT_MOD:
+                        if (has_charset_modifier || flagsp == &negflags) {
+                            goto fail_modifiers;
+                        }
+                        *flagsp &= ~RXf_PMf_UNICODE;
+                        *flagsp |= RXf_PMf_LOCALE;
+                        has_charset_modifier = 1;
+                        break;
+                    case UNICODE_PAT_MOD:
+                        if (has_charset_modifier || flagsp == &negflags) {
+                            goto fail_modifiers;
+                        }
+                        *flagsp &= ~RXf_PMf_LOCALE;
+                        *flagsp |= RXf_PMf_UNICODE;
+                        has_charset_modifier = 1;
+                        break;
+                    case DUAL_PAT_MOD:
+                        if (has_use_defaults
+                            || has_charset_modifier
+                            || flagsp == &negflags)
+                        {
+                            goto fail_modifiers;
+                        }
+                        *flagsp &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+                        has_charset_modifier = 1;
+                        break;
                     case ONCE_PAT_MOD: /* 'o' */
                     case GLOBAL_PAT_MOD: /* 'g' */
                        if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
@@ -6143,7 +6300,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         }
                        break;
                     case '-':
-                        if (flagsp == &negflags) {
+                        /* A flag is a default iff it is following a minus, so
+                         * if there is a minus, it means we would be trying to
+                         * re-specify a default, which is an error */
+                        if (has_use_defaults || flagsp == &negflags) {
+            fail_modifiers:
                             RExC_parse++;
                            vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
                            /*NOTREACHED*/
@@ -6746,7 +6907,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
        return (regnode *) &RExC_parse; /* Invalid regnode pointer */
     }
 
-    RExC_utf8 = 1;     /* named sequences imply Unicode semantics */
+    REQUIRE_UTF8;      /* named sequences imply Unicode semantics */
     RExC_parse += 2;   /* Skip past the 'U+' */
 
     if (valuep) {   /* In a bracketed char class */
@@ -6791,7 +6952,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
     }
     else {     /* Not a char class */
        char *s;            /* String to put in generated EXACT node */
-       STRLEN len = 0;     /* Its current length */
+       STRLEN len = 0;     /* Its current byte length */
        char *endchar;      /* Points to '.' or '}' ending cur char in the input
                               stream */
 
@@ -6801,7 +6962,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
 
        /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
         * the input which is of the form now 'c1.c2.c3...}' until find the
-        * ending brace or exeed length 255.  The characters that exceed this
+        * ending brace or exceed length 255.  The characters that exceed this
         * limit are dropped.  The limit could be relaxed should it become
         * desirable by reparsing this as (?:\N{NAME}), so could generate
         * multiple EXACT nodes, as is done for just regular input.  But this
@@ -7134,31 +7295,61 @@ tryagain:
            *flagp |= HASWIDTH;
            goto finish_meta_pat;
        case 'w':
-           ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
+           if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(ALNUML));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(ALNUM));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'W':
-           ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NALNUML));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NALNUM));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'b':
            RExC_seen_zerolen++;
            RExC_seen |= REG_SEEN_LOOKBEHIND;
-           ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(BOUNDL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(BOUND));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= SIMPLE;
            goto finish_meta_pat;
        case 'B':
            RExC_seen_zerolen++;
            RExC_seen |= REG_SEEN_LOOKBEHIND;
-           ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NBOUNDL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NBOUND));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= SIMPLE;
            goto finish_meta_pat;
        case 's':
-           ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(SPACEL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(SPACE));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'S':
-           ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NSPACEL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NSPACE));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'd':
@@ -7362,7 +7553,6 @@ tryagain:
            register UV ender;
            register char *p;
            char *s;
-           char *error_msg;
            STRLEN foldlen;
            U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
 
@@ -7466,20 +7656,29 @@ tryagain:
                    case 'o':
                        {
                            STRLEN brace_len = len;
-                           if ((error_msg = grok_bslash_o(p,
-                                                          &ender,
-                                                          &brace_len,
-                                                          SIZE_ONLY))
-                               != NULL)
-                           {
+                           UV result;
+                           const char* error_msg;
+
+                           bool valid = grok_bslash_o(p,
+                                                      &result,
+                                                      &brace_len,
+                                                      &error_msg,
+                                                      1);
+                           p += brace_len;
+                           if (! valid) {
+                               RExC_parse = p; /* going to die anyway; point
+                                                  to exact spot of failure */
                                vFAIL(error_msg);
                            }
-                           p += brace_len;
+                           else
+                           {
+                               ender = result;
+                           }
                            if (PL_encoding && ender < 0x100) {
                                goto recode_encoding;
                            }
                            if (ender > 0xff) {
-                               RExC_utf8 = 1;
+                               REQUIRE_UTF8;
                            }
                            break;
                        }
@@ -7497,7 +7696,7 @@ tryagain:
                                 STRLEN numlen = e - p - 1;
                                ender = grok_hex(p + 1, &numlen, &flags, NULL);
                                if (ender > 0xff)
-                                   RExC_utf8 = 1;
+                                   REQUIRE_UTF8;
                                p = e + 1;
                            }
                        }
@@ -7517,12 +7716,13 @@ tryagain:
                    case '0': case '1': case '2': case '3':case '4':
                    case '5': case '6': case '7': case '8':case '9':
                        if (*p == '0' ||
-                         (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
-                            I32 flags = 0;
+                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
+                       {
+                           I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
                            STRLEN numlen = 3;
                            ender = grok_oct(p, &numlen, &flags, NULL);
                            if (ender > 0xff) {
-                               RExC_utf8 = 1;
+                               REQUIRE_UTF8;
                            }
                            p += numlen;
                        }
@@ -7539,7 +7739,7 @@ tryagain:
                            ender = reg_recode((const char)(U8)ender, &enc);
                            if (!enc && SIZE_ONLY)
                                ckWARNreg(p, "Invalid escape in the specified encoding");
-                           RExC_utf8 = 1;
+                           REQUIRE_UTF8;
                        }
                        break;
                    case '\0':
@@ -7881,6 +8081,7 @@ case ANYOF_N##NAME:                                     \
     what = WORD;                                        \
     break
 
+/* Like above, but no locale test */
 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
 ANYOF_##NAME:                                           \
        for (value = 0; value < 256; value++)           \
@@ -7897,6 +8098,42 @@ case ANYOF_N##NAME:                                     \
     what = WORD;                                        \
     break
 
+/* Like the above, but the bitmap differs under UNI_SEMANTICS: TEST_8 fills it
+ * when UNI_SEMANTICS is set, TEST_7 otherwise (LOC uses ANYOF_CLASS_SET and no
+ * bitmap loop).  There aren't any cases where the label is different from the
+ * name, so no need for that parameter */
+#define _C_C_T_UNI_8_BIT(NAME,TEST_8,TEST_7,WORD)       \
+ANYOF_##NAME:                                           \
+    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);        \
+    else if (UNI_SEMANTICS) {                           \
+        for (value = 0; value < 256; value++) {         \
+            if (TEST_8) ANYOF_BITMAP_SET(ret, value);   \
+        }                                               \
+    }                                                   \
+    else {                                              \
+        for (value = 0; value < 256; value++) {         \
+            if (TEST_7) ANYOF_BITMAP_SET(ret, value);   \
+        }                                               \
+    }                                                   \
+    yesno = '+';                                        \
+    what = WORD;                                        \
+    break;                                              \
+case ANYOF_N##NAME:                                     \
+    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);       \
+    else if (UNI_SEMANTICS) {                           \
+        for (value = 0; value < 256; value++) {         \
+            if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \
+        }                                               \
+    }                                                   \
+    else {                                              \
+        for (value = 0; value < 256; value++) {         \
+            if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \
+        }                                               \
+    }                                                   \
+    yesno = '!';                                        \
+    what = WORD;                                        \
+    break
+
 /* 
    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
    so that it is possible to override the option here without having to 
@@ -7992,7 +8229,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
 
 parseit:
     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
-       char* error_msg;
 
     charclassloop:
 
@@ -8101,15 +8337,18 @@ parseit:
            case 'a':   value = ASCII_TO_NATIVE('\007');break;
            case 'o':
                RExC_parse--;   /* function expects to be pointed at the 'o' */
-               if ((error_msg = grok_bslash_o(RExC_parse,
+               {
+                   const char* error_msg;
+                   bool valid = grok_bslash_o(RExC_parse,
                                               &value,
                                               &numlen,
-                                              SIZE_ONLY))
-                   != NULL)
-               {
-                   vFAIL(error_msg);
+                                              &error_msg,
+                                              SIZE_ONLY);
+                   RExC_parse += numlen;
+                   if (! valid) {
+                       vFAIL(error_msg);
+                   }
                }
-               RExC_parse += numlen;
                if (PL_encoding && value < 0x100) {
                    goto recode_encoding;
                }
@@ -8139,9 +8378,10 @@ parseit:
                value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
                break;
            case '0': case '1': case '2': case '3': case '4':
-           case '5': case '6': case '7': case '8': case '9':
+           case '5': case '6': case '7':
                {
-                   I32 flags = 0;
+                   /* Take 1-3 octal digits */
+                   I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
                    numlen = 3;
                    value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
                    RExC_parse += numlen;
@@ -8159,10 +8399,12 @@ parseit:
                    break;
                }
            default:
-               if (!SIZE_ONLY && isALPHA(value))
+               /* Exempt \_ from the unrecognized-escape warning below */
+               if (!SIZE_ONLY && isALNUM(value) && value != '_') {
                    ckWARN2reg(RExC_parse,
                               "Unrecognized escape \\%c in character class passed through",
                               (int)value);
+               }
                break;
            }
        } /* end of \blah */
@@ -8227,11 +8469,13 @@ parseit:
                case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
                case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
-               case _C_C_T_(ALNUM, isALNUM(value), "Word");
-               case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
+                /* \s, \w match all unicode if utf8. */
+                case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
+                case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
 #else
-               case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
-               case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
+                /* \s, \w match ascii and locale only */
+                case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
+                case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
 #endif         
                case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
                case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
@@ -8957,31 +9201,6 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
 #endif
 
 /*
- - regcurly - a little FSA that accepts {\d+,?\d*}
- */
-#ifndef PERL_IN_XSUB_RE
-I32
-Perl_regcurly(register const char *s)
-{
-    PERL_ARGS_ASSERT_REGCURLY;
-
-    if (*s++ != '{')
-       return FALSE;
-    if (!isDIGIT(*s))
-       return FALSE;
-    while (isDIGIT(*s))
-       s++;
-    if (*s == ',')
-       s++;
-    while (isDIGIT(*s))
-       s++;
-    if (*s != '}')
-       return FALSE;
-    return TRUE;
-}
-#endif
-
-/*
  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
  */
 #ifdef DEBUGGING