Add functions for adding and deleting stash names

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index c290fbd..831d579 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -85,6 +85,8 @@
  #  include "regcomp.h"
  #endif
  
+#include "dquote_static.c"
+
  #ifdef op
  #undef op
  #endif /* op */
@@ -196,7 +198,10 @@ typedef struct RExC_state_t {
   */
  #define        WORST           0       /* Worst case. */
  #define        HASWIDTH        0x01    /* Known to match non-null strings. */
-#define        SIMPLE          0x02    /* Simple enough to be STAR/PLUS operand. */
+
+/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
+ * character, and if utf8, must be invariant. */
+#define        SIMPLE          0x02
  #define        SPSTART         0x04    /* Starts with * or +. */
  #define TRYAGAIN       0x08    /* Weeded out a declaration. */
  #define POSTPONED      0x10    /* (?1),(?&name), (??{...}) or similar */
@@ -218,6 +223,11 @@ typedef struct RExC_state_t {
  #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
  #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
  
+/* If not already in utf8, do a longjmp back to the beginning */
+#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
+#define REQUIRE_UTF8   STMT_START {                                       \
+                                     if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
+                        } STMT_END
  
  /* About scan_data_t.
  
@@ -358,9 +368,10 @@ static const scan_data_t zero_scan_data =
  #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
  #define SCF_SEEN_ACCEPT         0x8000 
  
-#define UTF (RExC_utf8 != 0)
-#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
-#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
+#define UTF cBOOL(RExC_utf8)
+#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
+#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
+#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
  
  #define OOB_UNICODE            12345678
  #define OOB_NAMEDCLASS         -1
@@ -870,7 +881,7 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
    Dumps the final compressed table form of the trie to Perl_debug_log.
    Used for debugging make_trie().
  */
- 
+
  STATIC void
  S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
             AV *revcharmap, U32 depth)
@@ -3196,7 +3207,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 /* These are the cases when once a subexpression
                    fails at a particular position, it cannot succeed
                    even after backtracking at the enclosing scope.
-               
+
                    XXXX what if minimal match and we are at the
                         initial run of {n,m}? */
                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
@@ -3289,11 +3300,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
  
  #ifdef DEBUGGING
                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
-                   NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
-                   NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
+                   NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
+                   NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
                     OP(nxt + 1) = OPTIMIZED; /* was count. */
-                   NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
+                   NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
  #endif
                 }
               nogo:
@@ -3316,12 +3327,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         nxt = nxt2;
                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
                     /* Need to optimize away parenths. */
-                   if (data->flags & SF_IN_PAR) {
+                   if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
                         /* Set the parenth number.  */
                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
  
-                       if (OP(nxt) != CLOSE)
-                           FAIL("Panic opt close");
                         oscan->flags = (U8)ARG(nxt);
                         if (RExC_open_parens) {
                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
@@ -3339,7 +3348,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
  #if 0
                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
                             regnode *nnxt = regnext(nxt1);
-                       
                             if (nnxt == nxt) {
                                 if (reg_off_by_arg[OP(nxt1)])
                                     ARG_SET(nxt1, nxt2 - nxt1);
@@ -3406,7 +3414,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
  
                         if (UTF)
                             old = utf8_hop((U8*)s, old) - (U8*)s;
-                       
                         l -= old;
                         /* Get the added string: */
                         last_str = newSVpvn_utf8(s  + old, l, UTF);
@@ -3494,13 +3501,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 if (flags & SCF_DO_STCLASS_AND) {
                      for (value = 0; value < 256; value++)
                          if (!is_VERTWS_cp(value))
-                            ANYOF_BITMAP_CLEAR(data->start_class, value);  
-                }                                                              
-                else {                                                         
+                            ANYOF_BITMAP_CLEAR(data->start_class, value);
+                }
+                else {
                      for (value = 0; value < 256; value++)
                          if (is_VERTWS_cp(value))
-                            ANYOF_BITMAP_SET(data->start_class, value);           
-                }                                                              
+                            ANYOF_BITMAP_SET(data->start_class, value);
+                }
                  if (flags & SCF_DO_STCLASS_OR)
                     cl_and(data->start_class, and_withp);
                 flags &= ~SCF_DO_STCLASS;
@@ -3513,7 +3520,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 data->pos_delta += 1;
                 data->longest = &(data->longest_float);
             }
-           
         }
         else if (OP(scan) == FOLDCHAR) {
             int d = ARG(scan)==0xDF ? 1 : 2;
@@ -3571,19 +3577,37 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     if (flags & SCF_DO_STCLASS_AND) {
                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
-                           for (value = 0; value < 256; value++)
-                               if (!isALNUM(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                            if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isWORDCHAR_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isALNUM(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            }
                         }
                     }
                     else {
                         if (data->start_class->flags & ANYOF_LOCALE)
                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (isALNUM(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
-                       }
+                        else if (FLAGS(scan) & USE_UNI) {
+                            for (value = 0; value < 256; value++) {
+                                if (isWORDCHAR_L1(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        } else {
+                            for (value = 0; value < 256; value++) {
+                                if (isALNUM(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        }
                     }
                     break;
                 case ALNUML:
@@ -3600,9 +3624,19 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     if (flags & SCF_DO_STCLASS_AND) {
                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
-                           for (value = 0; value < 256; value++)
-                               if (isALNUM(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                            if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (isWORDCHAR_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (isALNUM(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                           }
                         }
                     }
                     else {
@@ -3611,7 +3645,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         else {
                             for (value = 0; value < 256; value++)
                                 if (!isALNUM(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
+                                   ANYOF_BITMAP_SET(data->start_class, value);
                         }
                     }
                     break;
@@ -3629,18 +3663,37 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     if (flags & SCF_DO_STCLASS_AND) {
                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
-                           for (value = 0; value < 256; value++)
-                               if (!isSPACE(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                           if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isSPACE_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isSPACE(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            }
                         }
                     }
                     else {
-                       if (data->start_class->flags & ANYOF_LOCALE)
+                        if (data->start_class->flags & ANYOF_LOCALE) {
                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (isSPACE(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
+                        }
+                        else if (FLAGS(scan) & USE_UNI) {
+                            for (value = 0; value < 256; value++) {
+                                if (isSPACE_L1(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        } else {
+                            for (value = 0; value < 256; value++) {
+                                if (isSPACE(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
                         }
                     }
                     break;
@@ -3658,19 +3711,38 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     if (flags & SCF_DO_STCLASS_AND) {
                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
-                           for (value = 0; value < 256; value++)
-                               if (isSPACE(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                            if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (isSPACE_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (isSPACE(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            }
                         }
                     }
                     else {
                         if (data->start_class->flags & ANYOF_LOCALE)
                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (!isSPACE(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
-                       }
+                        else if (FLAGS(scan) & USE_UNI) {
+                            for (value = 0; value < 256; value++) {
+                                if (!isSPACE_L1(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        }
+                        else {
+                            for (value = 0; value < 256; value++) {
+                                if (!isSPACE(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        }
                     }
                     break;
                 case NSPACEL:
@@ -3700,7 +3772,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         else {
                             for (value = 0; value < 256; value++)
                                 if (isDIGIT(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
+                                   ANYOF_BITMAP_SET(data->start_class, value);
                         }
                     }
                     break;
@@ -3717,7 +3789,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         else {
                             for (value = 0; value < 256; value++)
                                 if (!isDIGIT(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
+                                   ANYOF_BITMAP_SET(data->start_class, value);
                         }
                     }
                     break;
@@ -4264,18 +4336,20 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
      struct regexp *r;
      register regexp_internal *ri;
      STRLEN plen;
-    char  *exp = SvPV(pattern, plen);
-    char* xend = exp + plen;
+    char  *exp;
+    char* xend;
      regnode *scan;
      I32 flags;
      I32 minlen = 0;
      I32 sawplus = 0;
      I32 sawopen = 0;
+    U8 jump_ret = 0;
+    dJMPENV;
      scan_data_t data;
      RExC_state_t RExC_state;
      RExC_state_t * const pRExC_state = &RExC_state;
  #ifdef TRIE_STUDY_OPT    
-    int restudied= 0;
+    int restudied;
      RExC_state_t copyRExC_state;
  #endif    
      GET_RE_DEBUG_FLAGS_DECL;
@@ -4286,15 +4360,56 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
  
      RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
  
-    DEBUG_COMPILE_r({
-        SV *dsv= sv_newmortal();
-        RE_PV_QUOTED_DECL(s, RExC_utf8,
-            dsv, exp, plen, 60);
-        PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
-                      PL_colors[4],PL_colors[5],s);
-    });
  
-redo_first_pass:
+    /* Longjmp back to here if have to switch in midstream to utf8 */
+    if (! RExC_orig_utf8) {
+       JMPENV_PUSH(jump_ret);
+    }
+
+    if (jump_ret == 0) {    /* First time through */
+        exp = SvPV(pattern, plen);
+        xend = exp + plen;
+
+        DEBUG_COMPILE_r({
+            SV *dsv= sv_newmortal();
+            RE_PV_QUOTED_DECL(s, RExC_utf8,
+                dsv, exp, plen, 60);
+            PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
+                           PL_colors[4],PL_colors[5],s);
+        });
+    }
+    else {  /* longjumped back */
+        STRLEN len = plen;
+
+        /* If the cause for the longjmp was other than changing to utf8, pop
+         * our own setjmp, and longjmp to the correct handler */
+       if (jump_ret != UTF8_LONGJMP) {
+           JMPENV_POP;
+           JMPENV_JUMP(jump_ret);
+       }
+
+       GET_RE_DEBUG_FLAGS;
+
+        /* It's possible to write a regexp in ascii that represents Unicode
+        codepoints outside of the byte range, such as via \x{100}. If we
+        detect such a sequence we have to convert the entire pattern to utf8
+        and then recompile, as our sizing calculation will have been based
+        on 1 byte == 1 character, but we will need to use utf8 to encode
+        at least some part of the pattern, and therefore must convert the whole
+        thing.
+        -- dmq */
+        DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+           "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
+        exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
+        xend = exp + len;
+        RExC_orig_utf8 = RExC_utf8 = 1;
+        SAVEFREEPV(exp);
+    }
+
+#ifdef TRIE_STUDY_OPT
+    restudied = 0;
+#endif
+
      RExC_precomp = exp;
      RExC_flags = pm_flags;
      RExC_sawback = 0;
@@ -4333,24 +4448,14 @@ redo_first_pass:
         RExC_precomp = NULL;
         return(NULL);
      }
-    if (RExC_utf8 && !RExC_orig_utf8) {
-        /* It's possible to write a regexp in ascii that represents Unicode
-        codepoints outside of the byte range, such as via \x{100}. If we
-        detect such a sequence we have to convert the entire pattern to utf8
-        and then recompile, as our sizing calculation will have been based
-        on 1 byte == 1 character, but we will need to use utf8 to encode
-        at least some part of the pattern, and therefore must convert the whole
-        thing.
-        XXX: somehow figure out how to make this less expensive...
-        -- dmq */
-        STRLEN len = plen;
-        DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
-           "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
-        exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
-        xend = exp + len;
-        RExC_orig_utf8 = RExC_utf8;
-        SAVEFREEPV(exp);
-        goto redo_first_pass;
+
+    /* Here, finished first pass.  Get rid of our setjmp, which we added for
+     * efficiency only if the passed-in string wasn't in utf8, as shown by
+     * RExC_orig_utf8.  But if the first pass was redone, that variable will be
+     * 1 here even though the original string wasn't utf8, but in this case
+     * there will have been a long jump */
+    if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
+       JMPENV_POP;
      }
      DEBUG_PARSE_r({
          PerlIO_printf(Perl_debug_log, 
@@ -4392,39 +4497,56 @@ redo_first_pass:
      r->extflags = pm_flags;
      {
          bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
-       bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
+        bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
+
+        /* The caret is output if there are any defaults: if not all the STD
+         * flags are set, or if no character set specifier is needed */
+        bool has_default =
+                    (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
+                    || ! has_charset);
         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
                             >> RXf_PMf_STD_PMMOD_SHIFT);
         const char *fptr = STD_PAT_MODS;        /*"msix"*/
         char *p;
-       const STRLEN wraplen = plen + has_minus + has_p + has_runon
+        /* Allocate for the worst case, which is all the std flags are turned
+         * on.  If more precision is desired, we could do a population count of
+         * the flags set.  This could be done with a small lookup table, or by
+         * shifting, masking and adding, or even, when available, assembly
+         * language for a machine-language population count.
+         * We never output a minus, as all those are defaults, so are
+         * covered by the caret */
+       const STRLEN wraplen = plen + has_p + has_runon
+            + has_default       /* If needs a caret */
+            + has_charset       /* If needs a character set specifier */
              + (sizeof(STD_PAT_MODS) - 1)
              + (sizeof("(?:)") - 1);
  
-       p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
-       SvCUR_set(rx, wraplen);
+        p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
         SvPOK_on(rx);
         SvFLAGS(rx) |= SvUTF8(pattern);
          *p++='('; *p++='?';
+
+        /* If a default, cover it using the caret */
+        if (has_default) {
+            *p++= DEFAULT_PAT_MOD;
+        }
+        if (has_charset) {
+            if (r->extflags & RXf_PMf_LOCALE) {
+                *p++ = LOCALE_PAT_MOD;
+            } else {
+                *p++ = UNICODE_PAT_MOD;
+            }
+        }
          if (has_p)
              *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
          {
-            char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
-            char *colon = r + 1;
              char ch;
-
              while((ch = *fptr++)) {
                  if(reganch & 1)
                      *p++ = ch;
-                else
-                    *r-- = ch;
                  reganch >>= 1;
              }
-            if(has_minus) {
-                *r = '-';
-                p = colon;
-            }
          }
  
          *p++ = ':';
@@ -4436,6 +4558,7 @@ redo_first_pass:
              *p++ = '\n';
          *p++ = ')';
          *p = 0;
+       SvCUR_set(rx, p - SvPVX_const(rx));
      }
  
      r->intflags = 0;
@@ -4945,7 +5068,7 @@ reStudy:
  #endif
  #ifdef DEBUGGING
      if (RExC_paren_names) {
-        ri->name_list_idx = add_data( pRExC_state, 1, "p" );
+        ri->name_list_idx = add_data( pRExC_state, 1, "a" );
          ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
      } else
  #endif
@@ -4995,7 +5118,7 @@ Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
      if (flags & RXapif_FETCH) {
          return reg_named_buff_fetch(rx, key, flags);
      } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
-        Perl_croak(aTHX_ "%s", PL_no_modify);
+        Perl_croak_no_modify(aTHX);
          return NULL;
      } else if (flags & RXapif_EXISTS) {
          return reg_named_buff_exists(rx, key, flags)
@@ -5295,7 +5418,7 @@ Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
      PERL_UNUSED_ARG(value);
  
      if (!PL_localizing)
-        Perl_croak(aTHX_ "%s", PL_no_modify);
+        Perl_croak_no_modify(aTHX);
  }
  
  I32
@@ -5636,6 +5759,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
         if (*RExC_parse == '?') { /* (?...) */
             bool is_logical = 0;
             const char * const seqstart = RExC_parse;
+            bool has_use_defaults = FALSE;
  
             RExC_parse++;
             paren = *RExC_parse++;
@@ -5765,7 +5889,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 RExC_parse++;
             case '=':           /* (?=...) */
                 RExC_seen_zerolen++;
-                       break;
+                break;
             case '!':           /* (?!...) */
                 RExC_seen_zerolen++;
                 if (*RExC_parse == ')') {
@@ -6090,12 +6214,19 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 RExC_parse--; /* for vFAIL to print correctly */
                  vFAIL("Sequence (? incomplete");
                  break;
+            case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
+                                      that follow */
+                has_use_defaults = TRUE;
+                STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
+                RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+                goto parse_flags;
             default:
                 --RExC_parse;
                 parse_flags:      /* (?i) */  
             {
                  U32 posflags = 0, negflags = 0;
                 U32 *flagsp = &posflags;
+                bool has_charset_modifier = 0;
  
                 while (*RExC_parse) {
                     /* && strchr("iogcmsx", *RExC_parse) */
@@ -6103,6 +6234,32 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                        and must be globally applied -- japhy */
                      switch (*RExC_parse) {
                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
+                    case LOCALE_PAT_MOD:
+                        if (has_charset_modifier || flagsp == &negflags) {
+                            goto fail_modifiers;
+                        }
+                        posflags |= RXf_PMf_LOCALE;
+                        negflags |= RXf_PMf_UNICODE;
+                        has_charset_modifier = 1;
+                        break;
+                    case UNICODE_PAT_MOD:
+                        if (has_charset_modifier || flagsp == &negflags) {
+                            goto fail_modifiers;
+                        }
+                        posflags |= RXf_PMf_UNICODE;
+                        negflags |= RXf_PMf_LOCALE;
+                        has_charset_modifier = 1;
+                        break;
+                    case DUAL_PAT_MOD:
+                        if (has_use_defaults
+                            || has_charset_modifier
+                            || flagsp == &negflags)
+                        {
+                            goto fail_modifiers;
+                        }
+                        negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+                        has_charset_modifier = 1;
+                        break;
                      case ONCE_PAT_MOD: /* 'o' */
                      case GLOBAL_PAT_MOD: /* 'g' */
                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
@@ -6143,7 +6300,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                          }
                         break;
                      case '-':
-                        if (flagsp == &negflags) {
+                        /* A flag is a default iff it is following a minus,  so
+                         * if there is a minus, it means will be trying to
+                         * re-specify a default which is an error */
+                        if (has_use_defaults || flagsp == &negflags) {
+            fail_modifiers:
                              RExC_parse++;
                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
                             /*NOTREACHED*/
@@ -6746,7 +6907,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
      }
  
-    RExC_utf8 = 1;     /* named sequences imply Unicode semantics */
+    REQUIRE_UTF8;      /* named sequences imply Unicode semantics */
      RExC_parse += 2;   /* Skip past the 'U+' */
  
      if (valuep) {   /* In a bracketed char class */
@@ -6791,7 +6952,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
      }
      else {     /* Not a char class */
         char *s;            /* String to put in generated EXACT node */
-       STRLEN len = 0;     /* Its current length */
+       STRLEN len = 0;     /* Its current byte length */
         char *endchar;      /* Points to '.' or '}' ending cur char in the input
                                stream */
  
@@ -6801,7 +6962,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
  
         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
          * the input which is of the form now 'c1.c2.c3...}' until find the
-        * ending brace or exeed length 255.  The characters that exceed this
+        * ending brace or exceed length 255.  The characters that exceed this
          * limit are dropped.  The limit could be relaxed should it become
          * desirable by reparsing this as (?:\N{NAME}), so could generate
          * multiple EXACT nodes, as is done for just regular input.  But this
@@ -7134,31 +7295,61 @@ tryagain:
             *flagp |= HASWIDTH;
             goto finish_meta_pat;
         case 'w':
-           ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
+           if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(ALNUML));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(ALNUM));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
         case 'W':
-           ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NALNUML));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NALNUM));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
         case 'b':
             RExC_seen_zerolen++;
             RExC_seen |= REG_SEEN_LOOKBEHIND;
-           ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(BOUNDL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(BOUND));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
             *flagp |= SIMPLE;
             goto finish_meta_pat;
         case 'B':
             RExC_seen_zerolen++;
             RExC_seen |= REG_SEEN_LOOKBEHIND;
-           ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NBOUNDL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NBOUND));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
             *flagp |= SIMPLE;
             goto finish_meta_pat;
         case 's':
-           ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(SPACEL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(SPACE));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
         case 'S':
-           ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NSPACEL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NSPACE));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
         case 'd':
@@ -7462,6 +7653,35 @@ tryagain:
                           ender = ASCII_TO_NATIVE('\007');
                         p++;
                         break;
+                   case 'o':
+                       {
+                           STRLEN brace_len = len;
+                           UV result;
+                           const char* error_msg;
+
+                           bool valid = grok_bslash_o(p,
+                                                      &result,
+                                                      &brace_len,
+                                                      &error_msg,
+                                                      1);
+                           p += brace_len;
+                           if (! valid) {
+                               RExC_parse = p; /* going to die anyway; point
+                                                  to exact spot of failure */
+                               vFAIL(error_msg);
+                           }
+                           else
+                           {
+                               ender = result;
+                           }
+                           if (PL_encoding && ender < 0x100) {
+                               goto recode_encoding;
+                           }
+                           if (ender > 0xff) {
+                               REQUIRE_UTF8;
+                           }
+                           break;
+                       }
                     case 'x':
                         if (*++p == '{') {
                             char* const e = strchr(p, '}');
@@ -7476,7 +7696,7 @@ tryagain:
                                  STRLEN numlen = e - p - 1;
                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
                                 if (ender > 0xff)
-                                   RExC_utf8 = 1;
+                                   REQUIRE_UTF8;
                                 p = e + 1;
                             }
                         }
@@ -7496,21 +7716,13 @@ tryagain:
                     case '0': case '1': case '2': case '3':case '4':
                     case '5': case '6': case '7': case '8':case '9':
                         if (*p == '0' ||
-                         (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
-                            I32 flags = 0;
+                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
+                       {
+                           I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
                             STRLEN numlen = 3;
                             ender = grok_oct(p, &numlen, &flags, NULL);
-
-                           /* An octal above 0xff is interpreted differently
-                            * depending on if the re is in utf8 or not.  If it
-                            * is in utf8, the value will be itself, otherwise
-                            * it is interpreted as modulo 0x100.  It has been
-                            * decided to discourage the use of octal above the
-                            * single-byte range.  For now, warn only when
-                            * it ends up modulo */
-                           if (SIZE_ONLY && ender >= 0x100
-                                   && ! UTF && ! PL_encoding) {
-                               ckWARNregdep(p, "Use of octal value above 377 is deprecated");
+                           if (ender > 0xff) {
+                               REQUIRE_UTF8;
                             }
                             p += numlen;
                         }
@@ -7527,7 +7739,7 @@ tryagain:
                             ender = reg_recode((const char)(U8)ender, &enc);
                             if (!enc && SIZE_ONLY)
                                 ckWARNreg(p, "Invalid escape in the specified encoding");
-                           RExC_utf8 = 1;
+                           REQUIRE_UTF8;
                         }
                         break;
                     case '\0':
@@ -7869,6 +8081,7 @@ case ANYOF_N##NAME:                                     \
      what = WORD;                                        \
      break
  
+/* Like above, but no locale test */
  #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
  ANYOF_##NAME:                                           \
         for (value = 0; value < 256; value++)           \
@@ -7885,6 +8098,42 @@ case ANYOF_N##NAME:                                     \
      what = WORD;                                        \
      break
  
+/* Like the above, but there are differences if we are in uni-8-bit or not, so
+ * there are two tests passed in, to use depending on that. There aren't any
+ * cases where the label is different from the name, so no need for that
+ * parameter */
+#define _C_C_T_UNI_8_BIT(NAME,TEST_8,TEST_7,WORD)       \
+ANYOF_##NAME:                                           \
+    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);        \
+    else if (UNI_SEMANTICS) {                           \
+        for (value = 0; value < 256; value++) {         \
+            if (TEST_8) ANYOF_BITMAP_SET(ret, value);   \
+        }                                               \
+    }                                                   \
+    else {                                              \
+        for (value = 0; value < 256; value++) {         \
+            if (TEST_7) ANYOF_BITMAP_SET(ret, value);   \
+        }                                               \
+    }                                                   \
+    yesno = '+';                                        \
+    what = WORD;                                        \
+    break;                                              \
+case ANYOF_N##NAME:                                     \
+    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);       \
+    else if (UNI_SEMANTICS) {                           \
+        for (value = 0; value < 256; value++) {         \
+            if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \
+        }                                               \
+    }                                                   \
+    else {                                              \
+        for (value = 0; value < 256; value++) {         \
+            if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \
+        }                                               \
+    }                                                   \
+    yesno = '!';                                        \
+    what = WORD;                                        \
+    break
+
  /* 
     We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
     so that it is possible to override the option here without having to 
@@ -8086,6 +8335,24 @@ parseit:
             case 'b':   value = '\b';                   break;
             case 'e':   value = ASCII_TO_NATIVE('\033');break;
             case 'a':   value = ASCII_TO_NATIVE('\007');break;
+           case 'o':
+               RExC_parse--;   /* function expects to be pointed at the 'o' */
+               {
+                   const char* error_msg;
+                   bool valid = grok_bslash_o(RExC_parse,
+                                              &value,
+                                              &numlen,
+                                              &error_msg,
+                                              SIZE_ONLY);
+                   RExC_parse += numlen;
+                   if (! valid) {
+                       vFAIL(error_msg);
+                   }
+               }
+               if (PL_encoding && value < 0x100) {
+                   goto recode_encoding;
+               }
+               break;
             case 'x':
                 if (*RExC_parse == '{') {
                      I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
@@ -8111,9 +8378,10 @@ parseit:
                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
                 break;
             case '0': case '1': case '2': case '3': case '4':
-           case '5': case '6': case '7': case '8': case '9':
+           case '5': case '6': case '7':
                 {
-                   I32 flags = 0;
+                   /* Take 1-3 octal digits */
+                   I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
                     numlen = 3;
                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
                     RExC_parse += numlen;
@@ -8131,10 +8399,12 @@ parseit:
                     break;
                 }
             default:
-               if (!SIZE_ONLY && isALPHA(value))
+               /* Allow \_ to not give an error */
+               if (!SIZE_ONLY && isALNUM(value) && value != '_') {
                     ckWARN2reg(RExC_parse,
                                "Unrecognized escape \\%c in character class passed through",
                                (int)value);
+               }
                 break;
             }
         } /* end of \blah */
@@ -8199,11 +8469,13 @@ parseit:
                 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
                 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
  #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
-               case _C_C_T_(ALNUM, isALNUM(value), "Word");
-               case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
+                /* \s, \w match all unicode if utf8. */
+                case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
+                case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
  #else
-               case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
-               case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
+                /* \s, \w match ascii and locale only */
+                case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
+                case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
  #endif         
                 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
@@ -8547,7 +8819,7 @@ S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
  
  /* nextchar()
  
-   Advance that parse position, and optionall absorbs
+   Advance that parse position, and optionally absorbs
     "whitespace" from the inputstream.
  
     Without /x "whitespace" means (?#...) style comments only,
@@ -8929,31 +9201,6 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
  #endif
  
  /*
- - regcurly - a little FSA that accepts {\d+,?\d*}
- */
-#ifndef PERL_IN_XSUB_RE
-I32
-Perl_regcurly(register const char *s)
-{
-    PERL_ARGS_ASSERT_REGCURLY;
-
-    if (*s++ != '{')
-       return FALSE;
-    if (!isDIGIT(*s))
-       return FALSE;
-    while (isDIGIT(*s))
-       s++;
-    if (*s == ',')
-       s++;
-    while (isDIGIT(*s))
-       s++;
-    if (*s != '}')
-       return FALSE;
-    return TRUE;
-}
-#endif
-
-/*
   - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
   */
  #ifdef DEBUGGING
@@ -9556,6 +9803,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
         while (--n >= 0) {
            /* If you add a ->what type here, update the comment in regcomp.h */
             switch (ri->data->what[n]) {
+           case 'a':
             case 's':
             case 'S':
             case 'u':
@@ -9789,8 +10037,9 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
         for (i = 0; i < count; i++) {
             d->what[i] = ri->data->what[i];
             switch (d->what[i]) {
-               /* legal options are one of: sSfpontTu
+               /* legal options are one of: sSfpontTua
                    see also regcomp.h and pregfree() */
+           case 'a': /* actually an AV, but the dup function is identical.  */
             case 's':
             case 'S':
             case 'p': /* actually an AV, but the dup function is identical.  */
@@ -9866,6 +10115,10 @@ Perl_regnext(pTHX_ register regnode *p)
      if (!p)
         return(NULL);
  
+    if (OP(p) > REGNODE_MAX) {         /* regnode.type is unsigned */
+       Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
+    }
+
      offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
      if (offset == 0)
         return(NULL);