embed.fnc: Mark atof2, atof3 for internal use only

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 36f5aff..b389f9e 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -182,11 +182,10 @@ struct RExC_state_t {
                                             through */
      U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
      I32                in_lookbehind;
+    I32                in_lookahead;
      I32                contains_locale;
      I32                override_recoding;
-#ifdef EBCDIC
-    I32                recode_x_to_native;
-#endif
+    I32         recode_x_to_native;
      I32                in_multi_char_class;
      struct reg_code_blocks *code_blocks;/* positions of literal (?{})
                                             within pattern */
@@ -244,7 +243,6 @@ struct RExC_state_t {
  #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
                                                     under /d from /u ? */
  
-
  #ifdef RE_TRACK_PATTERN_OFFSETS
  #  define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
                                                           others */
@@ -273,10 +271,17 @@ struct RExC_state_t {
  #define RExC_study_chunk_recursed_bytes  \
                                     (pRExC_state->study_chunk_recursed_bytes)
  #define RExC_in_lookbehind     (pRExC_state->in_lookbehind)
+#define RExC_in_lookahead      (pRExC_state->in_lookahead)
  #define RExC_contains_locale   (pRExC_state->contains_locale)
+#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
+
  #ifdef EBCDIC
-#   define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
+#  define SET_recode_x_to_native(x)                                         \
+                    STMT_START { RExC_recode_x_to_native = (x); } STMT_END
+#else
+#  define SET_recode_x_to_native(x) NOOP
  #endif
+
  #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
  #define RExC_frame_head (pRExC_state->frame_head)
  #define RExC_frame_last (pRExC_state->frame_last)
@@ -372,12 +377,8 @@ struct RExC_state_t {
  #define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
      STMT_START {                                                            \
                  RExC_use_BRANCHJ = 1;                                       \
-                if (LIKELY(! IN_PARENS_PASS)) {                             \
-                    /* No need to restart the parse immediately if we're    \
-                     * going to reparse anyway to count parens */           \
-                    *flagp |= RESTART_PARSE;                                \
-                    return restart_retval;                                  \
-                }                                                           \
+                *flagp |= RESTART_PARSE;                                    \
+                return restart_retval;                                      \
      } STMT_END
  
  /* Until we have completed the parse, we leave RExC_total_parens at 0 or
@@ -748,6 +749,10 @@ static const scan_data_t zero_scan_data = {
      Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",      \
             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
  
+#define        FAIL3(msg,arg1,arg2) _FAIL(                         \
+    Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",      \
+     arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
+
  /*
   * Simple_vFAIL -- like FAIL, but marks the current location in the scan
   */
@@ -1582,7 +1587,9 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
      unsigned int i;
      const U32 n = ARG(node);
      bool new_node_has_latin1 = FALSE;
-    const U8 flags = OP(node) == ANYOFH ? 0 : ANYOF_FLAGS(node);
+    const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHr))
+                      ? 0
+                      : ANYOF_FLAGS(node);
  
      PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
  
@@ -1635,7 +1642,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
      }
  
      /* Add in the points from the bit map */
-    if (OP(node) != ANYOFH) {
+    if (! inRANGE(OP(node), ANYOFH, ANYOFHr)) {
          for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
              if (ANYOF_BITMAP_TEST(node, i)) {
                  unsigned int start = i++;
@@ -1722,7 +1729,9 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
       * another SSC or a regular ANYOF class.  Can create false positives. */
  
      SV* anded_cp_list;
-    U8  and_with_flags = (OP(and_with) == ANYOFH) ? 0 : ANYOF_FLAGS(and_with);
+    U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHr)
+                          ? 0
+                          : ANYOF_FLAGS(and_with);
      U8  anded_flags;
  
      PERL_ARGS_ASSERT_SSC_AND;
@@ -1906,7 +1915,9 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
  
      SV* ored_cp_list;
      U8 ored_flags;
-    U8  or_with_flags = (OP(or_with) == ANYOFH) ? 0 : ANYOF_FLAGS(or_with);
+    U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHr)
+                         ? 0
+                         : ANYOF_FLAGS(or_with);
  
      PERL_ARGS_ASSERT_SSC_OR;
  
@@ -2520,7 +2531,8 @@ is the recommended Unicode-aware way of saying
         if (UTF) {                                                         \
              SV *zlopp = newSV(UTF8_MAXBYTES);                             \
             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
-            unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
+            unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val);     \
+            *kapow = '\0';                                                 \
             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
             SvPOK_on(zlopp);                                               \
             SvUTF8_on(zlopp);                                              \
@@ -5844,6 +5856,8 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                  case ANYOFL:
                  case ANYOFPOSIXL:
                  case ANYOFH:
+                case ANYOFHb:
+                case ANYOFHr:
                  case ANYOF:
                     if (flags & SCF_DO_STCLASS_AND)
                         ssc_and(pRExC_state, data->start_class,
@@ -7613,10 +7627,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
      RExC_seen = 0;
      RExC_maxlen = 0;
      RExC_in_lookbehind = 0;
+    RExC_in_lookahead = 0;
      RExC_seen_zerolen = *exp == '^' ? -1 : 0;
-#ifdef EBCDIC
      RExC_recode_x_to_native = 0;
-#endif
      RExC_in_multi_char_class = 0;
  
      RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
@@ -7827,6 +7840,16 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
      SetProgLen(RExC_rxi,RExC_size);
  #endif
  
+    DEBUG_DUMP_PRE_OPTIMIZE_r({
+        SV * const sv = sv_newmortal();
+        RXi_GET_DECL(RExC_rx, ri);
+        DEBUG_RExC_seen();
+        Perl_re_printf( aTHX_ "Program before optimization:\n");
+
+        (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
+                        sv, 0, 0);
+    });
+
      DEBUG_OPTIMISE_r(
          Perl_re_printf( aTHX_  "Starting post parse optimization\n");
      );
@@ -11069,6 +11092,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
  
      *flagp = 0;                                /* Tentatively. */
  
+    if (RExC_in_lookbehind) {
+       RExC_in_lookbehind++;
+    }
+    if (RExC_in_lookahead) {
+        RExC_in_lookahead++;
+    }
+
      /* Having this true makes it feasible to have a lot fewer tests for the
       * parse pointer being in scope.  For example, we can write
       *      while(isFOO(*RExC_parse)) RExC_parse++;
@@ -11310,10 +11340,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                          return 0;
                      }
  
-                    REGTAIL(pRExC_state, ret, atomic);
+                    if (! REGTAIL(pRExC_state, ret, atomic)) {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
  
-                    REGTAIL(pRExC_state, atomic,
-                           reg_node(pRExC_state, SRCLOSE));
+                    if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
+                                                                SRCLOSE)))
+                    {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
  
                      RExC_in_script_run = 0;
                      return ret;
@@ -11533,10 +11568,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                  if (RExC_parse >= RExC_end) {
                      vFAIL("Sequence (?... not terminated");
                  }
-
-                /* FALLTHROUGH */
+                RExC_seen_zerolen++;
+                break;
             case '=':           /* (?=...) */
                 RExC_seen_zerolen++;
+                RExC_in_lookahead++;
                  break;
             case '!':           /* (?!...) */
                 RExC_seen_zerolen++;
@@ -11772,7 +11808,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                                         RExC_flags & RXf_PMf_COMPILETIME
                                        );
                      FLAGS(REGNODE_p(ret)) = 2;
-                    REGTAIL(pRExC_state, ret, eval);
+                    if (! REGTAIL(pRExC_state, ret, eval)) {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
                      /* deal with the length of this later - MJD */
                     return ret;
                 }
@@ -11825,7 +11863,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
  
                      tail = reg(pRExC_state, 1, &flag, depth+1);
                      RETURN_FAIL_ON_RESTART(flag, flagp);
-                    REGTAIL(pRExC_state, ret, tail);
+                    if (! REGTAIL(pRExC_state, ret, tail)) {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
                      goto insert_if;
                  }
                 else if (   RExC_parse[0] == '<'     /* (?(<NAME>)...) */
@@ -11917,15 +11957,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                     }
                     nextchar(pRExC_state);
                   insert_if:
-                    REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
+                    if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
+                                                             IFTHEN, 0)))
+                    {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
                      br = regbranch(pRExC_state, &flags, 1, depth+1);
                     if (br == 0) {
                          RETURN_FAIL_ON_RESTART(flags,flagp);
                          FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
                                (UV) flags);
                      } else
-                        REGTAIL(pRExC_state, br, reganode(pRExC_state,
-                                                          LONGJMP, 0));
+                    if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
+                                                             LONGJMP, 0)))
+                    {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
                     c = UCHARAT(RExC_parse);
                      nextchar(pRExC_state);
                     if (flags&HASWIDTH)
@@ -11942,7 +11989,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                              FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
                                    (UV) flags);
                          }
-                        REGTAIL(pRExC_state, ret, lastbr);
+                        if (! REGTAIL(pRExC_state, ret, lastbr)) {
+                            REQUIRE_BRANCHJ(flagp, 0);
+                        }
                         if (flags&HASWIDTH)
                             *flagp |= HASWIDTH;
                          c = UCHARAT(RExC_parse);
@@ -11957,16 +12006,26 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                              vFAIL("Switch (?(condition)... contains too many branches");
                      }
                     ender = reg_node(pRExC_state, TAIL);
-                    REGTAIL(pRExC_state, br, ender);
+                    if (! REGTAIL(pRExC_state, br, ender)) {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
                     if (lastbr) {
-                        REGTAIL(pRExC_state, lastbr, ender);
-                        REGTAIL(pRExC_state, REGNODE_OFFSET(
-                                                NEXTOPER(
-                                                NEXTOPER(REGNODE_p(lastbr)))),
-                                             ender);
+                        if (! REGTAIL(pRExC_state, lastbr, ender)) {
+                            REQUIRE_BRANCHJ(flagp, 0);
+                        }
+                        if (! REGTAIL(pRExC_state,
+                                      REGNODE_OFFSET(
+                                                 NEXTOPER(
+                                                 NEXTOPER(REGNODE_p(lastbr)))),
+                                      ender))
+                        {
+                            REQUIRE_BRANCHJ(flagp, 0);
+                        }
                     }
                     else
-                        REGTAIL(pRExC_state, ret, ender);
+                        if (! REGTAIL(pRExC_state, ret, ender)) {
+                            REQUIRE_BRANCHJ(flagp, 0);
+                        }
  #if 0  /* Removing this doesn't cause failures in the test suite -- khw */
                      RExC_size++; /* XXX WHY do we need this?!!
                                      For large programs it seems to be required
@@ -12116,7 +12175,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
         *flagp |= flags&SIMPLE;
      }
      if (is_open) {                             /* Starts with OPEN. */
-        REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
+        if (! REGTAIL(pRExC_state, ret, br)) {  /* OPEN -> first. */
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
      }
      else if (paren != '?')             /* Not Conditional */
         ret = br;
@@ -12124,12 +12185,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
      lastbr = br;
      while (*RExC_parse == '|') {
         if (RExC_use_BRANCHJ) {
+            bool shut_gcc_up;
+
             ender = reganode(pRExC_state, LONGJMP, 0);
  
              /* Append to the previous. */
-            REGTAIL(pRExC_state,
-                    REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
-                    ender);
+            shut_gcc_up = REGTAIL(pRExC_state,
+                         REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
+                         ender);
+            PERL_UNUSED_VAR(shut_gcc_up);
         }
         nextchar(pRExC_state);
         if (freeze_paren) {
@@ -12240,9 +12304,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                          is_nothing= 0;
                 }
                 else if (op == BRANCHJ) {
-                    REGTAIL_STUDY(pRExC_state,
-                                  REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
-                                  ender);
+                    bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
+                                        REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
+                                        ender);
+                    PERL_UNUSED_VAR(shut_gcc_up);
                      /* for now we always disable this optimisation * /
                      if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
                           || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
@@ -12335,6 +12400,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
      if (RExC_in_lookbehind) {
         RExC_in_lookbehind--;
      }
+    if (RExC_in_lookahead) {
+        RExC_in_lookahead--;
+    }
      if (after_freeze > RExC_npar)
          RExC_npar = after_freeze;
      return(ret);
@@ -12554,7 +12622,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 const regnode_offset w = reg_node(pRExC_state, WHILEM);
  
                 FLAGS(REGNODE_p(w)) = 0;
-                REGTAIL(pRExC_state, ret, w);
+                if (!  REGTAIL(pRExC_state, ret, w)) {
+                    REQUIRE_BRANCHJ(flagp, 0);
+                }
                 if (RExC_use_BRANCHJ) {
                     reginsert(pRExC_state, LONGJMP, ret, depth+1);
                     reginsert(pRExC_state, NOTHING, ret, depth+1);
@@ -12569,7 +12639,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 if (RExC_use_BRANCHJ)
                      NEXT_OFF(REGNODE_p(ret)) = 3;   /* Go over NOTHING to
                                                         LONGJMP. */
-                REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
+                if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
+                                                          NOTHING)))
+                {
+                    REQUIRE_BRANCHJ(flagp, 0);
+                }
                  RExC_whilem_seen++;
                  MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
             }
@@ -12641,16 +12715,22 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
      if (*RExC_parse == '?') {
         nextchar(pRExC_state);
         reginsert(pRExC_state, MINMOD, ret, depth+1);
-        REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
+        if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
      }
      else if (*RExC_parse == '+') {
          regnode_offset ender;
          nextchar(pRExC_state);
          ender = reg_node(pRExC_state, SUCCEED);
-        REGTAIL(pRExC_state, ret, ender);
+        if (! REGTAIL(pRExC_state, ret, ender)) {
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
          reginsert(pRExC_state, SUSPEND, ret, depth+1);
          ender = reg_node(pRExC_state, TAIL);
-        REGTAIL(pRExC_state, ret, ender);
+        if (! REGTAIL(pRExC_state, ret, ender)) {
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
      }
  
      if (ISMULT2(RExC_parse)) {
@@ -12944,11 +13024,9 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
          sv_catsv(substitute_parse, value_sv);
          sv_catpv(substitute_parse, ")");
  
-#ifdef EBCDIC
          /* The value should already be native, so no need to convert on EBCDIC
           * platforms.*/
          assert(! RExC_recode_x_to_native);
-#endif
  
      }
      else {   /* \N{U+...} */
@@ -13081,12 +13159,9 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
  
          sv_catpvs(substitute_parse, ")");
  
-#ifdef EBCDIC
          /* The values are Unicode, and therefore have to be converted to native
           * on a non-Unicode (meaning non-ASCII) platform. */
-        RExC_recode_x_to_native = 1;
-#endif
-
+        SET_recode_x_to_native(1);
      }
  
      /* Here, we have the string the name evaluates to, ready to be parsed,
@@ -13111,9 +13186,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
      RExC_start = save_start;
      RExC_parse = endbrace;
      RExC_end = orig_end;
-#ifdef EBCDIC
-    RExC_recode_x_to_native = 0;
-#endif
+    SET_recode_x_to_native(0);
  
      SvREFCNT_dec_NN(substitute_parse);
  
@@ -13291,7 +13364,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
      char *parse_start;
      U8 op;
      int invert = 0;
-    U8 arg;
  
      GET_RE_DEBUG_FLAGS_DECL;
  
@@ -13420,15 +13492,21 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             *flagp |= SIMPLE;
             goto finish_meta_pat;
         case 'K':
-           RExC_seen_zerolen++;
-           ret = reg_node(pRExC_state, KEEPS);
-           *flagp |= SIMPLE;
-           /* XXX:dmq : disabling in-place substitution seems to
-            * be necessary here to avoid cases of memory corruption, as
-            * with: C<$_="x" x 80; s/x\K/y/> -- rgs
-            */
-            RExC_seen |= REG_LOOKBEHIND_SEEN;
-           goto finish_meta_pat;
+            if (!RExC_in_lookbehind && !RExC_in_lookahead) {
+                RExC_seen_zerolen++;
+                ret = reg_node(pRExC_state, KEEPS);
+                *flagp |= SIMPLE;
+                /* XXX:dmq : disabling in-place substitution seems to
+                 * be necessary here to avoid cases of memory corruption, as
+                 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
+                 */
+                RExC_seen |= REG_LOOKBEHIND_SEEN;
+                goto finish_meta_pat;
+            }
+            else {
+                ++RExC_parse; /* advance past the 'K' */
+                vFAIL("\\K not permitted in lookahead/lookbehind");
+            }
         case 'Z':
             ret = reg_node(pRExC_state, SEOL);
             *flagp |= SIMPLE;
@@ -13446,13 +13524,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             *flagp |= HASWIDTH;
             goto finish_meta_pat;
  
-       case 'W':
-            invert = 1;
-            /* FALLTHROUGH */
-       case 'w':
-            arg = ANYOF_WORDCHAR;
-            goto join_posix;
-
         case 'B':
              invert = 1;
              /* FALLTHROUGH */
@@ -13571,85 +13642,26 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             goto finish_meta_pat;
            }
  
-       case 'D':
-            invert = 1;
-            /* FALLTHROUGH */
-       case 'd':
-            arg = ANYOF_DIGIT;
-            if (! DEPENDS_SEMANTICS) {
-                goto join_posix;
-            }
-
-            /* \d doesn't have any matches in the upper Latin1 range, hence /d
-             * is equivalent to /u.  Changing to /u saves some branches at
-             * runtime */
-            op = POSIXU;
-            goto join_posix_op_known;
-
         case 'R':
             ret = reg_node(pRExC_state, LNBREAK);
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
  
-       case 'H':
-            invert = 1;
-            /* FALLTHROUGH */
+       case 'd':
+       case 'D':
         case 'h':
-           arg = ANYOF_BLANK;
-            op = POSIXU;
-            goto join_posix_op_known;
-
-       case 'V':
-            invert = 1;
-            /* FALLTHROUGH */
-       case 'v':
-           arg = ANYOF_VERTWS;
-            op = POSIXU;
-            goto join_posix_op_known;
-
-       case 'S':
-            invert = 1;
-            /* FALLTHROUGH */
-       case 's':
-            arg = ANYOF_SPACE;
-
-          join_posix:
-
-           op = POSIXD + get_regex_charset(RExC_flags);
-            if (op > POSIXA) {  /* /aa is same as /a */
-                op = POSIXA;
-            }
-            else if (op == POSIXL) {
-                RExC_contains_locale = 1;
-            }
-            else if (op == POSIXD) {
-                RExC_seen_d_op = TRUE;
-            }
-
-          join_posix_op_known:
-
-            if (invert) {
-                op += NPOSIXD - POSIXD;
-            }
-
-           ret = reg_node(pRExC_state, op);
-            FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg);
-
-           *flagp |= HASWIDTH|SIMPLE;
-            /* FALLTHROUGH */
-
-          finish_meta_pat:
-            if (   UCHARAT(RExC_parse + 1) == '{'
-                && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
-            {
-                RExC_parse += 2;
-                vFAIL("Unescaped left brace in regex is illegal here");
-            }
-           nextchar(pRExC_state);
-            Set_Node_Length(REGNODE_p(ret), 2); /* MJD */
-           break;
+       case 'H':
         case 'p':
         case 'P':
+       case 's':
+       case 'S':
+       case 'v':
+       case 'V':
+       case 'w':
+       case 'W':
+            /* These all have the same meaning inside [brackets], and
+             * regclass() knows how to do the best optimizations for them.  So,
+             * pretend we found these within brackets, and let it do the work */
              RExC_parse--;
  
              ret = regclass(pRExC_state, flagp, depth+1,
@@ -13668,10 +13680,21 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
                        (UV) *flagp);
  
-            RExC_parse--;
+            RExC_parse--;   /* regclass() leaves this one too far ahead */
  
+          finish_meta_pat:
+                   /* The escapes above that don't take a parameter can't be
+                    * followed by a '{'.  But 'pX', 'p{foo}', and the
+                    * corresponding 'P' forms can be */
+            if (   RExC_parse - parse_start == 1
+                && UCHARAT(RExC_parse + 1) == '{'
+                && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
+            {
+                RExC_parse += 2;
+                vFAIL("Unescaped left brace in regex is illegal here");
+            }
              Set_Node_Offset(REGNODE_p(ret), parse_start);
-            Set_Node_Cur_Length(REGNODE_p(ret), parse_start - 2);
+            Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */
              nextchar(pRExC_state);
             break;
          case 'N':
@@ -13792,7 +13815,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          && num >= RExC_npar
                          /* cannot be an octal escape if it starts with 8 */
                          && *RExC_parse != '8'
-                        /* cannot be an octal escape it it starts with 9 */
+                        /* cannot be an octal escape if it starts with 9 */
                          && *RExC_parse != '9'
                      ) {
                          /* Probably not meant to be a backref, instead likely
@@ -14153,13 +14176,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                              UPDATE_WARNINGS_LOC(p - 1);
                              ender = result;
  
-                            if (ender < 0x100) {
  #ifdef EBCDIC
+                            if (ender < 0x100) {
                                  if (RExC_recode_x_to_native) {
                                      ender = LATIN1_TO_NATIVE(ender);
                                  }
-#endif
                             }
+#endif
                             break;
                         }
                     case 'c':
@@ -14787,7 +14810,7 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
      assert(PL_regkind[OP(node)] == ANYOF);
  
      /* There is no bitmap for this node type */
-    if (OP(node) == ANYOFH) {
+    if (inRANGE(OP(node), ANYOFH, ANYOFHr)) {
          return;
      }
  
@@ -17012,7 +17035,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                  {
                                      if (strict) {
                                          RExC_parse--;
-                                        vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
+                                        vFAIL("\\N{} here is restricted to one character");
                                      }
                                      ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
                                      break; /* <value> contains the first code
@@ -18861,7 +18884,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
                      full_cp_count += this_end - this_start + 1;
                  }
-                invlist_iterfinish(cp_list);
  
                  /* At the end of the loop, we count how many bits differ from
                   * the bits in lowest code point, call the count 'd'.  If the
@@ -18890,8 +18912,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                      ret = reganode(pRExC_state, op, lowest_cp);
                      FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
                  }
+
+              done_anyofm:
+                invlist_iterfinish(cp_list);
              }
-          done_anyofm:
  
              if (inverted) {
                  _invlist_invert(cp_list);
@@ -19016,52 +19040,92 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              SvREFCNT_dec(intersection);
          }
  
-        /* If didn't find an optimization and there is no need for a
-        * bitmap, optimize to indicate that */
+        /* If didn't find an optimization and there is no need for a bitmap,
+         * optimize to indicate that */
          if (     start[0] >= NUM_ANYOF_CODE_POINTS
              && ! LOC
              && ! upper_latin1_only_utf8_matches
              &&   anyof_flags == 0)
          {
+            U8 low_utf8[UTF8_MAXBYTES+1];
              UV highest_cp = invlist_highest(cp_list);
  
-            /* If the lowest and highest code point in the class have the same
-             * UTF-8 first byte, then all do, and we can store that byte for
-             * regexec.c to use so that it can more quickly scan the target
-             * string for potential matches for this class.  We co-opt the the
-             * flags field for this.  Zero means, they don't have the same
-             * first byte.  We do accept here very large code points (for
-             * future use), but don't bother with this optimization for them,
-             * as it would cause other complications */
-            if (highest_cp > IV_MAX) {
-                anyof_flags = 0;
-            }
-            else {
-                U8 low_utf8[UTF8_MAXBYTES+1];
+            op = ANYOFH;
+
+            /* Currently the maximum allowed code point by the system is
+             * IV_MAX.  Higher ones are reserved for future internal use.  This
+             * particular regnode can be used for higher ones, but we can't
+             * calculate the code point of those.  IV_MAX suffices though, as
+             * it will be a large first byte */
+            (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX));
+
+            /* We store the lowest possible first byte of the UTF-8
+             * representation, using the flags field.  This allows for quick
+             * ruling out of some inputs without having to convert from UTF-8
+             * to code point.  For EBCDIC, this has to be I8. */
+            anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
+
+            /* If the first UTF-8 start byte for the highest code point in the
+             * range is suitably small, we may be able to get an upper bound as
+             * well */
+            if (highest_cp <= IV_MAX) {
                  U8 high_utf8[UTF8_MAXBYTES+1];
  
-                (void) uvchr_to_utf8(low_utf8, start[0]);
-                (void) uvchr_to_utf8(high_utf8, invlist_highest(cp_list));
+                (void) uvchr_to_utf8(high_utf8, highest_cp);
+
+                /* If the lowest and highest are the same, we can get an exact
+                 * first byte instead of just a minimum.  We signal this with a
+                 * different regnode */
+                if (low_utf8[0] == high_utf8[0]) {
+
+                    /* No need to convert to I8 for EBCDIC as this is an exact
+                     * match */
+                    anyof_flags = low_utf8[0];
+                    op = ANYOFHb;
+                }
+                else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
+                {
+
+                    /* Here, the high byte is not the same as the low, but is
+                     * small enough that it's reasonable to have a loose upper
+                     * bound, which is packed in with the strict lower bound.
+                     * See comments at the definition of MAX_ANYOF_HRx_BYTE.
+                     * On EBCDIC platforms, I8 is used.  On ASCII platforms I8
+                     * is the same thing as UTF-8 */
  
-                anyof_flags = (low_utf8[0] == high_utf8[0])
-                            ? low_utf8[0]
-                            : 0;
+                    U8 bits = 0;
+                    U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
+                    U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
+                                  - anyof_flags;
+
+                    if (range_diff <= max_range_diff / 8) {
+                        bits = 3;
+                    }
+                    else if (range_diff <= max_range_diff / 4) {
+                        bits = 2;
+                    }
+                    else if (range_diff <= max_range_diff / 2) {
+                        bits = 1;
+                    }
+                    anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
+                    op = ANYOFHr;
+                }
              }
  
-            op = ANYOFH;
+            goto done_finding_op;
          }
      }   /* End of seeing if can optimize it into a different node */
  
    is_anyof: /* It's going to be an ANYOF node. */
-    if (op != ANYOFH) {
-        op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
-             ? ANYOFD
-             : ((posixl)
-                ? ANYOFPOSIXL
-                : ((LOC)
-                   ? ANYOFL
-                   : ANYOF));
-    }
+    op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
+         ? ANYOFD
+         : ((posixl)
+            ? ANYOFPOSIXL
+            : ((LOC)
+               ? ANYOFL
+               : ANYOF));
+
+  done_finding_op:
  
      ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
      FILL_NODE(ret, op);        /* We set the argument later */
@@ -19818,8 +19882,8 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state,
      }
      else {
          if (val - scan > U16_MAX) {
-            /* Since not all callers check the return value, populate this with
-             * something that won't loop and will likely lead to a crash if
+            /* Populate this with something that won't loop and will likely
+             * lead to a crash if the caller ignores the failure return, and
               * execution continues */
              NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
              return FALSE;
@@ -19930,6 +19994,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
      }
      else {
          if (val - scan > U16_MAX) {
+            /* Populate this with something that won't loop and will likely
+             * lead to a crash if the caller ignores the failure return, and
+             * execution continues */
              NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
              return FALSE;
          }
@@ -20207,11 +20274,16 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
  
      SvPVCLEAR(sv);
  
-    if (OP(o) > REGNODE_MAX)           /* regnode.type is unsigned */
-       /* It would be nice to FAIL() here, but this may be called from
-          regexec.c, and it would be hard to supply pRExC_state. */
-       Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
-                                              (int)OP(o), (int)REGNODE_MAX);
+    if (OP(o) > REGNODE_MAX) {          /* regnode.type is unsigned */
+        if (pRExC_state) {  /* This gives more info, if we have it */
+            FAIL3("panic: corrupted regexp opcode %d > %d",
+                  (int)OP(o), (int)REGNODE_MAX);
+        }
+        else {
+            Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
+                             (int)OP(o), (int)REGNODE_MAX);
+        }
+    }
      sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
  
      k = PL_regkind[OP(o)];
@@ -20348,7 +20420,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          /* 2: embedded, otherwise 1 */
         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
      else if (k == ANYOF) {
-       const U8 flags = (OP(o) == ANYOFH) ? 0 : ANYOF_FLAGS(o);
+       const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHr)
+                          ? 0
+                          : ANYOF_FLAGS(o);
          bool do_sep = FALSE;    /* Do we need to separate various components of
                                     the output? */
          /* Set if there is still an unresolved user-defined property */
@@ -20404,7 +20478,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          /* Ready to start outputting.  First, the initial left bracket */
         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
  
-        if (OP(o) != ANYOFH) {
+        if (! inRANGE(OP(o), ANYOFH, ANYOFHr)) {
              /* Then all the things that could fit in the bitmap */
              do_sep = put_charclass_bitmap_innards(sv,
                                                    ANYOF_BITMAP(o),
@@ -20502,11 +20576,22 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          /* And finally the matching, closing ']' */
         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
  
-        if (OP(o) == ANYOFH && FLAGS(o) != 0) {
-            Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=\\x%02x)", FLAGS(o));
+        if (inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+            U8 lowest = (OP(o) != ANYOFHr)
+                         ? FLAGS(o)
+                         : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
+            U8 highest = (OP(o) == ANYOFHb)
+                         ? lowest
+                         : OP(o) == ANYOFH
+                           ? 0xFF
+                           : HIGHEST_ANYOF_HRx_BYTE(FLAGS(o));
+            Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
+            if (lowest != highest) {
+                Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
+            }
+            Perl_sv_catpvf(aTHX_ sv, ")");
          }
  
-
          SvREFCNT_dec(unresolved);
      }
      else if (k == ANYOFM) {
@@ -21435,9 +21520,14 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
  
          /* As a final resort, output the range or subrange as hex. */
  
-        this_end = (end < NUM_ANYOF_CODE_POINTS)
-                    ? end
-                    : NUM_ANYOF_CODE_POINTS - 1;
+        if (start >= NUM_ANYOF_CODE_POINTS) {
+            this_end = end;
+        }
+        else {
+            this_end = (end < NUM_ANYOF_CODE_POINTS)
+                        ? end
+                        : NUM_ANYOF_CODE_POINTS - 1;
+        }
  #if NUM_ANYOF_CODE_POINTS > 256
          format = (this_end < 256)
                   ? "\\x%02" UVXf "-\\x%02" UVXf
@@ -22574,8 +22664,7 @@ Perl_parse_uniprop_string(pTHX_
      int slash_pos  = -1;    /* Where the '/' is found, or negative if none */
      int table_index = 0;    /* The entry number for this property in the table
                                 of all Unicode property names */
-    bool starts_with_In_or_Is = FALSE;  /* ? Does the name start with 'In' or
-                                             'Is' */
+    bool starts_with_Is = FALSE;  /* ? Does the name start with 'Is' */
      Size_t lookup_offset = 0;   /* Used to ignore the first few characters of
                                     the normalized name in certain situations */
      Size_t non_pkg_begin = 0;   /* Offset of first byte in 'name' that isn't
@@ -22865,7 +22954,7 @@ Perl_parse_uniprop_string(pTHX_
          /* Certain properties whose values are numeric need special handling.
           * They may optionally be prefixed by 'is'.  Ignore that prefix for the
           * purposes of checking if this is one of those properties */
-        if (memBEGINPs(lookup_name, name_len, "is")) {
+        if (memBEGINPs(lookup_name, j, "is")) {
              lookup_offset = 2;
          }
  
@@ -23031,7 +23120,9 @@ Perl_parse_uniprop_string(pTHX_
              }
  
              /* Store the first real character in the denominator */
-            lookup_name[j++] = name[i];
+            if (i < name_len) {
+                lookup_name[j++] = name[i];
+            }
          }
      }
  
@@ -23049,11 +23140,15 @@ Perl_parse_uniprop_string(pTHX_
  
      /* If the original input began with 'In' or 'Is', it could be a subroutine
       * call to a user-defined property instead of a Unicode property name. */
-    if (    non_pkg_begin + name_len > 2
+    if (    name_len - non_pkg_begin > 2
          &&  name[non_pkg_begin+0] == 'I'
          && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
      {
-        starts_with_In_or_Is = TRUE;
+        /* Names that start with In have different characteristics than those
+         * that start with Is */
+        if (name[non_pkg_begin+1] == 's') {
+            starts_with_Is = TRUE;
+        }
      }
      else {
          could_be_user_defined = FALSE;
@@ -23392,8 +23487,11 @@ Perl_parse_uniprop_string(pTHX_
      /* If it didn't find the property ... */
      if (table_index == 0) {
  
-        /* Try again stripping off any initial 'In' or 'Is' */
-        if (starts_with_In_or_Is) {
+        /* Try again stripping off any initial 'Is'.  This is because we
+         * promise that an initial Is is optional.  The same isn't true of
+         * names that start with 'In'.  Those can match only blocks, and the
+         * lookup table already has those accounted for. */
+        if (starts_with_Is) {
              lookup_name += 2;
              lookup_len -= 2;
              equals_pos -= 2;