[perl #115440] Fix various leaks with fatal FETCH

[perl5.git] / regexec.c
diff --git a/regexec.c b/regexec.c

index e631f83..4029f1e 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -104,7 +104,7 @@ const char* const non_utf8_target_but_utf8_required
  /* Valid for non-utf8 strings: avoids the reginclass
   * call if there are no complications: i.e., if everything matchable is
   * straight forward in the bitmap */
-#define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
+#define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0)   \
                                               : ANYOF_BITMAP_TEST(p,*(c)))
  
  /*
@@ -660,8 +660,12 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
               goto fail;
           }
-         if (prog->check_offset_min == prog->check_offset_max &&
-             !(prog->extflags & RXf_CANY_SEEN)) {
+         if (prog->check_offset_min == prog->check_offset_max
+              && !(prog->extflags & RXf_CANY_SEEN)
+              && ! multiline)   /* /m can cause \n's to match that aren't
+                                   accounted for in the string max length.
+                                   See [perl #115242] */
+          {
             /* Substring at constant offset from beg-of-str... */
             I32 slen;
  
@@ -1454,9 +1458,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         switch (OP(c)) {
         case ANYOF:
             if (utf8_target) {
-               STRLEN inclasslen = strend - s;
                 REXEC_FBC_UTF8_CLASS_SCAN(
-                          reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
+                          reginclass(prog, c, (U8*)s, utf8_target));
             }
             else {
                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
@@ -3329,7 +3332,8 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c
  
      const bool utf8_target = PL_reg_match_utf8;
  
-    UV c1, c2;
+    UV c1 = CHRTEST_NOT_A_CP_1;
+    UV c2 = CHRTEST_NOT_A_CP_2;
      bool use_chrtest_void = FALSE;
  
      /* Used when we have both utf8 input and utf8 output, to avoid converting
@@ -3485,7 +3489,9 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c
                      c2 = PL_fold_latin1[c1];
                      break;
  
-               default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
+               default:
+                    Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
+                    assert(0); /* NOTREACHED */
              }
          }
      }
@@ -3658,6 +3664,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
        reenter_switch:
  
          SET_nextchr;
+        assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
  
         switch (state_num) {
         case BOL: /*  /^../  */
@@ -3691,12 +3698,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             st->u.keeper.val = rex->offs[0].start;
             rex->offs[0].start = locinput - PL_bostr;
             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
-           /*NOT-REACHED*/
+           assert(0); /*NOTREACHED*/
         case KEEPS_next_fail:
             /* rollback the start point change */
             rex->offs[0].start = st->u.keeper.val;
             sayNO_SILENT;
-           /*NOT-REACHED*/
+           assert(0); /*NOTREACHED*/
  
         case EOL: /* /..$/  */
                 goto seol;
@@ -4205,8 +4212,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             }
  
             /* Neither the target nor the pattern are utf8 */
-           if (UCHARAT(s) != nextchr &&
-               UCHARAT(s) != fold_array[nextchr])
+           if (UCHARAT(s) != nextchr
+                && !NEXTCHR_IS_EOS
+               && UCHARAT(s) != fold_array[nextchr])
             {
                 sayNO;
             }
@@ -4306,10 +4314,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
              if (NEXTCHR_IS_EOS)
                  sayNO;
             if (utf8_target) {
-               STRLEN inclasslen = PL_regeol - locinput;
-               if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
+               if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
                     sayNO;
-               locinput += inclasslen;
+               locinput += UTF8SKIP(locinput);
                 break;
             }
             else {
@@ -4648,7 +4655,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 /* This call case insensitively compares the entire buffer
                     * at s, with the current input starting at locinput, but
                     * not going off the end given by PL_regeol, and returns in
-                   * limit upon success, how much of the current input was
+                   * <limit> upon success, how much of the current input was
                     * matched */
                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
@@ -6326,6 +6333,7 @@ NULL
          /* this is a point to jump to in order to increment
           * locinput by one character */
          increment_locinput:
+            assert(!NEXTCHR_IS_EOS);
              if (utf8_target) {
                  locinput += PL_utf8skip[nextchr];
                  /* locinput is allowed to go 1 char off the end, but not 2+ */
@@ -6523,21 +6531,24 @@ no_silent:
  /*
   - regrepeat - repeatedly match something simple, report how many
   *
+ * What 'simple' means is a node which can be the operand of a quantifier like
+ * '+', or {1,3}
+ *
   * startposp - pointer a pointer to the start position.  This is updated
   *             to point to the byte following the highest successful
   *             match.
   * p         - the regnode to be repeatedly matched against.
- * max       - maximum number of characters to match.
+ * max       - maximum number of things to match.
   * depth     - (for debugging) backtracking depth.
   */
  STATIC I32
  S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
  {
      dVAR;
-    char *scan;
+    char *scan;     /* Pointer to current position in target string */
      I32 c;
-    char *loceol = PL_regeol;
-    I32 hardcount = 0;
+    char *loceol = PL_regeol;   /* local version */
+    I32 hardcount = 0;  /* How many matches so far */
      bool utf8_target = PL_reg_match_utf8;
      UV utf8_flags;
  #ifndef DEBUGGING
@@ -6549,12 +6560,35 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
      scan = *startposp;
      if (max == REG_INFTY)
         max = I32_MAX;
-    else if (max < loceol - scan)
+    else if (! utf8_target && scan + max < loceol)
         loceol = scan + max;
+
+    /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
+     * to the maximum of how far we should go in it (leaving it set to the real
+     * end, if the maximum permissible would take us beyond that).  This allows
+     * us to make the loop exit condition that we haven't gone past <loceol> to
+     * also mean that we haven't exceeded the max permissible count, saving a
+     * test each time through the loop.  But it assumes that the OP matches a
+     * single byte, which is true for most of the OPs below when applied to a
+     * non-UTF-8 target.  Those relatively few OPs that don't have this
+     * characteristic will have to compensate.
+     *
+     * There is no adjustment for UTF-8 targets, as the number of bytes per
+     * character varies.  OPs will have to test both that the count is less
+     * than the max permissible (using <hardcount> to keep track), and that we
+     * are still within the bounds of the string (using <loceol>).  A few OPs
+     * match a single byte no matter what the encoding.  They can omit the max
+     * test if, for the UTF-8 case, they do the adjustment that was skipped
+     * above.
+     *
+     * Thus, the code above sets things up for the common case; and exceptional
+     * cases need extra work; the common case is to make sure <scan> doesn't
+     * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
+     * count doesn't exceed the maximum permissible */
+
      switch (OP(p)) {
      case REG_ANY:
         if (utf8_target) {
-           loceol = PL_regeol;
             while (scan < loceol && hardcount < max && *scan != '\n') {
                 scan += UTF8SKIP(scan);
                 hardcount++;
@@ -6566,7 +6600,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
         break;
      case SANY:
          if (utf8_target) {
-           loceol = PL_regeol;
             while (scan < loceol && hardcount < max) {
                 scan += UTF8SKIP(scan);
                 hardcount++;
@@ -6575,8 +6608,15 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
         else
             scan = loceol;
         break;
-    case CANY:
-       scan = loceol;
+    case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
+        if (utf8_target && scan + max < loceol) {
+
+            /* <loceol> hadn't been adjusted in the UTF-8 case */
+            scan +=  max;
+        }
+        else {
+            scan = loceol;
+        }
         break;
      case EXACT:
          assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
@@ -6588,6 +6628,11 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
           * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
           * true iff it doesn't matter if the argument is in UTF-8 or not */
          if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
+            if (utf8_target && scan + max < loceol) {
+                /* We didn't adjust <loceol> because the target is UTF-8,
+                 * but doing so is safe: to match at all, 1 char == 1 byte */
+                loceol = scan + max;
+            }
             while (scan < loceol && UCHARAT(scan) == c) {
                 scan++;
             }
@@ -6595,9 +6640,8 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
         else if (UTF_PATTERN) {
              if (utf8_target) {
                  STRLEN scan_char_len;
-                loceol = PL_regeol;
  
-                /* When both target and pattern are UTF-8, we have to do s
+                /* When both target and pattern are UTF-8, we have to do
                   * string EQ */
                  while (hardcount < max
                         && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
@@ -6627,7 +6671,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
               * then look for those in sequence in the utf8 string */
             U8 high = UTF8_TWO_BYTE_HI(c);
             U8 low = UTF8_TWO_BYTE_LO(c);
-           loceol = PL_regeol;
  
             while (hardcount < max
                     && scan + 1 < loceol
@@ -6666,21 +6709,23 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
  
          if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
              if (c1 == CHRTEST_VOID) {
-            /* Use full Unicode fold matching */
-           char *tmpeol = loceol;
-            STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
-           while (hardcount < max
-                   && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
-                       STRING(p), NULL, pat_len, cBOOL(UTF_PATTERN), utf8_flags))
-           {
-               scan = tmpeol;
-               tmpeol = loceol;
-               hardcount++;
-           }
+                /* Use full Unicode fold matching */
+                char *tmpeol = PL_regeol;
+                STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
+                while (hardcount < max
+                        && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
+                                             STRING(p), NULL, pat_len,
+                                             cBOOL(UTF_PATTERN), utf8_flags))
+                {
+                    scan = tmpeol;
+                    tmpeol = PL_regeol;
+                    hardcount++;
+                }
              }
              else if (utf8_target) {
                  if (c1 == c2) {
-                    while (hardcount < max
+                    while (scan < loceol
+                           && hardcount < max
                             && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
                      {
                          scan += UTF8SKIP(scan);
@@ -6688,7 +6733,8 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
                      }
                  }
                  else {
-                    while (hardcount < max
+                    while (scan < loceol
+                           && hardcount < max
                             && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
                                 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
                      {
@@ -6715,11 +6761,9 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
      case ANYOF:
         if (utf8_target) {
             STRLEN inclasslen;
-           loceol = PL_regeol;
-           inclasslen = loceol - scan;
             while (hardcount < max
-                  && ((inclasslen = loceol - scan) > 0)
-                  && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
+                   && scan + (inclasslen = UTF8SKIP(scan)) <= loceol
+                  && reginclass(prog, p, (U8*)scan, utf8_target))
             {
                 scan += inclasslen;
                 hardcount++;
@@ -6732,7 +6776,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
      case ALNUMU:
         if (utf8_target) {
      utf8_wordchar:
-           loceol = PL_regeol;
             LOAD_UTF8_CHARCLASS_ALNUM();
             while (hardcount < max && scan < loceol &&
                     swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
@@ -6754,6 +6797,12 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
         }
         break;
      case ALNUMA:
+        if (utf8_target && scan + max < loceol) {
+
+            /* We didn't adjust <loceol> because the target is UTF-8, but
+             * doing so is safe: to match here, 1 char == 1 byte */
+            loceol = scan + max;
+        }
         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
             scan++;
         }
@@ -6761,7 +6810,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
      case ALNUML:
         PL_reg_flags |= RF_tainted;
         if (utf8_target) {
-           loceol = PL_regeol;
             while (hardcount < max && scan < loceol &&
                    isALNUM_LC_utf8((U8*)scan)) {
                 scan += UTF8SKIP(scan);
@@ -6777,7 +6825,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
  
      utf8_Nwordchar:
  
-           loceol = PL_regeol;
             LOAD_UTF8_CHARCLASS_ALNUM();
             while (hardcount < max && scan < loceol &&
                     ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
@@ -6800,14 +6847,23 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
         break;
  
      case POSIXA:
-       while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
+        if (utf8_target && scan + max < loceol) {
+
+            /* We didn't adjust <loceol> because the target is UTF-8, but
+             * doing so is safe: to match here, 1 char == 1 byte */
+            loceol = scan + max;
+        }
+        while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
             scan++;
         }
         break;
      case NPOSIXA:
         if (utf8_target) {
-           while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
+           while (scan < loceol && hardcount < max
+                   && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
+            {
                 scan += UTF8SKIP(scan);
+                hardcount++;
             }
         }
         else {
@@ -6818,8 +6874,11 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
         break;
      case NALNUMA:
         if (utf8_target) {
-           while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
+           while (scan < loceol && hardcount < max
+                   && ! isWORDCHAR_A((U8) *scan))
+            {
                 scan += UTF8SKIP(scan);
+                hardcount++;
             }
         }
         else {
@@ -6831,7 +6890,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
      case NALNUML:
         PL_reg_flags |= RF_tainted;
         if (utf8_target) {
-           loceol = PL_regeol;
             while (hardcount < max && scan < loceol &&
                    !isALNUM_LC_utf8((U8*)scan)) {
                 scan += UTF8SKIP(scan);
@@ -6847,7 +6905,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
  
      utf8_space:
  
-           loceol = PL_regeol;
             LOAD_UTF8_CHARCLASS_SPACE();
             while (hardcount < max && scan < loceol &&
                    (*scan == ' ' ||
@@ -6873,6 +6930,12 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
         }
         break;
      case SPACEA:
+        if (utf8_target && scan + max < loceol) {
+
+            /* We didn't adjust <loceol> because the target is UTF-8, but
+             * doing so is safe: to match here, 1 char == 1 byte */
+            loceol = scan + max;
+        }
         while (scan < loceol && isSPACE_A((U8) *scan)) {
             scan++;
         }
@@ -6880,7 +6943,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
      case SPACEL:
         PL_reg_flags |= RF_tainted;
         if (utf8_target) {
-           loceol = PL_regeol;
             while (hardcount < max && scan < loceol &&
                    isSPACE_LC_utf8((U8*)scan)) {
                 scan += UTF8SKIP(scan);
@@ -6896,7 +6958,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
  
      utf8_Nspace:
  
-           loceol = PL_regeol;
             LOAD_UTF8_CHARCLASS_SPACE();
             while (hardcount < max && scan < loceol &&
                    ! (*scan == ' ' ||
@@ -6923,8 +6984,11 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
         break;
      case NSPACEA:
         if (utf8_target) {
-           while (scan < loceol && ! isSPACE_A((U8) *scan)) {
+           while (hardcount < max && scan < loceol
+                  && ! isSPACE_A((U8) *scan))
+            {
                 scan += UTF8SKIP(scan);
+               hardcount++;
             }
         }
         else {
@@ -6936,7 +7000,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
      case NSPACEL:
         PL_reg_flags |= RF_tainted;
         if (utf8_target) {
-           loceol = PL_regeol;
             while (hardcount < max && scan < loceol &&
                    !isSPACE_LC_utf8((U8*)scan)) {
                 scan += UTF8SKIP(scan);
@@ -6949,7 +7012,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
         break;
      case DIGIT:
         if (utf8_target) {
-           loceol = PL_regeol;
             LOAD_UTF8_CHARCLASS_DIGIT();
             while (hardcount < max && scan < loceol &&
                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
@@ -6962,6 +7024,12 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
         }
         break;
      case DIGITA:
+        if (utf8_target && scan + max < loceol) {
+
+            /* We didn't adjust <loceol> because the target is UTF-8, but
+             * doing so is safe: to match here, 1 char == 1 byte */
+            loceol = scan + max;
+        }
         while (scan < loceol && isDIGIT_A((U8) *scan)) {
             scan++;
         }
@@ -6969,7 +7037,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
      case DIGITL:
         PL_reg_flags |= RF_tainted;
         if (utf8_target) {
-           loceol = PL_regeol;
             while (hardcount < max && scan < loceol &&
                    isDIGIT_LC_utf8((U8*)scan)) {
                 scan += UTF8SKIP(scan);
@@ -6982,7 +7049,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
         break;
      case NDIGIT:
         if (utf8_target) {
-           loceol = PL_regeol;
             LOAD_UTF8_CHARCLASS_DIGIT();
             while (hardcount < max && scan < loceol &&
                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
@@ -6996,8 +7062,10 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
         break;
      case NDIGITA:
         if (utf8_target) {
-           while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
+           while (hardcount < max && scan < loceol
+                  && ! isDIGIT_A((U8) *scan)) {
                 scan += UTF8SKIP(scan);
+                hardcount++;
             }
         }
         else {
@@ -7009,7 +7077,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
      case NDIGITL:
         PL_reg_flags |= RF_tainted;
         if (utf8_target) {
-           loceol = PL_regeol;
             while (hardcount < max && scan < loceol &&
                    !isDIGIT_LC_utf8((U8*)scan)) {
                 scan += UTF8SKIP(scan);
@@ -7022,27 +7089,24 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
         break;
      case LNBREAK:
          if (utf8_target) {
-           loceol = PL_regeol;
             while (hardcount < max && scan < loceol &&
                      (c=is_LNBREAK_utf8_safe(scan, loceol))) {
                 scan += c;
                 hardcount++;
             }
         } else {
-           /*
-             LNBREAK can match two latin chars, which is ok,
-             because we have a null terminated string, but we
-             have to use hardcount in this situation
-           */
+            /* LNBREAK can match one or two latin chars, which is ok, but we
+             * have to use hardcount in this situation, and throw away the
+             * adjustment to <loceol> done before the switch statement */
+            loceol = PL_regeol;
             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
                 scan+=c;
                 hardcount++;
             }
-       }       
+       }
         break;
      case HORIZWS:
          if (utf8_target) {
-           loceol = PL_regeol;
             while (hardcount < max && scan < loceol &&
                      (c=is_HORIZWS_utf8_safe(scan, loceol)))
              {
@@ -7056,7 +7120,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
         break;
      case NHORIZWS:
          if (utf8_target) {
-           loceol = PL_regeol;
             while (hardcount < max && scan < loceol &&
                          !is_HORIZWS_utf8_safe(scan, loceol))
              {
@@ -7071,7 +7134,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
         break;
      case VERTWS:
          if (utf8_target) {
-           loceol = PL_regeol;
             while (hardcount < max && scan < loceol &&
                              (c=is_VERTWS_utf8_safe(scan, loceol)))
              {
@@ -7086,7 +7148,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
         break;
      case NVERTWS:
          if (utf8_target) {
-           loceol = PL_regeol;
             while (hardcount < max && scan < loceol &&
                                  !is_VERTWS_utf8_safe(scan, loceol))
              {
@@ -7100,8 +7161,27 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
         }       
         break;
  
-    default:           /* Called on something of 0 width. */
-       break;          /* So match right here or not at all. */
+    case BOUND:
+    case BOUNDA:
+    case BOUNDL:
+    case BOUNDU:
+    case EOS:
+    case GPOS:
+    case KEEPS:
+    case NBOUND:
+    case NBOUNDA:
+    case NBOUNDL:
+    case NBOUNDU:
+    case OPFAIL:
+    case SBOL:
+    case SEOL:
+        /* These are all 0 width, so match right here or not at all. */
+        break;
+
+    default:
+        Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
+        assert(0); /* NOTREACHED */
+
      }
  
      if (hardcount)
@@ -7238,15 +7318,9 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo
   
    n is the ANYOF regnode
    p is the target string
-  lenp is pointer to the maximum number of bytes of how far to go in p
-    (This is assumed wthout checking to always be at least the current
-    character's size)
    utf8_target tells whether p is in UTF-8.
  
-  Returns true if matched; false otherwise.  If lenp is not NULL, on return
-  from a successful match, the value it points to will be updated to how many
-  bytes in p were matched.  If there was no match, the value is undefined,
-  possibly changed from the input.
+  Returns true if matched; false otherwise.
  
    Note that this can be a synthetic start class, a combination of various
    nodes, so things you think might be mutually exclusive, such as locale,
@@ -7255,19 +7329,19 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo
   */
  
  STATIC bool
-S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
+S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, register const bool utf8_target)
  {
      dVAR;
      const char flags = ANYOF_FLAGS(n);
      bool match = FALSE;
      UV c = *p;
-    STRLEN c_len = 0;
-    STRLEN maxlen;
  
      PERL_ARGS_ASSERT_REGINCLASS;
  
-    /* If c is not already the code point, get it */
-    if (utf8_target && !UTF8_IS_INVARIANT(c)) {
+    /* If c is not already the code point, get it.  Note that
+     * UTF8_IS_INVARIANT() works even if not in UTF-8 */
+    if (! UTF8_IS_INVARIANT(c) && utf8_target) {
+        STRLEN c_len = 0;
         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
@@ -7276,21 +7350,6 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
         if (c_len == (STRLEN)-1)
             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
      }
-    else {
-       c_len = 1;
-    }
-
-    /* Use passed in max length, or one character if none passed in or less
-     * than one character.  And assume will match just one character.  This is
-     * overwritten later if matched more. */
-    if (lenp) {
-       maxlen = (*lenp > c_len) ? *lenp : c_len;
-       *lenp = c_len;
-
-    }
-    else {
-       maxlen = c_len;
-    }
  
      /* If this character is potentially in the bitmap, check it */
      if (c < 256) {