regcomp.c: After AvARRAY(), don't use av_store

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index b9ca619..8a6ea89 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -2142,6 +2142,7 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
      populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
  
      set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
+    SvREFCNT_dec(invlist);
  
      /* Make sure is clone-safe */
      ssc->invlist = NULL;
@@ -4782,7 +4783,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                      if ( SvIV(re_trie_maxbuff)>=0  ) {
                          regnode *cur;
                          regnode *first = (regnode *)NULL;
-                        regnode *last = (regnode *)NULL;
+                        regnode *prev = (regnode *)NULL;
                          regnode *tail = scan;
                          U8 trietype = 0;
                          U32 count=0;
@@ -4913,7 +4914,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                      REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
                                  }
                                  Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
-                                   REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
+                                   REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
                                 );
                              });
@@ -4959,7 +4960,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                  } else {
                                      if ( trietype == NOTHING )
                                          trietype = noper_trietype;
-                                    last = cur;
+                                    prev = cur;
                                  }
                                 if (first)
                                     count++;
@@ -4969,7 +4970,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                   * noper may either be a triable node which can
                                   * not be tried together with the current trie,
                                   * or a non triable node */
-                                if ( last ) {
+                                if ( prev ) {
                                      /* If last is set and trietype is not
                                       * NOTHING then we have found at least two
                                       * triable branch sequences in a row of a
@@ -4982,7 +4983,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                          make_trie( pRExC_state,
                                                  startbranch, first, cur, tail,
                                                  count, trietype, depth+1 );
-                                    last = NULL; /* note: we clear/update
+                                    prev = NULL; /* note: we clear/update
                                                      first, trietype etc below,
                                                      so we dont do it here */
                                  }
@@ -5011,12 +5012,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                              Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
                                depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
                              Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
-                               REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
+                               REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
                                 PL_reg_name[trietype]
                              );
  
                          });
-                        if ( last && trietype ) {
+                        if ( prev && trietype ) {
                              if ( trietype != NOTHING ) {
                                  /* the last branch of the sequence was part of
                                   * a trie, so we have to construct it here
@@ -5061,9 +5062,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                          OP(opt)= OPTIMIZED;
                                  }
                              }
-                        } /* end if ( last) */
+                        } /* end if ( prev) */
                      } /* TRIE_MAXBUF is non zero */
-
                  } /* do trie */
  
             }
@@ -5864,6 +5864,7 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                  case ANYOFH:
                  case ANYOFHb:
                  case ANYOFHr:
+                case ANYOFHs:
                  case ANYOF:
                     if (flags & SCF_DO_STCLASS_AND)
                         ssc_and(pRExC_state, data->start_class,
@@ -7418,28 +7419,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
  
      DEBUG_r(if (!PL_colorset) reginitcolors());
  
-    /* Initialize these here instead of as-needed, as is quick and avoids
-     * having to test them each time otherwise */
-    if (! PL_InBitmap) {
-#ifdef DEBUGGING
-        char * dump_len_string;
-#endif
-
-        /* This is calculated here, because the Perl program that generates the
-         * static global ones doesn't currently have access to
-         * NUM_ANYOF_CODE_POINTS */
-       PL_InBitmap = _new_invlist(2);
-       PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
-                                                    NUM_ANYOF_CODE_POINTS - 1);
-#ifdef DEBUGGING
-        dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
-        if (   ! dump_len_string
-            || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
-        {
-            PL_dump_re_max_len = 60;    /* A reasonable default */
-        }
-#endif
-    }
  
      pRExC_state->warn_text = NULL;
      pRExC_state->unlexed_names = NULL;
@@ -8697,8 +8676,8 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
         i = rx->sublen + rx->suboffset - rx->offs[0].end;
      }
      else
-    if ( 0 <= n && n <= (I32)rx->nparens &&
-        (s1 = rx->offs[n].start) != -1 &&
+    if (inRANGE(n, 0, (I32)rx->nparens) &&
+        (s1 = rx->offs[n].start) != -1  &&
          (t1 = rx->offs[n].end) != -1)
      {
          /* $&, ${^MATCH},  $1 ... */
@@ -10317,6 +10296,28 @@ Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
  
  #endif
  
+PERL_STATIC_INLINE UV
+S_invlist_lowest(SV* const invlist)
+{
+    /* Returns the lowest code point that matches an inversion list.  This API
+     * is ambiguous: it returns 0 both when the lowest code point actually is
+     * 0 and when the list is empty.  If this distinction matters to you, check
+     * for emptiness before calling this function */
+
+    UV len = _invlist_len(invlist);
+    UV *array;
+
+    PERL_ARGS_ASSERT_INVLIST_LOWEST;
+
+    if (len == 0) {
+        return 0;
+    }
+
+    array = invlist_array(invlist);
+
+    return array[0];
+}
+
  STATIC SV *
  S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
  {
@@ -12850,9 +12851,9 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
          value = (U8 *) SvPV(value_sv, value_len);
  
          /* See if the result is one code point vs 0 or multiple */
-        if (value_len > 0 && value_len <= (UV) ((SvUTF8(value_sv))
-                                               ? UTF8SKIP(value)
-                                               : 1))
+        if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
+                                  ? UTF8SKIP(value)
+                                  : 1)))
          {
              /* Here, exactly one code point.  If that isn't what is wanted,
               * fail */
@@ -14546,13 +14547,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
                  goto continue_parse;
              }
-            else if (! LOC) {  /* XXX shouldn't /l assume could be a UTF-8
-                                locale, and prepare for that? */
+            else if (FOLD) {
                  bool splittable = FALSE;
                  bool backed_up = FALSE;
-                char * e = s;
-
-                assert(FOLD);
+                char * e;
+                char * s_start;
  
                  /* Here is /i.  Running out of room creates a problem if we are
                   * folding, and the split happens in the middle of a
@@ -14589,6 +14588,132 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                   *  oldp      points to the beginning byte in the input of
                   *            'ender'.
                   *
+                 * In the case of /il, we haven't folded anything that could be
+                 * affected by the locale.  That means only above-Latin1
+                 * characters that fold to other above-latin1 characters get
+                 * folded at compile time.  To check where a good place to
+                 * split nodes is, everything in it will have to be folded.
+                 * The boolean 'maybe_exactfu' keeps track in /il if there are
+                 * any unfolded characters in the node. */
+                bool need_to_fold_loc = LOC && ! maybe_exactfu;
+
+                /* If we do need to fold the node, we need a place to store the
+                 * folded copy, and a way to map back to the unfolded original
+                 * */
+                char * locfold_buf = NULL;
+                Size_t * loc_correspondence = NULL;
+
+                if (! need_to_fold_loc) {   /* The normal case.  Just
+                                               initialize to the actual node */
+                    e = s;
+                    s_start = s0;
+                    s = old_old_s;  /* Point to the beginning of the final char
+                                       that fits in the node */
+                }
+                else {
+
+                    /* Here, we have filled a /il node, and there are unfolded
+                     * characters in it.  If the runtime locale turns out to be
+                     * UTF-8, there are possible multi-character folds, just
+                     * like when not under /l.  The node hence can't terminate
+                     * in the middle of such a fold.  To determine this, we
+                     * have to create a folded copy of this node.  That means
+                     * reparsing the node, folding everything assuming a UTF-8
+                     * locale.  (If at runtime it isn't such a locale, the
+                     * actions here wouldn't have been necessary, but we have
+                     * to assume the worst case.)  If we find we need to back
+                     * off the folded string, we do so, and then map that
+                     * position back to the original unfolded node, which then
+                     * gets output, truncated at that spot */
+
+                    char * redo_p = RExC_parse;
+                    char * redo_e;
+                    char * old_redo_e;
+
+                    /* Allow enough space assuming a single byte input folds to
+                     * a single byte output, plus assume that the two unparsed
+                     * characters (that we may need) fold to the largest number
+                     * of bytes possible, plus extra for one more worst case
+                     * scenario.  In the loop below, if we start eating into
+                     * that final spare space, we enlarge this initial space */
+                    Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
+
+                    Newxz(locfold_buf, size, char);
+                    Newxz(loc_correspondence, size, Size_t);
+
+                    /* Redo this node's parse, folding into 'locfold_buf' */
+                    redo_p = RExC_parse;
+                    old_redo_e = redo_e = locfold_buf;
+                    while (redo_p <= oldp) {
+
+                        old_redo_e = redo_e;
+                        loc_correspondence[redo_e - locfold_buf]
+                                                        = redo_p - RExC_parse;
+
+                        if (UTF) {
+                            Size_t added_len;
+
+                            (void) _to_utf8_fold_flags((U8 *) redo_p,
+                                                       (U8 *) RExC_end,
+                                                       (U8 *) redo_e,
+                                                       &added_len,
+                                                       FOLD_FLAGS_FULL);
+                            redo_e += added_len;
+                            redo_p += UTF8SKIP(redo_p);
+                        }
+                        else {
+
+                            /* Note that if this code is run on some ancient
+                             * Unicode versions, SHARP S doesn't fold to 'ss',
+                             * but rather than clutter the code with #ifdef's,
+                             * as is done above, we ignore that possibility.
+                             * This is ok because this code doesn't affect what
+                             * gets matched, but merely where the node gets
+                             * split */
+                            if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
+                                *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
+                            }
+                            else {
+                                *redo_e++ = 's';
+                                *redo_e++ = 's';
+                            }
+                            redo_p++;
+                        }
+
+
+                        /* If we're getting so close to the end that a
+                         * worst-case fold in the next character would cause us
+                         * to overflow, enlarge both buffers, assuming one
+                         * output byte per remaining input byte, plus room for
+                         * another worst case fold */
+                        if (   redo_p <= oldp
+                            && redo_e > locfold_buf + size
+                                                    - (UTF8_MAXBYTES_CASE + 1))
+                        {
+                            Size_t new_size = size
+                                            + (oldp - redo_p)
+                                            + UTF8_MAXBYTES_CASE + 1;
+                            Ptrdiff_t e_offset = redo_e - locfold_buf;
+
+                            Renew(locfold_buf, new_size, char);
+                            Renew(loc_correspondence, new_size, Size_t);
+                            size = new_size;
+
+                            redo_e = locfold_buf + e_offset;
+                        }
+                    }
+
+                    /* Set so that things are in terms of the folded, temporary
+                     * string */
+                    s = old_redo_e;
+                    s_start = locfold_buf;
+                    e = redo_e;
+
+                }
+
+                /* Here, we have 's', 's_start' and 'e' set up to point to the
+                 * input that goes into the node, folded.
+                 *
                   * If the final character of the node and the fold of ender
                   * form the first two characters of a three character fold, we
                   * need to peek ahead at the next (unparsed) character in the
@@ -14628,11 +14753,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                   * and try again.
                   *
                   * Otherwise, the node can be split at the current position.
-                 */
-                s = old_old_s;  /* Point to the beginning of the final char
-                                   that fits in the node */
-
-                /* The same logic is used for UTF-8 patterns and not */
+                 *
+                 * The same logic is used for UTF-8 patterns and not */
                  if (UTF) {
                      Size_t added_len;
  
@@ -14671,7 +14793,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                           * drop down to try at that position */
                          if (isPUNCT(*p)) {
                              s = (char *) utf8_hop_back((U8 *) s, -1,
-                                       (U8 *) s0);
+                                       (U8 *) s_start);
                              backed_up = TRUE;
                          }
                          else {
@@ -14703,7 +14825,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                       * either case would break apart a fold */
                      do {
                          char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
-                                                                    (U8 *) s0);
+                                                            (U8 *) s_start);
  
                          /* If is a multi-char fold, can't split here.  Backup
                           * one char and try again */
@@ -14717,11 +14839,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                           * three character fold starting at the character
                           * before s, we can't split either before or after s.
                           * Backup two chars and try again */
-                        if (   LIKELY(s > s0)
+                        if (   LIKELY(s > s_start)
                              && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
                          {
                              s = prev_s;
-                            s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+                            s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
                              backed_up = TRUE;
                              continue;
                          }
@@ -14731,7 +14853,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          splittable = TRUE;
                          break;
  
-                    } while (s > s0); /* End of loops backing up through the node */
+                    } while (s > s_start); /* End of loops backing up through the node */
  
                      /* Here we either couldn't find a place to split the node,
                       * or else we broke out of the loop setting 'splittable' to
@@ -14780,7 +14902,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                              continue;
                          }
  
-                        if (   LIKELY(s > s0)
+                        if (   LIKELY(s > s_start)
                              && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
                          {
                              s -= 2;
@@ -14791,7 +14913,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          splittable = TRUE;
                          break;
  
-                    } while (s > s0);
+                    } while (s > s_start);
  
                      if (splittable) {
                          s++;
@@ -14805,9 +14927,28 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     /* If we did find a place to split, reparse the entire node
                      * stopping where we have calculated. */
                      if (splittable) {
-                        upper_fill = s - s0;
+
+                       /* If we created a temporary folded string under /l, we
+                        * have to map that back to the original */
+                        if (need_to_fold_loc) {
+                            upper_fill = loc_correspondence[s - s_start];
+                            Safefree(locfold_buf);
+                            Safefree(loc_correspondence);
+
+                            if (upper_fill == 0) {
+                                FAIL2("panic: loc_correspondence[%d] is 0",
+                                      (int) (s - s_start));
+                            }
+                        }
+                        else {
+                            upper_fill = s - s0;
+                        }
                          goto reparse;
                      }
+                    else if (need_to_fold_loc) {
+                        Safefree(locfold_buf);
+                        Safefree(loc_correspondence);
+                    }
  
                      /* Here the node consists entirely of non-final multi-char
                       * folds.  (Likely it is all 'f's or all 's's.)  There's no
@@ -16774,7 +16915,7 @@ S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
      UPDATE_WARNINGS_LOC(RExC_parse);
  }
  
-Size_t PERL_STATIC_INLINE
+PERL_STATIC_INLINE Size_t
  S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
  {
      const U8 * const start = s1;
@@ -18463,14 +18604,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                   |= ANYOFL_FOLD
                   |  ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
          }
-        else if (cp_list) { /* Look to see if a 0-255 code point is in list */
-            UV start, end;
-            invlist_iterinit(cp_list);
-            if (invlist_iternext(cp_list, &start, &end) && start < 256) {
-                anyof_flags |= ANYOFL_FOLD;
-                has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
-            }
-            invlist_iterfinish(cp_list);
+        else if (cp_list && invlist_lowest(cp_list) < 256) {
+            /* If nothing is below 256, has no locale dependency; otherwise it
+             * does */
+            anyof_flags |= ANYOFL_FOLD;
+            has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
          }
      }
      else if (   DEPENDS_SEMANTICS
@@ -18701,7 +18839,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
                  /* Only try if there are no more code points in the class than
                   * in the max possible fold */
-            &&   partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1)
+            &&   inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
          {
              if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
              {
@@ -19308,14 +19446,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              U8 low_utf8[UTF8_MAXBYTES+1];
              UV highest_cp = invlist_highest(cp_list);
  
-            op = ANYOFH;
-
              /* Currently the maximum allowed code point by the system is
               * IV_MAX.  Higher ones are reserved for future internal use.  This
               * particular regnode can be used for higher ones, but we can't
               * calculate the code point of those.  IV_MAX suffices though, as
               * it will be a large first byte */
-            (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX));
+            Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
+                           - low_utf8;
  
              /* We store the lowest possible first byte of the UTF-8
               * representation, using the flags field.  This allows for quick
@@ -19324,23 +19461,48 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
               * transformation would not rule out nearly so many things */
              anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
  
+            op = ANYOFH;
+
              /* If the first UTF-8 start byte for the highest code point in the
               * range is suitably small, we may be able to get an upper bound as
               * well */
              if (highest_cp <= IV_MAX) {
                  U8 high_utf8[UTF8_MAXBYTES+1];
-
-                (void) uvchr_to_utf8(high_utf8, highest_cp);
+                Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
+                                - high_utf8;
  
                  /* If the lowest and highest are the same, we can get an exact
-                 * first byte instead of a just minimum.  We signal this with a
-                 * different regnode */
+                 * first byte instead of just a minimum, or even a sequence of
+                 * exact leading bytes.  We signal these with different
+                 * regnodes */
                  if (low_utf8[0] == high_utf8[0]) {
+                    Size_t len = find_first_differing_byte_pos(low_utf8,
+                                                               high_utf8,
+                                                       MIN(low_len, high_len));
+
+                    if (len == 1) {
  
-                    /* No need to convert to I8 for EBCDIC as this is an exact
-                     * match */
-                    anyof_flags = low_utf8[0];
-                    op = ANYOFHb;
+                        /* No need to convert to I8 for EBCDIC as this is an
+                         * exact match */
+                        anyof_flags = low_utf8[0];
+                        op = ANYOFHb;
+                    }
+                    else {
+                        op = ANYOFHs;
+                        ret = regnode_guts(pRExC_state, op,
+                                           regarglen[op] + STR_SZ(len),
+                                           "anyofhs");
+                        FILL_NODE(ret, op);
+                        ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len
+                                                                        = len;
+                        Copy(low_utf8,  /* Add the common bytes */
+                           ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
+                           len, U8);
+                        RExC_emit += NODE_SZ_STR(REGNODE_p(ret));
+                        set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
+                                                  NULL, only_utf8_locale_list);
+                        goto not_anyof;
+                    }
                  }
                  else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
                  {
@@ -19429,6 +19591,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     ? listsv
                     : NULL,
                    only_utf8_locale_list);
+    SvREFCNT_dec(cp_list);;
+    SvREFCNT_dec(only_utf8_locale_list);
      return ret;
  
    not_anyof:
@@ -19439,6 +19603,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
                                             RExC_parse - orig_parse);;
      SvREFCNT_dec(cp_list);;
+    SvREFCNT_dec(only_utf8_locale_list);
      return ret;
  }
  
@@ -19479,11 +19644,12 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
         SV *rv;
  
          if (cp_list) {
-            av_store(av, INVLIST_INDEX, cp_list);
+            av_store(av, INVLIST_INDEX, SvREFCNT_inc(cp_list));
          }
  
          if (only_utf8_locale_list) {
-            av_store(av, ONLY_LOCALE_MATCHES_INDEX, only_utf8_locale_list);
+            av_store(av, ONLY_LOCALE_MATCHES_INDEX,
+                                          SvREFCNT_inc(only_utf8_locale_list));
          }
  
          if (runtime_defns) {
@@ -19597,10 +19763,10 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
                      STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
                      STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
  
-                    av_store(av, INVLIST_INDEX, invlist);
+                    ary[INVLIST_INDEX] = invlist;
                      av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
-                                 ? ONLY_LOCALE_MATCHES_INDEX:
-                                 INVLIST_INDEX);
+                                 ? ONLY_LOCALE_MATCHES_INDEX
+                                 : INVLIST_INDEX);
                      si = NULL;
                  }
             }
@@ -20137,6 +20303,7 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state,
          scan = REGNODE_OFFSET(temp);
      }
  
+    assert(val >= scan);
      if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
          assert((UV) (val - scan) <= U32_MAX);
          ARG_SET(REGNODE_p(scan), val - scan);
@@ -20868,7 +21035,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          /* And finally the matching, closing ']' */
         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
  
-        if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
+        if (OP(o) == ANYOFHs) {
+            Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
+        }
+        else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
              U8 lowest = (OP(o) != ANYOFHr)
                           ? FLAGS(o)
                           : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
@@ -22376,7 +22546,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
         else if ( op == PLUS || op == STAR) {
             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
         }
-       else if (PL_regkind[(U8)op] == EXACT) {
+       else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) {
              /* Literal string, where present. */
             node += NODE_SZ_STR(node) - 1;
             node = NEXTOPER(node);
@@ -22406,6 +22576,17 @@ Perl_init_uniprops(pTHX)
  {
      dVAR;
  
+#ifdef DEBUGGING
+    char * dump_len_string;
+
+    dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
+    if (   ! dump_len_string
+        || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
+    {
+        PL_dump_re_max_len = 60;    /* A reasonable default */
+    }
+#endif
+
      PL_user_def_props = newHV();
  
  #ifdef USE_ITHREADS
@@ -22415,7 +22596,7 @@ Perl_init_uniprops(pTHX)
  
  #endif
  
-    /* Set up the inversion list global variables */
+    /* Set up the inversion list interpreter-level variables */
  
      PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
      PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
@@ -22457,6 +22638,7 @@ Perl_init_uniprops(pTHX)
      PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
      PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
  
+    PL_InBitmap = _new_invlist_C_array(_Perl_InBitmap_invlist);
      PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
      PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
      PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);