regen/regcharclass.pl: Generate macros for multi-char fold sequences

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 921c0e9..8cef832 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -89,7 +89,13 @@ extern const struct regexp_engine my_reg_engine;
  #include "dquote_static.c"
  #include "charclass_invlists.h"
  #include "inline_invlist.c"
-#include "utf8_strings.h"
+#include "unicode_constants.h"
+
+#ifdef HAS_ISBLANK
+#   define hasISBLANK 1
+#else
+#   define hasISBLANK 0
+#endif
  
  #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
@@ -291,8 +297,8 @@ typedef struct RExC_state_t {
      string can occur infinitely far to the right.
    
    - minlenp
-    A pointer to the minimum length of the pattern that the string 
-    was found inside. This is important as in the case of positive 
+    A pointer to the minimum number of characters of the pattern that the
+    string was found inside. This is important as in the case of positive
      lookahead or positive lookbehind we can have multiple patterns 
      involved. Consider
      
@@ -2593,9 +2599,9 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
   * these get optimized out
   *
   * If there are problematic code sequences, *min_subtract is set to the delta
- * that the minimum size of the node can be less than its actual size.  And,
- * the node type of the result is changed to reflect that it contains these
- * sequences.
+ * number of characters that the minimum size of the node can be less than its
+ * actual size.  And, the node type of the result is changed to reflect that it
+ * contains these sequences.
   *
   * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
   * and contains LATIN SMALL LETTER SHARP S
@@ -2812,15 +2818,12 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
              * U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
               *
              * This means that in case-insensitive matching (or "loose
-            * matching", as Unicode calls it), an EXACTF of length six (the
-            * UTF-8 encoded byte length of the above casefolded versions) can
-            * match a target string of length two (the byte length of UTF-8
-            * encoded U+0390 or U+03B0).  This would rather mess up the
-            * minimum length computation.  (there are other code points that
-            * also fold to these two sequences, but the delta is smaller)
+            * matching", as Unicode calls it), an EXACTF of length 3 chars can
+             * match a target string of length 1 char.  This would rather mess
+             * up the minimum length computation.
              *
              * If these sequences are found, the minimum length is decreased by
-            * four (six minus two).
+            * two.
              *
              * Similarly, 'ss' may match the single char and byte LATIN SMALL
              * LETTER SHARP S.  We decrease the min length by 1 for each
@@ -2882,7 +2885,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                             break;
                         }
                       greek_sequence:
-                       *min_subtract += 4;
+                       *min_subtract += 2;
  
                         /* This requires special handling by trie's, so change
                          * the node type to indicate this.  If EXACTFA and
@@ -3025,7 +3028,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
  {
      dVAR;
-    I32 min = 0, pars = 0, code;
+    I32 min = 0;    /* There must be at least this number of characters to match */
+    I32 pars = 0, code;
      regnode *scan = *scanp, *next;
      I32 delta = 0;
      int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
@@ -3052,9 +3056,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
  
    fake_study_recurse:
      while ( scan && OP(scan) != END && scan < last ){
-        UV min_subtract = 0;    /* How much to subtract from the minimum node
-                                   length to get a real minimum (because the
-                                   folded version may be shorter) */
+        UV min_subtract = 0;    /* How many chars to subtract from the minimum
+                                   node length to get a real minimum (because
+                                   the folded version may be shorter) */
         bool has_exactf_sharp_s = FALSE;
         /* Peephole optimizer: */
         DEBUG_STUDYDATA("Peep:", data,depth);
@@ -3419,7 +3423,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                       * trietype so we can turn them into a trie. If/when we
                                       * allow NOTHING to start a trie sequence this condition will be
                                       * required, and it isn't expensive so we leave it in for now. */
-                                    if ( trietype != NOTHING )
+                                    if ( trietype && trietype != NOTHING )
                                          make_trie( pRExC_state,
                                                  startbranch, first, cur, tail, count,
                                                  trietype, depth+1 );
@@ -3450,7 +3454,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
  
                          });
-                        if ( last ) {
+                        if ( last && trietype ) {
                              if ( trietype != NOTHING ) {
                                  /* the last branch of the sequence was part of a trie,
                                   * so we have to construct it here outside of the loop
@@ -3666,9 +3670,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
             }
             min += l - min_subtract;
-            if (min < 0) {
-                min = 0;
-            }
+            assert (min >= 0);
              delta += min_subtract;
             if (flags & SCF_DO_SUBSTR) {
                 data->pos_min += l - min_subtract;
@@ -4209,7 +4211,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 case ALNUM:
                     if (flags & SCF_DO_STCLASS_AND) {
                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
-                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
+                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
                              if (OP(scan) == ALNUMU) {
                                  for (value = 0; value < 256; value++) {
                                      if (!isWORDCHAR_L1(value)) {
@@ -4227,7 +4229,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     }
                     else {
                         if (data->start_class->flags & ANYOF_LOCALE)
-                           ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
+                           ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
  
                         /* Even if under locale, set the bits for non-locale
                          * in case it isn't a true locale-node.  This will
@@ -4250,7 +4252,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 case NALNUM:
                     if (flags & SCF_DO_STCLASS_AND) {
                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
-                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
+                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
                              if (OP(scan) == NALNUMU) {
                                  for (value = 0; value < 256; value++) {
                                      if (isWORDCHAR_L1(value)) {
@@ -4268,7 +4270,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     }
                     else {
                         if (data->start_class->flags & ANYOF_LOCALE)
-                           ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
+                           ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
  
                         /* Even if under locale, set the bits for non-locale in
                          * case it isn't a true locale-node.  This will create
@@ -5501,8 +5503,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
                 else  {
                      while (SvAMAGIC(msv)
                              && (sv = AMG_CALLunary(msv, string_amg))
-                            && sv != msv)
-                    {
+                            && sv != msv
+                            &&  !(   SvROK(msv)
+                                  && SvROK(sv)
+                                  && SvRV(msv) == SvRV(sv))
+                    ) {
                          msv = sv;
                          SvGETMAGIC(msv);
                      }
@@ -6398,18 +6403,12 @@ reStudy:
  #ifdef STUPID_PATTERN_CHECKS            
      if (RX_PRELEN(rx) == 0)
          r->extflags |= RXf_NULL;
-    if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
-        /* XXX: this should happen BEFORE we compile */
-        r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); 
-    else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
+    if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
          r->extflags |= RXf_WHITE;
      else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
          r->extflags |= RXf_START_ONLY;
  #else
-    if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
-            /* XXX: this should happen BEFORE we compile */
-            r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); 
-    else {
+    {
          regnode *first = ri->program + 1;
          U8 fop = OP(first);
  
@@ -6691,37 +6690,53 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
      char *s = NULL;
      I32 i = 0;
      I32 s1, t1;
+    I32 n = paren;
  
      PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
          
-    if (!rx->subbeg) {
-        sv_setsv(sv,&PL_sv_undef);
-        return;
-    } 
-    else               
-    if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
-        /* $` */
+    if ( (    n == RX_BUFF_IDX_CARET_PREMATCH
+           || n == RX_BUFF_IDX_CARET_FULLMATCH
+           || n == RX_BUFF_IDX_CARET_POSTMATCH
+         )
+         && !(rx->extflags & RXf_PMf_KEEPCOPY)
+    )
+        goto ret_undef;
+
+    if (!rx->subbeg)
+        goto ret_undef;
+
+    if (n == RX_BUFF_IDX_CARET_FULLMATCH)
+        /* no need to distinguish between them any more */
+        n = RX_BUFF_IDX_FULLMATCH;
+
+    if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
+        && rx->offs[0].start != -1)
+    {
+        /* $`, ${^PREMATCH} */
         i = rx->offs[0].start;
         s = rx->subbeg;
      }
      else 
-    if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
-        /* $' */
-       s = rx->subbeg + rx->offs[0].end;
-       i = rx->sublen - rx->offs[0].end;
+    if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
+        && rx->offs[0].end != -1)
+    {
+        /* $', ${^POSTMATCH} */
+       s = rx->subbeg - rx->suboffset + rx->offs[0].end;
+       i = rx->sublen + rx->suboffset - rx->offs[0].end;
      } 
      else
-    if ( 0 <= paren && paren <= (I32)rx->nparens &&
-        (s1 = rx->offs[paren].start) != -1 &&
-        (t1 = rx->offs[paren].end) != -1)
+    if ( 0 <= n && n <= (I32)rx->nparens &&
+        (s1 = rx->offs[n].start) != -1 &&
+        (t1 = rx->offs[n].end) != -1)
      {
-        /* $& $1 ... */
+        /* $&, ${^MATCH},  $1 ... */
          i = t1 - s1;
-        s = rx->subbeg + s1;
+        s = rx->subbeg + s1 - rx->suboffset;
      } else {
-        sv_setsv(sv,&PL_sv_undef);
-        return;
+        goto ret_undef;
      }          
+
+    assert(s >= rx->subbeg);
      assert(rx->sublen >= (s - rx->subbeg) + i );
      if (i >= 0) {
          const int oldtainted = PL_tainted;
@@ -6757,6 +6772,7 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
                  SvTAINTED_off(sv);
          }
      } else {
+      ret_undef:
          sv_setsv(sv,&PL_sv_undef);
          return;
      }
@@ -6787,9 +6803,13 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
      PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
  
      /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
-       switch (paren) {
-      /* $` / ${^PREMATCH} */
-      case RX_BUFF_IDX_PREMATCH:
+    switch (paren) {
+      case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
+         if (!(rx->extflags & RXf_PMf_KEEPCOPY))
+            goto warn_undef;
+        /*FALLTHROUGH*/
+
+      case RX_BUFF_IDX_PREMATCH:       /* $` */
          if (rx->offs[0].start != -1) {
                         i = rx->offs[0].start;
                         if (i > 0) {
@@ -6799,8 +6819,11 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
                         }
             }
          return 0;
-      /* $' / ${^POSTMATCH} */
-      case RX_BUFF_IDX_POSTMATCH:
+
+      case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
+         if (!(rx->extflags & RXf_PMf_KEEPCOPY))
+            goto warn_undef;
+      case RX_BUFF_IDX_POSTMATCH:       /* $' */
             if (rx->offs[0].end != -1) {
                         i = rx->sublen - rx->offs[0].end;
                         if (i > 0) {
@@ -6810,6 +6833,12 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
                         }
             }
          return 0;
+
+      case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
+         if (!(rx->extflags & RXf_PMf_KEEPCOPY))
+            goto warn_undef;
+        /*FALLTHROUGH*/
+
        /* $& / ${^MATCH}, $1, $2, ... */
        default:
             if (paren <= (I32)rx->nparens &&
@@ -6819,6 +6848,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
              i = t1 - s1;
              goto getlen;
          } else {
+          warn_undef:
              if (ckWARN(WARN_UNINITIALIZED))
                  report_uninit((const SV *)sv);
              return 0;
@@ -6826,7 +6856,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
      }
    getlen:
      if (i > 0 && RXp_MATCH_UTF8(rx)) {
-        const char * const s = rx->subbeg + s1;
+        const char * const s = rx->subbeg - rx->suboffset + s1;
          const U8 *ep;
          STRLEN el;
  
@@ -6979,9 +7009,10 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
   * list.)
   * Taking the complement (inverting) an inversion list is quite simple, if the
   * first element is 0, remove it; otherwise add a 0 element at the beginning.
- * This implementation reserves an element at the beginning of each inversion list
- * to contain 0 when the list contains 0, and contains 1 otherwise.  The actual
- * beginning of the list is either that element if 0, or the next one if 1.
+ * This implementation reserves an element at the beginning of each inversion
+ * list to contain 0 when the list contains 0, and contains 1 otherwise.  The
+ * actual beginning of the list is either that element if 0, or the next one if
+ * 1.
   *
   * More about inversion lists can be found in "Unicode Demystified"
   * Chapter 13 by Richard Gillam, published by Addison-Wesley.
@@ -9421,6 +9452,10 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
      char *parse_start;
  #endif
      const char *maxpos = NULL;
+
+    /* Save the original in case we change the emitted regop to a FAIL. */
+    regnode * const orig_emit = RExC_emit;
+
      GET_RE_DEBUG_FLAGS_DECL;
  
      PERL_ARGS_ASSERT_REGPIECE;
@@ -9467,6 +9502,23 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
             RExC_parse = next;
             nextchar(pRExC_state);
+            if (max < min) {    /* If can't match, warn and optimize to fail
+                                   unconditionally */
+                if (SIZE_ONLY) {
+                    ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
+
+                    /* We can't back off the size because we have to reserve
+                     * enough space for all the things we are about to throw
+                     * away, but we can shrink it by the amount we are about
+                     * to re-use here */
+                    RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
+                }
+                else {
+                    RExC_emit = orig_emit;
+                }
+                ret = reg_node(pRExC_state, OPFAIL);
+                return ret;
+            }
  
         do_curly:
             if ((flags&SIMPLE)) {
@@ -9504,8 +9556,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 *flagp = WORST;
             if (max > 0)
                 *flagp |= HASWIDTH;
-           if (max < min)
-               vFAIL("Can't do {n,m} with n > m");
             if (!SIZE_ONLY) {
                 ARG1_SET(ret, (U16)min);
                 ARG2_SET(ret, (U16)max);
@@ -11070,7 +11120,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
                     switch (skip) {
                     case 4:
                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
-                           namedclass = ANYOF_ALNUM;
+                           namedclass = ANYOF_WORDCHAR;
                         break;
                     case 5:
                         /* Names all of length 5.  */
@@ -11511,8 +11561,8 @@ parseit:
              * A similar issue a little bit later when switching on
              * namedclass. --jhi */
             switch ((I32)value) {
-           case 'w':   namedclass = ANYOF_ALNUM;       break;
-           case 'W':   namedclass = ANYOF_NALNUM;      break;
+           case 'w':   namedclass = ANYOF_WORDCHAR;    break;
+           case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
             case 's':   namedclass = ANYOF_SPACE;       break;
             case 'S':   namedclass = ANYOF_NSPACE;      break;
             case 'd':   namedclass = ANYOF_DIGIT;       break;
@@ -11806,32 +11856,83 @@ parseit:
                          runtime_posix_matches_above_Unicode);
                     break;
                 case ANYOF_ASCII:
+#ifdef HAS_ISASCII
                     if (LOC) {
                         ANYOF_CLASS_SET(ret, namedclass);
                     }
-                    else {
+                    else
+#endif  /* Not isascii(); just use the hard-coded definition for it */
                          _invlist_union(posixes, PL_ASCII, &posixes);
-                    }
                     break;
                 case ANYOF_NASCII:
+#ifdef HAS_ISASCII
                     if (LOC) {
                         ANYOF_CLASS_SET(ret, namedclass);
                     }
                      else {
+#endif
                          _invlist_union_complement_2nd(posixes,
                                                      PL_ASCII, &posixes);
                          if (DEPENDS_SEMANTICS) {
                              ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
                          }
+#ifdef HAS_ISASCII
                      }
+#endif
                     break;
                 case ANYOF_BLANK:
-                    DO_POSIX(ret, namedclass, posixes,
+                    if (hasISBLANK || ! LOC) {
+                        DO_POSIX(ret, namedclass, posixes,
                                              PL_PosixBlank, PL_XPosixBlank);
+                    }
+                    else { /* There is no isblank() and we are in locale:  We
+                              use the ASCII range and the above-Latin1 range
+                              code points */
+                        SV* scratch_list = NULL;
+
+                        /* Include all above-Latin1 blanks */
+                        _invlist_intersection(PL_AboveLatin1,
+                                              PL_XPosixBlank,
+                                              &scratch_list);
+                        /* Add it to the running total of posix classes */
+                        if (! posixes) {
+                            posixes = scratch_list;
+                        }
+                        else {
+                            _invlist_union(posixes, scratch_list, &posixes);
+                            SvREFCNT_dec(scratch_list);
+                        }
+                        /* Add the ASCII-range blanks to the running total. */
+                        _invlist_union(posixes, PL_PosixBlank, &posixes);
+                    }
                     break;
                 case ANYOF_NBLANK:
-                    DO_N_POSIX(ret, namedclass, posixes,
-                                            PL_PosixBlank, PL_XPosixBlank);
+                    if (hasISBLANK || ! LOC) {
+                        DO_N_POSIX(ret, namedclass, posixes,
+                                                PL_PosixBlank, PL_XPosixBlank);
+                    }
+                    else { /* There is no isblank() and we are in locale */
+                        SV* scratch_list = NULL;
+
+                        /* Include all above-Latin1 non-blanks */
+                        _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list);
+
+                        /* Add them to the running total of posix classes */
+                        _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list);
+                        if (! posixes) {
+                            posixes = scratch_list;
+                        }
+                        else {
+                            _invlist_union(posixes, scratch_list, &posixes);
+                            SvREFCNT_dec(scratch_list);
+                        }
+
+                        /* Get the list of all non-ASCII-blanks in Latin 1, and
+                         * add them to the running total */
+                        _invlist_subtract(PL_Latin1, PL_PosixBlank, &scratch_list);
+                        _invlist_union(posixes, scratch_list, &posixes);
+                        SvREFCNT_dec(scratch_list);
+                    }
                     break;
                 case ANYOF_CNTRL:
                      DO_POSIX(ret, namedclass, posixes,
@@ -11967,11 +12068,11 @@ parseit:
                     }
                     break;
                 }
-               case ANYOF_ALNUM:   /* Really is 'Word' */
+               case ANYOF_WORDCHAR:
                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                              PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
                     break;
-               case ANYOF_NALNUM:
+               case ANYOF_NWORDCHAR:
                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                              PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
                              runtime_posix_matches_above_Unicode);
@@ -12102,10 +12203,10 @@ parseit:
                   * modifier to the regex.  We first calculate the base node
                   * type, and if it should be inverted */
  
-                case ANYOF_NALNUM:
+                case ANYOF_NWORDCHAR:
                      invert = ! invert;
                      /* FALLTHROUGH */
-                case ANYOF_ALNUM:
+                case ANYOF_WORDCHAR:
                      op = ALNUM;
                      goto join_charset_classes;
  
@@ -14429,6 +14530,8 @@ Perl_save_re_context(pTHX)
  
      PL_reg_oldsaved = NULL;
      PL_reg_oldsavedlen = 0;
+    PL_reg_oldsavedoffset = 0;
+    PL_reg_oldsavedcoffset = 0;
      PL_reg_maxiter = 0;
      PL_reg_leftiter = 0;
      PL_reg_poscache = NULL;