regcomp.c: Stop regexp-to-pv assignment from leaking

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index bad6b42..d524a99 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -231,10 +231,12 @@ typedef struct RExC_state_t {
  #define        WORST           0       /* Worst case. */
  #define        HASWIDTH        0x01    /* Known to match non-null strings. */
  
-/* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single
- * character.  Note that this is not the same thing as REGNODE_SIMPLE */
+/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
+ * character.  (There needs to be a case: in the switch statement in regexec.c
+ * for any node marked SIMPLE.)  Note that this is not the same thing as
+ * REGNODE_SIMPLE */
  #define        SIMPLE          0x02
-#define        SPSTART         0x04    /* Starts with * or +. */
+#define        SPSTART         0x04    /* Starts with * or + */
  #define TRYAGAIN       0x08    /* Weeded out a declaration. */
  #define POSTPONED      0x10    /* (?1),(?&name), (??{...}) or similar */
  
@@ -758,7 +760,7 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c
  
      ANYOF_BITMAP_SETALL(cl);
      cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
-               |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
+               |ANYOF_NON_UTF8_LATIN1_ALL;
  
      /* If any portion of the regex is to operate under locale rules,
       * initialization includes it.  The reason this isn't done for all regexes
@@ -769,7 +771,7 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c
       * necessary. */
      if (RExC_contains_locale) {
         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
-       cl->flags |= ANYOF_LOCALE;
+       cl->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
      }
      else {
         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
@@ -824,8 +826,8 @@ S_cl_and(struct regnode_charclass_class *cl,
      if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
-       && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
-       && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
+       && !(and_with->flags & ANYOF_LOC_FOLD)
+       && !(cl->flags & ANYOF_LOC_FOLD)) {
         int i;
  
         if (and_with->flags & ANYOF_INVERT)
@@ -958,8 +960,8 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
          */
         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
-            && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
-            && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
+            && !(or_with->flags & ANYOF_LOC_FOLD)
+            && !(cl->flags & ANYOF_LOC_FOLD) ) {
             int i;
  
             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
@@ -985,8 +987,8 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
      } else {    /* 'or_with' is not inverted */
         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
-            && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
-                || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
+            && (!(or_with->flags & ANYOF_LOC_FOLD)
+                || (cl->flags & ANYOF_LOC_FOLD)) ) {
             int i;
  
             /* OR char bitmap and class bitmap separately */
@@ -2866,7 +2868,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
              /* Here, the pattern is not UTF-8.  Look for the multi-char folds
               * that are all ASCII.  As in the above case, EXACTFL and EXACTFA
               * nodes can't have multi-char folds to this range (and there are
-             * no existing ones to the upper latin1 range).  In the EXACTF
+             * no existing ones in the upper latin1 range).  In the EXACTF
               * case we look also for the sharp s, which can be in the final
               * position.  Otherwise we can stop looking 1 byte earlier because
               * have to find at least two characters for a multi-fold */
@@ -2883,7 +2885,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
              const U8 s_masked = 's' & S_or_s_mask;
  
             while (s < upper) {
-                int len = is_MULTI_CHAR_FOLD_low_safe(s, s_end);
+                int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
                  if (! len) {    /* Not a multi-char fold. */
                      if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
                      {
@@ -3583,7 +3585,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 if (uc >= 0x100 ||
                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
-                   && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
+                   && (!(data->start_class->flags & ANYOF_LOC_FOLD)
                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
                      )
                 {
@@ -3669,12 +3671,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 if (compat) {
                     ANYOF_BITMAP_SET(data->start_class, uc);
                     data->start_class->flags &= ~ANYOF_EOS;
-                   data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
                     if (OP(scan) == EXACTFL) {
                         /* XXX This set is probably no longer necessary, and
                          * probably wrong as LOCALE now is on in the initial
                          * state */
-                       data->start_class->flags |= ANYOF_LOCALE;
+                       data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
                     }
                     else {
  
@@ -3709,7 +3710,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 }
             }
             else if (flags & SCF_DO_STCLASS_OR) {
-               if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
+               if (data->start_class->flags & ANYOF_LOC_FOLD) {
                     /* false positive possible if the class is case-folded.
                        Assume that the locale settings are the same... */
                     if (uc < 0x100) {
@@ -9801,6 +9802,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I
          if (in_char_class && has_multiple_chars) {
             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
          }
+
          RExC_parse = endbrace + 1;
      }
      else if (! node_p || ! has_multiple_chars) {
@@ -11440,15 +11442,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
      dVAR;
      UV nextvalue;
-    UV prevvalue, save_prevvalue = OOB_UNICODE;
+    UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
      IV range = 0;
-    UV value, save_value = 0;
+    UV value = OOB_UNICODE, save_value = OOB_UNICODE;
      regnode *ret;
      STRLEN numlen;
      IV namedclass = OOB_NAMEDCLASS;
      char *rangebegin = NULL;
      bool need_class = 0;
-    bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
      SV *listsv = NULL;
      STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
                                       than just initialized.  */
@@ -11509,27 +11510,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
      /* Assume we are going to generate an ANYOF node. */
      ret = reganode(pRExC_state, ANYOF, 0);
  
-
      if (!SIZE_ONLY) {
         ANYOF_FLAGS(ret) = 0;
      }
  
      if (UCHARAT(RExC_parse) == '^') {  /* Complement of range. */
         RExC_parse++;
-        if (! RExC_in_multi_char_class) {
-            invert = TRUE;
-            RExC_naughty++;
-
-            /* We have decided to not allow multi-char folds in inverted
-             * character classes, due to the confusion that can happen,
-             * especially with classes that are designed for a non-Unicode
-             * world:  You have the peculiar case that:
-                "s s" =~ /^[^\xDF]+$/i => Y
-                "ss"  =~ /^[^\xDF]+$/i => N
-            *
-            * See [perl #89750] */
-            allow_full_fold = FALSE;
-        }
+        invert = TRUE;
+        RExC_naughty++;
      }
  
      if (SIZE_ONLY) {
@@ -11948,10 +11936,12 @@ parseit:
                          SV* scratch_list = NULL;
  
                          /* Include all above-Latin1 non-blanks */
-                        _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list);
+                        _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
+                                          &scratch_list);
  
                          /* Add them to the running total of posix classes */
-                        _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list);
+                        _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
+                                          &scratch_list);
                          if (! posixes) {
                              posixes = scratch_list;
                          }
@@ -11962,7 +11952,8 @@ parseit:
  
                          /* Get the list of all non-ASCII-blanks in Latin 1, and
                           * add them to the running total */
-                        _invlist_subtract(PL_Latin1, PL_PosixBlank, &scratch_list);
+                        _invlist_subtract(PL_Latin1, PL_PosixBlank,
+                                          &scratch_list);
                          _invlist_union(posixes, scratch_list, &posixes);
                          SvREFCNT_dec(scratch_list);
                      }
@@ -12189,7 +12180,11 @@ parseit:
           * For single-valued non-inverted ranges, we consider the possibility
           * of multi-char folds.  (We made a conscious decision to not do this
           * for the other cases because it can often lead to non-intuitive
-         * results) */
+         * results.  For example, you have the peculiar case that:
+         *  "s s" =~ /^[^\xDF]+$/i => Y
+         *  "ss"  =~ /^[^\xDF]+$/i => N
+         *
+         * See [perl #89750] */
          if (FOLD && ! invert && value == prevvalue) {
              if (value == LATIN_SMALL_LETTER_SHARP_S
                  || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
@@ -12225,7 +12220,8 @@ parseit:
                      if (! RExC_in_multi_char_class) {
                          AV** this_array_ptr;
                          AV* this_array;
-                        STRLEN cp_count = utf8_length(foldbuf, foldbuf + foldlen);
+                        STRLEN cp_count = utf8_length(foldbuf,
+                                                      foldbuf + foldlen);
                          SV* multi_fold = sv_2mortal(newSVpvn("", 0));
  
                          Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
@@ -12250,17 +12246,20 @@ parseit:
                           * ok.  This makes the test for the ligature 'ffi' come
                           * before the test for 'ff' */
                          if (av_exists(multi_char_matches, cp_count)) {
-                            this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
+                            this_array_ptr = (AV**) av_fetch(multi_char_matches,
+                                                             cp_count, FALSE);
                              this_array = *this_array_ptr;
                          }
                          else {
                              this_array = newAV();
-                            av_store(multi_char_matches, cp_count, (SV*) this_array);
+                            av_store(multi_char_matches, cp_count,
+                                     (SV*) this_array);
                          }
                          av_push(this_array, multi_fold);
                      }
  
-                    /* This element should not be processed further in this class */
+                    /* This element should not be processed further in this
+                     * class */
                      element_count--;
                      value = save_value;
                      prevvalue = save_prevvalue;
@@ -12328,8 +12327,11 @@ parseit:
                  AV** this_array_ptr;
                  SV* this_sequence;
  
-                this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
-                while ((this_sequence = av_pop(*this_array_ptr)) != &PL_sv_undef) {
+                this_array_ptr = (AV**) av_fetch(multi_char_matches,
+                                                 cp_count, FALSE);
+                while ((this_sequence = av_pop(*this_array_ptr)) !=
+                                                                &PL_sv_undef)
+                {
                      if (! first_time) {
                          sv_catpv(substitute_parse, "|");
                      }
@@ -12365,7 +12367,7 @@ parseit:
  
         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
  
-       *flagp |= reg_flags&(HASWIDTH|SPSTART|POSTPONED);
+       *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
  
         RExC_parse = save_parse;
         RExC_end = save_end;
@@ -12609,10 +12611,9 @@ parseit:
                       * to force that */
                      if (! PL_utf8_tofold) {
                          U8 dummy[UTF8_MAXBYTES+1];
-                        STRLEN dummy_len;
  
                          /* This string is just a short named one above \xff */
-                        to_utf8_fold((U8*) HYPHEN_UTF8, dummy, &dummy_len);
+                        to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
                          assert(PL_utf8_tofold); /* Verify that worked */
                      }
                      PL_utf8_foldclosures =
@@ -12643,7 +12644,6 @@ parseit:
  
                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
                 STRLEN foldlen;
-                UV f;
                  SV** listp;
  
                  if (j < 256) {
@@ -12743,54 +12743,53 @@ parseit:
                   * hard-coded for it.  First, get its fold.  This is the simple
                   * fold, as the multi-character folds have been handled earlier
                   * and separated out */
-               f = _to_uni_fold_flags(j, foldbuf, &foldlen,
-                                        ((LOC)
-                                        ? FOLD_FLAGS_LOCALE
-                                        : (ASCII_FOLD_RESTRICTED)
-                                            ? FOLD_FLAGS_NOMIX_ASCII
-                                            : 0));
-
-                    /* Single character fold of above Latin1.  Add everything
-                     * in its fold closure to the list that this node should
-                     * match */
-                   /* The fold closures data structure is a hash with the keys
-                    * being every character that is folded to, like 'k', and
-                    * the values each an array of everything that folds to its
-                    * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
-                   if ((listp = hv_fetch(PL_utf8_foldclosures,
-                                   (char *) foldbuf, foldlen, FALSE)))
-                   {
-                       AV* list = (AV*) *listp;
-                       IV k;
-                       for (k = 0; k <= av_len(list); k++) {
-                           SV** c_p = av_fetch(list, k, FALSE);
-                           UV c;
-                           if (c_p == NULL) {
-                               Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
-                           }
-                           c = SvUV(*c_p);
-
-                           /* /aa doesn't allow folds between ASCII and non-;
-                            * /l doesn't allow them between above and below
-                            * 256 */
-                           if ((ASCII_FOLD_RESTRICTED
-                                      && (isASCII(c) != isASCII(j)))
-                               || (LOC && ((c < 256) != (j < 256))))
-                           {
-                               continue;
-                           }
+               _to_uni_fold_flags(j, foldbuf, &foldlen,
+                                               ((LOC)
+                                               ? FOLD_FLAGS_LOCALE
+                                               : (ASCII_FOLD_RESTRICTED)
+                                                  ? FOLD_FLAGS_NOMIX_ASCII
+                                                  : 0));
+
+                /* Single character fold of above Latin1.  Add everything in
+                 * its fold closure to the list that this node should match.
+                 * The fold closures data structure is a hash with the keys
+                 * being the UTF-8 of every character that is folded to, like
+                 * 'k', and the values each an array of all code points that
+                 * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
+                 * Multi-character folds are not included */
+                if ((listp = hv_fetch(PL_utf8_foldclosures,
+                                      (char *) foldbuf, foldlen, FALSE)))
+                {
+                    AV* list = (AV*) *listp;
+                    IV k;
+                    for (k = 0; k <= av_len(list); k++) {
+                        SV** c_p = av_fetch(list, k, FALSE);
+                        UV c;
+                        if (c_p == NULL) {
+                            Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+                        }
+                        c = SvUV(*c_p);
  
-                            /* Folds involving non-ascii Latin1 characters
-                             * under /d are added to a separate list */
-                           if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
-                            {
-                               cp_list = add_cp_to_invlist(cp_list, c);
-                            }
-                            else {
-                              depends_list = add_cp_to_invlist(depends_list, c);
-                           }
-                       }
-                   }
+                        /* /aa doesn't allow folds between ASCII and non-; /l
+                         * doesn't allow them between above and below 256 */
+                        if ((ASCII_FOLD_RESTRICTED
+                                  && (isASCII(c) != isASCII(j)))
+                            || (LOC && ((c < 256) != (j < 256))))
+                        {
+                            continue;
+                        }
+
+                        /* Folds involving non-ascii Latin1 characters
+                         * under /d are added to a separate list */
+                        if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
+                        {
+                            cp_list = add_cp_to_invlist(cp_list, c);
+                        }
+                        else {
+                          depends_list = add_cp_to_invlist(depends_list, c);
+                        }
+                    }
+                }
              }
         }
         SvREFCNT_dec(fold_intersection);
@@ -12894,8 +12893,9 @@ parseit:
       * folded until runtime */
  
      /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
-     * at compile time.  Besides not inverting folded locale now, we can't invert
-     * if there are things such as \w, which aren't known until runtime */
+     * at compile time.  Besides not inverting folded locale now, we can't
+     * invert if there are things such as \w, which aren't known until runtime
+     * */
      if (invert
          && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
         && ! depends_list
@@ -12919,7 +12919,7 @@ parseit:
       * fetching) */
      if (FOLD && LOC)
      {
-       ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
+       ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
      }
  
      /* Some character classes are equivalent to other nodes.  Such nodes take
@@ -13920,7 +13920,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
  
         if (flags & ANYOF_LOCALE)
             sv_catpvs(sv, "{loc}");
-       if (flags & ANYOF_LOC_NONBITMAP_FOLD)
+       if (flags & ANYOF_LOC_FOLD)
             sv_catpvs(sv, "{i}");
         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
         if (flags & ANYOF_INVERT)
@@ -14178,6 +14178,9 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
  
      if (!ret_x)
         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
+    /* This ensures that SvTHINKFIRST(sv) is true, and hence that
+       sv_force_normal(sv) is called.  */
+    SvFAKE_on(ret_x);
      ret = (struct regexp *)SvANY(ret_x);
      
      (void)ReREFCNT_inc(rx);
@@ -14185,13 +14188,12 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
         by pointing directly at the buffer, but flagging that the allocated
         space in the copy is zero. As we've just done a struct copy, it's now
         a case of zero-ing that, rather than copying the current length.  */
+    if (SvPOKp(ret_x)) SvPV_free(ret_x);
      SvPV_set(ret_x, RX_WRAPPED(rx));
      SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
      memcpy(&(ret->xpv_cur), &(r->xpv_cur),
            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
      SvLEN_set(ret_x, 0);
-    SvSTASH_set(ret_x, NULL);
-    SvMAGIC_set(ret_x, NULL);
      if (r->offs) {
          const I32 npar = r->nparens+1;
          Newx(ret->offs, npar, regexp_paren_pair);