This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Stop regexp-to-pv assignemnt from leaking
[perl5.git] / regcomp.c
index bad6b42..d524a99 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -231,10 +231,12 @@ typedef struct RExC_state_t {
 #define        WORST           0       /* Worst case. */
 #define        HASWIDTH        0x01    /* Known to match non-null strings. */
 
-/* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single
- * character.  Note that this is not the same thing as REGNODE_SIMPLE */
+/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
+ * character.  (There needs to be a case: in the switch statement in regexec.c
+ * for any node marked SIMPLE.)  Note that this is not the same thing as
+ * REGNODE_SIMPLE */
 #define        SIMPLE          0x02
-#define        SPSTART         0x04    /* Starts with * or +. */
+#define        SPSTART         0x04    /* Starts with * or + */
 #define TRYAGAIN       0x08    /* Weeded out a declaration. */
 #define POSTPONED      0x10    /* (?1),(?&name), (??{...}) or similar */
 
@@ -758,7 +760,7 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c
 
     ANYOF_BITMAP_SETALL(cl);
     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
-               |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
+               |ANYOF_NON_UTF8_LATIN1_ALL;
 
     /* If any portion of the regex is to operate under locale rules,
      * initialization includes it.  The reason this isn't done for all regexes
@@ -769,7 +771,7 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c
      * necessary. */
     if (RExC_contains_locale) {
        ANYOF_CLASS_SETALL(cl);     /* /l uses class */
-       cl->flags |= ANYOF_LOCALE;
+       cl->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
     }
     else {
        ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
@@ -824,8 +826,8 @@ S_cl_and(struct regnode_charclass_class *cl,
     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
        && !(ANYOF_CLASS_TEST_ANY_SET(cl))
        && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
-       && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
-       && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
+       && !(and_with->flags & ANYOF_LOC_FOLD)
+       && !(cl->flags & ANYOF_LOC_FOLD)) {
        int i;
 
        if (and_with->flags & ANYOF_INVERT)
@@ -958,8 +960,8 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
         *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
         */
        else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
-            && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
-            && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
+            && !(or_with->flags & ANYOF_LOC_FOLD)
+            && !(cl->flags & ANYOF_LOC_FOLD) ) {
            int i;
 
            for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
@@ -985,8 +987,8 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
     } else {    /* 'or_with' is not inverted */
        /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
        if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
-            && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
-                || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
+            && (!(or_with->flags & ANYOF_LOC_FOLD)
+                || (cl->flags & ANYOF_LOC_FOLD)) ) {
            int i;
 
            /* OR char bitmap and class bitmap separately */
@@ -2866,7 +2868,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
             /* Here, the pattern is not UTF-8.  Look for the multi-char folds
              * that are all ASCII.  As in the above case, EXACTFL and EXACTFA
              * nodes can't have multi-char folds to this range (and there are
-             * no existing ones to the upper latin1 range).  In the EXACTF
+             * no existing ones in the upper latin1 range).  In the EXACTF
              * case we look also for the sharp s, which can be in the final
              * position.  Otherwise we can stop looking 1 byte earlier because
              * have to find at least two characters for a multi-fold */
@@ -2883,7 +2885,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
             const U8 s_masked = 's' & S_or_s_mask;
 
            while (s < upper) {
-                int len = is_MULTI_CHAR_FOLD_low_safe(s, s_end);
+                int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
                 if (! len) {    /* Not a multi-char fold. */
                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
                     {
@@ -3583,7 +3585,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                if (uc >= 0x100 ||
                    (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
                    && !ANYOF_BITMAP_TEST(data->start_class, uc)
-                   && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
+                   && (!(data->start_class->flags & ANYOF_LOC_FOLD)
                        || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
                     )
                {
@@ -3669,12 +3671,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                if (compat) {
                    ANYOF_BITMAP_SET(data->start_class, uc);
                    data->start_class->flags &= ~ANYOF_EOS;
-                   data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
                    if (OP(scan) == EXACTFL) {
                        /* XXX This set is probably no longer necessary, and
                         * probably wrong as LOCALE now is on in the initial
                         * state */
-                       data->start_class->flags |= ANYOF_LOCALE;
+                       data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
                    }
                    else {
 
@@ -3709,7 +3710,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                }
            }
            else if (flags & SCF_DO_STCLASS_OR) {
-               if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
+               if (data->start_class->flags & ANYOF_LOC_FOLD) {
                    /* false positive possible if the class is case-folded.
                       Assume that the locale settings are the same... */
                    if (uc < 0x100) {
@@ -9801,6 +9802,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I
         if (in_char_class && has_multiple_chars) {
            ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
         }
+
         RExC_parse = endbrace + 1;
     }
     else if (! node_p || ! has_multiple_chars) {
@@ -11440,15 +11442,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
     dVAR;
     UV nextvalue;
-    UV prevvalue, save_prevvalue = OOB_UNICODE;
+    UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
     IV range = 0;
-    UV value, save_value = 0;
+    UV value = OOB_UNICODE, save_value = OOB_UNICODE;
     regnode *ret;
     STRLEN numlen;
     IV namedclass = OOB_NAMEDCLASS;
     char *rangebegin = NULL;
     bool need_class = 0;
-    bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
     SV *listsv = NULL;
     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
                                      than just initialized.  */
@@ -11509,27 +11510,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     /* Assume we are going to generate an ANYOF node. */
     ret = reganode(pRExC_state, ANYOF, 0);
 
-
     if (!SIZE_ONLY) {
        ANYOF_FLAGS(ret) = 0;
     }
 
     if (UCHARAT(RExC_parse) == '^') {  /* Complement of range. */
        RExC_parse++;
-        if (! RExC_in_multi_char_class) {
-            invert = TRUE;
-            RExC_naughty++;
-
-            /* We have decided to not allow multi-char folds in inverted
-             * character classes, due to the confusion that can happen,
-             * especially with classes that are designed for a non-Unicode
-             * world:  You have the peculiar case that:
-                "s s" =~ /^[^\xDF]+$/i => Y
-                "ss"  =~ /^[^\xDF]+$/i => N
-            *
-            * See [perl #89750] */
-            allow_full_fold = FALSE;
-        }
+        invert = TRUE;
+        RExC_naughty++;
     }
 
     if (SIZE_ONLY) {
@@ -11948,10 +11936,12 @@ parseit:
                         SV* scratch_list = NULL;
 
                         /* Include all above-Latin1 non-blanks */
-                        _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list);
+                        _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
+                                          &scratch_list);
 
                         /* Add them to the running total of posix classes */
-                        _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list);
+                        _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
+                                          &scratch_list);
                         if (! posixes) {
                             posixes = scratch_list;
                         }
@@ -11962,7 +11952,8 @@ parseit:
 
                         /* Get the list of all non-ASCII-blanks in Latin 1, and
                          * add them to the running total */
-                        _invlist_subtract(PL_Latin1, PL_PosixBlank, &scratch_list);
+                        _invlist_subtract(PL_Latin1, PL_PosixBlank,
+                                          &scratch_list);
                         _invlist_union(posixes, scratch_list, &posixes);
                         SvREFCNT_dec(scratch_list);
                     }
@@ -12189,7 +12180,11 @@ parseit:
          * For single-valued non-inverted ranges, we consider the possibility
          * of multi-char folds.  (We made a conscious decision to not do this
          * for the other cases because it can often lead to non-intuitive
-         * results) */
+         * results.  For example, you have the peculiar case that:
+         *  "s s" =~ /^[^\xDF]+$/i => Y
+         *  "ss"  =~ /^[^\xDF]+$/i => N
+         *
+         * See [perl #89750] */
         if (FOLD && ! invert && value == prevvalue) {
             if (value == LATIN_SMALL_LETTER_SHARP_S
                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
@@ -12225,7 +12220,8 @@ parseit:
                     if (! RExC_in_multi_char_class) {
                         AV** this_array_ptr;
                         AV* this_array;
-                        STRLEN cp_count = utf8_length(foldbuf, foldbuf + foldlen);
+                        STRLEN cp_count = utf8_length(foldbuf,
+                                                      foldbuf + foldlen);
                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
 
                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
@@ -12250,17 +12246,20 @@ parseit:
                          * ok.  This makes the test for the ligature 'ffi' come
                          * before the test for 'ff' */
                         if (av_exists(multi_char_matches, cp_count)) {
-                            this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
+                            this_array_ptr = (AV**) av_fetch(multi_char_matches,
+                                                             cp_count, FALSE);
                             this_array = *this_array_ptr;
                         }
                         else {
                             this_array = newAV();
-                            av_store(multi_char_matches, cp_count, (SV*) this_array);
+                            av_store(multi_char_matches, cp_count,
+                                     (SV*) this_array);
                         }
                         av_push(this_array, multi_fold);
                     }
 
-                    /* This element should not be processed further in this class */
+                    /* This element should not be processed further in this
+                     * class */
                     element_count--;
                     value = save_value;
                     prevvalue = save_prevvalue;
@@ -12328,8 +12327,11 @@ parseit:
                 AV** this_array_ptr;
                 SV* this_sequence;
 
-                this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
-                while ((this_sequence = av_pop(*this_array_ptr)) != &PL_sv_undef) {
+                this_array_ptr = (AV**) av_fetch(multi_char_matches,
+                                                 cp_count, FALSE);
+                while ((this_sequence = av_pop(*this_array_ptr)) !=
+                                                                &PL_sv_undef)
+                {
                     if (! first_time) {
                         sv_catpv(substitute_parse, "|");
                     }
@@ -12365,7 +12367,7 @@ parseit:
 
        ret = reg(pRExC_state, 1, &reg_flags, depth+1);
 
-       *flagp |= reg_flags&(HASWIDTH|SPSTART|POSTPONED);
+       *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
 
        RExC_parse = save_parse;
        RExC_end = save_end;
@@ -12609,10 +12611,9 @@ parseit:
                      * to force that */
                     if (! PL_utf8_tofold) {
                         U8 dummy[UTF8_MAXBYTES+1];
-                        STRLEN dummy_len;
 
                         /* This string is just a short named one above \xff */
-                        to_utf8_fold((U8*) HYPHEN_UTF8, dummy, &dummy_len);
+                        to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
                         assert(PL_utf8_tofold); /* Verify that worked */
                     }
                     PL_utf8_foldclosures =
@@ -12643,7 +12644,6 @@ parseit:
 
                U8 foldbuf[UTF8_MAXBYTES_CASE+1];
                STRLEN foldlen;
-                UV f;
                 SV** listp;
 
                 if (j < 256) {
@@ -12743,54 +12743,53 @@ parseit:
                  * hard-coded for it.  First, get its fold.  This is the simple
                  * fold, as the multi-character folds have been handled earlier
                  * and separated out */
-               f = _to_uni_fold_flags(j, foldbuf, &foldlen,
-                                        ((LOC)
-                                        ? FOLD_FLAGS_LOCALE
-                                        : (ASCII_FOLD_RESTRICTED)
-                                            ? FOLD_FLAGS_NOMIX_ASCII
-                                            : 0));
-
-                    /* Single character fold of above Latin1.  Add everything
-                     * in its fold closure to the list that this node should
-                     * match */
-                   /* The fold closures data structure is a hash with the keys
-                    * being every character that is folded to, like 'k', and
-                    * the values each an array of everything that folds to its
-                    * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
-                   if ((listp = hv_fetch(PL_utf8_foldclosures,
-                                   (char *) foldbuf, foldlen, FALSE)))
-                   {
-                       AV* list = (AV*) *listp;
-                       IV k;
-                       for (k = 0; k <= av_len(list); k++) {
-                           SV** c_p = av_fetch(list, k, FALSE);
-                           UV c;
-                           if (c_p == NULL) {
-                               Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
-                           }
-                           c = SvUV(*c_p);
-
-                           /* /aa doesn't allow folds between ASCII and non-;
-                            * /l doesn't allow them between above and below
-                            * 256 */
-                           if ((ASCII_FOLD_RESTRICTED
-                                      && (isASCII(c) != isASCII(j)))
-                               || (LOC && ((c < 256) != (j < 256))))
-                           {
-                               continue;
-                           }
+               _to_uni_fold_flags(j, foldbuf, &foldlen,
+                                               ((LOC)
+                                               ? FOLD_FLAGS_LOCALE
+                                               : (ASCII_FOLD_RESTRICTED)
+                                                  ? FOLD_FLAGS_NOMIX_ASCII
+                                                  : 0));
+
+                /* Single character fold of above Latin1.  Add everything in
+                 * its fold closure to the list that this node should match.
+                 * The fold closures data structure is a hash with the keys
+                 * being the UTF-8 of every character that is folded to, like
+                 * 'k', and the values each an array of all code points that
+                 * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
+                 * Multi-character folds are not included */
+                if ((listp = hv_fetch(PL_utf8_foldclosures,
+                                      (char *) foldbuf, foldlen, FALSE)))
+                {
+                    AV* list = (AV*) *listp;
+                    IV k;
+                    for (k = 0; k <= av_len(list); k++) {
+                        SV** c_p = av_fetch(list, k, FALSE);
+                        UV c;
+                        if (c_p == NULL) {
+                            Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+                        }
+                        c = SvUV(*c_p);
 
-                            /* Folds involving non-ascii Latin1 characters
-                             * under /d are added to a separate list */
-                           if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
-                            {
-                               cp_list = add_cp_to_invlist(cp_list, c);
-                            }
-                            else {
-                              depends_list = add_cp_to_invlist(depends_list, c);
-                           }
-                       }
-                   }
+                        /* /aa doesn't allow folds between ASCII and non-; /l
+                         * doesn't allow them between above and below 256 */
+                        if ((ASCII_FOLD_RESTRICTED
+                                  && (isASCII(c) != isASCII(j)))
+                            || (LOC && ((c < 256) != (j < 256))))
+                        {
+                            continue;
+                        }
+
+                        /* Folds involving non-ascii Latin1 characters
+                         * under /d are added to a separate list */
+                        if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
+                        {
+                            cp_list = add_cp_to_invlist(cp_list, c);
+                        }
+                        else {
+                          depends_list = add_cp_to_invlist(depends_list, c);
+                        }
+                    }
+                }
             }
        }
        SvREFCNT_dec(fold_intersection);
@@ -12894,8 +12893,9 @@ parseit:
      * folded until runtime */
 
     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
-     * at compile time.  Besides not inverting folded locale now, we can't invert
-     * if there are things such as \w, which aren't known until runtime */
+     * at compile time.  Besides not inverting folded locale now, we can't
+     * invert if there are things such as \w, which aren't known until runtime
+     * */
     if (invert
         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
        && ! depends_list
@@ -12919,7 +12919,7 @@ parseit:
      * fetching) */
     if (FOLD && LOC)
     {
-       ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
+       ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
     }
 
     /* Some character classes are equivalent to other nodes.  Such nodes take
@@ -13920,7 +13920,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
 
        if (flags & ANYOF_LOCALE)
            sv_catpvs(sv, "{loc}");
-       if (flags & ANYOF_LOC_NONBITMAP_FOLD)
+       if (flags & ANYOF_LOC_FOLD)
            sv_catpvs(sv, "{i}");
        Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
        if (flags & ANYOF_INVERT)
@@ -14178,6 +14178,9 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
 
     if (!ret_x)
        ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
+    /* This ensures that SvTHINKFIRST(sv) is true, and hence that
+       sv_force_normal(sv) is called.  */
+    SvFAKE_on(ret_x);
     ret = (struct regexp *)SvANY(ret_x);
     
     (void)ReREFCNT_inc(rx);
@@ -14185,13 +14188,12 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
        by pointing directly at the buffer, but flagging that the allocated
        space in the copy is zero. As we've just done a struct copy, it's now
        a case of zero-ing that, rather than copying the current length.  */
+    if (SvPOKp(ret_x)) SvPV_free(ret_x);
     SvPV_set(ret_x, RX_WRAPPED(rx));
     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
           sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
     SvLEN_set(ret_x, 0);
-    SvSTASH_set(ret_x, NULL);
-    SvMAGIC_set(ret_x, NULL);
     if (r->offs) {
         const I32 npar = r->nparens+1;
         Newx(ret->offs, npar, regexp_paren_pair);